string 模块是 Python 标准库中的一个模块,提供了常见的字符串操作常量和函数。它包含了许多有用的字符串常量和工具函数,可以帮助你更方便地处理字符串。

输出源码

使用 inspect 输出 string 的源码:

import inspect
import string

# 查看 string 模块的源码
print(inspect.getsource(string))
print(string.__file__)

输出的源码为:

"""A collection of string constants.

Public module variables:

whitespace -- a string containing all ASCII whitespace
ascii_lowercase -- a string containing all ASCII lowercase letters
ascii_uppercase -- a string containing all ASCII uppercase letters
ascii_letters -- a string containing all ASCII letters
digits -- a string containing all ASCII decimal digits
hexdigits -- a string containing all ASCII hexadecimal digits
octdigits -- a string containing all ASCII octal digits
punctuation -- a string containing all ASCII punctuation characters
printable -- a string containing all ASCII characters considered printable

"""

__all__ = ["ascii_letters", "ascii_lowercase", "ascii_uppercase", "capwords",
           "digits", "hexdigits", "octdigits", "printable", "punctuation",
           "whitespace", "Formatter", "Template"]

import _string

# Some strings for ctype-style character classification
whitespace = ' \t\n\r\v\f'
ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
ascii_letters = ascii_lowercase + ascii_uppercase
digits = '0123456789'
hexdigits = digits + 'abcdef' + 'ABCDEF'
octdigits = '01234567'
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
printable = digits + ascii_letters + punctuation + whitespace

# Functions which aren't available as string methods.

# Capitalize the words in a string, e.g. " aBc  dEf " -> "Abc Def".
def capwords(s, sep=None):
    """capwords(s [,sep]) -> string

    Split the argument into words using split, capitalize each
    word using capitalize, and join the capitalized words using
    join.  If the optional second argument sep is absent or None,
    runs of whitespace characters are replaced by a single space
    and leading and trailing whitespace are removed, otherwise
    sep is used to split and join the words.

    """
    return (sep or ' ').join(map(str.capitalize, s.split(sep)))


####################################################################
import re as _re
from collections import ChainMap as _ChainMap

_sentinel_dict = {}

class Template:
    """A string class for supporting $-substitutions."""

    delimiter = '$'
    # r'[a-z]' matches to non-ASCII letters when used with IGNORECASE, but
    # without the ASCII flag.  We can't add re.ASCII to flags because of
    # backward compatibility.  So we use the ?a local flag and [a-z] pattern.
    # See https://bugs.python.org/issue31672
    idpattern = r'(?a:[_a-z][_a-z0-9]*)'
    braceidpattern = None
    flags = _re.IGNORECASE

    def __init_subclass__(cls):
        super().__init_subclass__()
        if 'pattern' in cls.__dict__:
            pattern = cls.pattern
        else:
            delim = _re.escape(cls.delimiter)
            id = cls.idpattern
            bid = cls.braceidpattern or cls.idpattern
            pattern = fr"""
            {delim}(?:
              (?P<escaped>{delim})  |   # Escape sequence of two delimiters
              (?P<named>{id})       |   # delimiter and a Python identifier
              {{(?P<braced>{bid})}} |   # delimiter and a braced identifier
              (?P<invalid>)             # Other ill-formed delimiter exprs
            )
            """
        cls.pattern = _re.compile(pattern, cls.flags | _re.VERBOSE)

    def __init__(self, template):
        self.template = template

    # Search for $$, $identifier, ${identifier}, and any bare $'s

    def _invalid(self, mo):
        i = mo.start('invalid')
        lines = self.template[:i].splitlines(keepends=True)
        if not lines:
            colno = 1
            lineno = 1
        else:
            colno = i - len(''.join(lines[:-1]))
            lineno = len(lines)
        raise ValueError('Invalid placeholder in string: line %d, col %d' %
                         (lineno, colno))

    def substitute(self, mapping=_sentinel_dict, /, **kws):
        if mapping is _sentinel_dict:
            mapping = kws
        elif kws:
            mapping = _ChainMap(kws, mapping)
        # Helper function for .sub()
        def convert(mo):
            # Check the most common path first.
            named = mo.group('named') or mo.group('braced')
            if named is not None:
                return str(mapping[named])
            if mo.group('escaped') is not None:
                return self.delimiter
            if mo.group('invalid') is not None:
                self._invalid(mo)
            raise ValueError('Unrecognized named group in pattern',
                             self.pattern)
        return self.pattern.sub(convert, self.template)

    def safe_substitute(self, mapping=_sentinel_dict, /, **kws):
        if mapping is _sentinel_dict:
            mapping = kws
        elif kws:
            mapping = _ChainMap(kws, mapping)
        # Helper function for .sub()
        def convert(mo):
            named = mo.group('named') or mo.group('braced')
            if named is not None:
                try:
                    return str(mapping[named])
                except KeyError:
                    return mo.group()
            if mo.group('escaped') is not None:
                return self.delimiter
            if mo.group('invalid') is not None:
                return mo.group()
            raise ValueError('Unrecognized named group in pattern',
                             self.pattern)
        return self.pattern.sub(convert, self.template)

    def is_valid(self):
        for mo in self.pattern.finditer(self.template):
            if mo.group('invalid') is not None:
                return False
            if (mo.group('named') is None
                and mo.group('braced') is None
                and mo.group('escaped') is None):
                # If all the groups are None, there must be
                # another group we're not expecting
                raise ValueError('Unrecognized named group in pattern',
                    self.pattern)
        return True

    def get_identifiers(self):
        ids = []
        for mo in self.pattern.finditer(self.template):
            named = mo.group('named') or mo.group('braced')
            if named is not None and named not in ids:
                # add a named group only the first time it appears
                ids.append(named)
            elif (named is None
                and mo.group('invalid') is None
                and mo.group('escaped') is None):
                # If all the groups are None, there must be
                # another group we're not expecting
                raise ValueError('Unrecognized named group in pattern',
                    self.pattern)
        return ids

# Initialize Template.pattern.  __init_subclass__() is automatically called
# only for subclasses, not for the Template class itself.
Template.__init_subclass__()


########################################################################
# the Formatter class
# see PEP 3101 for details and purpose of this class

# The hard parts are reused from the C implementation.  They're exposed as "_"
# prefixed methods of str.

# The overall parser is implemented in _string.formatter_parser.
# The field name parser is implemented in _string.formatter_field_name_split

class Formatter:
    def format(self, format_string, /, *args, **kwargs):
        return self.vformat(format_string, args, kwargs)

    def vformat(self, format_string, args, kwargs):
        used_args = set()
        result, _ = self._vformat(format_string, args, kwargs, used_args, 2)
        self.check_unused_args(used_args, args, kwargs)
        return result

    def _vformat(self, format_string, args, kwargs, used_args, recursion_depth,
                 auto_arg_index=0):
        if recursion_depth < 0:
            raise ValueError('Max string recursion exceeded')
        result = []
        for literal_text, field_name, format_spec, conversion in \
                self.parse(format_string):

            # output the literal text
            if literal_text:
                result.append(literal_text)

            # if there's a field, output it
            if field_name is not None:
                # this is some markup, find the object and do
                #  the formatting

                # handle arg indexing when empty field_names are given.
                if field_name == '':
                    if auto_arg_index is False:
                        raise ValueError('cannot switch from manual field '
                                         'specification to automatic field '
                                         'numbering')
                    field_name = str(auto_arg_index)
                    auto_arg_index += 1
                elif field_name.isdigit():
                    if auto_arg_index:
                        raise ValueError('cannot switch from manual field '
                                         'specification to automatic field '
                                         'numbering')
                    # disable auto arg incrementing, if it gets
                    # used later on, then an exception will be raised
                    auto_arg_index = False

                # given the field_name, find the object it references
                #  and the argument it came from
                obj, arg_used = self.get_field(field_name, args, kwargs)
                used_args.add(arg_used)

                # do any conversion on the resulting object
                obj = self.convert_field(obj, conversion)

                # expand the format spec, if needed
                format_spec, auto_arg_index = self._vformat(
                    format_spec, args, kwargs,
                    used_args, recursion_depth-1,
                    auto_arg_index=auto_arg_index)

                # format the object and append to the result
                result.append(self.format_field(obj, format_spec))

        return ''.join(result), auto_arg_index


    def get_value(self, key, args, kwargs):
        if isinstance(key, int):
            return args[key]
        else:
            return kwargs[key]


    def check_unused_args(self, used_args, args, kwargs):
        pass


    def format_field(self, value, format_spec):
        return format(value, format_spec)


    def convert_field(self, value, conversion):
        # do any conversion on the resulting object
        if conversion is None:
            return value
        elif conversion == 's':
            return str(value)
        elif conversion == 'r':
            return repr(value)
        elif conversion == 'a':
            return ascii(value)
        raise ValueError("Unknown conversion specifier {0!s}".format(conversion))


    # returns an iterable that contains tuples of the form:
    # (literal_text, field_name, format_spec, conversion)
    # literal_text can be zero length
    # field_name can be None, in which case there's no
    #  object to format and output
    # if field_name is not None, it is looked up, formatted
    #  with format_spec and conversion and then used
    def parse(self, format_string):
        return _string.formatter_parser(format_string)


    # given a field_name, find the object it references.
    #  field_name:   the field being looked up, e.g. "0.name"
    #                 or "lookup[3]"
    #  used_args:    a set of which args have been used
    #  args, kwargs: as passed in to vformat
    def get_field(self, field_name, args, kwargs):
        first, rest = _string.formatter_field_name_split(field_name)

        obj = self.get_value(first, args, kwargs)

        # loop through the rest of the field_name, doing
        #  getattr or getitem as needed
        for is_attr, i in rest:
            if is_attr:
                obj = getattr(obj, i)
            else:
                obj = obj[i]

        return obj, first

__all__ 的作用:

__all__ 是 Python 中的一个特殊标识符,通常定义在模块的顶层,用于指定当使用 from module import * 时,哪些名称会被导出。换句话说,__all__ 控制着模块的公共接口。

如上述的 string 模块,如果在其他模块中使用 from string import *,那么只有 __all__ 中定义的常量、函数或类会被导入。

不过不建议使用 from module import *,而是使用 import module 或 from module import name。这两种推荐写法的优点是明确依赖关系,避免命名冲突,从而提高代码的可读性和可维护性。

定义 __all__ 是一种良好的编程实践。它可以明确模块的公共接口,告诉使用者哪些名称是模块的“官方”接口,哪些是内部实现细节;提供对工具和 IDE 的支持,比如许多开发工具会使用 __all__ 来提供代码补全、文档提示等功能;如果确实有人使用了 from module import *,__all__ 可以确保只导入必要的名称,避免命名空间污染。

常量

在 string 里定义了一些常量:

# Some strings for ctype-style character classification
whitespace = ' \t\n\r\v\f'
ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
ascii_letters = ascii_lowercase + ascii_uppercase
digits = '0123456789'
hexdigits = digits + 'abcdef' + 'ABCDEF'
octdigits = '01234567'
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
printable = digits + ascii_letters + punctuation + whitespace

这些常量基本包含所有的字符,可以直接使用:

import string

# Each constant is a plain str. repr() is used for the values that contain
# control characters so that they show up as visible escape sequences.
whitespace = string.whitespace
print(repr(whitespace))  # Output: ' \t\n\r\x0b\x0c'
ascii_lower = string.ascii_lowercase
print(ascii_lower)  # Output: abcdefghijklmnopqrstuvwxyz
ascii_upper = string.ascii_uppercase
print(ascii_upper)  # Output: ABCDEFGHIJKLMNOPQRSTUVWXYZ
ascii_letters = string.ascii_letters
print(ascii_letters)  # Output: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
ascii_digits = string.digits
print(ascii_digits)  # Output: 0123456789
ascii_hexdigits = string.hexdigits
print(ascii_hexdigits)  # Output: 0123456789abcdefABCDEF
ascii_octdigits = string.octdigits
print(ascii_octdigits)  # Output: 01234567
ascii_punctuation = string.punctuation
print(ascii_punctuation)  # Output: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
ascii_printable = string.printable
print(repr(ascii_printable))  # Output: '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

capwords 方法

capwords 用于将字符串中的每个单词的首字母大写,其余字母小写。它的作用类似于 str.title(),但两者并不相同:str.title() 会把每个非字母字符之后的字母都大写(如 "they're" 会变成 "They'Re"),而 capwords 只按空白字符(或指定的 sep)切分单词;未指定 sep 时,它还会把连续的空白压缩为单个空格,并去掉首尾空白。

def capwords(s, sep=None):
    """capwords(s [,sep]) -> string

    Split the argument into words using split, capitalize each
    word using capitalize, and join the capitalized words using
    join.  If the optional second argument sep is absent or None,
    runs of whitespace characters are replaced by a single space
    and leading and trailing whitespace are removed, otherwise
    sep is used to split and join the words.

    """
    return (sep or ' ').join(map(str.capitalize, s.split(sep)))

在 string.pyi 文件中,对 capwords 函数的注解是:

def capwords(s: StrOrLiteralStr, sep: StrOrLiteralStr | None = None) -> StrOrLiteralStr: ...

它表示 capwords 函数接受两个字符串值(StrOrLiteralStr类型),其中 sep 可以为空(为空时,值为None),返回一个字符串值。

capwords 的基本用法:

import string

s = "hello  world! this is a test."
result = string.capwords(s)
# With sep=None the words are split on runs of whitespace and re-joined with
# a single space, so the double space in the input does not survive.
print(result)  # Output: Hello World! This Is A Test.

接下来看看 capwords 函数是怎么实现将字符串中的每个单词首字母大写的:

  1. s.split(sep) 首先将 s 按照 sep 拆分成一个列表。如下:
split()

split() 是 Python 中字符串对象的一个内置方法,用于将字符串按照指定的分隔符拆分成一个列表。它的主要作用是将一个字符串分割成多个子字符串,并返回这些子字符串组成的列表。
split() 的基本语法为:str.split(sep=None, maxsplit=-1)

  • sep:分隔符,默认为 None。如果 sep 为 None,则默认使用空白字符(空格、制表符、换行符等)进行分割。
  • maxsplit:最大分割次数,默认为 -1,表示不限制分割次数。
  • 返回值:返回一个列表,包含分割后的子字符串。
s = "you are so beautiful!"
print(s.split(" ")) # 输出: ['you', 'are', 'so', 'beautiful!']
  2. map(str.capitalize, s.split(sep)) 将 str.capitalize 函数应用到 s.split(sep) 拆分的列表中的每个元素上,并返回一个迭代器(map 对象)。如下:

map()

map() 是 Python 中的一个内置函数,用于将一个函数应用到可迭代对象(如列表、元组等)的每个元素上,并返回一个迭代器(map 对象)。
map() 的基本语法为:map(function, iterable, …)

  • function:一个函数,可以是内置函数、自定义函数或 lambda 函数。
  • iterable:一个可迭代对象(如列表、元组、字符串等)。
  • 返回值:返回一个 map 对象(迭代器),可以通过 list()、tuple() 等函数将其转换为列表或元组。

capitalize()

capitalize() 是 Python 中字符串对象的一个内置方法,用于将字符串的第一个字符大写,其余字符小写。
capitalize() 的基本语法为:str.capitalize()

  • str:一个字符串。
  • 返回值:返回一个新的字符串,第一个字符大写,其余字符小写。

s = "you are so beautiful!"
sep=' '
print(list(map(str.capitalize, s.split(sep)))) # 输出: ['You', 'Are', 'So', 'Beautiful!']
  3. (sep or ' ').join(map(str.capitalize, s.split(sep))) 将 map 返回的迭代器中的元素用 (sep or ' ') 作为分隔符连接成一个字符串。如下:

join()

join() 是 Python 中字符串对象的一个内置方法,用于将一个可迭代对象(如列表、元组等)中的元素拼接成一个字符串,并使用指定的分隔符分隔。
join() 的基本语法为:str.join(iterable)

  • str:分隔符,用于连接可迭代对象中的元素。
  • iterable:一个可迭代对象(如列表、元组、集合等),其中的元素必须是字符串类型。
  • 返回值:返回一个新的字符串,由可迭代对象中的元素拼接而成,元素之间用 str 分隔。

(sep or ' ')
(sep or ' ') 是 Python 中一种常见的表达式,用于动态选择一个值。它的作用是:如果 sep 不是 None 或空值(如 ''),则使用 sep;否则使用默认的空格 ' '。这种写法通常用于设置默认值或动态选择分隔符。

s = "you are so beautiful!"
sep=' '
print((sep or ' ').join(map(str.capitalize, s.split(sep)))) # 输出: "You Are So Beautiful!"

Template 类

Template 类提供了一种简化的方式来执行字符串替换操作,特别是当你需要一种比传统的格式化方法(如%操作符或str.format())更简单且更安全的方法时。Template类特别适用于用户输入数据的场景,因为它可以避免许多常见的格式化错误和安全问题。它使用 $ 符号作为占位符,适合处理简单的字符串格式化需求。适合配置文件、模板引擎等场景。

Template 类的实现包括以下代码:

import re as _re
from collections import ChainMap as _ChainMap

_sentinel_dict = {}

class Template:
    """A string class for supporting $-substitutions."""

    delimiter = '$'
    # r'[a-z]' matches to non-ASCII letters when used with IGNORECASE, but
    # without the ASCII flag.  We can't add re.ASCII to flags because of
    # backward compatibility.  So we use the ?a local flag and [a-z] pattern.
    # See https://bugs.python.org/issue31672
    idpattern = r'(?a:[_a-z][_a-z0-9]*)'
    braceidpattern = None
    flags = _re.IGNORECASE

    def __init_subclass__(cls):
        super().__init_subclass__()
        if 'pattern' in cls.__dict__:
            pattern = cls.pattern
        else:
            delim = _re.escape(cls.delimiter)
            id = cls.idpattern
            bid = cls.braceidpattern or cls.idpattern
            pattern = fr"""
            {delim}(?:
              (?P<escaped>{delim})  |   # Escape sequence of two delimiters
              (?P<named>{id})       |   # delimiter and a Python identifier
              {{(?P<braced>{bid})}} |   # delimiter and a braced identifier
              (?P<invalid>)             # Other ill-formed delimiter exprs
            )
            """
        cls.pattern = _re.compile(pattern, cls.flags | _re.VERBOSE)

    def __init__(self, template):
        self.template = template

    # Search for $$, $identifier, ${identifier}, and any bare $'s

    def _invalid(self, mo):
        i = mo.start('invalid')
        lines = self.template[:i].splitlines(keepends=True)
        if not lines:
            colno = 1
            lineno = 1
        else:
            colno = i - len(''.join(lines[:-1]))
            lineno = len(lines)
        raise ValueError('Invalid placeholder in string: line %d, col %d' %
                         (lineno, colno))

    def substitute(self, mapping=_sentinel_dict, /, **kws):
        if mapping is _sentinel_dict:
            mapping = kws
        elif kws:
            mapping = _ChainMap(kws, mapping)
        # Helper function for .sub()
        def convert(mo):
            # Check the most common path first.
            named = mo.group('named') or mo.group('braced')
            if named is not None:
                return str(mapping[named])
            if mo.group('escaped') is not None:
                return self.delimiter
            if mo.group('invalid') is not None:
                self._invalid(mo)
            raise ValueError('Unrecognized named group in pattern',
                             self.pattern)
        return self.pattern.sub(convert, self.template)

    def safe_substitute(self, mapping=_sentinel_dict, /, **kws):
        if mapping is _sentinel_dict:
            mapping = kws
        elif kws:
            mapping = _ChainMap(kws, mapping)
        # Helper function for .sub()
        def convert(mo):
            named = mo.group('named') or mo.group('braced')
            if named is not None:
                try:
                    return str(mapping[named])
                except KeyError:
                    return mo.group()
            if mo.group('escaped') is not None:
                return self.delimiter
            if mo.group('invalid') is not None:
                return mo.group()
            raise ValueError('Unrecognized named group in pattern',
                             self.pattern)
        return self.pattern.sub(convert, self.template)

    def is_valid(self):
        for mo in self.pattern.finditer(self.template):
            if mo.group('invalid') is not None:
                return False
            if (mo.group('named') is None
                and mo.group('braced') is None
                and mo.group('escaped') is None):
                # If all the groups are None, there must be
                # another group we're not expecting
                raise ValueError('Unrecognized named group in pattern',
                    self.pattern)
        return True

    def get_identifiers(self):
        ids = []
        for mo in self.pattern.finditer(self.template):
            named = mo.group('named') or mo.group('braced')
            if named is not None and named not in ids:
                # add a named group only the first time it appears
                ids.append(named)
            elif (named is None
                and mo.group('invalid') is None
                and mo.group('escaped') is None):
                # If all the groups are None, there must be
                # another group we're not expecting
                raise ValueError('Unrecognized named group in pattern',
                    self.pattern)
        return ids

# Initialize Template.pattern.  __init_subclass__() is automatically called
# only for subclasses, not for the Template class itself.
Template.__init_subclass__()

初始化

Template 类的初始化操作包括两个方法:__init__ 和 __init_subclass__。

def __init_subclass__(cls):
	super().__init_subclass__()
	if 'pattern' in cls.__dict__:
		pattern = cls.pattern
	else:
		delim = _re.escape(cls.delimiter)
		id = cls.idpattern
		bid = cls.braceidpattern or cls.idpattern
		pattern = fr"""
		{delim}(?:
		  (?P<escaped>{delim})  |   # Escape sequence of two delimiters
		  (?P<named>{id})       |   # delimiter and a Python identifier
		  {{(?P<braced>{bid})}} |   # delimiter and a braced identifier
		  (?P<invalid>)             # Other ill-formed delimiter exprs
		)
		"""
	cls.pattern = _re.compile(pattern, cls.flags | _re.VERBOSE)

def __init__(self, template):
	self.template = template
  1. __init__

它决定了在实例化 Template 类时,需要传入参数 template,这个 template 应是一个字符串。默认格式类似:"Hello, $name! Welcome to $place.",即在需要替换的标识符前加上 $。

  2. __init_subclass__

它用于定义类属性 pattern(已编译的正则表达式对象)。__init_subclass__ 方法只会在定义子类时被自动调用,不会为 Template 类本身调用,所以 string 模块在类定义之后显式调用了一次 Template.__init_subclass__(),这样在导入 string 模块时就会为 Template 类本身生成 pattern。

Template.__init_subclass__() 定义了 cls.pattern 的值:

  • 如果没有子类继承或子类继承时不指定 pattern ,会生成默认的 pattern。
  • 如果子类继承Template 类时,指定了 pattern,__init_subclass__ 会直接使用该属性。
  • 也可以通过重写父类的 delimiter、idpattern 来修改 pattern 属性。

默认的 pattern 作用如下:

  • 匹配两个连续的美元符号 $$ ,将 $ 放入 escaped 组。
  • 匹配 $+遵循Python变量名规则的标识符(如$var_name123),将标识符(如var_name123)放入 named 组。
  • 匹配 $+{遵循Python变量名规则的标识符}(如${var_name123}),将标识符(如var_name123)放入 braced 组。
  • 匹配 $ 后面不是合法标识符的其他情况(如 $12name,或单独的 $),将空字符串放入 invalid 组。

substitute 方法

substitute 用于替换模板字符串中的占位符。它会将模板中的 $标识符(如 $name 或 ${name})替换为提供的实际值。如果模板中存在未提供的标识符,substitute 会抛出 KeyError 异常。

def substitute(self, mapping=_sentinel_dict, /, **kws):
    if mapping is _sentinel_dict:
        mapping = kws
    elif kws:
        mapping = _ChainMap(kws, mapping)

    # Helper function for .sub()
    def convert(mo):
        # Check the most common path first.
        named = mo.group('named') or mo.group('braced')
        if named is not None:
            return str(mapping[named])
        if mo.group('escaped') is not None:
            return self.delimiter
        if mo.group('invalid') is not None:
            self._invalid(mo)
        raise ValueError('Unrecognized named group in pattern',
                         self.pattern)

    return self.pattern.sub(convert, self.template)

在 string.pyi 文件中,对 substitute 方法的注解是:

def substitute(self, mapping: Mapping[str, object] = {}, /, **kwds: object) -> str: ...

它的类型标注为:

  • Mapping[str, object]:一个键为字符串、值为任意对象的映射的参数。它是一个键值对的集合,支持通过键访问值,但不支持添加、删除或修改键值对。
  • **kwds:关键字参数,用于提供标识符和对应的值。值可以是任意对象。
  • 返回值:返回替换后的字符串。

substitute 的基本用法:

  1. 使用 Mapping 参数
from string import Template

# 创建模板
template = Template("Hello, $name! Welcome to $place.")

# 替换占位符
result = template.substitute({"name": "JZY", "place": "HangZhou"})
print(result)  # 输出: "Hello, JZY! Welcome to HangZhou."
  2. 使用 **kwds 参数
from string import Template

# 创建模板
template = Template("Hello, $name! Welcome to $place.")

# 替换占位符
result = template.substitute(name="JZY", place="HangZhou")
print(result)  # 输出: "Hello, JZY! Welcome to HangZhou."
  3. Mapping 参数与 **kwds 参数混合使用
from string import Template

# 创建模板
template = Template("Hello, $name! Welcome to $place.")

# 替换占位符
result = template.substitute({"name": "JZY"}, place="HangZhou")
print(result)  # 输出: "Hello, JZY! Welcome to HangZhou."

修改 delimiter、idpattern ,适应不同需求,比如替换占位符并支持中文:

from string import Template

class ChineseTemplate(Template):
    # 修改 delimiter 占位符为 #
    delimiter = '#'
    # 修改 idpattern 以支持中文字符
    idpattern = r'(?a:[\u4e00-\u9fa5_a-zA-Z][\u4e00-\u9fa5_a-zA-Z0-9]*)'

# 使用自定义的 ChineseTemplate
template = ChineseTemplate("你好,#姓名!欢迎来到#地址。")

# 替换占位符
result = template.substitute({"姓名": "张三", "地址": "杭州"})

print(result)  # 输出: "你好,张三!欢迎来到杭州。"

ok,我们看看 substitute 方法的具体实现逻辑:

  1. 处理传参
  • 如果没有传入 mapping(仍是默认的 _sentinel_dict),则将 mapping 设置为 kws
  • 如果传入了 mapping 且 kws 非空,则将 kws 和 mapping 合并为一个 _ChainMap 对象,同名的键以 kws 中的值为准
  • 如果只传入了 mapping,则直接使用 mapping

比如:

from collections import ChainMap as _ChainMap
# Sentinel default, compared by identity below, so that an explicitly passed
# empty dict is still treated as "a mapping was given".
_sentinel_dict = {}
def test(mapping=_sentinel_dict, **kwargs):
    """Combine an optional mapping with keyword arguments.

    Returns ``kwargs`` when no mapping is passed, a ChainMap that looks up
    ``kwargs`` before ``mapping`` when both are given, and ``mapping``
    itself when only a mapping is passed.
    """
    if mapping is _sentinel_dict:
        mapping = kwargs
    elif kwargs:
        # kwargs is listed first, so its keys shadow the same keys in mapping.
        mapping = _ChainMap(kwargs, mapping)
    return mapping


if __name__ == '__main__':
    # Keyword arguments only: the kwargs dict itself is returned.
    result = test(name='JZY', age="18", sex='man', address='China')
    print(result) # Output: {'name': 'JZY', 'age': '18', 'sex': 'man', 'address': 'China'}
    # Mapping only: the mapping is returned unchanged.
    mpping = {'name': 'JZY', 'age': "18", 'sex': 'man', 'address': 'China'}
    result = test(mpping)
    print(result) # Output: {'name': 'JZY', 'age': '18', 'sex': 'man', 'address': 'China'}
    # Mapping plus keyword arguments: both are layered into a ChainMap.
    mpping = {'sex': 'man', 'address': 'China'}
    result = test(mpping, name='JZY', age=18)
    print(result) # Output: ChainMap({'name': 'JZY', 'age': 18}, {'sex': 'man', 'address': 'China'})
  2. 返回 self.pattern.sub(convert, self.template)
sub()

sub:是 Python 中 re 模块(正则表达式模块)提供的一个方法,用于替换字符串中与正则表达式匹配的部分。

正则表达式(pattern)是字符串时,这样调用:re.sub(pattern, repl, string, count=0, flags=0)

正则表达式(pattern)是一个已编译的正则表达式对象时,这样调用:pattern.sub(repl, string, count=0)

  • pattern: 正则表达式,用来匹配目标字符串中的子串。
  • repl: 替换匹配到的子串的字符串或一个调用替换逻辑的函数。
  • string: 需要处理的目标字符串。
  • count: 可选参数,指明替换的最大次数;默认值为0,意味着替换所有匹配项。
  • flags: 可选参数,修饰符标志,如re.IGNORECASE等,改变模式的行为。

示例:

import re

#

# 定义需要处理的目标字符串
string = "Hello 123 world 456"
# 定义正则表达式模式, r'\d+' 表示匹配数字
pattern = r'\d+'


# 定义替换匹配到的子串的字符串
repl = 'JZY'
# 使用 re.sub() 方法替换匹配到的子串
new_string = re.sub(pattern, repl, string)

print(new_string)  # 输出: Hello JZY world JZY


# 定义一个函数,用于替换匹配到的子串
def test(match):
    value = match.group()
    if value == '123':
        return 'JZY'
    elif value == '456':
        return 'jzy'
    else:
        return 'good'

# 使用 re.sub() 方法替换匹配到的子串
new_string = re.sub(pattern, test, string)

print(new_string)  # 输出: Hello JZY world jzy

我们看一下 convert 函数的实现,假设模板文本是:"$name is a ${describe} person, The salary of $name is $$100.",传入 substitute 的参数是:{"name": "JZY", "describe": "good"}

首先 sub 方法会用 pattern 去匹配模板文本,依次匹配到 $name、${describe}、$name、$$,并依次将每个匹配对象(mo)传入 convert 函数。由前文可知:name 会落在 named 组中,describe 会落在 braced 组中,$$ 中的第二个 $ 会落在 escaped 组中。

convert 函数内部的处理就比较直观了:

def convert(mo):
	# Check the most common path first.
	named = mo.group('named') or mo.group('braced')
	if named is not None:
		return str(mapping[named])
	if mo.group('escaped') is not None:
		return self.delimiter
	if mo.group('invalid') is not None:
		self._invalid(mo)
	raise ValueError('Unrecognized named group in pattern',
					 self.pattern)

如果模板文本为"$1name is a good person",$ 后面不是合法的标识符,此时匹配到的是 invalid 组(值为空字符串),于是调用 self._invalid(mo),抛出 ValueError 异常,并在异常信息中给出出错的行号和列号。

还需要注意:如果想要自定义 pattern ,不要改变捕获组的名称,不然会导致 raise ValueError('Unrecognized named group in pattern',self.pattern)

safe_substitute 方法

safe_substitute 方法用于替换模板字符串中的标识符。它会将模板中的 $标识符(如 $name 或 ${name})替换为提供的实际值。如果模板中存在未提供的标识符,safe_substitute 方法会忽略,原样输出。

def safe_substitute(self, mapping=_sentinel_dict, /, **kws):
	if mapping is _sentinel_dict:
		mapping = kws
	elif kws:
		mapping = _ChainMap(kws, mapping)
	# Helper function for .sub()
	def convert(mo):
		named = mo.group('named') or mo.group('braced')
		if named is not None:
			try:
				return str(mapping[named])
			except KeyError:
				return mo.group()
		if mo.group('escaped') is not None:
			return self.delimiter
		if mo.group('invalid') is not None:
			return mo.group()
		raise ValueError('Unrecognized named group in pattern',
						 self.pattern)
	return self.pattern.sub(convert, self.template)

safe_substitute 方法与 substitute 方法的功能基本相同,它们的区别是:safe_substitute 的 convert 函数用 try 捕获 KeyError 来处理未提供的标识符,直接返回原文;遇到无效占位符(invalid 组)时也不会抛出 ValueError,同样原样返回。

is_valid 方法

is_valid 方法用于检查模板文本是否合法。它只检查占位符的写法,不关心标识符是否提供了值(因此不会像 substitute 那样抛出 KeyError 异常):如果模板中包含无效的占位符(匹配到 invalid 组),返回 False;否则返回 True。

代码比较直观,易于理解:

def is_valid(self):
	for mo in self.pattern.finditer(self.template):
		if mo.group('invalid') is not None:
			return False
		if (mo.group('named') is None
			and mo.group('braced') is None
			and mo.group('escaped') is None):
			# If all the groups are None, there must be
			# another group we're not expecting
			raise ValueError('Unrecognized named group in pattern',
				self.pattern)
	return True

get_identifiers 方法

get_identifiers 方法用于获取模板文本中的有效标识符,按首次出现的顺序返回一个列表,重复的标识符只保留一次。无效占位符和 $$ 会被忽略;如果模板中不存在任何有效标识符,则返回空列表,并不会抛出异常。

代码比较直观,易于理解:

def get_identifiers(self):
	ids = []
	for mo in self.pattern.finditer(self.template):
		named = mo.group('named') or mo.group('braced')
		if named is not None and named not in ids:
			# add a named group only the first time it appears
			ids.append(named)
		elif (named is None
			and mo.group('invalid') is None
			and mo.group('escaped') is None):
			# If all the groups are None, there must be
			# another group we're not expecting
			raise ValueError('Unrecognized named group in pattern',
				self.pattern)
	return ids

Formatter 类

Formatter 类提供了一种底层的方式来进行字符串格式化。它允许你以更灵活和可扩展的方式来格式化字符串,而不是仅仅依赖于内置的 % 操作符或 .format() 方法。通过继承 Formatter 类并重写其方法,你可以自定义字符串格式化的规则。

Formatter 类和 Template 类都用于字符串格式化,但它们的设计目标、使用场景以及提供的功能存在显著差异:

  • Formatter 类更灵活,适用于更复杂的场景,例如科学计算结果的展示、特定领域内的数据表示等。
  • Template 类更安全,更易用,更简单。

Formatter 类的实现包括以下代码:

class Formatter:
    def format(self, format_string, /, *args, **kwargs):
        return self.vformat(format_string, args, kwargs)

    def vformat(self, format_string, args, kwargs):
        used_args = set()
        result, _ = self._vformat(format_string, args, kwargs, used_args, 2)
        self.check_unused_args(used_args, args, kwargs)
        return result

    def _vformat(self, format_string, args, kwargs, used_args, recursion_depth,
                 auto_arg_index=0):
        if recursion_depth < 0:
            raise ValueError('Max string recursion exceeded')
        result = []
        for literal_text, field_name, format_spec, conversion in \
                self.parse(format_string):

            # output the literal text
            if literal_text:
                result.append(literal_text)

            # if there's a field, output it
            if field_name is not None:
                # this is some markup, find the object and do
                #  the formatting

                # handle arg indexing when empty field_names are given.
                if field_name == '':
                    if auto_arg_index is False:
                        raise ValueError('cannot switch from manual field '
                                         'specification to automatic field '
                                         'numbering')
                    field_name = str(auto_arg_index)
                    auto_arg_index += 1
                elif field_name.isdigit():
                    if auto_arg_index:
                        raise ValueError('cannot switch from manual field '
                                         'specification to automatic field '
                                         'numbering')
                    # disable auto arg incrementing, if it gets
                    # used later on, then an exception will be raised
                    auto_arg_index = False

                # given the field_name, find the object it references
                #  and the argument it came from
                obj, arg_used = self.get_field(field_name, args, kwargs)
                used_args.add(arg_used)

                # do any conversion on the resulting object
                obj = self.convert_field(obj, conversion)

                # expand the format spec, if needed
                format_spec, auto_arg_index = self._vformat(
                    format_spec, args, kwargs,
                    used_args, recursion_depth-1,
                    auto_arg_index=auto_arg_index)

                # format the object and append to the result
                result.append(self.format_field(obj, format_spec))

        return ''.join(result), auto_arg_index


    def get_value(self, key, args, kwargs):
        if isinstance(key, int):
            return args[key]
        else:
            return kwargs[key]


    def check_unused_args(self, used_args, args, kwargs):
        pass


    def format_field(self, value, format_spec):
        return format(value, format_spec)


    def convert_field(self, value, conversion):
        # do any conversion on the resulting object
        if conversion is None:
            return value
        elif conversion == 's':
            return str(value)
        elif conversion == 'r':
            return repr(value)
        elif conversion == 'a':
            return ascii(value)
        raise ValueError("Unknown conversion specifier {0!s}".format(conversion))


    # returns an iterable that contains tuples of the form:
    # (literal_text, field_name, format_spec, conversion)
    # literal_text can be zero length
    # field_name can be None, in which case there's no
    #  object to format and output
    # if field_name is not None, it is looked up, formatted
    #  with format_spec and conversion and then used
    def parse(self, format_string):
        return _string.formatter_parser(format_string)


    # given a field_name, find the object it references.
    #  field_name:   the field being looked up, e.g. "0.name"
    #                 or "lookup[3]"
    #  used_args:    a set of which args have been used
    #  args, kwargs: as passed in to vformat
    def get_field(self, field_name, args, kwargs):
        first, rest = _string.formatter_field_name_split(field_name)

        obj = self.get_value(first, args, kwargs)

        # loop through the rest of the field_name, doing
        #  getattr or getitem as needed
        for is_attr, i in rest:
            if is_attr:
                obj = getattr(obj, i)
            else:
                obj = obj[i]

        return obj, first

format 方法

© 转载需要保留原始链接,未经明确许可,禁止商业使用。CC BY-NC-ND 4.0