2025-09-10:仓库迁移

2025-09-10 16:12:45 +08:00
parent e0e49b0ac9
commit 3130e336a1
146 changed files with 4066 additions and 0 deletions
--- a/2.字符串和文本/1.任意多分隔符字符串拆分.py
+++ b/2.字符串和文本/1.任意多分隔符字符串拆分.py
@@ -0,0 +1,23 @@
+# str.split() breaks a string apart on a single, fixed separator.
+
+a = "suka,caonimade,shabi"
+b = a.split(",")
+print(b)
+
+# A naive keyword filter can be dodged by padding the words with
+# extra punctuation, as in the string below:
+
+a = "suka,*&&**caonimade,_&^shabi"
+# Splitting on "," alone now leaves the noise characters attached.
+b = a.split(",")
+print(b)
+
+# The re module handles this case:
+import re
+# re.split() takes a pattern, so one character class covers every separator.
+b = re.split(r'[,*&^_]', a)
+print(b)
+
+# To keep the separators in the result, wrap the pattern in a capture group.
+b = re.split(r'([,*&^_])', a)
+print(b)
--- a/2.字符串和文本/10.用正则处理unicode.py
+++ b/2.字符串和文本/10.用正则处理unicode.py
@@ -0,0 +1,11 @@
+import re
+
+if __name__ == '__main__':
+    # In a str pattern, \d matches any Unicode decimal digit by default, not only 0-9.
+    digits = re.compile(r'\d+')
+    # match() only tries at the start of the string and returns None on failure.
+    print(digits.match('123'))
+
+    print(digits.match('\u0661\u0662\u0663'))
+
+    # Author's advice: do not rely on this; normalise input to ASCII before processing it.
--- a/2.字符串和文本/11.从字符串中去掉不需要的字符.py
+++ b/2.字符串和文本/11.从字符串中去掉不需要的字符.py
@@ -0,0 +1,24 @@
+
+if __name__ == '__main__':
+    # strip() with no argument trims whitespace (newlines included) from both ends.
+    padded = ' hello world \n'
+    print(padded.strip())
+
+    # lstrip()/rstrip() trim one side only, and all three accept the set of characters to remove.
+    # Nothing is removed from a side that does not start (or end) with one of those characters.
+    decorated = '---hello world==='
+    print(decorated.lstrip('-'))
+    print(decorated.rstrip('='))
+    print(decorated.strip('-='))
+
+    # The space inside "hello world" survives: the strip family never touches interior characters.
+    # Use replace() when characters in the middle have to go as well.
+
+    # Typical use when cleaning a file line by line (lazy generator, no temporary list):
+
+    # with open(file) as f:
+    #     lines = (line.strip() for line in f)
+    #     for line in lines:
+    #         print(line)
+    
+
--- a/2.字符串和文本/12.文本过滤和清理.py
+++ b/2.字符串和文本/12.文本过滤和清理.py
@@ -0,0 +1,26 @@
+import sys
+
+import unicodedata
+
+if __name__ == '__main__':
+    # Users type all sorts of odd characters, so input often needs sanitising first.
+    s = 'pyth\u0303on\fis\tawesome\r\n'
+    # str.translate() maps code points through a table: a str value replaces, None deletes.
+    remap = {
+        ord('\t'): ' ',
+        ord('\f'): ' ',
+        ord('\r'): None,
+    }
+    s = s.translate(remap)
+    print(s)
+
+    # Tab and form feed are now spaces and the carriage return is gone.
+    # A bigger table can strip every combining mark; NFD first splits them from their base letters.
+    b = unicodedata.normalize("NFD", s)
+    # sys.maxunicode is itself a valid code point, hence the + 1 so the range covers all of them.
+    cmb_dict = dict.fromkeys(c for c in range(sys.maxunicode + 1) if unicodedata.combining(chr(c)))
+    b = b.translate(cmb_dict)
+    print(b)
+
+    # For a simple substitution str.replace() is enough and is fast;
+    # for per-character mapping, build a table and use translate().
--- a/2.字符串和文本/13.对齐文本.py
+++ b/2.字符串和文本/13.对齐文本.py
@@ -0,0 +1,22 @@
+
+
+if __name__ == "__main__":
+    greeting = 'Hello World'
+    # ljust(), rjust() and center() pad a string to the requested width.
+    print(greeting.ljust(20))
+    print(greeting.rjust(20))
+    print(greeting.center(20))
+
+    # Each of them takes an optional fill character.
+    print(greeting.ljust(20, '='))
+    print(greeting.rjust(20, '-'))
+    print(greeting.center(20, '='))
+
+    # format() is more general: the same spec language also covers type conversion.
+    print(format(greeting, '-<20'))     # "<": left-aligned in a width of 20
+    print(format(greeting, '=>20'))     # ">": right-aligned in a width of 20
+    print(format(greeting, '+^20'))     # "^": centred in a width of 20
+    # The character in front of <, > or ^ is the fill character.
+    # Unlike the str methods, format() works on any value, not only on strings.
+    value = 1.23456
+    print(format(value, '=^20.2f'))   # width 20, two decimals, centred, filled with "="
--- a/2.字符串和文本/14.字符串连接与合并.py
+++ b/2.字符串和文本/14.字符串连接与合并.py
@@ -0,0 +1,15 @@
+
+
+if __name__ == "__main__":
+    # str.join() is the right tool for many pieces; "+" is fine for just a few.
+    words = ["Is", "Chicago", "Not", "Chicago?"]
+    sentence = ' '.join(words)
+    print(sentence)
+
+    # Adjacent string literals are concatenated by the parser, with no operator needed.
+    keyword = 'su' 'ka'
+    print(keyword)
+
+    # Avoid building a large string with += in a loop: every step allocates a new string.
+    # When there are many fragments, consider yielding them from a generator and joining once.
+    
--- a/2.字符串和文本/15.给字符串中的变量做插值处理.py
+++ b/2.字符串和文本/15.给字符串中的变量做插值处理.py
@@ -0,0 +1,43 @@
+import sys
+
+if __name__ == "__main__":
+    # The usual way to interpolate: {} placeholders plus str.format().
+    s = "{name} has {n} messages"
+    s1 = s.format(name="Sam", n=10)
+    print(s1)
+
+    # A shortcut when the values already live in variables of the same name:
+    name = "Sam"
+    n = 10
+    s2 = s.format_map(vars())
+    print(s2)
+    # vars() without an argument returns the current namespace as a dict, so format_map() finds name and n in it.
+    # vars(obj) returns an instance's attribute dict:
+    class INFO:
+        def __init__(self, name, n):
+            self.name = name
+            self.n = n
+    info = INFO(name, n)
+    s3 = s.format_map(vars(info))
+    print(s3)
+
+    # Convenient, but with one drawback: a missing value raises KeyError.
+    try:
+        s4 = s.format(name=name)
+    except KeyError:
+        print("少参数")
+
+    # A dict subclass with __missing__ can hand back the placeholder instead of raising.
+    class Safe_Sub(dict):
+        def __missing__(self, key):
+            return '{'+ key +'}'
+
+    del n
+    print(s.format_map(Safe_Sub(vars())))
+
+    # The lookup can be hidden in a helper that reads the caller's frame (not recommended, but worth knowing).
+    def sub(text):
+        # WARNING: frame inspection is a hack that invites subtle bugs; avoid it without a compelling reason.
+        return text.format_map(Safe_Sub(sys._getframe(1).f_locals))
+    name = "Suka"
+    print(sub(s))
--- a/2.字符串和文本/16.以固定列数重新格式化文本.py
+++ b/2.字符串和文本/16.以固定列数重新格式化文本.py
@@ -0,0 +1,20 @@
+import textwrap
+import os
+
+if __name__ == '__main__':
+    words = ' '.join(["look", "into", "my", "eyes", "look", "into", "my", "eyes",
+        "the", "eyes", "the", "eyes", "the", "eyes", "not", "around", "the",
+        "eyes", "don't", "look", "around", "the", "eyes", "look", "into",
+        "my", "eyes", "you're", "under"])
+    print(words)
+    # Printed as is, the text is one very long line; textwrap.fill() re-flows it to a maximum width.
+    print(textwrap.fill(words, 70))
+    print(textwrap.fill(words, 40))
+    # shutil.get_terminal_size() falls back to 80 columns where os.get_terminal_size() raises OSError (stdout not a terminal).
+    import shutil
+    t_width = shutil.get_terminal_size().columns
+    print(t_width)
+    print(textwrap.fill(words, t_width))
+
+    # 用的应该不多，在需要显示长字符串的时候去查textwrap库就行
+
--- a/2.字符串和文本/17.处理HTML和XML文件.py
+++ b/2.字符串和文本/17.处理HTML和XML文件.py
@@ -0,0 +1,4 @@
+
+# 这本书的年代有些久远，当时HTML和XML解析器还不是特别完善
+
+# 在现在的工具中，想要处理HTML和XML文件，只需要寻找HTML Parser和XML解析库就行
--- a/2.字符串和文本/18.文本分词.py
+++ b/2.字符串和文本/18.文本分词.py
@@ -0,0 +1,37 @@
+import re
+from collections import namedtuple
+
+if __name__ == "__main__":
+    text = 'foo = 23 + 42 * 10'
+    # Goal: turn the string above into a stream of (type, value) pairs such as
+    # tokens = [('NAME', 'foo'), ('EQ', '='), ('NUM', '23'), ('PLUS', '+'),
+    #           ('NUM', '42'), ('TIMES', '*'), ('NUM', '10')]
+    # Step one: a named capture group per token type.
+    NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
+    NUM = r'(?P<NUM>\d+)'
+    PLUS = r'(?P<PLUS>\+)'
+    TIMES = r'(?P<TIMES>\*)'
+    EQ = r'(?P<EQ>=)'
+    WS = r'(?P<WS>\s+)'
+    # Join the groups into one alternation and compile it once.
+    master_pat = re.compile('|'.join((NAME, NUM, PLUS, TIMES, EQ, WS)))
+
+    # Lightweight record for one token.
+    Token = namedtuple('Token', 'type value')
+
+    def generate_tokens(pat, text):
+        """Yield a Token per consecutive match of pat; stops at the first position nothing matches."""
+        next_match = pat.scanner(text).match
+        # Two-argument iter() keeps calling next_match() until it returns None.
+        for found in iter(next_match, None):
+            yield Token(found.lastgroup, found.group())
+
+    # Whitespace tokens carry no meaning here, so skip them while printing.
+    for tok in generate_tokens(master_pat, text):
+        if tok.type != 'WS':
+            print(tok)
+
+
+    # 看起来很简单是吧，唯一的难点就在compile的时候，
+    # 要把大词放前面先搜完再搜小词，保证较长的模式先匹配；比如<=的顺序就应该比<靠前
+    
--- a/2.字符串和文本/19.递归下降解析器.py
+++ b/2.字符串和文本/19.递归下降解析器.py
@@ -0,0 +1,3 @@
+"""
+我的评价是去看编译原理的文本解析部分
+"""
--- a/2.字符串和文本/2.字符串首尾文字匹配.py
+++ b/2.字符串和文本/2.字符串首尾文字匹配.py
@@ -0,0 +1,21 @@
+from numpy.core.defchararray import startswith
+
+if __name__ == '__main__':
+    url1 = "https://www.baidu.com"   # renamed from `str`, which shadowed the built-in type
+    url2 = "http://www.baidu.cn"
+    text3 = "suka"
+    # To test the beginning of a string:
+    is_start_with_https = url1.startswith("https")
+    print(is_start_with_https)
+    # startswith() returns a bool,
+    # True here because the prefix matches.
+
+    # endswith() does the same for the end of the string.
+    is_end_with_dotcom = url1.endswith(".com")
+    print(is_end_with_dotcom)
+
+    # Pass a tuple to accept any of several prefixes or suffixes (a list is rejected with TypeError).
+    is_url = [url for url in [url1, url2, text3] if url.startswith(('http', 'https'))]
+    print(is_url)
+
+    # 当然，复杂的操作还请使用正则表达式进行匹配，不过简单的检查用这个方法那是又快又好
--- a/2.字符串和文本/20.在字节串上执行文本操作.py
+++ b/2.字符串和文本/20.在字节串上执行文本操作.py
@@ -0,0 +1,33 @@
+import re
+
+from pyasn1.codec.ber.decoder import decode
+
+if __name__ == '__main__':
+    data = b'Hello World'
+    # bytes objects support most of the operations that str does.
+    data05 = data[0:5]
+    print(data05)
+
+    is_startwith_hello = data.startswith(b'Hello')
+    print(is_startwith_hello)
+
+    data_sp = data.split()
+    print(data_sp)
+
+    data_suka = data.replace(b'Hello', b'Hello Suka')
+    print(data_suka)
+
+    # With re, pattern and subject must be the same type: a str pattern on bytes raises TypeError.
+    data = b'foo:bar,spam'
+    try:
+        re.split('[:,]', data)
+    except TypeError:
+        print("匹配错误，换字节串匹配")
+        data_bsp = re.split(b'[:,]', data)
+        print(data_bsp)
+
+    # Indexing bytes yields an int rather than a character; decode to str to get text.
+    print(data[0])
+    print(data.decode("ascii")[0])
+
+    # 我的建议是除非万不得已，请不要用字节串处理文本，统一使用ascii解码成字符串后再进行操作
--- a/2.字符串和文本/3.shell通配符做字符串匹配.py
+++ b/2.字符串和文本/3.shell通配符做字符串匹配.py
@@ -0,0 +1,26 @@
+from fnmatch import fnmatch, fnmatchcase
+
+if __name__ == '__main__':
+    # fnmatch() and fnmatchcase() test a string against a shell-style wildcard pattern.
+    star_hit = fnmatch("suka.txt", "*.txt")
+    print(star_hit)
+    question_hit = fnmatchcase("suka.txt", "??ka.txt")
+    print(question_hit)
+
+    # fnmatchcase() always compares case-sensitively; fnmatch() first normalises the case of both arguments
+    # following the operating system's rules (case-insensitive on Windows, case-sensitive on POSIX systems).
+
+    addresses = [
+        '5412 N CLARK ST',
+        '1060 W ADDISON ST',
+        '1039 W GRANVILLE AVE',
+        '2122 N CLARK ST',
+        '4802 N BROADWAY',
+    ]
+
+    # Wildcards work for filtering arbitrary strings, not just file names.
+    street_rows = [row for row in addresses if fnmatchcase(row, "* ST")]
+    print(street_rows)
+    clark_54xx = [row for row in addresses if fnmatchcase(row, '54[0-9][0-9] *CLARK*')]
+    print(clark_54xx)
+    
--- a/2.字符串和文本/4.使用正则对文本进行匹配和查找.py
+++ b/2.字符串和文本/4.使用正则对文本进行匹配和查找.py
@@ -0,0 +1,49 @@
+import re
+
+if __name__ == '__main__':
+    # For plain literal checks, three str methods are all that is needed:
+    url = "http://www.baidu.com"
+    url.startswith("http")
+    url.endswith(".com")
+    url.find("baidu")
+
+    # Anything more involved calls for regular expressions from the re module.
+    us_date = '11/27/2012'
+    long_date = 'Nov 27, 2012'
+
+    if re.match(r'\d+/\d+/\d+', us_date) is None:
+        print("no")
+    else:
+        print("yes")
+
+    if re.match(r'\d+/\d+/\d+', long_date) is None:
+        print("no")
+    else:
+        print("yes")
+
+    # re.match() is fine for a one-off test; compile the pattern when it is reused.
+    date_re = re.compile(r'\d+/\d+/\d+')
+
+    if date_re.match(us_date) is None:
+        print("no")
+    else:
+        print("yes")
+
+    if date_re.match(long_date) is None:
+        print("no")
+    else:
+        print("yes")
+
+    # match() only tries at the start of the string; use findall() to pick matches out of surrounding text.
+
+    # Capture groups make each part of the match available separately.
+    date_parts_re = re.compile(r'(\d+)/(\d+)/(\d+)')
+    hit = date_parts_re.match(us_date)
+    print(hit.group(0))
+    print(hit.group(1))
+    print(hit.group(2))
+    print(hit.group(3))
+
+    # match() anchors the start but not the end; append $ when the whole string must match.
+    exact_date_re = re.compile(r'(\d+)/(\d+)/(\d+)$')
+    
--- a/2.字符串和文本/5.查找和替换文本.py
+++ b/2.字符串和文本/5.查找和替换文本.py
@@ -0,0 +1,22 @@
+import re
+
+if __name__ == "__main__":
+    chatter = "yeah, but no, but yeah, but no, but yeah"
+
+    # A literal substitution needs nothing more than str.replace().
+    swapped = chatter.replace('yeah', 'yep')
+    print(swapped)
+
+    # Pattern-based substitution is re.sub(); \N in the replacement refers to capture group N.
+    notice = "Today is 11/27/2012. PyCon starts 3/12/2013"
+    swapped = re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', notice)
+    print(swapped)
+
+    # Compile the pattern first when it will be applied more than once.
+    date_re = re.compile(r'(\d+)/(\d+)/(\d+)')
+    print(date_re.sub(r'\3-\1-\2', notice))
+
+    # subn() also reports how many substitutions were made:
+    # it returns the tuple (new_string, count).
+    print(date_re.subn(r'\3-\1-\2', notice))
+
--- a/2.字符串和文本/6.不区分大小写的对文本进行匹配和区分大小写的替换.py
+++ b/2.字符串和文本/6.不区分大小写的对文本进行匹配和区分大小写的替换.py
@@ -0,0 +1,35 @@
+import re
+
+if __name__ == '__main__':
+    text = "UPPER PYTHON, lower python, mixed Python"
+
+    # Pass flags=re.IGNORECASE to match "python" in any letter case.
+
+    pythons = re.findall('python', text, flags=re.IGNORECASE)
+    print(pythons)
+
+    # A plain replacement string cannot follow the case of whatever it replaces:
+    python_replace = re.sub('python', 'snake', text, flags=re.IGNORECASE)
+    print(python_replace)
+    # A callable replacement can; matchcase() builds one for a given word.
+    def matchcase(word):
+        def replace(m):
+            found = m.group()
+            if found.isupper():
+                return word.upper()
+            if found.islower():
+                return word.lower()
+            if found[0].isupper():
+                # capitalize(): first letter upper-case, the rest lower-case
+                return word.capitalize()
+            # Any other mix of cases: use the word as given.
+            return word
+        return replace
+    python_replace_with_func = re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)
+    print(python_replace_with_func)
+    # How the helper works, step by step:
+    # 1. matchcase('snake') runs first and returns the inner function replace.
+    # 2. replace is a closure, so it keeps access to word after matchcase has returned.
+    # 3. For every match, re.sub() calls replace(match_object).
+    # 4. replace() returns word re-cased to follow the matched text.
+    # 5. re.sub() puts that return value in place of the match.
--- a/2.字符串和文本/7.定义实现最短匹配的正则表达式.py
+++ b/2.字符串和文本/7.定义实现最短匹配的正则表达式.py
@@ -0,0 +1,18 @@
+import re
+
+if __name__ == '__main__':
+    # Quantifiers such as * are greedy by default: they take the longest possible match.
+    one_quote = 'Computer says "no".'
+    two_quotes = 'Computer says "yes" and "no".'
+    # That is harmless in simple cases but goes wrong with paired delimiters such as quotes.
+    greedy_quoted = re.compile(r'\"(.*)\"')
+    print(greedy_quoted.findall(one_quote))
+
+    # Here the match runs from the first quote all the way to the last one.
+    print(greedy_quoted.findall(two_quotes))
+
+    # The cause is ".": it matches every character except a newline, quotes included.
+    # The fix is to make the quantifier non-greedy so it takes the shortest match,
+    # which is done by putting ? directly after the *.
+    lazy_quoted = re.compile(r'\"(.*?)\"')
+    print(lazy_quoted.findall(two_quotes))
--- a/2.字符串和文本/8.编写多行模式的正则表达式.py
+++ b/2.字符串和文本/8.编写多行模式的正则表达式.py
@@ -0,0 +1,20 @@
+import re
+
+if __name__ == '__main__':
+    # "." normally matches any character except a newline.
+    text1 = '/*This is a comment*/'
+    text2 = '''/*This is a
+                 multiline comment   */'''
+
+    comment = re.compile(r'/\*(.*?)\*/')
+    print(comment.findall(text1))
+    print(comment.findall(text2))
+    # The second search finds nothing: "." cannot cross the newline inside text2.
+    # (?:.|\n) is a non-capturing group that matches either any character or a newline.
+    comment_pro = re.compile(r'/\*((?:.|\n)*?)\*/')
+    print(comment_pro.findall(text2))
+
+    # In simple cases like this, the re.DOTALL flag does the same job by letting "." match "\n" too.
+    # Both patterns keep the leading "/" so that only a real "/*" opens a comment.
+    comment_pro_se = re.compile(r'/\*(.*?)\*/', re.DOTALL)
+    print(comment_pro_se.findall(text2))
--- a/2.字符串和文本/9.将unicode文本进行统一规范表示.py
+++ b/2.字符串和文本/9.将unicode文本进行统一规范表示.py
@@ -0,0 +1,37 @@
+import unicodedata
+
+if __name__ == '__main__':
+    composed = 'Spicy Jalape\u00f1o'
+    decomposed = 'Spicy Jalapen\u0303o'
+
+    # The same visible text can be encoded with different code point sequences.
+    print(composed, decomposed)
+    print(composed == decomposed)
+
+    # That breaks string comparison, so normalise both strings to one form first.
+
+    # Two canonical forms exist: NFC (fully composed) and NFD (decomposed into base + combining marks).
+    nfd_a = unicodedata.normalize('NFD', composed)
+    nfd_b = unicodedata.normalize('NFD', decomposed)
+    print(nfd_a, nfd_b)
+    print(nfd_a == nfd_b)
+    print(ascii(nfd_a))
+
+    nfc_a = unicodedata.normalize('NFC', composed)
+    nfc_b = unicodedata.normalize('NFC', decomposed)
+    print(ascii(nfc_a))
+
+    # NFKC and NFKD additionally apply compatibility mappings, e.g. splitting the ligature below.
+    ligature = '\ufb01'
+    print(ligature)
+    print(unicodedata.normalize('NFD', ligature))
+    print(unicodedata.normalize('NFKC', ligature))
+    print(unicodedata.normalize('NFKD', ligature))
+
+    # To drop diacritics such as the tilde, decompose with NFD and then discard the combining marks.
+    plain = unicodedata.normalize('NFD', composed)
+    print(plain)
+    plain = ''.join(ch for ch in plain if not unicodedata.combining(ch))
+    print(plain)
+
+    # unicodedata.combining() is non-zero exactly for combining characters, which is what the filter relies on.