Files
Python_CookBook_repo/2.字符串和文本/18.文本分词.py
2025-09-10 16:12:45 +08:00

37 lines
1.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from collections import namedtuple
# One named capture group per token type.  The group name is what ends up in
# ``Token.type`` (via ``match.lastgroup``).
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

# Compile all token patterns into a single alternation.
# The only subtle part is the ORDER of the alternatives: the regex engine
# takes the first alternative that matches, so a longer pattern must be listed
# before any shorter pattern that is a prefix of it (e.g. ``<=`` before ``<``).
master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))

# Lightweight record holding one token: its type name and the matched text.
Token = namedtuple('Token', ['type', 'value'])


def generate_tokens(pat, text, *, strict=False):
    """Yield a ``Token`` for each consecutive match of *pat* in *text*.

    Matching starts at the beginning of *text* and each match must begin
    exactly where the previous one ended.  Whitespace is therefore only
    tolerated if *pat* has a group that matches it; callers filter such
    tokens out afterwards.

    Args:
        pat: Compiled regular expression whose alternatives are named groups.
        text: The string to tokenize.
        strict: When false (the default), scanning stops silently at the
            first position where *pat* does not match.  When true, a
            ``ValueError`` is raised instead if any input is left unscanned.

    Yields:
        ``Token(type, value)`` where ``type`` is the name of the group that
        matched and ``value`` is the matched text.

    Raises:
        ValueError: Only with ``strict=True``, once all matching tokens have
            been yielded, if scanning stopped before the end of *text*.
    """
    scanner = pat.scanner(text)
    end = 0  # offset just past the last successful match
    # ``iter(callable, sentinel)`` keeps calling ``scanner.match()`` until it
    # returns ``None``, i.e. until nothing matches at the current position.
    for m in iter(scanner.match, None):
        end = m.end()
        yield Token(m.lastgroup, m.group())
    if strict and end != len(text):
        raise ValueError(
            f'unexpected character {text[end]!r} at position {end}'
        )


def main():
    """Tokenize a sample expression and print every non-whitespace token."""
    text = 'foo = 23 + 42 * 10'
    # Goal: turn the string above into a token stream like
    # [('NAME', 'foo'), ('EQ', '='), ('NUM', '23'), ('PLUS', '+'),
    #  ('NUM', '42'), ('TIMES', '*'), ('NUM', '10')]
    # Filter by token type to drop the whitespace tokens.
    tokens = (tok for tok in generate_tokens(master_pat, text)
              if tok.type != 'WS')
    for tok in tokens:
        print(tok)


if __name__ == "__main__":
    main()