37 lines
1.4 KiB
Python
37 lines
1.4 KiB
Python
|
import re
|
|||
|
from collections import namedtuple
|
|||
|
|
|||
|
# Goal: split a string such as 'foo = 23 + 42 * 10' into typed tokens:
#   [('NAME', 'foo'), ('EQ', '='), ('NUM', '23'), ('PLUS', '+'),
#    ('NUM', '42'), ('TIMES', '*'), ('NUM', '10')]

# One named capture group per token kind; the group name is reported as the
# token type via ``Match.lastgroup``.
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

# Compile all token patterns into a single alternation.
# Order matters: alternatives are tried left to right and the first one that
# matches wins, so a longer pattern must be listed before any pattern that is
# a prefix of it (e.g. a '<=' token would have to come before '<').
master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))

# Lightweight record describing one token: its type name and matched text.
Token = namedtuple('Token', ['type', 'value'])


def generate_tokens(pat, text):
    """Lazily yield a ``Token`` for each consecutive match of *pat* in *text*.

    Args:
        pat: Compiled regular expression whose alternatives are named groups;
            the name of the group that matched becomes ``Token.type``.
        text: The string to tokenize.

    Yields:
        ``Token(type, value)`` tuples in input order. Whitespace tokens are
        yielded like any other; filtering them out is left to the caller.

    Note:
        Scanning stops silently at the first position where *pat* does not
        match, so any remaining input is dropped without an error. Every
        character that may legally appear in *text* (whitespace included)
        therefore needs a corresponding pattern.
    """
    scanner = pat.scanner(text)
    # iter(callable, sentinel) keeps calling scanner.match() -- each call
    # resumes where the previous match ended -- until it returns None.
    for m in iter(scanner.match, None):
        yield Token(m.lastgroup, m.group())


def main():
    """Tokenize a sample expression and print every non-whitespace token."""
    text = 'foo = 23 + 42 * 10'
    # Drop whitespace tokens; only the meaningful ones are printed.
    tokens = (
        tok for tok in generate_tokens(master_pat, text) if tok.type != 'WS'
    )
    for tok in tokens:
        print(tok)


if __name__ == "__main__":
    main()
|
|||
|
|