46 lines
1.3 KiB
Python
46 lines
1.3 KiB
Python
|
from xml.etree.ElementTree import iterparse
|
|||
|
from urllib.request import urlopen
|
|||
|
|
|||
|
|
|||
|
# 在尤其关注节省内存时,牺牲运行速度,使用增量生成的方式读取XML文档
|
|||
|
def parse_and_remove(filename, path):
    """Incrementally parse an XML document and yield elements matching *path*.

    The document is read with ``iterparse`` instead of being loaded in one
    go. Each yielded element is detached from its parent as soon as the
    consumer asks for the next one, which trades speed for lower memory use.

    Args:
        filename: The XML source, handed to ``iterparse`` unchanged.
        path: Slash-separated tag names relative to the root element (the
            root's own tag is not part of the path), for example
            ``'channel/item/title'``.

    Yields:
        Every fully parsed element whose chain of tags below the root equals
        *path*, in document order.
    """
    # Split the requested path into its individual tag names.
    path_parts = path.split('/')

    # Parse the document event by event.
    doc = iterparse(filename, events=('start', 'end'))

    # The first event is the root's 'start'. The root tag is kept out of
    # tag_stack (paths are relative to it), but the root element itself is
    # kept on elem_stack so that a match directly below the root still has a
    # parent to be removed from.
    _, root = next(doc)

    tag_stack = []
    elem_stack = [root]

    for event, elem in doc:
        if event == 'start':
            # Entering an element: record its tag and the element itself.
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            # The element is complete. If the current tag chain is the
            # requested path, hand it out and then drop it from its parent.
            if tag_stack == path_parts:
                yield elem
                elem_stack[-2].remove(elem)
            # Leaving the element. tag_stack is empty only for the root's
            # own 'end' event, which has nothing to pop.
            if tag_stack:
                tag_stack.pop()
                elem_stack.pop()
|
|||
|
|
|||
|
|
|||
|
|
|||
|
if __name__ == '__main__':
    # Print the title of every item in the feed. The response is opened in a
    # ``with`` block so the connection is closed once reading is done; the
    # generator is lazy, so it has to be consumed inside that block.
    with urlopen('http://planet.python.org/rss20.xml') as u:
        for d in parse_and_remove(u, 'channel/item/title'):
            print(d.text)
|