Files
Python_CookBook_repo/6.数据编码与处理/4.以增量方式解析大型XML文件.py
2025-09-10 16:12:45 +08:00

46 lines
1.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from xml.etree.ElementTree import iterparse
from urllib.request import urlopen
# When saving memory matters more than speed, read the XML document incrementally.
def parse_and_remove(filename, path):
    """Incrementally yield the elements of an XML document that match *path*.

    The document is parsed with ``iterparse`` and each matching element is
    detached from its parent once the consumer asks for the next one, so
    already-processed elements do not accumulate in the tree.

    Args:
        filename: A file name or file object, passed straight to
            ``iterparse``.
        path: Slash-separated tag names leading to the wanted elements,
            relative to the root element (the root's own tag is not part
            of the path), e.g. ``'channel/item/title'``.

    Yields:
        Each fully parsed element whose chain of tags below the root equals
        *path*. The element is removed from its parent when the generator
        is resumed, so use it before advancing.
    """
    # Split the path into the sequence of tags to match.
    path_parts = path.split('/')
    # Parse lazily, receiving both the opening and the closing of each element.
    doc = iterparse(filename, events=('start', 'end'))
    # Consume the root's 'start' event so the root tag is not part of the
    # path, but keep the element: it is the parent of depth-1 matches.
    _, root = next(doc)

    # Parallel stacks describing the currently open elements below the root.
    tag_stack = []
    elem_stack = []
    for event, elem in doc:
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            if tag_stack == path_parts:
                yield elem
                # Detach the element from its parent. For a one-component
                # path the parent is the root, which is not on the stack.
                parent = elem_stack[-2] if len(elem_stack) > 1 else root
                parent.remove(elem)
            # The root's own 'end' event arrives with empty stacks.
            if tag_stack:
                tag_stack.pop()
                elem_stack.pop()
if __name__ == '__main__':
    # Stream the feed straight from the network; the ``with`` block closes
    # the HTTP response once every title has been printed.
    with urlopen('http://planet.python.org/rss20.xml') as u:
        data = parse_and_remove(u, 'channel/item/title')
        for d in data:
            print(d.text)