from xml.etree.ElementTree import iterparse
from urllib.request import urlopen


# When saving memory matters more than raw speed, read the XML document
# incrementally and yield matching elements one at a time.
def parse_and_remove(filename, path):
    # Split the target element path into its parts
    path_parts = path.split('/')
    # Parse the XML iteratively; iterparse accepts a filename or a file-like
    # object and emits (event, element) pairs
    doc = iterparse(filename, events=('start', 'end'))
    # Skip the root element
    next(doc)

    # Stacks tracking the tag names and elements currently open
    tag_stack = []
    elem_stack = []
    for event, elem in doc:
        if event == 'start':
            # Push the element when its start tag is seen
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            # Pop when the end tag is seen.
            # If the current tag path matches the requested path, yield the
            # element and detach it from its parent so its memory can be freed.
            if tag_stack == path_parts:
                yield elem
                elem_stack[-2].remove(elem)
            try:
                tag_stack.pop()
                elem_stack.pop()
            except IndexError:
                pass


if __name__ == '__main__':
    u = urlopen('http://planet.python.org/rss20.xml')
    data = parse_and_remove(u, 'channel/item/title')
    for d in data:
        print(d.text)
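

# A minimal, self-contained sketch of the same generator on an in-memory
# document: iterparse() accepts any file-like object, so io.BytesIO stands in
# here for a large on-disk or network source. The tag names and the
# 'record/name' path are illustrative placeholders, not part of the feed above.
def _demo_in_memory():
    import io
    xml = (b'<records><record><name>alpha</name></record>'
           b'<record><name>beta</name></record></records>')
    # The root tag ('records') is skipped inside parse_and_remove, so the
    # element path starts at 'record'
    for elem in parse_and_remove(io.BytesIO(xml), 'record/name'):
        print(elem.text)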