from lxml import etree

text = "<div><p>nmsl</p><span>nmsl</span></div>"


def htmlstree(text):
    """Parse an HTML string and return the serialized document as ``str``.

    ``etree.HTML`` uses lxml's HTML parser, which automatically wraps the
    parsed fragment in ``<html><body>...</body></html>``.

    Args:
        text: HTML markup to parse.

    Returns:
        The serialized tree, decoded from UTF-8.
    """
    html = etree.HTML(text)
    result = etree.tostring(html)
    # ``result`` is still bytes at this point, so this prints the bytes repr.
    print(result)
    return result.decode('utf-8')


def parseetree(path="index111.html"):
    """Parse an HTML file and print its serialized markup.

    ``etree.parse`` uses the XML parser by default, which fails on HTML
    that is not well-formed XML (for example tags that are never closed),
    so an ``HTMLParser`` is passed explicitly.

    Args:
        path: HTML file to parse. Defaults to the original hard-coded
            ``"index111.html"``.
    """
    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.parse(path, parser=parser)
    result = etree.tostring(html, encoding='utf-8').decode("utf-8")
    print(result)


if __name__ == '__main__':
    parseetree()
以上為 etree 的使用範例
分別解析了html字符串和html文件
from lxml import etree

# Selects every <a> element that carries both an onclick and an id attribute.
_ANCHOR_XPATH = "//a[@onclick][@id]"


def _load_html(path):
    """Parse the HTML file at *path* with lxml's HTML parser (UTF-8)."""
    parser = etree.HTMLParser(encoding='utf-8')
    return etree.parse(path, parser=parser)


def _to_markup(element):
    """Serialize *element* to a ``str`` of markup."""
    return etree.tostring(element, encoding='utf-8').decode("utf-8")


def parseetree(path="index111.html"):
    """Print every ``<a>`` element that has both ``onclick`` and ``id``.

    Args:
        path: HTML file to parse. Defaults to the original hard-coded
            ``"index111.html"``.
    """
    for anchor in _load_html(path).xpath(_ANCHOR_XPATH):
        print(_to_markup(anchor))


def parseetree1(path="index111.html", index=3):
    """Print one ``<a>`` element that has both ``onclick`` and ``id``.

    Args:
        path: HTML file to parse.
        index: Position in the list of matches. The default of 3 selects
            the fourth match, as the original code did with ``[3]``.

    Raises:
        IndexError: If *index* is outside the range of matching elements.
    """
    anchors = _load_html(path).xpath(_ANCHOR_XPATH)
    if not -len(anchors) <= index < len(anchors):
        # Same exception type as plain list indexing, with a clearer message.
        raise IndexError(
            f"index {index} out of range: {len(anchors)} element(s) "
            f"match {_ANCHOR_XPATH!r} in {path!r}"
        )
    print(_to_markup(anchors[index]))


def parseetree2(path="index111.html"):
    """Print the ``id`` values, then the text nodes, of the matching anchors.

    The anchors are the ``<a>`` elements that have both ``onclick`` and
    ``id``. All ids are printed first, followed by all direct text nodes.

    Args:
        path: HTML file to parse.
    """
    html = _load_html(path)
    for anchor_id in html.xpath(_ANCHOR_XPATH + "/@id"):
        print(anchor_id)
    for anchor_text in html.xpath(_ANCHOR_XPATH + "/text()"):
        print(anchor_text)


if __name__ == "__main__":
    parseetree()
    print("***************")
    parseetree1()
    print("***************")
    parseetree2()
以上為運用xpath來對html進行解析
以下是運行結果
附:XPath 語法參考 https://www.w3school.com.cn/xpath/xpath_syntax.asp