python小白學習記錄 運用lxml的xpath解析html文件

本文轉載自查看原文 2020-02-09 17:36 646 python

 1 from lxml import etree
 2 text = "<div><p>nmsl</p><span>nmsl</span></div>"
 3 def htmlstree(text):
 4     html = etree.HTML(text)
 5     result = etree.tostring(html)
 6     print(result)
 7     return result.decode('utf-8')
 8 #解析html字符串並且會為標簽自動加上<html><body></body></html>
 9 def parseetree():
10     parser = etree.HTMLParser(encoding='utf-8')
11     html = etree.parse("index111.html",parser=parser)
12     result = etree.tostring(html,encoding='utf-8').decode("utf-8")
13     print(result)
14 #解析xml 由於某寫html標簽會不全用普通的xml解析器會出錯 如<br/>  所以要指定html解析器
15 if __name__ == '__main__':
16     parseetree()

以上為etree的使用范例

分別解析了html字符串和html文件

from lxml import etree
def parseetree():
    """Print every <a> element that has both an onclick and an id attribute.

    Reads index111.html with an explicit UTF-8 HTML parser and prints the
    serialised markup of each matching element, one per print call.
    """
    html_parser = etree.HTMLParser(encoding='utf-8')
    document = etree.parse("index111.html", parser=html_parser)
    for anchor in document.xpath("//a[@onclick][@id]"):
        print(etree.tostring(anchor, encoding='utf-8').decode("utf-8"))
def parseetree1():
    """Print the fourth <a> element that has both onclick and id attributes.

    The match list is indexed with [3], so the document must contain at
    least four matching elements for the lookup to succeed.
    """
    document = etree.parse(
        "index111.html",
        parser=etree.HTMLParser(encoding='utf-8'),
    )
    matches = document.xpath("//a[@onclick][@id]")
    fourth = matches[3]  # zero-based index: position 3 is the fourth match
    serialised = etree.tostring(fourth, encoding='utf-8')
    print(serialised.decode("utf-8"))
def parseetree2():
    """Print the id values, then the text, of <a> elements with onclick and id.

    Two XPath queries run in order against index111.html: first the id
    attribute of every matching element, then the text nodes directly
    under those elements. Each result is printed on its own line.
    """
    document = etree.parse(
        "index111.html",
        parser=etree.HTMLParser(encoding='utf-8'),
    )
    queries = (
        "//a[@onclick][@id]/@id",
        "//a[@onclick][@id]/text()",
    )
    for query in queries:
        for value in document.xpath(query):
            print(value)
if __name__ == "__main__":
    # Run the three demos in order, printing a row of asterisks between
    # consecutive demos (but not before the first or after the last).
    for position, demo in enumerate((parseetree, parseetree1, parseetree2)):
        if position:
            print("***************")
        demo()

以上為運用xpath來對html進行解析

以下是運行結果

附：xpath語法參考 https://www.w3school.com.cn/xpath/xpath_syntax.asp

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python 通過lxml遍歷html xpath Python 基於lxml.etree實現xpath查找HTML元素 Python爬蟲 | lxml解析html頁面用lxml解析HTML 用Xpath選擇器解析網頁（lxml） python筆記27-lxml.etree解析html Python網頁解析：BeautifulSoup vs lxml.html Python 之lxml解析模塊 python解析xml之lxml Python爬蟲之Lxml庫與Xpath語法

python小白學習記錄 運用lxml的xpath解析html文件

免責聲明！

python小白學習記錄運用lxml的xpath解析html文件