一、XPath常用規則





二、解析 HTML 文件
from lxml import etree


def parse_html_file():
    """Parse ``./test.html`` with lxml's HTML parser and print the tree.

    The file is read relative to the current working directory; the
    serialized document is decoded as UTF-8 before printing.
    """
    html = etree.parse("./test.html", parser=etree.HTMLParser())
    print(etree.tostring(html).decode("utf-8"))
    # Sample recorded next to this call in the original (presumably the
    # printed result -- confirm against your own test.html):
    #
    # <!DOCTYPE html>
    # <html lang="en">
    # <head>
    #     <meta charset="UTF-8"/>
    #     <title>Title</title>
    # </head>
    # <body>
    #     <h1>yangs</h1>
    # </body>
    # </html>


def get_text_node(text):
    """Print the text of the second ``<li>`` under any ``<ul>``.

    Both XPath spellings (``position()=2`` and the ``[2]`` shorthand) are
    evaluated to show that they select the same node.
    """
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//ul/li[position()=2]/text()"))  # ['你好!!!']
    print(html.xpath("//ul/li[2]/text()"))  # ['你好!!!']


def get_all_node(text):
    """Print every element in the parsed document (``//*``)."""
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//*"))
    # [<Element html at 0x20be0903f48>, <Element body at 0x20be0910048>,
    #  <Element div at 0x20be0910088>, <Element ul at 0x20be09100c8>,
    #  <Element li at 0x20be0910108>, <Element a at 0x20be0910188>,
    #  <Element li at 0x20be09101c8>, <Element li at 0x20be0910208>,
    #  <Element span at 0x20be0910248>]


def get_children_node(text):
    """Print the ``<a>`` elements that are children of ``div/ul/li``."""
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//div/ul/li/a"))  # [<Element a at 0x1e15740e108>]


def get_parent_node(text):
    """Print the parent element of every ``<a>`` (``//a/..``)."""
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//a/.."))
    # [<Element li at 0x28a7d2ae108>, <Element li at 0x28a7d2ae208>]


def math_attr(text):
    """Print the text of the ``<a>`` whose ``href`` equals ``2.html``.

    NOTE(review): the name looks like a typo for ``match_attr``; it is kept
    unchanged so existing callers keep working.
    """
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//a[@href='2.html']/text()"))  # ['hello world']


def get_attr(text):
    """Print the ``href`` attribute value of every ``<a>``."""
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//a/@href"))  # ['1.html', '2.html']


def match_more_attr(text):
    """Print link text under ``<li>`` elements whose class contains ``aaa``.

    ``contains()`` is used instead of ``=`` because the ``class`` attribute
    may hold several space-separated values.
    """
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//li[contains(@class, 'aaa')]/a/text()"))  # ['yangs']


if __name__ == '__main__':
    # Sample markup that the expected outputs noted above correspond to.
    # NOTE(review): no demo function is invoked in the visible source; call
    # the one you want to try, e.g. ``get_text_node(text)``.
    text = '''
    <div>
        <ul>
            <li class="aaa last-li"><a href="1.html">yangs</a></li>
            <li>你好!!!</li>
            <li class="last-li"><a href="2.html">hello world</a></li>
        </ul>
    </div>
    '''
三、去哪兒網 HTML 抓取案例
import requests
from lxml import etree

# (key in the returned dict, attribute read from each result <div>)
_SIGHT_FIELDS = (
    ("name", "data-sight-name"),
    ("districts", "data-districts"),
    ("point", "data-point"),
    ("address", "data-address"),
    ("img_url", "data-sight-img-u-r-l"),
)


def _extract_sights(root):
    """Return one dict per ``div.sight_item`` element found under *root*."""
    # Reading the attributes off each element keeps the fields of one item
    # together even when an item lacks one of them (the value is then None).
    return [
        {key: item.get(attr) for key, attr in _SIGHT_FIELDS}
        for item in root.xpath("//div[@class='sight_item']")
    ]


def go_where(keyword, timeout=10):
    """Search the Qunar ticket listing for *keyword* and return the results.

    Args:
        keyword: Search term; sent as the ``keyword`` query parameter so that
            it is URL-encoded by ``requests``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``name``, ``districts``, ``point``,
        ``address`` and ``img_url`` (one per ``div.sight_item`` on the page),
        or ``None`` when the request or the parsing fails; the error is
        printed in that case.
    """
    url = "https://piao.qunar.com/ticket/list.htm"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }

    try:
        response = requests.get(
            url,
            params={"keyword": keyword},
            headers=headers,
            timeout=timeout,
        )
        html = response.content.decode("utf-8")
    except (requests.RequestException, UnicodeDecodeError) as e:
        # Without this early return the parsing step below would run with
        # ``html`` unbound.
        print(e)
        return None

    try:
        root = etree.HTML(html, parser=etree.HTMLParser())
    except (etree.LxmlError, ValueError) as e:
        print(e)
        return None

    if root is None:
        # Nothing parseable in the response body, hence no results.
        return []
    return _extract_sights(root)


if __name__ == '__main__':
    data = go_where("溫州")
    print(data)
    # Sample output recorded by the author (first of 11 entries shown):
    # [{'name': '雁盪山', 'districts': '浙江·溫州·樂清市',
    #   'point': '121.095868,28.352028',
    #   'address': '浙江省溫州樂清市雁盪鎮雁山路88號',
    #   'img_url': 'https://imgs.qunarzz.com/sight/p0/1604/70/7094d064511234be90.img.jpg_280x200_03cb9d77.jpg'},
    #  ...]
如果有人對我的案例代碼有優化建議,歡迎發給我。
