@
目錄
前言
本章同樣是解析網頁,不過使用的解析技術為XPath。
相對於之前的BeautifulSoup,我感覺XPath還行,也是比較常用的一種解析方式,並且更加符合我們之前的邏輯思維,不過還是看情況吧,看各位準備怎麼用。
XPath的使用方法
同樣地,先安裝lxml庫,並且導入裡面的etree
"""
XPath的學習
"""
from lxml import etree
# Sample markup queried by every example below (note: the tags are left unclosed).
html_doc = """
<div>
<ul>
<li class="item-0"><a href="www.baidu.com">baidu</a>
<li class="item-1 one" name="first"><a href="https://blog.csdn.net/qq_25343557">myblog</a>
<li class="item-1 two" name="first"><a href="https://blog.csdn.net/qq_25343557">myblog2</a>
<li class="item-2"><a href="https://www.csdn.net/">csdn</a>
<li class="item-3"><a href="https://hao.360.cn/?a1004">bbb</a>
<li class="aaa"><a href="https://hao.360.cn/?a1004">aaa</a>
"""
html = etree.HTML(html_doc)


def _show(expression):
    """Evaluate *expression* against the parsed sample tree and print the result."""
    print(html.xpath(expression))


# --- Basic usage ---
# 1. Every <a> that is a child of any <li>.
_show("//li/a")
# 2. The <li> whose class attribute is exactly 'item-0'.
_show("//li[@class='item-0']")
# 3. The <a> child of that 'item-0' <li>.
_show("//li[@class='item-0']/a")
# 4. The text content of that <a>.
_show("//li[@class='item-0']/a/text()")

# --- Advanced usage ---
# 1. Match on an attribute prefix: class starts with 'item-' (starts-with()).
_show("//li[starts-with(@class,'item-')]")
# 2. Match on a substring: class contains 'item-1' (contains()).
_show("//li[contains(@class,'item-1')]")
# 3. Combine several attribute conditions (and).
_show("//li[contains(@class,'one') and contains(@name,'first')]/a/text()")
# 4. Select by position.
# The second <li>.
_show("//li[2]/a/text()")
# The last <li>.
_show("//li[last()]/a/text()")
# The one before the last <li>.
_show("//li[last()-1]/a/text()")
# Every <li> whose position is 3 or less.
_show("//li[position()<=3]/a/text()")
XPath爬取數據
"""
案例:爬取《51job》相關職位信息,並保存成CSV文件格式
"""
import requests
from lxml import etree
import csv
import time
# Request headers sent with every page fetch.
headers = {
    "User-Agent": "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14"
}

# Search-result URL template; ``{page}`` is filled with the page number.
# NOTE: the ``degreefrom`` parameter had been corrupted to "°reefrom"
# (the "&deg" prefix was rendered as an HTML entity); it is restored here.
SEARCH_URL = (
    "https://search.51job.com/list/020000,000000,0000,00,9,99,python,2,{page}.html"
    "?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
    "&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1"
    "&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line="
    "&specialarea=00&from=&welfare="
)


def _cell(values):
    """Collapse a list of XPath string results into one CSV cell value.

    Each string is stripped of surrounding whitespace, empty pieces are
    dropped, and the rest are joined with a single space. An empty result
    list yields "".
    """
    return " ".join(piece for piece in (v.strip() for v in values) if piece)


# ``with`` closes (and flushes) the CSV file even if a request raises.
# The encoding is explicit so the output does not depend on the platform default.
with open("Python職位.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(['編號', '職位名稱', '公司名稱', '薪資', '地址', '發布時間'])
    i = 1  # running row number across all pages
    for page in range(1, 159):
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(
            SEARCH_URL.format(page=page), headers=headers, timeout=10
        )
        response.encoding = "gbk"  # decode the body as GBK
        if response.status_code == 200:
            html = etree.HTML(response.text)
            # Skip the first four matches — presumably non-listing header
            # rows; TODO confirm against the live page markup.
            els = html.xpath("//div[@class='el']")[4:]
            for el in els:
                jobname = _cell(el.xpath("p[contains(@class,'t1')]/span/a/@title"))
                jobcom = _cell(el.xpath("span[@class='t2']/a/@title"))
                jobaddress = _cell(el.xpath("span[@class='t3']/text()"))
                jobmoney = _cell(el.xpath("span[@class='t4']/text()"))
                jobdate = _cell(el.xpath("span[@class='t5']/text()"))
                # Column order follows the header row: salary before address.
                writer.writerow([i, jobname, jobcom, jobmoney, jobaddress, jobdate])
                i += 1
            print(f"第{page}頁獲取完畢")
后言
多學一種解析網頁的方式多一種選擇