爬蟲之lxml - etree - xpath的使用


# 解析原理:
# - 獲取頁面源碼數據
# - 實例化一個etree對象,並且將頁面源碼數據加載到該對象中
# - 調用該對象的xpath方法進行指定標簽定位
# - xpath函數必須結合着xpath表達式進行標簽定位和內容捕獲
# xpath表達式:
# - 屬性定位: //div[@class="song"] 找到class屬性值為song的div 返回一個列表
# - 索引層級定位: //div[@class="tang"]/ul/li[2]/a
# - 邏輯運算: //a[@href="" and @class="du"] 並且
# - 模糊匹配: //div[contains(@class, 'ng')] class包含 ng 的div
#            //div[starts-with(@class, 'ta')] class以 ta 開頭的div
# - 取文本: //div[@class="song"]/p[1]/text() div下第一個p標簽的文本內容
#          //div[@class="tang"]//text() div下以及子標簽下的文本內容 返回列表
# - 取屬性: //div[@class="tang"]//a[1]/@href

下面上幾個小案例:

import requests
from lxml import etree

# Scrape the 58.com second-hand housing listing page and write one
# "title:price" line per listing to 58.csv.
url = 'https://bj.58.com/ershoufang/?utm_source=sem-sales-baidu-pc&spm=85077276202.21974091622&utm_campaign=sell&utm_medium=cpc&showpjs=pc_fg'

# Send a desktop-browser User-Agent with the request.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

page_text = requests.get(url=url, headers=headers).text

tree = etree.HTML(page_text)
# xpath() returns a list of Element objects, one per matched <li>.
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

# The context manager closes the file even if processing a row raises.
with open('58.csv', 'w', encoding='utf8') as fp:
    for li in li_list:
        # A leading '.' makes the expression relative to the current <li>.
        titles = li.xpath('./div[2]/h2/a/text()')
        if not titles:
            # xpath() returns an empty list when nothing matches; skip such
            # rows instead of failing with IndexError on [0].
            continue
        # Join every text node under the third <div> into one price string.
        price = ''.join(li.xpath('./div[3]//text()'))
        fp.write(titles[0] + ":" + price + '\n')
print('over')
爬取 58二手房信息
xpath 解析圖片資源

import requests
from lxml import etree

# Download every thumbnail listed on the pic.netbian.com 4K gallery page
# into ./img/, naming each file after the caption text of its entry.
import os

url = "http://pic.netbian.com/4kmeinv/"

# Send a desktop-browser User-Agent with every request.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text

tree = etree.HTML(page_text)
# NOTE: for a local HTML file, etree.parse(<file path>) is the alternative
# the original author recommends.
li_list = tree.xpath('//div[@class="slist"]/ul/li')

# open(..., 'wb') does not create missing directories, so make sure the
# target directory exists before the first download.
os.makedirs('./img', exist_ok=True)

for li in li_list:
    names = li.xpath('./a/b/text()')
    sources = li.xpath('./a/img/@src')
    if not names or not sources:
        # Entry without a caption or an image: nothing to download.
        continue
    # Undo the wrong decoding of the caption: re-encode as ISO-8859-1 and
    # decode as GBK (presumably the page's real encoding -- verify).
    image_name = names[0].encode('iso-8859-1').decode('gbk')
    image_url = 'http://pic.netbian.com' + sources[0]
    image_path = './img/' + image_name + '.jpg'
    # Reuse the same headers as the page request for consistency.
    img = requests.get(image_url, headers=headers).content
    with open(image_path, 'wb') as f:
        f.write(img)
    print(image_path+'下載成功')
圖片怎么爬取呢?
import requests
import base64
from lxml import etree

# Download the images listed on jandan.net/top. The image links are not
# present in plain form: each <span class="img-hash"> holds the
# base64-encoded, scheme-less URL, which is decoded here.
import os

# Send a desktop-browser User-Agent with every request.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
url = 'http://jandan.net/top'
response = requests.get(url=url, headers=headers)
page_text = response.text
tree = etree.HTML(page_text)
code_list = tree.xpath('//span[@class="img-hash"]/text()')

# open(..., 'wb') does not create missing directories, so make sure the
# target directory exists before the first download.
os.makedirs('./jd_img', exist_ok=True)

for img_code in code_list:
    # The decoded value is prefixed with 'http:' to form a full URL.
    img_url = 'http:'+base64.b64decode(img_code).decode()
    # Use the last path segment of the URL as the local file name.
    img_name = img_url.split('/')[-1]
    img_path = f'./jd_img/{img_name}'
    print(img_url)
    # Reuse the same headers as the page request for consistency.
    content = requests.get(img_url, headers=headers).content
    with open(img_path, 'wb') as f:
        f.write(content)
    print(img_name+'成功')
print('over')
有的時候我找不到我要的圖片鏈接呀

上面是煎蛋網采用了js的方法對圖片鏈接地址進行了base64的加密

# 簡歷模板爬取(ip禁用問題)
# 解決方法:
#       ip代理,
#       請求頭中添加Connection字段:close
import requests
import random
from lxml import etree

# Download the free resume templates listed on sc.chinaz.com: visit each
# template's detail page, pick one of its download links at random and
# save the archive under ./簡歷模板/.
import os

url = 'http://sc.chinaz.com/jianli/free.html'

headers = {
          # Close the connection right after each request completes. Per the
          # original author, the change may not take effect immediately; if
          # an HTTPConnectionPool error appears, re-run the script.
          'Connection': 'close',
          'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
a_list = tree.xpath('//div[@id="container"]/div/a[1]')

# open(..., 'wb') does not create missing directories, so make sure the
# target directory exists before the first download.
os.makedirs('./簡歷模板', exist_ok=True)

for a in a_list:
    alts = a.xpath('./img/@alt')
    hrefs = a.xpath('./@href')
    if not alts or not hrefs:
        # Anchor without a title image or a link: nothing to follow.
        continue
    # Undo the wrong decoding of the title: re-encode as ISO-8859-1 and
    # decode as UTF-8.
    title = alts[0].encode('iso-8859-1').decode('utf-8')
    detail_url = hrefs[0]
    detail_text = requests.get(url=detail_url, headers=headers).text
    d_tree = etree.HTML(detail_text)
    down_url_list = d_tree.xpath('//div[@class="down_wrap"]//li/a/@href')
    if not down_url_list:
        # random.choice() raises IndexError on an empty sequence; skip
        # templates whose detail page exposes no download link.
        continue
    # Pick one of the download links at random.
    down_url = random.choice(down_url_list)
    data = requests.get(down_url,headers=headers).content
    with open(f'./簡歷模板/{title}.rar', 'wb') as f:
        f.write(data)
    print(title+'完成')
print('over')
站長之家模板資源爬取下載

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM