6、通過xpath獲取網頁數據

本文為轉載（查看原文） 2018-03-21 16:45 1255 python / python爬蟲

1、xpath解析網頁源文件

from urllib import request
from lxml import etree
# Example 1: fetch a product listing page and print the text selected by an
# XPath expression.

# URL of the page to request.
url = "http://www.dfenqi.cn/Product/Index"
# Request headers; a browser-style User-Agent is sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
}
# Build the request object.
req = request.Request(url, headers=headers)
# Create the HTTP handler.
httpHandler = request.HTTPHandler()
# Build an opener around that handler.
opener = request.build_opener(httpHandler)
# Send the request. The with-block closes the response (and its connection)
# as soon as the body has been read, instead of leaving it open.
with opener.open(req) as response:
    # Read the page source and decode it as UTF-8.
    html = response.read().decode('utf-8')
# XPath selecting the text of each <p> inside the 'liebiao' list items.
xpath = "//div[@class='liebiao']/ul/li/p/text()"
# To collect the class attribute values of those <p> elements instead, use:
# xpath = "//div[@class='liebiao']/ul/li/p/@class"
# Parse the HTML into an element tree that can be queried.
selector = etree.HTML(html)
# Run the XPath query; the result is a list of matches.
goodsList = selector.xpath(xpath)
# Print each product title.
for goods in goodsList:
    print(goods)

2、xpath解析源文件，並下載圖片至本地

from urllib import request
from lxml import etree
import os

class Spilder:
    """Fetch a web page, extract values from it with XPath, and save images.

    Every request is sent through one opener with the same headers.
    """

    def __init__(self, pageUrl):
        """
        Store the page URL and prepare the request machinery.

        :param pageUrl: URL of the page to crawl
        """
        # URL of the page to crawl.
        self.pageUrl = pageUrl
        # Request headers; a browser-style User-Agent is sent with each request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
        }
        # HTTP handler used to build the opener.
        self.httpHandler = request.HTTPHandler()
        # Opener through which every request is sent.
        self.opener = request.build_opener(self.httpHandler)

    def _fetch(self, url):
        """
        Request a URL with the shared headers and return the response body.

        :param url: URL to request
        :return: raw response body as bytes
        """
        req = request.Request(url, headers=self.headers)
        # The with-block closes the response once the body has been read.
        with self.opener.open(req) as response:
            return response.read()

    def loadPage(self):
        """
        Request the page at ``self.pageUrl``.

        :return: page source as raw bytes
        """
        return self._fetch(self.pageUrl)

    def getImageUrls(self, html, xpath):
        """
        Parse the page source and evaluate an XPath expression against it.

        :param html: page source
        :param xpath: XPath expression string
        :return: list of XPath matches
        """
        selector = etree.HTML(html)
        return selector.xpath(xpath)

    def loadImage(self, url):
        """
        Download an image.

        :param url: image URL
        :return: image data as raw bytes
        """
        return self._fetch(url)

    def writeImage(self, img, imgName):
        """
        Write image data into an ``image`` sub-folder of the current working
        directory, creating the folder if it does not exist yet.

        :param img: image data (bytes)
        :param imgName: file name to save the image under
        :return: None
        """
        folderName = os.path.join(os.path.abspath(os.curdir), "image")
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(folderName, exist_ok=True)
        # Write through the same path that was just created, rather than a
        # separately spelled relative path.
        with open(os.path.join(folderName, imgName), 'wb') as f:
            f.write(img)

if __name__ == "__main__":
    # Fetch the product listing page.
    url = "http://www.dfenqi.cn/Product/Index"
    spilder = Spilder(url)
    html = spilder.loadPage()
    # XPath selecting the src attribute of each <img> in the 'liebiao' list.
    xpath = "//div[@class='liebiao']/ul/li/div/a/img/@src"
    imgUrls = spilder.getImageUrls(html, xpath)
    # Number the saved files starting at 1. The loop uses its own name so
    # that `url` keeps referring to the page URL.
    for index, imgUrl in enumerate(imgUrls, start=1):
        img = spilder.loadImage(imgUrl)
        spilder.writeImage(img, 'img%s.jpg' % index)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找：[PHP] xpath提取網頁數據內容；使用XPath爬取網頁數據；uniCloud爬蟲獲取網頁數據；使用HtmlUnit動態獲取網頁數據；VB中獲取網頁數據；獲取豆瓣電影數據（R與API獲取網頁數據）；使用Xpath從網頁中獲取數據；數據抽樣與R實現、獲取網頁數據；JAVA 爬蟲獲取js動態生成的網頁數據；Java 獲取網頁數據的一般步驟和方式