[Python爬蟲] 之三：Selenium 調用IEDriverServer 抓取數據

本文轉載自查看原文 2017-03-23 11:20 2886 [Python爬蟲] 之三：Selenium 調用IEDriverServer 抓取數據/ python Selenium+phantomjs爬蟲

接著上一篇，在用 Selenium+phantomjs 抓取數據的過程中發現有時候會抓取不到數據，所以又測試了用 Selenium+瀏覽器驅動（IEDriverServer）的方式，具體代碼如下：

#coding=utf-8
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
import IniFile
class IEDriverCrawler(object):
    '''
    Crawl paginated listing pages through Internet Explorer via Selenium.

    The IEDriverServer.exe path and the optional per-page delay are read from
    ``config.conf`` in the current working directory.  Both crawl methods quit
    the browser when they finish (or fail), so one instance serves one crawl.
    '''

    def __init__(self):
        '''
        Read the configuration file and start an IE WebDriver session.
        '''
        # The IEDriverServer.exe path comes from the configuration file.
        configfile = os.path.join(os.getcwd(), 'config.conf')
        cf = IniFile.ConfigFile(configfile)
        IEDriverServer = cf.GetValue("section", "IEDriverServer")
        # Seconds to wait between page fetches; stays at 5 when not configured.
        self.pageDelay = 5
        pageInteralDelay = cf.GetValue("section", "pageInteralDelay")
        if pageInteralDelay:
            self.pageDelay = int(pageInteralDelay)

        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    @staticmethod
    def _nativeStr(text):
        '''
        Return ``text`` as the interpreter's native ``str`` type.

        Python 2 ``unicode`` is encoded to UTF-8 and Python 3 ``bytes`` is
        decoded from UTF-8, so the value can be searched with, and formatted
        into, the plain string literals used by this class.
        '''
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode('utf8')
        return text.encode('utf8')

    def _elementText(self, elementId):
        '''
        Return the text of the element with id ``elementId`` as a native str.
        '''
        return self._nativeStr(self.driver.find_element_by_id(elementId).text)

    def CatchData(self, id, firstUrl, nextUrl, restUrl, lastPage=3):
        '''
        Print the text of one element on the first page and on follow-up pages.

        Follow-up URLs are built as ``nextUrl + str(page) + restUrl`` for the
        pages 2..lastPage.  The browser is always quit before returning.

        :param id: id attribute of the element whose text is printed
        :param firstUrl: URL of the first page
        :param nextUrl: URL prefix placed before the page number
        :param restUrl: URL suffix placed after the page number
        :param lastPage: number of the last page to fetch (default 3)
        :return: None
        '''
        try:
            # Load the first page and print its title and the element text.
            self.driver.get(firstUrl)
            print(self.driver.title)
            print(self._elementText(id))
            print(' ')
            for page in range(2, lastPage + 1):
                print(' ')
                # Honour the configured per-page delay, once per page.
                time.sleep(self.pageDelay)
                self.driver.get(nextUrl + str(page) + restUrl)
                print(self._elementText(id))
        finally:
            # quit() closes every window and ends the driver process, even
            # when a page load or an element lookup above raised.
            self.driver.quit()

    def CatchDatabyClickNextButton(self, id, firstUrl, maxPages=10):
        '''
        Print item data page by page, following the "next page" link.

        The page total is taken from the first number found in the text of the
        element named ``filterPageForm`` and is capped at ``maxPages``.  The
        crawl stops early when no ``ui-page-next`` link is present.  The
        browser is always quit before returning, after which a summary
        (delay, pages fetched, elapsed seconds) is printed.

        :param id: id attribute of the element whose text is printed
        :param firstUrl: URL of the first page
        :param maxPages: upper bound on the number of pages to fetch (default 10)
        :return: None
        '''
        # Wall-clock timing; time.clock() is platform-dependent and was
        # removed in Python 3.8.
        start = time.time()
        fetched = 0
        try:
            # Load the first page and print its title and items.
            self.driver.get(firstUrl)
            print(self.driver.title)
            self.printTxt(1, self._elementText(id))
            fetched = 1

            # Work out how many pages there are.
            totalPageElement = self.driver.find_element_by_name('filterPageForm')
            numbers = re.findall(r'\d+', self._nativeStr(totalPageElement.text))
            pageCount = int(numbers[0]) if numbers else 1
            # Cap the crawl instead of forcing a fixed count, so short result
            # sets are not walked past their last page.
            pageCount = min(pageCount, maxPages)

            for index in range(2, pageCount + 1):
                time.sleep(self.pageDelay)
                nextLinks = self.driver.find_elements_by_xpath(
                    "//a[@class='ui-page-next']")
                if not nextLinks:
                    # No further page is linked; stop rather than raise.
                    break
                self.driver.get(nextLinks[0].get_attribute('href'))
                txt = self._elementText(id)
                print(' ')
                self.printTxt(index, txt)
                fetched = index
        finally:
            self.driver.quit()

        elapsed = time.time() - start
        print(' ')
        print("抓取每頁數據后延遲 %d 秒" % self.pageDelay)
        print("總共抓取了 %d頁數據" % fetched)
        print("整個過程用時間: %f 秒" % elapsed)

    def printTxt(self, pageIndex, stringTxt):
        '''
        Print the items contained in one page of scraped text.

        The text is split on the '¥' sign; every chunk that has at least four
        newline-separated fields is printed as unit price, brand, shop name
        and sales volume.  Nothing is printed when the text has no '¥'.

        :param pageIndex: page number shown in the heading
        :param stringTxt: scraped text of the page (native str, or the other
                          text type, which is converted as UTF-8)
        :return: None
        '''
        stringTxt = self._nativeStr(stringTxt)
        if '¥' not in stringTxt:
            return
        print('第' + str(pageIndex) + '頁數據')
        print(' ')
        for item in stringTxt.split('¥'):
            its = item.split('\n')
            # Chunks with fewer than four fields (e.g. the text before the
            # first price) are not complete items.
            if len(its) >= 4:
                print('單價：        ¥%s' % its[0])
                print('品牌：        %s' % its[1])
                print('銷售店鋪名稱： %s' % its[2])
                print('成交量：      %s' % its[3])
                print(' ')

#測試抓取淘寶數據
# obj = IEDriverCrawler()
# firstUrl = "https://ai.taobao.com/search/index.htm?pid=mm_26632323_6762370_25910879&unid=&source_id=search&key=%E6%89%8B%E6%9C%BA&b=sousuo_ssk&clk1=&prepvid=200_11.251.246.148_396_1490081427029&spm=a231o.7712113%2Fa.a3342.1"
# nextUrl='https://ai.taobao.com/search/index.htm?pid=mm_26632323_6762370_25910879&unid=&source_id=search&key=%E6%89%8B%E6%9C%BA&b=sousuo_ssk&clk1=&prepvid=200_11.251.246.157_19825_1490081412211&spm=a231o.7076277.1998559105.1&page='
# # url='https://ai.taobao.com/search/index.htm?pid=mm_26632323_6762370_25910879&unid=&source_id=search&key=%E6%89%8B%E6%9C%BA&b=sousuo_ssk&clk1=&prepvid=200_11.251.246.148_396_1490081427029&spm=a231o.7712113%2Fa.a3342.1&page=2&pagesize=120'
# # url='https://ai.taobao.com/search/index.htm?pid=mm_26632323_6762370_25910879&unid=&source_id=search&key=%E6%89%8B%E6%9C%BA&b=sousuo_ssk&clk1=&prepvid=200_11.251.246.148_396_1490081427029&spm=a231o.7712113%2Fa.a3342.1&page=3&pagesize=120'
# restUrl = '&pagesize=120'
# obj.CatchData("J_albumFlowCon",firstUrl,nextUrl,restUrl)

# Test run: crawl the Tmall search-result listing.
# Guarded so that merely importing this module does not start Internet Explorer.
if __name__ == '__main__':
    obj = IEDriverCrawler()
    firstUrl = "https://list.tmall.com/search_product.htm?q=%CA%D6%BB%FA&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton"
    obj.CatchDatabyClickNextButton("J_ItemList", firstUrl)

本文章僅僅作為交流。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 [Python爬蟲] 之四：Selenium 抓取微博數據使用selenium爬蟲抓取數據 python 爬蟲抓取亞馬遜數據 python爬蟲(一)_爬蟲原理和數據抓取 Python開發爬蟲之動態網頁抓取篇：爬取博客評論數據——通過Selenium模擬瀏覽器抓取網頁爬蟲--python3.6+selenium+BeautifulSoup實現動態網頁的數據抓取，適用於對抓取頻率不高的情況 Python爬蟲實例（二）使用selenium抓取斗魚直播平台數據 [Python爬蟲] 之二十五：Selenium +phantomjs 利用 pyquery抓取今日頭條網數據 Selenium 調用IEDriverServer打開IE瀏覽器 Selenium 調用IEDriverServer打開IE瀏覽器