selenium和pyquery抓取異步加載數據


 

安裝selenium和pyquery

打開命令行輸入:

pip install selenium

pip install pyquery

chromedriver的下載地址如下:
http://chromedriver.storage.googleapis.com/index.html

 

 

 



from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from pyquery import PyQuery as pq
import time

# Open a WebDriver instance for the requested browser.
def openBrower(brower_type):
    """Start and return a Selenium WebDriver for ``brower_type``.

    The names ``'chrome'``, ``'firefox'``, ``'safari'`` and ``'PhantomJS'``
    are matched exactly (case-sensitive). Any other value falls back to an
    Internet Explorer driver.
    """
    if brower_type == 'chrome':
        # Chrome is launched with an explicit, machine-specific chromedriver
        # executable path.
        chromedriver_path = "C:/Users/net/PycharmProjects/untitled/venv/Scripts/chromedriver.exe"
        return webdriver.Chrome(chromedriver_path)

    if brower_type == 'firefox':
        return webdriver.Firefox()

    if brower_type == 'safari':
        return webdriver.Safari()

    if brower_type == 'PhantomJS':
        return webdriver.PhantomJS()

    # Unrecognised browser names end up here.
    return webdriver.Ie()

def parse_website():
    """Search JD.com for computer books and print every result page.

    Opens Chrome, types the search keyword into the home-page search box,
    submits it, prints the books on the first result page via ``parse_book``
    and then walks the remaining pages via ``parse_next_page``.

    The browser session is always shut down when the function exits,
    whether it finishes normally or an explicit wait times out.
    """
    browser = openBrower('chrome')
    try:
        # Open the JD home page.
        browser.get("https://www.jd.com")
        # Every explicit wait below gives up after at most 50 seconds.
        wait = WebDriverWait(browser, 50)

        # presence_of_all_elements_located returns a list of matching
        # elements once at least one is present in the DOM; the search box
        # is the first (and expected only) match for the '#key' id selector.
        search_boxes = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#key'))
        )
        # Type the search keyword into the search box.
        search_boxes[0].send_keys('計算機書籍')

        # Wait until the search button is clickable, then click it.
        submit_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '.button'))
        )
        submit_button.click()

        # Scroll to the bottom a few times so that lazily loaded items
        # get a chance to be added to the page.
        for _ in range(0, 3):
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)

        # Total number of result pages, read from the bottom pager.
        total = wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
            )
        )
        # NOTE(review): 'xmlns' is renamed presumably so PyQuery does not
        # treat the document as namespaced XHTML - confirm.
        html = browser.page_source.replace('xmlns', 'another_attr')

        parse_book(1, html)

        for page_num in range(2, int(total[0].text) + 1):
            print('當前第' + str(page_num))
            parse_next_page(page_num, browser, wait)
    finally:
        # Release the browser and its driver process even if a wait timed
        # out or parsing raised.
        browser.quit()

# Move to the next result page and parse it.
def parse_next_page(page_num, browser, wait):
    """Click "next page", wait for the page to load, then print its books.

    Args:
        page_num: Number of the page being navigated to; it is matched
            against the pager's highlighted entry and passed to
            ``parse_book``.
        browser: WebDriver instance currently showing a result page.
        wait: Explicit-wait helper used for all ``until`` conditions here.
    """
    next_locator = (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.pn-next > em')
    wait.until(EC.element_to_be_clickable(next_locator)).click()

    # Scroll to the bottom repeatedly so the remaining items get loaded.
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    for _ in range(3):
        browser.execute_script(scroll_to_bottom)
        time.sleep(10)

    # A page is expected to show 60 items; waiting for the 60th list entry
    # makes sure all of them have been loaded.
    sixtieth_item = (By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")
    wait.until(EC.presence_of_all_elements_located(sixtieth_item))

    # The page turn counts as successful once the pager's current-page
    # entry shows the expected page number.
    current_page = (By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr")
    wait.until(EC.text_to_be_present_in_element(current_page, str(page_num)))

    page_html = browser.page_source.replace('xmlns', 'another_attr')
    parse_book(page_num, page_html)

def parse_book(page,html):
    """Print the details of every product found in a result page.

    For each ``.gl-item`` element the image URL, title, price, review count
    and shop name are printed to standard output.

    Args:
        page: Page number; used only in the header line that is printed.
        html: HTML source of the result page to parse.
    """
    doc = pq(html)
    print('-------------------第' + str(page) + '頁的圖書信息---------------------')
    for item in doc('.gl-item').items():
        img = item.find('img')
        book_img_url = img.attr('data-lazy-img')
        # "done" in data-lazy-img means the real URL has been moved to src.
        # attr() yields None when the attribute is absent, so fall back to
        # src in that case as well.
        if book_img_url is None or book_img_url == "done":
            book_img_url = img.attr('src')
        # Guard against an <img> with neither attribute: concatenating None
        # to a str would raise TypeError and abort the whole page.
        print('圖片地址:' + (book_img_url or ''))

        # Drop <font> nodes inside the title before extracting its text.
        name_node = item('.p-name')
        name_node.find('font').remove()
        book_name = name_node.find('em').text()
        print('書名:' + book_name)

        # Price is the text of the <em> followed by the text of the <i>.
        price_node = item('.p-price')
        price = price_node.find('em').text() + price_node.find('i').text()
        print('價格:' + price)

        commit = item('.p-commit').find('strong').text()
        print('評價數量:' + commit)

        shopnum = item('.p-shopnum').find('a').text()
        print('出版社:' + shopnum)
        print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')

def main():
    """Script entry point: run the JD book search scraper."""
    parse_website()


# Only start scraping when executed as a script, not when imported.
if __name__ == "__main__":
    main()
 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM