# Crawler that paginates through JD.com search results (爬蟲實現翻頁功能)
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import json
import csv
import random
# --- Module-level setup --------------------------------------------------
# Chrome driver configured to skip image downloads, which noticeably
# speeds up page loads during the scraping session.
_prefs = {'profile.managed_default_content_settings.images': 2}
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', _prefs)
browser = webdriver.Chrome(options=options)

# Landing page for the crawl.
url = 'https://www.jd.com/'

# Accumulates one dict per scraped product.
data_list = []
def _extract_item(li):
    """Pull the scraped fields from one <li class="gl-item"> search result.

    Returns a dict with keys name/price/comment/shop_name/shop_type; the
    optional fields fall back to None when the sub-element is absent
    (find_elements returns an empty list instead of raising).
    """
    name = li.find_element(By.XPATH, './/div[@class="p-name p-name-type-2"]//em').text
    price = li.find_element(By.XPATH, './/div[@class="p-price"]//i').text
    comment = li.find_elements(By.XPATH, './/div[@class="p-commit"]//a')
    shop_name = li.find_elements(By.CLASS_NAME, 'J_im_icon')
    shop_type = li.find_elements(By.CLASS_NAME, 'goods-icons')
    return {
        'name': name,
        'price': price,
        'comment': comment[0].text if comment else None,
        'shop_name': shop_name[0].text if shop_name else None,
        'shop_type': shop_type[0].text if shop_type else None,
    }


def start_spider():
    """Search JD.com for 'python爬蟲' and scrape every result page.

    Appends one dict per product to the module-level `data_list`; uses the
    module-level `browser` and `url` globals. Pages through the results by
    clicking the "next page" control until the last page is reached.
    """
    browser.get(url)
    # Type the keyword into the search box and submit with ENTER.
    search_box = browser.find_element(By.ID, 'key')
    search_box.send_keys('python爬蟲')
    search_box.send_keys(Keys.ENTER)
    # Explicit wait: the "next page" control appearing means results rendered.
    WebDriverWait(browser, 1000).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'pn-next'))
    )
    # Total number of result pages. int() instead of eval(): never eval()
    # text scraped from a remote page.
    all_page = int(browser.find_element(By.CSS_SELECTOR, 'span.p-skip em b').text)
    print(all_page)
    count = 0  # pages visited so far
    while True:
        count += 1
        try:
            # Wait for the current page's product items to be present.
            WebDriverWait(browser, 1000).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'gl-item'))
            )
            # Scroll to the bottom so lazily-loaded items render, pause a
            # random moment, then scroll back to the top.
            browser.execute_script('document.documentElement.scrollTop=10000')
            time.sleep(random.randint(1, 3))
            browser.execute_script('document.documentElement.scrollTop=0')
            for li in browser.find_elements(By.CLASS_NAME, 'gl-item'):
                data_dict = _extract_item(li)
                data_list.append(data_dict)
                print(data_dict)
        except Exception as e:
            # Best effort: report and fall through so pagination still
            # advances (the original `continue` here skipped both the page
            # check and the next-page click, looping forever on one page).
            print('failed to scrape page', count, ':', e)
        if count == all_page:
            break
        # Click "next page" via its stable class name rather than a brittle
        # positional XPath like //span[1]/a[9].
        browser.find_element(By.CLASS_NAME, 'pn-next').click()
def main():
    """Run the spider, then persist the scraped data as JSON and CSV files."""
    start_spider()
    if not data_list:
        # Nothing was scraped (e.g. the site blocked us); writing would
        # produce an empty JSON file and crash on data_list[0] below.
        print('no data collected')
        return
    # Write the JSON file. 'w' (not 'a+'): appending a second JSON document
    # on re-runs would make the file unparseable.
    with open('data_json.json', 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=4)
    print('json文件寫入完成')
    # Write the CSV file, using the first record's keys as the header row.
    with open('data_csv.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, data_list[0].keys())
        writer.writeheader()
        writer.writerows(data_list)
    print('csv文件寫入完成')
if __name__ == '__main__':
    try:
        main()
    finally:
        # Always close the browser, even if the spider raised.
        browser.quit()