Selenium+Chrome抓取淘宝数据


在学习了网易云课堂上崔庆才老师的Python3爬虫三大案例实战分享之后模仿了一段代码,PhantomJS和MongoDB还没学,暂时没放进去,用pandas代替。

 1 from selenium import webdriver
 2 from selenium.common.exceptions import TimeoutException
 3 from selenium.webdriver.common.by import By
 4 from selenium.webdriver.support.ui import WebDriverWait
 5 from selenium.webdriver.support import expected_conditions as EC
 6 import re
 7 from pyquery import PyQuery as pq
 8 import pandas as pd
 9 
# Shared Selenium Chrome driver, a 10-second explicit wait bound to it,
# and the module-level accumulator that collects one product dict per item
# across every scraped result page.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
totaldata = []
def search():
    """Open Taobao, submit a search for '鸡蛋', scrape page 1, and return
    the text of the total-pages element (e.g. '共 100 页,').

    On any Selenium timeout the whole flow is restarted from the home page.
    """
    global totaldata
    try:
        browser.get('https://www.taobao.com')
        search_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
        )
        search_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button"))
        )
        search_box.send_keys('鸡蛋')
        search_button.click()
        # The pager's "total" element only appears once results have loaded.
        total_element = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total"))
        )
        totaldata.extend(get_products())
        return total_element.text
    except TimeoutException:
        # NOTE(review): unbounded retry recursion — a persistently slow page
        # will recurse forever; consider a retry cap.
        return search()
32 
def next_page(page_number):
    """Jump to result page *page_number* via the pager's input box and
    scrape that page into the module-level accumulator.

    Retries the same page on a Selenium timeout.
    """
    global totaldata
    try:
        page_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input"))
        )
        go_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit"))
        )
        page_box.clear()
        page_box.send_keys(page_number)
        go_button.click()
        # Confirm navigation: the highlighted pager item must show the
        # requested page number before scraping.
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_number),
            )
        )
        totaldata.extend(get_products())
    except TimeoutException:
        # NOTE(review): unbounded retry recursion on repeated timeouts.
        return next_page(page_number)
49 
def get_products():
    """Parse the currently loaded results page and return a list of
    product dicts (image URL, price, deal count, title, shop, location).
    """
    # Wait until at least one item node exists before reading the DOM.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    doc = pq(browser.page_source)
    return [
        {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text().replace('\n', ''),
            # Strip the trailing '人付款' suffix from the deal count.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text().replace('\n', ''),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        for item in doc('#mainsrp-itemlist .items .item').items()
    ]
67 
def main():
    """Scrape every result page for the query and write them to
    'taobaoeggs.xlsx'.

    Fixes over the original:
    - search() was called twice, scraping page 1 twice and duplicating
      its rows in totaldata; it is now called once.
    - the page-count regex uses a raw string ('\\d' in a non-raw string
      is an invalid escape sequence).
    - the Chrome driver is now always released via a finally block.
    """
    try:
        total_text = search()
        # Extract the integer page count from text like '共 100 页,'.
        total = int(re.compile(r'(\d+)').search(total_text).group(1))
        for page in range(2, total + 1):
            next_page(page)
        df = pd.DataFrame(totaldata)
        df.to_excel('taobaoeggs.xlsx')
    finally:
        # Release the browser even if scraping or the export fails.
        browser.quit()

if __name__ == '__main__':
    main()

 




免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM