由於 PhantomJS 已經停止更新,所以改用 Chrome 瀏覽器的 headless(無界面)模式代替,代碼如下:
from selenium import webdriver

# Configure Chrome to run without a visible window (headless mode).
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')

browser = webdriver.Chrome(options=chrome_options)
try:
    browser.get('https://www.baidu.com/')
    print(browser.current_url)
finally:
    # Always shut the driver down; otherwise the headless Chrome process
    # keeps running after the script exits.
    browser.quit()
爬取淘寶的代碼:
別人的代碼:
import re

from pymongo import MongoClient
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

client = MongoClient()
db = client['MONGO_DB']
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)


def search():
    """Open the Taobao home page, search for 'ipad' and scrape the first page.

    Returns:
        The text of the pager's "total" element, from which the caller
        extracts the page count.

    A timed-out attempt is reported and the whole sequence is retried.  The
    retry is a loop rather than recursion so a long outage cannot exhaust
    the interpreter's recursion limit.
    """
    while True:
        try:
            browser.get('https://www.taobao.com')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#J_TSearchForm > div.search-button > button')))
            search_box.send_keys('ipad')
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.total')))
            get_products()
            return total.text
        except TimeoutException:
            print('timeout!')


def next_page(page_num):
    """Jump to result page ``page_num`` via the pager form and scrape it.

    The page number is typed into the pager's input box and submitted; the
    jump is considered done once the highlighted pager item shows
    ``page_num``.  A timeout restarts the whole jump.
    """
    while True:
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > input')))
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > '
                     'span.btn.J_Submit')))
            page_box.clear()
            page_box.send_keys(str(page_num))
            submit.click()
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > ul > '
                     'li.item.active > span'),
                    str(page_num)))
            get_products()
            return
        except TimeoutException:
            # Same retry policy as search(): try again until it succeeds.
            continue


def get_products():
    """Parse every product on the current result page and store each one."""
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal-count text
            # (presumably a fixed textual suffix -- verify against the page).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert one product document into the 'MONGO_DB' collection.

    Failures are reported together with the exception and do not abort
    the crawl.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported way to store a single document.
        db['MONGO_DB'].insert_one(result)
        print('存儲成功', result)
    except Exception as exc:
        print('存儲失敗', result, exc)


def main():
    """Scrape page 1 through the last page reported by the pager."""
    try:
        total_text = search()
        # r'\d' matches a digit; the first run of digits is the page count.
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError(
                'no page count found in pager text: %r' % total_text)
        total = int(match.group(1))
        for i in range(2, total + 1):
            next_page(i)
    finally:
        # Release the Chrome process whether or not the crawl succeeded.
        browser.quit()


if __name__ == '__main__':
    main()
崔老師的代碼:
from urllib.parse import quote

from pymongo import MongoClient
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

KEYWORD = 'iPad'
MAX_PAGE = 100

MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_COLLECTION = 'products'

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)

client = MongoClient(MONGO_URL)
db = client[MONGO_DB]


def index_page(page):
    """Load the search results for KEYWORD, jump to ``page`` and scrape it.

    :param page: 1-based result page number

    The search URL is reloaded on every call; for pages after the first the
    number is typed into the pager form and submitted.  A timeout restarts
    the whole sequence (as a loop, so repeated timeouts cannot hit the
    recursion limit).
    """
    while True:
        try:
            browser.get('https://s.taobao.com/search?q=' + quote(KEYWORD))
            if page > 1:
                page_box = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR,
                         '#mainsrp-pager div.form > span.btn.J_Submit')))
                page_box.clear()
                page_box.send_keys(str(page))
                submit.click()
            # Page 1 is already highlighted after a fresh load, so this wait
            # is valid for every page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'),
                    str(page)))
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '.m-itemlist .items .item')))
            get_products()
            return
        except TimeoutException:
            continue


def get_products():
    """Extract the product data from the current page and store each item."""
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Save one product document to MongoDB.

    :param result: the product dict to store

    Failures are reported together with the exception and do not abort
    the crawl.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; use insert_one().
        db[MONGO_COLLECTION].insert_one(result)
        print('存儲到MongoDB成功')
    except Exception as exc:
        print('存儲到MongoDB失敗', exc)


if __name__ == '__main__':
    try:
        for i in range(1, MAX_PAGE + 1):
            index_page(i)
    finally:
        # Release the Chrome process whether or not the crawl succeeded.
        browser.quit()
其他人幫助的代碼:
import os
import random
import time
from urllib.parse import quote

import openpyxl
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()
base_url = 'https://s.taobao.com/search?q='
keywords = 'ipad'
url = base_url + quote(keywords)
wait = WebDriverWait(browser, 15)
page_max = 100


def log_out(browser):
    """Sign in through the Weibo login option of the login page.

    Despite its name this performs a login: it switches to the static
    login form, chooses Weibo login, fills in the user name and password
    and submits.  The 'xxx' values are placeholders for real credentials.
    """
    login_switch = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, 'div.login-switch i#J_Quick2Static')))
    login_switch.click()
    weibo_login = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.weibo-login')))
    weibo_login.click()
    username = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.inp.username input')))
    password = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.inp.password input')))
    username.send_keys('xxx')
    password.send_keys('xxx')
    submit = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, 'span[node-type="submitStates"]')))
    submit.click()


def get_page(page):
    """Navigate to result page ``page`` and return the browser.

    Page 1 loads the search URL (signing in first when the login page is
    shown); later pages are reached through the pager's jump form.

    Returns:
        The module-level ``browser`` once the item list is present.

    The original retried by calling itself without returning the result,
    so any retry made the caller receive ``None``.  Retrying in a loop
    guarantees that the browser is always returned.
    """
    while True:
        print('正在打印 %d 頁' % page)
        try:
            if page == 1:
                browser.get(url)
                # NOTE(review): this marker must match the login page text
                # exactly as the site renders it -- confirm.
                if '手機掃碼,安全登錄' in browser.page_source:
                    log_out(browser)
            else:
                page_box = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, 'input.J_Input')))
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, 'span.J_Submit')))
                page_box.clear()
                page_box.send_keys(str(page))
                submit.click()
                wait.until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR,
                         'div#mainsrp-pager ul.items li.item.active span'),
                        str(page)))
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m-itemlist')))
            if browser.page_source is not None:
                return browser
        except TimeoutException:
            # Fall through to the next loop iteration and try again.
            continue


def get_products(browser):
    """Yield one ``[image, price, deal, title, shop, location]`` row per item."""
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        image = item.find('.pic .img').attr('data-src')
        price = item.find('.price').text().replace('\n', '')
        deal = item.find('.deal-cnt').text()
        title = item.find('.title').text()
        shop = item.find('.shop').text()
        location = item.find('.location').text().replace(' ', '')
        yield [image, price, deal, title, shop, location]


def save(out):
    """Append the rows in ``out`` to the keyword's Excel workbook.

    The workbook (with a header row) is created on first use.
    """
    filename = 'taobao_' + keywords + '.xlsx'
    if not os.path.exists(filename):
        workbook = openpyxl.Workbook()
        sheet = workbook.create_sheet(index=0, title=keywords)
        sheet.append(['圖片', '價格', '成交人數', '商品', '店鋪', '地點'])
        workbook.save(filename)
    workbook = openpyxl.load_workbook(filename)
    sheet = workbook[keywords]
    for row in out:
        print(row)
        sheet.append(row)
    workbook.save(filename)


def main():
    """Crawl pages 1..page_max, saving each page and pausing between pages."""
    try:
        for page in range(1, page_max + 1):
            driver = get_page(page)
            save(get_products(driver))
            # Random pause between pages.
            time.sleep(random.randint(1, 5))
    finally:
        # Release the Chrome process whether or not the crawl succeeded.
        browser.quit()


if __name__ == '__main__':
    main()
自己的代碼:
from pymongo import MongoClient
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Create the MongoDB client, database and collection handles.
client = MongoClient()
db = client['taobao']
collection = db['taobao']

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
max_page = 100

# Locator of the highlighted (current) page number in the pager.
_ACTIVE_PAGE = (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span')


def _open_search_results():
    """Open the Taobao home page and submit a search for 'ipad'."""
    browser.get('https://www.taobao.com')
    search_box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
    submit = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
    search_box.send_keys('ipad')
    submit.click()


def _advance_to(page):
    """Click the "next page" link unless the pager already shows ``page``."""
    # Checking first makes a retry after a timeout safe: if the earlier
    # click did take effect, clicking again would skip a page.
    shown = browser.find_elements(*_ACTIVE_PAGE)
    if any(element.text.strip() == str(page) for element in shown):
        return
    # find_element_by_link_text() was removed in Selenium 4; a By.LINK_TEXT
    # locator is the supported form.
    # NOTE(review): the link text must match the site's rendering exactly.
    next_link = wait.until(
        EC.element_to_be_clickable((By.LINK_TEXT, '下一頁')))
    next_link.click()
    wait.until(EC.text_to_be_present_in_element(_ACTIVE_PAGE, str(page)))


def index_page(page=1):
    """Bring result page ``page`` into the browser and scrape it.

    Page 1 runs the search from the home page; every later page is reached
    by clicking the "next page" link, so pages must be requested in
    ascending order.  The original took no parameter although it was called
    with one, and re-ran the search on every call.

    A timeout is reported and the step is retried.
    """
    while True:
        try:
            if page == 1:
                _open_search_results()
            else:
                _advance_to(page)
            print('正在爬取第', page, '頁')
            get_products()
            return
        except TimeoutException:
            print('time out!')


# Extract the product data.
def get_products():
    """Parse every product on the current result page and store each one."""
    # The selector needs the '#' id prefix and descendant spaces; without
    # them it matches nothing.
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic a img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        save_to_mogo(product)


# Save to MongoDB.
def save_to_mogo(result):
    """Insert one product document into the 'taobao' collection.

    Failures are reported together with the exception and do not abort
    the crawl.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; use insert_one().
        collection.insert_one(result)
        print('保存成功', result)
    except Exception as exc:
        print('保存失敗', result, exc)


if __name__ == '__main__':
    try:
        # Start at page 1 so the search is performed before any paging.
        for page in range(1, max_page + 1):
            index_page(page)
    finally:
        # Release the Chrome process whether or not the crawl succeeded.
        browser.quit()