任務1:利用cookie可以免去登錄的煩惱(驗證碼)
'''
Fetch a Taobao search-result page by reusing a logged-in cookie.

With a cookie captured after login, the request skips the login step and
its captcha. Such a cookie can be obtained by logging in through a
third-party account (Weibo) with Selenium, which avoids Taobao's own
slider captcha.
'''
from urllib.parse import urlencode

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    # Cookie string taken from a logged-in session.
    'cookie': 'xxx',
}
# Query-string parameters of the search request; 'q' is the search keyword.
params = {
    'q': 'iphone',
    'imgfile': '',
    'commend': 'all',
    'ssid': 's5-e',
    'search_type': 'item',
    'sourceId': 'tb.index',
    'spm': 'a21bo.2017.201856-taobao-item.2',
    'ie': 'utf8',
    'initiative_id': 'tbindexz_20170306',
}
url = 'https://s.taobao.com/search?' + urlencode(params)
s = requests.Session()
# Certificate verification stays at its default (enabled): this request
# carries a logged-in cookie, so it must not be sent over an unverified
# TLS connection. The timeout keeps the script from hanging forever when
# the server does not answer.
response = s.get(url, headers=headers, timeout=10).text
print(response)
任務2:爬取淘寶商品信息
from selenium import webdriver
# Generic locator strategies
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Timeouts and other driver-level errors
from selenium.common.exceptions import TimeoutException, WebDriverException
from lxml import etree
import json
import random
import time

browser = webdriver.Chrome()
browser.set_window_size(1400, 900)
# Explicit wait (10 s) shared by the page helpers below.
wait = WebDriverWait(browser, 10)
# Number of result pages main() tries to crawl.
MAX_PACE = 100

# Author's note: around page 19 Taobao pops up the familiar slider captcha.
# Crawling further requires solving that slider; the overall approach is
# otherwise sound.


def taobao_login():
    '''
    Placeholder for a direct Taobao login; currently does nothing.

    Taobao's slider captcha could not be passed, so the third-party
    (Weibo) login in weibo_login() is used instead. The abandoned attempt
    is kept below, disabled, for reference.
    '''
    # # Need to log in first
    # login = browser.find_element(By.ID,'J_Quick2Static').click()
    # username = browser.find_element(By.CSS_SELECTOR,'#TPL_username_1')
    # username.send_keys('XXX')
    # password = browser.find_element(By.CSS_SELECTOR,'#TPL_password_1')
    # password.send_keys('XXX')
    # button = browser.find_element(By.ID, 'J_SubmitStatic').click()
    # ActionChains(browser).move_by_offset(random.randint(10, 60), random.randint(10, 60)).perform()
    # # Check the size
    # input2 = browser.find_element(By.ID,'nc_1__scale_text')
    # print(input2.size)
    # # Slider captcha
    # action = ActionChains(browser)
    # source = browser.find_element(By.ID,'nc_1_n1z')
    # # Press and hold
    # action.click_and_hold(source).perform()
    # # Offset to drag by
    # action.move_by_offset(298,0)
    # # Release the mouse
    # action.release().perform()


def weibo_login():
    '''
    Log in to Taobao through a Weibo account.

    The normal username/password form shows a slider captcha, and login
    fails even when the slider succeeds, so the third-party Weibo login is
    used to get around it. The Weibo account must already be registered
    and bound to a Taobao account.
    '''
    weibo_button = browser.find_element(By.CSS_SELECTOR, '.weibo-login')
    weibo_button.click()
    # The network is slow, so pause before looking for the form fields.
    # After several logins in a row Weibo answers with a "network timeout,
    # try again later (600002)" message and a cool-down is needed.
    time.sleep(2)
    username = browser.find_element(By.NAME, 'username')
    username.send_keys('賬號')
    time.sleep(1)
    password = browser.find_element(By.NAME, 'password')
    password.send_keys('密碼')
    browser.find_element(By.CSS_SELECTOR, '.W_btn_g').click()


def index_page(page, retries=3):
    '''
    Open result page number `page` and scrape its products.

    For page > 1 the page number is typed into the pager's input box and
    submitted. The function then waits until the highlighted pager entry
    shows `page` and at least one product item is present before calling
    get_products().

    :param page: 1-based page number to crawl.
    :param retries: how many extra attempts are made after a
        TimeoutException. Once they are used up the page is skipped, so a
        page that never loads cannot recurse or loop forever.
    '''
    for attempt in range(retries + 1):
        print('正在抓取第', page, '頁')
        try:
            if page > 1:
                # Wait until the page-number input box is present.
                input_box = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                # Wait until the confirm button can be clicked.
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                input_box.clear()
                input_box.send_keys(str(page))
                submit.click()
            # The jump succeeded once the highlighted page number shows
            # the requested page.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            # Wait until product items have been loaded.
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '.m-itemlist .items .item')))
            # Scrape the details.
            get_products()
            return
        except TimeoutException:
            if attempt == retries:
                # Out of attempts: skip this page and let the crawl go on.
                print('第', page, '頁多次超時,已跳過')


def get_products():
    '''
    Parse the current page source and append every product to product.json.

    Each product is printed and written as one JSON object followed by
    ",\\n". Every field holds the list returned by its XPath query.
    '''
    html = etree.HTML(browser.page_source)
    items = html.xpath('//div[@class="m-itemlist"]//div[@class="items"]/div')
    if not items:
        # Nothing to write; do not create or touch the output file.
        return
    # Open the output once per page instead of once per product.
    # newline='\n' keeps the written bytes identical on every platform.
    with open('product.json', 'a', encoding='utf-8', newline='\n') as f:
        for item in items:
            product = {
                'image': item.xpath('.//img/@data-src'),
                'price': item.xpath('.//strong/text()'),
                'title': item.xpath('.//img[@class="J_ItemPic img"]/@alt'),
                'shop': item.xpath('.//a[@class="shopname J_MouseEneterLeave J_ShopInfo"]/span[2]/text()'),
                'location': item.xpath('.//div[@class="location"]/text()'),
            }
            print(product)
            f.write(json.dumps(product, ensure_ascii=False) + ',\n')


def main(login_attempts=3):
    '''
    Search Taobao for "iphone", log in via Weibo, then crawl the results.

    :param login_attempts: how many times the search + login sequence is
        tried before the last driver error is re-raised.
    '''
    try:
        for attempt in range(1, login_attempts + 1):
            try:
                browser.get('http://www.taobao.com')
                print(browser.window_handles)
                search_box = browser.find_element(By.ID, 'q')
                # Search keyword
                search_box.send_keys('iphone')
                search_box.send_keys(Keys.ENTER)
                browser.find_element(By.ID, 'J_Quick2Static').click()
                time.sleep(1)
                # Handle the login
                weibo_login()
                break
            except WebDriverException:
                # Retry a bounded number of times. Unbounded recursion made
                # every failed attempt crawl all pages again afterwards.
                if attempt == login_attempts:
                    raise
        for i in range(1, MAX_PACE + 1):
            index_page(i)
    finally:
        # Always shut the browser down so no driver process is left behind.
        browser.quit()


if __name__ == '__main__':
    main()
總結:
1.學會利用登錄後的cookie繞過登錄驗證碼
2.學會通過第三方賬號(例如微博)登錄需要爬取的網站
3.淘寶的滑動解鎖(真的麻煩)
4.通過selenium模擬點擊來爬取網站;雖然只爬到20頁左右(約第19頁時淘寶會彈出滑動驗證)就中斷了,但整體思路沒問題
淘寶必殺:滑動三連
小知識:
import json

# Tip: serialise a record as one JSON line and store it as UTF-8 bytes.
a = dict(name="123", age=123)
# ensure_ascii=False keeps non-ASCII characters as-is instead of \uXXXX
# escapes; the trailing ",\n" separates successive records.
text = "".join([json.dumps(a, ensure_ascii=False), ",\n"])
# Binary mode: the text is encoded to UTF-8 explicitly before writing.
with open('1234.json', 'wb') as f:
    f.write(text.encode('utf-8'))