I. Defining a downloader middleware that sets a random User-Agent
1. Flesh out the code in middlewares.py
```python
import random
from Tencent.settings import USER_AGENTS_LIST  # mind the import path; ignore PyCharm's error hint

class UserAgentMiddleware(object):
    def process_request(self, request, spider):
        user_agent = random.choice(USER_AGENTS_LIST)
        request.headers['User-Agent'] = user_agent
        # no return statement: returning None lets the request continue through the chain

class CheckUA:
    def process_response(self, request, response, spider):
        print(request.headers['User-Agent'])
        return response  # must not be omitted: process_response has to return the response
```
2. Enable the custom downloader middlewares in settings; the setup works the same way as for pipelines
```python
DOWNLOADER_MIDDLEWARES = {
    'Tencent.middlewares.UserAgentMiddleware': 543,  # 543 is the priority value
    'Tencent.middlewares.CheckUA': 600,  # the middleware at 543 runs first, then the one at 600
}
```
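Scrapy also ships its own `scrapy.downloadermiddlewares.useragent.UserAgentMiddleware` at priority 500. Since 543 > 500, our middleware's `process_request` runs later and overwrites whatever the built-in one set, but you can also switch the built-in one off explicitly; a minimal sketch (mapping a middleware to `None` disables it):

```python
DOWNLOADER_MIDDLEWARES = {
    # disable Scrapy's built-in User-Agent middleware so only ours touches the header
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'Tencent.middlewares.UserAgentMiddleware': 543,
    'Tencent.middlewares.CheckUA': 600,
}
```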
3. Add the User-Agent list to settings
```python
USER_AGENTS_LIST = [
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
]
```
II. Using proxy IPs
1. Flesh out the code in middlewares.py
```python
import base64
import random

from Tencent.settings import PROXY_LIST


class RandomProxy(object):

    def process_request(self, request, spider):
        proxy = random.choice(PROXY_LIST)
        print(proxy)

        if 'user_passwd' in proxy:
            # basic auth: encode the credentials; in Python 3 only bytes can be base64-encoded
            b64_up = base64.b64encode(proxy['user_passwd'].encode())
            # authenticate against the proxy
            request.headers['Proxy-Authorization'] = 'Basic ' + b64_up.decode()
            # set the proxy
            request.meta['proxy'] = proxy['ip_port']
        else:
            # set the proxy
            request.meta['proxy'] = proxy['ip_port']
```
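Like the User-Agent middleware in section I, `RandomProxy` only takes effect once it is registered in settings; a sketch assuming the same `Tencent` project, with 543 as an illustrative priority value:

```python
DOWNLOADER_MIDDLEWARES = {
    'Tencent.middlewares.RandomProxy': 543,  # illustrative priority; any free slot works
}
```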
2. Check whether a proxy IP is usable
When proxies are in use, the downloader middleware's process_response() method can check how the proxy performed; if the proxy turns out to be unusable, it can be replaced with another one (a fuller sketch follows the snippet below).
```python
class ProxyMiddleware(object):
    ......
    def process_response(self, request, response, spider):
        if response.status != 200:  # response.status is an int, not a string
            request.dont_filter = True  # lets the re-sent request enter the queue again
            return request  # returning a Request re-schedules it
        return response
```
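The snippet above only re-queues the failed request, so on the retry `process_request` may well pick the same dead proxy again. A slightly fuller sketch that also drops the failing proxy from a runtime copy of the pool (this combined class is an assumption, not the original code):

```python
import random

from Tencent.settings import PROXY_LIST


class ProxyMiddleware(object):
    # work on a runtime copy so dead proxies can be dropped without touching settings
    proxies = list(PROXY_LIST)

    def process_request(self, request, spider):
        request.meta['proxy'] = random.choice(self.proxies)['ip_port']

    def process_response(self, request, response, spider):
        if response.status != 200:
            bad = request.meta.get('proxy')
            # drop the failing proxy; a real pool would also refill itself when empty
            self.proxies = [p for p in self.proxies if p['ip_port'] != bad]
            request.dont_filter = True  # allow the re-sent request back into the queue
            return request  # re-schedule it; a fresh proxy is picked on the next pass
        return response
```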
3. Add the proxy IP list to settings
```python
PROXY_LIST = [
    {"ip_port": "139.199.121.163:16818", "user_passwd": "user:password"},  # paid proxy
    # {"ip_port": "114.234.81.72:9000"}  # free proxy
]
```
III. Using selenium in a middleware
Using GitHub login as the example.
1. Write the spider code
```python
import scrapy

class Login4Spider(scrapy.Spider):
    name = 'login4'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/returnes']  # request the URL that requires login directly

    def parse(self, response):
        with open('check.html', 'w') as f:
            f.write(response.body.decode())
```
2. Use selenium in middlewares.py to obtain cookies (the code below uses the selenium 3 style API)
```python
import time
from selenium import webdriver


def getCookies():
    # log in with selenium, then collect and return the cookies
    username = input('GitHub username: ')
    password = input('GitHub password: ')
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome('/home/worker/Desktop/driver/chromedriver',
                              chrome_options=options)
    driver.get('https://github.com/login')
    time.sleep(1)
    driver.find_element_by_xpath('//*[@id="login_field"]').send_keys(username)
    time.sleep(1)
    driver.find_element_by_xpath('//*[@id="password"]').send_keys(password)
    time.sleep(1)
    driver.find_element_by_xpath('//*[@id="login"]/form/div[3]/input[3]').click()
    time.sleep(2)
    cookies_dict = {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
    driver.quit()
    return cookies_dict


class LoginDownloaderMiddleware(object):

    def process_request(self, request, spider):
        cookies_dict = getCookies()
        print(cookies_dict)
        request.cookies = cookies_dict  # replace the request object's cookies attribute
```
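Note that the cookies assigned here are turned into the outgoing `Cookie` header by Scrapy's built-in `CookiesMiddleware`, which sits at priority 700, so this middleware has to be registered with a lower number to run first; a sketch assuming the same `Tencent` project (543 is again just an illustrative slot):

```python
DOWNLOADER_MIDDLEWARES = {
    # must be < 700 so it runs before scrapy.downloadermiddlewares.cookies.CookiesMiddleware
    'Tencent.middlewares.LoginDownloaderMiddleware': 543,
}
```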
3. Use selenium in middlewares.py to fetch the rendered HTML of selected pages
```python
import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SelMiddleWare(object):

    def process_request(self, request, spider):
        url = request.url
        # only render the requests that need it
        if 'daydata' in url:
            driver = webdriver.Chrome()
            driver.get(url)
            time.sleep(3)

            data = driver.page_source
            driver.close()

            # build a response object from the rendered page source
            res = HtmlResponse(
                url=url,
                body=data,
                encoding='utf-8',
                request=request
            )
            return res
```
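Returning an `HtmlResponse` from `process_request` short-circuits the download: Scrapy skips the remaining `process_request` methods and the downloader itself, and hands the selenium-rendered response straight to the spider. As with the other middlewares, it still needs a settings entry; a sketch assuming the same project layout:

```python
DOWNLOADER_MIDDLEWARES = {
    'Tencent.middlewares.SelMiddleWare': 543,  # illustrative priority value
}
```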