大部分商業網站需要我們登錄后才能爬取內容,所以對於爬蟲來說,生成 cookies 供代理使用就成了一件必不可少的事情。今天我們交流一下使用 selenium 訪問目標網站時遇到的一些問題。
因為業務需求我們需要采集小紅書的一些數據,程序在掛上代理訪問目標網站的時候彈出了驗證框,如圖所示:

這個問題從來沒有遇到過,我以為是代理的問題,咨詢客服后才知道這是因為我的瀏覽器驅動和瀏覽器版本不匹配,更新到新版本后就解決了。下面我們分享一下使用 chrome driver 進行登錄和 cookie 生成的方法。
import os import time import zipfile from selenium import webdriver from selenium.common.exceptions import TimeoutException from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait class GenCookies(object): USER_AGENT = open('useragents.txt').readlines() # 16yun 代理配置 PROXY_HOST = 't.16yun.cn' # proxy or host PROXY_PORT = 31111 # port PROXY_USER = 'USERNAME' # username PROXY_PASS = 'PASSWORD' # password @classmethod def get_chromedriver(cls, use_proxy=False, user_agent=None): manifest_json = """ { "version": "1.0.0", "manifest_version": 2, "name": "Chrome Proxy", "permissions": [ "proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking" ], "background": { "scripts": ["background.js"] }, "minimum_chrome_version":"22.0.0" } """ background_js = """ var config = { mode: "fixed_servers", rules: { singleProxy: { scheme: "http", host: "%s", port: parseInt(%s) }, bypassList: ["localhost"] } }; chrome.proxy.settings.set({value: config, scope: "regular"}, function() {}); function callbackFn(details) { return { authCredentials: { username: "%s", password: "%s" } }; } chrome.webRequest.onAuthRequired.addListener( callbackFn, {urls: ["<all_urls>"]}, ['blocking'] ); """ % (cls.PROXY_HOST, cls.PROXY_PORT, cls.PROXY_USER, cls.PROXY_PASS) path = os.path.dirname(os.path.abspath(__file__)) chrome_options = webdriver.ChromeOptions() if use_proxy: pluginfile = 'proxy_auth_plugin.zip' with zipfile.ZipFile(pluginfile, 'w') as zp: zp.writestr("manifest.json", manifest_json) zp.writestr("background.js", background_js) chrome_options.add_extension(pluginfile) 