大部分商業網站需要我們登錄后才能爬取內容,所以對於爬蟲來說,生成cookies給代理使用成為了一個必須要做的事情。今天我們交流下關於使用selenium訪問目標網站遇到的一些問題。
因為業務需求我們需要采集小紅書的一些數據,程序在掛上代理訪問目標網站的時候彈出了驗證框。如圖所示

這個問題從來沒有遇到過,我以為是代理的問題,咨詢客服才知道這個是因為我的瀏覽器的驅動和版本的問題,然后更新了新版本就可以解決了。那我們分享下使用chrome driver來進行登錄和cookie的生成。
import os
import time
import zipfile

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class GenCookies(object):
    """Generate login cookies via a Chrome driver routed through an
    authenticated HTTP proxy (configured as a packed Chrome extension).

    NOTE(review): this class appears truncated in the current view — the
    remainder of ``get_chromedriver`` (and any login/cookie methods) is
    not visible here.
    """

    # Pool of User-Agent strings, one per line of useragents.txt.
    # NOTE(review): the file handle is never closed and each entry keeps its
    # trailing newline — confirm callers strip it (e.g. use .strip()).
    USER_AGENT = open('useragents.txt').readlines()

    # 16yun proxy configuration
    PROXY_HOST = 't.16yun.cn'  # proxy or host
    PROXY_PORT = 31111  # port
    PROXY_USER = 'USERNAME'  # username
    PROXY_PASS = 'PASSWORD'  # password

    @classmethod
    def get_chromedriver(cls, use_proxy=False, user_agent=None):
        """Build ChromeOptions, optionally packaging a proxy-auth extension.

        When ``use_proxy`` is True, a minimal Chrome extension (manifest +
        background script) is written into ``proxy_auth_plugin.zip`` and
        loaded into Chrome; the background script sets a fixed proxy server
        and answers the onAuthRequired challenge with the configured
        username/password. This is the standard workaround for Chrome not
        supporting ``user:pass@host`` proxy URLs on the command line.

        :param use_proxy: route traffic through the 16yun proxy when True.
        :param user_agent: custom User-Agent string (handling not visible
            in this chunk — presumably applied further down; TODO confirm).
        """
        # Manifest v2 extension declaring the proxy/webRequest permissions
        # the background script needs.
        manifest_json = """
        {
            "version": "1.0.0",
            "manifest_version": 2,
            "name": "Chrome Proxy",
            "permissions": [
                "proxy",
                "tabs",
                "unlimitedStorage",
                "storage",
                "<all_urls>",
                "webRequest",
                "webRequestBlocking"
            ],
            "background": {
                "scripts": ["background.js"]
            },
            "minimum_chrome_version":"22.0.0"
        }
        """

        # Background script: pin a fixed HTTP proxy (bypassing localhost)
        # and supply credentials via the blocking onAuthRequired listener.
        background_js = """
        var config = {
            mode: "fixed_servers",
            rules: {
                singleProxy: {
                    scheme: "http",
                    host: "%s",
                    port: parseInt(%s)
                },
                bypassList: ["localhost"]
            }
        };

        chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

        function callbackFn(details) {
            return {
                authCredentials: {
                    username: "%s",
                    password: "%s"
                }
            };
        }

        chrome.webRequest.onAuthRequired.addListener(
            callbackFn,
            {urls: ["<all_urls>"]},
            ['blocking']
        );
        """ % (cls.PROXY_HOST, cls.PROXY_PORT, cls.PROXY_USER, cls.PROXY_PASS)

        # Directory of this script (currently unused in the visible code —
        # presumably used later for the driver path; TODO confirm).
        path = os.path.dirname(os.path.abspath(__file__))
        chrome_options = webdriver.ChromeOptions()
        if use_proxy:
            # Pack the two files above into a zip Chrome can load as an
            # extension; written to the current working directory.
            pluginfile = 'proxy_auth_plugin.zip'

            with zipfile.ZipFile(pluginfile, 'w') as zp:
                zp.writestr("manifest.json", manifest_json)
                zp.writestr("background.js", background_js)
            chrome_options.add_extension(pluginfile)
        # NOTE(review): the method is cut off here in this chunk — driver
        # construction and return are outside the visible source.