Python3爬蟲(十五) 代理


 Infi-chu:

http://www.cnblogs.com/Infi-chu/

一、設置代理

1.urllib

# HTTP proxy with urllib: route requests through a local proxy via ProxyHandler.
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

proxy = '127.0.0.1:9743'
# For a proxy that requires authentication, put the credentials first:
# proxy = 'username:password@127.0.0.1:9743'
proxy_handler = ProxyHandler({
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
})
# The opener sends every request it opens through the handler's proxies.
opener = build_opener(proxy_handler)
try:
    res = opener.open('http://httpbin.org/get')
    print(res.read().decode('utf-8'))
except URLError as e:
    # Print the failure reason (e.g. proxy unreachable) instead of a traceback.
    print(e.reason)
# SOCKS5 proxy with urllib: patch the global socket class with PySocks.
import socket
from urllib import request
from urllib.error import URLError

import socks  # pip3 install PySocks

# Every socket created after this point is tunnelled through the SOCKS5 proxy.
socks.set_default_proxy(socks.SOCKS5, '127.0.0.1', 9742)
socket.socket = socks.socksocket
try:
    res = request.urlopen('http://httpbin.org/get')
    print(res.read().decode('utf-8'))
except URLError as e:
    print(e.reason)

2.requests
比urllib簡單

# HTTP proxy with requests: pass a scheme -> proxy URL mapping via `proxies`.
import requests

proxy = '127.0.0.1:9743'
proxies = {
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
}
try:
    res = requests.get('http://httpbin.org/get', proxies=proxies)
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)

# SOCKS5 proxy with requests (1): use a socks5:// proxy URL.
import requests  # pip3 install 'requests[socks]'

proxy = '127.0.0.1:9742'
proxies = {
    'http': 'socks5://' + proxy,
    'https': 'socks5://' + proxy,
}
try:
    res = requests.get('http://httpbin.org/get', proxies=proxies)
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)
# SOCKS5 proxy with requests (2): patch the global socket class with PySocks.
import socket

import requests
import socks  # pip3 install PySocks

# Every socket created after this point is tunnelled through the SOCKS5 proxy,
# so no `proxies` argument is passed to requests here.
socks.set_default_proxy(socks.SOCKS5, '127.0.0.1', 9742)
socket.socket = socks.socksocket
try:
    res = requests.get('http://httpbin.org/get')
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)

3.Selenium
設置瀏覽器代理

# Launch Chrome through an HTTP proxy using the --proxy-server switch.
from selenium import webdriver

proxy = '127.0.0.1:9743'
chrome_options = webdriver.ChromeOptions()  # carries command-line switches for Chrome
chrome_options.add_argument('--proxy-server=http://' + proxy)
# `options=` is the supported keyword; the older `chrome_options=` is deprecated
# and has been removed from current Selenium releases.
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://httpbin.org/get')

設置認證代理

# Launch Chrome through a proxy that requires a username and password.
# Chrome has no command-line switch for proxy credentials, so a small extension
# is generated on the fly: it sets the proxy and answers the auth challenge.
import zipfile

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

ip = '127.0.0.1'
port = 9743
username = 'test'
password = 'test'

# Extension manifest: requests the proxy and webRequest permissions and
# registers background.js as the background script.
manifest_json = """
{
	"version":"1.0.0",
	"manifest_version":2,
	"name":"Chrome Proxy",
	"permissions":[
	"proxy",
	"tabs",
	"unlimitedStorage",
	"storage",
	"<all_urls>",
	"webRequest",
	"webRequestBlocking"
	],
	"background":{"scripts":["background.js"]}
}
"""

# Background script template. The port is substituted unquoted so that it is
# a JavaScript number (chrome.proxy expects an integer port), and the mapping
# below supplies every placeholder, including `password`.
background_js = """
var config={
	mode:"fixed_servers",
	rules:{
		singleProxy:{
			scheme:"http",
			host:"%(ip)s",
			port:%(port)s
		}
	}
}

chrome.proxy.settings.set({value:config,scope:"regular"},function(){});
function callbackFn(details){
	return{
		authCredentials:{
			username:"%(username)s",
			password:"%(password)s"
		}
	}
}	
chrome.webRequest.onAuthRequired.addListener(
	callbackFn,
	{urls:["<all_urls>"]},
	['blocking']
)
""" % {'ip': ip, 'port': port, 'username': username, 'password': password}

# Package the two files as an extension archive. The manifest must be named
# exactly "manifest.json" for Chrome to recognise it.
plugin_file = 'proxy_auth_plugin.zip'
with zipfile.ZipFile(plugin_file, 'w') as zp:
    zp.writestr("manifest.json", manifest_json)
    zp.writestr("background.js", background_js)

chrome_options = Options()
chrome_options.add_argument('--start-maximized')
chrome_options.add_extension(plugin_file)
# `options=` is the supported keyword; the older `chrome_options=` is deprecated
# and has been removed from current Selenium releases.
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://httpbin.org/get')

二、代理池維護
單一代理無法滿足我們的爬取需求,所以需要更多數量的代理為我們服務。
我們將對代理進行篩選,使其高效地為我們提供服務。
1.準備
需要使用redis數據庫,aiohttp、requests、redis-py、pyquery、flask庫
2.代理池的目標:存儲模塊、獲取模塊、檢測模塊、接口模塊
3.各模塊的實現:

https://github.com/Infi-chu/proxypool

三、利用代理爬取微信文章

https://github.com/Infi-chu/weixinspider

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM