設置代理IP只需要自定義一個中間件,重寫process_request方法,並在方法里通過
request.meta['proxy'] = "http://185.82.203.146:1080" 設置代理IP。
注意:需要將自定義的中間件注冊到配置文件(settings.py)里去。
from adc.daili_ip.sh_yong_ip.sh_yong_ip import sui_ji_hq_ip
from fake_useragent import UserAgent #導入瀏覽器用戶代理模塊
class RequestsUserAgentmiddware(object):
    """Middleware that puts a fake_useragent-provided User-Agent on requests.

    The browser family is selected by the ``RANDOM_UA_TYPE`` setting and
    defaults to ``'random'``.
    """

    def __init__(self, crawler):
        """Create the UserAgent provider and read the configured UA type.

        Args:
            crawler: object exposing ``settings.get(name, default)``.
        """
        super(RequestsUserAgentmiddware, self).__init__()
        # fake_useragent object; User-Agent strings are read from it as attributes.
        self.ua = UserAgent()
        # Name of the attribute to read from ``self.ua``.
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler``."""
        return cls(crawler)

    def process_request(self, request, spider):
        """Set a default ``User-Agent`` header on ``request``.

        The User-Agent is resolved exactly once so that the value written to
        the log is the same value offered to the request. ``setdefault`` is
        used, so a header that is already present is left untouched.

        Returns:
            None.
        """
        user_agent = getattr(self.ua, self.ua_type)
        print('啟用用戶代理瀏覽器信息:{0}'.format(user_agent))
        request.headers.setdefault('User-Agent', user_agent)
class MyproxiesSpiderMiddleware(object):
    """Middleware that assigns a proxy fetched from the database to each request."""

    def process_request(self, request, spider):
        """Fetch a proxy address and store it in ``request.meta['proxy']``.

        Returns:
            None.
        """
        # Use the public ``url`` attribute instead of the private
        # ``_get_url()`` accessor, which is not part of the documented API.
        xieyi = request.url
        print(xieyi)
        # NOTE(review): the proxy is always requested for 'http', whatever the
        # scheme of the request URL printed above -- confirm this is intended.
        dai_ip = sui_ji_hq_ip('http')
        request.meta['proxy'] = "http://{0}".format(dai_ip)
從數據庫隨機獲取IP
#!/usr/bin/env python
# -*- coding:utf8 -*-
import time
import requests
from adc.daili_ip.mysq import shujuku as ORM
def _format_timestamp(moment):
    """Render a ``time.struct_time`` as ``'YYYY-MM-DD HH:MM:SS'``."""
    return time.strftime('%Y-%m-%d %H:%M:%S', moment)


def _delete_proxy(mysq, ip):
    """Delete the rows whose ip equals ``ip``, commit, and return the row count."""
    deleted = mysq.query(ORM.daili_ip).filter(ORM.daili_ip.ip == ip).delete()
    mysq.commit()
    return deleted


def suiji_ip(rst, timeout=10):
    """Pick one random proxy from the database and verify it with a live request.

    A row is eligible when its protocol equals ``rst``, it is flagged as
    tested (``ce_shi='1'``), it was tested within the last 240 seconds and its
    recorded connect time is at most 3 seconds. The chosen proxy is used to
    fetch ``<rst>://image.baidu.com/``; on a 2xx response the row is
    re-stamped, otherwise the row is deleted.

    Args:
        rst: protocol name, e.g. ``'http'``.
        timeout: seconds to wait for the test request before treating the
            proxy as unusable.

    Returns:
        ``'ip:port'`` when the proxy answered with a 2xx status; ``False`` when
        the query failed, no row matched, or the proxy was unusable.
    """
    # Imported locally so this block does not depend on a module-level import.
    from sqlalchemy import text

    since = _format_timestamp(time.localtime(time.time() - 240))  # test-age limit
    max_speed = '00:00:03'  # slowest accepted connect time (H:M:S)

    try:
        mysq = ORM.session()
    except Exception as e:
        print('查詢代理IP數據出錯:{0}'.format(e))
        return False

    try:
        try:
            # Bound parameters instead of string formatting keep ``rst`` from
            # being interpreted as SQL.
            shuju = mysq.query(
                ORM.daili_ip.ip,
                ORM.daili_ip.port,
                ORM.daili_ip.xtype,
                ORM.daili_ip.seshi_ri_qi,
                ORM.daili_ip.connectTimeMs
            ).from_statement(text(
                "SELECT ip,port,xtype,seshi_ri_qi,connectTimeMs FROM daili_ip "
                "WHERE xtype=:xtype AND ce_shi=:ce_shi AND seshi_ri_qi>=:since "
                "AND connectTimeMs<=:max_speed ORDER BY RAND() LIMIT 1"
            )).params(
                xtype=rst, ce_shi='1', since=since, max_speed=max_speed
            ).all()
        except Exception as e:
            # Report failure as False; a truthy value would be mistaken for
            # an address by callers.
            print('查詢代理IP數據出錯:{0}'.format(e))
            return False

        if not shuju:
            print('獲取IP失敗,請檢查獲取條件')
            return False
        print('獲取到IP')

        ip = shuju[0][0]
        duan_kou = shuju[0][1]
        print('啟用代理IP,數據庫獲取到IP:{0}'.format(shuju))

        http_url = '{0}://image.baidu.com/'.format(rst)
        proxy_url = '{0}://{1}:{2}'.format(rst, ip, duan_kou)
        headers = {
            'Referer': http_url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        }
        print('啟用代理IP,測試網址:{0}'.format(http_url))
        print('啟用代理IP,測試頭:{0}'.format(proxy_url))

        try:
            # requests selects the proxy by the scheme of the target URL, so
            # the mapping is keyed by ``rst`` rather than a fixed 'http'.
            # NOTE(review): for rst='https' the proxy URL itself also uses
            # https:// -- confirm the stored proxies expect that.
            response = requests.get(
                http_url,
                proxies={rst: proxy_url},
                headers=headers,
                timeout=timeout,
            )
        except Exception as e:
            print('啟用代理IP,測速連接失敗{0}'.format(e))
            print('啟用代理IP,測速連接失敗,當前IP不可用,刪除當前ip!')
            if _delete_proxy(mysq, ip) == 1:
                print("成功刪除當前IP")
            else:
                print('刪除當前IP失敗')
            return False

        code = response.status_code
        if 200 <= code < 300:
            print('啟用代理IP,測試代理ip--{0}{1}--狀態可用--狀態碼--{2}'.format(ip, duan_kou, code))
            print('啟用代理IP,當前IP可以,正在向數據庫標記')
            fanhui = mysq.query(ORM.daili_ip).filter(ORM.daili_ip.ip == ip).update({
                "ce_shi": "1",
                "seshi_ri_qi": _format_timestamp(time.localtime()),
                "connectTimeMs": str(response.elapsed),
            })
            mysq.commit()
            if fanhui == 1:
                print('向數據庫成功標記可用IP!')
            else:
                print('向數據庫標記可用IP失敗!!!')
            print('向爬蟲返回IP:{0}:{1}'.format(ip, duan_kou))
            return '{0}:{1}'.format(ip, duan_kou)

        print('啟用代理IP,測試代理ip--{0}{1}--狀態不可用--狀態碼--{2}'.format(ip, duan_kou, code))
        print('返回狀態碼不可以,正在向數據庫刪除當前IP')
        if _delete_proxy(mysq, ip) == 1:
            print('刪除當前IP成功')
        else:
            print('刪除當前IP失敗')
        return False
    finally:
        # Release the session on every path, including early returns and errors.
        mysq.close()
def sui_ji_hq_ip(rst, max_attempts=None):
    """Keep calling ``suiji_ip`` until it yields a usable proxy address.

    Args:
        rst: protocol name passed through to ``suiji_ip``, e.g. ``'http'``.
        max_attempts: optional upper bound on the number of ``suiji_ip``
            calls. ``None`` (the default) retries without limit.

    Returns:
        The address returned by ``suiji_ip``, or ``None`` when
        ``max_attempts`` is exhausted without obtaining one.
    """
    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        youxiao_ip = suiji_ip(rst)
        # A bare ``True`` is a status flag, not an address; never hand it to
        # the caller as a proxy.
        if youxiao_ip and youxiao_ip is not True:
            return youxiao_ip
    return None
# print(sui_ji_hq_ip('http'))
數據庫模塊文件
import datetime
import json
import os
import time

import requests
import sqlalchemy
from sqlalchemy import Column, Integer, String, ForeignKey, UniqueConstraint, Index,text,DATETIME,TIME
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship
# Database engine configuration.
# The connection URL is read from the DAILI_IP_DB_URL environment variable so
# that credentials do not have to live in source control; the original URL is
# kept as the fallback so existing deployments behave as before.
ENGINE = create_engine(
    os.environ.get(
        'DAILI_IP_DB_URL',
        "mysql+pymysql://root:279819@127.0.0.1:3306/cshi?charset=utf8",
    ),
    max_overflow=500,
    echo=True,
)
Base = declarative_base()  # base class for the declarative ORM models
class daili_ip(Base): # proxy IP pool table
    """ORM model mapped to the ``daili_ip`` table, one row per proxy IP."""
    __tablename__ = 'daili_ip'
    id = Column(Integer, primary_key=True, autoincrement=True)
    ip = Column(String(300), unique=True) # IP address (unique)
    port = Column(String(300)) # port
    city = Column(String(300)) # city
    isp = Column(String(300)) # carrier / ISP
    connectTimeMs = Column(TIME()) # speed, stored as a TIME value
    anonymity = Column(String(300)) # anonymity mode
    country = Column(String(300)) # country
    xtype = Column(String(300)) # protocol
    zhuang_tai_ma = Column(String(300)) # status code
    ruku_riqi = Column(DATETIME()) # date the row was stored
    ce_shi = Column(String(300)) # test status
    seshi_ri_qi = Column(DATETIME()) # date of the test
    shi_xiao_riqi = Column(DATETIME()) # expiry date
def init_db():
    """Create the tables registered on ``Base.metadata`` using ``ENGINE``."""
    metadata = Base.metadata
    metadata.create_all(ENGINE)
def drop_db():
    """Drop the tables registered on ``Base.metadata`` using ``ENGINE``."""
    metadata = Base.metadata
    metadata.drop_all(ENGINE)
# Session factory bound to ENGINE; built once instead of on every call.
_SESSION_FACTORY = sessionmaker(bind=ENGINE)


def session():
    """Return a new session bound to ``ENGINE``.

    Returns:
        A fresh session object produced by the module-level factory.
    """
    return _SESSION_FACTORY()
# drop_db() #刪除表
# init_db()

