python爬蟲之反爬蟲(隨機user-agent,獲取代理ip,檢測代理ip可用性)
目錄
隨機User-Agent 獲取代理ip 檢測代理ip可用性
隨機User-Agent
fake_useragent庫,偽裝請求頭
# Demo: the fake_useragent library fabricates per-browser User-Agent strings.
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.ie)       # Internet Explorer user agent
print(ua.opera)    # Opera
print(ua.chrome)   # Chrome
print(ua.firefox)  # Firefox
print(ua.safari)   # Safari
# Most practical for crawlers: headers should vary, so ask for a random
# user agent on every request.
print(ua.random)
print(ua.random)
print(ua.random)
獲取代理ip
在免費的代理網站爬取代理ip,免費代理的采集也很簡單,無非就是:訪問頁面 —> 正則/xpath提取 —> 保存
代理ip網站 有代理:https://www.youdaili.net/Daili/guonei/ 66代理:http://www.66ip.cn/6.html 西刺代理:https://www.xicidaili.com/ 快代理:https://www.kuaidaili.com/free/
#根據網頁結果,使用正則表達式匹配 #這種方法適合翻頁的網頁
import re
import requests
import time
def get_ip():
    """Scrape proxy "ip:port" strings from the kuaidaili free-proxy pages.

    Fetches the first 5 list pages, extracts IP/port pairs with a regex,
    prints progress, and returns them as a list of "ip:port" strings.
    """
    base_url = 'https://www.kuaidaili.com/free/inha/'
    # Page URLs are 1-based: .../inha/1, .../inha/2, ... (5 = pages to crawl)
    url_list = [base_url + str(i + 1) for i in range(5)]
    print(url_list)
    ip_list = []
    # Raw string avoids DeprecationWarning for the \d escapes; compile once,
    # outside the loop, instead of per page.
    matcher = re.compile(
        r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>',
        re.S)
    for url in url_list:
        html = requests.get(url=url).text
        ipstr = matcher.findall(html)
        time.sleep(1)  # be polite: throttle between page fetches
        for ip, port in ipstr:
            ip_list.append(ip + ':' + port)  # ip+port
    print(ip_list)
    print('共收集到%d個代理ip' % len(ip_list))
    return ip_list
if __name__ == '__main__':
    # Script entry point: crawl and print the proxy list.
    get_ip()

#先獲取特定標簽 #解析
import requests
from bs4 import BeautifulSoup
def get_ip_list(obj):
    """Extract "ip:port" strings from a parsed xicidaili proxy-list page.

    obj: a BeautifulSoup-style object for the page (assumed — TODO confirm
    against the caller, which builds it with BeautifulSoup(..., 'lxml')).
    Returns the list of "ip:port" strings found in rows with class "odd".
    """
    ip_list = []
    # Proxy entries are the table rows marked class="odd" on this site.
    for row in obj.findAll('tr', {'class': 'odd'}):
        cells = row.findAll('td')
        # Cell 1 holds the IP address, cell 2 the port.
        ip_list.append(cells[1].get_text() + ':' + cells[2].get_text())
    print("共收集到了{}個代理IP".format(len(ip_list)))
    print(ip_list)
    return ip_list
# Fetch the xicidaili front page and extract the proxies it lists.
url = 'http://www.xicidaili.com/'
# Bug fix: the header VALUE must not repeat the "User-Agent:" key prefix —
# the original sent the literal string "User-Agent:Mozilla/..." as the value.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
request = requests.get(url, headers=headers)
response = request.text
bsObj = BeautifulSoup(response, 'lxml')  # parse the fetched html
lists = get_ip_list(bsObj)
檢測代理ip可用性
第一種方法:通過返回的狀態碼判斷
import requests
import random
import re
import time
def get_ip():
    """Scrape proxy "ip:port" strings from the first kuaidaili list page.

    Extracts IP/port pairs with a regex, prints totals, and returns them
    as a list of "ip:port" strings.
    """
    base_url = 'https://www.kuaidaili.com/free/inha/'
    # Only one page here: this copy feeds the validity checker below.
    url_list = [base_url + str(i + 1) for i in range(1)]
    print(url_list)
    ip_list = []
    # Raw string avoids DeprecationWarning for the \d escapes; compile once.
    matcher = re.compile(
        r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>',
        re.S)
    for url in url_list:
        html = requests.get(url=url).text
        ipstr = matcher.findall(html)
        time.sleep(1)  # be polite: throttle between page fetches
        for ip, port in ipstr:
            ip_list.append(ip + ':' + port)
    print('共收集到%d個代理ip' % len(ip_list))
    print(ip_list)
    return ip_list
def valVer(proxys):
    """Validate proxies by fetching baidu.com through each one.

    proxys: iterable of "ip:port" strings.
    Prints each proxy's result plus totals, and returns the list of proxy
    dicts that answered with HTTP 200 (return value is a backward-compatible
    addition; the original returned None).
    """
    badNum = 0
    goodNum = 0
    good = []
    for proxy_host in proxys:
        # NOTE(review): entries are bare "ip:port" with no scheme, so the
        # original `'https' in proxy_host` test could never match — every
        # proxy was (and still is) used as an http proxy.
        proxies = {'http': proxy_host}
        print('現在正在測試的IP:', proxies)
        try:
            # Keep the try body minimal: only the request can raise here.
            response = requests.get('http://www.baidu.com', proxies=proxies,
                                    timeout=2)
        except Exception as e:
            # Connection/timeout failures count as a bad proxy.
            print(e)
            badNum += 1
            continue
        if response.status_code != 200:
            badNum += 1
            print(proxy_host, 'bad proxy')
        else:
            goodNum += 1
            good.append(proxies)
            print(proxy_host, 'success proxy')
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print(good)
    return good
if __name__ == '__main__':
    # Collect candidate proxies, then probe each one for liveness.
    valVer(get_ip())

第二種方法:使用requests包來進行驗證
import requests
import random
import re
import time
def get_ip():
    """Scrape proxy "ip:port" strings from the first kuaidaili list page.

    Extracts IP/port pairs with a regex, prints the list and a total, and
    returns them as a list of "ip:port" strings.
    """
    base_url = 'https://www.kuaidaili.com/free/inha/'
    # Only the first page is crawled for this validation demo.
    url_list = [base_url + str(i + 1) for i in range(1)]
    print(url_list)
    ip_list = []
    # Raw string avoids DeprecationWarning for the \d escapes; compile once.
    matcher = re.compile(
        r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>',
        re.S)
    for url in url_list:
        html = requests.get(url=url).text
        ipstr = matcher.findall(html)
        time.sleep(1)  # be polite: throttle between page fetches
        for ip, port in ipstr:
            ip_list.append(ip + ':' + port)
    print(ip_list)
    print('共收集到%d個代理ip' % len(ip_list))
    return ip_list
def valVer(proxys):
    """Validate proxies by fetching wenshu.court.gov.cn through each one.

    proxys: iterable of "ip:port" strings.
    Prints each proxy's result plus totals, and returns the list of proxies
    that connected successfully (return value is a backward-compatible
    addition; the original returned None).
    """
    badNum = 0
    goodNum = 0
    good = []
    for proxy in proxys:
        print("現在正在檢測ip", proxy)
        try:
            requests.get('http://wenshu.court.gov.cn/',
                         proxies={"http": "http://" + str(proxy)}, timeout=2)
        except Exception:
            badNum += 1
            print('connect failed')
        else:
            # Bug fix: the original did `goodNum = 1`, so the success counter
            # was stuck at 1 no matter how many proxies worked.
            goodNum += 1
            good.append(proxy)
            print('success')
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print(good)
    return good
if __name__ == '__main__':
    # Gather proxies, then run each through the requests-based check.
    valVer(get_ip())

第三種方法:使用telnet
# Third approach: probe the proxy with a raw TCP connection via telnetlib.
# NOTE(review): telnetlib is deprecated since Python 3.11 and removed in
# 3.13 — a socket.create_connection() check is the forward-compatible form.
import telnetlib

try:
    telnetlib.Telnet('127.0.0.1', port='80', timeout=20)
except Exception:
    # Bug fix: the original used Python 2 `print 'x'` statements, which are
    # syntax errors in the Python 3 used by the rest of this article.
    print('connect failed')
else:
    print('success')

