Site-side anti-scraping: an IP that visits too frequently is simply added to a blacklist.
Strategy one: limit the per-IP request rate and automatically drop connections that exceed it. Countermeasure: slow the crawler down, e.g. by calling time.sleep before every request, or switch to a different IP (a minimal throttling sketch follows this list).
Strategy two: the backend tallies requests and blocks any single User-Agent that passes a threshold. Since many real users share the same User-Agent string, the collateral damage is high, so ordinary sites rarely use this.
Strategy three: cookie-based detection. Also rarely used by ordinary sites.
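As a minimal sketch of the first countermeasure, here is a throttled fetch helper. The 1-3 second delay bounds and the target URL are illustrative assumptions, not part of the original example:

import random
import time

import requests

def polite_get(url):
    # Pause 1-3 seconds before each request so the per-IP request rate
    # stays below the blacklist threshold (the bounds are an assumption).
    time.sleep(random.uniform(1, 3))
    return requests.get(url, timeout=5)

print(polite_get("http://example.com/").status_code)  # hypothetical target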
This example applies these countermeasures while scraping jokes (duanzi) from Qiushibaike.
# Download.py: a downloader with User-Agent rotation, retries, and proxy
# fallback, countering the IP- and User-Agent-based blocking described above.

import requests
import re
import random
import time

# First, find a site that publishes free proxy IPs and scrape the list from it;
# when the local IP gets blocked, switch to one of these proxies.

class download(object):
    def __init__(self):
        self.ip_list = []  # stores the proxy IPs fetched below
        html = requests.get("http://haoip.cc/tiqu.htm")
        # Capture everything between "r/>" and "<b" in the page source;
        # re.S makes "." match newlines as well.
        iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)
        for ip in iplistn:
            i = re.sub("\n", "", ip)  # re.sub replaces every "\n" with the empty string
            self.ip_list.append(i.strip())  # add the cleaned-up IP to the list

        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]

    def get(self, url, timeout, proxy=None, num_retries=6):
        ua = random.choice(self.user_agent_list)  # pick a random User-Agent string
        # print(ua)
        header = {"User-Agent": ua}  # build the request header

        if proxy is None:  # no proxy yet: request directly from the local IP
            try:
                response = requests.get(url, headers=header, timeout=timeout)
                return response
            except requests.exceptions.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    print("Fetch failed; retrying in 10s,", num_retries, "attempts left")
                    # Recurse with the retry count decremented. It must be passed
                    # by keyword, otherwise it would land in the proxy parameter.
                    return self.get(url, timeout, num_retries=num_retries - 1)
                else:
                    print("Direct attempts exhausted; switching to a proxy")
                    time.sleep(10)
                    IP = str(random.choice(self.ip_list)).strip()  # random proxy IP, whitespace stripped
                    proxy = {"http": IP}  # proxies dict in the format requests expects
                    return self.get(url, timeout, proxy)

        else:  # a proxy is in use
            try:
                IP = str(random.choice(self.ip_list)).strip()  # pick a fresh random proxy
                proxy = {"http": IP}  # build the proxies dict
                response = requests.get(url, headers=header, proxies=proxy, timeout=timeout)
                return response
            except requests.exceptions.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    IP = str(random.choice(self.ip_list)).strip()
                    proxy = {"http": IP}  # actually switch to the new proxy before retrying
                    print("Changing proxy; retrying in 10s,", num_retries, "attempts left")
                    print("Current proxy:", proxy)
                    return self.get(url, timeout, proxy, num_retries - 1)
                else:
                    print("Proxies keep failing; falling back to a direct connection")
                    return self.get(url, 3)

request = download()
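Because Download.py instantiates the class at import time (request = download()), callers just import the ready-made object. A minimal usage sketch, where example.com is a stand-in target rather than anything from the original:

from Download import request

resp = request.get("http://example.com/", 3)  # positional args: url, timeout
print(resp.status_code, len(resp.text))

One caveat worth noting about this design: the retry logic is mutually recursive. Exhausted direct attempts fall back to proxies, and exhausted proxy attempts fall back to a direct connection with a fresh retry budget, so with a dead network the two modes can bounce between each other indefinitely.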
Implementing the joke scraper
# Scrape jokes from Qiushibaike
import requests
from bs4 import BeautifulSoup
from Download import request

def qsbk(url):
    # First version, without the countermeasures: one fixed header for every request.
    # header = {
    #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    #     'Accept-Encoding': 'gzip, deflate, sdch',
    #     'Accept-Language': 'zh-CN,zh;q=0.8',
    #     'Cache-Control': 'max-age=0',
    #     'Connection': 'keep-alive',
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    # }
    # rep = requests.get(url, headers=header)
    # html = rep.text
    # bs = BeautifulSoup(html, "html.parser")
    # body = bs.body  # the <body> part of the HTML document
    # data = body.find_all("div", {"class": "content"})  # a BeautifulSoup ResultSet
    # for joke in data:
    #     joke_duan = joke.find("span")
    #     if "<br/>" not in str(joke_duan):  # if the joke contains <br/>, .string becomes None
    #         print(joke_duan.string)
    #         print("")
    #     # with open("joke.txt", "w") as f:
    #     #     f.write(joke_duan.string)

    # Second version, routed through the download class with UA rotation,
    # retries, and proxy fallback.
    html = request.get(url, 3)
    dz = BeautifulSoup(html.text, "html.parser").find_all("div", {"class": "content"})  # ResultSet of joke <div>s
    # print(dz)
    # print(len(dz))
    for joke in dz:  # each joke is a chunk of HTML
        duanzi = joke.get_text()
        print(duanzi)

if __name__ == "__main__":
    url = "http://www.qiushibaike.com/"
    qsbk(url)
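The commented-out draft hints at saving jokes to joke.txt. A small sketch along those lines, where the append mode and utf-8 encoding are my assumptions rather than part of the original, writes every joke to disk instead of printing it:

from bs4 import BeautifulSoup
from Download import request

def save_jokes(url, path="joke.txt"):
    html = request.get(url, 3)
    jokes = BeautifulSoup(html.text, "html.parser").find_all("div", {"class": "content"})
    # Append rather than overwrite so repeated runs accumulate jokes;
    # utf-8 avoids encoding errors with Chinese text on some platforms.
    with open(path, "a", encoding="utf-8") as f:
        for joke in jokes:
            f.write(joke.get_text().strip() + "\n\n")

save_jokes("http://www.qiushibaike.com/")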
The listing above contains two versions of the scraper: the first (commented out) uses no anti-blocking countermeasures, while the second routes every request through the download class and so enables all of them.
