Preface: This article describes how to use a proxy IP pool together with multithreading to collect 100,000 job postings from the 51job (前程無憂) website. The request rate is deliberately throttled, the data is collected purely for learning purposes, and scraping 100,000 postings takes roughly ten hours.
It started when I came across another programmer's 51job scraper on Zhihu. I had some doubts about his anti-anti-scraping measures, so I reworked his code: I improved the thread allocation and the page-request frequency and added a proxy IP pool, which made the scraper both faster and more reliable.
Link to the original article: https://zhuanlan.zhihu.com/p/146425439
First, here is the base scraper code that this article builds on:
import requests, time, warnings
import pandas as pd
from lxml import etree

warnings.filterwarnings("ignore")

def getdata(bot, top):
    for i in range(bot, top):
        print("正在爬取第" + str(i) + "頁的數據")
        url0 = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE,2,"
        url_end = ".html?"
        url = url0 + str(i) + url_end
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        html = requests.get(url, headers=headers)
        html.encoding = "gbk"
        Html = etree.HTML(html.text)  # parse the listing page (avoid shadowing the lxml etree module)
        # 1. Job title
        JobName = Html.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@title')
        # 2. Company name
        CompanyName = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t2"]/a[@target="_blank"]/@title')
        # 3. Work location
        Address = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t3"]/text()')
        # 4. Salary
        sal = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t4"]')
        salary = [s.text for s in sal]
        # 5. Posting date
        ShowTime = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t5"]/text()')
        # 6. Detail-page URL of each posting
        DetailUrl = Html.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@href')
        OthersInfo = []
        JobDescribe = []
        CompanyType = []
        CompanySize = []
        Industry = []
        for j in range(len(DetailUrl)):
            htmlInfo = requests.get(DetailUrl[j], headers=headers)
            htmlInfo.encoding = "gbk"
            HtmlInfo = etree.HTML(htmlInfo.text)
            # 7. Experience, education and other requirements
            otherinfo = HtmlInfo.xpath('//div[@class="tHeader tHjob"]//div[@class="cn"]/p[@class="msg ltype"]/text()')
            # 8. Job description
            jobDescribe = HtmlInfo.xpath('//div[@class="tBorderTop_box"]//div[@class="bmsg job_msg inbox"]/p/text()')
            # 9. Company type
            comType = HtmlInfo.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[1]/@title')
            # 10. Company size (headcount)
            comSize = HtmlInfo.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[2]/@title')
            # 11. Industry
            industry = HtmlInfo.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[3]/@title')
            # Append the detail-page fields (local names so the accumulator lists above are not overwritten)
            OthersInfo.append(otherinfo)
            JobDescribe.append(jobDescribe)
            CompanyType.append(comType)
            CompanySize.append(comSize)
            Industry.append(industry)
            # Sleep between detail-page requests
            time.sleep(0.5)
        # Write each listing page out as soon as it has been scraped
        data = pd.DataFrame()
        data["崗位名稱"] = JobName
        data["工作地點"] = Address
        data["公司名稱"] = CompanyName
        data["工資"] = salary
        data["發布日期"] = ShowTime
        data["經驗、學歷"] = OthersInfo
        data["所屬行業"] = Industry
        data["公司類型"] = CompanyType
        data["公司規模"] = CompanySize
        data["崗位描述"] = JobDescribe
        # Some postings redirect to the company's own site and return nothing, so skip those
        try:
            data.to_csv("job_info.csv", mode="a+", header=None, index=None, encoding="gbk")
        except:
            print("跳轉官網,無數據")
        time.sleep(1)
    print("數據爬取完成!!!!")
After experimenting with it, I found this code has several problems: 1. the scraper is slow; 2. it throws errors fairly often; 3. the delay between page requests is fixed, which makes the scraper easy for the site to detect.
To tackle the first problem, the original author's solution was to run the scraper in multiple threads, as follows:
import requests, time, warnings, threading
import pandas as pd
from lxml import etree

warnings.filterwarnings("ignore")

def getdata(bot, top):
    for i in range(bot, top):
        print("正在爬取第" + str(i) + "頁的數據")
        url0 = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE,2,"
        url_end = ".html?"
        url = url0 + str(i) + url_end
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        html = requests.get(url, headers=headers)
        html.encoding = "gbk"
        Html = etree.HTML(html.text)
        # 1. Job title
        JobName = Html.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@title')
        # 2. Company name
        CompanyName = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t2"]/a[@target="_blank"]/@title')
        # 3. Work location
        Address = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t3"]/text()')
        # 4. Salary
        sal = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t4"]')
        salary = [s.text for s in sal]
        # 5. Posting date
        ShowTime = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t5"]/text()')
        # 6. Detail-page URL of each posting
        DetailUrl = Html.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@href')
        OthersInfo = []
        JobDescribe = []
        CompanyType = []
        CompanySize = []
        Industry = []
        for j in range(len(DetailUrl)):
            htmlInfo = requests.get(DetailUrl[j], headers=headers)
            htmlInfo.encoding = "gbk"
            HtmlInfo = etree.HTML(htmlInfo.text)
            # 7. Experience, education and other requirements
            otherinfo = HtmlInfo.xpath('//div[@class="tHeader tHjob"]//div[@class="cn"]/p[@class="msg ltype"]/text()')
            # 8. Job description
            jobDescribe = HtmlInfo.xpath('//div[@class="tBorderTop_box"]//div[@class="bmsg job_msg inbox"]/p/text()')
            # 9. Company type
            ComType = HtmlInfo.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[1]/@title')
            # 10. Company size (headcount)
            ComSize = HtmlInfo.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[2]/@title')
            # 11. Industry
            industry = HtmlInfo.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[3]/@title')
            # Append the detail-page fields to their lists
            OthersInfo.append(otherinfo)
            JobDescribe.append(jobDescribe)
            CompanyType.append(ComType)
            CompanySize.append(ComSize)
            Industry.append(industry)
            # Sleep between detail-page requests
            time.sleep(0.5)
        # Write each listing page out as soon as it has been scraped
        data = pd.DataFrame()
        data["崗位名稱"] = JobName
        data["工作地點"] = Address
        data["公司名稱"] = CompanyName
        data["工資"] = salary
        data["發布日期"] = ShowTime
        data["經驗、學歷"] = OthersInfo
        data["所屬行業"] = Industry
        data["公司類型"] = CompanyType
        data["公司規模"] = CompanySize
        data["崗位描述"] = JobDescribe
        # Some postings redirect to the company's own site and return nothing, so skip those
        try:
            data.to_csv("job_info.csv", mode="a+", header=None, index=None, encoding="gbk")
        except:
            print("跳轉官網,無數據")
        time.sleep(1)
    print("數據爬取完成!!!!")
threads = []
t1 = threading.Thread(target=getdata, args=(1, 125))
threads.append(t1)
t2 = threading.Thread(target=getdata, args=(125, 250))
threads.append(t2)
t3 = threading.Thread(target=getdata, args=(250, 375))
threads.append(t3)
t4 = threading.Thread(target=getdata, args=(375, 500))
threads.append(t4)
t5 = threading.Thread(target=getdata, args=(500, 625))
threads.append(t5)
t6 = threading.Thread(target=getdata, args=(625, 750))
threads.append(t6)
t7 = threading.Thread(target=getdata, args=(750, 875))
threads.append(t7)
t8 = threading.Thread(target=getdata, args=(875, 1000))
threads.append(t8)
t9 = threading.Thread(target=getdata, args=(1000, 1125))
threads.append(t9)
t10 = threading.Thread(target=getdata, args=(1125, 1250))
threads.append(t10)
t11 = threading.Thread(target=getdata, args=(1250, 1375))
threads.append(t11)
t12 = threading.Thread(target=getdata, args=(1375, 1500))
threads.append(t12)

if __name__ == "__main__":
    for t in threads:
        t.daemon = True   # setDaemon() is deprecated; set the daemon attribute instead
        t.start()
    for t in threads:
        t.join()          # without joining, the main thread exits and the daemon threads are killed
This does make the scraper faster, but it introduces a problem: the quality of the crawl gets worse. To be precise, the error rate goes up and the requests are flagged by the site's anti-scraping measures more often.
My first change was to how the threads are created: the thread count is now a parameter, and the total number of pages is split evenly across the threads, as shown below.
# Split the listing pages evenly across the worker threads
def start_spider(num):
    start = 1
    end = 0
    count = 2000               # total number of listing pages to scrape
    size = count // (num - 1)  # pages per thread; the leftover pages go to one extra thread
    print(size)
    while num > 1:
        end = start + size
        t = threading.Thread(target=getdata, args=(start, end))
        start = end            # range(start, end) excludes end, so the next block must start at end
        t.start()
        num = num - 1
    # Hand whatever pages remain to one more thread
    if end < count:
        t = threading.Thread(target=getdata, args=(end, count))
        t.start()
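For example, splitting the 2,000 pages across 8 worker threads would look like the call below (a hypothetical invocation for illustration; it is not part of the original post):

if __name__ == "__main__":
    # 7 blocks of 2000 // 7 = 285 pages each, plus one thread for the remaining pages
    start_spider(8)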
With the thread allocation sorted out, the next change is the page-request delay: instead of a fixed value, it becomes a random one.
            Industry.append(industry)
            # Sleep for a random interval between detail-page requests (requires: import random)
            time.sleep(random.uniform(0.1, 1))
        # Write each listing page out as soon as it has been scraped
        data = pd.DataFrame()
        data["崗位名稱"] = JobName
        data["工作地點"] = Address
        data["公司名稱"] = CompanyName
        data["工資"] = salary
        data["發布日期"] = ShowTime
        data["經驗、學歷"] = OthersInfo
        data["所屬行業"] = Industry
        data["公司類型"] = CompanyType
        data["公司規模"] = CompanySize
        data["崗位描述"] = JobDescribe
        # Some postings redirect to the company's own site and return nothing, so skip those
        try:
            data.to_csv("job_info.csv", mode="a+", header=None, index=None, encoding="gbk")
        except:
            print("跳轉官網,無數據")
        time.sleep(random.uniform(0.2, 0.5))
Finally, a proxy IP pool is used to improve the reliability of the crawl.
Here is a proxy pool project that works very well: https://github.com/jhao104/proxy_pool
A copy of it is also included in the Gitee project I share at the end of this article: https://gitee.com/chengrongkai/OpenSpiders
To set up the proxy pool, just follow that project's README.
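The getHtml function below relies on two helpers, get_proxy and delete_proxy, which are not shown in the original post. A minimal sketch, following the example in the proxy_pool README and assuming the pool's web API runs at its default address 127.0.0.1:5010, might look like this:

import requests

# Minimal helpers for the proxy_pool web API (assumes the service is running at http://127.0.0.1:5010)
def get_proxy():
    # Returns a dict such as {"proxy": "ip:port", ...}
    return requests.get("http://127.0.0.1:5010/get/").json()

def delete_proxy(proxy):
    # Remove a dead proxy from the pool
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))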
Below is my adaptation of the scraper code to use the proxy pool.
# Fetch a URL through a proxy taken from the pool
def getHtml(url):
    retry_count = 5
    proxy = get_proxy().get("proxy")
    while retry_count > 0:
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
            }
            print("代理信息:{}".format(proxy))
            # Request through the proxy; the site is served over HTTPS, so map both schemes to the proxy
            html = requests.get(url, headers=headers,
                                proxies={"http": "http://{}".format(proxy),
                                         "https": "http://{}".format(proxy)})
            return html
        except Exception:
            retry_count -= 1
    # After 5 failed attempts, remove this proxy from the pool
    delete_proxy(proxy)
    return None
def getdata(bot, top):
    for i in range(bot, top):
        print("正在爬取第" + str(i) + "頁的數據")
        url0 = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE,2,"
        url_end = ".html?"
        url = url0 + str(i) + url_end
        html = getHtml(url)
        if html is None:
            continue
        html.encoding = "gbk"
        Html = etree.HTML(html.text)
        # 1. Job title
        JobName = Html.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@title')
        # 2. Company name
        CompanyName = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t2"]/a[@target="_blank"]/@title')
        # 3. Work location
        Address = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t3"]/text()')
        # 4. Salary
        sal = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t4"]')
        salary = [s.text for s in sal]
        # 5. Posting date
        ShowTime = Html.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t5"]/text()')
        # 6. Detail-page URL of each posting
        DetailUrl = Html.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@href')
        OthersInfo = []
        JobDescribe = []
        CompanyType = []
        CompanySize = []
        Industry = []
        for j in range(len(DetailUrl)):
            htmlInfo = getHtml(DetailUrl[j])
            # Check for a failed request before touching the response
            if htmlInfo is None:
                continue
            htmlInfo.encoding = "gbk"
            HtmlInfo = etree.HTML(htmlInfo.text)
            # 7. Experience, education and other requirements
            otherinfo = HtmlInfo.xpath('//div[@class="tHeader tHjob"]//div[@class="cn"]/p[@class="msg ltype"]/text()')
            # 8. Job description
            jobDescribe = HtmlInfo.xpath('//div[@class="tBorderTop_box"]//div[@class="bmsg job_msg inbox"]/p/text()')
            # 9. Company type
            ComType = HtmlInfo.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[1]/@title')
            # 10. Company size (headcount)
            ComSize = HtmlInfo.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[2]/@title')
            # 11. Industry
            industry = HtmlInfo.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[3]/@title')
            # Append the detail-page fields to their lists
            OthersInfo.append(otherinfo)
            JobDescribe.append(jobDescribe)
            CompanyType.append(ComType)
            CompanySize.append(ComSize)
            Industry.append(industry)
            # Sleep for a random interval between detail-page requests
            time.sleep(random.uniform(0.1, 1))
        # Write each listing page out as soon as it has been scraped.
        # Some postings redirect to the company's own site and return nothing, and skipped detail
        # pages can leave the columns with different lengths, so the whole write is wrapped in try/except.
        try:
            data = pd.DataFrame()
            data["崗位名稱"] = JobName
            data["工作地點"] = Address
            data["公司名稱"] = CompanyName
            data["工資"] = salary
            data["發布日期"] = ShowTime
            data["經驗、學歷"] = OthersInfo
            data["所屬行業"] = Industry
            data["公司類型"] = CompanyType
            data["公司規模"] = CompanySize
            data["崗位描述"] = JobDescribe
            data.to_csv("job_info.csv", mode="a+", header=None, index=None, encoding="gbk")
        except:
            print("跳轉官網,無數據")
        time.sleep(random.uniform(0.2, 0.5))
    print("數據爬取完成!!!!")
On my own machine, 8 threads collected about 15,000 records in an hour and a half. I deliberately kept the pace slow; you can tune it to your situation. For example, you could drop the proxy retry and simply delete a proxy from the pool as soon as a request through it fails, and you could raise the thread count to match your hardware. Without worrying about crawl quality, a single machine should manage roughly 15,000 records per hour. If you have a better single-machine solution, feel free to leave a comment. The next article will cover cleaning and analyzing the collected data.
All the code in this article is open-sourced at https://gitee.com/chengrongkai/OpenSpiders
Stars are welcome; your encouragement is my biggest motivation.