1. Background
Lagou's anti-scraping is done very well: pulling data off the site is, to put it mildly, not easy. With an ordinary requests + XPath approach you get identified as a crawler almost immediately and greeted with a "your operations are too frequent" warning. That leaves Selenium as the only practical option, and even then the crawl cannot run too fast; rushing it will still get you blocked. The implementation below is partly adapted from a CSDN blog post; I only revised the code against the page as it looked on April 13, 2020. Lagou updates its site and its anti-scraping measures constantly, so the code has to be revised continually to keep up. The following code is for reference only.
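For contrast, here is a minimal sketch of the naive requests + XPath approach described above; the selector mirrors the Selenium code below, and the "blocked" check is an assumption, since the exact verification page Lagou serves varies:

import requests
from lxml import etree

# Naive attempt: fetch the same listing URL the spider below uses.
# Lagou usually flags this as a bot after only a few requests.
headers = {'User-Agent': 'Mozilla/5.0'}  # a bare User-Agent is rarely enough here
resp = requests.get('https://www.lagou.com/jobs/list_Python/p-city_282', headers=headers)
page = etree.HTML(resp.text)
links = page.xpath('//a[@class="position_link"]/@href')
if not links:
    # Assumption: an empty result usually means a verification page was served
    # instead of the listings, which is why the spider below uses Selenium.
    print('No position links found -- likely blocked or the markup changed.')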
2. Implementation
import re
from lxml import etree
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
class LagouSpider(object):
    # Absolute path to chromedriver
    driver_path = r'D:\Python\chromedriver.exe'

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_Python/p-city_282'
        self.position_info = []
    def run(self):
        self.driver.get(self.url)  # open the listing page
        count = 1
        while True:
            print('Fetching data from page {}...'.format(count))
            self.parse_detail_page(self.driver.page_source)
            # On the last page the "next" button is disabled; stop looping
            if re.search(r'class="pager_next pager_next_disabled"', self.driver.page_source):
                break
            self.next_page()  # click through to the next page
            count += 1
        self.driver.quit()  # shut down the browser
    def parse_detail_page(self, html):
        source_html = etree.HTML(html)  # parse the listing page
        # Collect the detail-page link of every position on this page
        detail_list = source_html.xpath('//a[@class="position_link"]/@href')
        self.driver.execute_script("window.open()")  # open a new tab
        self.driver.switch_to.window(self.driver.window_handles[1])  # switch to the new tab
        for url in detail_list:  # walk through the detail pages
            self.driver.get(url)  # open the position's detail page
            detail_url = etree.HTML(self.driver.page_source)  # parse the detail page
            company_name = detail_url.xpath("//h4[@class='company']/text()")[0].replace('招聘', '')
            name = re.findall(r'<h1 class="name">([^<]*)', self.driver.page_source)[0]  # job title
            # "Job highlights" text
            advantage = re.findall(r'職位誘惑:.*?<p>([^<]*)', self.driver.page_source, re.DOTALL)[0]
            job_request = detail_url.xpath('//dd[@class="job_request"]')
            for job_info in job_request:
                salary = job_info.xpath('.//span[@class="salary"]/text()')[0]  # salary
                # The short requirement tags that follow the salary
                job_info_other = job_info.xpath('.//span/text()')
                req = re.sub('/', '', ','.join(job_info_other[1:]))
                request = re.sub(' ', '', req)  # drop the space before each comma
            job_descript = detail_url.xpath('//div[@class="job-detail"]//p/text()')  # job description
            job_descript = ' '.join(job_descript)
            # split() here strips \xa0 (the non-breaking space) along with other whitespace
            job_descript = "".join(job_descript.split())
            # Work address
            address = re.findall(r'<input type="hidden" name="positionAddress" value="([^"]*)', self.driver.page_source)[0]
            position = {  # collect the scraped fields into a dict
                'company_name': company_name,
                'name': name,
                'address': address,
                'advantage': advantage,
                'salary': salary,
                'request': request,
                'job_descript': job_descript,
            }
            sleep(1)  # pause so we do not hit pages too fast and get the IP temporarily banned
            self.position_info.append(position)  # add the dict to the list
            self.write_to_csv()  # write the data to the CSV file
            print(self.position_info)
            print('*' * 30)
            # Clear the list
            self.position_info = []
        self.driver.close()  # close the tab
        self.driver.switch_to.window(self.driver.window_handles[0])  # switch back to the listing tab
    def next_page(self):
        # Wait until the "next page" button is present
        element = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "pager_next")))
        element.click()  # click the "next page" button
        sleep(1)
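    # Note: EC.presence_of_element_located only waits for the element to exist
    # in the DOM, not for it to be clickable. If the click above turns out to
    # be flaky, EC.element_to_be_clickable((By.CLASS_NAME, "pager_next")) is
    # the stricter condition to wait on.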
    def write_to_csv(self):  # append the scraped rows to the CSV file
        header = ['company_name', 'name', 'address', 'advantage', 'salary', 'request', 'job_descript']
        with open('positions.csv', 'a', newline='', encoding='utf-8') as fp:
            writer = csv.DictWriter(fp, header)
            # Re-open the file for reading: write the header row only if the
            # file is still empty (the 'a' open above creates it on first run)
            with open('positions.csv', 'r', newline='', encoding='utf-8') as f:
                reader = csv.reader(f)
                if not [row for row in reader]:
                    writer.writeheader()
            writer.writerows(self.position_info)
if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
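A compatibility note on the driver setup: the executable_path keyword used in __init__ was deprecated in Selenium 4 and later removed. If you run this code under a current Selenium, a minimal sketch of the replacement (same assumed chromedriver path as above):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: pass the driver path through a Service object instead of
# executable_path (the path below is the same assumption as in the spider).
service = Service(r'D:\Python\chromedriver.exe')
driver = webdriver.Chrome(service=service)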
Screenshot of the program running:
Screenshot of the scraped CSV:
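To sanity-check the output without opening a spreadsheet, the rows can be read back with csv.DictReader; a minimal sketch assuming the positions.csv file written above:

import csv

# Print a couple of fields from each scraped row for a quick inspection.
with open('positions.csv', newline='', encoding='utf-8') as fp:
    for row in csv.DictReader(fp):
        print(row['company_name'], row['salary'])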