# coding:utf-8
import json
import re
import time

import requests
from lxml import etree


class Lagou(object):
    """Crawl lagou.com job listings and save detail-page fields to lagou.json.

    Lagou only serves the full data to a logged-in session, so a valid
    login cookie must be pasted into ``headers`` below.
    """

    def __init__(self):
        # AJAX endpoint that returns one page of listing data as JSON.
        self.url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'
        # The Cookie must be copied from the browser's dev tools after
        # logging in; the Referer mimics the normal search page so the
        # AJAX request is accepted.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
            'Cookie': '此處代碼為登錄后獲取的cookie,全部粘貼進來即可',
            'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        }
        # Pre-compiled pattern extracting every positionId from the
        # listing JSON; the ids build the detail-page URLs.
        self.pattern = re.compile(r'"positionId":(\d+)')
        self.base_url = 'https://www.lagou.com/jobs/{}.html'
        # Explicit utf-8: the records are written with ensure_ascii=False,
        # so the platform default encoding must not be relied on.
        self.file = open('lagou.json', 'w', encoding='utf-8')

    def get_post_data(self, page=1, kd='python'):
        """POST the listing endpoint and return the response body as text.

        :param page: 1-based result page number.
        :param kd: search keyword (generalized from the hard-coded 'python';
                   default preserves the original behavior).
        """
        print('正在登陸----')
        post_data = {
            'first': 'true',
            'pn': page,  # page number
            'kd': kd,    # search keyword
        }
        response = requests.post(self.url, headers=self.headers, data=post_data)
        print('獲取得列表頁響應')
        return response.content.decode()

    def get_page(self, url):
        """Fetch one detail page and return the raw response bytes."""
        # NOTE(review): the author deliberately POSTs even these static
        # detail pages; switch to requests.get if the server rejects it.
        response = requests.post(url, headers=self.headers)
        return response.content

    def parse_url(self, data):
        """Extract position ids from the listing JSON and build detail URLs."""
        print('開始解析列表頁數據,獲取id')
        id_list = self.pattern.findall(data)
        url_list = [self.base_url.format(position_id) for position_id in id_list]
        print('id獲取完畢')
        return url_list

    @staticmethod
    def _first(html, xpath):
        """Return the first xpath match on ``html``, or None when absent."""
        nodes = html.xpath(xpath)
        return nodes[0] if nodes else None

    def parse_detail_data(self, str_data):
        """Parse one detail page into a dict of job fields.

        Fixes two defects of the original: every field now checks its OWN
        xpath (the original reused the company xpath in the guards for
        education/job_type/advantage by copy-paste mistake), and missing
        fields become None instead of crashing on ``None.replace``.
        """
        print('正在獲取詳情頁數據')
        html = etree.HTML(str_data)
        data = {}
        data['name'] = self._first(html, '//div/span[@class="name"]/text()')
        data['salary'] = self._first(html, '//span[@class="salary"]/text()')
        city = self._first(html, '//dd[@class="job_request"]/p[1]/span[2]/text()')
        data['city'] = city.replace('/', '').strip() if city else None
        data['company'] = self._first(html, '//div[@class="company"]/text()')
        education = self._first(html, '//dd/p[1]/span[4]/text()')
        data['education'] = education.replace('/', '').strip() if education else None
        data['job_type'] = self._first(html, '//dd/p[1]/span[5]/text()')
        # Key keeps the original misspelling 'anvantage' so existing
        # consumers of lagou.json continue to work.
        data['anvantage'] = self._first(html, '//dd[@class="job-advantage"]/p/text()')
        desc_list = html.xpath('//dd[@class="job_bt"]/div/p/text()')
        # Single join instead of repeated concatenation; \xa0 are
        # non-breaking spaces coming from the page markup.
        data['responsibilities'] = ''.join(desc_list).replace('\xa0', '')
        return data

    def parse_detail(self, url_list):
        """Download and parse every detail URL; return a list of dicts."""
        print('開始拼裝詳情頁url')
        data_list = []
        for url in url_list:
            str_data = self.get_page(url)
            data_list.append(self.parse_detail_data(str_data))
        print('獲取完畢')
        return data_list

    def save_data(self, data_list):
        """Append each record to lagou.json as one JSON object per line."""
        print('開始保存數據')
        for data in data_list:
            # ensure_ascii=False keeps the Chinese text human-readable.
            str_data = json.dumps(data, ensure_ascii=False) + ',\n'
            self.file.write(str_data)

    def run(self):
        """Main crawl loop: listing page -> detail urls -> parse -> save."""
        for page in range(1, 10):  # pages 1..9
            data = self.get_post_data(page)        # listing JSON (needs login cookie)
            url_list = self.parse_url(data)        # detail-page URLs from position ids
            data_list = self.parse_detail(url_list)
            self.save_data(data_list)

    def __del__(self):
        # Guard: __del__ can run after a failed __init__ or during
        # interpreter shutdown, when self.file may be missing or closed.
        f = getattr(self, 'file', None)
        if f is not None and not f.closed:
            print('數據保存完畢')
            f.close()


if __name__ == '__main__':
    lagou = Lagou()
    lagou.run()
一個簡單的拉勾網詳情頁信息爬蟲,實現較為粗糙,僅供參考。主要使用 requests 發送請求;拉勾網需要登錄后才能爬取完整數據,因此請求頭中必須帶上登錄后獲取的 cookie。列表頁接口使用 post 請求,雖然只比 get 安全一點點。