About the Lagou (拉勾網) Crawler


# coding:utf-8
import json
import re
import time

import requests
from lxml import etree


class Lagou(object):
    def __init__(self):
        # Ajax endpoint that serves the job-list data
        self.url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'
        # Request headers. Lagou needs a logged-in cookie: log in, then copy the whole
        # cookie string from the browser's dev tools. The Referer is the list page the
        # Ajax call normally originates from (it can also be requested separately).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
            'Cookie': 'paste the full cookie string obtained after logging in',
            'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        }
        self.pattern = re.compile(r'"positionId":(\d+)')  # extracts position ids from the list JSON
        self.base_url = 'https://www.lagou.com/jobs/{}.html'
        self.file = open('lagou.json', 'w', encoding='utf-8')

    @staticmethod
    def _first(html, xpath_expr):
        """Return the first xpath match, or None when nothing matches."""
        result = html.xpath(xpath_expr)
        return result[0] if result else None

    def get_post_data(self, page=1):
        """POST to the Ajax endpoint and return the list-page JSON as text."""
        print('Requesting list page...')
        post_data = {
            'first': 'true',
            'pn': page,      # page number
            'kd': 'python',  # search keyword; change this to crawl other positions
        }
        response = requests.post(self.url, headers=self.headers, data=post_data)
        print('Got list-page response')
        return response.content.decode()

    def get_page(self, url):
        """Fetch a detail page. POST is used throughout this script; a plain GET
        would be the conventional choice for an HTML page."""
        response = requests.post(url, headers=self.headers)
        return response.content

    def parse_url(self, data):
        """Extract position ids from the list JSON and build detail-page urls."""
        print('Parsing list page for ids')
        id_list = self.pattern.findall(data)
        url_list = [self.base_url.format(position_id) for position_id in id_list]
        print('Ids collected')
        return url_list

    def parse_detail_data(self, str_data):
        """Parse one detail page into a dict."""
        print('Parsing detail page')
        html = etree.HTML(str_data)
        data = {}
        data['name'] = self._first(html, '//div/span[@class="name"]/text()')
        data['salary'] = self._first(html, '//span[@class="salary"]/text()')
        temp = self._first(html, '//dd[@class="job_request"]/p[1]/span[2]/text()')
        data['city'] = temp.replace('/', '').strip() if temp else None
        data['company'] = self._first(html, '//div[@class="company"]/text()')
        temp = self._first(html, '//dd/p[1]/span[4]/text()')
        data['education'] = temp.replace('/', '').strip() if temp else None
        data['job_type'] = self._first(html, '//dd/p[1]/span[5]/text()')
        data['advantage'] = self._first(html, '//dd[@class="job-advantage"]/p/text()')
        desc_list = html.xpath('//dd[@class="job_bt"]/div/p/text()')
        data['responsibilities'] = ''.join(desc_list).replace('\xa0', '')
        return data

    def parse_detail(self, url_list):
        """Fetch and parse every detail page for one list page."""
        print('Fetching detail pages')
        data_list = []
        for url in url_list:
            str_data = self.get_page(url)
            data_list.append(self.parse_detail_data(str_data))
            time.sleep(1)  # throttle requests to reduce the chance of being blocked
        print('Detail pages done')
        return data_list

    def save_data(self, data_list):
        """Write records to lagou.json, one JSON object per line (JSON Lines)."""
        print('Saving data')
        for data in data_list:
            self.file.write(json.dumps(data, ensure_ascii=False) + '\n')

    def run(self):
        """Main crawl loop."""
        for page in range(1, 10):  # pages 1-9; Lagou's list data needs a logged-in request
            data = self.get_post_data(page)          # fetch the list JSON
            url_list = self.parse_url(data)          # build detail-page urls from the ids
            data_list = self.parse_detail(url_list)  # fetch and parse the detail pages
            self.save_data(data_list)

    def __del__(self):
        print('Data saved')
        self.file.close()  # close the output file


if __name__ == '__main__':
    lagou = Lagou()
    lagou.run()
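
The list response is JSON, so instead of pulling positionId out of the raw text with a regex it can also be decoded properly. The exact nesting of Lagou's response isn't shown above, so the sketch below (a minimal, hypothetical alternative to the regex step in parse_url) walks the decoded document recursively and collects every positionId it finds:

import json


def extract_position_ids(raw_json):
    """Collect every value stored under a 'positionId' key, at any depth.

    Only the field name is assumed (it matches the regex used above);
    nothing is assumed about the response structure itself.
    """
    ids = []

    def walk(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if key == 'positionId':
                    ids.append(value)
                else:
                    walk(value)
        elif isinstance(node, list):
            for item in node:
                walk(item)

    walk(json.loads(raw_json))
    return ids

A parse_url built on this would just be [self.base_url.format(i) for i in extract_position_ids(data)], and it fails loudly on malformed JSON instead of silently matching nothing.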

  A simple, fairly rough crawler for Lagou job-detail pages, built mainly on requests. The full data can only be crawled after logging in, so the cookie from a logged-in browser session is required; POST requests are preferable, even if only marginally safer. For reference only.
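
Hand-pasting the cookie works but goes stale quickly. A common alternative (a sketch only, not part of the script above, and it may still not substitute for a real logged-in cookie depending on how strictly Lagou gates the data) is to let requests.Session manage cookies: request the list page first so the server can set its session cookies, then reuse the same session for the Ajax POST:

import requests

LIST_URL = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
AJAX_URL = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
    'Referer': LIST_URL,
})
session.get(LIST_URL)  # the server sets its session cookies on this first request
response = session.post(AJAX_URL, data={'first': 'true', 'pn': 1, 'kd': 'python'})
print(response.status_code, response.text[:200])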

