Scraping BOSS直聘 Job Listings with Python's Scrapy Framework
1. Creating the Project
Install the library:
pip install scrapy -i https://pypi.tuna.tsinghua.edu.cn/simple
cd into the directory where you want the crawler project to live and run:
scrapy startproject project_name
After the project is created successfully, you get the directory structure shown below.
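Assuming the project is named boss (matching the settings used later in this post), the generated layout is the standard Scrapy one:

boss/
    scrapy.cfg          # deploy configuration
    boss/
        __init__.py
        items.py        # item definitions
        middlewares.py  # spider/downloader middlewares
        pipelines.py    # item pipelines
        settings.py     # project settings
        spiders/        # spiders go here
            __init__.py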


As the prompt suggests, cd into the new project directory and run: scrapy genspider spider_name domain
A new spider file is now created inside the project's spiders directory:
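Running scrapy genspider bosspq zhipin.com (the names used in the rest of this post) produces a skeleton roughly like the following; the exact template varies slightly across Scrapy versions:

import scrapy


class BosspqSpider(scrapy.Spider):
    name = "bosspq"
    allowed_domains = ["zhipin.com"]
    start_urls = ["http://zhipin.com/"]

    def parse(self, response):
        pass  # parsing logic goes here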

You can start the spider with the command: scrapy crawl spider_name
In real development, however, launching it this way is inconvenient to debug; it is better to start the spider from a script.
Create a main.py file in the project and run that file instead.

The relevant code is below:
from scrapy import cmdline
cmdline.execute("scrapy crawl bosspq".split())  # substitute your own spider name
2. Project Analysis
1. From the URL you can see that changing the two page parameters is all it takes to move between pages (see the sample URLs after this list).

2. Viewing the page source you can see a long cookie string, which we can later copy directly (without sending the cookie you will not get the real page).

3. Inspect the page source to find the tag that contains the link to each job-detail page.
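For example, with the base URL used in the spider below, pages 1 and 2 are requested as:

https://www.zhipin.com/c101270100/?query=python&page=1&ka=page-1
https://www.zhipin.com/c101270100/?query=python&page=2&ka=page-2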

3. Code Implementation
1. Directory structure: the standard layout generated in section 1, plus the bosspq.py spider under spiders/ and a top-level main.py.

2.bosspq.py
# -*- coding: utf-8 -*-
import time

import scrapy

from boss.items import BossItem


class BosspqSpider(scrapy.Spider):
    name = "bosspq"  # spider name
    allowed_domains = ["zhipin.com"]
    base_url = "https://www.zhipin.com/c101270100/?query=python&page=%s&ka=page-%s"

    # build the list of start URLs (the two page parameters move together)
    url_list = []
    for i in range(1, 3):
        url_list.append(base_url % (i, i))
    start_urls = url_list

    # the cookie string (paste a fresh value copied from your browser)
    cookie_list = "lastCity=101270100; __c=1577149081; __g=-; __l=l=%2Fwww.zhipin.com%2Fchengdu%2F&r=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DVdOvDbkv31yb5VBBMrEk9NSaHuvhbawb1E3gS5qFjgVlJpFAGeLGRoy0IDNK4oMk%26wd%3D%26eqid%3Dfeb3956900384054000000025e01628b&friend_source=0&friend_source=0; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1577115314,1577115319,1577149082,1577155616; __a=89188227.1575600840.1577115310.1577149081.127.11.24.37; __zp_stoken__=cdcfOUiphacBlF%2BS%2F3XnCbXfVxOlbboK5u4MjFQnYZqQ%2B3y3Du3Rs7ZOZLkVuqG42JoUkPzRRXC1PGT2XxFnW37wkM24kGa%2BuQg8ZN54Rh3dKsEtzTbe9GjieJtE47n7VmCN; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1577175217"
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'Cookie': cookie_list,
            'Referer': 'https://www.zhipin.com/chengdu/',
        }
    }

    def parse(self, response):
        print(response.text)
        print("*" * 40)
        # each job card sits in div.info-primary > h3; follow its link to the detail page
        for h3 in response.xpath("//div[@class='info-primary']/h3"):
            url = response.urljoin(h3.xpath('./a/@href').get())
            print(url)
            yield scrapy.Request(url, callback=self.boss_list)

    def boss_list(self, response):
        item = BossItem()
        item['job'] = response.xpath('//*[@id="main"]/div[1]/div/div/div[2]/div[2]/h1/text()').get()
        item['wage'] = response.xpath('//*[@id="main"]/div[1]/div/div/div[2]/div[2]/span/text()').get().strip()
        item['name'] = response.xpath('//*[@id="main"]/div[3]/div/div[1]/div[2]/div/a[2]/text()').get().strip()
        item['job_xq'] = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div/text()').get().strip()
        # crude crawl delay (this blocks the reactor; DOWNLOAD_DELAY in settings.py is the cleaner option)
        time.sleep(1)
        yield item
Note: to use this program you must copy a fresh cookie from your browser and paste it into cookie_list.
3.items.py
import scrapy


class BossItem(scrapy.Item):
    # job title
    job = scrapy.Field()
    # salary
    wage = scrapy.Field()
    # company name
    name = scrapy.Field()
    # job requirements
    job_xq = scrapy.Field()
4.main.py
from scrapy import cmdline

cmdline.execute("scrapy crawl bosspq".split())
5.middlewares.py
import random


# downloader middleware that attaches a random User-Agent to every request
class UserAgentDownloadMiddleWare(object):
    # pool of User-Agent strings to pick from
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; pl-PL; rv:1.0.1) Gecko/20021111 Chimera/0.6',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418.8 (KHTML, like Gecko, Safari) Cheshire/1.0.UNOFFICIAL',
        'Mozilla/5.0 (X11; U; Linux i686; nl; rv:1.8.1b2) Gecko/20060821 BonEcho/2.0b2 (Debian-1.99+2.0b2+dfsg-1)'
    ]

    def process_request(self, request, spider):
        # pick a random User-Agent for this request
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
6.pipelines.py
from pymongo import MongoClient


class BossPipeline(object):
    # establish the MongoDB connection
    def __init__(self, databaseIp='127.0.0.1', databasePort=27017, mongodbName='mydb'):
        client = MongoClient(databaseIp, databasePort)
        # select the mydb database
        self.db = client[mongodbName]

    def process_item(self, item, spider):
        postItem = dict(item)  # convert the item to a plain dict
        self.db.scrapy.insert_one(postItem)  # insert one record into the scrapy collection
        return item  # returning the item lets Scrapy keep logging it; optional
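To quickly confirm that items actually reached MongoDB, you can query the collection directly. A minimal sketch, assuming a local mongod and the mydb/scrapy names used above:

from pymongo import MongoClient

# connect to the same local MongoDB instance the pipeline writes to
client = MongoClient('127.0.0.1', 27017)

# print a handful of the stored job records
for doc in client['mydb']['scrapy'].find().limit(5):
    print(doc)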
7.settings.py
BOT_NAME = 'boss'

SPIDER_MODULES = ['boss.spiders']
NEWSPIDER_MODULE = 'boss.spiders'

# do not obey robots.txt
ROBOTSTXT_OBEY = False

# cookie setting: disable Scrapy's cookie middleware so the Cookie header
# set in the spider's DEFAULT_REQUEST_HEADERS is sent untouched
COOKIES_ENABLED = False

# enable the random User-Agent middleware
DOWNLOADER_MIDDLEWARES = {
    'boss.middlewares.UserAgentDownloadMiddleWare': 543,
}

# enable the MongoDB pipeline
ITEM_PIPELINES = {
    'boss.pipelines.BossPipeline': 300,
}
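As a cleaner alternative to the time.sleep(1) call inside the spider, the throttling below could be added here as well; these are standard Scrapy settings, not part of the original project:

# wait roughly one second between requests, with random jitter applied
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True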
Note: see the comments in the code for the detailed workflow.
