python3 scrapy 爬取騰訊招聘


安裝scrapy不再贅述,

在控制台中輸入scrapy startproject tencent 創建爬蟲項目名字為 tencent

接着cd tencent

用pycharm打開tencent項目

構建item文件

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    """Container for one Tencent job posting scraped from hr.tencent.com."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Job title (NOTE(review): lowercase, unlike the camelCase fields below;
    # renaming would break the spider and pipeline that use this exact key)
    positionname = scrapy.Field()
    # Link to the job's detail page
    positionLink = scrapy.Field()
    # Job category (may be absent in the source table)
    positionType = scrapy.Field()
    # Number of openings
    peopleNum = scrapy.Field()
    # Work location
    workLocation = scrapy.Field()
    # Publication date
    publishTime = scrapy.Field()

  接着在spiders文件夾中新建tencentPostition.py文件,代碼如下,注釋寫得很清楚

# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem

class TencentpostitionSpider(scrapy.Spider):
    """Spider that walks the Tencent HR job listing pages and yields one
    TencentItem per posting."""
    # Spider name used by `scrapy crawl tencent`
    name = 'tencent'
    # Restrict crawling to this domain
    allowed_domains = ['tencent.com']
    # Base listing URL; the pagination offset is appended to it
    url = 'http://hr.tencent.com/position.php?&start='
    # Current pagination offset (the site shows 10 rows per page)
    offset = 0
    # First page to fetch
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Extract every job row on the page, then follow the next page.

        Uses extract_first() so a missing table cell yields None instead of
        raising IndexError, and gives positionType its documented fallback
        via the `default` argument instead of a bare `except:` (which would
        also have swallowed unrelated errors).
        """
        for row in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentItem()
            # Job title
            item["positionname"] = row.xpath("./td[1]/a/text()").extract_first()
            # Detail-page link
            item["positionLink"] = row.xpath("./td[1]/a/@href").extract_first()
            # Job category; some rows leave this cell empty
            item["positionType"] = row.xpath("./td[2]/text()").extract_first(default='空')
            # Number of openings
            item["peopleNum"] = row.xpath("./td[3]/text()").extract_first()
            # Work location
            item["workLocation"] = row.xpath("./td[4]/text()").extract_first()
            # Publication date
            item["publishTime"] = row.xpath("./td[5]/text()").extract_first()
            # Hand the populated item to the pipeline
            yield item
        # Request the next page only while more pages remain. The original
        # yielded a Request unconditionally, so after the last page it kept
        # re-requesting the same final URL on every parse call.
        if self.offset < 2620:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

  接着配置管道文件pipelines.py代碼如下

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
class TencentPipeline(object):
    """Item pipeline that appends each scraped item as one JSON line
    (JSON Lines format) to tencent.json."""

    def open_spider(self, spider):
        # Open the output file when the crawl starts rather than in
        # __init__, so the handle's lifetime matches the spider's —
        # this is the hook Scrapy provides for exactly this purpose.
        self.file = open("tencent.json", "wb")

    def process_item(self, item, spider):
        # Convert the item to a plain dict and serialize it as a single
        # JSON object per line; ensure_ascii=False keeps the Chinese
        # field values human-readable in the output file.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        # The file is opened in binary mode, so encode explicitly as UTF-8.
        self.file.write(line.encode("utf-8"))
        # Return the item so any later pipeline stage still receives it.
        return item

    def close_spider(self, spider):
        # Flush and release the output file when the spider finishes.
        self.file.close()

  接下來需要配置settings.py文件

不遵循ROBOTS規則

ROBOTSTXT_OBEY = False

  

#下載延遲
DOWNLOAD_DELAY = 3

  

#設置請求頭
DEFAULT_REQUEST_HEADERS = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

 

#交給哪個管道文件處理 文件夾.管道文件名.類名
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}

 接下來在控制台中輸入 

scrapy crawl tencent

即可爬取

源碼地址

https://github.com/ingxx/scrapy_to_tencent 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM