安裝scrapy不再贅述,
在控制台中輸入scrapy startproject tencent 創建爬蟲項目名字為 tencent
接着cd tencent
用pycharm打開tencent項目
構建item文件
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class TencentItem(scrapy.Item):
    """Container for one Tencent HR job posting scraped from the listing page."""
    # Position title
    positionname = scrapy.Field()
    # Link to the posting's detail page
    positionLink = scrapy.Field()
    # Position category
    positionType = scrapy.Field()
    # Number of people to hire
    peopleNum = scrapy.Field()
    # Work location
    workLocation = scrapy.Field()
    # Publish date
    publishTime = scrapy.Field()
接着在spiders文件夾中新建tencentPostition.py文件,代碼如下,注釋寫的很清楚
# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem


class TencentpostitionSpider(scrapy.Spider):
    """Scrapes job postings from the Tencent HR listing pages, one page at a time."""
    # Spider name, used as `scrapy crawl tencent`
    name = 'tencent'
    # Only follow links within this domain
    allowed_domains = ['tencent.com']
    # Listing URL; the paging offset is appended to it
    url = 'http://hr.tencent.com/position.php?&start='
    # Current paging offset (the site shows 10 postings per page)
    offset = 0
    # First page to request
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Yield one TencentItem per posting row, then queue the next page.

        Rows alternate between the 'even' and 'odd' CSS classes, so both are
        matched. extract_first() is used instead of extract()[0] so a missing
        cell yields None (or a default) rather than raising IndexError; the
        original bare `except:` around the category cell is thereby removed.
        """
        for row in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentItem()
            # Position title
            item["positionname"] = row.xpath("./td[1]/a/text()").extract_first()
            # Link to the posting's detail page
            item["positionLink"] = row.xpath("./td[1]/a/@href").extract_first()
            # Position category; some rows leave this cell empty,
            # so fall back to the original placeholder instead of crashing.
            item["positionType"] = row.xpath("./td[2]/text()").extract_first(default='空')
            # Number of people to hire
            item["peopleNum"] = row.xpath("./td[3]/text()").extract_first()
            # Work location
            item["workLocation"] = row.xpath("./td[4]/text()").extract_first()
            # Publish date
            item["publishTime"] = row.xpath("./td[5]/text()").extract_first()
            # Hand the populated item to the item pipeline.
            yield item
        # Queue the next listing page until the last offset is reached.
        if self.offset < 2620:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
接着配置管道文件pipelines.py代碼如下
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class TencentPipeline(object):
    """Writes each scraped item to tencent.json, one JSON object per line."""

    def open_spider(self, spider):
        # Open the output file when the spider starts (standard Scrapy hook);
        # doing this here instead of __init__ avoids truncating tencent.json
        # whenever the pipeline object is merely instantiated.
        self.fileName = open("tencent.json", "wb")

    def process_item(self, item, spider):
        # Serialize the item as UTF-8 JSON; ensure_ascii=False keeps the
        # Chinese field values human-readable instead of \uXXXX escapes.
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.fileName.write(text.encode("utf-8"))
        # Return the item so any later pipelines can keep processing it.
        return item

    def close_spider(self, spider):
        # Close (and flush) the output file when the spider finishes.
        self.fileName.close()
接下來需要配置settings.py文件
不遵循ROBOTS規則
# Ignore robots.txt so all listing pages can be fetched.
ROBOTSTXT_OBEY = False
# 下載延遲: delay in seconds between requests (the paste had fused this
# setting into its comment line, leaving it commented out and inert).
DOWNLOAD_DELAY = 3
# 設置請求頭: default headers sent with every request.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
# Item pipeline to use: '<project>.pipelines.<ClassName>': priority
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}
接下來在控制台中輸入
scrapy crawl tencent
即可爬取
源碼地址
https://github.com/ingxx/scrapy_to_tencent
