Crawl the cnblogs news site page by page. Start by inspecting the list page to find the "next page" link.
The relevant code:
# -*- coding: utf-8 -*-
import scrapy

from cnblogs.items import ArticleItem


class BlogsSpider(scrapy.Spider):
    name = 'blogs'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['https://news.cnblogs.com/']

    def parse(self, response):
        articleList = response.css('.content')
        for item in articleList:
            # The view count on the detail page is loaded by JS and cannot be
            # scraped there, so grab it here and pass it along via meta.
            viewcount = item.css('.view::text').extract_first()[:-3].strip()
            detailurl = item.css('.news_entry a::attr(href)').extract_first()
            detailurl = response.urljoin(detailurl)
            yield scrapy.Request(url=detailurl, callback=self.parse_detail,
                                 meta={"viewcount": viewcount})

        # Locate the "next page" link in the pager
        text = response.css('#sideleft > div.pager > a:last-child::text').extract_first().strip()
        if text == 'Next >':
            next_page = response.css('#sideleft > div.pager > a:last-child::attr(href)').extract_first()
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)

    # Parse the detail page
    def parse_detail(self, response):
        article = ArticleItem()
        article['linkurl'] = response.url
        article['title'] = response.css('#news_title a::text').extract_first()
        article['img'] = response.css('#news_content img::attr(src)').extract_first("default.png")
        article['source'] = response.css('.news_poster ::text').extract_first().strip()
        article['releasetime'] = response.css('.time::text').extract_first()[3:].strip()
        article['viewcount'] = response.meta["viewcount"]
        article['content'] = response.css('#news_body').extract_first("")
        yield article
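The spider imports ArticleItem from cnblogs.items, but items.py is not shown above. Here is a minimal sketch of what it is assumed to look like, with one field per value filled in by parse_detail:

# cnblogs/items.py (assumed definition, matching the fields used in parse_detail)
import scrapy


class ArticleItem(scrapy.Item):
    linkurl = scrapy.Field()      # URL of the detail page
    title = scrapy.Field()        # article title
    img = scrapy.Field()          # first image in the article body
    source = scrapy.Field()       # publishing source
    releasetime = scrapy.Field()  # publish time
    viewcount = scrapy.Field()    # view count passed in via meta
    content = scrapy.Field()      # raw HTML of the article body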
To write to the database, first configure the mongo connection info in settings.py:
ROBOTSTXT_OBEY = True

MONGODB_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DBNAME = 'cnblogs'
MONGO_DOCNAME = 'article'
Next, modify pipelines.py. The relevant code:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.utils.project import get_project_settings

from cnblogs.items import ArticleItem


class CnblogsPipeline(object):
    # Read the mongo connection info from settings and connect
    def __init__(self):
        settings = get_project_settings()
        host = settings['MONGODB_HOST']
        port = settings['MONGO_PORT']
        db_name = settings['MONGO_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[db_name]
        self.post = db[settings['MONGO_DOCNAME']]

    # Receive each scraped item and write it to the database
    def process_item(self, item, spider):
        article = dict(item)
        self.post.insert_one(article)
        return item
In __init__, the mongo connection info is read from the settings file and a connection to the mongo database is opened.
process_item receives each item yielded by the spider in blogs.py (from parse_detail) and writes it to the database.
Finally, uncomment the ITEM_PIPELINES setting in settings.py; if it stays commented out, the code in pipelines.py will never be called.
ITEM_PIPELINES = {
    'cnblogs.pipelines.CnblogsPipeline': 300,
}
Then run the spider from the terminal: scrapy crawl blogs
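If you prefer launching the crawl from an IDE instead of the terminal, a small launcher script also works. This is a sketch, assuming it is saved in the project root next to scrapy.cfg; the file name run.py is arbitrary:

# run.py -- optional launcher, an alternative to "scrapy crawl blogs"
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('blogs')   # spider name defined in BlogsSpider.name
process.start()          # blocks until the crawl finishes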

Once started, the crawl begins. When it finishes, open a mongo client tool: the database and collection are created with the names configured in settings.py.
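To double-check the result without a GUI client, a few lines of pymongo will do. A sketch, assuming the same host, port, and names configured above:

# check_count.py -- quick sanity check of what was written to mongo
import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
collection = client['cnblogs']['article']   # db/collection names from settings.py
print(collection.count_documents({}))       # total number of stored articles
print(collection.find_one({}, {'title': 1, 'viewcount': 1, '_id': 0}))  # sample document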


At this point, all 3,000 news articles have been downloaded, not one missing.
