Crawling all the novels on the Dingdian novel site (頂點小說網) with Python's Scrapy


For practice with Python's Scrapy framework, I crawled the detailed information of every novel on the Dingdian novel site.

First, a look at how the pages are structured:

The td cells inside each tr tag hold the information we want to scrape.
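
These row XPaths are easy to confirm interactively before writing the spider, for example with scrapy shell 'https://www.x23us.com/class/5_1.html' (that URL is just one of the category listing pages). Inside the shell, something along these lines should print one title and link per table row:

# run inside: scrapy shell 'https://www.x23us.com/class/5_1.html'
# the shell provides the `response` object
for row in response.xpath('//tr'):
    novelurl = row.xpath('td[1]/a/@href').extract_first()   # link to the novel's page
    name = row.xpath('td[1]/a[2]/text()').extract_first()   # novel title
    if novelurl:
        print(name, novelurl)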

Below is the second-level page we also want to crawl, which contains each novel's summary information:

 

Here is the code:

mydingdian.py

import scrapy
from scrapy.http import Request
from ..items import DingdianItem


class MydingdianSpider(scrapy.Spider):
    name = 'mydingdian'
    allowed_domains = ['www.x23us.com']           # domain only, no trailing slash
    start_url = 'https://www.x23us.com/class/'    # base of every category listing URL
    starturl = '.html'                            # suffix shared by all listing URLs


    def start_requests(self):
        # category ids run from 1 to 10; only category 5 is crawled here
        # for i in range(1, 11):
        for i in range(5, 6):
            url = self.start_url + str(i) + '_1' + self.starturl
            yield Request(url, self.parse)

    def parse(self, response):
        baseurl = response.url  # actual URL of this category's first listing page
        # the 14th link in the pager holds the highest page number for this category
        max_num = response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()
        baseurl = baseurl[:-7]  # strip the trailing "_1.html"

        for num in range(1, int(max_num) + 1):
            newurl = baseurl + '_' + str(num) + self.starturl
            print(newurl)
            # Scrapy de-duplicates request URLs (RFPDupeFilter). dont_filter=True exempts
            # this URL from the dupe filter, so page 1 (already fetched above) is crawled
            # as well; without it the first page's rows would be skipped.
            yield Request(newurl, dont_filter=True, callback=self.get_name)  # hand each listing page to get_name

    def get_name(self, response):
        for nameinfo in response.xpath('//tr'):
            novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()      # link to the novel's page
            name = nameinfo.xpath('td[1]/a[2]/text()').extract_first()      # novel title
            newchapter = nameinfo.xpath('td[2]/a/text()').extract_first()   # latest chapter
            date = nameinfo.xpath('td[5]/text()').extract_first()           # last update date
            author = nameinfo.xpath('td[3]/text()').extract_first()         # author
            serialstatus = nameinfo.xpath('td[6]/text()').extract_first()   # serialization status
            serialsize = nameinfo.xpath('td[4]/text()').extract_first()     # size (word count)
            if novelurl:  # skip header rows and rows without a link
                item = DingdianItem()  # a fresh item per row, so yielded items do not share state
                item['novel_name'] = name
                item['author'] = author
                item['novelurl'] = novelurl
                item['serialstatus'] = serialstatus
                item['serialsize'] = serialsize
                item['date'] = date
                item['newchapter'] = newchapter

                print('Novel title:', item['novel_name'])
                print('Author:', item['author'])
                print('Novel URL:', item['novelurl'])
                print('Status:', item['serialstatus'])
                print('Size:', item['serialsize'])
                print('Last updated:', item['date'])
                print('Latest chapter:', item['newchapter'])
                print('====' * 5)

                # to also crawl each novel's detail page, yield a Request to get_novelcontent (kept below) instead:
                # yield Request(novelurl, dont_filter=True, callback=self.get_novelcontent, meta={'item': item})
                yield item

    # get_novelcontent parses the second-level detail page; it is kept here for reference
    # and only runs if the commented-out Request above is enabled.
    '''
    def get_novelcontent(self, response):
        item = response.meta['item']
        serialnumber = response.xpath('//tr[2]/td[2]/text()').extract_first()       # word count
        category = response.xpath('//tr[1]/td[1]/a/text()').extract_first()         # category
        collect_num_total = response.xpath('//tr[2]/td[1]/text()').extract_first()  # total bookmarks
        click_num_total = response.xpath('//tr[3]/td[1]/text()').extract_first()    # total clicks
        novel_breif = response.xpath('//dd[2]/p[2]').extract_first()                # synopsis

        item['serialnumber'] = serialnumber
        item['category'] = category
        item['collect_num_total'] = collect_num_total
        item['click_num_total'] = click_num_total
        item['novel_breif'] = novel_breif

        yield item
    '''
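
The spider is normally launched from the project root with scrapy crawl mydingdian. If you prefer starting it from a script (handy for debugging in an IDE), a small launcher like the sketch below also works; the file name run_spider.py is arbitrary:

# run_spider.py: optional launcher; the usual way is `scrapy crawl mydingdian`
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load the project's settings.py so pipelines and middlewares are picked up
process = CrawlerProcess(get_project_settings())
process.crawl('mydingdian')   # spider name as defined in MydingdianSpider.name
process.start()               # blocks until the crawl finishes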

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DingdianItem(scrapy.Item):
    # fields filled in from the category listing pages
    novel_name = scrapy.Field()         # novel title
    author = scrapy.Field()             # author
    novelurl = scrapy.Field()           # link to the novel's page
    serialstatus = scrapy.Field()       # serialization status
    serialsize = scrapy.Field()         # size (word count)
    date = scrapy.Field()               # last update date
    newchapter = scrapy.Field()         # latest chapter

    # fields filled in from the second-level detail page (get_novelcontent)
    serialnumber = scrapy.Field()       # word count
    category = scrapy.Field()           # category
    collect_num_total = scrapy.Field()  # total bookmarks
    click_num_total = scrapy.Field()    # total clicks
    novel_breif = scrapy.Field()        # synopsis

The pipeline that inserts the data into the database: iopipelines.py

from 爬蟲大全.dingdian.dingdian import dbutil

# Custom pipeline: save each fully scraped item into a MySQL database
class DingdianPipeline(object):
    def process_item(self, item, spider):
        dbu = dbutil.MYSQLdbUtil()
        dbu.getConnection()  # open the connection (start of the transaction)

        try:
            sql = "insert into ebook (novel_name,author,novelurl,serialstatus,serialsize,ebookdate,newchapter) values (%s,%s,%s,%s,%s,%s,%s)"
            dbu.execute(sql, (item['novel_name'], item['author'], item['novelurl'],
                              item['serialstatus'], item['serialsize'],
                              item['date'], item['newchapter']), True)
            dbu.commit()
            print('Inserted into the database successfully!')
        except Exception:
            dbu.rollback()
            dbu.commit()  # the helper still expects a commit after the rollback
        finally:
            dbu.close()
        return item
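
The dbutil helper imported above is not included in this post. For readers without it, a roughly equivalent pipeline written directly against pymysql might look like the sketch below; the connection parameters, database name and column types are assumptions derived from the INSERT statement above, and the table is created if it is missing:

# iopipelines.py (alternative): a sketch that talks to MySQL through pymysql directly.
# Connection parameters, the database name and the column types are assumptions.
import pymysql


class DingdianMySQLPipeline(object):
    def open_spider(self, spider):
        # one connection for the whole crawl
        self.conn = pymysql.connect(host='localhost', user='root', password='root',
                                    db='test', charset='utf8mb4')
        self.cursor = self.conn.cursor()
        # create the target table if it does not exist yet (types are guesses)
        self.cursor.execute(
            "CREATE TABLE IF NOT EXISTS ebook ("
            " novel_name VARCHAR(255), author VARCHAR(255), novelurl VARCHAR(255),"
            " serialstatus VARCHAR(64), serialsize VARCHAR(64),"
            " ebookdate VARCHAR(64), newchapter VARCHAR(255))")
        self.conn.commit()

    def process_item(self, item, spider):
        sql = ("INSERT INTO ebook (novel_name,author,novelurl,serialstatus,"
               "serialsize,ebookdate,newchapter) VALUES (%s,%s,%s,%s,%s,%s,%s)")
        try:
            self.cursor.execute(sql, (item['novel_name'], item['author'], item['novelurl'],
                                      item['serialstatus'], item['serialsize'],
                                      item['date'], item['newchapter']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

To use this version, point ITEM_PIPELINES at this class instead of the dbutil-based one. Opening one connection for the whole crawl (open_spider/close_spider) rather than one per item is also a common way to avoid connection errors when the crawl runs fast.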

 

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for dingdian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dingdian'

SPIDER_MODULES = ['dingdian.spiders']
NEWSPIDER_MODULE = 'dingdian.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dingdian (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 2

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'dingdian.middlewares.DingdianSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'dingdian.middlewares.DingdianDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'dingdian.rotate_useragent.RotateUserAgentMiddleware' :400
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'dingdian.pipelines.DingdianPipeline': 300,
    'dingdian.iopipelines.DingdianPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

LOG_LEVEL='INFO'
LOG_FILE='dingdian.log'
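
The DOWNLOADER_MIDDLEWARES section enables dingdian.rotate_useragent.RotateUserAgentMiddleware, which is not listed in this post. A minimal sketch of such a middleware, choosing a random User-Agent for every request, might look like this (the user-agent strings are just examples, not the author's original file):

# rotate_useragent.py: minimal sketch of a user-agent rotating middleware
import random

class RotateUserAgentMiddleware(object):
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
    ]

    def process_request(self, request, spider):
        # pick a random User-Agent for every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)
        return None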

 

While inserting data into the database I ran into

pymysql.err.InterfaceError: (0, '')

and it took a lot of searching to resolve. The cause is that Scrapy stores items asynchronously, so the inserts were simply coming too fast.

The fix is to slow the crawl down by setting DOWNLOAD_DELAY = 2 in settings.py.
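
Alternatively, the AutoThrottle extension (commented out in the settings above) can slow the crawl automatically by adapting the delay to the server's responses:

# settings.py: another way to throttle, instead of a fixed DOWNLOAD_DELAY
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0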

 

 

The complete code is available on GitHub:

tyutltf/dingdianbook (crawls all novel information from the Dingdian novel site): https://github.com/tyutltf/dingdianbook

 

