With some free time on my hands, I used Python's Scrapy framework for practice and crawled the detailed information of every novel on 頂點小說網 (www.x23us.com).
First, take a look at how the page is structured:
The td cells inside each tr tag are the information we want to crawl.
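Before writing the spider it is worth checking those XPaths interactively with scrapy shell. A quick sketch (the URL is the first page of category 5, the same one the spider below starts from; the column positions match the ones used later in get_name):

# scrapy shell "https://www.x23us.com/class/5_1.html"
for row in response.xpath('//tr'):
    print(row.xpath('td[1]/a/@href').extract_first(),       # novel url
          row.xpath('td[1]/a[2]/text()').extract_first(),   # novel name
          row.xpath('td[3]/text()').extract_first())        # author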
Below is the second-level page we also want to crawl, with the novel's introduction and details:
Now the code:
mydingdian.py
import scrapy
from scrapy.http import Request
from ..items import DingdianItem


class MydingdianSpider(scrapy.Spider):
    name = 'mydingdian'
    allowed_domains = ['www.x23us.com']            # domain only, no trailing slash
    start_url = ['https://www.x23us.com/class/']   # base of the category list pages
    starturl = ['.html']

    def start_requests(self):
        # all categories would be range(1, 11); only category 5 for now
        for i in range(5, 6):
            url_con = str(i) + '_1'                # e.g. "5_1"
            url1 = self.start_url + list(url_con) + self.starturl
            url = ''
            for j in url1:
                url += j
            yield Request(url, self.parse)

    def parse(self, response):
        baseurl = response.url  # the real url of this category page
        # highest page number shown in the pager of the current page
        max_num = response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()
        baseurl = baseurl[:-7]  # strip the trailing "_1.html", keeping the category prefix
        for num in range(1, int(max_num) + 1):
            newurl1 = list(baseurl) + list("_" + str(num)) + self.starturl
            newurl = ''
            for j in newurl1:
                newurl += j
            print(newurl)
            # dont_filter matters here: Scrapy deduplicates request URLs (RFPDupeFilter),
            # and without dont_filter the first page would be dropped as a duplicate.
            yield Request(newurl, dont_filter=True, callback=self.get_name)  # hand each list page to get_name

    def get_name(self, response):
        for nameinfo in response.xpath('//tr'):
            novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()      # novel url
            name = nameinfo.xpath('td[1]/a[2]/text()').extract_first()      # novel name
            newchapter = nameinfo.xpath('td[2]/a/text()').extract_first()   # latest chapter
            date = nameinfo.xpath('td[5]/text()').extract_first()           # update date
            author = nameinfo.xpath('td[3]/text()').extract_first()         # author
            serialstatus = nameinfo.xpath('td[6]/text()').extract_first()   # status
            serialsize = nameinfo.xpath('td[4]/text()').extract_first()     # size
            if novelurl:
                item = DingdianItem()                # a fresh item per table row
                item['novel_name'] = name
                item['author'] = author
                item['novelurl'] = novelurl
                item['serialstatus'] = serialstatus
                item['serialsize'] = serialsize
                item['date'] = date
                item['newchapter'] = newchapter
                print('Novel name:', item['novel_name'])
                print('Author:', item['author'])
                print('Novel url:', item['novelurl'])
                print('Status:', item['serialstatus'])
                print('Size:', item['serialsize'])
                print('Update date:', item['date'])
                print('Latest chapter:', item['newchapter'])
                print('====' * 5)
                # yield Request(novelurl, dont_filter=True, callback=self.get_novelcontent, meta={'item': item})
                yield item

    '''
    def get_novelcontent(self, response):
        # detail-page parsing, currently disabled
        item = response.meta['item']
        novelurl = response.url
        serialnumber = response.xpath('//tr[2]/td[2]/text()').extract_first()       # word count
        category = response.xpath('//tr[1]/td[1]/a/text()').extract_first()         # category
        collect_num_total = response.xpath('//tr[2]/td[1]/text()').extract_first()  # total bookmarks
        click_num_total = response.xpath('//tr[3]/td[1]/text()').extract_first()    # total clicks
        novel_breif = response.xpath('//dd[2]/p[2]').extract_first()                # novel intro
        item['serialnumber'] = serialnumber
        item['category'] = category
        item['collect_num_total'] = collect_num_total
        item['click_num_total'] = click_num_total
        item['novel_breif'] = novel_breif
        yield item
    '''
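The list-page URLs assembled in start_requests and parse are simply <category>_<page>.html appended to the class/ base. The character-by-character list concatenation above works, but the same thing reads more clearly with ordinary string formatting; a minimal equivalent sketch:

# equivalent, more readable URL construction
BASE = 'https://www.x23us.com/class/'

def category_page_url(category_id, page):
    # category_page_url(5, 1) -> 'https://www.x23us.com/class/5_1.html'
    return '{}{}_{}.html'.format(BASE, category_id, page)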
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DingdianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    novel_name = scrapy.Field()         # novel name
    author = scrapy.Field()             # author
    novelurl = scrapy.Field()           # novel url
    serialstatus = scrapy.Field()       # status
    serialsize = scrapy.Field()         # size
    date = scrapy.Field()               # update date
    newchapter = scrapy.Field()         # latest chapter

    serialnumber = scrapy.Field()       # word count
    category = scrapy.Field()           # category
    collect_num_total = scrapy.Field()  # total bookmarks
    click_num_total = scrapy.Field()    # total clicks
    novel_breif = scrapy.Field()        # novel intro
The pipeline that inserts into the database, iopipelines.py
from 爬蟲大全.dingdian.dingdian import dbutil


# Custom pipeline that saves each crawled item into MySQL
class DingdianPipeline(object):
    def process_item(self, item, spider):
        dbu = dbutil.MYSQLdbUtil()
        dbu.getConnection()  # open the connection and start a transaction
        try:
            sql = ("insert into ebook "
                   "(novel_name,author,novelurl,serialstatus,serialsize,ebookdate,newchapter)"
                   "values(%s,%s,%s,%s,%s,%s,%s)")
            dbu.execute(sql, (item['novel_name'], item['author'], item['novelurl'],
                              item['serialstatus'], item['serialsize'],
                              item['date'], item['newchapter']), True)
            dbu.commit()
            print('Inserted into MySQL!')
        except Exception:
            dbu.rollback()
            dbu.commit()  # commit after the rollback
        finally:
            dbu.close()
        return item
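The pipeline relies on a custom dbutil helper (not shown in this post) and on an ebook table whose columns match the INSERT statement. The table layout below is only an assumption inferred from that SQL; the column types and connection parameters are placeholders to adjust for your own setup:

# create_ebook_table.py -- one-off setup script; schema inferred from the INSERT above
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS ebook (
    id INT AUTO_INCREMENT PRIMARY KEY,
    novel_name   VARCHAR(255),
    author       VARCHAR(255),
    novelurl     VARCHAR(255),
    serialstatus VARCHAR(64),
    serialsize   VARCHAR(64),
    ebookdate    VARCHAR(64),
    newchapter   VARCHAR(255)
) DEFAULT CHARSET=utf8mb4;
"""

# connection parameters are placeholders -- substitute your own
conn = pymysql.connect(host='localhost', user='root', password='******',
                       database='scrapydb', charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        cursor.execute(DDL)
    conn.commit()
finally:
    conn.close()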
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for dingdian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dingdian'

SPIDER_MODULES = ['dingdian.spiders']
NEWSPIDER_MODULE = 'dingdian.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dingdian (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 2

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'dingdian.middlewares.DingdianSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'dingdian.middlewares.DingdianDownloaderMiddleware': 543,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'dingdian.rotate_useragent.RotateUserAgentMiddleware': 400,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'dingdian.pipelines.DingdianPipeline': 300,
    #'dingdian.iopipelines.DingdianPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

LOG_LEVEL = 'INFO'
LOG_FILE = 'dingdian.log'
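DOWNLOADER_MIDDLEWARES above disables Scrapy's built-in UserAgentMiddleware and enables dingdian.rotate_useragent.RotateUserAgentMiddleware, which is not listed in this post. A minimal sketch of what such a rotating middleware usually looks like (the class name matches the setting; the user-agent strings are placeholders to fill in yourself):

# rotate_useragent.py -- minimal user-agent rotation sketch
import random


class RotateUserAgentMiddleware(object):
    # a few desktop browser user-agent strings; extend the list as needed
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
    ]

    def process_request(self, request, spider):
        # attach a random user-agent to every outgoing request
        request.headers.setdefault('User-Agent', random.choice(self.user_agent_list))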
While inserting into the database I ran into
pymysql.err.InterfaceError: (0, ''), and it took a long time of searching to sort out.
It happens because Scrapy stores items asynchronously and crawls faster than the database connection can keep up with.
Fix: simply slow the crawl down, e.g. set DOWNLOAD_DELAY = 2 in settings.py.
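In settings.py that amounts to something like the snippet below. Note that in the settings file shown above both pipelines are commented out in ITEM_PIPELINES, so the MySQL pipeline also has to be enabled before anything reaches the database (the delay value is whatever proves slow enough for your connection):

# settings.py -- slow the crawl down and enable the MySQL pipeline
DOWNLOAD_DELAY = 2        # seconds between requests to the same site
CONCURRENT_REQUESTS = 2   # keep concurrency low as well

ITEM_PIPELINES = {
    'dingdian.iopipelines.DingdianPipeline': 301,
}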
The full code is on GitHub:
tyutltf/dingdianbook: crawl the details of all novels on 頂點小說網 — https://github.com/tyutltf/dingdianbook