Case study:
- Requirement: crawl the news data under the Domestic section of NetEase News (news.163.com).
- Requirement analysis: when you click the Domestic link and open its page, the news items shown there turn out to be loaded dynamically. If the program requests the URL directly, the response will not contain that dynamically loaded news data. We therefore use Selenium to instantiate a browser object and issue the URL request through it, so that the dynamically loaded news data can be obtained (a quick way to verify this is sketched right after this list).
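To see the problem for yourself, you can fetch the section page once with a plain HTTP request and once through a real browser and compare the results. The snippet below is only an illustration, not part of the project code; it assumes the requests package is installed and that chromedriver is on the PATH.

import requests
from selenium import webdriver

url = 'https://news.163.com/domestic/'

static_html = requests.get(url).text   # raw HTML, no JavaScript executed
browser = webdriver.Chrome()           # assumes chromedriver is on the PATH
browser.get(url)
rendered_html = browser.page_source    # HTML after the browser ran the page's JavaScript
browser.quit()

# The dynamically loaded headlines only appear in the rendered version,
# which is typically noticeably larger than the static one.
print(len(static_html), len(rendered_html))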
How Selenium works inside Scrapy:
- After the engine hands the request for the Domestic section URL to the downloader, the downloader fetches the page, wraps the downloaded page data in a response, and returns it to the engine, which forwards the response to the Spider. The page data stored in the response the Spider receives contains none of the dynamically loaded news. To get that data, we have to intercept the response that the downloader hands back to the engine inside a downloader middleware, tamper with the page data stored in it so that it carries the dynamically loaded news, and then pass the tampered response on to the Spider for parsing (the idea is distilled in the sketch below; the full middleware is listed in the code section).
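Stripped of the boilerplate, the interception amounts to a few lines: drive the Selenium browser to the same URL, take the rendered page_source, and wrap it in a new HtmlResponse that replaces the original response. This is only a condensed sketch of the process_response method shown in full further down.

from scrapy.http import HtmlResponse

def process_response(self, request, response, spider):
    if request.url in spider.start_urls:          # only tamper with the section pages
        spider.browser.get(request.url)           # let the real browser render the JavaScript
        page_text = spider.browser.page_source    # now contains the dynamic news list
        return HtmlResponse(url=request.url, body=page_text,
                            encoding='utf-8', request=request)
    return response                               # every other response passes through untouched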
Workflow for using Selenium in Scrapy:
- Override the spider's constructor and instantiate a Selenium browser object in it (the browser object only needs to be instantiated once).
- Override the spider's closed(self, spider) method and close the browser object inside it; this method is called once when the spider finishes.
- Override the downloader middleware's process_response method so that it intercepts the response object and replaces the page data stored in the response.
- Enable the downloader middleware in the settings file.
Code implementation:
spider
import scrapy
from selenium import webdriver

from wangyiPro.items import WangyiproItem

"""
Crawl the titles and bodies of NetEase domestic and world news.
"""


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.163.com']
    start_urls = ['https://news.163.com/domestic/', 'https://news.163.com/world/']

    def __init__(self):
        super().__init__()
        options = webdriver.ChromeOptions()
        options.add_argument('--window-position=0,0')   # initial position of the Chrome window
        options.add_argument('--window-size=1080,800')  # initial size of the Chrome window
        self.browser = webdriver.Chrome(executable_path='C://xx//chromedriver.exe',
                                        chrome_options=options)

    def parse(self, response):
        div_list = response.xpath('//div[@class="ndi_main"]/div')
        for div_item in div_list:
            title = div_item.xpath('./div/div[1]/h3/a/text()').extract_first()
            new_detail_url = div_item.xpath('./div/div[1]/h3/a/@href').extract_first()
            item = WangyiproItem()
            item['title'] = title
            # request the news detail page, passing the item along via meta
            yield scrapy.Request(url=new_detail_url, callback=self.parse_detail,
                                 meta={'item': item})

    # parse the news body
    def parse_detail(self, response):
        content = response.xpath('//*[@id="endText"]//text()').extract()
        content = ''.join(content)
        item = response.meta['item']
        item['content'] = content.strip()
        yield item

    def closed(self, spider):
        self.browser.quit()
middleware
from time import sleep

from scrapy import signals
from scrapy.http import HtmlResponse


class WangyiproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # intercept the response object and tamper with it
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Pick out the responses that need tampering:
        # - the URL identifies the request, the request identifies the response
        # - spider is the running spider instance
        bro = spider.browser  # the browser object defined on the spider class
        if request.url in spider.start_urls:
            # Tamper: build a new response object (containing the dynamically
            # loaded news data) to replace the original one.
            # Selenium makes it easy to get the dynamic data.
            bro.get(request.url)
            sleep(3)
            bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            sleep(1)
            page_text = bro.page_source  # now includes the dynamically loaded content
            new_response = HtmlResponse(url=request.url, body=page_text,
                                        encoding='utf-8', request=request)
            return new_response
        else:
            # all other responses pass through unchanged
            # Must either:
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
pipeline file
import pymysql


class WangyiproPipeline(object):
    # constructor
    def __init__(self):
        self.conn = None    # database connection
        self.cursor = None  # database cursor
        self.num = 0

    # The methods below override the parent class.
    # open_spider() runs once when the spider starts.
    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='192.168.xx.xx', port=3306, user='root',
                                    password='xx', db='xx_db', charset='utf8')
        print('database pipeline opened')

    # process_item() handles every item. Because it is called many times,
    # opening and closing the connection is done in the two methods that
    # each run only once.
    def process_item(self, item, spider):
        author = item['title']
        content = item['content']
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute('insert into qiubai values(%s,%s)', (author, content))
            self.conn.commit()
        except Exception as e:
            print(e, content[:20])
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print('database pipeline closed')
        self.cursor.close()
        self.conn.close()
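The INSERT above assumes a two-column table named qiubai already exists in the configured database. Its schema is not shown in the project, so the snippet below only sketches one plausible layout; the column names and types are assumptions, and only the table name and the number of columns follow from the INSERT statement.

# One-off helper to create the table the pipeline writes to.
# Column names and types are assumptions inferred from the INSERT above.
import pymysql

conn = pymysql.Connect(host='192.168.xx.xx', port=3306, user='root',
                       password='xx', db='xx_db', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS qiubai (
            title   VARCHAR(255),
            content TEXT
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()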
items file
import scrapy


class WangyiproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
settings configuration
# disguise the request's User-Agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False  # ignore the robots protocol

# only show log messages of the given level
LOG_LEVEL = 'ERROR'

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}
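With the middleware and pipeline enabled, the spider can be started from the project root with scrapy crawl wangyi. If you prefer launching it from an IDE, a small runner script works as well; this is an optional convenience, not part of the original project.

# Optional runner script (saved next to scrapy.cfg); equivalent to running
# "scrapy crawl wangyi" from the project root.
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'wangyi'])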