1. 創建項目
scrapy startproject qiumeimei
2. 建蜘蛛文件qiumei.py
cd qiumeimei
scrapy genspider qiumei www.qiumeimei.com
3. 考慮到只需要下載圖片,先在items.py定義字段
import scrapy


class QiumeimeiItem(scrapy.Item):
    """Item carrying one image URL from qiumeimei.com to the pipeline."""
    # define the fields for your item here like:
    # URL of the image to download (set by the spider, read by the pipeline)
    img_path = scrapy.Field()
4. 寫蜘蛛文件qiumei.py
# -*- coding: utf-8 -*-
import scrapy

from qiumeimei.items import QiumeimeiItem


class QiumeiSpider(scrapy.Spider):
    """Crawl http://www.qiumeimei.com/image, yielding one item per image,
    and follow the pagination "next" link until the last page."""

    name = 'qiumei'
    # allowed_domains = ['www.qiumeimei.com']
    start_urls = ['http://www.qiumeimei.com/image']

    def parse(self, response):
        """Extract every image URL on the page, then queue the next page."""
        # Images are lazy-loaded, so the real URL is in data-lazy-src,
        # not in src.
        img_urls = response.css('.main>p>img::attr(data-lazy-src)').extract()
        for url in img_urls:
            item = QiumeimeiItem()
            item['img_path'] = url
            yield item
        # Follow pagination; extract_first() returns None on the last page.
        next_url = response.css('.pagination a.next::attr(href)').extract_first()
        if next_url:
            yield scrapy.Request(url=next_url, callback=self.parse)
5. 管道文件pipelines.py 這里圖片是全部放在了一個文件夾里,在settings.py中定義了一個路徑,見下文第6步:
import os
from datetime import datetime

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from qiumeimei.settings import IMAGES_STORE as images_store


class QiumeimeiPipeline(ImagesPipeline):
    """Download each item's image and rename the file to a timestamp-based
    name directly under IMAGES_STORE (all images share one folder,
    see step 6 / settings.py)."""

    def get_media_requests(self, item, info):
        """Issue one download request for the item's image URL."""
        yield scrapy.Request(url=item['img_path'])

    def item_completed(self, results, item, info):
        """Rename the downloaded file to <IMAGES_STORE><timestamp>.<ext>.

        ``results`` is a list of ``(success, info)`` tuples; for a successful
        download ``info['path']`` is the file path relative to IMAGES_STORE.
        """
        # Keep only successful downloads: a failed entry holds a Failure
        # object with no 'path' key and would raise here otherwise.
        old_name_list = [x['path'] for ok, x in results if ok]
        if not old_name_list:
            # Nothing was downloaded; pass the item through unchanged.
            return item
        old_name = images_store + old_name_list[0]
        # strftime zero-pads %f, unlike slicing str(datetime.now()), which
        # silently drops the microsecond part when it is exactly 0 and would
        # produce colliding / inconsistently shaped file names.
        img_name = datetime.now().strftime('%Y%m%d%H%M%S%f')
        # Keep the original file extension from the source URL.
        img_type = item['img_path'].split('.')[-1]
        path = images_store + img_name + '.' + img_type
        print(path + ' 已下載...')
        os.rename(old_name, path)
        return item
6. 設置文件settings.py
# Identify the crawler with a desktop-browser User-Agent.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# 圖片路徑,會自動創建 — image directory, created automatically by Scrapy.
IMAGES_STORE = './images/'

# 開啟管道 — enable the renaming pipeline (priority 300).
ITEM_PIPELINES = {
    'qiumeimei.pipelines.QiumeimeiPipeline': 300,
}
已成功。