Configuring settings (settings.py)
# Enable the image pipeline
ITEM_PIPELINES = {
    'mztu.pipelines.MztuImagesPipeline': 300,
}
# Default storage directory. Note: IMAGES_STORE must be set, or no images will be downloaded!
IMAGES_STORE = r"E:\study\Python\scrapy\mztu\imges"
# Skip re-downloading images fetched within the last 90 days
IMAGES_EXPIRES = 90
# Optional thumbnail generation
#IMAGES_THUMBS = {
#    'small': (50, 50),
#    'big': (270, 270),
#}
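Scrapy's ImagesPipeline depends on the Pillow library; without it the pipeline is disabled and nothing gets downloaded. To confirm that the values above are actually being picked up, here is a small sketch you can run from the project directory (it uses the same get_project_settings() call that the pipeline below relies on):

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get("IMAGES_STORE"))     # should print the directory configured above
print(settings.get("ITEM_PIPELINES"))   # should include mztu.pipelines.MztuImagesPipeline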
The spiders directory
# -*- coding: utf-8 -*-
import scrapy
from mztu.items import MztuItem


class ZimdgSpider(scrapy.Spider):
    name = 'zimdg'
    allowed_domains = ['mzitu.com']
    # Build the start URL list: one URL per listing page (pages 1-117)
    start_urls = ['http://www.mzitu.com/xinggan/page/{}/'.format(x) for x in range(1, 118)]

    def parse(self, response):
        # Extract the album links from the listing page
        set_li = response.xpath("//div[@class='postlist']/ul/li")
        for each in set_li:
            ed = each.xpath('./a/@href').extract()
            # Follow each album link for a second round of parsing
            yield scrapy.Request(ed[0], callback=self.parse_item)

    def parse_item(self, response):
        # Read the number of pages in the album from the pagination bar
        offset = int(response.xpath('//div[@class="pagenavi"]/a/span/text()')[4].extract())
        # Generate and visit every page of the album
        for url in [response.url + "/{}".format(x) for x in range(1, offset + 1)]:
            # Use a fresh item per page so each request carries its own Referer
            item = MztuItem()
            item['Referer'] = url
            # Pass the item along to the next callback via meta
            yield scrapy.Request(item['Referer'], meta={'meta_1': item}, callback=self.parse_image)

    def parse_image(self, response):
        # Retrieve the item from meta
        item = response.meta['meta_1']
        # Image URL
        imgs = response.xpath('//div[@class="main-image"]/p/a/img/@src')[0].extract()
        # Album title, used later as the directory name
        title = response.xpath('//div[@class="main-image"]/p/a/img/@alt')[0].extract()
        item["title"] = title
        item["imge_url"] = imgs
        yield item
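The spider can be launched with the usual `scrapy crawl zimdg` command. If you prefer to start it from a script, here is a minimal sketch, assuming the file lives in the project root next to scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('zimdg')   # the spider's `name` attribute
process.start()          # blocks until the crawl finishes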
items
import scrapy


class MztuItem(scrapy.Item):
    # Album title (used as the directory name)
    title = scrapy.Field()
    # Image URL
    imge_url = scrapy.Field()
    # Referer header for the image request
    Referer = scrapy.Field()
    # Path where the image ends up on disk
    image_Path = scrapy.Field()
    # Image name (currently unused)
    # nickname = scrapy.Field()
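Items behave like dicts, but only fields declared with scrapy.Field() can be assigned; a quick illustration (the values here are placeholders):

from mztu.items import MztuItem

item = MztuItem()
item['title'] = 'sample-album'                  # placeholder value
item['imge_url'] = 'http://example.com/1.jpg'   # placeholder value
print(item['title'])
# item['foo'] = 1  # would raise KeyError: 'foo' is not a declared field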
Pipelines (pipelines.py)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os
# shutil is used to move the downloaded files into their album directories
import shutil

import scrapy
# Read values from the project settings
from scrapy.utils.project import get_project_settings
# Scrapy's built-in image download pipeline
from scrapy.pipelines.images import ImagesPipeline


class MztuImagesPipeline(ImagesPipeline):
    # The IMAGES_STORE value configured in settings.py
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    # Override ImagesPipeline.get_media_requests to issue the image download request
    def get_media_requests(self, item, info):
        image_url = item["imge_url"]
        # The Referer header is what gets the request past the site's anti-hotlinking check
        yield scrapy.Request(image_url, headers={'Referer': item['Referer']})

    # Called once all image requests for the item have finished
    def item_completed(self, results, item, info):
        image_paths = [x["path"] for ok, x in results if ok]
        # Per-album directory: <IMAGES_STORE>/<title>
        img_path = os.path.join(self.IMAGES_STORE, item['title'])
        # Create the directory if it does not exist yet
        if not os.path.exists(img_path):
            os.makedirs(img_path)
        # Move the file out of the default "full" directory into the album directory
        file_name = os.path.basename(image_paths[0])
        shutil.move(os.path.join(self.IMAGES_STORE, image_paths[0]), os.path.join(img_path, file_name))
        item['image_Path'] = os.path.join(img_path, file_name)
        return item
This is where the images are sorted into separate per-album directories: the key call is shutil.move(), which moves each image out of Scrapy's default download location (the "full" directory under IMAGES_STORE) into the directory named after the album title.
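As an alternative to moving files after the fact, ImagesPipeline also lets you control where each file is written in the first place by overriding file_path(); the returned path is interpreted relative to IMAGES_STORE. A minimal sketch of that approach, assuming Scrapy 2.4 or newer (where file_path receives the item) and using a hypothetical class name:

import os
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class PerAlbumImagesPipeline(ImagesPipeline):
    # Hypothetical variant: write each image straight to
    # <IMAGES_STORE>/<title>/<file name taken from the URL>, so no shutil.move is needed.
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['imge_url'], headers={'Referer': item['Referer']})

    def file_path(self, request, response=None, info=None, *, item=None):
        file_name = os.path.basename(request.url)
        return os.path.join(item['title'], file_name)

With this variant item_completed() no longer has to create directories or move files; you would register this class in ITEM_PIPELINES instead of the one above.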