Scrapy爬取妹子圖保存到不同目錄下


 

在 settings.py 中進行如下設置

# Enable the image pipeline (the class name must match the one defined in pipelines.py)
ITEM_PIPELINES = {
   'mztu.pipelines.ImagesPipelinse': 300,
}
# Root download directory -- REQUIRED when downloading images.
# Use a raw string so the Windows backslashes are never treated as
# escape sequences (the original non-raw string only worked because
# "\s", "\P", "\m", "\i" happen to be invalid escapes).
IMAGES_STORE = r"E:\study\Python\scrapy\mztu\imges"
# Skip re-downloading images fetched within the last 90 days
IMAGES_EXPIRES = 90
# Thumbnail generation (disabled)
#IMAGES_THUMBS = {
#    'small': (50, 50),
#    'big': (270, 270),
#}

spider 目錄下的爬蟲代碼

# -*- coding: utf-8 -*-
import scrapy
from mztu.items import MztuItem

class ZimdgSpider(scrapy.Spider):
    """Crawl mzitu.com gallery list pages, then every photo page of each gallery."""
    name = 'zimdg'
    allowed_domains = ['mzitu.com']
    # List pages run from /page/1/ upward; the original range(118) also
    # produced the non-existent /page/0/, so start at 1 (118 pages total).
    start_urls = ['http://www.mzitu.com/xinggan/page/{}/'.format(x) for x in range(1, 119)]

    def parse(self, response):
        """Extract each gallery link from a list page and follow it."""
        for entry in response.xpath("//div[@class='postlist']/ul/li"):
            links = entry.xpath('./a/@href').extract()
            # Guard against <li> entries without a link (original indexed
            # links[0] unconditionally and could raise IndexError)
            if links:
                yield scrapy.Request(links[0], callback=self.parse_item)

    def parse_item(self, response):
        """Read a gallery's page count and request every single photo page.

        BUG FIX: the original code created ONE MztuItem before the loop and
        mutated it on every iteration while also passing it through `meta`.
        Since meta holds a reference (not a copy), every downstream callback
        saw the same item with the LAST page's 'Referer'.  A fresh item is
        now built per request.
        """
        # The 5th <span> inside the pager holds the last page number
        # (assumes the site's pager layout -- TODO confirm against live HTML)
        offset = int(response.xpath('//div[@class="pagenavi"]/a/span/text()')[4].extract())
        for page in range(1, offset + 1):
            item = MztuItem()
            item['Referer'] = "{}/{}".format(response.url, page)
            # Carry the item along through the request meta
            yield scrapy.Request(item['Referer'], meta={'meta_1': item}, callback=self.parse_ponse)

    def parse_ponse(self, response):
        """Extract the image URL and gallery title from a single photo page."""
        item = response.meta['meta_1']
        # Image source URL to be downloaded by the pipeline
        item["imge_url"] = response.xpath('//div[@class="main-image"]/p/a/img/@src')[0].extract()
        # Gallery title, used by the pipeline as the per-gallery directory name
        item["title"] = response.xpath('//div[@class="main-image"]/p/a/img/@alt')[0].extract()
        yield item
        # for i in url:

    def parse_ponse(self,response):
        #獲取itme資源
        itme = response.meta['meta_1']
        #獲取圖片地址
        imgs = response.xpath('//div[@class="main-image"]/p/a/img/@src')[0].extract()
        #獲取圖片目錄
        title = response.xpath('//div[@class="main-image"]/p/a/img/@alt')[0].extract()
        itme["title"]= title
        itme["imge_url"]= imgs
        #itme["nickname"] = itme["Referer"][itme["Referer"].rfind("/"):]+itme["imge_url"][itme["imge_url"].rfind('/')+1:itme["imge_url"].rfind('.')]
        #itme["nickname"] = itme["imge_url"][itme["imge_url"].rfind('/')+1:itme["imge_url"].rfind('.')]
        yield itme

items.py 中定義的數據結構

import scrapy


class MztuItem(scrapy.Item):
    # Gallery title; used as the directory name when saving images
    title = scrapy.Field()
    # URL of the image to download
    imge_url = scrapy.Field()
    # Referer header value (the gallery page the image belongs to);
    # required by the pipeline to get past the site's hotlink protection
    Referer = scrapy.Field()

    # Final path of the image after the pipeline moves it into its gallery directory
    image_Path = scrapy.Field()
    # Image file name (currently unused)
   # nickname = scrapy.Field()

pipelines管道

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# 導入這個包為了移動文件
import shutil
#此包不解釋
import scrapy
# 導入項目設置
from scrapy.utils.project import get_project_settings
# 導入scrapy框架的圖片下載類
from scrapy.pipelines.images import ImagesPipeline
#此包不解釋
import os

class ImagesPipelinse(ImagesPipeline):
    """Image pipeline that sorts downloaded images into per-gallery directories.

    The stock ImagesPipeline first stores every image under
    ``IMAGES_STORE/full/``; this subclass then moves each file into
    ``IMAGES_STORE/<gallery title>/``.
    """
    # Root download directory, read from settings.py
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Issue the download request for the item's image.

        The Referer header is mandatory: the site rejects image requests
        that lack it (anti-hotlinking measure).
        """
        yield scrapy.Request(item["imge_url"], headers={'Referer': item['Referer']})

    def item_completed(self, result, item, info):
        """Move the downloaded image into a directory named after its gallery.

        ``result`` is a list of ``(success, info_dict)`` tuples; ``path`` in
        the info dict is relative to IMAGES_STORE, e.g. ``full/<hash>.jpg``.
        """
        image_paths = [x["path"] for ok, x in result if ok]
        if not image_paths:
            # Download failed -- nothing to move (original raised IndexError here).
            return item
        # Per-gallery target directory; os.path.join replaces the
        # hard-coded Windows '\\' separators of the original code.
        img_dir = os.path.join(self.IMAGES_STORE, item['title'])
        # makedirs(exist_ok=True) is race-free, unlike exists() + mkdir()
        os.makedirs(img_dir, exist_ok=True)
        # basename strips the 'full/' prefix reliably; the original
        # find("full\\") only worked by accident (-1 + 6 == 5 == len("full/")).
        file_name = os.path.basename(image_paths[0])
        dest = os.path.join(img_dir, file_name)
        shutil.move(os.path.join(self.IMAGES_STORE, image_paths[0]), dest)
        item['image_Path'] = dest
        return item

這里實現圖片保存到不同的目錄下,主要函數是shutil.move(),將圖片從原始默認路徑移動到指定目錄下


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM