Scrapy: download images to a specified directory, create thumbnails, and store the records in a database


Environment and tools: Python 2.7, Scrapy

Target site: http://www.XXXX.com/tag/333.html. Crawl all of the bunny-girl images under this tag; the recommended posts shown below each article need to be filtered out.

Workflow: analyze the site structure, enable ITEM_PIPELINES for image downloading and database storage, turn on the thumbnail configuration, then move the downloaded images into per-tag folders.
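
For orientation, this is roughly how the assumed project is laid out (the project name MyPicSpider comes from the imports below; the spider file name spider.py follows the section headers in this post):

MyPicSpider/
    scrapy.cfg
    MyPicSpider/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            spider.py      ## PicSpider, shown in the last section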

 -----settings.py

## Do not obey robots.txt
ROBOTSTXT_OBEY = False
## Download delay (seconds)
DOWNLOAD_DELAY = 3
## Disable cookies
COOKIES_ENABLED = False
## Enable the item pipelines
ITEM_PIPELINES = {
                    'MyPicSpider.pipelines.MyImagesPipeline': 300,
                    'MyPicSpider.pipelines.MysqlPipeline': 400
                  }
## Image storage path
IMAGES_STORE ='G:\\www\\scrapy_rpo\\pic\\meinv\\rabbit\\'
## Drop images smaller than this
IMAGES_MIN_HEIGHT = 110
IMAGES_MIN_WIDTH = 110
## Thumbnail sizes
IMAGES_THUMBS = {
    'big': (270, 270),
}
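
With these settings, Scrapy's built-in ImagesPipeline saves the original file under full/ and every size listed in IMAGES_THUMBS under thumbs/<size_name>/, both relative to IMAGES_STORE, and names each file after the SHA1 hash of its URL. So before the custom pipeline below moves anything, the disk layout looks like this (the hash is the example used further down):

G:\www\scrapy_rpo\pic\meinv\rabbit\
    full\80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg            ## original image
    thumbs\big\80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg      ## 270x270 thumbnail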

------items.py

import scrapy


class PicspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    tag = scrapy.Field()
    image_urls = scrapy.Field()
    images_data = scrapy.Field()
    img_path = scrapy.Field()
    img_big_path = scrapy.Field()
    file_path = scrapy.Field()
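
Note that ImagesPipeline reads the URLs from the image_urls field by default and would normally write its download results into an images field; both names can be changed in settings.py. Since item_completed is overridden in the pipeline below and stores the final paths in img_path, no images field is needed here.

## Optional settings (the values shown are the Scrapy defaults)
# IMAGES_URLS_FIELD = 'image_urls'
# IMAGES_RESULT_FIELD = 'images'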

----pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


import os, datetime, shutil
import scrapy
import pymysql
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
# Import project settings
from scrapy.utils.project import get_project_settings
#conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test', charset="UTF8")
#cursor = conn.cursor()
class MyImagesPipeline(ImagesPipeline):
    # Read the image download path from the project settings
    img_store = get_project_settings().get('IMAGES_STORE')
    def get_media_requests(self, item, info):
        '''There can be multiple image URLs per item'''
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x["path"] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        file_path = item['file_path']
        # Create the per-tag directory if it does not exist yet
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        print image_paths
        ## image_paths look like:  full/80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg
        pic_list = []
        for v in image_paths:
            pic_name = v.replace('full/','')
            pic_small_name =pic_name.replace('.jpg','')+'_s.jpg'
            pic_big_name = pic_name.replace('.jpg', '') + '_b.jpg'
            ## Build the destination file names
            # Move the files from the default download path into the per-tag directory
            # Move the full-size image
            shutil.move(self.img_store + 'full\\'+pic_name, file_path + "\\" + pic_name)
            # Move the thumbnail
            #shutil.move(self.img_store + 'thumbs\\small\\'+ pic_name, file_path + "\\" + pic_small_name)
            shutil.move(self.img_store + 'thumbs\\big\\' + pic_name, file_path + "\\" + pic_big_name)
            #img_path_dict['img_path'] = file_path + "\\" + pic_name
            #img_path_dict['img_small_path'] = file_path + "\\" + pic_small_name
            #img_path_dict['img_big_path'] = file_path + "\\" + pic_big_name
            ## Relative web paths (full image, big thumbnail) that will be written to the database
            img_path_pair = ('picture/meinv/rabbit/'+item['tag']+"/" + pic_name, 'picture/meinv/rabbit/'+item['tag']+"/" + pic_big_name)
            pic_list.append(img_path_pair)
        item["img_path"] = pic_list
        return item
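
An alternative to moving the files afterwards with shutil is to override file_path() so that ImagesPipeline saves each image straight into a per-tag folder. A minimal sketch, assuming Scrapy 1.x where file_path() does not receive the item, so the tag is carried in the request meta (PerTagImagesPipeline is a hypothetical name):

import hashlib
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.python import to_bytes

class PerTagImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            ## carry the tag along so file_path() can see it
            yield scrapy.Request(image_url, meta={'tag': item['tag']})

    def file_path(self, request, response=None, info=None):
        ## save as <IMAGES_STORE>/<tag>/<sha1>.jpg instead of full/<sha1>.jpg
        name = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return u'%s/%s.jpg' % (request.meta['tag'], name)

Thumbnails would still be written to thumbs/big/ unless thumb_path() is overridden in the same way.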

## Store the records in MySQL
class MysqlPipeline(object):
    def __init__(self):
        self.conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test1', charset="UTF8")
        # Create a cursor
        self.cursor = self.conn.cursor()
    def process_item(self, item, spider):
        ### Assemble the rows to insert
        rows = []
        datetime_now = datetime.datetime.now()
        datetime_str = '{0}-{1}-{2} {3}:{4}:{5}'.format(datetime_now.year, datetime_now.month, datetime_now.day, datetime_now.hour, datetime_now.minute, datetime_now.second)
        ## Insert the tag as a new type if it does not exist yet
        result = self.cursor.execute(u"select id from network_type where RESOURCETYPE ='p' and TYPENAME='{0}'".format(item['tag']))
        if result == 0:
            self.cursor.execute("insert into network_type(PID,RESOURCETYPE,TYPENAME)values(%s,%s,%s) ", (2415, 'p', item['tag']))
            typeid = self.cursor.lastrowid
            self.conn.commit()
        else:
            #tag_id = self.cursor.fetchall()
            #typeid = tag_id[0][0]
            ## The tag already exists, so these images were stored before; skip the item
            raise DropItem(u'Tag already stored, skipping item')

        types = ','+str(typeid)+','
        ## Work out the next numeric ID to use
        self.cursor.execute('select  id from network_picture order by cast(id as SIGNED INTEGER) desc limit 0,1')
        old_id = self.cursor.fetchone()
        if old_id:
            id_n = str(int(old_id[0]) + 1)
        else:
            id_n = str(1)
        for v in item['img_path']:
            path1 = v[0]
            path2 = v[1]
            self.cursor.execute(u'select  id from network_picture where FILEPATH="{0}" and fileScalPath="{1}"'.format(path1, path2))
            data = self.cursor.fetchone()
            if data:
                print u'This record already exists'
            else:
                a = (str(id_n), '', path1, '', types, 0, datetime_str, path2)
                rows.append(a)
                id_n = int(id_n) + 1
        print rows
        self.cursor.executemany("insert into network_picture(ID,NAME,FILEPATH,FILESIZE,TYPES,STATUS,DATETIME,fileScalPath)values(%s,%s,%s,%s,%s,%s,%s,%s)", rows)
        self.conn.commit()
        return item
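
The pipeline assumes two MySQL tables already exist. A rough sketch of the schema implied by the queries above; the column names and order come from the code, the column types are guesses:

CREATE TABLE network_type (
    ID           INT AUTO_INCREMENT PRIMARY KEY,
    PID          INT,               -- parent type id (2415 above)
    RESOURCETYPE VARCHAR(10),       -- 'p' for pictures
    TYPENAME     VARCHAR(255)       -- the tag name
);

CREATE TABLE network_picture (
    ID           VARCHAR(20),       -- stored as text, sorted with CAST(id AS SIGNED)
    NAME         VARCHAR(255),
    FILEPATH     VARCHAR(255),      -- relative path of the full image
    FILESIZE     VARCHAR(20),
    TYPES        VARCHAR(50),       -- ',<typeid>,'
    STATUS       INT,
    DATETIME     DATETIME,
    fileScalPath VARCHAR(255)       -- relative path of the big thumbnail
);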

----spider.py

# -*- coding: utf-8 -*-
import scrapy,os,urllib2
from scrapy.linkextractors import LinkExtractor   ## LinkExtractor filters and follows links; it has many more options, see the Scrapy docs
from scrapy.spiders import CrawlSpider, Rule     ## CrawlSpider template and the Rule class
from MyPicSpider.items import PicspiderItem      ## The item defined in items.py
# Import project settings
from scrapy.utils.project import get_project_settings
from bs4 import BeautifulSoup
import time,pymysql
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test1', charset="UTF8")
# Create a cursor
cursor = conn.cursor()
class PicSpider(CrawlSpider):    ## Inherits the CrawlSpider template; a plain spider would inherit from Spider
    name = 'pic'     ### Spider name; run with: $ scrapy crawl pic
    allowed_domains = ['www.xxxx.com']    ## Domains the spider is allowed to crawl
    start_urls = ['http://www.xxxx.com/tag/333.html']   ### Start URL
    #### When follow=True the spider keeps following pages matched by the rule
    #### It scans the start page for detail-page URLs and for the next pagination link, follows that pagination page and repeats; detail pages are passed to the callback
    rules = (
        ### Crawl the index pages and follow the links inside them
        ### Matches every pagination page under start_urls
        Rule(LinkExtractor(allow=r'/tag/[0-9]*_[0-9]*.html'),follow=True),
        ### Crawl the detail pages and pass the downloaded response to parse_item
        #### Matches the detail pages linked from each pagination page
        Rule(LinkExtractor(allow=r'http://www.xxxx.com/ent/[a-z]*/[0-9]*/[0-9]*.html'), callback='parse_item', follow=False),
    )
    #### Callback for detail pages
    def parse_item(self,response):
        start_url = response.url
        item = PicspiderItem()
        tag_name = response.xpath('//h1[@class="articleV4Tit"]/text()').extract()[0]
        # cursor.execute(u'select id from network_type  where PID=258 AND TYPENAME="{0}" limit 0,1'.format(tag_name))
        # old_id = cursor.fetchone()
        # if old_id:
        #     exit()
        ## Only keep posts whose title contains the keyword (rabbit)
        name = u'兔'
        if name not in tag_name:
            print u'----This is another category----'
            return
        li_list =  response.xpath('//ul[@class="articleV4Page l"]/li').extract()
        srcs = []
        ## Skip the trailing <li> entries (pager controls), visiting only the actual picture pages
        for v in range(1, (len(li_list) - 3)):
            if v == 1:
                url_s = start_url
            else:
                url_s = start_url.replace('.html', '') + '_' + str(v) + '.html'
            try:
                request = urllib2.Request(url_s, headers=headers)
                response = urllib2.urlopen(request, timeout=200).read()
            except urllib2.URLError, err:
                print err, 'bad url: ' + url_s
                continue
            obj = BeautifulSoup(response, 'html.parser')
            try:
                pic_url = obj.find('center').find('img')['src']
            except:
                print u'----First extraction method failed----'
                try:
                    pic_url = obj.find('div', {'id': 'picBody'}).find('img')['src']
                except:
                    print u'----Second extraction method failed----'
                    try:
                        pic_url = obj.find('p', attrs={"style": "text-align: center"}).find('img')['src']
                    except:
                        print u'----Third extraction method failed----'
                        continue
            srcs.append(pic_url)
        item['tag'] = tag_name
        item['file_path'] = '%s%s' %(get_project_settings().get('IMAGES_STORE'),tag_name)
        item['image_urls'] = srcs
        return item
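
To run the spider from the project root:

scrapy crawl pic

Each matched detail page yields one PicspiderItem; MyImagesPipeline downloads the images and moves them into the per-tag folder, then MysqlPipeline writes the records.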

------I am still not very familiar with Scrapy's request deduplication; if anyone knows more about it, please let me know, thanks.
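
In brief, Scrapy's default duplicate filter (RFPDupeFilter) drops any request whose fingerprint (method, URL and body) has already been seen during the current run; two related knobs, shown as fragments:

## inside a spider callback: re-request a URL even if its fingerprint was already seen
#     yield scrapy.Request(url, callback=self.parse_item, dont_filter=True)
## on the command line: persist the request queue and seen fingerprints across runs
#     scrapy crawl pic -s JOBDIR=crawls/pic-1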

