scrapy框架爬取糗妹妹網站妹子圖分類的所有圖片


爬取所有圖片,每一個頁面的圖片建一個文件夾。難點:圖片中有不少 .gif 圖片,需要重寫下載規則。

創建scrapy項目

scrapy startproject qiumeimei

創建爬蟲應用

cd qiumeimei

scrapy genspider -t crawl qmm www.xxx.com

items.py文件中定義下載字段

import scrapy


class QiumeimeiItem(scrapy.Item):
    """One image to download: its source URL and the listing page it came from."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Page number of the listing page; the pipeline uses it as the folder name.
    page = scrapy.Field()
    # Absolute URL of the image file to fetch.
    image_url = scrapy.Field()

qmm.py文件中寫爬蟲主程序

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from qiumeimei.items import QiumeimeiItem

class QmmSpider(CrawlSpider):
    """Crawl the image section of qiumeimei.com and yield one item per image."""
    name = 'qmm'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.qiumeimei.com/image']

    # Follow every pagination link and hand each listing page to parse_item.
    rules = (
        Rule(LinkExtractor(allow=r'http://www.qiumeimei.com/image/page/\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract the lazy-loaded image URLs found on one listing page."""
        # The last URL segment is the page number; the first page has none.
        page_number = response.url.split('/')[-1]
        if not page_number.isdigit():
            page_number = '1'
        # Images are lazy-loaded, so the real URL lives in data-lazy-src.
        for url in response.xpath('//div[@class="main"]/p/img/@data-lazy-src').extract():
            entry = QiumeimeiItem()
            entry['image_url'] = url
            entry['page'] = page_number
            yield entry

pipelines.py文件中定義下載規則

import scrapy
import os
from scrapy.utils.misc import md5sum
# 導入scrapy 框架里的 管道文件的里的圖像 圖像處理的專用管道文件
from scrapy.pipelines.images import ImagesPipeline
# 導入圖片路徑名稱
from qiumeimei.settings import IMAGES_STORE as images_store
# 必須繼承 ImagesPipeline
class QiumeimeiPipeline(ImagesPipeline):
    """Image pipeline that sorts downloads into per-page folders and stores
    .gif files from their raw response bytes (ImagesPipeline would otherwise
    re-encode them as static JPEGs, losing the animation)."""

    def file_path(self, request, response=None, info=None):
        """Name each stored file after the last segment of its URL."""
        return request.url.split('/')[-1]

    def get_media_requests(self, item, info):
        """Schedule the download of the single image referenced by the item."""
        yield scrapy.Request(url=item['image_url'])

    def item_completed(self, results, item, info):
        """Move the downloaded file into a folder named after its page number."""
        page = item['page']
        print('正在下載第'+page+'頁圖片')
        # Only successful downloads carry a stored path; a failed download
        # used to crash here with IndexError on old_name_list[0].
        stored_paths = [x['path'] for ok, x in results if ok]
        if not stored_paths:
            return item
        image_name = item['image_url'].split('/')[-1]
        # os.path.join is safe whether or not IMAGES_STORE ends with a slash;
        # the original '+' concatenation broke when it did not.
        old_name = os.path.join(images_store, stored_paths[0])
        image_dir = os.path.join(images_store, page)
        # makedirs creates missing parents and tolerates reruns, unlike mkdir.
        os.makedirs(image_dir, exist_ok=True)
        os.rename(old_name, os.path.join(image_dir, image_name))
        return item

    def image_downloaded(self, response, request, info):
        """Persist each downloaded image; write GIFs raw, others as JPEG."""
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            if self.check_gif(image):
                # Save the untouched response body so animation survives.
                self.persist_gif(path, response.body, info)
            else:
                self.store.persist_file(
                    path, buf, info,
                    meta={'width': width, 'height': height},
                    headers={'Content-Type': 'image/jpeg'})
        return checksum

    def check_gif(self, image):
        """Return True when PIL reports no format for the image.

        NOTE(review): treats format-less images as GIFs — presumably the
        re-encoded frames from get_images() lose their format attribute;
        confirm against the PIL images this pipeline actually receives.
        """
        return image.format is None

    def persist_gif(self, key, data, info):
        """Write raw response bytes to the store path for *key*."""
        absolute_path = self.store._get_filesystem_path(key)
        self.store._mkdir(os.path.dirname(absolute_path), info)
        # 'with' closes the handle deterministically; the original leaked it.
        with open(absolute_path, 'wb') as f:
            f.write(data)

settings.py文件中定義請求頭和打開下載管道

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'

ITEM_PIPELINES = {
   'qiumeimei.pipelines.QiumeimeiPipeline': 300,
}

運行爬蟲

scrapy crawl qmm --nolog

查看文件夾是否下載成功

.gif為動態圖。

done。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM