Crawl all of the images, creating one folder per page. The tricky part: quite a few of the images are .gif files, so the download logic has to be overridden.
Create the Scrapy project
scrapy startproject qiumeimei
Create the crawl spider
cd qiumeimei
scrapy genspider -t crawl qmm www.xxx.com
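The -t crawl flag generates a CrawlSpider skeleton instead of a plain Spider. Depending on the Scrapy version, the generated qmm.py looks roughly like the sketch below; the allowed domain, allow pattern, and parse_item body are template placeholders that get replaced in the next steps:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class QmmSpider(CrawlSpider):
    name = 'qmm'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    rules = (
        # placeholder rule generated by the template
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item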
Define the download fields in items.py

import scrapy


class QiumeimeiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    page = scrapy.Field()
    image_url = scrapy.Field()
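As an optional sanity check (not part of the project files), a scrapy Item behaves like a dict, so it can be populated and inspected in a Python shell inside the project; the URL below is a made-up placeholder:

from qiumeimei.items import QiumeimeiItem

item = QiumeimeiItem()
item['page'] = '1'
item['image_url'] = 'http://www.qiumeimei.com/example.gif'  # hypothetical URL, for illustration only
print(dict(item))  # {'page': '1', 'image_url': 'http://www.qiumeimei.com/example.gif'}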
Write the main spider code in qmm.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from qiumeimei.items import QiumeimeiItem


class QmmSpider(CrawlSpider):
    name = 'qmm'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.qiumeimei.com/image']

    rules = (
        Rule(LinkExtractor(allow=r'http://www.qiumeimei.com/image/page/\d+'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Paginated URLs end in the page number; fall back to '1' otherwise.
        page = response.url.split('/')[-1]
        if not page.isdigit():
            page = '1'
        image_urls = response.xpath('//div[@class="main"]/p/img/@data-lazy-src').extract()
        for image_url in image_urls:
            item = QiumeimeiItem()
            item['image_url'] = image_url
            item['page'] = page
            yield item
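Before wiring up the pipeline, it can help to confirm the XPath in a scrapy shell session (assuming the site is reachable and the page structure has not changed):

scrapy shell "http://www.qiumeimei.com/image"
>>> response.xpath('//div[@class="main"]/p/img/@data-lazy-src').extract()[:3]
>>> # should print a list of image URLs, many of them ending in .gif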
Define the download rules in pipelines.py

import os

import scrapy
from scrapy.utils.misc import md5sum
# Scrapy's built-in image-handling pipeline
from scrapy.pipelines.images import ImagesPipeline
# Root directory for downloaded images, defined in settings.py
from qiumeimei.settings import IMAGES_STORE as images_store


# The pipeline must inherit from ImagesPipeline
class QiumeimeiPipeline(ImagesPipeline):
    # Keep the original file name instead of the default hashed name
    def file_path(self, request, response=None, info=None):
        file_name = request.url.split('/')[-1]
        return file_name

    # Override the parent method that issues the download requests
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['image_url'])

    # Called once the image has been stored: move it into a per-page folder
    def item_completed(self, results, item, info):
        page = item['page']
        print('Downloading images from page ' + page)
        image_url = item['image_url']
        image_name = image_url.split('/')[-1]
        old_name_list = [x['path'] for ok, x in results if ok]
        # Path where the image was actually stored
        old_name = os.path.join(images_store, old_name_list[0])
        image_path = os.path.join(images_store, page)
        # Create the per-page directory if it does not exist yet
        if not os.path.exists(image_path):
            os.mkdir(image_path)
        # Move (rename) the file into the per-page directory
        new_name = os.path.join(image_path, image_name)
        os.rename(old_name, new_name)
        return item

    # Override the download rule so animated GIFs are stored unmodified
    def image_downloaded(self, response, request, info):
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            if self.check_gif(image):
                # Persist the raw response body so the animation is preserved
                self.persist_gif(path, response.body, info)
            else:
                self.store.persist_file(
                    path, buf, info,
                    meta={'width': width, 'height': height},
                    headers={'Content-Type': 'image/jpeg'})
        return checksum

    def check_gif(self, image):
        # Images re-encoded by the pipeline lose their format attribute;
        # on this site that only happens to the GIFs.
        return image.format is None

    def persist_gif(self, key, data, info):
        absolute_path = self.store._get_filesystem_path(key)
        self.store._mkdir(os.path.dirname(absolute_path), info)
        with open(absolute_path, 'wb') as f:  # 'wb' because we are writing binary data
            f.write(data)
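The check_gif trick relies on a Pillow detail: an image opened straight from the downloaded bytes keeps its original format (e.g. 'GIF'), while the copy that ImagesPipeline produces when re-encoding to JPEG has format set to None, which is why writing response.body directly is what preserves the animation. A minimal sketch of that behaviour, assuming Pillow is installed and cat.gif is a hypothetical animated GIF on disk:

from io import BytesIO
from PIL import Image

with open('cat.gif', 'rb') as f:  # hypothetical local file
    raw = f.read()

original = Image.open(BytesIO(raw))
print(original.format)               # 'GIF' - the format survives when opened from the raw bytes

converted = original.convert('RGB')  # roughly what the pipeline does before saving as JPEG
print(converted.format)              # None - the converted copy loses the format (and the animation)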
In settings.py, set the request headers, the image storage path, and enable the download pipeline

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'

# Root folder for the downloaded images; pipelines.py imports this value,
# so it must be defined (the exact path is up to you).
IMAGES_STORE = './imgs/'

ITEM_PIPELINES = {
    'qiumeimei.pipelines.QiumeimeiPipeline': 300,
}

# If the project template enabled robots.txt handling, you may also need:
# ROBOTSTXT_OBEY = False
Run the spider (drop --nolog if you want to see the log output while debugging)
scrapy crawl qmm --nolog
Check the folders to confirm the downloads succeeded
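With the IMAGES_STORE value assumed above ('./imgs/'), the result should look roughly like this, one numbered folder per page (the file names here are just examples):

imgs/
    1/
        some_image.jpg
        some_animation.gif
    2/
        ...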
The .gif files are animated images.
Done.