開發環境:
- Python:3.5
- Scrapy:1.5.1
- scrapy-djangoitem:1.1.1
- Django:2.1.4
以虎嗅網人工智能板塊下《神經網絡生成極慢視頻,從此不再錯過任何細節》一文的縮略圖為例。圖片點這里
通過重寫file_path方法,可以將文件後綴更改為gif。
def file_path(self, request, response=None, info=None):
    """Return the relative storage path for one article image, with a ``.gif`` suffix.

    The path has the form ``huxiu_article/<article_id>/image<index>.gif``.

    :param request: the image request; its ``meta`` must carry ``'item'``
        and ``'index'`` (set when the request is created).
    :param response: unused here; accepted for pipeline compatibility.
    :param info: unused here; accepted for pipeline compatibility.
    :returns: the relative path string described above.
    :raises KeyError: if ``'item'``/``'index'`` are missing from
        ``request.meta`` or the item has no ``'article_id'``.
    """
    item = request.meta['item']
    index = request.meta['index']
    # The original snippet referenced ``article_id`` without ever defining it
    # (NameError at runtime) while leaving ``item`` unused.
    # NOTE(review): assumes the item exposes the id under 'article_id' --
    # confirm against the item definition.
    article_id = item['article_id']
    filename = u'huxiu_article/{0}/{1}'.format(
        article_id, 'image' + str(index) + "." + "gif")
    return filename
但是這樣保存下來的圖片依然是靜態的。如下圖所示:
通過觀察scrapy/pipelines/images.py文件(即scrapy.pipelines.images模塊)中的ImagesPipeline類,發現如下代碼,其默認格式為jpeg。
def image_downloaded(self, response, request, info):
    """Persist every image produced for *response* and return a checksum.

    Quoted from Scrapy's ``ImagesPipeline``; kept verbatim apart from
    restored formatting and comments.

    :returns: the md5 checksum of the first buffer yielded by
        ``get_images``, or ``None`` if nothing was yielded.
    """
    checksum = None
    for path, image, buf in self.get_images(response, request, info):
        if checksum is None:
            # Only the first yielded buffer contributes to the checksum.
            buf.seek(0)
            checksum = md5sum(buf)
        width, height = image.size
        # Every buffer is stored with a hard-coded JPEG content type.
        self.store.persist_file(
            path, buf, info,
            meta={'width': width, 'height': height},
            headers={'Content-Type': 'image/jpeg'})
    return checksum
所以如果我們想要下載GIF圖片,則需要在繼承ImagesPipeline類后,對image_downloaded方法進行重寫。
def check_gif(self, image):
    """Return ``True`` when *image* should be stored as the raw download.

    The check is ``image.format is None``. It now returns an explicit
    ``False`` instead of falling through to an implicit ``None``.

    NOTE(review): this is a heuristic. The article observes that GIF
    downloads arrive here with ``format`` set to ``None``; other images
    may do so as well -- confirm before relying on it for mixed content.
    """
    return image.format is None


def persist_gif(self, key, data, info):
    """Write the raw bytes *data* to the store location for *key*.

    :param key: relative path of the file inside the store.
    :param data: raw bytes to write (the undecoded response body).
    :param info: passed through to the store's ``_mkdir`` helper.
    """
    absolute_path = self.store._get_filesystem_path(key)
    self.store._mkdir(os.path.dirname(absolute_path), info)
    # The context manager closes the handle even if the write fails;
    # the original left the file open.
    with open(absolute_path, 'wb') as f:  # 'b': binary data
        f.write(data)


def image_downloaded(self, response, request, info):
    """Persist each image for *response*, keeping GIFs as raw bytes.

    Images flagged by ``check_gif`` are written from ``response.body``
    via ``persist_gif``, so the original bytes are stored unmodified.
    Everything else goes through ``self.store.persist_file`` with a JPEG
    content type.

    :returns: the md5 checksum of the first buffer yielded by
        ``get_images``, or ``None`` if nothing was yielded.
    """
    checksum = None
    for path, image, buf in self.get_images(response, request, info):
        if checksum is None:
            buf.seek(0)
            checksum = md5sum(buf)
        width, height = image.size
        if self.check_gif(image):
            # Store the raw download rather than the re-encoded buffer.
            self.persist_gif(path, response.body, info)
        else:
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
    return checksum
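這裡需要注意的是check_gif方法,如果圖片是jpg/jpeg格式的,那麼在debug模式下可以看到format這裡是正常的。
但是圖片是GIF格式時,format就為None了,而不是我以為的'GIF'。原因應該是get_images返回的是經過轉換後的圖片對象:Pillow對於由庫自身方法(例如convert)生成的圖片,會將format屬性設為None。因此其他經過轉換的圖片也可能出現format為None的情況,這個判斷並不只對GIF成立。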
再次運行程序,可以看到這次不再是靜態的圖片了。
完整代碼如下:
import os

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum


class ImagePipeline(ImagesPipeline):
    """Images pipeline that stores GIF downloads as raw bytes.

    Images flagged by ``check_gif`` are written from the unmodified
    response body, so the original bytes are kept. All other images are
    stored through ``self.store.persist_file`` with a JPEG content type.
    """

    def file_path(self, request, response=None, info=None):
        """Return the relative path under which the image is stored."""
        # Define the file name format here.
        # NOTE(review): this placeholder is a constant, so every image maps
        # to the same path; build it from request.meta in real use.
        filename = 'my file'
        return filename

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``.

        The item and the URL's position are attached to ``meta`` so that
        later pipeline steps can read them back from the request.
        """
        for index, img_url in enumerate(item['image_urls']):
            yield Request(img_url, meta={'item': item, 'index': index})

    def check_gif(self, image):
        """Return ``True`` when *image* should be stored as the raw download.

        The check is ``image.format is None``. It now returns an explicit
        ``False`` instead of falling through to an implicit ``None``.

        NOTE(review): this is a heuristic; images other than GIFs may also
        arrive with ``format`` set to ``None`` -- confirm for mixed content.
        """
        return image.format is None

    def persist_gif(self, key, data, info):
        """Write the raw bytes *data* to the store location for *key*."""
        absolute_path = self.store._get_filesystem_path(key)
        self.store._mkdir(os.path.dirname(absolute_path), info)
        # The context manager closes the handle even if the write fails;
        # the original left the file open.
        with open(absolute_path, 'wb') as f:  # 'b': binary data
            f.write(data)

    def image_downloaded(self, response, request, info):
        """Persist each image for *response* and return a checksum.

        :returns: the md5 checksum of the first buffer yielded by
            ``get_images``, or ``None`` if nothing was yielded.
        """
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            if self.check_gif(image):
                # Store the raw download rather than the re-encoded buffer.
                self.persist_gif(path, response.body, info)
            else:
                self.store.persist_file(
                    path, buf, info,
                    meta={'width': width, 'height': height},
                    headers={'Content-Type': 'image/jpeg'})
        return checksum