Use Scrapy's built-in image downloading support. The code is pasted below; the explanations are in the code comments.
items.py
import scrapy

class ImageItem(scrapy.Item):
    # Note: the item used here is ImageItem
    image_urls = scrapy.Field()
    images = scrapy.Field()

    # image_urls and images are the field names the ImagesPipeline expects; do not rename them
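For reference, after a successful crawl the built-in pipeline fills the images field with metadata about each stored file. The sketch below is only illustrative (the URL, path, and checksum values are made up, and newer Scrapy versions may add extra keys):

# Illustrative sketch of item['images'] after the pipeline runs (made-up values)
[
    {
        'url': 'http://www.sohu.com/some_image.jpg',           # original image URL
        'path': 'full/0a79c461a4062ac383dc4fade7bc09f1.jpg',    # path relative to IMAGES_STORE
        'checksum': 'b9628c4ab9b595f72f280b90c4fd093d',         # checksum of the downloaded data
    },
]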
settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
# The line above is just a request header, added as extra insurance against being refused by the site

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1
}
# Enable the images pipeline

IMAGES_STORE = 'F:\\pics'
# You must set this storage path, and it has to be a real disk location.
# There is no need to create the pics folder yourself; it is generated automatically,
# along with a default "full" subfolder.
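If you need more control, the images pipeline also supports a few optional settings. None of these are required for this tutorial; the numbers below are just example values:

# Optional tuning for the images pipeline (example values only)
IMAGES_EXPIRES = 90          # skip re-downloading images fetched within the last 90 days
IMAGES_MIN_HEIGHT = 100      # drop images shorter than 100 px
IMAGES_MIN_WIDTH = 100       # drop images narrower than 100 px
IMAGES_THUMBS = {            # also generate thumbnails under thumbs/small and thumbs/big
    'small': (50, 50),
    'big': (270, 270),
}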
spider.py (the spider here is named carhome)
import scrapy

from car.items import ImageItem

class CarhomeSpider(scrapy.Spider):
    name = 'carhome'
    allowed_domains = ['sohu.com']
    start_urls = ['http://www.sohu.com/a/337634404_100032610']
    download_delay = 1

    def parse(self, response):
        item = ImageItem()
        srcs = response.css('.article img::attr(src)').extract()  # all image URLs found with the CSS selector
        item['image_urls'] = srcs
        yield item
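One common pitfall: the pipeline needs absolute URLs, so if a page returns relative src values (or protocol-relative ones like //img.example.com/a.jpg), it helps to normalize them first. A minimal sketch of a drop-in replacement for the parse method above, using the same selector:

    def parse(self, response):
        item = ImageItem()
        srcs = response.css('.article img::attr(src)').extract()
        # response.urljoin turns relative or protocol-relative URLs into absolute ones
        item['image_urls'] = [response.urljoin(src) for src in srcs]
        yield item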
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

# get_media_requests and item_completed are both built-in ImagesPipeline methods;
# override them here if you want to customize things such as how files are named.
# You can copy this code and use it directly.
class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Item contains no images')
        item['image_paths'] = image_paths
        return item
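Note that the ITEM_PIPELINES setting above registers the built-in ImagesPipeline, so this custom class is not actually invoked as written. If you want MyImagesPipeline to run instead, a sketch of the two changes needed (assuming the project module is named car, as in the imports above):

# settings.py: point ITEM_PIPELINES at the custom class instead of the built-in one
ITEM_PIPELINES = {
    'car.pipelines.MyImagesPipeline': 1
}

# items.py: declare a field for the paths that item_completed stores,
# otherwise item['image_paths'] = ... raises a KeyError on a scrapy.Item
class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
    image_paths = scrapy.Field()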
That's all the code. Now run it:
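From the project's root directory (the one containing scrapy.cfg), the crawl is started with the spider name defined above:

scrapy crawl carhome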
Folder result:
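If everything works, the downloaded images land under the IMAGES_STORE path, inside the automatically created full folder; the file names are hashes derived from the image URLs, so the layout looks roughly like this (file names below are made up):

F:\pics
└── full
    ├── 0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg
    ├── 2f1d3c9a8d0b6c5e4f7a1b2c3d4e5f6a7b8c9d0e.jpg
    └── ...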
If you haven't got it working yet, go give it a try!