scrapy 圖片爬取 多層多頁 保存不同的文件夾 重命名full文件夾


記錄下整個爬蟲代碼,我已經把實驗網站爬完了。

items.py

1 import scrapy
2 
3 
class DemoItem(scrapy.Item):
    # Item fields for the image crawler.
    # folder_name: gallery/column title taken from the listing page; used as
    # the per-gallery destination folder name, replacing Scrapy's default
    # "full" folder.
    folder_name = scrapy.Field()
    # img_name = scrapy.Field()  # image name from the page; unused because the site provides none
    # img_url: image URL(s) for the images pipeline (stored as a list).
    img_url = scrapy.Field()

spider.py

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 from demo.items import DemoItem
 4 
 5 
 6 class LogosSpider(scrapy.Spider):
 7     name = 'logos'
 8     allowed_domains = ['tttt8.net']
 9     #start_urls = ['http://www.tttt8.net/category/legbaby/']
10     #start_urls = ['http://www.tttt8.net/category/ugirls/']
11     #start_urls = ['http://www.tttt8.net/category/kelagirls/']
12     start_urls = ['http://www.tttt8.net/category/xiurenwang/micatruisg/']
13     #這里的page是用來構造一級頁面翻頁鏈接
14     page = 1
15 
16     def parse(self, response):
17         #提取所有專欄的列表
18         li_list = response.xpath('//*[@id="post_container"]/li')
19         for li in li_list:
20             #實例化
21             item = DemoItem()
22             #這里只提取專欄的名字,用於后面儲存文件夾的命名
23             item['folder_name'] = li.xpath('./div[2]/h2/a/text()').extract_first()
24             #提取二級頁面的鏈接,准備給二級頁面函數去工作
25             next_plink = li.xpath('./div[1]/a/@href').extract_first()
26             #item把一級頁面實例化的內容傳送給二級頁面接收
27             yield scrapy.Request(url = next_plink, callback = self.parse2, meta = {'item':item})
28         # 一級頁面的翻頁列表/ 自行查找別的方法翻頁,網上好多,我覺得這個構造更簡單
29         page_list = response.xpath('//div[@class="pagination"]/a/@href').extract()
30         #找到最后一頁的頁碼
31         last_page = page_list[-1]
32         #提取出最大的頁碼
33         max_num = int(last_page[-2])
34         #構造翻頁的頁碼鏈接
35         if self.page <= max_num:
36             self.page += 1
37             new_page_url = self.start_urls[0] + 'page/' + str(self.page) + '/'
38             yield scrapy.Request(url = new_page_url, callback = self.parse)
39 
40     def parse2(self, response):
41         #接收一級頁面的內容,和翻頁請求的銜接,不然翻頁就丟失了,會報錯
42         item = response.meta['item']
43         p_list = response.xpath('//*[@id="post_content"]/p/img')
44         #正常的提取圖片的鏈接給pepeline去下載
45         for img in p_list:
46             img_url = img.xpath('./@src').extract_first()
47             # 這里必須加中括號[],圖片下載的函數,要求是list類型
48             item['img_url'] = [img_url]
49             yield item
50         # 二級頁面的翻頁列表,然后在后面yield請求
51         next_page_list = response.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/a/@href').extract()
52         for next_page in next_page_list:
53             #這里一定要加meta,不然二級的翻頁就error,這里我搞了好久才發現。
54             yield scrapy.Request(url = next_page, callback = self.parse2, meta = {'item':item})

settings.py

 1 # -*- coding: utf-8 -*-
 2 
 3 
 4 BOT_NAME = 'demo'
 5 SPIDER_MODULES = ['demo.spiders']
 6 NEWSPIDER_MODULE = 'demo.spiders'
 7 
 8 #存儲路徑和header
 9 IMAGES_STORE = 'D:\pics'
10 USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
11 
12 DOWNLOAD_DELAY = 0.2
13 #機器人關掉
14 ROBOTSTXT_OBEY = False
15 
16 ITEM_PIPELINES = {
17     'demo.pipelines.DemoPipeline': 300,
18 }
19 #下載指定的field
20 IMAGES_URLS_FIELD = 'img_url'

pipelines.py

 1 import scrapy
 2 from scrapy.exceptions import DropItem
 3 from scrapy.pipelines.images import ImagesPipeline
 4 
 5 
 6 class DemoPipeline(ImagesPipeline):
 7     # 固定的改寫的函數,不需要修改
 8     def get_media_requests(self, item, info):
 9         for img_url in item['img_url']:
10             # 下載完后給別的函數去改名字,所以用meta傳下去
11             yield scrapy.Request(img_url, meta = {'item':item})
12 
13     def file_path(self, request, response = None, info = None):
14         item = request.meta['item']
15         folder_name = item['folder_name']
16         # img_name = item['img_name']  #圖片沒有名字不啟用這個語句
17         # 因為圖片沒有名字就用url截取最后的字符串作為名字
18         image_guid = request.url.split('/')[-1]
19         img_name = image_guid
20         # name = img_name + image_guid
21         # name = name + '.jpg'
22         # 0 代表文件夾,1 代表文件
23         filename = u'{0}/{1}'.format(folder_name, img_name)
24         return filename
25 
26     # 固定改寫的函數,不需要修改
27     def item_completed(self, results, item, info):
28         image_paths = [x['path'] for ok, x in results if ok]
29         if not image_paths:
30             raise DropItem('Image Downloaded Failed')
31         return item

結果:

 

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM