下載 百度貼吧-動漫壁紙吧 所有圖片
定義item
Spider
spider 只需要得到圖片的url,必須以列表的形式給管道處理
class PictureSpiderSpider(scrapy.Spider):
    """Collect image URLs from every thread of a Baidu Tieba forum.

    ``parse`` walks the forum's thread-list pages and schedules a request
    for each thread; ``parse_theme`` walks the pages of one thread and
    yields one item per page.  The item's ``pic_urls`` field is always a
    list, because the image pipeline iterates over it.
    """

    name = 'picture_spider'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/f?kw=%E5%8A%A8%E6%BC%AB%E5%A3%81%E7%BA%B8']

    def parse(self, response):
        """Yield a request per thread on a list page, then one for the next list page.

        Args:
            response: Response for one page of the forum's thread list.

        Yields:
            ``scrapy.Request`` objects: one per thread (handled by
            ``parse_theme``) and, when a next-page link is found, one for
            the next list page (handled by this method again).
        """
        # Each match is a (thread id, thread title) pair; only the id is used.
        thread_matches = re.findall(
            r'<a rel="noreferrer" href="/p/(\d+)" title="(.*?)" target="_blank" class="j_th_tit ">',
            response.text, re.S)
        for thread_id, _title in thread_matches:
            # Enter the thread.
            yield scrapy.Request(url='https://tieba.baidu.com/p/' + thread_id,
                                 callback=self.parse_theme)

        # Offset ("pn") of the next list page.  The link text is matched in
        # both Traditional and Simplified Chinese, and with the trailing ">"
        # either literal or HTML-escaped, so the pattern still matches what
        # the original matched.
        # NOTE(review): the live markup presumably uses the Simplified,
        # escaped form -- confirm against a real response.
        next_offsets = re.findall(
            r'<a href="//tieba.baidu.com/f\?kw=%E5%8A%A8%E6%BC%AB%E5%A3%81%E7%BA%B8&ie=utf-8&pn=(\d+)" class="next pagination-item " >下一[頁页](?:>|&gt;)</a>',
            response.text, re.S)
        if next_offsets:
            # No callback given, so Scrapy routes the response back to parse().
            yield scrapy.Request(url=self.start_urls[0] + '&pn=' + next_offsets[0])

    def parse_theme(self, response):
        """Yield the image URLs of one thread page, then follow the thread's next page.

        Args:
            response: Response for one page of a thread.

        Yields:
            A ``PostBarItem`` whose ``pic_urls`` is the list of image URLs
            built from this page, followed by a ``scrapy.Request`` for the
            thread's next page when such a link exists.
        """
        item = PostBarItem()
        # URLs of the thumbnails embedded in the posts of this page.
        thumbnail_urls = response.xpath('//img[@class="BDE_Image"]/@src').extract()
        # The pipeline expects a list of URLs.
        item['pic_urls'] = []
        for thumbnail_url in thumbnail_urls:
            # The picture's file name is the last path segment of the
            # thumbnail URL.  pic_name is overwritten on every pass, so the
            # yielded item keeps only the last picture's name.
            item['pic_name'] = thumbnail_url.split('/')[-1]
            # Download URL for the picture (presumably the full-size image).
            item['pic_urls'].append(
                'http://imgsrc.baidu.com/forum/pic/item/' + item['pic_name'])
        # Hand the item to the pipelines for downloading.
        yield item

        # After this page, continue with the thread's next page.  The link
        # text is accepted in both Traditional and Simplified Chinese.
        next_href = response.xpath(
            '//a[contains(text(),"下一頁") or contains(text(),"下一页")]/@href'
        ).extract_first()
        if next_href:
            # urljoin resolves relative hrefs and leaves absolute ones intact.
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse_theme)
ImagesPipeline
- from scrapy.pipelines.images import ImagesPipeline
- 繼承ImagesPipeline,重寫get_media_requests()和file_path()方法
from scrapy.pipelines.images import ImagesPipeline
import scrapy
class PostBarPipeline(ImagesPipeline):
    """Image pipeline that downloads every URL listed in ``item['pic_urls']``.

    Downloaded files keep the file name found in their URL; without the
    ``file_path`` override the stored name would be a hash.
    """

    # Request headers for sites that require them; not used by default.
    headers = {
        'User-Agent': '',
        'Referer': '',
    }

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['pic_urls']``.

        Args:
            item: Scraped item; its ``pic_urls`` field is iterated.
            info: Pipeline state supplied by Scrapy (unused here).

        Yields:
            A ``scrapy.Request`` for each picture URL.
        """
        for pic_url in item['pic_urls']:
            # For sites that need request headers, pass
            # headers=self.headers to the Request as well.
            yield scrapy.Request(pic_url)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (file name with extension) for a picture.

        Args:
            request: The download request for the picture.
            response: Unused; accepted for interface compatibility.
            info: Unused; accepted for interface compatibility.
            item: Unused; keyword-only argument that newer Scrapy versions
                pass to this method.

        Returns:
            The last path segment of the request URL, with any query string
            or fragment removed so the result is a usable file name.
        """
        from urllib.parse import urlsplit

        return urlsplit(request.url).path.split('/')[-1]
settings文件
- 激活管道:在 ITEM_PIPELINES 中啟用 PostBarPipeline
- 設置圖片保存地址:透過 IMAGES_STORE 指定圖片的保存目錄