“python爬蟲系列”目錄:
scrapy爬取二級頁面的內容
1.定義數據結構items.py文件
# -*- coding: utf-8 -*-
'''
file: items.py
'''
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class TupianprojectItem(scrapy.Item):
    """Container for one photo's scraped metadata.

    See https://doc.scrapy.org/en/latest/topics/items.html for the Field API.
    """

    # Photo title
    title = scrapy.Field()
    # Publication time
    publish_time = scrapy.Field()
    # View count
    look = scrapy.Field()
    # Favorite (collect) count
    collect = scrapy.Field()
    # Download count
    download = scrapy.Field()
    # Image URL
    image_url = scrapy.Field()
2.爬蟲文件
# -*- coding: utf-8 -*-
import scrapy
from tupianproject.items import TupianprojectItem
class ImageSpider(scrapy.Spider):
    """Crawl the 'people' listing pages on 699pic.com and scrape every
    photo's detail page into a TupianprojectItem."""

    name = 'image'
    allowed_domains = ['699pic.com']
    start_urls = ['http://699pic.com/people-1-0-0-0-0-0-0.html']
    # Template for subsequent listing pages; {} is the page number.
    url = 'http://699pic.com/people-{}-0-0-0-0-0-0.html'
    page = 1

    def parse(self, response):
        """Parse one listing page: request every photo detail page on it,
        then request the next listing page (up to page 3)."""
        # Collect the links to all photo detail pages on this listing page.
        image_detail_url_list = response.xpath('//div[@class="list"]/a/@href').extract()
        for image_detail_url in image_detail_url_list:
            # Fix: urljoin makes the request work for relative hrefs
            # as well as absolute ones.
            yield scrapy.Request(url=response.urljoin(image_detail_url),
                                 callback=self.parse_detail)
        # Queue the next listing page until the page cap is reached.
        if self.page <= 3:
            self.page += 1
            yield scrapy.Request(url=self.url.format(self.page), callback=self.parse)

    def parse_detail(self, response):
        """Extract one photo's metadata from its detail page and yield the item."""
        item = TupianprojectItem()
        # Photo title.
        item['title'] = response.xpath(
            '//div[@class="photo-view"]/h1/text()').extract_first()
        # Publication time: string(.) flattens the span's text nodes.
        item['publish_time'] = response.xpath(
            '//div[@class="photo-view"]/div/span[@class="publicityt"]'
        )[0].xpath('string(.)').extract_first()
        # View count.
        item['look'] = response.xpath(
            '//div[@class="photo-view"]/div/span[@class="look"]/read/text()'
        ).extract_first()
        # Favorite (collect) count.
        item['collect'] = response.xpath(
            '//div[@class="photo-view"]/div/span[@class="collect"]'
        )[0].xpath('string(.)').extract_first()
        # Download count; strip the surrounding newline/tab whitespace.
        item['download'] = response.xpath(
            '//div[@class="photo-view"]/div/span[@class="download"]'
        )[0].xpath('string(.)').extract_first().strip('\n\t')
        # Image URL.
        item['image_url'] = response.xpath('//div[@class="huabu"]//img/@src').extract_first()
        yield item
3.管道文件
# -*- coding: utf-8 -*-
'''
file: pipelines.py
'''
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import urllib.request
import os
class TupianprojectPipeline(object):
    """Persist scraped items as JSON lines and download each item's image."""

    def open_spider(self, spider):
        # One output file for the whole crawl; closed in close_spider.
        self.fp = open('tupian.json', 'w', encoding='utf8')

    def process_item(self, item, spider):
        """Write the item's metadata as one JSON line, then fetch its image."""
        # ensure_ascii=False keeps the Chinese titles human-readable.
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        self.download(item)
        return item

    @staticmethod
    def _make_filepath(item):
        """Build the local save path from the item's title and the URL's extension."""
        # Fix: drop any query string before taking the extension, so
        # 'x.png?v=1' yields 'png' rather than 'png?v=1'.
        suffix = item['image_url'].split('?')[0].split('.')[-1]
        return os.path.join('./people', item['title'] + '.' + suffix)

    def download(self, item):
        """Download the item's image into ./people/<title>.<ext>."""
        filepath = self._make_filepath(item)
        # Fix: create the target directory if missing — urlretrieve does not,
        # so the original crashed on a fresh checkout.
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        urllib.request.urlretrieve(item['image_url'], filepath)

    def close_spider(self, spider):
        self.fp.close()