Scraping SeHuaTang Magnet Links and Images with Scrapy
This post walks through the whole crawl, with the focus on downloading the images.
1. Create the project
scrapy startproject SeHuaTang
scrapy genspider sehuatang rtretyrytre.xyz
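After these two commands the generated project should look roughly like this (the standard Scrapy layout; sehuatang.py is the spider created by genspider):

SeHuaTang/
├── scrapy.cfg
└── SeHuaTang/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── sehuatang.py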
2. Modify settings.py
# Control the log level
LOG_LEVEL = "WARNING"
# Base URL of the site to crawl
BASE_URL = "https://rtretyrytre.xyz/"
# Do not obey the robots.txt protocol
ROBOTSTXT_OBEY = False
# Where downloaded images are stored
import os
# # Alternatively, store the images inside the project directory:
# project_dir = os.path.abspath(os.path.dirname(__file__))  # absolute path of the project
# IMAGES_STORE = os.path.join(project_dir, 'images')        # images/ subdirectory
IMAGES_STORE = 'D:/ImageSpider/'
# Enable the item pipeline
ITEM_PIPELINES = {
    'SeHuaTang.pipelines.SeHuaTangPipeline': 300,
}
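One thing settings.py does not show: Scrapy's ImagesPipeline needs Pillow to process downloaded images, and the pipeline in step 5 writes to MongoDB through pymongo, so both have to be installed (assuming a standard pip environment):

pip install Pillow pymongo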
3. Modify items.py
import scrapy


class SeHuaTangItem(scrapy.Item):
    # Video title
    common = scrapy.Field()
    # View count
    num = scrapy.Field()
    # Detail-page URL
    url = scrapy.Field()
    # Magnet link
    cili = scrapy.Field()
    # Image URLs
    picture = scrapy.Field()
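A scrapy.Item behaves like a dict, which is how the spider below fills it in and how the pipeline later converts it with dict(item); a quick illustration with a made-up value:

from SeHuaTang.items import SeHuaTangItem

item = SeHuaTangItem()
item["common"] = "example title"   # hypothetical value, just for illustration
print(dict(item))                  # {'common': 'example title'}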
4. Modify the spider file spiders/sehuatang.py
import re

import scrapy

from SeHuaTang.settings import BASE_URL
from SeHuaTang.items import SeHuaTangItem


class SeHuaTangSpider(scrapy.Spider):
    name = 'sehuatang'  # spider name
    # allowed_domains = ['rtretyrytre.xyz']  # optional: restrict the crawl scope
    # Base URL, kept configurable in settings.py
    base_url = BASE_URL
    # First URL requested (BASE_URL already ends with '/')
    start_urls = [base_url + 'forum-2-2.html']

    def parse(self, response):
        # Rows of the thread list, skipping the header and footer rows
        tr_list = response.xpath('//table//tr')[5:-2]
        for tr in tr_list:
            item = SeHuaTangItem()
            # Video title
            item["common"] = tr.xpath('./th/a[@onclick="atarget(this)"]/text()').extract_first()
            # View count
            item["num"] = tr.xpath('./td[@class="num"]/em/text()').extract_first()
            # Detail-page URL (the href is relative to the site root)
            item["url"] = self.base_url + str(
                tr.xpath('./th/a[@onclick="atarget(this)"]/@href').extract_first())
            yield scrapy.Request(
                url=item["url"],
                callback=self.parse1,
                meta={'item': item}
            )
        # Total page count; the commented xpath extracts the real value,
        # but it is hard-coded to 2 here to keep the test crawl small
        # page_count = str(response.xpath('//*[@id="fd_page_bottom"]/div/label/span/text()')
        #                  .extract_first()).replace('/', "").replace("頁", "")
        page_count = 2
        # Current page number
        current_page = str(response.xpath('//*[@id="fd_page_bottom"]/div/strong/text()').extract_first())
        if int(page_count) != int(current_page):
            # Not the last page yet: follow the "next page" link
            next_url = response.xpath('//*[@id="fd_page_bottom"]/div/a[@class="nxt"]/@href').extract_first()
            next_url = self.base_url + next_url
            print(next_url, int(page_count), int(current_page))
            yield scrapy.Request(
                url=next_url,
                callback=self.parse
            )

    # Parse a detail page
    def parse1(self, response, **kwargs):
        item = response.meta['item']
        # The magnet link could also be matched directly against the page source:
        # item["cili"] = re.findall(r"magnet:\?xt=urn:btih:[0-9a-fA-F]{40}", response.text)
        # Here it is located via xpath first, then extracted with a regex;
        # the `or ""` guards against the xpath matching nothing
        text_xpath = '/html/body/div[6]/div[6]/div[2]/div[1]/table//tr[1]/td[2]/div[2]/div/div[1]/table//tr/td'
        item["cili"] = response.xpath(text_xpath).extract_first() or ""
        item["cili"] = re.findall(r"magnet:\?xt=urn:btih:[0-9a-fA-F]{40}", item["cili"])
        # Image URLs of the attached screenshots
        # picture_xpath = '//ignore_js_op/img/@zoomfile'
        picture_xpath = '//img[@class="zoom"]/@file'
        item["picture"] = response.xpath(picture_xpath).extract()
        yield item
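The XPaths above are brittle (especially the long absolute one for the magnet link), so it is worth verifying them interactively in scrapy shell before running the crawl; the URL below is just the list page from start_urls:

scrapy shell "https://rtretyrytre.xyz/forum-2-2.html"
>>> len(response.xpath('//table//tr')[5:-2])
>>> response.xpath('//*[@id="fd_page_bottom"]/div/strong/text()').extract_first()

Once they check out, start the spider with scrapy crawl sehuatang.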
5. Modify pipelines.py
The key point is subclassing ImagesPipeline.
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
import random
import time

import pymongo
import scrapy

from SeHuaTang.settings import IMAGES_STORE
from scrapy.pipelines.images import ImagesPipeline


class SeHuaTangPipeline(ImagesPipeline):
    # Schedule one download request per image URL
    def get_media_requests(self, item, info):
        # Save the item to MongoDB first (this opens a connection per item,
        # which is fine for a small crawl but wasteful at scale)
        myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        mydb = myclient['SeHuaTang']
        mycol = mydb["demo"]
        mycol.insert_one(dict(item))
        for image_url in item['picture']:
            yield scrapy.Request(image_url)

    # Rename the downloaded images
    def item_completed(self, results, item, info):
        # Collect the storage paths of the successfully downloaded images
        image_paths = [x["path"] for ok, x in results if ok]
        for path in image_paths:
            # old name -> new name: video title + timestamp + random number
            if os.path.exists(IMAGES_STORE + path):
                os.rename(IMAGES_STORE + path,
                          IMAGES_STORE + str(item["common"])
                          + time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
                          + str(random.randint(1, 10000000)) + '.jpg')
        return item
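Renaming with os.rename after the fact works, but it leaves ImagesPipeline's own records pointing at files that no longer exist, which breaks its duplicate detection on later runs. A cleaner alternative, sketched here under the assumption of Scrapy ≥ 2.4 (the version that passes item into file_path), is to override file_path so each image is written with the desired name in the first place:

import random
import time

from scrapy.pipelines.images import ImagesPipeline


class RenamedImagesPipeline(ImagesPipeline):
    # Hypothetical variant of the pipeline above: name files on the way in
    # instead of renaming them afterwards in item_completed.
    def file_path(self, request, response=None, info=None, *, item=None):
        stamp = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
        # item["common"] may contain characters that are illegal in file names;
        # a real implementation should sanitize it first
        return f'{item["common"]}{stamp}{random.randint(1, 10000000)}.jpg'

file_path returns a path relative to IMAGES_STORE, so no os module calls or existence checks are needed with this approach.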
