Scrapy爬取色花堂磁力和圖片


Scrapy爬取色花堂磁力和圖片

重點說明爬取圖片

一.創建項目

scrapy startproject SeHuaTang
scrapy genspider sehuatang "rtretyrytre.xyz"

二.修改settings.py文件

# Only log WARNING and above to keep the console quiet.
LOG_LEVEL = "WARNING"
# Base address of the site to crawl (imported by the spider).
BASE_URL="https://rtretyrytre.xyz/"
# Do not honour robots.txt.
ROBOTSTXT_OBEY = False
# Directory where downloaded images are saved.

import os
# # Alternative: save images inside the project directory.
# project_dir = os.path.abspath(os.path.dirname(__file__))  # absolute path of this crawler project
# IMAGES_STORE = os.path.join(project_dir, 'images')  # assemble the image directory path
IMAGES_STORE = 'D:/ImageSpider/'
# Enable the item pipeline (number = execution order, lower runs first).
ITEM_PIPELINES = {
    'SeHuaTang.pipelines.SeHuaTangPipeline': 300,
}

三.修改items.py文件

import scrapy

class SeHuaTangItem(scrapy.Item):
    """Container for one forum thread scraped from the list page."""

    # Movie / thread title.
    common = scrapy.Field()
    # View count shown on the list page.
    num = scrapy.Field()
    # Absolute URL of the thread's detail page.
    url = scrapy.Field()
    # Magnet link(s) extracted from the post body.
    cili = scrapy.Field()
    # Image URLs found in the post.
    picture = scrapy.Field()

四.修改SeHuaTang.py文件

import re

import scrapy

from SeHuaTang.settings import BASE_URL
from SeHuaTang.items import SeHuaTangItem


class SeHuaTangSpider(scrapy.Spider):
    """Crawl the forum list pages, then each thread's detail page, collecting
    magnet links and image URLs into SeHuaTangItem objects."""

    # Site base address, configured once in settings.py.
    base_url = BASE_URL
    name = 'sehuatang'  # spider name used by `scrapy crawl`
    # allowed_domains = ['itcast.cn']  # crawl-scope restriction (disabled)
    # BASE_URL ends with '/', so strip it before appending to avoid '//'.
    start_urls = [base_url.rstrip('/') + '/forum-2-2.html']

    def parse(self, response):
        """Parse one forum list page.

        Yields one detail-page Request per thread row, then follows the
        "next page" link until ``page_count`` pages have been visited.
        """
        # The first 5 rows are header/sticky rows and the last 2 are footer
        # rows — presumably; slice matches the original author's observation.
        tr_list = response.xpath('//table//tr')[5:-2]
        for tr in tr_list:
            # Thread link; rows without it (e.g. separators) are skipped so
            # we never build a URL ending in the string 'None'.
            href = tr.xpath('./th/a[@onclick="atarget(this)"]/@href').extract_first()
            if href is None:
                continue
            item = SeHuaTangItem()
            # Movie / thread title.
            item["common"] = tr.xpath('./th/a[@onclick="atarget(this)"]/text()').extract_first()
            # View count.
            item["num"] = tr.xpath('./td[@class="num"]/em/text()').extract_first()
            # Absolute detail-page URL.
            item["url"] = self.base_url + str(href)
            yield scrapy.Request(
                url=item["url"],
                callback=self.parse1,
                meta={'item': item}
            )
        # Total number of list pages to crawl.  The commented-out xpath reads
        # the real count from the pager; hard-coded to 2 for now.
        # page_count = str(response.xpath('//*[@id="fd_page_bottom"]/div/label/span/text()')
        #                  .extract_first()).replace('/', "").replace("頁", "")
        page_count = 2
        # Current page number from the pager; may be missing on the last page.
        current_page = response.xpath('//*[@id="fd_page_bottom"]/div/strong/text()').extract_first()
        if current_page is not None and int(page_count) != int(current_page):
            # Not the last page yet: follow the "next" pager link.
            next_url = response.xpath('//*[@id="fd_page_bottom"]/div/a[@class="nxt"]/@href').extract_first()
            if next_url is not None:
                next_url = self.base_url + next_url
                print(next_url, int(page_count), int(current_page))
                yield scrapy.Request(
                    url=next_url,
                    callback=self.parse
                )

    def parse1(self, response, **kwargs):
        """Parse a thread detail page: add magnet links and image URLs to the
        item carried over from parse() via response.meta."""
        item = response.meta['item']
        # Alternative: match magnets with a regex over the whole page:
        # guize1 = r"(magnet:\?xt=urn:btih:[0-9a-fA-F]{40}.*?)"
        # item["cili"] = re.findall(guize1, response.text)
        # Post-body table cell that contains the magnet link text.
        text_xpath = '/html/body/div[6]/div[6]/div[2]/div[1]/table//tr[1]/td[2]/div[2]/div/div[1]/table//tr/td'
        raw = response.xpath(text_xpath).extract_first()
        # Guard: extract_first() may return None, which would make
        # re.findall() raise TypeError.  Raw string avoids the '\?' escape
        # deprecation warning.
        item["cili"] = re.findall(r"(magnet:\?xt=urn:btih:[0-9a-fA-F]{40}.*?)", raw or "")
        # Image URLs live in the 'file' attribute of zoomable <img> tags.
        # picture_xpath = '//ignore_js_op/img/@zoomfile'
        picture_xpath = '//img[@class="zoom"]/@file'
        item["picture"] = response.xpath(picture_xpath).extract()
        yield item

五.修改pipelines.py文件

重點在於繼承ImagesPipeline這個類

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
import math
import os
import random
import time
import pymongo

import scrapy
from SeHuaTang.settings import IMAGES_STORE
from scrapy.pipelines.images import ImagesPipeline

class SeHuaTangPipeline(ImagesPipeline):
    """Image pipeline: stores each item in MongoDB, downloads its images,
    then renames the downloaded files after the movie title."""

    # Shared MongoDB collection, created lazily on first use.  The original
    # code opened a brand-new MongoClient for EVERY item, leaking
    # connections; one client is reused for the whole crawl instead.
    _collection = None

    @classmethod
    def _get_collection(cls):
        """Return the SeHuaTang.demo collection, connecting only once."""
        if cls._collection is None:
            client = pymongo.MongoClient("mongodb://localhost:27017/")
            cls._collection = client['SeHuaTang']["demo"]
        return cls._collection

    def get_media_requests(self, item, info):
        """Persist the item to MongoDB, then yield one download Request per
        image URL collected on the detail page."""
        self._get_collection().insert_one(dict(item))
        for image_url in item['picture']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Rename each successfully downloaded image to
        ``<title><timestamp><random>.jpg`` under IMAGES_STORE."""
        # Keep only the relative storage paths of the successful downloads.
        image_paths = [res["path"] for ok, res in results if ok]
        timestamp = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
        for rel_path in image_paths:
            src = os.path.join(IMAGES_STORE, rel_path)
            if os.path.exists(src):
                # Random suffix keeps names unique when several images of
                # the same movie finish within the same second.
                dst = os.path.join(
                    IMAGES_STORE,
                    str(item["common"]) + timestamp
                    + str(random.randint(1, 10000000)) + '.jpg')
                os.rename(src, dst)
        return item


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM