Scraping the 800 resource site with Scrapy: crawling the whole site's resources by category. NSFW site, enter with caution! Watchable directly on phone or PC.


 

First, create the ItemSpider.
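This assumes a Scrapy project already exists. If you are starting from scratch, a minimal setup sketch (the project name zy800 is only a placeholder, not from the original post):

scrapy startproject zy800
cd zy800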

Create item_spider.py inside the spiders directory and enter:

"""
语言版本:

python:3.6.1
scrapy:1.3.3

"""

import re

import scrapy


class ItemSpider(scrapy.Spider):
    name = 'niu'
    start_urls = ['http://800zy17.com/']

    def parse(self, response):
        # Home page: collect every category link inside the navigation block.
        category_urls = response.xpath("//div[@class='width1200']//@href").extract()
        for url in category_urls:
            # The hrefs are relative, so resolve them against the site root.
            yield scrapy.Request(response.urljoin(url), callback=self.fenlei)

    def fenlei(self, response):
        # Category listing page: follow every video detail link on it.
        video_urls = response.xpath("//a[@class='videoName']//@href").extract()
        for url in video_urls:
            yield scrapy.Request(response.urljoin(url), callback=self.get_title)
        # Follow the "下一页" (next page) link, if present, to walk the whole category.
        next_pages = response.xpath('//a[@target="_self"][text()="下一页"]//@href').extract()
        for next_page in next_pages:
            yield scrapy.Request(response.urljoin(next_page), callback=self.fenlei)

    def get_title(self, response):
        # Detail page: title, playback text (contains the m3u8 address) and category.
        title = response.xpath("//p[@class='whitetitle']//text()").extract_first()
        play_url = response.xpath('//div[@class="playlist wbox"]//text()').extract_first()
        mingcheng = response.xpath('//div[@class="right"]//a//text()').extract_first()
        if not (title and play_url and mingcheng):
            return  # skip pages where any field failed to extract
        # Keep only the Chinese characters of the title.
        title = ':'.join(re.findall('[\u4e00-\u9fa5]+', title))

        # Append the result to a per-category text file.
        file_name = '%s.txt' % mingcheng
        with open(file_name, 'a+', encoding='utf-8') as f:
            f.write(play_url + ',')
            f.write('\n')
            f.write(title + ',')

Then run scrapy crawl niu and the whole site will be scraped; nothing else needs to be modified.
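Writing files directly inside the spider works, but the more idiomatic Scrapy route is to yield items and let an item pipeline do the file I/O. Below is a minimal sketch of that alternative, assuming the hypothetical field names category, title and play_url (none of them are in the original code): get_title would yield {'category': mingcheng, 'title': title, 'play_url': play_url} instead of opening the file itself.

# pipelines.py -- hypothetical alternative to the in-spider file writing
class PerCategoryFilePipeline:
    def process_item(self, item, spider):
        # Append each record to a per-category text file,
        # mirroring the format the spider wrote above.
        file_name = '%s.txt' % item['category']
        with open(file_name, 'a+', encoding='utf-8') as f:
            f.write(item['play_url'] + ',\n' + item['title'] + ',')
        return item

# settings.py -- enable the pipeline (replace zy800 with your project name):
# ITEM_PIPELINES = {'zy800.pipelines.PerCategoryFilePipeline': 300}

A pipeline also makes it easier to add de-duplication or validation later without touching the spider.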

 

