使用 Scrapy 抓取玉米资源网:按分类抓取全站资源(X站慎入!手机、电脑均可直接观看)


 

 

 

首先创建爬虫类 itemSpider。

在 spiders 目录下创建 item_spider.py,输入以下代码:

"""
语言版本:

python:3.6.1
scrapy:1.3.3


"""

import scrapy
import re

class itemSpider(scrapy.Spider):
    """Crawl the site category by category and save every resource's play link.

    Crawl flow:
        1. ``parse``     - read the navigation bar on the start page and
                           request every category link found there.
        2. ``fenlei``    - on a category listing page, request every resource
                           detail page, then follow the "next page" link.
        3. ``get_title`` - on a detail page, extract the title, the play link
                           and the category text, and append one record to
                           ``<category>.txt`` in the working directory.
    """

    name = 'yumi'
    start_urls = ['http://3000.ym788.vip/']

    # Matches runs of CJK ideographs; used to drop HTML markup and keep only
    # the Chinese text of an extracted element.
    _CJK_RUN = re.compile(r'[\u4e00-\u9fa5]+')

    @staticmethod
    def _absolute_links(response, xpath):
        """Yield absolute http(s) URLs for every href selected by *xpath*.

        Links are resolved against the URL of *response* instead of a
        hard-coded host, so relative, root-relative and absolute hrefs are all
        handled.  Anything that does not resolve to http/https (for example
        ``javascript:`` or ``mailto:`` links) is skipped because it cannot be
        turned into a request.
        """
        for href in response.xpath(xpath).extract():
            url = response.urljoin(href)
            if url.startswith(('http://', 'https://')):
                yield url

    def parse(self, response):
        """Request every category page linked from the navigation bar."""
        nav_links = self._absolute_links(
            response, "//ul[@class='nav navbar-nav']//@href")
        for url in nav_links:
            yield scrapy.Request(url, callback=self.fenlei)

    def fenlei(self, response):
        """Request each resource on a category page, then the next page."""
        detail_links = self._absolute_links(
            response, "//div[@class='name left']//@href")
        for url in detail_links:
            yield scrapy.Request(url, callback=self.get_title)

        # Pagination: the listing is parsed by this same callback, so the
        # crawl continues until a page has no "下一页" (next page) link.
        next_links = self._absolute_links(
            response, '//a[@target="_self"][text()="下一页"]//@href')
        for url in next_links:
            yield scrapy.Request(url, callback=self.fenlei)

    def get_title(self, response):
        """Append the title and play link of a detail page to a text file.

        The output file is named after the Chinese text found in the category
        table cell (``<category>.txt``) and is opened in append mode, so all
        resources of one category accumulate in the same file.  Each record is
        two lines: ``<title>,`` followed by ``<play link>,``.

        Pages on which any of the three fields cannot be located are skipped
        with a warning instead of raising ``TypeError``.
        """
        title_html = response.xpath(
            "//div[@class='col-xs-9 movie-info padding-right-5']//h1"
        ).extract_first()
        port = response.xpath('//a[@is_source="no"]//text()').extract_first()
        category_html = response.xpath(
            '/html/body/div[2]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[3]/td[2]'
        ).extract_first()

        # extract_first() returns None when the XPath matches nothing; the
        # regex and string concatenation below cannot work with None.
        if title_html is None or port is None or category_html is None:
            self.logger.warning(
                'Skipping %s: title, play link or category not found',
                response.url)
            return

        # Keep only the Chinese text of the extracted HTML fragments.
        IP = ':'.join(self._CJK_RUN.findall(title_html))
        mingcheng = ','.join(self._CJK_RUN.findall(category_html))

        fileName = '%s.txt' % mingcheng
        # The context manager guarantees the handle is closed even if the
        # write fails.
        with open(fileName, 'a+', encoding='utf-8') as f:
            f.write(IP + ',\n' + port + ',\n')

在项目的 settings.py 里面添加以下配置:

# Crawl-speed and logging settings for the spider (goes in settings.py).
# These values favour maximum throughput and put a heavy load on the site.

# No waiting time between consecutive requests to the same site.
DOWNLOAD_DELAY = 0
# Global cap on requests the downloader performs concurrently.
CONCURRENT_REQUESTS = 100
# Cap on concurrent requests to any single domain.
CONCURRENT_REQUESTS_PER_DOMAIN = 100
# Cap on concurrent requests to any single IP. NOTE: per the Scrapy settings
# reference, a non-zero value here makes Scrapy ignore
# CONCURRENT_REQUESTS_PER_DOMAIN and apply the limit per IP instead.
CONCURRENT_REQUESTS_PER_IP = 100
# Disable the cookies middleware: no cookies are stored or sent.
COOKIES_ENABLED = False
# Only log messages of severity ERROR or higher.
LOG_LEVEL = 'ERROR'

最后,在项目目录下执行 `scrapy crawl yumi` 运行爬虫即可。


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM