使用 Scrapy 循环爬取色花堂的标题和浏览次数

爬虫部分代码

import scrapy


class ItcastSpider(scrapy.Spider):
    """Crawl a forum board page by page, yielding thread titles and view counts.

    Starting from ``start_urls``, each listing page is parsed into one dict
    per table row, and the "next page" link is followed until the pager
    reports the last page or no such link is present.
    """

    name = 'sehuatang'  # Spider name
    # allowed_domains = ['itcast.cn']  # Domains the spider may crawl (disabled)
    start_urls = ['https://rtuytuytuewr.xyz/forum-2-2.html']  # First URL requested

    @staticmethod
    def _to_page_number(text):
        """Return the integer contained in a pager label, or None if unusable.

        Args:
            text: Raw text extracted from the pager (for example the
                "/ N 页" total-pages label), or None when the node is absent.

        Returns:
            The parsed page number, or None when ``text`` is None or does
            not reduce to an integer.
        """
        if text is None:
            return None
        cleaned = text.replace('/', "").replace("页", "").strip()
        try:
            return int(cleaned)
        except ValueError:
            return None

    def parse(self, response):
        """Yield one item per listing row, then a request for the next page.

        Args:
            response: The Scrapy response for a board listing page.

        Yields:
            dict: ``{"common": <title or None>, "num": <view count or None>}``
                for every selected table row.
            scrapy.Request: A request for the next listing page, handled by
                this same callback, when a next page exists.
        """
        # Select the listing rows. The first five and last two rows are
        # skipped - presumably header/pinned and footer rows; confirm against
        # the page markup.
        tr_list = response.xpath('//table//tr')[5:-2]
        for tr in tr_list:
            item = {}
            # Thread title
            item["common"] = tr.xpath(
                './th/a[@onclick="atarget(this)"]/text()'
            ).extract_first()
            # View count
            item["num"] = tr.xpath('./td[@class="num"]/em/text()').extract_first()
            yield item

        # Total number of pages as reported by the bottom pager.
        page_count = self._to_page_number(
            response.xpath(
                '//*[@id="fd_page_bottom"]/div/label/span/text()'
            ).extract_first()
        )
        # Page currently being displayed.
        current_page = self._to_page_number(
            response.xpath(
                '//*[@id="fd_page_bottom"]/div/strong/text()'
            ).extract_first()
        )

        # Stop when the pager positively says this is the last page. If the
        # pager could not be read, fall through and rely on the next link.
        if (
            page_count is not None
            and current_page is not None
            and page_count == current_page
        ):
            return

        next_href = response.xpath(
            '//*[@id="fd_page_bottom"]/div/a[@class="nxt"]/@href'
        ).extract_first()
        if not next_href:
            # No "next" link on the page: nothing further to crawl.
            return

        # Resolve against the response URL so both relative and absolute
        # hrefs produce a valid request URL.
        next_url = response.urljoin(next_href)
        self.logger.debug(
            "Following next page %s (page %s of %s)",
            next_url,
            current_page,
            page_count,
        )
        # Schedule the next listing page with this same callback.
        yield scrapy.Request(
            url=next_url,
            callback=self.parse
        )

免责声明！

本站转载的文章为个人学习借鉴使用，本站对版权不负任何法律责任。如果侵犯了您的隐私权益，请联系本站邮箱yoyou2525@163.com删除。

猜您在找：Scrapy爬取色花堂磁力和图片；使用谷歌浏览器取色器取色；通过协程管理实现scrapy异步循环爬取；谷歌浏览器取色；python3下scrapy爬虫（第八卷：循环爬取网页多页数据）；scrapy多url爬取；scrapy爬取京东；scrapy增量爬取；C语言 for循环次数；thymeleaf 循环固定次数