scrapy循环爬取色花堂标题和浏览次数
爬虫部分代码
import scrapy
class ItcastSpider(scrapy.Spider):
    """Crawl a forum thread listing page by page.

    For every table row in the listing, an item with the thread title
    (key ``"common"``) and the view count (key ``"num"``) is yielded.
    After the rows of a page are emitted, the bottom pager is inspected
    and, unless the current page is the last one, a request for the next
    page is scheduled with ``parse`` as its callback.
    """

    name = 'sehuatang'  # spider name
    # allowed_domains = ['itcast.cn']  # domains the spider is allowed to crawl
    start_urls = ['https://rtuytuytuewr.xyz/forum-2-2.html']  # first URL requested

    # Common XPath prefix of the pager at the bottom of the listing.
    _PAGER_XPATH = '//*[@id="fd_page_bottom"]/div'

    @staticmethod
    def _parse_page_number(text):
        """Return ``text`` as an int, or ``None`` when it is not a number.

        The "/" and "页" characters are stripped first; the total-page
        label presumably looks like "/ 12 页" (verify against the live
        markup).
        """
        if text is None:
            return None
        try:
            return int(text.replace('/', '').replace('页', ''))
        except ValueError:
            return None

    def parse(self, response):
        """Yield one item per listing row, then follow the next page.

        Args:
            response: the Scrapy response for a listing page.

        Yields:
            dict items with keys ``"common"`` and ``"num"``, followed by
            at most one ``scrapy.Request`` for the next listing page.
        """
        # Row list; the slice drops the first five and the last two rows.
        for tr in response.xpath('//table//tr')[5:-2]:
            yield {
                # Thread title
                "common": tr.xpath(
                    './th/a[@onclick="atarget(this)"]/text()'
                ).extract_first(),
                # View count
                "num": tr.xpath('./td[@class="num"]/em/text()').extract_first(),
            }

        # Total number of pages and the page currently being parsed.
        page_count = self._parse_page_number(
            response.xpath(self._PAGER_XPATH + '/label/span/text()').extract_first()
        )
        current_page = self._parse_page_number(
            response.xpath(self._PAGER_XPATH + '/strong/text()').extract_first()
        )
        if page_count is None or current_page is None:
            # Without a readable pager there is no safe way to continue;
            # stop instead of crashing on int("None").
            self.logger.warning('Pager not found on %s; stopping.', response.url)
            return
        if page_count == current_page:
            # Last page reached.
            return

        # Not the last page: locate the "next page" link.
        next_href = response.xpath(
            self._PAGER_XPATH + '/a[@class="nxt"]/@href'
        ).extract_first()
        if next_href is None:
            self.logger.warning('No next-page link on %s; stopping.', response.url)
            return

        # Resolve against the page URL so that relative, root-relative and
        # absolute hrefs are all handled, without hard-coding the host.
        next_url = response.urljoin(next_href)
        self.logger.info('%s %s %s', next_url, page_count, current_page)
        # Schedule the next page.
        yield scrapy.Request(url=next_url, callback=self.parse)