# scrapy循環爬取色花堂標題和瀏覽次數
# 爬蟲部分代碼
import scrapy
class ItcastSpider(scrapy.Spider):
    """Crawl forum listing pages and yield each row's title and view count.

    Crawling starts from ``start_urls``; ``parse`` keeps following the
    pager's "next" link until the pager reports the last page or no next
    link is present.
    """

    name = 'sehuatang'  # spider name
    # allowed_domains = ['itcast.cn']  # domains the spider may crawl (disabled)
    start_urls = ['https://rtuytuytuewr.xyz/forum-2-2.html']  # first URL requested

    def parse(self, response):
        """Extract the listing rows of one page, then queue the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            dict: ``{"common": <title>, "num": <view count>}`` for each
                selected table row; a value is ``None`` when the row has no
                matching cell.
            scrapy.Request: A request for the next listing page, handled by
                this same callback, when a next page exists.
        """
        # The first five and last two table rows are skipped.
        # NOTE(review): presumably header/pinned/footer rows -- confirm
        # against the live page layout.
        for row in response.xpath('//table//tr')[5:-2]:
            yield {
                # Thread title.
                "common": row.xpath(
                    './th/a[@onclick="atarget(this)"]/text()'
                ).extract_first(),
                # View count.
                "num": row.xpath(
                    './td[@class="num"]/em/text()'
                ).extract_first(),
            }

        pager = response.xpath('//*[@id="fd_page_bottom"]/div')

        # Pull only the digits out of the pager labels. A missing node gives
        # None here instead of the string "None", which would make int()
        # raise ValueError.
        total_pages = pager.xpath('./label/span/text()').re_first(r'\d+')
        current_page = pager.xpath('./strong/text()').re_first(r'\d+')
        if (
            total_pages is not None
            and current_page is not None
            and int(total_pages) == int(current_page)
        ):
            # Already on the last page: nothing more to schedule.
            return

        next_href = pager.xpath('./a[@class="nxt"]/@href').extract_first()
        if not next_href:
            # No "next" link (single page or unexpected layout): stop here
            # rather than failing on None concatenation.
            return

        # Resolve against the current page URL so relative, root-relative and
        # absolute hrefs all work without hard-coding the host.
        next_url = response.urljoin(next_href)
        self.logger.debug(
            "next page: %s (total=%s, current=%s)",
            next_url, total_pages, current_page,
        )
        # Schedule the next listing page.
        yield scrapy.Request(
            url=next_url,
            callback=self.parse,
        )