首先创建 itemSpider
在 spiders 目录里创建 item_spider.py，输入以下代码：
"""语言版本: python:3.6.1 scrapy:1.3.3"""
import re

import scrapy


class itemSpider(scrapy.Spider):
    """Crawl http://800zy17.com: follow category links from the front page,
    paginate through each listing, and append each video's title and play
    URL to a per-category text file.
    """
    name = 'niu'
    start_urls = ['http://800zy17.com/']

    def parse(self, response):
        """Yield a request to self.fenlei for every category link on the
        front page."""
        for href in response.xpath("//div[@class='width1200']//@href").extract():
            # response.urljoin handles both relative and absolute hrefs;
            # the original concatenated 'http://800zy17.com' + href by hand,
            # which breaks on already-absolute links.
            yield scrapy.Request(response.urljoin(href), callback=self.fenlei)

    def fenlei(self, response):
        """Yield a request to self.get_title for every video on this listing
        page, then follow the "next page" link(s) back into self.fenlei."""
        for href in response.xpath("//a[@class='videoName']//@href").extract():
            yield scrapy.Request(response.urljoin(href), callback=self.get_title)
        # BUG FIX: the original tested `if d is not None` where `d` was the
        # whole list (always truthy) instead of checking the individual link,
        # so the guard never filtered anything. Check each href instead.
        for href in response.xpath('//a[@target="_self"][text()="下一页"]//@href').extract():
            if href:
                yield scrapy.Request(response.urljoin(href), callback=self.fenlei)

    def get_title(self, response):
        """Extract the video title, play URL, and category from a detail
        page and append them to '<category>.txt'."""
        title = response.xpath("//p[@class='whitetitle']//text()").extract_first()
        play_url = response.xpath('//div[@class="playlist wbox"]//text()').extract_first()
        category = response.xpath('//div[@class="right"]//a//text()').extract_first()
        # BUG FIX: extract_first() returns None when the xpath matches
        # nothing; the original then crashed in re.findall(None) or
        # `None + ','`. Skip pages missing any of the three fields.
        if title is None or play_url is None or category is None:
            return
        # Keep only the CJK characters of the title, joined with ':'.
        title = ':'.join(re.findall('[\u4e00-\u9fa5]+', title))
        fileName = '%s.txt' % category
        # `with` guarantees the file handle is closed even if a write fails
        # (the original used a bare open()/close() pair).
        with open(fileName, "a+", encoding='utf-8') as f:
            f.write(play_url + ',')
            f.write('\n')
            f.write(title + ',')
然后运行 scrapy crawl niu 就可以抓取全部内容了，其他部分不需要修改。