提取電影網站的片名,導演,影片播放地址。
items.py
import scrapy


class MovieItem(scrapy.Item):
    """Record for one scraped movie, made of three output fields."""

    # Title text taken from the movie's entry on the listing page.
    name = scrapy.Field()
    # Text of the first matching link in the detail page's info block.
    actor = scrapy.Field()
    # Absolute URL built from the link found on the detail page.
    link = scrapy.Field()
spider.py
import scrapy

from movie.items import MovieItem


class MovieproSpider(scrapy.Spider):
    """Crawl the 4567tv.tv film listing and yield one ``MovieItem`` per movie.

    ``parse`` handles listing pages: it reads each movie's title and detail
    URL, then schedules the detail page. ``parse_detail`` completes the item
    with the ``actor`` and ``link`` fields and yields it.
    """

    name = 'moviePro'
    allowed_domains = ['4567tv.tv']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']
    # Number of the most recently scheduled listing page; advanced by parse().
    page = 1
    # URL template for follow-up listing pages; %s is the page number.
    page_url = 'https://www.4567tv.tv/frim/index1-%s.html'

    def parse(self, response):
        """Handle a listing page.

        Yields one detail-page request per movie entry (carrying a partially
        filled item in ``meta``), followed by at most one request for the
        next listing page.
        """
        li_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]')
        for li in li_list:
            href = li.xpath('./div/div/h4/a/@href').extract_first()
            if not href:
                # extract_first() returns None when nothing matches; without
                # a detail URL there is nothing to follow, so skip the entry
                # instead of failing on str + None.
                self.logger.warning('Movie entry without detail link on %s', response.url)
                continue

            item = MovieItem()
            item['name'] = li.xpath('./div/div/h4/a/text()').extract_first()
            # The href is site-relative; resolve it against the page URL.
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parse_detail,
                meta={'item': item},
            )

        # Pagination: the counter starts at 1 and is incremented before the
        # URL is built, so listing pages 2 through 11 are requested.
        if self.page <= 10:
            self.page += 1
            new_page_url = self.page_url % self.page
            yield scrapy.Request(url=new_page_url, callback=self.parse)

    def parse_detail(self, response):
        """Handle a movie detail page and yield the completed item."""
        # The item was created in parse() and handed over through meta.
        item = response.meta['item']
        item['actor'] = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()'
        ).extract_first()
        link = response.xpath(
            '/html/body/div[1]/div/div/div/div[1]/a/@href'
        ).extract_first()
        # The link is a relative path; rebuild the absolute address. Keep
        # None when the page has no such link rather than raising TypeError.
        item['link'] = response.urljoin(link) if link else None
        yield item
settings.py
# Encoding for exported feeds (e.g. ``scrapy crawl moviePro -o mov.csv``).
# Per the author's note, Chinese text in the export shows up garbled unless
# an encoding is set explicitly.
# Assign this setting only once: a later assignment silently overrides an
# earlier one. 'gb18030' is the value that takes effect here; switch to
# 'utf-8' if the consumer of the file expects UTF-8.
FEED_EXPORT_ENCODING = 'gb18030'
運行
scrapy crawl moviePro -o mov.csv
結果