基於 Scrapy 框架爬取電影影評
爬蟲主程序:
import scrapy

from ..items import DoubanmovieItem


class MoviespiderSpider(scrapy.Spider):
    """Spider that walks the Douban movie Top 250 listing page by page.

    Every ``div.item`` node on a page becomes one ``DoubanmovieItem``; the
    "next" link, when present, is followed with ``parse`` as the callback.
    """

    name = 'moviespider'
    allowed_domains = ['douban.com']
    start_urls = ['http://movie.douban.com/top250']

    def parse(self, response):
        """Yield one item per movie entry, then a request for the next page.

        Each item field holds the raw list returned by ``.extract()``, so a
        field is an empty list when the XPath matches nothing.
        """
        # (field name, XPath relative to the movie's ``div.item`` node),
        # listed in the order the fields are populated.
        field_queries = (
            ('rank', 'div[@class="pic"]/em/text()'),
            ('title', 'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'),
            ('quote', 'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"][1]/text()'),
            ('star', 'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/text()'),
            ('src', 'div[@class="pic"]/a/img/@src'),
        )

        for entry in response.xpath('//div[@class="item"]'):
            movie = DoubanmovieItem()
            for field, query in field_queries:
                movie[field] = entry.xpath(query).extract()
            yield movie

        # Look up the address of the next page.
        next_links = response.xpath('//span[@class="next"]/a/@href').extract()
        if not next_links:
            # No "next" link was found: nothing more to request.
            print("退出")
            return

        # Request the next page and keep parsing it with parse().
        next_url = response.urljoin(next_links[-1])
        yield scrapy.Request(next_url, self.parse, dont_filter=False)
items 對象
import scrapy


class DoubanmovieItem(scrapy.Item):
    """Container for the data scraped from one movie entry."""

    # Text of the <em> inside the entry's "pic" block.
    rank = scrapy.Field()
    # Text of the first "title" span in the entry header.
    title = scrapy.Field()
    # Text of the first "inq" span inside the "quote" paragraph.
    quote = scrapy.Field()
    # Text nodes of the spans inside the "star" block.
    star = scrapy.Field()
    # "src" attribute of the entry's image.
    src = scrapy.Field()
pipelines 輸出管道
class DoubanmoviePipeline(object):
    """Item pipeline that prints each scraped movie to the console."""

    @staticmethod
    def _pick(values, index=0, default=''):
        """Return ``values[index]``, or ``default`` when that position is absent.

        The fields are indexed as lists here; a field with no matching data
        (for example a movie without a short quote) would presumably arrive
        as an empty list, and plain indexing would raise ``IndexError``.
        """
        try:
            return values[index]
        except (IndexError, TypeError):
            return default

    def process_item(self, item, spider):
        """Print the item's fields and pass the item on unchanged.

        Args:
            item: Mapping with ``rank``, ``title``, ``quote``, ``star`` and
                ``src`` keys; the first four are read by position.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, so any later pipeline can process it.
        """
        pick = self._pick
        star = item['star']
        print('電影排名:{0}'.format(pick(item['rank'])))
        print('電影名稱:{0}'.format(pick(item['title'])))
        print('電影短評:{0}'.format(pick(item['quote'])))
        # ``star`` carries two values: the score first, then the vote count.
        print('評價分數:{0}'.format(pick(star, 0)))
        print('評價人數:{0}'.format(pick(star, 1)))
        # The image link is printed as the whole stored value, not one element.
        print('圖片鏈接:{0}'.format(item['src']))
        print('-' * 20)
        return item
在控制台輸出的結果
可以通過爬取到的圖片鏈接下載電影的劇照,這裡暫不展開;也可以另外設置一個寫入數據庫的管道,將這些數據存入數據庫中。