Python的scrapy之爬取豆瓣影評和排名


基於scrapy框架爬取影評

爬蟲主程序:

import scrapy
from ..items import DoubanmovieItem

class MoviespiderSpider(scrapy.Spider):
    """Spider that starts at the Douban top250 URL and follows "next" links.

    Every ``div`` with class ``item`` found on a page produces one
    ``DoubanmovieItem``. Each field stores the raw list returned by
    ``extract()``, so consumers index into it themselves.
    """

    name = 'moviespider'
    allowed_domains = ['douban.com']
    start_urls = ['http://movie.douban.com/top250']

    def parse(self, response):
        """Yield one item per movie node, then a request for the next page.

        When the page has no ``span.next`` link, "退出" is printed and the
        generator ends without scheduling another request.
        """
        # (field name, XPath relative to a single movie node), in the order
        # the fields are filled in.
        field_paths = (
            ('rank', 'div[@class="pic"]/em/text()'),
            ('title',
             'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'),
            ('quote',
             'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"][1]/text()'),
            ('star',
             'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/text()'),
            ('src', 'div[@class="pic"]/a/img/@src'),
        )

        for node in response.xpath('//div[@class="item"]'):
            record = DoubanmovieItem()
            for field, path in field_paths:
                record[field] = node.xpath(path).extract()
            yield record

        # Look up the address of the next page.
        next_links = response.xpath('//span[@class="next"]/a/@href').extract()
        if not next_links:
            print("退出")
            return

        # Request the next page and parse it with this same method.
        next_url = response.urljoin(next_links[-1])
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=False)

 

items 對象

import scrapy


class DoubanmovieItem(scrapy.Item):
    """Container for the five values scraped for one movie entry."""

    # Filled from the text of div.pic/em.
    rank = scrapy.Field()
    # Filled from the first span.title inside div.hd.
    title = scrapy.Field()
    # Filled from the first span.inq inside p.quote.
    quote = scrapy.Field()
    # Filled from the text of every span inside div.star.
    star = scrapy.Field()
    # Filled from the src attribute of the img inside div.pic.
    src = scrapy.Field()

pipelines 輸出管道

class DoubanmoviePipeline(object):
    """Item pipeline that prints every scraped movie to the console."""

    @staticmethod
    def _pick(values, index, default=''):
        """Return ``values[index]``, or ``default`` when it does not exist.

        ``values`` may be ``None`` or shorter than expected when an XPath
        matched nothing for this movie.
        """
        if values and len(values) > index:
            return values[index]
        return default

    def process_item(self, item, spider):
        """Print the fields of ``item`` and pass the item on unchanged.

        Args:
            item: Mapping-like item whose ``rank``, ``title``, ``quote``,
                ``star`` and ``src`` values are lists of extracted strings.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipelines still receive it.
        """
        star = item.get('star')
        print('電影排名:{0}'.format(self._pick(item.get('rank'), 0)))
        print('電影名稱:{0}'.format(self._pick(item.get('title'), 0)))
        # An empty quote list is printed as an empty string instead of
        # raising IndexError and losing the whole item.
        print('電影短評:{0}'.format(self._pick(item.get('quote'), 0)))
        print('評價分數:{0}'.format(self._pick(star, 0)))
        print('評價人數:{0}'.format(self._pick(star, 1)))
        # The image links are printed as the whole list, not a single entry.
        print('圖片鏈接:{0}'.format(item.get('src', [])))
        print('-' * 20)
        return item

在控制台輸出的結果

 

利用爬取到的圖片鏈接,可以進一步下載電影的劇照(這部分本文暫不討論);也可以另外設置一個寫入數據庫的管道,將這些數據保存到數據庫中。

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM