Edit the configuration file settings.py and add:
ITEM_PIPELINES = {
    # 'tutorial.pipelines.QQNewsPipeline': 300,
    'tutorial.pipelines.QQNewsMongoPipeline': 400,
}
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = "qqNews"
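Before running the spider, it can help to confirm that the MONGO_URI above actually points at a reachable MongoDB instance. A minimal sketch, assuming pymongo is installed and mongod is running locally:

import pymongo

# Use a short timeout so a down server fails fast instead of hanging.
client = pymongo.MongoClient('mongodb://localhost:27017',
                             serverSelectionTimeoutMS=2000)
client.admin.command('ping')  # raises ServerSelectionTimeoutError if unreachable
print(client.list_database_names())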
Edit pipelines.py and add:
import pymongo


class QQNewsMongoPipeline(object):
    collection = 'military_affairs'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        '''
        Scrapy gives us access to the settings through this method; here we
        read the database URI and database name from settings.py.
        '''
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        '''
        Called as soon as the spider opens: connect to the database.
        '''
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        '''
        Called as soon as the spider closes: close the database connection.
        '''
        self.client.close()

    def process_item(self, item, spider):
        '''
        Every pipeline class that saves items must implement this method
        (the name is fixed); it defines how each item is actually saved.
        '''
        if not item['title']:
            return item
        data = {
            'title': item['title'][0],
            'content': item['content'],
        }
        table = self.db[self.collection]
        table.insert_one(data)
        return item
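Once a crawl has finished, the stored documents can be checked from an interactive Python session. A minimal sketch, using the qqNews database and military_affairs collection defined above:

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['qqNews']

# Count the saved articles and inspect one document.
print(db['military_affairs'].count_documents({}))
print(db['military_affairs'].find_one())

Note that insert_one will happily store duplicates across repeated crawls; switching to update_one(..., upsert=True) keyed on the title is a common way to make the pipeline idempotent.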