Scrapy連接到各類數據庫(SQLite,Mysql,Mongodb,Redis)

本文轉載自查看原文 2020-02-28 09:51 695 scrapy

本文介紹如何使用Scrapy連接到SQLite、MySQL、MongoDB、Redis數據庫，並把爬取的數據存儲到相應的數據庫中。

一、SQLite

1.修改pipelines.py文件加入如下代碼

# 爬取到的數據寫入到SQLite數據庫
import sqlite3

class SQLitePipeline(object):
    """Item pipeline that writes every scraped item into a SQLite table.

    The database file name is read from the ``SQLITE_DB_NAME`` setting
    (``scrapy.db`` when the setting is absent).  Rows are inserted into a
    table named ``books``; this class never creates that table, so it must
    already exist with six columns in the order used by ``insert_db``.
    """

    def open_spider(self, spider):
        """Open the database connection and a cursor when the spider starts."""
        db_name = spider.settings.get('SQLITE_DB_NAME', 'scrapy.db')

        self.db_conn = sqlite3.connect(db_name)
        self.db_cur = self.db_conn.cursor()

    def close_spider(self, spider):
        """Commit all pending inserts and close the connection.

        The connection is closed even when the commit raises, so a failed
        commit does not leak the open database handle; the exception itself
        still propagates to the caller.
        """
        try:
            self.db_conn.commit()
        finally:
            self.db_conn.close()

    def process_item(self, item, spider):
        """Store ``item`` in the database and return it unchanged."""
        self.insert_db(item)
        return item

    def insert_db(self, item):
        """Insert one row built from the six book fields of ``item``.

        Raises ``KeyError`` if ``item`` lacks any of the fields.  The row is
        not committed here; the commit happens in ``close_spider``.
        """
        values = (
            item['upc'],
            item['name'],
            item['price'],
            item['review_rating'],
            item['review_num'],
            item['stock'],
        )

        # Parameterized statement: the driver binds the values, no string
        # formatting of scraped data into SQL.
        sql = 'INSERT INTO books VALUES(?,?,?,?,?,?)'
        self.db_cur.execute(sql, values)

2.修改settings.py文件，加入如下代碼

# SQLite configuration: name of the database file opened by
# SQLitePipeline.open_spider (the pipeline falls back to 'scrapy.db'
# when this setting is absent).
SQLITE_DB_NAME = 'scrapy.db'

在settings啟動管道文件

# Enable the SQLite pipeline. The integer is the pipeline's order value;
# per the Scrapy documentation, pipelines with lower values run first.
ITEM_PIPELINES = {
   'toscrape_book.pipelines.SQLitePipeline': 400,
}

二、mysql

1.修改pipelines.py文件加入如下代碼

# 爬取到的數據寫入到MySQL數據庫
import pymysql
class MySQLPipeline(object):
    """Item pipeline that writes every scraped item into a MySQL table.

    Connection parameters come from the ``MYSQL_DB_NAME``, ``MYSQL_HOST``,
    ``MYSQL_PORT``, ``MYSQL_USER`` and ``MYSQL_PASSWORD`` settings, each
    with the default shown in ``open_spider``.  Rows are inserted into a
    table named ``books``; this class never creates that table, so it must
    already exist with six columns in the order used by ``insert_db``.
    """

    def open_spider(self, spider):
        """Open the database connection and a cursor when the spider starts."""
        settings = spider.settings
        db = settings.get('MYSQL_DB_NAME', 'scrapy_db')
        host = settings.get('MYSQL_HOST', 'localhost')
        # getint() coerces the value, so a port supplied as a string
        # (for example through a command-line setting override) still
        # reaches the driver as an integer.
        port = settings.getint('MYSQL_PORT', 3306)
        user = settings.get('MYSQL_USER', 'root')
        passwd = settings.get('MYSQL_PASSWORD', '123456')

        self.db_conn = pymysql.connect(host=host, port=port, db=db, user=user, passwd=passwd, charset='utf8')
        self.db_cur = self.db_conn.cursor()

    def close_spider(self, spider):
        """Commit all pending inserts and close the connection.

        The connection is closed even when the commit raises, so a failed
        commit does not leak the open connection; the exception itself
        still propagates to the caller.
        """
        try:
            self.db_conn.commit()
        finally:
            self.db_conn.close()

    def process_item(self, item, spider):
        """Store ``item`` in the database and return it unchanged."""
        self.insert_db(item)
        return item

    def insert_db(self, item):
        """Insert one row built from the six book fields of ``item``.

        Raises ``KeyError`` if ``item`` lacks any of the fields.  The row is
        not committed here; the commit happens in ``close_spider``.
        """
        values = (
            item['upc'],
            item['name'],
            item['price'],
            item['review_rating'],
            item['review_num'],
            item['stock'],
        )

        # Parameterized statement: the driver escapes and binds the values.
        sql = 'INSERT INTO books VALUES(%s,%s,%s,%s,%s,%s)'
        self.db_cur.execute(sql, values)

2.修改settings.py文件，加入如下代碼

# MySQL configuration: database name, server host and credentials used for
# the MySQL connection. MYSQL_PORT is not set here, so MySQLPipeline uses
# its built-in default of 3306.
MYSQL_DB_NAME = 'scrapy_db'
MYSQL_HOST = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'

在settings啟動管道文件

# Enable the MySQL pipeline. The integer is the pipeline's order value;
# per the Scrapy documentation, pipelines with lower values run first.
ITEM_PIPELINES = {
   'toscrape_book.pipelines.MySQLPipeline': 401,
}

三、mongodb

1.修改pipelines.py文件加入如下代碼

# 爬取到的數據寫入到Mongodb數據庫
from pymongo import MongoClient
from scrapy import Item

class MongoDBPipeline(object):
    """Item pipeline that writes every scraped item into MongoDB.

    The server URI is read from the ``MONGODB_URI`` setting and the database
    name from ``MONGODB_DB_NAME`` (defaults: ``mongodb://localhost:27017``
    and ``scrapy_db``).  Documents are stored in the ``books`` collection.
    """

    def open_spider(self, spider):
        """Connect to MongoDB and select the database when the spider starts."""
        db_uri = spider.settings.get('MONGODB_URI', 'mongodb://localhost:27017')
        # The key must match the MONGODB_DB_NAME entry in settings.py;
        # a misspelled key would silently fall back to the default name.
        db_name = spider.settings.get('MONGODB_DB_NAME', 'scrapy_db')

        self.db_client = MongoClient(db_uri)
        self.db = self.db_client[db_name]

    def close_spider(self, spider):
        """Close the MongoDB client when the spider finishes."""
        self.db_client.close()

    def process_item(self, item, spider):
        """Store ``item`` in MongoDB and return it unchanged."""
        self.insert_db(item)
        return item

    def insert_db(self, item):
        """Insert a copy of ``item`` into the ``books`` collection.

        A copy is always inserted because PyMongo adds an ``_id`` field to
        the document it stores; copying keeps that field out of the item
        that ``process_item`` hands on to later pipelines.
        ``insert_one`` replaces ``Collection.insert``, which is no longer
        available in PyMongo 4.
        """
        self.db.books.insert_one(dict(item))

2.修改settings.py文件，加入如下代碼

# MongoDB configuration: URI of the MongoDB server and the name of the
# database the scraped items are meant to be stored in.
MONGODB_URI = 'mongodb://127.0.0.1:27017'
MONGODB_DB_NAME = 'scrapy_db'

在settings啟動管道文件

# Enable the MongoDB pipeline. The integer is the pipeline's order value;
# per the Scrapy documentation, pipelines with lower values run first.
ITEM_PIPELINES = {
   'toscrape_book.pipelines.MongoDBPipeline': 403,
}

四、redis

1.修改pipelines.py文件加入如下代碼

# 爬取到的數據寫入到redis數據庫
import redis
from scrapy import Item

class RedisPipeline(object):
    """Item pipeline that stores every scraped item as a Redis hash.

    Host, port and database index come from the ``REDIS_HOST``,
    ``REDIS_PORT`` and ``REDIS_DB_INDEX`` settings (defaults: ``localhost``,
    ``6379`` and ``0``).  Each item is written under the key ``book:<n>``,
    where ``<n>`` is a counter that starts at 1 for every spider run.
    """

    def open_spider(self, spider):
        """Create the Redis client and reset the item counter."""
        settings = spider.settings
        self.db_conn = redis.StrictRedis(
            host=settings.get('REDIS_HOST', 'localhost'),
            port=settings.get('REDIS_PORT', 6379),
            db=settings.get('REDIS_DB_INDEX', 0),
        )
        # Number of items written so far; used to build the hash key.
        self.item_i = 0

    def close_spider(self, spider):
        """Disconnect the client's connection pool when the spider finishes."""
        pool = self.db_conn.connection_pool
        pool.disconnect()

    def process_item(self, item, spider):
        """Store ``item`` in Redis and return it unchanged."""
        self.insert_db(item)
        return item

    def insert_db(self, item):
        """Write ``item`` as a hash under the next ``book:<n>`` key."""
        # Scrapy Item objects are converted to a plain dict first;
        # anything else is passed through as-is.
        record = dict(item) if isinstance(item, Item) else item

        self.item_i = self.item_i + 1
        key = 'book:{}'.format(self.item_i)
        self.db_conn.hmset(key, record)

2.修改settings.py文件，加入如下代碼

# Redis configuration: host, port and database index read by
# RedisPipeline.open_spider (these values match the pipeline's defaults
# except that the pipeline defaults the host to 'localhost').
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_DB_INDEX = 0

在settings啟動管道文件

# Enable the Redis pipeline. The integer is the pipeline's order value;
# per the Scrapy documentation, pipelines with lower values run first.
ITEM_PIPELINES = {
   'toscrape_book.pipelines.RedisPipeline': 404,
}

Scrapy 連接各數據庫的設置並不複雜：首先在pipelines文件中建立管道，建立各數據庫的連接，然後處理數據，最後關閉連接。接下來我們在settings文件中定義各類數據庫的基本配置，然後在ITEM_PIPELINES中啟動相應的管道。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 數據庫之 MongoDB and SQLite 淺談PHP如何連接到MySql數據庫 PHP當中如何使用Wampserver連接到Mysql數據庫以及怎樣使用！ vs2010如何連接到mysql數據庫 python 連接操作各類數據庫 go 連接到數據庫 Python3.7和數據庫MySQL 8.0.12 數據庫SQLite3連接(三) C#連接SQLite數據庫 nodejs連接sqlite數據庫 JDBC連接SQlite數據庫