首先要做的:
- 建庫 article 建表 article
- 在cmd中的工作環境中安裝mysql的驅動 mysqlclient
`pip install mysqlclient`
如果是使用 CentOS,需要先執行 `yum install python-devel mysql-devel`
接下來保存數據庫兩種方法:
- 同步操作:數據少可以
- 異步操作:大數據(scrapy爬取的速度快於數據庫插入速度,當數據量大時,就會出現阻塞,異步就能解決)
1.同步
修改數據,由於我們抓取的時間格式是str 需要轉換成date存入數據庫
import datetime
# Convert the scraped date string into a ``datetime.date`` before storing it.
# NOTE(review): assumes ``create_date`` already holds the scraped string
# (e.g. "2018/01/31") at this point -- confirm against the spider code.
try:
    # Parse the variable itself; parsing the literal text 'create_date'
    # can never match the format and would always hit the fallback.
    create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
except (ValueError, TypeError):
    # Missing or malformed date: fall back to today's date.
    create_date = datetime.datetime.now().date()
article_item['create_date'] = create_date
建立MysqlPipeline
import MySQLdb
class MysqlPipeline(object):
    """Scrapy item pipeline that inserts each item into MySQL synchronously.

    One connection and one cursor are opened when the pipeline is created
    and reused for every item. Each insert is committed immediately.
    """

    def __init__(self):
        # Positional arguments are host, user, password, database name.
        self.conn = MySQLdb.connect(
            'localhost', 'root', 'root', 'article',
            charset='utf8', use_unicode=True,
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert *item* into the ``article`` table and commit.

        Args:
            item: mapping that provides every column named in the INSERT.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, so that pipelines configured after this one
            still receive it.

        Raises:
            Whatever the database driver raises; the transaction is rolled
            back before the exception propagates.
        """
        insert_sql = """
insert into article(title,url,create_date,url_object_id,front_image_url,front_image_path,
praise,collect_nums,comment_nums,contents,tags)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""
        params = (
            item['title'], item['url'], item['create_date'],
            item['url_object_id'], item['front_image_url'],
            item['front_image_path'], item['praise'], item['collect_nums'],
            item['comment_nums'], item['contents'], item['tags'],
        )
        try:
            self.cursor.execute(insert_sql, params)
            self.conn.commit()
        except Exception:
            # Leave the connection in a clean state for the next item.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
PIPELINE添加配置
# Pipelines enabled for the project. The integer is the pipeline's order:
# the image pipeline (1) runs before the MySQL pipeline (2).
ITEM_PIPELINES = {
    'spider_first.pipelines.ArticleImagePipeline': 1,
    'spider_first.pipelines.MysqlPipeline': 2,
}
2.異步
settings.py
# MySQL connection parameters, read by the asynchronous pipeline through
# the Scrapy settings object.
MYSQL_HOST = 'localhost'      # database server address
MYSQL_DBNAME = 'article'      # database (schema) name
MYSQL_USER = 'root'           # login user
MYSQL_PASSWORD = 'root'       # login password
創建異步pipeline
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
class MysqlTwistPipeline(object):
    """Scrapy item pipeline that inserts items into MySQL asynchronously.

    Inserts are handed to a Twisted ``adbapi`` connection pool, so the
    crawl is not blocked while the database is written to.
    """

    def __init__(self, dbpool):
        # dbpool: the adbapi.ConnectionPool built in from_settings().
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the project settings.

        The method name is fixed: Scrapy calls it and passes the settings
        object, so the ``MYSQL_*`` values can be read directly.
        """
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # The pool imports the 'MySQLdb' module by name and forwards the
        # keyword arguments to its connect() function.
        dbpool = adbapi.ConnectionPool('MySQLdb', **adbparams)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert of *item* on the pool and return the item.

        Returns:
            The same *item*, so that pipelines configured after this one
            still receive it.
        """
        # Use Twisted to run the MySQL insert asynchronously.
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Errors surface later on the Deferred rather than here, so attach
        # an errback to report them.
        query.addErrback(self.handle_error)
        return item

    def handle_error(self, failure):
        """Report an exception raised by an asynchronous insert."""
        # Print the failure object itself so the cause is visible.
        print(failure)

    def do_insert(self, cursor, item):
        """Execute the INSERT for *item* on the cursor supplied by the pool."""
        insert_sql = """
insert into article(title,url,create_date,url_object_id,front_image_url,front_image_path,
praise,collect_nums,comment_nums,contents,tags)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""
        params = (
            item['title'], item['url'], item['create_date'],
            item['url_object_id'], item['front_image_url'],
            item['front_image_path'], item['praise'], item['collect_nums'],
            item['comment_nums'], item['contents'], item['tags'],
        )
        cursor.execute(insert_sql, params)

    def close_spider(self, spider):
        """Shut the connection pool down when the spider finishes."""
        self.dbpool.close()
最後同樣在 settings.py 的 ITEM_PIPELINES 中添加 MysqlTwistPipeline 的配置即可