1、簡單的在pipelines.py上添加如下代碼:
import pymysql


class ScrapyTextMysqlPipeLine(object):
    """Item pipeline that inserts each scraped item into the MySQL table `test`.

    The connection is opened once in __init__ and reused for every item;
    it is closed when the spider finishes (close_spider).
    """

    def __init__(self):
        # connection database
        # PyMySQL >= 1.0 requires keyword arguments; the four values are
        # host, user, password and database (the original comment mislabeled them).
        # utf8mb4 is needed because the table columns / data are Chinese text.
        self.connect = pymysql.connect(host='localhost',
                                       user='root',
                                       password='',
                                       database='test',
                                       charset='utf8mb4')
        # get cursor
        self.cursor = self.connect.cursor()
        print("連接數據庫成功")

    def process_item(self, item, spider):
        """Insert one item; on failure roll back and log, never drop the item."""
        print("開始輸入數據")
        print(item['申報要素'])
        try:
            # Parameterized query: values are escaped by the driver, never
            # interpolated into the SQL string.
            self.cursor.execute(
                "insert into test(申報要素, hscode, 申報名稱, 參考均價, 參考最高價, 參考最低價)"
                " values (%s, %s, %s, %s, %s, %s)",
                (item['申報要素'], item['hscode'], item['申報名稱'],
                 item['參考均價'], item['參考最高價'], item['參考最低價']))
            self.connect.commit()
        except Exception as error:
            # Undo the failed statement so the connection stays usable
            # for subsequent items.
            self.connect.rollback()
            print(error)
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the crawl ends."""
        self.cursor.close()
        self.connect.close()
問題:需要刪除到爬取數據的第一個元組,解決方法是使用if語句去除第一個元組(其中的美元符號需要除去,其中的一些數據類型如下)如下:
for i in range(0, len(hscodes)): #設置if語句去除不必要的標題類數據,如“申報要素”等 if i != 0: item['申報要素'] = sbElements[i] item['hscode'] = hscodes[i] item['申報名稱'] = sbNames[i] item['參考均價'] = ckAvgPrices[i].lstrip("$") item['參考最高價'] = ckMaxPrices[i].lstrip("$") item['參考最低價'] = ckMinPrice[i].lstrip("$") yield item print(item) else: print("已經刪除多余值")
3、保存數據到數據庫時需要避免重複數據,解決方法是讓 Scrapy 的去重過濾器保持開啟(dont_filter=False,即默認行為),如下:
# dont_filter=False keeps Scrapy's built-in duplicate-request filter enabled
# (this is also the default), so the same third_page URL is requested only
# once per crawl — which is what prevents duplicate rows in the database.
yield scrapy.Request(third_page, callback=self.details_parse2, dont_filter=False)
4、創建一個與spiders的同級的文件夾db,在其下創建DBHelper.py來實現爬取數據在數據庫的增刪查改,在下面的代碼雖然使用了insert語句,但是其他的也應該是一樣的,然后在settings.py中配置MySQL數據庫的信息,最后只要在pipelines.py中調用即可。
dbHelper.py:
import pymysql
from scrapy.utils.project import get_project_settings  # read settings.py values


class DBHelper():
    """MySQL CRUD helper that reads its connection info from settings.py."""

    def __init__(self):
        self.settings = get_project_settings()  # project settings object
        self.host = self.settings['MYSQL_HOST']
        self.port = self.settings['MYSQL_PORT']
        self.user = self.settings['MYSQL_USER']
        # settings.py defines MYSQL_PASSWORD; the original read MYSQL_PASSWD,
        # which does not exist there and silently yielded None. Accept both.
        self.passwd = self.settings.get('MYSQL_PASSWORD',
                                        self.settings.get('MYSQL_PASSWD'))
        self.db = self.settings['MYSQL_DBNAME']

    def connectMysql(self):
        """Connect to the MySQL server without selecting a database
        (used before the database exists)."""
        conn = pymysql.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               passwd=self.passwd,
                               charset='utf8')
        return conn

    def connectDatabase(self):
        """Connect to the configured database."""
        conn = pymysql.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               passwd=self.passwd,
                               db=self.db,
                               charset='utf8')
        return conn

    def createDatabase(self):
        """Create the configured database if it does not exist."""
        conn = self.connectMysql()
        try:
            cur = conn.cursor()
            cur.execute("create database if not exists " + self.db)
            cur.close()
        finally:
            conn.close()  # always release the connection, even on error

    def createTable(self, sql):
        """Run a CREATE TABLE statement."""
        conn = self.connectDatabase()
        try:
            cur = conn.cursor()
            cur.execute(sql)
            cur.close()
        finally:
            conn.close()

    def _execute(self, sql, params):
        """Run one write statement with commit/rollback and guaranteed cleanup.

        insert/update/delete all share this logic; params are passed to the
        driver for escaping (parameterized query).
        """
        conn = self.connectDatabase()
        try:
            cur = conn.cursor()
            cur.execute(sql, params)
            conn.commit()
            cur.close()
        except Exception:
            conn.rollback()  # undo the failed statement
            raise
        finally:
            conn.close()

    def insert(self, sql, *params):
        """Insert rows."""
        self._execute(sql, params)

    def update(self, sql, *params):
        """Update rows."""
        self._execute(sql, params)

    def delete(self, sql, *params):
        """Delete rows."""
        self._execute(sql, params)


class TestDBHelper():
    """Manual smoke tests for DBHelper (run via the __main__ block below)."""

    def __init__(self):
        self.dbHelper = DBHelper()

    def testCreateDatebase(self):
        # Name kept as-is (typo and all) because the __main__ block calls it.
        self.dbHelper.createDatabase()

    def testCreateTable(self):
        sql = "create table testtable(id int primary key auto_increment,name varchar(50),url varchar(200))"
        self.dbHelper.createTable(sql)

    def testInsert(self, item):
        sql = "insert into example(hscode, 申報名稱, 申報要素, 參考均價, 參考最高價, 參考最低價) values (%s, %s, %s, %s, %s, %s)"
        # Bug fix: the third column is 申報要素, but the original passed
        # item['申報名稱'] twice and never inserted 申報要素 at all.
        params = (item['hscode'], item['申報名稱'], item['申報要素'],
                  item['參考均價'], item['參考最高價'], item['參考最低價'])
        self.dbHelper.insert(sql, *params)

    def testUpdate(self):
        sql = "update testtable set name=%s,url=%s where id=%s"
        params = ("update", "update", "1")
        self.dbHelper.update(sql, *params)

    def testDelete(self):
        sql = "delete from testtable where id=%s"
        # Bug fix: ("1") is just the string "1", not a tuple — it only worked
        # by accident when unpacked with *. A one-element tuple needs a comma.
        params = ("1",)
        self.dbHelper.delete(sql, *params)


if __name__ == "__main__":
    testDBHelper = TestDBHelper()
    # testDBHelper.testCreateDatebase()
    # testDBHelper.testCreateTable()
    # testDBHelper.testInsert()
    # testDBHelper.testUpdate()
    # testDBHelper.testDelete()
settings.py:
#Mysql數據庫的配置信息 MYSQL_HOST = '127.0.0.1' MYSQL_DBNAME = 'test' #數據庫名字 MYSQL_USER = 'root' #數據庫賬號 MYSQL_PASSWORD = '' #數據庫密碼 MYSQL_PORT = 3306 #數據庫端口,在dbHelper中使用
pipelines.py:
class ScrapyTextManyMysqlPipeLine(object):
    """Item pipeline that delegates all database writes to TestDBHelper.

    Keeping the SQL inside db/dbHelper.py leaves this pipeline as a thin
    adapter between Scrapy's item flow and the database layer.
    """

    def __init__(self):
        # Imported lazily so the project package is only touched when the
        # pipeline is actually instantiated by Scrapy.
        from ScrapyTest.db.dbHelper import TestDBHelper

        self.db = TestDBHelper()

    def process_item(self, item, spider):
        """Persist one item and hand it on unchanged to the next pipeline."""
        self.db.testInsert(item)
        return item