Python Scrapy 爬蟲將數據存儲到數據庫的方法(帶去重步驟)


import pymongo
import requests
import random
import time
import pymysql

# Module-level database handles shared by every pipeline instance.
# MongoDB: collection 'dn' inside database 'cs'; MongoClient() with no
# arguments connects to the default localhost:27017.
db = pymongo.MongoClient()['cs']['dn']
# MySQL: database 'cs' with utf8 charset; pymysql defaults the host to
# localhost. NOTE(review): credentials are hard-coded — move them to
# configuration / environment variables before production use.
db1 = pymysql.connect(user='root',password='root',db='cs',charset='utf8')
# Shared cursor reused for every insert performed by the pipeline.
cursor = db1.cursor()

class CsdnPipeline(object):
    """Scrapy pipeline that stores crawled articles in MongoDB and MySQL.

    Items are de-duplicated in memory (per crawl) by their ``title``.
    Images referenced by an item are downloaded to local disk and the
    item's ``content_img`` field is rewritten to the list of local paths.
    """

    def __init__(self):
        # Titles already processed during this crawl. The original code
        # stored whole items here, but Scrapy items are dict-like and
        # unhashable, so every membership test raised TypeError.
        self.set = set()

    def process_item(self, item, spider):
        """Persist *item* to MongoDB and MySQL; skip previously seen titles.

        Returns the (possibly modified) item in both branches, as Scrapy
        pipelines are expected to.
        """
        # De-dup key: the article title (hashable, unlike the item itself).
        key = item['title']
        if key in self.set:
            print('已經存在')
            return item

        content_img = item['content_img']
        if len(content_img) > 0:
            path = []
            for img in content_img:
                # Unique-ish local filename from the current time's fractional
                # part plus a large random number (original naming scheme kept
                # for compatibility with files already on disk).
                img_name = ('F:\\34\\tu\\'
                            + str(time.time()).split('.')[1]
                            + str(random.randrange(1, 9999999999999999999999999))
                            + '.jpg')
                # Timeout so one dead image host cannot hang the whole crawl.
                img_source = requests.get(img, timeout=30).content
                # Context manager guarantees the file handle is closed even
                # if the write raises.
                with open(img_name, 'wb') as op:
                    op.write(img_source)
                path.append(img_name)
            item['content_img'] = path
        else:
            item['content_img'] = '暫無圖片'

        # MongoDB copy of the raw item. insert() was deprecated in pymongo 3
        # and removed in pymongo 4; insert_one() is the supported API.
        db.insert_one(dict(item))

        # MySQL copy: JSON-serialised item in the `data` column. Use a
        # parameterized query — the original string-formatted SQL broke (or
        # was injectable) whenever the scraped text contained quotes.
        # ensure_ascii=False keeps the stored Chinese text readable.
        import json
        data = json.dumps(dict(item), ensure_ascii=False)
        sql = "insert into dn1(`data`) VALUES (%s)"
        cursor.execute(sql, (data,))
        db1.commit()

        self.set.add(key)
        return item

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM