Python Scrapy 爬虫存储数据到数据库的方法(带去重步骤)


import json
import random
import time

import pymongo
import pymysql
import requests

# Module-level storage handles shared by the pipeline below.
# MongoDB: database 'cs', collection 'dn' (default host/port — assumes a local mongod; TODO confirm).
db = pymongo.MongoClient()['cs']['dn']
# MySQL: database 'cs' (presumably on localhost — pymysql's default host; verify in deployment).
db1 = pymysql.connect(user='root',password='root',db='cs',charset='utf8')
# Shared cursor used by CsdnPipeline.process_item for the INSERT statements.
cursor = db1.cursor()

class CsdnPipeline(object):
    """Scrapy item pipeline: downloads article images to disk, then stores
    each item in MongoDB (collection ``db``) and MySQL (table ``dn1``),
    skipping items whose title was already seen in this crawl.

    Note: the in-memory dedupe set is per-process and resets on restart.
    """

    def __init__(self):
        # Titles of items already persisted during this run.
        # The original code added the item itself to a set, but Scrapy
        # Items / dicts are unhashable, so that raised TypeError.
        self.seen = set()

    def process_item(self, item, spider):
        """Persist ``item``; return it unchanged either way (Scrapy contract)."""
        key = item['title']  # dedupe on title — hashable, stable per article
        if key in self.seen:
            print('已经存在')
            return item

        content_img = item['content_img']
        if len(content_img) > 0:
            # Download every image and replace the URL list with local paths.
            path = []
            for img in content_img:
                # Filename: fractional part of the timestamp + a large random
                # number, to make collisions practically impossible.
                img_name = ('F:\\34\\tu\\'
                            + str(time.time()).split('.')[1]
                            + str(random.randrange(1, 9999999999999999999999999))
                            + '.jpg')
                img_source = requests.get(img).content
                # `with` guarantees the handle is closed even if write fails
                # (the original leaked the handle on exception).
                with open(img_name, 'wb') as fp:
                    fp.write(img_source)
                path.append(img_name)
            item['content_img'] = path
        else:
            item['content_img'] = '暂无图片'

        # insert() was removed in pymongo 4; insert_one is the supported API.
        db.insert_one(dict(item))

        # Keep Chinese text readable in the stored JSON instead of \uXXXX escapes.
        data = json.dumps(dict(item), ensure_ascii=False)
        # Parameterized query: the original str.format() into a single-quoted
        # literal broke on any quote in the data and was SQL-injectable.
        cursor.execute("insert into dn1(`data`) VALUES (%s)", (data,))
        db1.commit()

        self.seen.add(key)
        return item

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM