Redis實現搜索和排序


明日更新文字。

 

建立反向索引

基於文件內容建立「單詞 → 文檔」的反向索引：每個單詞對應一個 Redis 集合（Set），集合中存儲包含該單詞的文檔。

# # #!/usr/bin/env python
# # # -*- coding: UTF-8 -*-
import jieba
import codecs
import redis
import uuid
#Tokenisation
def cut_words(file):
    """Read a UTF-8 text file and tokenise its full content with jieba.

    Args:
        file: Path of the text file to read.

    Returns:
        Whatever ``jieba.cut_for_search`` returns for the file's text.
        NOTE(review): presumably a lazy iterable rather than a list, so it
        may be consumable only once -- confirm against the jieba docs.
    """
    with open(file, mode='r', encoding="utf-8") as handle:
        content = handle.read()
    # The whole text is already in memory, so the tokeniser can run after
    # the file has been closed.
    return jieba.cut_for_search(content)

#Stop-word removal
def drop_Disable_Words(cut_res,stopwords):
    """Filter a token stream down to the tokens worth indexing.

    A token is kept only when it is longer than two characters and is
    neither a stop word nor one of the two whitespace markers below.

    Args:
        cut_res: Iterable of tokens.
        stopwords: Container of stop words, tested with ``in``.

    Returns:
        A new list with the kept tokens in their original order
        (duplicates are preserved).
    """
    # "\n" and "\u3000" are single characters, so the length test already
    # rejects them; the explicit check is kept to mirror the original filter.
    whitespace_markers = ("\n", "\u3000")
    return [
        token
        for token in cut_res
        if len(token) > 2
        and token not in stopwords
        and token not in whitespace_markers
    ]

#Load the stop-word list
def read_stop_word(file_path):
    """Read a UTF-8 stop-word file, one entry per line.

    Args:
        file_path: Path of the stop-word file.

    Returns:
        A list with one string per line of the file, each stripped of
        leading and trailing whitespace (blank lines yield empty strings).
    """
    # The context manager closes the file handle; the previous version
    # opened the file and never closed it.
    with codecs.open(file_path, 'r', encoding='utf8') as handle:
        return [line.strip() for line in handle.readlines()]

#Build the inverted index
def index_document(conn,docid,keywords):
    """Add ``docid`` to the index set of every keyword.

    Each keyword is stored under the key ``'idx:' + keyword``, the same
    namespace that the indexing script and ``_set_common`` in this file use.
    The previous version wrote to the bare keyword, so documents indexed
    through this function could not be found by the search helpers.

    Args:
        conn: Redis client providing ``pipeline()``.
        docid: Identifier added to each keyword's set.
        keywords: Iterable of keyword strings.

    Returns:
        The number of replies returned by the pipeline, i.e. one per
        queued SADD (0 when ``keywords`` is empty).
    """
    # pipeline(True) asks redis-py for a transactional pipeline, so the
    # queued commands are sent and applied together.
    pipe = conn.pipeline(True)
    for keyword in keywords:
        pipe.sadd('idx:' + keyword, docid)
    return len(pipe.execute())


def _set_conmon(conn,method,names,ttl = 30,execute = True):
    """Unfinished stub: generate a random UUID string and discard it.

    None of the parameters are used and nothing is returned.
    NOTE(review): the name looks like a typo of ``_set_common``, which is
    defined later in this file with the same signature; nothing visible in
    this file calls this stub -- confirm before removing it.
    """
    discarded_id = str(uuid.uuid4())
    return None

#Load the raw corpus files and the stop-word list
files = ['file1.txt','file2.txt']
stopwords = read_stop_word("stop_word.txt")

dic = {}
#Tokenise every file and drop stop words; corpus[k] holds the kept tokens of files[k]
corpus = [drop_Disable_Words(cut_words(file), stopwords) for file in files]
print(corpus)
# NOTE(review): the connection settings, including the password, are
# hard-coded here -- consider loading them from configuration instead.
pool = redis.ConnectionPool(host='localhost', port=6379, password='lin@Wen.', decode_responses=True)
conn = redis.Redis(connection_pool=pool)
pipeline = conn.pipeline(True)
#For each file, add its name to the 'idx:<word>' set of every kept token
for source_file, tokens in zip(files, corpus):
    for word in tokens:
        pipeline.sadd('idx:'+word,source_file)
    # One execute() per file; prints how many replies came back for it.
    print(len(pipeline.execute()))

對單詞進行搜索

#Search
def _set_common(conn,method,names,ttl=30,execute =True):
    """Run a set-store command over index sets and keep the result temporarily.

    The command named by ``method`` is invoked with a freshly generated
    destination key ``'idx:<uuid>'`` followed by ``'idx:' + name`` for each
    entry of ``names``; the destination key is then given a ``ttl``-second
    expiry.

    Args:
        conn: Redis client when ``execute`` is true. When ``execute`` is
            false the commands are issued directly on ``conn``, so the
            caller can pass its own pipeline and execute it later.
        method: Name of the command method to call, e.g. ``'sunionstore'``.
        names: Index names without the ``'idx:'`` prefix.
        ttl: Expiry of the result key, in seconds.
        execute: Whether to create a pipeline here and execute it.

    Returns:
        The generated id as a string, without the ``'idx:'`` prefix.
    """
    result_id = str(uuid.uuid4())
    # Previously a new pipeline was always created, so with execute=False
    # the queued commands were silently dropped along with that pipeline.
    pipeline = conn.pipeline(True) if execute else conn
    source_keys = ['idx:' + name for name in names]
    getattr(pipeline,method)('idx:' + result_id,*source_keys)
    pipeline.expire('idx:' + result_id,ttl)
    if execute:
        print(pipeline.execute())
    return result_id
#Intersection
def intersect(conn,items,ttl = 30,_execute=True):
    """Store the SINTERSTORE result of the given index sets under a temporary key.

    Args:
        conn: Forwarded to ``_set_common``.
        items: Index names without the ``'idx:'`` prefix.
        ttl: Expiry of the result key, in seconds.
        _execute: Forwarded as ``_set_common``'s ``execute`` argument.

    Returns:
        The id returned by ``_set_common``; the result lives at ``'idx:' + id``.
    """
    result_id = _set_common(conn, 'sinterstore', items, ttl=ttl, execute=_execute)
    return result_id
#Union
def union(conn,items,ttl = 30,_execute=True):
    """Store the SUNIONSTORE result of the given index sets under a temporary key.

    Args:
        conn: Forwarded to ``_set_common``.
        items: Index names without the ``'idx:'`` prefix.
        ttl: Expiry of the result key, in seconds.
        _execute: Forwarded as ``_set_common``'s ``execute`` argument.

    Returns:
        The id returned by ``_set_common``; the result lives at ``'idx:' + id``.
    """
    result_id = _set_common(conn, 'sunionstore', items, ttl=ttl, execute=_execute)
    return result_id
#Difference
def difference(conn,items,ttl = 30,_execute=True):
    """Store the SDIFFSTORE result of the given index sets under a temporary key.

    The order of ``items`` is passed through unchanged, so the first entry
    is the set the remaining ones are subtracted from.

    Args:
        conn: Forwarded to ``_set_common``.
        items: Index names without the ``'idx:'`` prefix.
        ttl: Expiry of the result key, in seconds.
        _execute: Forwarded as ``_set_common``'s ``execute`` argument.

    Returns:
        The id returned by ``_set_common``; the result lives at ``'idx:' + id``.
    """
    result_id = _set_common(conn, 'sdiffstore', items, ttl=ttl, execute=_execute)
    return result_id

# Demo: union the index sets of three terms and print the matching members.
names = ["DirectX","Unity3D","STL"]
pool = redis.ConnectionPool(host='localhost', port=6379, password='lin@Wen.', decode_responses=True)
conn = redis.Redis(connection_pool=pool)
id = union(conn,names)
print(id)
# The union result is stored under the returned id with the 'idx:' prefix.
result_key = 'idx:' + id
print(conn.smembers(result_key))
# redis.exceptions.ResponseError: WRONGTYPE Operation against a key holding the wrong kind of value
# Cause of this error:
# the type of the value stored in Redis does not match the command used to read it.
# print(conn.sunion("idx:DirectX", "idx:STL"))

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM