Text Information Retrieval: The Boolean Model and the TF-IDF Model
1. The Boolean Model
Suppose we want to retrieve documents about "檢索" (retrieval) that involve "布爾" (Boolean) or "概率" (probabilistic) retrieval but not "向量" (vector) retrieval. The corresponding query expression is Q = 檢索 and (布爾 or 概率) and not 向量. On the index-term vector (檢索, 布爾, 概率, 向量), Q is satisfied by exactly the assignments (1,1,0,0), (1,0,1,0), and (1,1,1,0). If a document Dj's binary term vector equals one of these, Q and Dj are considered similar. The similarity is itself a Boolean value: sim(Q, Dj) can only be 0 or 1.
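As a minimal sketch of this matching rule (standalone, with the query hard-coded as a hypothetical predicate; it is separate from the retrieval code in Section 3), the satisfying assignments can be enumerated directly:

from itertools import product

# Q = 檢索 and (布爾 or 概率) and not 向量, as a predicate over a
# (檢索, 布爾, 概率, 向量) index-term vector
def sim(d):
    retrieval, boolean, probabilistic, vector = d
    return int(retrieval and (boolean or probabilistic) and not vector)

# prints the three matching vectors: (1,0,1,0), (1,1,0,0), (1,1,1,0)
print([d for d in product((0, 1), repeat=4) if sim(d)])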
2. The TF-IDF Model
In a web page containing 1,000 words in total, "原子能" (atomic energy), "的" (of), and "應用" (applications) appear 2, 35, and 5 times respectively, so their term frequencies (TF) are 0.002, 0.035, and 0.005. Their sum, 0.042, is a simple measure of the relevance between the page and the query "原子能的應用" (applications of atomic energy).
The stronger a word's ability to predict the topic, the larger its weight should be, and vice versa. Seeing "原子能" on a web page tells us something about its topic, while seeing "應用" once tells us almost nothing. The weight of "原子能" should therefore be larger than that of "應用". Stop words should get a weight of zero.
2.1 Weight Calculation
Here idf is computed as idf = log(D / |{ j : t_i ∈ d_j }|), where D is the total number of documents and |{ j : t_i ∈ d_j }| is the number of documents containing term t_i (logarithms are natural logs).
- Assume the number of Chinese web pages is D = 1 billion. The stop word "的" appears in every page, i.e. |{ j : t_i ∈ d_j }| = 1 billion, so its idf = log(10^9 / 10^9) = log(1) = 0.
- Suppose the specialized term "原子能" appears in 2 million pages, i.e. |{ j : t_i ∈ d_j }| = 2 million; its weight is idf = log(10^9 / (2 × 10^6)) = log(500) ≈ 6.2.
- Suppose the common word "應用" appears in 500 million pages; its weight is idf = log(10^9 / (5 × 10^8)) = log(2) ≈ 0.7.
- Combining TF and IDF, the page's score for the query is 0.002 (tf) × 6.2 (idf) + 0.035 (tf) × 0 (idf) + 0.005 (tf) × 0.7 (idf) ≈ 0.0159, as checked in the sketch below.
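A few lines of Python can verify this arithmetic (a sketch; D and the document counts are the assumed figures from this example, and log is the natural logarithm):

import math

D = 1_000_000_000                    # assumed total number of Chinese web pages
tf = {'原子能': 0.002, '的': 0.035, '應用': 0.005}
df = {'原子能': 2_000_000, '的': 1_000_000_000, '應用': 500_000_000}

score = sum(tf[w] * math.log(D / df[w]) for w in tf)
print(round(score, 4))               # ≈ 0.0159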
3. Implementation Code
- Boolean model
def regularization(s):
    # Split the Boolean expression into tokens, separating parentheses
    # that are glued to terms, e.g. "(布爾" -> "(", "布爾".
    ss = s.split(' ')
    expression = []
    target = {}
    for i in ss:
        if i != "and" and i != "or" and i != "not" and i != "(" and i != ")":
            if i[0] == "(":
                expression.append("(")
                expression.append(i[1:])
                target[i[1:]] = 0
            elif i[-1] == ")":
                expression.append(i[:-1])
                expression.append(")")
                target[i[:-1]] = 0
            else:
                expression.append(i)
                target[i] = 0
        else:
            expression.append(i)
    # target maps each query term to a 0/1 indicator (present in the
    # current document); expression is the tokenized query.
    return target, expression
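A quick check of the tokenizer (a hypothetical session, using a fully parenthesized form of the Section 1 query):

target, expression = regularization("檢索 and (布爾 or 概率) and not 向量")
print(expression)  # ['檢索', 'and', '(', '布爾', 'or', '概率', ')', 'and', 'not', '向量']
print(target)      # {'檢索': 0, '布爾': 0, '概率': 0, '向量': 0}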
def analysis(line):
    # Extract the set of words occurring in one corpus line.
    output = []
    # strip the trailing newline
    t_line = line.strip('\n')
    # split into word/tag tokens on spaces
    words = t_line.split(' ')
    for word in words[1:]:
        if word == "":
            continue
        # split the token into word and POS tag on '/'
        t_word = word.split('/')
        # drop a leading left bracket
        tf_word = t_word[0].split('[')
        if len(tf_word) == 2:
            f_word = tf_word[1]
        else:
            f_word = t_word[0]
        # record the word if not seen yet
        if f_word not in output:
            output.append(f_word)
    # also collect bracketed compounds, e.g. [中央/n 電視台/n]nt
    big_word1 = t_line.split('[')
    for i in range(1, len(big_word1)):
        big_word2 = big_word1[i].split(']')[0]
        words = big_word2.split(' ')
        big_word = ""
        for word in words:
            # split the token into word and POS tag on '/'
            t_word = word.split('/')
            big_word = big_word + t_word[0]
        # record the compound if not seen yet
        if big_word not in output:
            output.append(big_word)
    return output
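For illustration, a constructed line in the word/POS format this code expects (the first token is a document ID; bracketed sequences are compound terms) is parsed like this:

line = "19980101-01-001-001/m [中央/n 人民/n 廣播/vn 電台/n]nt 播放/v 新聞/n\n"
print(analysis(line))
# ['中央', '人民', '廣播', '電台', '播放', '新聞', '中央人民廣播電台']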
def getValue(target, reg):
    # Convert the tokenized query to Reverse Polish Notation via the
    # shunting-yard algorithm, with precedence not > and > or.
    RPN = []
    stack = []
    stack.append("#")
    for i in reg:
        if i in target.keys():
            RPN.append(target[i])
        elif i == "(":
            stack.append(i)
        elif i == ")":
            while stack[-1] != "(":
                RPN.append(stack.pop())
            stack.pop()
        elif i == "not":
            while stack[-1] == "not":
                RPN.append(stack.pop())
            stack.append(i)
        elif i == "and":
            while stack[-1] == "not" or stack[-1] == "and":
                RPN.append(stack.pop())
            stack.append(i)
        else:  # "or"
            while stack[-1] == "not" or stack[-1] == "and" or stack[-1] == "or":
                RPN.append(stack.pop())
            stack.append(i)
    while len(stack) != 1:
        RPN.append(stack.pop())
    # Evaluate the RPN expression
    ans = []
    for i in RPN:
        if i == 0 or i == 1:
            ans.append(i)
        elif i == "not":
            ans.append(1 ^ ans.pop())
        elif i == "and":
            op1 = ans.pop()
            op2 = ans.pop()
            ans.append(op1 and op2)
        elif i == "or":
            op1 = ans.pop()
            op2 = ans.pop()
            ans.append(op1 or op2)
    return ans[0]
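A quick sanity check, reusing the tokens from the regularization example above and a hypothetical document that contains 檢索 and 布爾 but not 向量:

target = {'檢索': 1, '布爾': 1, '概率': 0, '向量': 0}
reg = ['檢索', 'and', '(', '布爾', 'or', '概率', ')', 'and', 'not', '向量']
print(getValue(target, reg))  # 1, i.e. the document satisfies the query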
if __name__ == '__main__':
    booltext = input("Enter a Boolean expression: ")
    target, reg = regularization(booltext)
    key_target = target.keys()
    num = 0
    with open('語料庫.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            # stop after printing ten matching documents
            if num >= 10:
                break
            # reset the term indicators for the current document
            for i in key_target:
                target[i] = 0
            if line is not None and line != "\n":
                output = analysis(line)
                for i in key_target:
                    if i in output:
                        target[i] = 1
                if getValue(target, reg):
                    print(line)
                    num = num + 1
- TF-IDF model
- getWeight.py (precomputes each word's document frequency)
import sys

# Count, for every word, the number of corpus lines (documents) containing it.
output = {}
with open('語料庫.txt', mode='r', encoding='UTF-8') as f:
    for line in f.readlines():
        if line is not None and line != "\n":
            t_line = line.strip('\n')
            words = t_line.split(' ')
            word_w = []
            for word in words[1:]:
                if word == "":
                    continue
                t_word = word.split('/')
                # drop a leading left bracket
                tf_word = t_word[0].split('[')
                if len(tf_word) == 2:
                    f_word = tf_word[1]
                else:
                    f_word = t_word[0]
                if f_word not in word_w:
                    word_w.append(f_word)
            for f_word in word_w:
                if f_word in output.keys():
                    output[f_word] = output[f_word] + 1
                else:
                    output[f_word] = 1

# Write "word: document-frequency" lines, smallest count first.
with open('outputWeight.txt', mode='w', encoding='UTF-8') as f:
    while output:
        minNum = sys.maxsize
        minName = ""
        for key, values in output.items():
            if values < minNum:
                minNum = values
                minName = key
        f.write(minName + ": " + str(minNum) + "\n")
        del output[minName]
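The write-out loop above repeatedly scans for the minimum, which is quadratic in the vocabulary size; an equivalent and simpler version (same output format, ascending by document frequency) would be:

with open('outputWeight.txt', mode='w', encoding='UTF-8') as f:
    for name, num in sorted(output.items(), key=lambda kv: kv[1]):
        f.write(name + ": " + str(num) + "\n")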
- TF-IDF.py
import math


def analysis(line):
    # Extract the set of words occurring in one corpus line
    # (identical to the analysis() used by the Boolean model above).
    output = []
    # strip the trailing newline
    t_line = line.strip('\n')
    # split into word/tag tokens on spaces
    words = t_line.split(' ')
    for word in words[1:]:
        if word == "":
            continue
        # split the token into word and POS tag on '/'
        t_word = word.split('/')
        # drop a leading left bracket
        tf_word = t_word[0].split('[')
        if len(tf_word) == 2:
            f_word = tf_word[1]
        else:
            f_word = t_word[0]
        # record the word if not seen yet
        if f_word not in output:
            output.append(f_word)
    # also collect bracketed compounds
    big_word1 = t_line.split('[')
    for i in range(1, len(big_word1)):
        big_word2 = big_word1[i].split(']')[0]
        words = big_word2.split(' ')
        big_word = ""
        for word in words:
            t_word = word.split('/')
            big_word = big_word + t_word[0]
        if big_word not in output:
            output.append(big_word)
    return output


def getW():
    # load the per-word document frequencies precomputed by getWeight.py
    word_list = {}
    with open('outputWeight.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            if line is not None:
                word = line.split(':')
                word_list[word[0]] = word[1]
    return word_list


def BMM(origin_sentence):
    # backward maximum matching segmentation against the dictionary in output.txt
    MAX_WORD = 19
    word_list = []
    with open('output.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            if line is not None:
                word = line.split(':')
                word_list.append(word[0])
    ans_word = []
    while len(origin_sentence) != 0:
        len_word = MAX_WORD
        while len_word > 0:
            # take up to MAX_WORD characters from the end; if the candidate
            # is in the dictionary, keep it and cut it off the sentence
            if origin_sentence[-len_word:] in word_list:
                ans_word.append(origin_sentence[-len_word:])
                len_sentence = len(origin_sentence)
                origin_sentence = origin_sentence[0:len_sentence - len_word]
                break
            # otherwise shorten the candidate by one character
            else:
                len_word = len_word - 1
        # a single character is kept as a word on its own
        if len_word == 0:
            if origin_sentence[-1:] != ' ':
                ans_word.append(origin_sentence[-1:])
            len_sentence = len(origin_sentence)
            origin_sentence = origin_sentence[0:len_sentence - 1]
    return ans_word


if __name__ == '__main__':
    w = getW()
    sentence = input("Enter a query phrase: ")
    words = BMM(sentence)
    ans = []
    # total number of documents (one document per line)
    count = 0
    for index, line in enumerate(open('語料庫.txt', 'r', encoding='UTF-8')):
        count += 1
    with open('語料庫.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            score = 0
            if line is not None and line != "\n":
                out = analysis(line)
                for word in words:
                    # TF-IDF: tf = count in doc / doc length, idf = log(N / df)
                    score = score + out.count(word) / len(out) * math.log(count * 1.0 / int(w[word]))
                ans.append((line, score))
    # print the ten highest-scoring documents
    new_ans = sorted(ans, key=lambda a: a[1], reverse=True)
    for i in range(10):
        print(new_ans[i])
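Note that TF-IDF.py expects two files to exist beforehand: outputWeight.txt, written by getWeight.py above, and output.txt, the word list that BMM segments the query against (its construction is not shown in this post). Run getWeight.py once before running TF-IDF.py.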