Text Information Retrieval: The Boolean Model and the TF-IDF Model
1. The Boolean Model
Suppose we want to retrieve documents about "檢索" (retrieval) that involve "布爾" (Boolean) or "概率" (probabilistic) retrieval but not "向量" (vector) retrieval. The corresponding query expression is Q = 檢索 and (布爾 or 概率) and not 向量. On the index-term vector (檢索, 布爾, 概率, 向量), Q is satisfied by exactly the assignments (1,1,0,0), (1,0,1,0), and (1,1,1,0). If a document Dj's binary term vector equals one of these, Q and Dj are considered similar. The similarity is itself a Boolean value: sim(Q, Dj) can only be 0 or 1.
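As a minimal sketch of this matching rule (standalone, with the query hard-coded as a hypothetical predicate; it is separate from the retrieval code in Section 3), the satisfying assignments can be enumerated directly:

from itertools import product

# Q = 檢索 and (布爾 or 概率) and not 向量, as a predicate over a
# (檢索, 布爾, 概率, 向量) index-term vector
def sim(d):
    retrieval, boolean, probabilistic, vector = d
    return int(retrieval and (boolean or probabilistic) and not vector)

# prints the three matching vectors: (1,0,1,0), (1,1,0,0), (1,1,1,0)
print([d for d in product((0, 1), repeat=4) if sim(d)])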
2. The TF-IDF Model
In a web page containing 1,000 words in total, "原子能" (atomic energy), "的" (of), and "應用" (applications) appear 2, 35, and 5 times respectively, so their term frequencies (TF) are 0.002, 0.035, and 0.005. Their sum, 0.042, is a simple measure of the relevance between the page and the query "原子能的應用" (applications of atomic energy).
The stronger a word's ability to predict the topic, the larger its weight should be, and vice versa. Seeing "原子能" on a web page tells us something about its topic, while seeing "應用" once tells us almost nothing. The weight of "原子能" should therefore be larger than that of "應用". Stop words should get a weight of zero.
2.1 Weight Calculation
Here idf is computed as idf = log(D / |{ j : t_i ∈ d_j }|), where D is the total number of documents and |{ j : t_i ∈ d_j }| is the number of documents containing term t_i (logarithms are natural logs).
- Assume the number of Chinese web pages is D = 1 billion. The stop word "的" appears in every page, i.e. |{ j : t_i ∈ d_j }| = 1 billion, so its idf = log(10^9 / 10^9) = log(1) = 0.
- Suppose the specialized term "原子能" appears in 2 million pages, i.e. |{ j : t_i ∈ d_j }| = 2 million; its weight is idf = log(10^9 / (2 × 10^6)) = log(500) ≈ 6.2.
- Suppose the common word "應用" appears in 500 million pages; its weight is idf = log(10^9 / (5 × 10^8)) = log(2) ≈ 0.7.
- Combining TF and IDF, the page's score for the query is 0.002 (tf) × 6.2 (idf) + 0.035 (tf) × 0 (idf) + 0.005 (tf) × 0.7 (idf) ≈ 0.0159, as checked in the sketch below.
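A few lines of Python can verify this arithmetic (a sketch; D and the document counts are the assumed figures from this example, and log is the natural logarithm):

import math

D = 1_000_000_000                    # assumed total number of Chinese web pages
tf = {'原子能': 0.002, '的': 0.035, '應用': 0.005}
df = {'原子能': 2_000_000, '的': 1_000_000_000, '應用': 500_000_000}

score = sum(tf[w] * math.log(D / df[w]) for w in tf)
print(round(score, 4))               # ≈ 0.0159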
3. Implementation Code
- Boolean model
def regularization(s):
    # Split the Boolean expression into tokens, separating parentheses
    # that are glued to terms, e.g. "(布爾" -> "(", "布爾".
    ss = s.split(' ')
    expression = []
    target = {}
    for i in ss:
        if i != "and" and i != "or" and i != "not" and i != "(" and i != ")":
            if i[0] == "(":
                expression.append("(")
                expression.append(i[1:])
                target[i[1:]] = 0
            elif i[-1] == ")":
                expression.append(i[:-1])
                expression.append(")")
                target[i[:-1]] = 0
            else:
                expression.append(i)
                target[i] = 0
        else:
            expression.append(i)
    # target maps each query term to a 0/1 indicator (present in the
    # current document); expression is the tokenized query.
    return target, expression
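A quick check of the tokenizer (a hypothetical session, using a fully parenthesized form of the Section 1 query):

target, expression = regularization("檢索 and (布爾 or 概率) and not 向量")
print(expression)  # ['檢索', 'and', '(', '布爾', 'or', '概率', ')', 'and', 'not', '向量']
print(target)      # {'檢索': 0, '布爾': 0, '概率': 0, '向量': 0}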
def analysis(line):
    # Extract the set of words occurring in one corpus line.
    output = []
    # strip the trailing newline
    t_line = line.strip('\n')
    # split into word/tag tokens on spaces
    words = t_line.split(' ')
    for word in words[1:]:
        if word == "":
            continue
        # split the token into word and POS tag on '/'
        t_word = word.split('/')
        # drop a leading left bracket
        tf_word = t_word[0].split('[')
        if len(tf_word) == 2:
            f_word = tf_word[1]
        else:
            f_word = t_word[0]
        # record the word if not seen yet
        if f_word not in output:
            output.append(f_word)
    # also collect bracketed compounds, e.g. [中央/n 電視台/n]nt
    big_word1 = t_line.split('[')
    for i in range(1, len(big_word1)):
        big_word2 = big_word1[i].split(']')[0]
        words = big_word2.split(' ')
        big_word = ""
        for word in words:
            # split the token into word and POS tag on '/'
            t_word = word.split('/')
            big_word = big_word + t_word[0]
        # record the compound if not seen yet
        if big_word not in output:
            output.append(big_word)
    return output
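For illustration, a constructed line in the word/POS format this code expects (the first token is a document ID; bracketed sequences are compound terms) is parsed like this:

line = "19980101-01-001-001/m [中央/n 人民/n 廣播/vn 電台/n]nt 播放/v 新聞/n\n"
print(analysis(line))
# ['中央', '人民', '廣播', '電台', '播放', '新聞', '中央人民廣播電台']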
def getValue(target, reg):
    # Convert the tokenized query to Reverse Polish Notation via the
    # shunting-yard algorithm, with precedence not > and > or.
    RPN = []
    stack = []
    stack.append("#")
    for i in reg:
        if i in target.keys():
            RPN.append(target[i])
        elif i == "(":
            stack.append(i)
        elif i == ")":
            while stack[-1] != "(":
                RPN.append(stack.pop())
            stack.pop()
        elif i == "not":
            while stack[-1] == "not":
                RPN.append(stack.pop())
            stack.append(i)
        elif i == "and":
            while stack[-1] == "not" or stack[-1] == "and":
                RPN.append(stack.pop())
            stack.append(i)
        else:  # "or"
            while stack[-1] == "not" or stack[-1] == "and" or stack[-1] == "or":
                RPN.append(stack.pop())
            stack.append(i)
    while len(stack) != 1:
        RPN.append(stack.pop())
    # Evaluate the RPN expression
    ans = []
    for i in RPN:
        if i == 0 or i == 1:
            ans.append(i)
        elif i == "not":
            ans.append(1 ^ ans.pop())
        elif i == "and":
            op1 = ans.pop()
            op2 = ans.pop()
            ans.append(op1 and op2)
        elif i == "or":
            op1 = ans.pop()
            op2 = ans.pop()
            ans.append(op1 or op2)
    return ans[0]
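A quick sanity check, reusing the tokens from the regularization example above and a hypothetical document that contains 檢索 and 布爾 but not 向量:

target = {'檢索': 1, '布爾': 1, '概率': 0, '向量': 0}
reg = ['檢索', 'and', '(', '布爾', 'or', '概率', ')', 'and', 'not', '向量']
print(getValue(target, reg))  # 1, i.e. the document satisfies the query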
if __name__ == '__main__':
    booltext = input("Enter a Boolean expression: ")
    target, reg = regularization(booltext)
    key_target = target.keys()
    num = 0
    with open('語料庫.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            # stop after printing ten matching documents
            if num >= 10:
                break
            # reset the term indicators for the current document
            for i in key_target:
                target[i] = 0
            if line is not None and line != "\n":
                output = analysis(line)
                for i in key_target:
                    if i in output:
                        target[i] = 1
                if getValue(target, reg):
                    print(line)
                    num = num + 1
- TF-IDF model
- getWeight.py (precomputes each word's document frequency)
import sys

# Count, for every word, the number of corpus lines (documents) containing it.
output = {}
with open('語料庫.txt', mode='r', encoding='UTF-8') as f:
    for line in f.readlines():
        if line is not None and line != "\n":
            t_line = line.strip('\n')
            words = t_line.split(' ')
            word_w = []
            for word in words[1:]:
                if word == "":
                    continue
                t_word = word.split('/')
                # drop a leading left bracket
                tf_word = t_word[0].split('[')
                if len(tf_word) == 2:
                    f_word = tf_word[1]
                else:
                    f_word = t_word[0]
                if f_word not in word_w:
                    word_w.append(f_word)
            for f_word in word_w:
                if f_word in output.keys():
                    output[f_word] = output[f_word] + 1
                else:
                    output[f_word] = 1

# Write "word: document-frequency" lines, smallest count first.
with open('outputWeight.txt', mode='w', encoding='UTF-8') as f:
    while output:
        minNum = sys.maxsize
        minName = ""
        for key, values in output.items():
            if values < minNum:
                minNum = values
                minName = key
        f.write(minName + ": " + str(minNum) + "\n")
        del output[minName]
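The write-out loop above repeatedly scans for the minimum, which is quadratic in the vocabulary size; an equivalent and simpler version (same output format, ascending by document frequency) would be:

with open('outputWeight.txt', mode='w', encoding='UTF-8') as f:
    for name, num in sorted(output.items(), key=lambda kv: kv[1]):
        f.write(name + ": " + str(num) + "\n")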
- TF-IDF.py
import math


def analysis(line):
    # Extract the set of words occurring in one corpus line
    # (identical to the analysis() used by the Boolean model above).
    output = []
    # strip the trailing newline
    t_line = line.strip('\n')
    # split into word/tag tokens on spaces
    words = t_line.split(' ')
    for word in words[1:]:
        if word == "":
            continue
        # split the token into word and POS tag on '/'
        t_word = word.split('/')
        # drop a leading left bracket
        tf_word = t_word[0].split('[')
        if len(tf_word) == 2:
            f_word = tf_word[1]
        else:
            f_word = t_word[0]
        # record the word if not seen yet
        if f_word not in output:
            output.append(f_word)
    # also collect bracketed compounds
    big_word1 = t_line.split('[')
    for i in range(1, len(big_word1)):
        big_word2 = big_word1[i].split(']')[0]
        words = big_word2.split(' ')
        big_word = ""
        for word in words:
            t_word = word.split('/')
            big_word = big_word + t_word[0]
        if big_word not in output:
            output.append(big_word)
    return output


def getW():
    # load the per-word document frequencies precomputed by getWeight.py
    word_list = {}
    with open('outputWeight.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            if line is not None:
                word = line.split(':')
                word_list[word[0]] = word[1]
    return word_list


def BMM(origin_sentence):
    # backward maximum matching segmentation against the dictionary in output.txt
    MAX_WORD = 19
    word_list = []
    with open('output.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            if line is not None:
                word = line.split(':')
                word_list.append(word[0])
    ans_word = []
    while len(origin_sentence) != 0:
        len_word = MAX_WORD
        while len_word > 0:
            # take up to MAX_WORD characters from the end; if the candidate
            # is in the dictionary, keep it and cut it off the sentence
            if origin_sentence[-len_word:] in word_list:
                ans_word.append(origin_sentence[-len_word:])
                len_sentence = len(origin_sentence)
                origin_sentence = origin_sentence[0:len_sentence - len_word]
                break
            # otherwise shorten the candidate by one character
            else:
                len_word = len_word - 1
        # a single character is kept as a word on its own
        if len_word == 0:
            if origin_sentence[-1:] != ' ':
                ans_word.append(origin_sentence[-1:])
            len_sentence = len(origin_sentence)
            origin_sentence = origin_sentence[0:len_sentence - 1]
    return ans_word


if __name__ == '__main__':
    w = getW()
    sentence = input("Enter a query phrase: ")
    words = BMM(sentence)
    ans = []
    # total number of documents (one document per line)
    count = 0
    for index, line in enumerate(open('語料庫.txt', 'r', encoding='UTF-8')):
        count += 1
    with open('語料庫.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            score = 0
            if line is not None and line != "\n":
                out = analysis(line)
                for word in words:
                    # TF-IDF: tf = count in doc / doc length, idf = log(N / df)
                    score = score + out.count(word) / len(out) * math.log(count * 1.0 / int(w[word]))
                ans.append((line, score))
    # print the ten highest-scoring documents
    new_ans = sorted(ans, key=lambda a: a[1], reverse=True)
    for i in range(10):
        print(new_ans[i])
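Note that TF-IDF.py expects two files to exist beforehand: outputWeight.txt, written by getWeight.py above, and output.txt, the word list that BMM segments the query against (its construction is not shown in this post). Run getWeight.py once before running TF-IDF.py.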