基於 Python 語言使用餘弦相似性算法進行文本相似度分析

本文轉載自查看原文 2019-07-09 10:21 2031 相似度分析/ nlp/ jieba/ 相似度/ 測試工具

編寫此腳本的目的：

　　本人從事軟件測試工作，近兩年發現項目成員總會提出一些內容相似的問題，導致開發抱怨。一開始想搜索一下是否有此類工具能支持查重的工作，但並沒找到，因此寫了這個工具。通過從紙上談兵到着手實踐，還是發現很多大大小小的問題（一定要動手去做喔！），總結起來就是理解清楚參考資料、按需設計、多角度去解決問題。

腳本進行相似度分析的基本過程：

　　1、獲取Bug數據。讀取excel表，獲取到“BugID”和“Bug內容”

　　2、獲取指定格式的Bug關鍵字集合。使用“jieba包”，採用“搜索引擎模式”（lcut_for_search），對Bug內容進行分詞，獲取到分詞表後，使用“正則表達式”過濾，拿到詞語（以至少兩個中文字開頭），剔除掉單個字、符號、數字等非關鍵字

　　3、計算詞頻（TF）。在步驟2中獲取到關鍵字總量，在本步驟，則對篩選後的關鍵字進行頻率統計，最後得出“TF值（關鍵字出現次數/總詞數）”

　　4、獲取詞逆文檔頻率（IDF）（也可以理解為權重）。在本步驟，有個重要的前提就是“語料庫”，這裡我沒有使用開放、通用的語料庫，而是使用本項目的“測試用例步驟、約束條件”等內容，唯一的原因就是“很適應Bug內容的語言場景”。在此模塊，也是採用分詞，然後利用參考資料中的計算公式（腳本中為 IDF = log(語料庫總詞數 / (該詞出現次數 + 1))），獲取到每個“關鍵詞”的IDF值，並存至名稱為“get_IDF_value.txt”的文件中（一開始沒放到這裡，導致重複、多次進行本步驟計算，幾kb的文本內容還能接受，幾百kb的就......當時跑了一夜都沒完！）

　　5、獲取TF_IDF值，並據此對Bug關鍵字進行倒序排列，然後硬性截取每個Bug排位前50%的關鍵字，並形成集合，然後以冒泡的形式，從第一個Bug開始，兩兩進行“相似度計算”（公式見參考資料），最終將相似度大於等於閾值的Bug，以形式“Bug編號_1（被比對對象）-Bug編號_2（比對對象）”寫入名稱為“bug_compare_result.xls”的Excel表中

過程中一些特殊點的處理：

　　1、“所有Bug排位前50%的關鍵字，並形成集合”：之所以創建該全局列表，是為了減少同一個列表或字典在多個函數之間來回傳遞的次數，從而降低腳本出錯的機率。

　　2、語料庫的選擇是為適應使用需求

　　3、得到TF_IDF值的目的只是為了獲取到“排位前50%的Bug關鍵字”

　　4、逆文檔頻率（IDF）值會存儲至“get_IDF_value.txt”文件中，每次運行腳本時會提醒是否更改語料庫

說明：

　　1、代碼友好性差，技術能力有限。當然可塑性很高 ~ ~

　　2、出於練習的目的，獨立編寫“IDF計算”的腳本，並放到另一路徑下

源碼：（純生肉，通過代碼注釋還是能看到本人設計過程的心態）

sameText.py

--------------------sameText.py--------------------------

#-*- coding:utf-8 -*-
import jieba
import re
import CorpusDatabase.getIDF as getIDF
import math
import xlrd
import xlwt
import time
####### Module-level state shared by the functions in this script
# Matches tokens that begin with 2-20 consecutive characters in the range
# U+4E00-U+9FA5. It is applied with re.match, so only the start of the token
# is checked; the token's total length is not capped by this pattern.
re_rule_getTF = re.compile(r"^([\u4e00-\u9fa5]){2,20}")
# get_TF_value appends each bug's filtered token list here and get_TF_IDF_Key
# appends that bug's top-half keyword list, so the two kinds of entry alternate.
Top_List = []
# "id1-id2" strings appended by cosine_result for pairs whose similarity
# reaches the threshold below (adjustable).
Compare_CosResult = []
# Bug IDs appended by main().
BugID_List = []
# Weight of every corpus keyword; assigned in main().
getIDF_get_IDF_value = {}
# Similarity threshold compared with >= in cosine_result and printed in the
# header cell written by write_BugBase.
xiangsidu = 0.8
###########
def read_Originalbase(filename):
    """Load the first sheet of an Excel workbook into a dict.

    Every row contributes one entry: the first cell is the key and the
    second cell is the value. Rows are read in sheet order, so a later row
    with the same first cell overwrites an earlier one.

    :param filename: path of the workbook handed to ``xlrd.open_workbook``.
    :return: dict mapping column-0 values to column-1 values.
    """
    workbook = xlrd.open_workbook(filename)
    first_sheet = workbook.sheet_by_index(0)
    row_total = first_sheet.nrows
    bug_map = {}
    for row_index in range(row_total):
        cells = first_sheet.row_values(row_index)
        bug_map[cells[0]] = cells[1]
    return bug_map

###########詞頻統計###################
def get_TF_value(test_String_one):
    """Compute the term frequency of every keyword in one bug text.

    The text is segmented with ``jieba.lcut_for_search``; only tokens
    accepted by ``re_rule_getTF`` are kept. The kept tokens (repeats
    included) are appended to the global ``Top_List`` as a side effect.

    :param test_String_one: the bug text to analyse.
    :return: dict mapping each kept keyword to occurrences / total kept tokens.
    """
    tokens = jieba.lcut_for_search(test_String_one)
    # Drop everything the keyword pattern rejects (single characters,
    # symbols, digits, non-Chinese tokens).
    keywords = [token for token in tokens if re_rule_getTF.match(token)]
    # Shared global list: cosine_result later reads these raw keyword lists.
    Top_List.append(keywords)

    occurrences = {}
    for keyword in keywords:
        occurrences[keyword] = occurrences.get(keyword, 0) + 1

    total = len(keywords)
    frequencies = {}
    for keyword, hits in occurrences.items():
        frequencies[keyword] = hits / float(total)
    return frequencies
###########獲取TF_IDF值，並向量化#############
def get_TF_IDF_Key(test_String_two):
    """Rank one bug text's keywords by TF-IDF and keep the top half.

    Term frequencies come from ``get_TF_value`` (which also appends the raw
    keyword list to the global ``Top_List``). Each frequency is multiplied by
    the keyword's weight in the global ``getIDF_get_IDF_value``; keywords
    that are not in the corpus get a weight of 1. The selected keywords are
    appended to ``Top_List`` as a side effect.

    :param test_String_two: the bug text to analyse.
    :return: list of keywords, highest TF-IDF first, holding the top half
        (rounded up) of the distinct keywords.
    """
    corpus_weights = getIDF_get_IDF_value
    term_frequency = get_TF_value(test_String_two)

    # A direct dict lookup replaces the former scan over every corpus entry.
    # It also covers an empty corpus, where the scan never ran and every
    # keyword was silently dropped.
    tf_idf = {
        word: float(frequency) * float(corpus_weights.get(word, 1))
        for word, frequency in term_frequency.items()
    }

    ranked = sorted(tf_idf, key=tf_idf.__getitem__, reverse=True)
    # Keep half of the keywords, rounding up for odd counts.
    top_keywords = ranked[:(len(ranked) + 1) // 2]
    # Shared global list: cosine_result reads it in (raw, top) pairs.
    Top_List.append(top_keywords)
    return top_keywords
def _keyword_counts(words):
    """Return a dict mapping each keyword in *words* to how often it occurs."""
    tally = {}
    for word in words:
        tally[word] = tally.get(word, 0) + 1
    return tally


def cosine_result(the_Top_List):
    """Compare every pair of bugs by cosine similarity of keyword counts.

    ``the_Top_List`` is read two entries per bug: an even index holds the
    bug's full keyword list (repeats included) and the following odd index
    holds its selected top keywords. For each pair, the vector dimensions are
    the union of both bugs' top keywords and each component is the number of
    times that keyword occurs in the bug's full keyword list.

    Pairs whose similarity is at least the global ``xiangsidu`` are appended
    to the global ``Compare_CosResult`` as ``"<id1>-<id2>"``, using the IDs
    at the matching positions of the global ``BugID_List``.

    :param the_Top_List: flat list alternating full and top keyword lists.
    :return: always 0.
    """
    bug_total = len(the_Top_List) // 2
    # Count each bug's keywords once instead of once per compared pair.
    tallies = [_keyword_counts(the_Top_List[2 * n]) for n in range(bug_total)]

    for first in range(bug_total):
        first_tally = tallies[first]
        first_top = the_Top_List[2 * first + 1]
        for second in range(first + 1, bug_total):
            second_tally = tallies[second]
            # Union of both bugs' top keywords, duplicates removed.
            dimensions = set(first_top) | set(the_Top_List[2 * second + 1])

            dot_product = 0.0
            norm_first = 0.0
            norm_second = 0.0
            for keyword in dimensions:
                # .get(..., 0) keeps a bug without any keyword from raising
                # KeyError; it simply contributes a zero vector.
                count_first = first_tally.get(keyword, 0)
                count_second = second_tally.get(keyword, 0)
                dot_product += count_first * count_second
                norm_first += count_first * count_first
                norm_second += count_second * count_second

            denominator = math.sqrt(norm_first) * math.sqrt(norm_second)
            # A zero vector on either side has no direction; treat as 0.
            similarity = dot_product / denominator if denominator else 0.0

            if similarity >= xiangsidu:  # adjustable threshold
                # IDs arrive from Excel as floats: float -> int -> str.
                Compare_CosResult.append(
                    str(int(BugID_List[first])) + '-' + str(int(BugID_List[second]))
                )
    return 0
#########將最終結果寫入到新的xls文件中#########
def write_BugBase(bug_base,CosResult_list):
    """Write the bugs and their similar-bug pairs to ``bug_compare_result.xls``.

    Every entry of ``bug_base`` becomes one row: key in column 0, value in
    column 1. The first entry is treated as the header row and receives the
    similarity caption in column 2. For every other row, each string of
    ``CosResult_list`` that starts with ``"<this bug's id>-"`` is written to
    columns 2, 3, ... in list order.

    :param bug_base: dict of bug ID -> bug text, header entry first.
    :param CosResult_list: list of ``"<id1>-<id2>"`` strings.
    :raises ValueError: if a non-header key cannot be converted with ``int``.
    """
    workbook = xlwt.Workbook(encoding = "utf-8", style_compression = 0)
    sheet = workbook.add_sheet('sheet1',cell_overwrite_ok = True)
    for row, (bug_id, bug_text) in enumerate(bug_base.items()):
        sheet.write(row, 0, bug_id)
        sheet.write(row, 1, bug_text)
        if row == 0:
            sheet.write(row, 2, u"相似問題(相似度大於%.1f)"%(xiangsidu))
            continue
        # Match on the "<id>-" prefix. The former pattern "<id>-\d{0,3}$"
        # silently dropped pairs whose second ID had four or more digits.
        prefix = str(int(bug_id)) + '-'
        similar_pairs = [pair for pair in CosResult_list if pair.startswith(prefix)]
        for offset, pair in enumerate(similar_pairs):
            sheet.write(row, offset + 2, pair + "\n")
    workbook.save("bug_compare_result.xls")

def main():
    """Run the whole duplicate-bug analysis.

    Asks whether the corpus changed. On "Y"/"y" the keyword weights are
    recomputed with ``getIDF.get_IDF_value`` and cached in
    ``get_IDF_value.txt``; otherwise they are loaded from that file. Then
    ``bug.xls`` is read, every bug is turned into keyword lists, all pairs
    are compared, the result workbook is written and the elapsed time is
    printed.
    """
    global getIDF_get_IDF_value
    import ast  # local import: only needed to parse the cached weights

    user_message = input("是否更改了語料庫，請輸入 Y or N:")
    starttime = time.time()
    if str(user_message).strip().upper() == "Y":
        getIDF_get_IDF_value = getIDF.get_IDF_value()
        # NOTE(review): the file uses the platform default encoding, as
        # before, so existing cache files stay readable.
        with open('get_IDF_value.txt','w') as cache_file:
            cache_file.write(str(getIDF_get_IDF_value))
    else:
        with open('get_IDF_value.txt','r') as cache_file:
            # SECURITY: this used to be eval(), which executes whatever the
            # file contains. The cache is a dict literal written by str(),
            # so ast.literal_eval parses it without running code.
            getIDF_get_IDF_value = ast.literal_eval(cache_file.read())

    Bug_base = read_Originalbase('bug.xls')
    # Walk IDs and texts together so BugID_List and Top_List stay aligned
    # even when only one of the two header cells carries the expected label.
    for bug_id, bug_text in Bug_base.items():
        if bug_id == "Bug編號" or bug_text == "Bug標題":
            continue
        BugID_List.append(bug_id)
        get_TF_IDF_Key(bug_text)

    cosine_result(Top_List)
    write_BugBase(Bug_base,Compare_CosResult)
    endtime = time.time()
    print("共用時：%d秒！"%(endtime - starttime))
    

if __name__ == '__main__':
    main()

-----------------------------------------------------------------

getIDF.py

---------------------getIDF.py-------------------------------

#-*- coding:utf-8 -*-
import os
import re
import jieba
import math
import re
###### Regular-expression rule ######
# Matches tokens that begin with 2-20 consecutive characters in the range
# U+4E00-U+9FA5. It is applied with re.match, so only the start of the token
# is checked; the token's total length is not capped by this pattern.
re_rule_getIDF = re.compile(r"^([\u4e00-\u9fa5]){2,20}")
# Read the corpus and compute the weight of every keyword found in it.
def get_IDF_value():
    """Compute a weight for every keyword found in the corpus files.

    Every file below the current working directory is treated as UTF-8
    corpus text, except the script's own source, cache, input and output
    files and any compiled ``.pyc`` file. Each line is segmented with
    ``jieba.lcut_for_search`` and tokens accepted by ``re_rule_getIDF`` are
    counted.

    Each token is now counted once. Previously the token list was never
    reset, so every line re-counted all earlier lines, which made the run
    time quadratic and inflated the counts of early words. Weights cached by
    the old code therefore differ and should be regenerated.

    :return: dict mapping keyword -> log(total keywords / (occurrences + 1)).
    """
    # Built with os.path so the comparison works on every platform, not only
    # with Windows backslash paths.
    excluded_paths = {
        os.path.normpath(os.path.join('.', *parts))
        for parts in (
            ('CorpusDatabase', '__pycache__', 'getIDF.cpython-36.pyc'),
            ('CorpusDatabase', '__pycache__', '__init__.cpython-36.pyc'),
            ('CorpusDatabase', 'getIDF.py'),
            ('CorpusDatabase', 'getIDF.pyc'),
            ('CorpusDatabase', '__init__.py'),
            ('CorpusDatabase', '__init__.pyc'),
            ('sameText.py',),
            ('bug.xls',),
            ('bug_compare_result.xls',),
            ('get_IDF_value.txt',),
        )
    }

    total_keywords = 0
    keyword_counts = {}
    for root, dirs, files in os.walk('.', topdown=False):
        for name in files:
            path = os.path.join(root, name)
            # Bytecode from any interpreter version is never corpus text.
            if name.endswith('.pyc') or os.path.normpath(path) in excluded_paths:
                continue
            # Open as UTF-8 explicitly to avoid platform encoding problems.
            with open(path, 'r', encoding='UTF-8') as corpus_file:
                for line in corpus_file:
                    for token in jieba.lcut_for_search(line):
                        # Keep only tokens accepted by the keyword pattern.
                        if re.match(re_rule_getIDF, token):
                            total_keywords += 1
                            keyword_counts[token] = keyword_counts.get(token, 0) + 1

    return {
        keyword: math.log(total_keywords / float(hits + 1))
        for keyword, hits in keyword_counts.items()
    }