synonyms 自帶的相似度比較結果並不能令人滿意。
以下代碼提高了名詞分數的占比;只是隨手寫的,如有錯誤之處請指正。
主要代碼
# encoding=utf8
"""Rank candidate sentences against a query sentence using weighted synonyms.

Each word of the query is expanded into its close synonyms (via the
``synonyms`` package); a candidate sentence earns points for every synonym it
contains, with noun/verb-like words counting double and all others half.
"""
import re

import jieba
import synonyms

# Load the custom dictionary (word / frequency / POS tag per line) into jieba.
jieba.load_userdict('mydict.txt')

sen1 = "硅膠成品的銷售及生產"
temp2 = ['橡膠加工專用設備制造', '石墨及碳素制品制造', '其他橡膠制品制造',
         '建築材料生產專用機械制造', '肉制品及副產品加工', "特種陶瓷制品制造"]

# A POS tag counts as noun/verb-like when it contains "v" or "n".
_NOUN_VERB_TAG = re.compile('[vn]+')
# Synonyms scoring below this similarity are ignored.
_MIN_SYNONYM_SCORE = 0.6
# Noun/verb-like words score double, every other word class is halved.
_NOUN_VERB_WEIGHT = 2
_OTHER_WEIGHT = 0.5


def _type_weight(tag):
    """Return the score multiplier for a part-of-speech tag."""
    return _NOUN_VERB_WEIGHT if _NOUN_VERB_TAG.search(tag) else _OTHER_WEIGHT


def getWordType(word):
    """Return the POS tag of the first token ``synonyms.seg`` yields for *word*.

    Returns an empty string when segmentation produces no tags, so callers can
    treat the word as "not a noun/verb" instead of crashing with IndexError.
    """
    _, tags = synonyms.seg(word)
    return tags[0] if tags else ''


def deal_list(objs, order='name', isLen=True, reverse=False, unique=None):
    """Deduplicate a list of dict records, then sort it.

    Args:
        objs: iterable of dicts; each must contain the keys used below.
        order: key whose value the records are sorted by.
        isLen: when True sort by ``len(obj[order])``, otherwise by the value.
        reverse: sort descending when True (honoured in both sort modes).
        unique: key used for deduplication; defaults to ``order``. When several
            records share the same value for this key, the last one wins.

    Returns:
        A new sorted list of the surviving records.
    """
    dedup_key = order if unique is None else unique
    latest = {}
    for obj in objs:
        latest[obj[dedup_key]] = obj

    if isLen:
        def sort_key(obj):
            return len(obj[order])
    else:
        def sort_key(obj):
            return obj[order]
    return sorted(latest.values(), key=sort_key, reverse=reverse)


def _candidates(word, tag):
    """Build the scored synonym entries used to match one query word."""
    entries = []
    # Only multi-character noun/verb-like words are expanded into synonyms.
    if _NOUN_VERB_TAG.search(tag) and len(word) > 1:
        near_words, near_scores = synonyms.nearby(word)
        for near, score in zip(near_words, near_scores):
            # Stops at the first weak synonym; presumably ``nearby`` returns
            # scores in descending order -- verify against the library.
            if score < _MIN_SYNONYM_SCORE:
                break
            entries.append({
                'text': near,
                'source': score,
                'typeSource': _type_weight(getWordType(near)),
            })
    # NOTE(review): the original was collapsed onto one line; this fallback is
    # reconstructed as applying to every word that ended up with no entries.
    if not entries:
        entries.append({
            'text': word,
            'source': 1,
            'typeSource': _type_weight(getWordType(word)),
        })
    return entries


def wordCompare(instr, sentences):
    """Score each sentence in *sentences* by its overlap with *instr*.

    Args:
        instr: the query sentence.
        sentences: iterable of candidate sentences (strings).

    Returns:
        A list of ``{'text': sentence, 'source': score}`` dicts in input order,
        where ``'source'`` holds the accumulated score.
    """
    words, tags = synonyms.seg(instr)
    keys = {}
    for word, tag in zip(words, tags):
        keys[word] = _candidates(word, tag)

    result = []
    for sentence in sentences:
        score = 0
        for entries in keys.values():
            # Reduce the influence of several synonyms of the same query word:
            # each further hit is scaled by the previous hit's similarity.
            # NOTE(review): reconstructed as updating only after a hit.
            damping = 1
            for entry in entries:
                if entry['text'] in sentence:
                    score += damping * entry['source'] * entry['typeSource']
                    damping = entry['source']
        result.append({'text': sentence, 'source': score})
    return result


if __name__ == '__main__':
    # Deduplicate on the sentence text, not on the score, so that candidates
    # with tied scores are all kept in the ranking.
    ranked = deal_list(wordCompare(sen1, temp2), order='source', isLen=False,
                       reverse=True, unique='text')
    for item in ranked:
        print('%s %s %s' % (sen1, item['text'], item['source']))
mydict.txt
酒 1000000 n
膠 10000000 n
運行結果