synonyms 自帶的相似度比較結果並不能令人滿意。
以下代碼提高了名詞的分數占比，是隨手寫的，如有錯誤之處請指正。
主要代碼
# encoding=utf8
import synonyms,jieba,re
# Load the custom vocabulary file into jieba at import time, before any
# of the functions below run.
jieba.load_userdict('mydict.txt')

# Query phrase that the candidates below are ranked against.
sen1 = "硅膠成品的銷售及生產"

# Candidate phrases to be scored against ``sen1``.
temp2 = [
    '橡膠加工專用設備制造',
    '石墨及碳素制品制造',
    '其他橡膠制品制造',
    '建築材料生產專用機械制造',
    '肉制品及副產品加工',
    "特種陶瓷制品制造",
]
def getWordType(word):
    """Return the tag of the first token that ``synonyms.seg`` yields for ``word``.

    ``synonyms.seg`` is unpacked as a ``(tokens, tags)`` pair; the tags are
    presumably part-of-speech labels (callers match them against ``[vn]+``).
    Only the first tag is returned, so a ``word`` that gets split into several
    tokens is classified by its leading token alone.
    """
    _tokens, tags = synonyms.seg(word)
    first_tag = tags[0]
    return first_tag
# De-duplicate, then sort.
def deal_list(objs, order='name', isLen=True, reverse=False):
    """De-duplicate a sequence of dicts on one field and return them sorted.

    Args:
        objs: Iterable of dicts; every dict must contain the key ``order``
            and the value under it must be hashable.
        order: Key used both for de-duplication and for sorting.
        isLen: When true, sort by ``len(obj[order])``; otherwise sort by the
            value ``obj[order]`` itself.
        reverse: Sort descending when true. Applies to both sort modes
            (it used to be ignored when ``isLen`` was true).

    Returns:
        A new list holding one dict per distinct ``obj[order]`` value. When
        several dicts share a value, the last one in ``objs`` is kept.
    """
    unique = {}
    for obj in objs:
        # Later duplicates overwrite earlier ones.
        unique[obj[order]] = obj

    def sort_key(obj):
        value = obj[order]
        return len(value) if isLen else value

    return sorted(unique.values(), key=sort_key, reverse=reverse)
def wordCompare(instr, sentences):
    """Score every candidate sentence by weighted keyword overlap with ``instr``.

    ``instr`` is segmented with ``synonyms.seg``. For each token a list of
    match entries is built:

    * tokens longer than one character whose tag matches ``[vn]+`` are
      expanded with ``synonyms.nearby``; neighbours are taken in the order
      returned until the first one whose score is below 0.6 (presumably the
      neighbours come sorted by descending similarity -- TODO confirm);
    * if that yields nothing (or the token was not expanded at all), the
      token itself is used with a score of 1.

    Each entry carries a type weight: 2 when the tag of the entry's text
    matches ``[vn]+``, otherwise 0.5.

    Args:
        instr: The query phrase.
        sentences: Iterable of candidate strings.

    Returns:
        A list of ``{'text': sentence, 'source': score}`` dicts, one per
        candidate, in input order. ``score`` sums, over all entries whose
        text occurs in the sentence, ``damping * score * type weight``.
    """
    tag_pattern = '[vn]+'

    def type_weight(text):
        # Noun/verb-like words count double, everything else is halved.
        return 2 if re.findall(tag_pattern, getWordType(text)) else 0.5

    entries_by_token = {}
    tokens, tags = synonyms.seg(instr)
    for token, tag in zip(tokens, tags):
        entries = []
        if re.findall(tag_pattern, tag) and len(token) > 1:
            near_words, near_scores = synonyms.nearby(token)
            for near_word, near_score in zip(near_words, near_scores):
                if near_score < 0.6:
                    break
                entries.append({
                    'text': near_word,
                    'source': near_score,
                    'typeSource': type_weight(near_word),
                })
        if not entries:
            # Nothing usable from the expansion: fall back to the token itself.
            entries.append({
                'text': token,
                'source': 1,
                'typeSource': type_weight(token),
            })
        entries_by_token[token] = entries

    scored = []
    for sentence in sentences:
        total = 0
        for entries in entries_by_token.values():
            # Damps the contribution of further synonyms of the same token.
            damping = 1
            for entry in entries:
                if entry['text'] in sentence:
                    total += (damping * entry['source'] * entry['typeSource'])
                    # NOTE(review): the original nesting of this update was
                    # lost; it is assumed to run only after a hit -- confirm.
                    damping = entry['source']
        scored.append({
            'text': sentence,
            'source': total,
        })
    return scored
if __name__ == '__main__':
    # Rank the candidates by score, highest first.
    # NOTE(review): deal_list de-duplicates on the sort key, so candidates
    # that end up with the same score collapse into a single entry here --
    # confirm that this is intended.
    temp2 = deal_list(wordCompare(sen1, temp2), order='source', isLen=False, reverse=True)
    for i in temp2:
        # A single pre-formatted argument prints the same space-separated
        # line under the Python 2 print statement and the Python 3 print
        # function; the bare print statement is a SyntaxError on Python 3.
        print('%s %s %s' % (sen1, i['text'], i['source']))
mydict.txt
運行結果
