這一部分內容是我請教同學之後完成的
其思路是首先從簡介中分析獲取關鍵詞(這部分在上篇博客中已經介紹,這裡不再詳細說明),然後分析數據中已有的關鍵詞和行業的關係,最後據此推斷未知關鍵詞的行業分類
這個方法會讓每個關鍵詞都匹配出很多的行業分類,因此我將每條數據匹配出的所有行業分類存入一個列表中,然後得到出現次數最多的行業分類
行業代碼是根據2011版的代碼進行的搜索匹配
完整代碼如下

import re
from collections import Counter

import kejichengguo.sql as SQL

# Only the first this-many collected industry labels take part in the vote.
_MAX_CANDIDATES = 20


def _collect_industries(keywords, known):
    """Gather industry labels from every known row matching one of *keywords*.

    Args:
        keywords: Iterable of keyword strings to look for.
        known: Sequence of ``(keyword_text, industry_text)`` pairs. Either
            element may be ``None``; such rows are skipped.

    Returns:
        A list of industry labels (whitespace-split from ``industry_text``),
        in keyword-major, row-minor order. Labels repeat once per match, so
        the list doubles as a vote tally.
    """
    labels = []
    for keyword in keywords:
        # Keywords are plain text, not patterns: escape them so characters
        # such as "+" or "(" cannot raise re.error or match unintended text.
        pattern = re.compile(re.escape(keyword))
        for keyword_text, industry_text in known:
            if keyword_text is None or industry_text is None:
                continue
            if pattern.search(keyword_text):
                labels.extend(industry_text.split())
    return labels


def getindustry(trains, datas):
    """Assign an industry to each row of *trains* by keyword voting.

    For every row in *trains*, each whitespace-separated keyword is searched
    for (as a literal substring) in the ``keyword`` text of every row in
    *datas*. The industries of all matching rows are collected, and the most
    frequent label among the first ``_MAX_CANDIDATES`` collected is written
    back with ``SQL.updateindustry(industry, str(id))``. Ties go to the label
    collected first.

    Args:
        trains: Iterable of mappings with ``'keyword'`` and ``'id'`` keys;
            rows whose keyword is ``None`` are skipped.
        datas: Iterable of mappings with ``'keyword'`` and ``'industry'``
            keys (whitespace-separated industry labels).

    Returns:
        None. Results are printed and stored through the ``SQL`` module.
    """
    # Pair keyword and industry per row up front so they can never drift out
    # of alignment, and so a one-shot iterable can be scanned repeatedly.
    known = [(data['keyword'], data['industry']) for data in datas]

    for train in trains:
        keyword_text = train['keyword']
        if keyword_text is None:
            continue

        record_id = str(train['id'])
        candidates = _collect_industries(keyword_text.split(), known)
        candidates = candidates[:_MAX_CANDIDATES]

        if not candidates:
            # Nothing matched: report it and leave the record untouched.
            print(record_id, candidates)
            continue

        # most_common() orders equal counts by first occurrence, which is
        # the same tie-break as max(seq, key=seq.count).
        industry = Counter(candidates).most_common(1)[0][0]
        print(record_id, industry)
        try:
            SQL.updateindustry(industry, record_id)
        except Exception as exc:
            # Best-effort batch job: report the failed update and carry on
            # with the remaining records.
            print(record_id, candidates, exc)


def getindustrycode(datas):
    """Translate each row's industry names into industry codes and store them.

    Each whitespace-separated industry name is reduced to its first two
    characters, which are passed to ``SQL.select_industrycode``. The ``code``
    of the first returned row, if any, is appended. The resulting string
    (every code preceded by a single space) is saved with
    ``SQL.updateindustrycode(codes, id)``.

    Args:
        datas: Iterable of mappings with ``'industry'`` and ``'id'`` keys;
            rows whose industry is ``None`` are skipped.

    Returns:
        None. Results are printed and stored through the ``SQL`` module.
    """
    for data in datas:
        industry_text = data['industry']
        if industry_text is None:
            continue

        codes = []
        for name in industry_text.split():
            prefix = name[0:2]
            # NOTE(review): "新型" is looked up as "材料" instead, presumably
            # because the code table has no entry starting with "新型" -
            # confirm against the table contents.
            if prefix == "新型":
                prefix = "材料"
            matches = SQL.select_industrycode(prefix)
            if matches:
                codes.append(matches[0]['code'])

        # Stored format keeps a leading space before every code.
        num_list = "".join(" " + code for code in codes)
        print(data['id'], data['industry'], num_list)
        SQL.updateindustrycode(num_list, data['id'])


if __name__ == '__main__':
    # Stage 1 (currently disabled): infer industries from keywords.
    # trains = SQL.select_keyword()
    # datas = SQL.select_pi_keyword()
    # getindustry(trains, datas)

    # Stage 2: map the stored industries to industry codes.
    datas = SQL.select_industry()
    getindustrycode(datas)