liblinear是libsvm的線性核的改進版本,專門適用於百萬數據量的分類。正好適用於我這次數據挖掘的實驗。
liblinear的用法和libsvm很相似,我使用的是.exe文件,利用python的subprocess向控制台發送命令即可完成本次實驗。
其中核心兩句即
train train.txt
predict test.txt train.txt.model output.txt
由於是線性核,不存在核參數g;懲罰參數c也沒有另外設置,使用默認值
對於50W篇文章模型訓練僅需340秒,50W篇文章的預測僅需6秒
# Train a LIBLINEAR model, predict the labels of the test set, then report
# accuracy, the confusion matrix and per-class precision / recall / F-measure.
# The report is printed to the console and saved to FINAL_RESULT_FILENAME.

import subprocess
import time

TRAIN_FILENAME = "train.txt"
TEST_FILENAME = "test.txt"
MODEL_FILENAME = "train.txt.model"
PREDICT_FILENAME = "output.txt"
FINAL_RESULT_FILENAME = "./finalresult.txt"

# Category names, indexed by the numeric label used in the data files.
class_label = ["it", "體育", "軍事", "金融", "健康", "汽車", "房產", "文化", "教育", "娛樂"]
num_to_cate = dict(enumerate(class_label))

# NOTE(review): the original script kept this alternative mapping commented
# out; it is preserved here for reference only.
# num_to_cate = {0:"it",1:"體育",2:"軍事",3:"華人",4:"國內",5:"國際",6:"房產",7:"文娛",8:"社會",9:"財經"}


def run_command(args, quiet=False):
    """Run an external command and wait for it to finish.

    Args:
        args: Program name followed by its arguments, as a list. The command
            is executed directly (no shell), so no quoting is involved.
        quiet: When true, the command's standard output is discarded.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status; continuing with a missing model/output file would only
            produce misleading statistics.
        FileNotFoundError: If the executable cannot be found.
    """
    stdout = subprocess.DEVNULL if quiet else None
    subprocess.run(args, stdout=stdout, check=True)


def read_true_labels(path):
    """Return the integer label that starts every non-blank line of *path*.

    The label is the first whitespace-separated token of the line, so
    multi-digit labels are read correctly. Blank lines are skipped.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [int(line.split(None, 1)[0]) for line in f if line.strip()]


def read_predicted_labels(path):
    """Return every whitespace-separated token of *path* as an integer label."""
    with open(path, "r", encoding="utf-8") as f:
        return [int(token) for token in f.read().split()]


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def compute_metrics(real_class, predict_class):
    """Compute classification statistics from two parallel label sequences.

    Args:
        real_class: True integer labels (keys of ``num_to_cate``).
        predict_class: Predicted integer labels, same length and order.

    Returns:
        A dict with the keys:
        ``accuracy``  - percentage of correctly predicted samples,
        ``matrix``    - ``matrix[true_name][predicted_name]`` sample counts,
        ``precision`` - per category, TP / samples predicted as the category,
        ``recall``    - per category, TP / samples truly in the category,
        ``f_measure`` - per category, harmonic mean of precision and recall.
        Any ratio whose denominator is zero is reported as 0.0.

    Raises:
        ValueError: If the two sequences differ in length.
        KeyError: If a label is not a key of ``num_to_cate``.
    """
    if len(real_class) != len(predict_class):
        raise ValueError(
            "label count mismatch: %d true labels vs %d predictions"
            % (len(real_class), len(predict_class))
        )

    # One independent row dict per true category (never a shared one).
    matrix = {name: dict.fromkeys(class_label, 0) for name in class_label}
    predicted_total = dict.fromkeys(class_label, 0)
    actual_total = dict.fromkeys(class_label, 0)
    true_positive = dict.fromkeys(class_label, 0)
    correct = 0

    for real, predict in zip(real_class, predict_class):
        real_name = num_to_cate[real]
        predict_name = num_to_cate[predict]
        matrix[real_name][predict_name] += 1
        predicted_total[predict_name] += 1
        actual_total[real_name] += 1
        if real == predict:
            true_positive[predict_name] += 1
            correct += 1

    precision = {}
    recall = {}
    f_measure = {}
    for name in class_label:
        p = _ratio(true_positive[name], predicted_total[name])
        r = _ratio(true_positive[name], actual_total[name])
        precision[name] = p
        recall[name] = r
        f_measure[name] = _ratio(2 * r * p, p + r)

    return {
        "accuracy": _ratio(correct, len(real_class)) * 100,
        "matrix": matrix,
        "precision": precision,
        "recall": recall,
        "f_measure": f_measure,
    }


def save_results(path, metrics):
    """Write the confusion matrix and all metrics to *path* (UTF-8 text)."""
    with open(path, "w", encoding="utf-8") as f:
        for k, v in metrics["matrix"].items():
            f.write(k + ":" + str(v) + "\n")
        f.write("\n")
        f.write("正確率為" + str(metrics["accuracy"]) + "%" + "\n\n")
        f.write("精確率為" + str(metrics["precision"]) + "\n\n")
        f.write("召回率為" + str(metrics["recall"]) + "\n\n")
        f.write("F測度為" + str(metrics["f_measure"]) + "\n\n")


def main():
    """Run training, prediction and evaluation, timing each stage."""
    start_time = time.time()
    print("訓練")
    # Training output is discarded, as in the original script.
    run_command(["train", TRAIN_FILENAME], quiet=True)
    print("訓練結束", str(time.time() - start_time))

    start_time = time.time()
    print("預測")
    run_command(["predict", TEST_FILENAME, MODEL_FILENAME, PREDICT_FILENAME])
    print("預測結束", str(time.time() - start_time))

    start_time = time.time()
    print("統計")
    real_class = read_true_labels(TEST_FILENAME)
    predict_class = read_predicted_labels(PREDICT_FILENAME)
    metrics = compute_metrics(real_class, predict_class)

    print("正確率 為", str(metrics["accuracy"]) + "%")
    for k, v in metrics["matrix"].items():
        print(k + ":" + str(v))
    print("統計結束", str(time.time() - start_time))
    print("精確率為", str(metrics["precision"]))
    print("召回率為", str(metrics["recall"]))
    print("F測度為", str(metrics["f_measure"]))

    print("保存結果")
    save_results(FINAL_RESULT_FILENAME, metrics)
    print("保存結果結束")


if __name__ == "__main__":
    main()
