liblinear使用總結


liblinear是libsvm的線性核的改進版本,專門適用於百萬數據量的分類。正好適用於我這次數據挖掘的實驗。

 

liblinear用法和libsvm很相似,我使用的是.exe文件,利用python的subprocess向控制台發送命令即可完成本次實驗。

 

其中核心兩句即

train train.txt

predict test.txt train.txt.model output.txt

 

由於是線性核,不需要設置核參數g;懲罰參數c也沒有設置,使用的是默認值

 

對於50W篇文章模型訓練僅需340秒,50W篇文章的預測僅需6秒

 

  1 from subprocess import *
  2 import time
  3 
  4 time = time.time
  5 
  6 start_time = time()
  7 print("訓練")
  8 cmd = "train train.txt"
  9 Popen(cmd, shell = True, stdout = PIPE).communicate()
 10 print("訓練結束",str(time() - start_time))
 11 
 12 
 13 start_time = time()
 14 print("預測")
 15 cmd = "predict test.txt train.txt.model output.txt"
 16 Popen(cmd, shell = True).communicate()
 17 print("預測結束",str(time() - start_time))
 18 
 19 
 20 #進行統計
 21 #讀測試集真實label
 22 start_time = time()
 23 print("統計")
 24 test_filename = "test.txt"
 25 f = open(test_filename,"r",encoding = "utf-8")
 26 real_class = []
 27 for line in f:
 28     real_class.append(line[0])
 29 
 30 #總樣本
 31 total_sample = len(real_class)
 32 
 33 #讀預測結果label
 34 predict_filename = "output.txt"
 35 f_predict = open(predict_filename,"r",encoding = "utf-8")
 36 s = f_predict.read()
 37 predict_class = s.split()
 38 
 39 #對預測正確的文章進行計數
 40 T = 0
 41 for real, predict in zip(real_class,predict_class):
 42     if int(real) == int(predict):
 43         T += 1
 44 accuracy  = T / total_sample * 100
 45 print("正確率 為", str(accuracy) + "%")
 46 
 47 
 48 # class_label = ["0","1","2","3","4","5","6","7","8","9"]
 49 num_to_cate = {0:"it",1:"體育",2:"軍事",3:"金融",4:"健康",5:"汽車",6:"房產",7:"文化",8:"教育",9:"娛樂"}
 50 
 51 class_label = ["it","體育","軍事","金融","健康","汽車","房產","文化","教育","娛樂"]
 52 
 53 predict_precision = dict.fromkeys(class_label,1.0)
 54 predict_true = dict.fromkeys(class_label,1.0)
 55 
 56 predict_recall = dict.fromkeys(class_label,1.0)
 57 predict_F = dict.fromkeys(class_label,0.0)
 58 # print(str(predict_precision))
 59 # print(str(predict_precision))
 60 # print(str(predict_recall))
 61 # print(str(predict_true))
 62 mat = dict.fromkeys(class_label,{})
 63 for k,v in mat.items():
 64     mat[k] = dict.fromkeys(class_label,0)
 65 
 66 # print(str(mat))
 67 
 68 for real, predict in zip(real_class,predict_class):
 69     real = int(real)
 70     predict = int(predict)
 71     # print(num_to_cate[real])
 72     # print(num_to_cate[predict])
 73     mat[num_to_cate[real]][num_to_cate[predict]] += 1
 74     predict_precision[num_to_cate[predict]] += 1
 75     predict_recall[num_to_cate[real]] += 1
 76 
 77     if int(real) == int(predict):
 78         predict_true[num_to_cate[predict]] += 1
 79 
 80 # print(str(predict_precision))
 81 # print(str(predict_recall))
 82 # print(str(predict_true))
 83 
 84 #輸出混淆矩陣
 85 for k, v in mat.items():
 86     print(k + ":" + str(v))
 87 
 88 #計算精確率和召回率
 89 for x in range(len(class_label)):
 90     # x =  str(x)
 91     predict_precision[num_to_cate[x]] = predict_true[num_to_cate[x]] / predict_precision[num_to_cate[x]]
 92     predict_recall[num_to_cate[x]] = predict_true[num_to_cate[x]] / predict_recall[num_to_cate[x]]
 93 
 94 # print(str(predict_precision))
 95 # print(str(predict_recall))
 96 # print(str(predict_true))
 97 
 98 #計算F測度
 99 for x in range(len(class_label)):
100     # x = str(x)
101     predict_F[num_to_cate[x]] = 2 * predict_recall[num_to_cate[x]] * predict_precision[num_to_cate[x]] / (predict_precision[num_to_cate[x]] + predict_recall[num_to_cate[x]])
102 
103 print("統計結束",str(time() - start_time))
104 print("精確率為",str(predict_precision))
105 print("召回率為",str(predict_recall))
106 print("F測度為",str(predict_F))
107 
108 print("保存結果")
109 final_result_filename = "./finalresult.txt"
110 f = open(final_result_filename,"w",encoding = "utf-8")
111 for k, v in mat.items():
112     f.write(k + ":" + str(v) + "\n")
113 
114 f.write("\n")
115 f.write("正確率為" + str(accuracy) + "%" + "\n\n")
116 f.write("精確率為" + str(predict_precision) + "\n\n")
117 f.write("召回率為" + str(predict_recall) + "\n\n")
118 f.write("F測度為" + str(predict_F) + "\n\n")
119 print("保存結果結束")
120 
121 
122 # cate_to_num = {"it":0,"體育":1,"軍事":2,"華人":3,"國內":4,"國際":5,"房產":6,"文娛":7,"社會":8,"財經":9}
123 # num_to_cate = {0:"it",1:"體育",2:"軍事",3:"華人",4:"國內",5:"國際",6:"房產",7:"文娛",8:"社會",9:"財經"}

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM