1.1 輸入信息
輸入分為三部分:
- train_data.txt為已經做好特徵工程處理的本地訓練集文件。每一行為一條數據記錄,以逗號分開。最後一列為類別(二分類),前面的列為特徵值。
- test_data.txt 為需要預測的本地測試集文件。特徵數和訓練集一致。不含類別信息。
- 示例代碼為準確率和性能待優化的參考代碼,支持的語言分別為C++/Python/JAVA。
answer.txt為test_data.txt的二分類結果,用於練習的時候使用。
1.2 輸出信息
輸出信息為一個文件result.txt,按行順序放置測試集記錄的預測結果,每一行代表一條測試數據的二分類結果。
1.3 限制條件
- 選手拿到的訓練集和測試集並不是最終判題用的數據。
- 示例代碼的算法實現為LR(邏輯回歸),選手可以將其改為其它的機器學習算法,但程序中定義的輸入輸出文件路徑不能改。
- 不允許使用外部機器學習庫。
示例代碼(LR,邏輯回歸)
import math
import datetime
import sys
import numpy as np


class LR:
    """Binary logistic-regression classifier trained by batch gradient ascent.

    The model has no intercept term: the prediction for a record is
    ``sigmoid(dot(features, weight))``. Training data is a comma-separated
    text file whose last column is the class label; test data has the same
    feature columns and no label column.
    """

    def __init__(self, train_file_name, test_file_name, predict_result_file_name):
        """Store the file paths and the training hyper-parameters.

        Args:
            train_file_name: Path of the labelled training file.
            test_file_name: Path of the unlabelled file to predict.
            predict_result_file_name: Path the predictions are written to.
        """
        self.train_file = train_file_name
        self.predict_file = test_file_name
        self.predict_result_file = predict_result_file_name
        self.max_iters = 760  # number of gradient steps
        self.rate = 0.1  # learning rate
        self.feats = []
        self.labels = []
        self.feats_test = []
        self.labels_predict = []
        self.param_num = 0
        self.weight = []

    def loadDataSet(self, file_name, label_existed_flag):
        """Parse a comma-separated numeric file into NumPy arrays.

        Args:
            file_name: Path of the file to read.
            label_existed_flag: ``1`` when the last column of every row is
                the label; any other value means every column is a feature.

        Returns:
            A ``(feats, labels)`` tuple of ``numpy.ndarray``. ``labels`` is
            empty when ``label_existed_flag`` is not ``1``.

        Raises:
            ValueError: If a non-blank field cannot be parsed as a float.
        """
        feats = []
        labels = []
        with open(file_name) as fr:
            for line in fr:
                line = line.strip()
                if not line:
                    # A trailing or stray blank line is not a record.
                    continue
                values = [float(field) for field in line.split(',')]
                if label_existed_flag == 1:
                    feats.append(values[:-1])
                    labels.append(values[-1])
                else:
                    feats.append(values)
        return np.array(feats), np.array(labels)

    def loadTrainData(self):
        """Load the training features and labels from ``self.train_file``."""
        self.feats, self.labels = self.loadDataSet(self.train_file, 1)

    def loadTestData(self):
        """Load the test features from ``self.predict_file``."""
        self.feats_test, self.labels_predict = self.loadDataSet(
            self.predict_file, 0)

    def savePredictResult(self):
        """Print the predictions and write them one per line to the result file."""
        print(self.labels_predict)
        with open(self.predict_result_file, 'w') as f:
            f.writelines(str(label) + "\n" for label in self.labels_predict)

    def sigmod(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise."""
        return 1/(1+np.exp(-x))

    def printInfo(self):
        """Print the configured paths and the currently loaded arrays."""
        print(self.train_file)
        print(self.predict_file)
        print(self.predict_result_file)
        print(self.feats)
        print(self.labels)
        print(self.feats_test)
        print(self.labels_predict)

    def initParams(self):
        """Initialise the weight vector to ones (length ``self.param_num``)."""
        # The builtin ``float`` replaces the ``np.float`` alias, which was
        # removed from NumPy in version 1.24.
        self.weight = np.ones((self.param_num,), dtype=float)

    def compute(self, recNum, param_num, feats, w):
        """Return the predicted probabilities ``sigmoid(feats . w)``.

        ``recNum`` and ``param_num`` are unused; they are kept so the
        signature stays compatible with existing callers.
        """
        return self.sigmod(np.dot(feats, w))

    def error_rate(self, recNum, label, preval):
        """Return the sum of squared differences between labels and predictions.

        ``recNum`` is unused and kept only for signature compatibility.
        """
        return np.power(label - preval, 2).sum()

    def predict(self):
        """Load the test set, classify every record and save the result file."""
        self.loadTestData()
        preval = self.compute(len(self.feats_test),
                              self.param_num, self.feats_test, self.weight)
        # Adding 0.5 and truncating rounds the probability to the class 0/1.
        self.labels_predict = (preval+0.5).astype(int)
        self.savePredictResult()

    def train(self):
        """Fit ``self.weight`` on the training file with batch gradient ascent.

        Raises:
            ValueError: If the training file contains no records.
        """
        self.loadTrainData()
        recNum = len(self.feats)
        if recNum == 0:
            raise ValueError("training file contains no records: %s"
                             % self.train_file)
        self.param_num = len(self.feats[0])
        self.initParams()
        ISOTIMEFORMAT = '%Y-%m-%d %H:%M:%S,%f'
        for i in range(self.max_iters):
            preval = self.compute(recNum, self.param_num,
                                  self.feats, self.weight)
            if i % 30 == 0:
                # The squared error is only needed for the progress log.
                sum_err = self.error_rate(recNum, self.labels, preval)
                print("Iters:" + str(i) + " error:" + str(sum_err))
                theTime = datetime.datetime.now().strftime(ISOTIMEFORMAT)
                print(theTime)
            err = self.labels - preval
            delt_w = np.dot(self.feats.T, err)
            delt_w /= recNum
            self.weight += self.rate*delt_w


def print_help_and_exit():
    """Print the command-line usage and exit with status -1."""
    print("usage:python3 main.py train_data.txt test_data.txt predict.txt [debug]")
    sys.exit(-1)


def parse_args():
    """Return True when the single command-line argument is ``debug``.

    With exactly one argument that is not ``debug`` the usage text is
    printed and the process exits. Any other argument count returns False.
    """
    debug = False
    if len(sys.argv) == 2:
        if sys.argv[1] == 'debug':
            print("test mode")
            debug = True
        else:
            print_help_and_exit()
    return debug


def _read_labels(file_name):
    """Read one integer label per non-blank line of ``file_name``."""
    with open(file_name, 'r') as f:
        return [int(float(line.strip())) for line in f if line.strip()]


def _accuracy(answers, predictions):
    """Return the fraction of ``answers`` matched by ``predictions``.

    A missing prediction counts as an error, surplus predictions are
    ignored, and an empty answer list yields 0.0.
    """
    if not answers:
        return 0.0
    errline = sum(1 for a, p in zip(answers, predictions) if a != p)
    errline += max(0, len(answers) - len(predictions))
    return (len(answers)-errline)/len(answers)


if __name__ == "__main__":
    # The input/output paths are fixed by the task statement.
    train_file = "./data/train_data.txt"
    test_file = "./data/test_data.txt"
    predict_file = "./data/result.txt"
    lr = LR(train_file, test_file, predict_file)
    lr.train()
    lr.predict()
    # Evaluation is always enabled; parse_args() can make it opt-in.
    debug = True

    if debug:
        answer_file = "./data/answer.txt"
        a = _read_labels(answer_file)
        p = _read_labels(predict_file)

        print("answer lines:%d" % (len(a)))
        print("predict lines:%d" % (len(p)))

        accuracy = _accuracy(a, p)
        print("accuracy:%f" % (accuracy))