項目需要,用隨機森林和決策樹對已有50個事件做預測
# Train a random forest and a decision tree on an event data set
# (one id column, 50 feature columns, one label column) and print precision,
# accuracy, recall and F1 for each model on a held-out 20% test split.
import sys
import time

import pandas as pd

# scikit-learn is imported inside the functions that need it, matching the
# script's existing lazy classifier imports and keeping the pure-pandas
# helpers importable on their own.

# Input file used when no path is supplied on the command line.
DEFAULT_DATA_FILE = 'C:\\Python36\\TestCode\\RandomForest\\part-00000'

# Number of feature columns between the 'ric' id column and the 'label' column.
NUM_EVENT_FEATURES = 50


def random_forest_classifier(train_X, train_y):
    """Fit and return a 10-tree RandomForestClassifier.

    Args:
        train_X: Training feature matrix.
        train_y: Training labels aligned with ``train_X``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(n_estimators=10)
    # Build the model from the training split.
    model.fit(train_X, train_y)
    return model


def decision_tree_classifier(train_X, train_y):
    """Fit and return a DecisionTreeClassifier with default settings.

    Args:
        train_X: Training feature matrix.
        train_y: Training labels aligned with ``train_X``.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    from sklearn import tree

    model = tree.DecisionTreeClassifier()
    model.fit(train_X, train_y)
    return model


def binarize_labels(labels):
    """Map a label Series to 0/1: values equal to -1 become 0, all others 1.

    The comparison is by value, not identity, so ``-1`` and ``-1.0`` are
    both treated as the negative class.

    Args:
        labels: pandas Series of raw labels.

    Returns:
        An integer pandas Series with the same index, containing 0 or 1.
    """
    return (labels != -1).astype(int)


def read_data(data_file):
    """Load the header-less CSV and split it into train and test sets.

    The file is read with the column names ``ric``, ``'0'`` .. ``'49'`` and
    ``label``. ``ric`` and ``label`` are dropped from the features; the label
    is binarized with ``binarize_labels``.

    Args:
        data_file: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from a fixed-seed 80/20 split.
    """
    from sklearn.model_selection import train_test_split

    columns = ['ric']
    columns += [str(i) for i in range(NUM_EVENT_FEATURES)]
    columns += ['label']
    data = pd.read_csv(data_file, names=columns)

    y = binarize_labels(data['label'])
    features = data.drop(['ric', 'label'], axis=1)
    # random_state=0 keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, random_state=0)
    return X_train, X_test, y_train, y_test


def main(argv=None):
    """Train each classifier and print its metrics on the test split.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When omitted, ``sys.argv[1:]`` is used. The first
            argument, if present, is the data file path; otherwise
            ``DEFAULT_DATA_FILE`` is used.
    """
    from sklearn import metrics

    args = sys.argv[1:] if argv is None else list(argv)
    data_file = args[0] if args else DEFAULT_DATA_FILE

    # Ordered (display name, training function) pairs.
    classifiers = (
        ('Random Forest', random_forest_classifier),
        ('Decision Tree', decision_tree_classifier),
    )

    print('****Reading training and testing data...****')
    X_train, X_test, y_train, y_test = read_data(data_file)
    num_train, num_feat = X_train.shape
    num_test = X_test.shape[0]
    print('******************** Data Info *********************')
    print('#training data: %d, #testing_data: %d, dimension: %d'
          % (num_train, num_test, num_feat))

    for name, train in classifiers:
        print('******************* %s ********************' % name)
        start_time = time.time()
        model = train(X_train, y_train)
        print('training took %fs!' % (time.time() - start_time))

        # Predict on the held-out test features and compare with y_test.
        predict = model.predict(X_test)
        precision = metrics.precision_score(y_test, predict)
        # Accuracy is printed below, so it must be computed here.
        accuracy = metrics.accuracy_score(y_test, predict)
        recall = metrics.recall_score(y_test, predict)
        f_measure = metrics.f1_score(y_test, predict)

        print('precision ratio: %.2f%%' % (100 * precision))
        print('Accuracy ratio: %.2f%%' % (100 * accuracy))
        print('Recall ratio: %.2f%%' % (100 * recall))
        print('F-Measure ratio: %.2f%%' % (100 * f_measure))


if __name__ == '__main__':
    main()
精確率和準確率大概在54%,召回率只有30%+,還需要繼續調。