機器學習入門
(注:無基礎也可快速入門,但想提高準確率還得多下功夫;文中各名詞不做過多解釋)
Python語言、pandas包、sklearn包 建議在Jupyter環境操作
操作步驟
1.pandas包加載給機器學習訓練的表格
依照機器學習領域的習慣,我們把特徵叫做X,目標叫做y;通常把表格的最後一列作為目標列
2.映射數據列為整型(Python做決策樹需要整型或者實數)
3.拆分訓練集、測試集
4.sklearn創建訓練模型、測試模型準確率等
5.預測結果導出
算法
1.PCA算法
2.LDA算法
3.線性回歸
4.邏輯回歸
5.樸素貝葉斯
6.決策樹
7.SVM
8.神經網絡
9.KNN算法
# End-to-end example: load a table with pandas, label-encode every column,
# train a decision tree, report hold-out accuracy, then score a second file
# and export the predictions to CSV.
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the training table. The 'target' column is the label (y); every other
# column is a feature (X). y must be taken before 'target' is dropped.
df = pd.read_csv('x_train.csv')
y = df.target
X = df.drop('target', axis=1)
# print(X.shape, X.head(10), y.shape, y.head(10))

# Convert every feature column to integer codes (there is room for
# optimisation here). One fitted LabelEncoder per column is kept in `d` so the
# prediction data can later be encoded with exactly the same mapping.
d = defaultdict(LabelEncoder)
X_encoded = X.apply(lambda x: d[x.name].fit_transform(x))
# X_encoded.tail(10)

# Split into training and test sets (75% / 25%, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.25, random_state=7)
# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Decision tree
clf = tree.DecisionTreeClassifier(max_depth=3)
clf = clf.fit(X_train, y_train)

# Accuracy on the held-out test set
print(accuracy_score(y_test, clf.predict(X_test)))

# F1 score (optional alternative metric)
# from sklearn import metrics
# predict_labels = clf.predict(X_test)
# F1_scores = metrics.f1_score(y_test, predict_labels, pos_label=0)
# print(F1_scores)


def encode_like_training(column):
    """Encode one prediction column with the encoder fitted on training data.

    Re-fitting a new LabelEncoder on the prediction file would assign integer
    codes that do not match the ones the model was trained on. Instead, the
    label -> code mapping of the training encoder is reused; values that never
    appeared during training are encoded as -1.

    Raises:
        KeyError: if the column did not exist in the training features.
    """
    if column.name not in d:
        raise KeyError(
            f"column {column.name!r} was not present in the training data")
    code_of = {label: code for code, label in enumerate(d[column.name].classes_)}
    return column.map(code_of).fillna(-1).astype(int)


# Prediction
# NOTE(review): the path has no '.csv' extension, unlike 'x_train.csv' --
# confirm the intended file name.
X_pred = pd.read_csv('x_test')
X_pred = X_pred.apply(encode_like_training)
pred_list = clf.predict(X_pred)
pred_proba_list = clf.predict_proba(X_pred)
print(pred_list)
print(pred_proba_list)
print(type(pred_list), type(pred_proba_list))

tag_list = pred_list.tolist()
# Column 1 of predict_proba is the probability of the second class in
# clf.classes_; this assumes a binary target -- TODO confirm.
proba_list = pred_proba_list[:, 1].tolist()

# Append the results to the (encoded) prediction table and export it.
X_pred["Proba"] = proba_list
X_pred["Tag"] = tag_list
X_pred.to_csv('./predict.csv', index=False, encoding='utf-8')

# Alternative model: linear SVM (requires `import joblib` to persist the model)
# from sklearn.svm import SVC
# # Train the model
# clf = SVC(kernel='linear')
# clf.fit(X_train, y_train)
# # Persist the model
# joblib.dump(clf, './model/svm_mode.pkl')
