一、Importing all the libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score
二、Reading the file
還是蘑菇數據集,直接采用Kaggle競賽中22維特征 https://www.kaggle.com/uciml/mushroom-classification
數據集下載地址:http://download.csdn.net/download/u011630575/10266626
# path to where the data lies
dpath = './data/'
data = pd.read_csv(dpath+"mushrooms.csv")
data.head(6)
三、Let us check if there is any null values
data.isnull().sum() #檢查數據有沒有空值
四、check if we have two claasification. Either the mushroom is poisonous or edible
data['class'].unique() #檢查是否只有蘑菇的種類,有毒,可使用
print(data.dtypes)
五、check if 22 features(1st one is label) and 8124 instances
data.shape #22個特征 8124個樣例 第一個是標簽
六、The dataset has values in strings. We need to convert all the unique values to integers. Thus we perform label encoding on the data 標准化標簽
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder() #標准化標簽,將標簽值統一轉換成range(標簽值個數-1)范圍內
for col in data.columns:
data[col] = labelencoder.fit_transform(data[col])
data.head()
Separating features and label
X = data.iloc[:,1:23] # 獲取1-23行特征
y = data.iloc[:, 0] # 獲取0行標簽
X.head()
y.head()
Splitting the data into training and testing dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=4)
七、default Logistic Regression
from sklearn.linear_model import LogisticRegression
model_LR= LogisticRegression()
model_LR.fit(X_train,y_train)
y_prob = model_LR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_LR.score(X_test, y_pred)
注:np.where(condition,x,y) 是三元運算符,conditon條件成立則結果為x,否則為y。
accuracy
auc_roc=metrics.roc_auc_score(y_test,y_pred)
print(auc_roc)
八、Logistic Regression(Tuned model) 調整模型
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
LR_model= LogisticRegression()
tuned_parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] ,
'penalty': ['l1','l2']
}
九、CV
from sklearn.model_selection import GridSearchCV
LR= GridSearchCV(LR_model, tuned_parameters,cv=10)
LR.fit(X_train,y_train)
print(LR.best_params_)
y_prob = LR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
LR.score(X_test, y_pred)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
print(auc_roc)
十、Default Decision Tree model
from sklearn.tree import DecisionTreeClassifier
model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
y_prob = model_tree.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_tree.score(X_test, y_pred)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc
十一、Let us tune the hyperparameters of the Decision tree model
from sklearn.tree import DecisionTreeClassifier
model_DD = DecisionTreeClassifier()
tuned_parameters= { 'max_features': ["auto","sqrt","log2"],
'min_samples_leaf': range(1,100,1) , 'max_depth': range(1,50,1)
}
#tuned_parameters= { 'max_features': ["auto","sqrt","log2"] }
#If “auto”, then max_features=sqrt(n_features).
from sklearn.model_selection import GridSearchCV
DD = GridSearchCV(model_DD, tuned_parameters,cv=10)
DD.fit(X_train, y_train)
print(DD.grid_scores_)
print(DD.best_score_)
print(DD.best_params_)
y_prob = DD.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
DD.score(X_test, y_pred)
auc_roc=metrics.classification_report(y_test,y_pred)
print(auc_roc)
十二、Default Random Forest
from sklearn.ensemble import RandomForestClassifier
model_RR=RandomForestClassifier()
model_RR.fit(X_train,y_train)
y_prob = model_RR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_RR.score(X_test, y_pred)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc
十三、Let us tuned the parameters of Random Forest just for the purpose of knowledge
1) max_features
2) n_estimators 估計量
3) min_sample_leaf
from sklearn.ensemble import RandomForestClassifier
model_RR=RandomForestClassifier()
tuned_parameters = {'min_samples_leaf' range(10,100,10), 'n_estimators' : range(10,100,10),
'max_features':['auto','sqrt','log2']
}
from sklearn.model_selection import GridSearchCV
RR = GridSearchCV(model_RR, tuned_parameters,cv=10)
RR.fit(X_train,y_train)
print(RR.grid_scores_)
print(RR.best_score_)
print(RR.best_params_)
y_prob = RR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
RR_model.score(X_test, y_pred)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc
十四、Default XGBoost
from xgboost import XGBClassifier
model_XGB=XGBClassifier()
model_XGB.fit(X_train,y_train)
y_prob = model_XGB.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_XGB.score(X_test, y_pred)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc
十五、特征重要性
在XGBoost中特征重要性已經自動算好,存放在featureimportances
print(model_XGB.feature_importances_)
from matplotlib import pyplot
pyplot.bar(range(len(model_XGB.feature_importances_)), model_XGB.feature_importances_)
pyplot.show()
# plot feature importance using built-in function
from xgboost import plot_importance
plot_importance(model_XGB)
pyplot.show()
可以根據特征重要性進行特征選擇
from numpy import sort
from sklearn.feature_selection import SelectFromModel
# Fit model using each importance as a threshold
thresholds = sort(model_XGB.feature_importances_)
for thresh in thresholds:
# select features using threshold
selection = SelectFromModel(model_XGB, threshold=thresh, prefit=True)
select_X_train = selection.transform(X_train)
# train model
selection_model = XGBClassifier()
selection_model.fit(select_X_train, y_train)
# eval model
select_X_test = selection.transform(X_test)
y_pred = selection_model.predict(select_X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1],
accuracy*100.0))