自己实践一下在本章学到的一些方法
首先实践核心的部分,怎么实现一个分类模型,并通过验证曲线去优化模型,最后使用训练出来的模型进行预测
In [20]:
# Load the preprocessed HR data set.
import pandas as pd

data_path = '../data/hr-analytics/hr_data_processed.csv'
df = pd.read_csv(data_path)
df.columns  # trailing expression: the notebook displays the column index
Out[20]:
In [21]:
# Select the training set: the feature matrix X and the target vector y.
features = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'work_accident',
    'promotion_last_5years',
    'department_IT',
    'department_RandD',
    'department_accounting',
    'department_hr',
    'department_management',
    'department_marketing',
    'department_product_mng',
    'department_sales',
    'department_support',
    'department_technical',
    'salary_high',
    'salary_low',
    'salary_medium',
]
X = df[features].values
y = df['left'].values  # `left` is the target column
In [33]:
# Use a random forest classifier and compute validation-curve scores
# over a range of max_depth values.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, validation_curve
import numpy as np

# Seed the global NumPy RNG so that repeated runs draw the same random numbers.
np.random.seed(1)

max_depths = [3, 4, 5, 6, 7, 9, 12, 15, 18, 21]
clf = RandomForestClassifier(n_estimators=20)
print('Training {} models'.format(len(max_depths)))
train_scores, test_scores = validation_curve(
    estimator=clf,
    X=X,
    y=y,
    param_name="max_depth",
    param_range=max_depths,
    cv=5,
)
In [43]:
def plot_validation_curve(train_scores, test_scores,
                          param_range, xlabel='',
                          log=False, ylim=(0.9, 1.0)):
    '''
    Plot mean training and validation accuracy against a hyper-parameter.

    The mean over axis 1 of each score array is drawn as a line, with a
    shaded band of one standard deviation above and below it.

    Adapted from the scikit-learn docs:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

    Also here:
    https://github.com/rasbt/python-machine-learning-book-2nd-edition/blob/master/code/ch06/ch06.ipynb

    Parameters
    ----------
    train_scores, test_scores : 2-D array-like
        Scores with one row per entry of ``param_range``; each row is
        averaged (axis=1) to obtain one point of the curve.
    param_range : sequence
        Hyper-parameter values used for the x axis.
    xlabel : str, optional
        Label for the x axis; no label is set when empty.
    log : bool, optional
        Use a logarithmic x axis when true.
    ylim : tuple of (low, high) or None, optional
        Limits of the y axis. Defaults to ``(0.9, 1.0)``, the range that
        used to be hard-coded; pass ``None`` to let matplotlib autoscale so
        that curves below 0.9 are not clipped.

    Returns
    -------
    The newly created matplotlib figure.
    '''
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Look the palette up once instead of once per artist.
    palette = sns.color_palette('Set1')
    validation_color, training_color = palette[0], palette[1]

    fig = plt.figure()

    plt.plot(param_range, train_mean,
             color=training_color, marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15,
                     color=training_color)

    plt.plot(param_range, test_mean,
             color=validation_color, linestyle='--',
             marker='s', markersize=5,
             label='validation accuracy')
    plt.fill_between(param_range,
                     test_mean + test_std,
                     test_mean - test_std,
                     alpha=0.15, color=validation_color)

    if log:
        plt.xscale('log')
    plt.legend(loc='lower right')
    if xlabel:
        plt.xlabel(xlabel)
    plt.ylabel('Accuracy')
    if ylim is not None:
        plt.ylim(*ylim)
    return fig
In [45]:
import matplotlib.pyplot as plt import seaborn as sns
In [47]:
# Draw the validation curve and save the figure to disk.
plot_validation_curve(
    train_scores,
    test_scores,
    max_depths,
    xlabel='max_depth',
)
plt.xlim(3, 21)

figure_path = '../figures/test_classfication_model.png'
plt.savefig(figure_path, bbox_inches='tight', dpi=300)
In [58]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score
from IPython.display import display
from mlxtend.plotting import plot_decision_regions


def cross_val_class_score(clf, X, y, cv=10):
    """Run stratified k-fold validation and report the accuracy of each class.

    For every fold the classifier is fitted on the training part and used to
    predict the held-out part. The per-class accuracy is the diagonal of the
    confusion matrix divided by its row sums, i.e. the fraction of samples of
    each true class that were predicted correctly. One line is printed per
    fold.

    Parameters
    ----------
    clf : estimator providing ``fit`` and ``predict``
        Refitted in place on every fold.
    X, y : arrays indexable by integer index arrays
        Samples and their labels.
    cv : int, optional
        Number of stratified folds.

    Returns
    -------
    numpy.ndarray
        One row per fold holding the per-class accuracies.
    """
    splitter = StratifiedKFold(n_splits=cv)
    per_fold = []
    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(X, y), start=1):
        # Fit the model on the training part of this fold.
        clf.fit(X[train_idx], y[train_idx])
        predicted = clf.predict(X[test_idx])
        # Rows of the confusion matrix are true classes, so diagonal / row sum
        # is the share of each class that was classified correctly.
        cmat = confusion_matrix(y[test_idx], predicted)
        class_acc = cmat.diagonal() / cmat.sum(axis=1)
        per_fold.append(class_acc)
        print('fold: {:d} accuracy {:s}'.format(fold_no, str(class_acc)))
    return np.array(per_fold)
In [61]:
# Show the result of the k-fold validation (mean and spread per class).
np.random.seed(1)
clf = RandomForestClassifier(n_estimators=200, max_depth=6)
scores = cross_val_class_score(clf, X, y)

mean_per_class = scores.mean(axis=0)
std_per_class = scores.std(axis=0)
print('accuracy {} +/- {}'.format(mean_per_class, std_per_class))
In [69]:
# Box plot of the per-fold accuracy, one box per class.
fig = plt.figure(figsize=(5, 7))
score_frame = pd.DataFrame(scores, columns=[0, 1])
sns.boxplot(data=score_frame, palette=sns.color_palette('Set1'))
plt.xlabel('Left')
plt.ylabel('accuracy')
plt.show()
In [71]:
# Pair every feature importance with the name of its feature.
# feature_importances_ follows the column order of the matrix the model was
# fitted on (df[features]), so the names must come from `features`.
# df.columns also holds the target column `left` and need not share that
# order, which would attach importances to the wrong names.
d = (clf.feature_importances_, features)
list(zip(*d))
Out[71]:
In [75]:
# Visualise the feature importances as a horizontal bar chart, sorted ascending.
importance_series = pd.Series(
    clf.feature_importances_,
    name='Feature importance',
    index=df[features].columns,
)
importance_series.sort_values().plot.barh()
plt.show()
In [76]:
# List all low-importance features: everything after the five most important.
ranked = pd.Series(clf.feature_importances_, index=df[features].columns)
ranked = ranked.sort_values(ascending=False)
importances = list(ranked.index)
np.array(importances[5:])
Out[76]:
In [77]:
# Reduce the dimensionality of the low-importance features with PCA.
from sklearn.decomposition import PCA

# Features that are fed into the dimensionality reduction.
pca_features = [
    'work_accident',
    'salary_low',
    'salary_high',
    'salary_medium',
    'promotion_last_5years',
    'department_RandD',
    'department_hr',
    'department_technical',
    'department_support',
    'department_management',
    'department_sales',
    'department_accounting',
    'department_IT',
    'department_product_mng',
    'department_marketing',
]
X_reduce = df[pca_features]

# n_components is the number of features to reduce to.
pca = PCA(n_components=3).fit(X_reduce)

# This array holds the values of the new features.
X_pca = pca.transform(X_reduce)
In [78]:
# Add the reduced features to the data as new columns.
component_columns = (
    'first_principle_component',
    'second_principle_component',
    'third_principle_component',
)
# X_pca.T has one row per component, so each row becomes one column.
for column, component_values in zip(component_columns, X_pca.T):
    df[column] = component_values
In [80]:
# Build the new training set from the kept features plus the PCA components.
features = [
    'satisfaction_level',
    'number_project',
    'time_spend_company',
    'average_montly_hours',
    'last_evaluation',
    'first_principle_component',
    'second_principle_component',
    'third_principle_component',
]
X = df[features].values
y = df['left'].values
In [84]:
# Show the k-fold validation result for the new (reduced) training set.
np.random.seed(1)  # same seed as the earlier run so both reports are comparable
clf = RandomForestClassifier(n_estimators=200, max_depth=6)
scores = cross_val_class_score(clf, X, y)
# The label used to be misspelled "accruacy"; it now matches the earlier report.
print('accuracy {} +/- {}'.format(scores.mean(axis=0), scores.std(axis=0)))
In [93]:
# Box plot of the per-fold accuracy for the new training set, one box per class.
fig = plt.figure(figsize=(5, 7))
score_frame = pd.DataFrame(scores, columns=[0, 1])
sns.boxplot(data=score_frame, palette=sns.color_palette('Set1'))
plt.xlabel('Left')
plt.ylabel('Accuracy')
plt.show()
In [91]:
# Train the final model on the complete training set.
np.random.seed(1)
final_params = {'n_estimators': 200, 'max_depth': 6}
clf = RandomForestClassifier(**final_params)
clf.fit(X, y)
In [95]:
# Save the trained model to a binary file, then read it back from that file.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone joblib package and fall back to the vendored
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Use one path for both calls: the original dumped to "...TrainedTest.pkl" but
# loaded "...Trainedtest.pkl", which fails on case-sensitive file systems.
model_path = 'randomForestTrainedTest.pkl'
joblib.dump(clf, model_path)
# NOTE: joblib.load unpickles the file; only load files you created yourself.
clf = joblib.load(model_path)
In [96]:
# Apply the model to one specific sample: row 123, called "Xiao Ming" here.
xiaoming = df.iloc[123]
# NOTE: this rebinds X from the training matrix to a single-row Series.
X = xiaoming.loc[features]
X
Out[96]:
In [98]:
# Predict whether Xiao Ming will leave the company.
sample_row = list(X.values)
clf.predict([sample_row])  # wrapped in a list: predict expects a 2-D input
Out[98]:
In [100]:
# Show the probability that Xiao Ming belongs to class 0 or class 1.
single_sample = [X]
clf.predict_proba(single_sample)
Out[100]:
In [104]:
# Lower Xiao Ming's average_montly_hours and number_project to try to make
# him stay, then predict again.
X['average_montly_hours'] = 100
X['number_project'] = 2
clf.predict([X])
Out[104]:
In [106]:
# Class probabilities for the sample after the adjustments made above.
clf.predict_proba([X])
Out[106]: