randomForest.py
After parameter tuning, the prediction accuracy also reached a maximum of 89%.
The random forest parameters and script:
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 09:30:24 2018
@author: Administrator
Random forest does not require feature preprocessing (scaling/normalization).
"""
# data preprocessing utilities (standardization / imputation)
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
# Chinese font setup for matplotlib
from matplotlib.font_manager import FontProperties
font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)

# feature file
varibleFileName="titantic.xlsx"
# target file
targetFileName="target.xlsx"
# read the Excel files
data=pd.read_excel(varibleFileName)
data_dummies=pd.get_dummies(data)
print('features after one-hot encoding:\n',list(data_dummies.columns))
features=data_dummies.ix[:,"Pclass":'Embarked_S']
x=features.values
# impute missing values with the most frequent value of each column
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(x)
x=imp.transform(x)
target=pd.read_excel(targetFileName)
y=target.values.ravel()   # flatten to 1d to avoid sklearn's column-vector warning
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
names=features.columns

trees=1000
max_depth=10
# n_estimators is the number of trees; in testing, 100 trees were already enough
forest=RandomForestClassifier(n_estimators=trees,random_state=0,max_depth=max_depth)
forest.fit(x_train,y_train)
print("random forest with %d trees:"%trees)
print("accuracy on the training subset:{:.3f}".format(forest.score(x_train,y_train)))
print("accuracy on the test subset:{:.3f}".format(forest.score(x_test,y_test)))
#print('Feature importances:{}'.format(forest.feature_importances_))

# rank the features by importance
importance=forest.feature_importances_
zipped = zip(importance,names)
list1=list(zipped)
list1.sort(reverse=True)
#print(list1)

# plot the feature importances (use the selected feature columns, not all dummy columns)
n_features=features.shape[1]
plt.barh(range(n_features),forest.feature_importances_,align='center')
plt.yticks(np.arange(n_features),names)
plt.title("random forest with %d trees,%d max_depth:"%(trees,max_depth))
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()

'''
random forest with 1000 trees:
accuracy on the training subset:0.983
accuracy on the test subset:0.878

random forest with 1000 trees,max_depth=4:
accuracy on the training subset:0.854
accuracy on the test subset:0.884

random forest with 1000 trees,max_depth=5:
accuracy on the training subset:0.853
accuracy on the test subset:0.887

random forest with 1000 trees,max_depth=9:
accuracy on the training subset:0.871
accuracy on the test subset:0.890
'''
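The max_depth values in the commented results above were swept by hand. A minimal sketch of automating that sweep with scikit-learn's GridSearchCV is shown below; it reuses x_train/y_train/x_test/y_test from the script above, the grid values are only illustrative, and note that GridSearchCV selects by cross-validated training accuracy rather than by looking at the test set.

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# candidate settings to try (illustrative values, not the ones reported above)
param_grid = {"n_estimators": [100, 500, 1000],
              "max_depth": [4, 5, 9, 10, None]}

grid = GridSearchCV(RandomForestClassifier(random_state=0),
                    param_grid, cv=5, n_jobs=-1)
grid.fit(x_train, y_train)   # arrays come from the script above

print("best parameters:", grid.best_params_)
print("best cross-validation accuracy:{:.3f}".format(grid.best_score_))
print("accuracy on the test subset:{:.3f}".format(grid.score(x_test, y_test)))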
After dropping the low-coverage variables, the random forest's accuracy actually went down, so it seems random forest does not need variables to be filtered by coverage.
Training accuracy: 0.983
Test accuracy: 0.878
'''
random forest with 1000 trees:
accuracy on the training subset:0.983
accuracy on the test subset:0.878
'''
By feature importance, sex ranks first at about 40%, age contributes roughly 18%, and fare roughly 17%.
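These percentages can be read off by pairing forest.feature_importances_ with the column names and sorting. A minimal sketch, reusing the forest and names objects from randomForest.py:

# pair each importance with its feature name and sort in descending order
ranked = sorted(zip(forest.feature_importances_, names), reverse=True)
for importance, name in ranked[:5]:
    print("{:<12s} {:.1%}".format(name, importance))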
logistic.py
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 29 22:39:35 2018
@author: Administrator
Logistic regression baseline on the same Titanic features.
"""
from sklearn.linear_model import LogisticRegression
# data preprocessing utilities (standardization / imputation)
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
# Chinese font setup for matplotlib
from matplotlib.font_manager import FontProperties
font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)

# feature file
varibleFileName="titantic.xlsx"
# target file
targetFileName="target.xlsx"
# read the Excel files
data=pd.read_excel(varibleFileName)
data_dummies=pd.get_dummies(data)
print('features after one-hot encoding:\n',list(data_dummies.columns))
features=data_dummies.ix[:,"Pclass":'Embarked_S']
x=features.values
# impute missing values with the most frequent value of each column
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(x)
x=imp.transform(x)
target=pd.read_excel(targetFileName)
y=target.values.ravel()
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
names=features.columns

logistic=LogisticRegression()
logistic.fit(x_train,y_train)
print("logistic:")
print("accuracy on the training subset:{:.3f}".format(logistic.score(x_train,y_train)))
print("accuracy on the test subset:{:.3f}".format(logistic.score(x_test,y_test)))

'''
logistic:
accuracy on the training subset:0.850
accuracy on the test subset:0.875
'''
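logistic.py fits the model on the raw imputed matrix. A variant worth trying (not verified here) is to standardize the features inside a Pipeline, which mainly helps the solver converge; the sketch below reuses x_train/x_test/y_train/y_test from the script above.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# scale inside a pipeline so the scaler is fit only on the training data
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(x_train, y_train)
print("scaled logistic accuracy on the test subset:{:.3f}".format(pipe.score(x_test, y_test)))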
So far the best result comes from the SVM after dropping the low-coverage variables, with a top accuracy of 0.89.
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 09:30:24 2018
@author: Administrator
SVM on the Titanic features; unlike random forest, SVM benefits from scaling.
"""
from sklearn.svm import SVC
# data preprocessing utilities (standardization / imputation)
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
# Chinese font setup for matplotlib
from matplotlib.font_manager import FontProperties
font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)

# feature file
varibleFileName="titantic.xlsx"
# target file
targetFileName="target.xlsx"
# read the Excel files
data=pd.read_excel(varibleFileName)
data_dummies=pd.get_dummies(data)
print('features after one-hot encoding:\n',list(data_dummies.columns))
features=data_dummies.ix[:,"Pclass":'Embarked_S']
x=features.values
# impute missing values with the most frequent value of each column
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(x)
x=imp.transform(x)
target=pd.read_excel(targetFileName)
y=target.values.ravel()
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
names=features.columns

# SVM on the raw (unscaled) features
svm=SVC()
svm.fit(x_train,y_train)
print("svc:")
print("accuracy on the training subset:{:.3f}".format(svm.score(x_train,y_train)))
print("accuracy on the test subset:{:.3f}".format(svm.score(x_test,y_test)))

'''
svc:
accuracy on the training subset:0.900
accuracy on the test subset:0.726
'''

# standardize the data
X_train_scaled = preprocessing.scale(x_train)
x_test_scaled = preprocessing.scale(x_test)

# default SVC on the scaled features
svm1=SVC()
svm1.fit(X_train_scaled,y_train)
print("accuracy on the scaled training subset:{:.3f}".format(svm1.score(X_train_scaled,y_train)))
print("accuracy on the scaled test subset:{:.3f}".format(svm1.score(x_test_scaled,y_test)))

'''
accuracy on the scaled training subset:0.866
accuracy on the scaled test subset:0.881
'''

# tune the C parameter; kernel is the kernel function used for the feature-space mapping,
# probability=True enables probability estimates
svm2=SVC(C=10,gamma="auto",kernel='rbf',probability=True)
svm2.fit(X_train_scaled,y_train)
print("after c parameter=10,accuracy on the scaled training subset:{:.3f}".format(svm2.score(X_train_scaled,y_train)))
print("after c parameter=10,accuracy on the scaled test subset:{:.3f}".format(svm2.score(x_test_scaled,y_test)))

'''
after c parameter=10,accuracy on the scaled training subset:0.878
after c parameter=10,accuracy on the scaled test subset:0.890
'''
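The script above fixes C=10 by hand and scales the training and test sets independently with preprocessing.scale. A minimal, untested sketch of searching C and gamma with GridSearchCV, using a StandardScaler that is fit only on the training data, might look like this (grid values are illustrative):

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# illustrative search grid for the pipeline's "svc" step
param_grid = {"svc__C": [0.1, 1, 10, 100],
              "svc__gamma": ["auto", 0.01, 0.1, 1]}

pipe = make_pipeline(StandardScaler(), SVC(kernel="rbf"))
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(x_train, y_train)   # arrays come from the script above

print("best parameters:", grid.best_params_)
print("accuracy on the test subset:{:.3f}".format(grid.score(x_test, y_test)))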
xgboost1.py
The results are also quite good:
AUC: 0.9464 ACC: 0.8841 Recall: 0.8716 F1-score: 0.8716 Precesion: 0.8716
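Note that Recall, F1-score, and Precision all coincide at 0.8716 here; that is expected whenever precision equals recall, since F1 = 2*P*R/(P+R), which reduces to P when P = R.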
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 09:30:24 2018
@author: Administrator
xgboost on the Titanic features; like random forest, it needs no scaling.
"""
import xgboost as xgb
# data preprocessing utilities (standardization / imputation)
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
# Chinese font setup for matplotlib
from matplotlib.font_manager import FontProperties
font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)

# feature file
varibleFileName="titantic.xlsx"
# target file
targetFileName="target.xlsx"
# read the Excel files
data=pd.read_excel(varibleFileName)
data_dummies=pd.get_dummies(data)
print('features after one-hot encoding:\n',list(data_dummies.columns))
features=data_dummies.ix[:,"Pclass":'Embarked_S']
x=features.values
# impute missing values with the most frequent value of each column
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(x)
x=imp.transform(x)
target=pd.read_excel(targetFileName)
y=target.values.ravel()
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
names=features.columns

dtrain=xgb.DMatrix(x_train,label=y_train)
dtest=xgb.DMatrix(x_test)

params={'booster':'gbtree',
    #'objective': 'reg:linear',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth':4,
    'lambda':10,
    'subsample':0.75,
    'colsample_bytree':0.75,
    'min_child_weight':2,
    'eta': 0.025,
    'seed':0,
    'nthread':8,
    'silent':1}

watchlist = [(dtrain,'train')]
bst=xgb.train(params,dtrain,num_boost_round=100,evals=watchlist)
ypred=bst.predict(dtest)

# threshold the predicted probabilities and compute evaluation metrics
y_pred = (ypred >= 0.5)*1

# model validation
print ('AUC: %.4f' % metrics.roc_auc_score(y_test,ypred))
print ('ACC: %.4f' % metrics.accuracy_score(y_test,y_pred))
print ('Recall: %.4f' % metrics.recall_score(y_test,y_pred))
print ('F1-score: %.4f' %metrics.f1_score(y_test,y_pred))
print ('Precesion: %.4f' %metrics.precision_score(y_test,y_pred))
metrics.confusion_matrix(y_test,y_pred)

print("xgboost:")
print('Feature importances:{}'.format(bst.get_fscore()))

'''
AUC: 0.9464
ACC: 0.8841
Recall: 0.8716
F1-score: 0.8716
Precesion: 0.8716

xgboost:
Feature importances:{'f5': 69, 'f1': 178, 'f2': 68, 'f4': 245, 'f6': 25, 'f0': 88, 'f3': 25, 'f194': 4, 'f193': 21, 'f195': 9}
'''
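The importance keys f0, f1, ... above are xgboost's default positional feature names. One way to get readable names (not done in the script) is to pass the column names to DMatrix; a minimal sketch, reusing params, x_train, y_train and names from xgboost1.py:

# give DMatrix the real column names so get_fscore() reports them directly
dtrain_named = xgb.DMatrix(x_train, label=y_train, feature_names=list(names))
bst_named = xgb.train(params, dtrain_named, num_boost_round=100)
print(bst_named.get_fscore())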
Decision tree
decisionTree.py
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 30 19:04:10 2018
@author: Administrator
Decision tree on the Titanic features, with a simple depth search.
"""
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

# feature file
varibleFileName="titantic.xlsx"
# target file
targetFileName="target.xlsx"
# read the Excel files
data=pd.read_excel(varibleFileName)
data_dummies=pd.get_dummies(data)
print('features after one-hot encoding:\n',list(data_dummies.columns))
features=data_dummies.ix[:,"Pclass":'Embarked_S']
x=features.values
# impute missing values with the most frequent value of each column
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(x)
x=imp.transform(x)
target=pd.read_excel(targetFileName)
y=target.values.ravel()
X_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
# feature names
names=features.columns

# depth search: limiting max_depth reduces model complexity and controls overfitting
list_average_accuracy=[]
depth=range(1,30)
for i in depth:
    tree= DecisionTreeClassifier(max_depth=i,random_state=0)
    tree.fit(X_train,y_train)
    accuracy_training=tree.score(X_train,y_train)
    accuracy_test=tree.score(x_test,y_test)
    average_accuracy=(accuracy_training+accuracy_test)/2.0
    #print("average_accuracy:",average_accuracy)
    list_average_accuracy.append(average_accuracy)

max_value=max(list_average_accuracy)
# list indices start at 0, so add 1 to recover the depth
best_depth=list_average_accuracy.index(max_value)+1
print("best_depth:",best_depth)

best_tree= DecisionTreeClassifier(max_depth=best_depth,random_state=0)
best_tree.fit(X_train,y_train)
accuracy_training=best_tree.score(X_train,y_train)
accuracy_test=best_tree.score(x_test,y_test)
print("decision tree:")
print("accuracy on the training subset:{:.3f}".format(best_tree.score(X_train,y_train)))
print("accuracy on the test subset:{:.3f}".format(best_tree.score(x_test,y_test)))

'''
best_depth: 19
decision tree:
accuracy on the training subset:0.976
accuracy on the test subset:0.860
'''

# plot the feature importances
n_features=x.shape[1]
plt.barh(range(n_features),best_tree.feature_importances_,align='center')
plt.yticks(np.arange(n_features),names)
plt.title("Decision Tree:")
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()

# write a .dot file; the image can be rendered later from the command line
export_graphviz(best_tree,out_file="Titanic.dot",class_names=['death','live'],feature_names=names,impurity=False,filled=True)
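The last line only writes Titanic.dot; turning it into an image requires Graphviz. Assuming the dot executable (and optionally the graphviz Python package) is installed, either route below should work; this is a sketch, not part of the original script.

# from the command line:  dot -Tpng Titanic.dot -o Titanic.png
# or from Python, if the graphviz package is installed:
import graphviz
with open("Titanic.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph).render("Titanic", format="png", cleanup=True)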