randomForest.py
After parameter tuning, the prediction accuracy also reached a maximum of 89%.
The random forest parameters and script:
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 09:30:24 2018
@author: Administrator
Random forest does not require feature preprocessing (scaling/normalization).
"""
# data preprocessing utilities (standardization / imputation)
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
# Chinese font setup for matplotlib
from matplotlib.font_manager import FontProperties
font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)

# feature file
varibleFileName="titantic.xlsx"
# target file
targetFileName="target.xlsx"
# read the Excel files
data=pd.read_excel(varibleFileName)
data_dummies=pd.get_dummies(data)
print('features after one-hot encoding:\n',list(data_dummies.columns))
features=data_dummies.ix[:,"Pclass":'Embarked_S']
x=features.values
# impute missing values with the most frequent value of each column
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(x)
x=imp.transform(x)
target=pd.read_excel(targetFileName)
y=target.values.ravel()   # flatten to 1d to avoid sklearn's column-vector warning
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
names=features.columns

trees=1000
max_depth=10
# n_estimators is the number of trees; in testing, 100 trees were already enough
forest=RandomForestClassifier(n_estimators=trees,random_state=0,max_depth=max_depth)
forest.fit(x_train,y_train)
print("random forest with %d trees:"%trees)
print("accuracy on the training subset:{:.3f}".format(forest.score(x_train,y_train)))
print("accuracy on the test subset:{:.3f}".format(forest.score(x_test,y_test)))
#print('Feature importances:{}'.format(forest.feature_importances_))

# rank the features by importance
importance=forest.feature_importances_
zipped = zip(importance,names)
list1=list(zipped)
list1.sort(reverse=True)
#print(list1)

# plot the feature importances (use the selected feature columns, not all dummy columns)
n_features=features.shape[1]
plt.barh(range(n_features),forest.feature_importances_,align='center')
plt.yticks(np.arange(n_features),names)
plt.title("random forest with %d trees,%d max_depth:"%(trees,max_depth))
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()

'''
random forest with 1000 trees:
accuracy on the training subset:0.983
accuracy on the test subset:0.878

random forest with 1000 trees,max_depth=4:
accuracy on the training subset:0.854
accuracy on the test subset:0.884

random forest with 1000 trees,max_depth=5:
accuracy on the training subset:0.853
accuracy on the test subset:0.887

random forest with 1000 trees,max_depth=9:
accuracy on the training subset:0.871
accuracy on the test subset:0.890
'''
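The max_depth values in the commented results above were swept by hand. A minimal sketch of automating that sweep with scikit-learn's GridSearchCV is shown below; it reuses x_train/y_train/x_test/y_test from the script above, the grid values are only illustrative, and note that GridSearchCV selects by cross-validated training accuracy rather than by looking at the test set.

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# candidate settings to try (illustrative values, not the ones reported above)
param_grid = {"n_estimators": [100, 500, 1000],
              "max_depth": [4, 5, 9, 10, None]}

grid = GridSearchCV(RandomForestClassifier(random_state=0),
                    param_grid, cv=5, n_jobs=-1)
grid.fit(x_train, y_train)   # arrays come from the script above

print("best parameters:", grid.best_params_)
print("best cross-validation accuracy:{:.3f}".format(grid.best_score_))
print("accuracy on the test subset:{:.3f}".format(grid.score(x_test, y_test)))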
After dropping the low-coverage variables, the random forest's accuracy actually went down, so it seems random forest does not need variables to be filtered by coverage.
Training accuracy: 0.983
Test accuracy: 0.878
'''
random forest with 1000 trees:
accuracy on the training subset:0.983
accuracy on the test subset:0.878
'''
By feature importance, sex ranks first at about 40%, age contributes roughly 18%, and fare roughly 17%.
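These percentages can be read off by pairing forest.feature_importances_ with the column names and sorting. A minimal sketch, reusing the forest and names objects from randomForest.py:

# pair each importance with its feature name and sort in descending order
ranked = sorted(zip(forest.feature_importances_, names), reverse=True)
for importance, name in ranked[:5]:
    print("{:<12s} {:.1%}".format(name, importance))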
logistic.py
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 29 22:39:35 2018
@author: Administrator
Logistic regression baseline on the same Titanic features.
"""
from sklearn.linear_model import LogisticRegression
# data preprocessing utilities (standardization / imputation)
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
# Chinese font setup for matplotlib
from matplotlib.font_manager import FontProperties
font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)

# feature file
varibleFileName="titantic.xlsx"
# target file
targetFileName="target.xlsx"
# read the Excel files
data=pd.read_excel(varibleFileName)
data_dummies=pd.get_dummies(data)
print('features after one-hot encoding:\n',list(data_dummies.columns))
features=data_dummies.ix[:,"Pclass":'Embarked_S']
x=features.values
# impute missing values with the most frequent value of each column
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(x)
x=imp.transform(x)
target=pd.read_excel(targetFileName)
y=target.values.ravel()
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
names=features.columns

logistic=LogisticRegression()
logistic.fit(x_train,y_train)
print("logistic:")
print("accuracy on the training subset:{:.3f}".format(logistic.score(x_train,y_train)))
print("accuracy on the test subset:{:.3f}".format(logistic.score(x_test,y_test)))

'''
logistic:
accuracy on the training subset:0.850
accuracy on the test subset:0.875
'''
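logistic.py fits the model on the raw imputed matrix. A variant worth trying (not verified here) is to standardize the features inside a Pipeline, which mainly helps the solver converge; the sketch below reuses x_train/x_test/y_train/y_test from the script above.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# scale inside a pipeline so the scaler is fit only on the training data
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(x_train, y_train)
print("scaled logistic accuracy on the test subset:{:.3f}".format(pipe.score(x_test, y_test)))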
So far the best result comes from the SVM after dropping the low-coverage variables, with a top accuracy of 0.89.
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 09:30:24 2018
@author: Administrator
SVM on the Titanic features; unlike random forest, SVM benefits from scaling.
"""
from sklearn.svm import SVC
# data preprocessing utilities (standardization / imputation)
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
# Chinese font setup for matplotlib
from matplotlib.font_manager import FontProperties
font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)

# feature file
varibleFileName="titantic.xlsx"
# target file
targetFileName="target.xlsx"
# read the Excel files
data=pd.read_excel(varibleFileName)
data_dummies=pd.get_dummies(data)
print('features after one-hot encoding:\n',list(data_dummies.columns))
features=data_dummies.ix[:,"Pclass":'Embarked_S']
x=features.values
# impute missing values with the most frequent value of each column
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(x)
x=imp.transform(x)
target=pd.read_excel(targetFileName)
y=target.values.ravel()
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
names=features.columns

# SVM on the raw (unscaled) features
svm=SVC()
svm.fit(x_train,y_train)
print("svc:")
print("accuracy on the training subset:{:.3f}".format(svm.score(x_train,y_train)))
print("accuracy on the test subset:{:.3f}".format(svm.score(x_test,y_test)))

'''
svc:
accuracy on the training subset:0.900
accuracy on the test subset:0.726
'''

# standardize the data
X_train_scaled = preprocessing.scale(x_train)
x_test_scaled = preprocessing.scale(x_test)

# default SVC on the scaled features
svm1=SVC()
svm1.fit(X_train_scaled,y_train)
print("accuracy on the scaled training subset:{:.3f}".format(svm1.score(X_train_scaled,y_train)))
print("accuracy on the scaled test subset:{:.3f}".format(svm1.score(x_test_scaled,y_test)))

'''
accuracy on the scaled training subset:0.866
accuracy on the scaled test subset:0.881
'''

# tune the C parameter; kernel is the kernel function used for the feature-space mapping,
# probability=True enables probability estimates
svm2=SVC(C=10,gamma="auto",kernel='rbf',probability=True)
svm2.fit(X_train_scaled,y_train)
print("after c parameter=10,accuracy on the scaled training subset:{:.3f}".format(svm2.score(X_train_scaled,y_train)))
print("after c parameter=10,accuracy on the scaled test subset:{:.3f}".format(svm2.score(x_test_scaled,y_test)))

'''
after c parameter=10,accuracy on the scaled training subset:0.878
after c parameter=10,accuracy on the scaled test subset:0.890
'''
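The script above fixes C=10 by hand and scales the training and test sets independently with preprocessing.scale. A minimal, untested sketch of searching C and gamma with GridSearchCV, using a StandardScaler that is fit only on the training data, might look like this (grid values are illustrative):

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# illustrative search grid for the pipeline's "svc" step
param_grid = {"svc__C": [0.1, 1, 10, 100],
              "svc__gamma": ["auto", 0.01, 0.1, 1]}

pipe = make_pipeline(StandardScaler(), SVC(kernel="rbf"))
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(x_train, y_train)   # arrays come from the script above

print("best parameters:", grid.best_params_)
print("accuracy on the test subset:{:.3f}".format(grid.score(x_test, y_test)))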
xgboost1.py
The results are also quite good:
AUC: 0.9464 ACC: 0.8841 Recall: 0.8716 F1-score: 0.8716 Precesion: 0.8716
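Note that Recall, F1-score, and Precision all coincide at 0.8716 here; that is expected whenever precision equals recall, since F1 = 2*P*R/(P+R), which reduces to P when P = R.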
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 09:30:24 2018
@author: Administrator
xgboost on the Titanic features; like random forest, it needs no scaling.
"""
import xgboost as xgb
# data preprocessing utilities (standardization / imputation)
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
# Chinese font setup for matplotlib
from matplotlib.font_manager import FontProperties
font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)

# feature file
varibleFileName="titantic.xlsx"
# target file
targetFileName="target.xlsx"
# read the Excel files
data=pd.read_excel(varibleFileName)
data_dummies=pd.get_dummies(data)
print('features after one-hot encoding:\n',list(data_dummies.columns))
features=data_dummies.ix[:,"Pclass":'Embarked_S']
x=features.values
# impute missing values with the most frequent value of each column
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(x)
x=imp.transform(x)
target=pd.read_excel(targetFileName)
y=target.values.ravel()
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
names=features.columns

dtrain=xgb.DMatrix(x_train,label=y_train)
dtest=xgb.DMatrix(x_test)

params={'booster':'gbtree',
    #'objective': 'reg:linear',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth':4,
    'lambda':10,
    'subsample':0.75,
    'colsample_bytree':0.75,
    'min_child_weight':2,
    'eta': 0.025,
    'seed':0,
    'nthread':8,
    'silent':1}

watchlist = [(dtrain,'train')]
bst=xgb.train(params,dtrain,num_boost_round=100,evals=watchlist)
ypred=bst.predict(dtest)

# threshold the predicted probabilities and compute evaluation metrics
y_pred = (ypred >= 0.5)*1

# model validation
print ('AUC: %.4f' % metrics.roc_auc_score(y_test,ypred))
print ('ACC: %.4f' % metrics.accuracy_score(y_test,y_pred))
print ('Recall: %.4f' % metrics.recall_score(y_test,y_pred))
print ('F1-score: %.4f' %metrics.f1_score(y_test,y_pred))
print ('Precesion: %.4f' %metrics.precision_score(y_test,y_pred))
metrics.confusion_matrix(y_test,y_pred)

print("xgboost:")
print('Feature importances:{}'.format(bst.get_fscore()))

'''
AUC: 0.9464
ACC: 0.8841
Recall: 0.8716
F1-score: 0.8716
Precesion: 0.8716

xgboost:
Feature importances:{'f5': 69, 'f1': 178, 'f2': 68, 'f4': 245, 'f6': 25, 'f0': 88, 'f3': 25, 'f194': 4, 'f193': 21, 'f195': 9}
'''
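The importance keys f0, f1, ... above are xgboost's default positional feature names. One way to get readable names (not done in the script) is to pass the column names to DMatrix; a minimal sketch, reusing params, x_train, y_train and names from xgboost1.py:

# give DMatrix the real column names so get_fscore() reports them directly
dtrain_named = xgb.DMatrix(x_train, label=y_train, feature_names=list(names))
bst_named = xgb.train(params, dtrain_named, num_boost_round=100)
print(bst_named.get_fscore())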
Decision tree
decisionTree.py
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 30 19:04:10 2018
@author: Administrator
Decision tree on the Titanic features, with a simple depth search.
"""
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

# feature file
varibleFileName="titantic.xlsx"
# target file
targetFileName="target.xlsx"
# read the Excel files
data=pd.read_excel(varibleFileName)
data_dummies=pd.get_dummies(data)
print('features after one-hot encoding:\n',list(data_dummies.columns))
features=data_dummies.ix[:,"Pclass":'Embarked_S']
x=features.values
# impute missing values with the most frequent value of each column
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(x)
x=imp.transform(x)
target=pd.read_excel(targetFileName)
y=target.values.ravel()
X_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
# feature names
names=features.columns

# depth search: limiting max_depth reduces model complexity and controls overfitting
list_average_accuracy=[]
depth=range(1,30)
for i in depth:
    tree= DecisionTreeClassifier(max_depth=i,random_state=0)
    tree.fit(X_train,y_train)
    accuracy_training=tree.score(X_train,y_train)
    accuracy_test=tree.score(x_test,y_test)
    average_accuracy=(accuracy_training+accuracy_test)/2.0
    #print("average_accuracy:",average_accuracy)
    list_average_accuracy.append(average_accuracy)

max_value=max(list_average_accuracy)
# list indices start at 0, so add 1 to recover the depth
best_depth=list_average_accuracy.index(max_value)+1
print("best_depth:",best_depth)

best_tree= DecisionTreeClassifier(max_depth=best_depth,random_state=0)
best_tree.fit(X_train,y_train)
accuracy_training=best_tree.score(X_train,y_train)
accuracy_test=best_tree.score(x_test,y_test)
print("decision tree:")
print("accuracy on the training subset:{:.3f}".format(best_tree.score(X_train,y_train)))
print("accuracy on the test subset:{:.3f}".format(best_tree.score(x_test,y_test)))

'''
best_depth: 19
decision tree:
accuracy on the training subset:0.976
accuracy on the test subset:0.860
'''

# plot the feature importances
n_features=x.shape[1]
plt.barh(range(n_features),best_tree.feature_importances_,align='center')
plt.yticks(np.arange(n_features),names)
plt.title("Decision Tree:")
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()

# write a .dot file; the image can be rendered later from the command line
export_graphviz(best_tree,out_file="Titanic.dot",class_names=['death','live'],feature_names=names,impurity=False,filled=True)
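The last line only writes Titanic.dot; turning it into an image requires Graphviz. Assuming the dot executable (and optionally the graphviz Python package) is installed, either route below should work; this is a sketch, not part of the original script.

# from the command line:  dot -Tpng Titanic.dot -o Titanic.png
# or from Python, if the graphviz package is installed:
import graphviz
with open("Titanic.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph).render("Titanic", format="png", cleanup=True)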