XGBoost official Python tutorial:
https://xgboost.readthedocs.io/en/latest/python/python_intro.html
Download address (Windows wheels):
https://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
Then open cmd, cd into the download directory, and finally run pip install followed by the filename of the wheel you downloaded.
Breast cancer data test code
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 19 21:04:14 2018
@author: 231469242@qq.com
WeChat public account: pythonEducation
"""
import xgboost as xgb
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in sklearn 0.20

cancer = load_breast_cancer()
train_x, test_x, train_y, test_y = train_test_split(cancer.data, cancer.target, random_state=0)

# XGBoost stores its data in DMatrix objects
dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 4,
          'lambda': 10,
          'subsample': 0.75,
          'colsample_bytree': 0.75,
          'min_child_weight': 2,
          'eta': 0.025,
          'seed': 0,
          'nthread': 8,
          'silent': 1}

watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)
ypred = bst.predict(dtest)  # predicted probabilities

# apply a 0.5 threshold to turn probabilities into class labels
y_pred = (ypred >= 0.5) * 1

# model evaluation
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))
'''
The model discriminates very well:
AUC: 0.9981
ACC: 0.9860
Recall: 0.9889
F1-score: 0.9889
Precision: 0.9889
'''
print("xgboost:")
print('Feature importances: {}'.format(bst.get_fscore()))
'''
Feature importances: {'f20': 33, 'f27': 50, 'f21': 54, 'f1': 29, 'f7': 33, 'f22': 38, 'f26': 17, 'f13': 46, 'f23': 41, 'f24': 13, 'f15': 2, 'f0': 6, 'f14': 5, 'f25': 7, 'f3': 6, 'f12': 3, 'f9': 3, 'f28': 11, 'f8': 2, 'f10': 9, 'f6': 9, 'f16': 2, 'f29': 1, 'f4': 4, 'f18': 3, 'f19': 2, 'f17': 2, 'f11': 1}
'''
# Plotting the importances (note a Booster has no get_score(X, y) accuracy method;
# use xgb.plot_importance, or pandas on bst.get_fscore()):
# import matplotlib.pylab as plt
# import pandas as pd
# feat_imp = pd.Series(bst.get_fscore()).sort_values(ascending=False)
# feat_imp.plot(kind='bar', title='Feature Importances')
# plt.ylabel('Feature Importance Score')
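The keys in the get_fscore() output above are positional names (f0, f1, ...) because the DMatrix was built from a bare NumPy array. A minimal sketch, assuming bst and cancer from the script above, for mapping them back to the dataset's real column names:

# Translate xgboost's positional feature names (f0, f1, ...) back to the
# column names that load_breast_cancer() provides.
scores = bst.get_fscore()                      # e.g. {'f27': 50, 'f21': 54, ...}
named = {cancer.feature_names[int(k[1:])]: v   # strip the leading 'f' to recover the column index
         for k, v in scores.items()}
for name, score in sorted(named.items(), key=lambda kv: -kv[1]):
    print('%-25s %d' % (name, score))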
Breast cancer data test code 2
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 17 10:10:45 2018
@author: 231469242@qq.com
WeChat public account: pythonEducation
"""
import pickle
import numpy as np
import xgboost as xgb
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
# train_test_split and GridSearchCV now live in model_selection
# (sklearn.cross_validation and sklearn.grid_search were removed)
from sklearn.model_selection import train_test_split, GridSearchCV
# import graphviz  # only needed for xgb.plot_tree below

cancer = load_breast_cancer()
train_x, test_x, train_y, test_y = train_test_split(cancer.data, cancer.target, random_state=0)
x = cancer.data
y = cancer.target

# Load the data. XGBoost can read LibSVM text files, NumPy 2-D arrays and its own
# binary cache files; whatever the source, the data ends up in a DMatrix object.
dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          # 'eval_metric': 'auc',
          'max_depth': 4,
          'lambda': 10,
          'subsample': 0.75,
          'colsample_bytree': 0.75,
          'min_child_weight': 2,
          'eta': 0.025,
          'seed': 0,
          'nthread': 8,
          'silent': 1}
params['eval_metric'] = ['auc', 'ams@0']
# params['eta'] = list(np.arange(0, 1, 0.1))

# Specify a validation set to watch performance.
evallist = [(dtrain, 'train')]
'''
eta_range = list(np.arange(0, 1, 0.1))  # parameter grid
param_grid = dict(eta=eta_range)
'''
# num_boost_round is the number of boosting iterations.
# evals is a list of (DMatrix, name) pairs evaluated during training, e.g.
# evals = [(dtrain, 'train'), (dval, 'val')] or evals = [(dtrain, 'train')];
# the first form lets us watch a validation set while training.
bst = xgb.train(params, dtrain, num_boost_round=100, evals=evallist)
ypred = bst.predict(dtest)

# apply a 0.5 threshold and print evaluation metrics
y_pred = (ypred >= 0.5) * 1
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

# plot feature importances
xgb.plot_importance(bst)
# plot a single tree (requires graphviz)
# xgb.plot_tree(bst, num_trees=2)

# save the model
bst.save_model('xgboost.model')
# The model and its feature map can also be dumped to a text file.
bst.dump_model('dump.raw.txt')
# dump model with feature map
bst.dump_model('dump.raw.txt', 'featmap.txt')

# pickle the trained booster
save_classifier = open("xgboost.pickle", "wb")
pickle.dump(bst, save_classifier)
save_classifier.close()
'''
# load the pickled booster back for testing
classifier_f = open("xgboost.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
'''
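GridSearchCV is imported above but only hinted at in the commented-out param_grid. A minimal sketch of how the eta search could actually be run, assuming the sklearn wrapper xgb.XGBClassifier (which exposes eta as learning_rate); the grid values are illustrative, not recommendations:

# Grid-search eta (learning_rate) with 5-fold CV scored on AUC.
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, train_test_split

cancer = load_breast_cancer()
train_x, test_x, train_y, test_y = train_test_split(
    cancer.data, cancer.target, random_state=0)

clf = xgb.XGBClassifier(max_depth=4, n_estimators=100, subsample=0.75,
                        colsample_bytree=0.75, min_child_weight=2)
param_grid = {'learning_rate': list(np.arange(0.025, 0.325, 0.05))}
search = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=5)
search.fit(train_x, train_y)
print(search.best_params_, search.best_score_)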
xgboost feature importance
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 17 10:34:20 2018
@author: 231469242@qq.com
WeChat public account: pythonEducation
"""
import operator
import pandas as pd
import xgboost as xgb
from matplotlib import pylab as plt

def create_feature_map(features):
    # one line per feature: <index>\t<name>\t<type>, 'q' = quantitative
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

def get_data():
    # Kaggle-style data: predict 'Hazard', mean-encode categorical columns
    train = pd.read_csv("../input/train.csv")
    features = list(train.columns[2:])
    y_train = train.Hazard
    for feat in train.select_dtypes(include=['object']).columns:
        m = train.groupby([feat])['Hazard'].mean()
        train[feat].replace(m, inplace=True)
    x_train = train[features]
    return features, x_train, y_train

def get_data2():
    # iris data as a self-contained stand-in
    from sklearn.datasets import load_iris
    iris = load_iris()
    x_train = pd.DataFrame(iris.data)
    features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    x_train.columns = features
    y_train = pd.DataFrame(iris.target)
    return features, x_train, y_train

# features, x_train, y_train = get_data()
features, x_train, y_train = get_data2()
create_feature_map(features)

xgb_params = {"objective": "reg:linear", "eta": 0.01, "max_depth": 8, "seed": 42, "silent": 1}
num_rounds = 1000
dtrain = xgb.DMatrix(x_train, label=y_train)
xgb_model = xgb.train(xgb_params, dtrain, num_rounds)

importance = xgb_model.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))

df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()  # normalize to relative importance

plt.figure()
df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(16, 10))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
plt.gcf().savefig('feature_importance_xgb.png')
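Because x_train in get_data2() is a pandas DataFrame, the DMatrix already carries the column names, so the hand-written feature map is strictly optional here: xgb_model.get_fscore() with no fmap argument should return the same importances keyed by the real column names. In the xgb.fmap file, the trailing 'q' marks a feature as quantitative ('i' would mark a binary indicator).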
XGBoost R package function index:
agaricus.test            Test part from Mushroom Data Set
agaricus.train           Training part from Mushroom Data Set
callbacks                Callback closures for booster training.
cb.cv.predict            Callback closure for returning cross-validation based predictions.
cb.early.stop            Callback closure to activate early stopping.
cb.evaluation.log        Callback closure for logging the evaluation history
cb.gblinear.history      Callback closure for collecting the model coefficients history of a gblinear booster during its training.
cb.print.evaluation      Callback closure for printing the result of evaluation
cb.reset.parameters      Callback closure for resetting the booster's parameters at each iteration.
cb.save.model            Callback closure for saving a model file.
dim.xgb.DMatrix          Dimensions of xgb.DMatrix
dimnames.xgb.DMatrix     Handling of column names of 'xgb.DMatrix'
getinfo                  Get information of an xgb.DMatrix object
predict.xgb.Booster      Predict method for eXtreme Gradient Boosting model
print.xgb.Booster        Print xgb.Booster
print.xgb.DMatrix        Print xgb.DMatrix
print.xgb.cv.synchronous Print xgb.cv result
setinfo                  Set information of an xgb.DMatrix object
slice                    Get a new DMatrix containing the specified rows of the original xgb.DMatrix object
xgb.Booster.complete     Restore missing parts of an incomplete xgb.Booster object.
xgb.DMatrix              Construct xgb.DMatrix object
xgb.DMatrix.save         Save xgb.DMatrix object to binary file
xgb.attr                 Accessors for serializable attributes of a model.
xgb.create.features      Create new features from a previously learned model
xgb.cv                   Cross Validation
xgb.dump                 Dump an xgboost model in text format.
xgb.gblinear.history     Extract gblinear coefficients history.
xgb.ggplot.deepness      Plot model trees deepness
xgb.ggplot.importance    Plot feature importance as a bar graph
xgb.importance           Importance of features in a model.
xgb.load                 Load xgboost model from binary file
xgb.model.dt.tree        Parse a boosted tree model text dump
xgb.parameters<-         Accessors for model parameters.
xgb.plot.multi.trees     Project all trees on one tree and plot it
xgb.plot.shap            SHAP contribution dependency plots
xgb.plot.tree            Plot a boosted tree model
xgb.save                 Save xgboost model to binary file
xgb.save.raw             Save xgboost model to R's raw vector; the user can call xgb.load to load the model back from the raw vector
xgb.train                eXtreme Gradient Boosting Training
xgboost-deprecated       Deprecation notices.
More information can be found in the vignettes in directory 'C:/Users/zhi.li04/Documents/R/win-library/3.3/xgboost/doc':
discoverYourData: Discover your data (source, pdf)
xgboostPresentation: Xgboost presentation (source, pdf)
xgboost: xgboost: eXtreme Gradient Boosting (source, pdf)
XGBoost parameter tuning
https://blog.csdn.net/wdxin1322/article/details/71698659?utm_source=itdadao&utm_medium=referral
Tree booster parameters:
- eta: default 0.3. Step-size shrinkage used to prevent overfitting. After each boosting step the weights of new features can be obtained directly; eta shrinks these feature weights to make the boosting process more conservative. Range: [0, 1]. Lower values make the model more robust to overfitting.
- gamma: default 0. Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm. Range: [0, ∞).
- max_depth: default 6. Maximum depth of a tree. Range: [1, ∞).
- min_child_weight: default 1. Minimum sum of instance weights (Hessian) needed in a child; if a partition would produce a leaf node whose instance-weight sum falls below this, the building process gives up further partitioning. In linear regression mode this simply corresponds to the minimum number of instances required per node. Larger values make the algorithm more conservative. Range: [0, ∞).
- max_delta_step: default 0. Maximum delta step allowed for each tree's weight estimate. 0 means no constraint; a positive value makes the update step more conservative. Usually this parameter is not needed, but it helps in logistic regression when the classes are extremely imbalanced; setting it to a value of 1-10 may help control the update. Range: [0, ∞).
- subsample: default 1. Subsample ratio of the training instances. Setting it to 0.5 means XGBoost randomly samples half of the data instances to grow trees, which prevents overfitting. Range: (0, 1].
- colsample_bytree: default 1. Subsample ratio of columns when constructing each tree. Range: (0, 1].
- colsample_bylevel: default 1. Subsample ratio of columns for each split level.
- max_leaf_nodes: maximum number of leaf nodes; default 2^6. (A dict form of these defaults is sketched just below this list.)
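A dict form of the tree-booster defaults just listed, as they would be passed to xgb.train (these are the documented defaults, not tuned values):

# Tree booster parameters at their documented defaults.
params = {
    'booster': 'gbtree',
    'eta': 0.3,              # step-size shrinkage, range [0, 1]
    'gamma': 0,              # min loss reduction to split further, range [0, inf)
    'max_depth': 6,          # range [1, inf)
    'min_child_weight': 1,   # min sum of instance hessian in a child, range [0, inf)
    'max_delta_step': 0,     # 0 = no constraint on the per-tree weight estimate
    'subsample': 1,          # row subsample ratio
    'colsample_bytree': 1,   # column subsample ratio per tree
    'colsample_bylevel': 1,  # column subsample ratio per level
}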
Linear booster parameters
- lambda and alpha: weights of the L2 and L1 regularization terms. lambda defaults to 1 and alpha defaults to 0 (see the gblinear sketch after this list).
- lambda_bias: L2 regularization term on the bias; default 0.
- scale_pos_weight: default 1. Controls the balance of positive and negative weights, which can speed up convergence on imbalanced classes.
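A minimal sketch of training with the linear booster and the regularization terms above, reusing dtrain from the breast-cancer scripts; the values shown are the defaults, and lambda_bias is a parameter of the xgboost versions contemporary with this post:

# gblinear fits an L1/L2-regularized linear model instead of trees.
linear_params = {
    'booster': 'gblinear',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'lambda': 1,       # L2 regularization weight (default 1)
    'alpha': 0,        # L1 regularization weight (default 0)
    'lambda_bias': 0,  # L2 regularization on the bias term (default 0)
}
lin_bst = xgb.train(linear_params, dtrain, num_boost_round=100,
                    evals=[(dtrain, 'train')])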
Task parameters
- base_score: default 0.5. The initial prediction score of all instances, i.e. the global bias.
- objective: default reg:linear. Specifies the learning task: linear regression, logistic regression, Poisson regression, and so on.
- eval_metric: the evaluation metric(s) for validation data. A default metric is assigned according to the objective (rmse for regression, error for classification, mean average precision for ranking, etc.).
- seed: random number seed, to make runs reproducible.
Tuning procedure
- First tune max_depth; it usually interacts little with the other parameters. Start at 10 and record the best error, then move the value and compare against it. For example, try 8: if the best error gets worse, try 12 next; if 12 beats 10, try 15 next.
- Once the best max_depth is found, tune subsample. Start at 1, then try 0.8; if the error gets worse, try 0.9 next, and if it is still worse, keep 1.0.
- Next tune min_child_weight in the same way.
- Then tune colsample_bytree.
- With this parameter set in hand, lower eta to 0.05 and let the program run to find the best num_round (the best point is where the validation error starts to rise; see the xgb.cv sketch below).
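That last step, finding the best num_round, can be automated with xgboost's built-in cross-validation and early stopping. A minimal sketch, assuming the params and dtrain objects from the scripts above:

# xgb.cv with early stopping: training stops once the test AUC has not
# improved for 50 rounds, and the returned frame is truncated at the best round.
params['eta'] = 0.05
cv_results = xgb.cv(params, dtrain, num_boost_round=2000, nfold=5,
                    metrics='auc', early_stopping_rounds=50, seed=0)
best_num_round = len(cv_results)
print('best num_round: %d' % best_num_round)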
Example xgboost configuration file (the same kinds of parameters in the text-config format used by the standalone binary):

# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic

# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 100
# maximum depth of a tree
max_depth = 6

# Task Parameters
# the number of rounds to do boosting
num_round = 50
# 0 means do not save any model except the final round model
save_period = 0
# The path of training data
data = "a.train"
# The path of validation data, used to monitor training process; here [test] sets the name of the validation set
eval[test] = "a.test"
# evaluate on training data as well each round
#eval_train = 1
# evaluation metrics for the validation data
eval_metric = "auc"
eval_metric = "error"
# The path of test data
test:data = "a.test"
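This config-file format belongs to the standalone CLI build of xgboost: the compiled binary takes the file as its argument (e.g. xgboost a.conf, assuming the executable is on your PATH). The Python scripts above express the same parameters as a params dict instead.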