xgboost是基於GBDT原理進行改進的算法,效率高,並且可以進行並行化運算;它還可以在訓練的過程中給出各個特徵的評分,從而表明每個特徵對模型訓練的重要性。
調用的源碼在此不準備詳述,本文主要側重的是計算的原理。函數get_fscore的源碼如下,源碼來自安裝包:xgboost/python-package/xgboost/core.py
通過下面的源碼可以看出,在默認的importance_type='weight'情況下,特徵評分可以看成是該特徵在所有決策樹中被用來分裂節點的次數;'gain'和'cover'則分別是該特徵作為分裂節點時的平均增益和平均覆蓋度。
def get_fscore(self, fmap=''):
    """Get the 'weight' importance of each feature.

    Thin wrapper that delegates to ``self.get_score`` with
    ``importance_type='weight'``.

    Parameters
    ----------
    fmap: str (optional)
        The name of feature map file

    Returns
    -------
    dict
        Whatever ``get_score`` returns for ``importance_type='weight'``:
        feature name -> number of split lines that use the feature.
    """
    return self.get_score(fmap, importance_type='weight')


def get_score(self, fmap='', importance_type='weight'):
    """Get feature importance of each feature.

    Importance type can be defined as:
        'weight' - the number of times a feature is used to split the data
                   across all trees.
        'gain' - the average gain of the feature when it is used in trees
        'cover' - the average coverage of the feature when it is used in trees

    The scores are computed by parsing the text dump returned by
    ``self.get_dump``. Only lines containing ``[`` are treated as split
    nodes; every other line is ignored. A feature that never appears in a
    split line is absent from the result.

    Parameters
    ----------
    fmap: str (optional)
        The name of feature map file
    importance_type: str (optional)
        One of 'weight', 'gain' or 'cover'.

    Returns
    -------
    dict
        Feature name -> score. For 'weight' the score is an integer count;
        for 'gain' and 'cover' it is the summed statistic divided by that
        count. Keys appear in the order the features are first seen.

    Raises
    ------
    ValueError
        If ``importance_type`` is not one of the supported values.
    """
    if importance_type not in ('weight', 'gain', 'cover'):
        msg = "importance_type mismatch, got '{}', expected 'weight', 'gain', or 'cover'"
        raise ValueError(msg.format(importance_type))

    # 'weight' only needs to know which feature each split uses, so the
    # cheaper dump without per-node statistics is requested in that case.
    want_stats = importance_type != 'weight'
    trees = self.get_dump(fmap, with_stats=want_stats)

    # Token that precedes the requested statistic on a split line,
    # i.e. 'gain=' or 'cover='. Unused for 'weight'.
    stat_key = importance_type + '='

    split_counts = {}  # feature name -> number of split lines using it
    stat_totals = {}   # feature name -> running sum of the statistic

    for tree in trees:
        for line in tree.split('\n'):
            # NOTE(review): presumably split lines look like
            # "0:[f0<0.5] yes=1,no=2,missing=1,gain=..,cover=.." --
            # confirm against the get_dump text format.
            pieces = line.split('[')
            if len(pieces) == 1:
                # No opening bracket: not a split node (e.g. a leaf).
                continue

            # pieces[1] is "<condition>] <stats>": the condition sits before
            # the closing bracket, the statistics (if any) after it.
            condition_and_stats = pieces[1].split(']')
            fid = condition_and_stats[0].split('<')[0]
            split_counts[fid] = split_counts.get(fid, 0) + 1

            if want_stats:
                raw = condition_and_stats[1].split(stat_key)[1].split(',')[0]
                value = float(raw)
                if fid in stat_totals:
                    stat_totals[fid] += value
                else:
                    stat_totals[fid] = value

    if not want_stats:
        return split_counts

    # Average the statistic over the number of splits that used the feature.
    return {fid: total / split_counts[fid] for fid, total in stat_totals.items()}