1.
LTR (learning to rank) is widely used in search ranking. Among open-source tools, Microsoft's RankLib is the best known, but it appears to be single-machine only and has not been updated for a long time. So I decided to use LightGBM for ranking instead. However, there is very little code online showing LightGBM used for ranking, while regression and classification examples abound. Here I post Python code for ranking with LightGBM, covering training, extracting leaf nodes, NDCG evaluation, prediction, and feature importance. Feel free to use it as a reference or adapt it.
In practice I also compared RankLib's LambdaMART against LightGBM. What impressed me most is LightGBM's training speed: it is blazingly fast. Where LambdaMART might take several hours to train, LightGBM needs only a few minutes, yet the NDCG scores afterwards are about the same; I did not see the slight accuracy edge for LightGBM that the paper reports. The main reasons LightGBM trains so fast are probably: a. histogram-based node splitting instead of pre-sorting; b. gradient-based one-side sampling (GOSS), a form of row sampling; c. exclusive feature bundling (EFB), a form of column sampling; d. the leaf-wise tree growth strategy; e. native support for categorical features. The sketch below shows where these knobs surface in LightGBM's parameters.
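A minimal sketch, assuming a pre-4.0 LightGBM where GOSS is selected via the `boosting` parameter; the parameter names are LightGBM's documented ones, but the values are illustrative, not tuned:

```python
# Illustrative only: where the speed features above surface in the parameter dict.
params = {
    'objective': 'lambdarank',
    'max_bin': 255,         # a. histogram-based splitting: number of histogram bins
    'boosting': 'goss',     # b. gradient-based one-side sampling (row sampling)
    'top_rate': 0.2,        # b. fraction of large-gradient rows always kept
    'other_rate': 0.1,      # b. fraction of the remaining rows sampled
    'enable_bundle': True,  # c. exclusive feature bundling (on by default)
    'num_leaves': 31,       # d. leaf-wise growth: complexity capped by leaf count
    # e. categorical features are passed via categorical_feature on lgb.Dataset
}
```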
2. Code
The first code block is the main script; the three code blocks after it are the data-loading and NDCG helpers it uses. Run the main script with one of the commands listed below, e.g. `python lgb.py -train` to train the model.
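All supported commands, taken from the script's own argument handling (the script name `lgb.py` follows the example above):

```
python lgb.py -process   # convert ranklib-format data into feats/group files
python lgb.py -train     # train and save the model
python lgb.py -predict   # score new data and return it sorted by predicted relevance
python lgb.py -ndcg      # report the average NDCG on the test data
python lgb.py -feature   # print feature importances
python lgb.py -leaf      # one-hot encode each sample's leaf indices
```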
The complete code and data format are available at https://github.com/jiangnanboy/learning_to_rank — take a look if you need them!
```python
import os
import sys
from datetime import datetime

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import datasets as ds
from sklearn.preprocessing import OneHotEncoder

# Helper modules from the repo (https://github.com/jiangnanboy/learning_to_rank):
# letor reads ranklib/LETOR-format data, ndcg computes NDCG.
import letor
import ndcg

def split_data_from_keyword(data_read, data_group, data_feats):
    '''
    Use pandas to convert the raw data into the feats/group format LightGBM needs.
    :param data_read: raw input file
    :param data_group: output path for the group sizes
    :param data_feats: output path for the features
    :return:
    '''
    with open(data_group, 'w', encoding='utf-8') as group_path:
        with open(data_feats, 'w', encoding='utf-8') as feats_path:
            dataframe = pd.read_csv(data_read,
                                    sep=' ',
                                    header=None,
                                    encoding="utf-8",
                                    engine='python')
            current_keyword = ''
            current_data = []
            group_size = 0
            for _, row in dataframe.iterrows():
                # column 0 = label, column 1 = qid, the rest are features
                feats_line = [str(row[0])]
                for i in range(2, len(dataframe.columns) - 1):
                    feats_line.append(str(row[i]))
                if current_keyword == '':
                    current_keyword = row[1]
                if row[1] == current_keyword:
                    current_data.append(feats_line)
                    group_size += 1
                else:
                    for line in current_data:
                        feats_path.write(' '.join(line))
                        feats_path.write('\n')
                    group_path.write(str(group_size) + '\n')

                    group_size = 1
                    current_data = []
                    current_keyword = row[1]
                    current_data.append(feats_line)

            for line in current_data:
                feats_path.write(' '.join(line))
                feats_path.write('\n')
            group_path.write(str(group_size) + '\n')

def save_data(group_data, output_feature, output_group):
    '''
    Save the group sizes and the features separately.
    :param group_data:
    :param output_feature:
    :param output_group:
    :return:
    '''
    if len(group_data) == 0:
        return
    output_group.write(str(len(group_data)) + '\n')
    for data in group_data:
        # keep only non-zero features:
        # feats = [p for p in data[2:] if float(p.split(":")[1]) != 0.0]
        feats = [p for p in data[2:]]
        output_feature.write(data[0] + ' ' + ' '.join(feats) + '\n')  # data[0] => label ; data[2:] => feats

def process_data_format(test_path, test_feats, test_group):
    '''
    Convert ranklib-format data into the format LightGBM needs.
    '''
    with open(test_path, 'r', encoding='utf-8') as fi:
        with open(test_feats, 'w', encoding='utf-8') as output_feature:
            with open(test_group, 'w', encoding='utf-8') as output_group:
                group_data = []
                group = ''
                for line in fi:
                    if not line:
                        break
                    if '#' in line:
                        line = line[:line.index('#')]
                    splits = line.strip().split()
                    if splits[1] != group:  # qid => splits[1]
                        save_data(group_data, output_feature, output_group)
                        group_data = []
                        group = splits[1]
                    group_data.append(splits)
                save_data(group_data, output_feature, output_group)

def load_data(feats, group):
    '''
    Load the data: features and labels (svmlight format), plus the query group sizes.
    '''
    x_train, y_train = ds.load_svmlight_file(feats)
    q_train = np.loadtxt(group)
    return x_train, y_train, q_train

def load_data_from_raw(raw_data):
    with open(raw_data, 'r', encoding='utf-8') as testfile:
        test_X, test_y, test_qids, comments = letor.read_dataset(testfile)
    return test_X, test_y, test_qids, comments

def train(x_train, y_train, q_train, model_save_path):
    '''
    Train and save the model.
    '''
    train_data = lgb.Dataset(x_train, label=y_train, group=q_train)
    params = {
        'task': 'train',            # task type
        'boosting_type': 'gbrt',    # base learner
        'objective': 'lambdarank',  # ranking objective
        'metric': 'ndcg',           # evaluation metric
        'max_position': 10,         # position for NDCG optimization
        'metric_freq': 1,           # output the metric every N iterations
        'train_metric': True,       # also report the metric on the training set
        'ndcg_at': [10],
        'max_bin': 255,             # max number of histogram bins (default 255); LightGBM compresses memory accordingly, e.g. with max_bin=255 each feature value fits in a uint8
        'num_iterations': 500,      # number of boosting iterations
        'learning_rate': 0.01,      # learning rate
        'num_leaves': 31,           # number of leaves
        # 'max_depth': 6,
        'tree_learner': 'serial',   # for parallel learning; 'serial' = single-machine tree learner
        'min_data_in_leaf': 30,     # minimum number of samples per leaf
        'verbose': 2                # print training info
    }
    gbm = lgb.train(params, train_data, valid_sets=[train_data])
    gbm.save_model(model_save_path)

def predict(x_test, comments, model_input_path):
    '''
    Predict scores and sort by them.
    '''
    gbm = lgb.Booster(model_file=model_input_path)  # load the model

    ypred = gbm.predict(x_test)

    predicted_sorted_indexes = np.argsort(ypred)[::-1]  # indices sorted by score, descending

    t_results = comments[predicted_sorted_indexes]  # the corresponding comments, in descending score order

    return t_results

def test_data_ndcg(model_path, test_path):
    '''
    Evaluate NDCG on the test data.
    '''
    with open(test_path, 'r', encoding='utf-8') as testfile:
        test_X, test_y, test_qids, comments = letor.read_dataset(testfile)

    gbm = lgb.Booster(model_file=model_path)
    test_predict = gbm.predict(test_X)

    average_ndcg, _ = ndcg.validate(test_qids, test_y, test_predict, 60)
    # average NDCG over all qids
    print("all qid average ndcg: ", average_ndcg)
    print("job done!")

def plot_print_feature_importance(model_path):
    '''
    Print the feature importances.
    '''
    # the model names features Column_<N>; map them back to real feature names here
    feats_dict = {
        'Column_0': 'name of feature 0',
        'Column_1': 'name of feature 1',
        'Column_2': 'name of feature 2',
        'Column_3': 'name of feature 3',
        'Column_4': 'name of feature 4',
        'Column_5': 'name of feature 5',
        'Column_6': 'name of feature 6',
        'Column_7': 'name of feature 7',
        'Column_8': 'name of feature 8',
        'Column_9': 'name of feature 9',
        'Column_10': 'name of feature 10',
    }
    if not os.path.exists(model_path):
        print("file does not exist! {}".format(model_path))
        sys.exit(0)

    gbm = lgb.Booster(model_file=model_path)

    # print and save the feature importances
    importances = gbm.feature_importance(importance_type='split')
    feature_names = gbm.feature_name()

    total = 0.
    for value in importances:
        total += value

    for feature_name, importance in zip(feature_names, importances):
        if importance != 0:
            feat_id = int(feature_name.split('_')[1]) + 1
            print('{} : {} : {} : {}'.format(feat_id, feats_dict[feature_name], importance, importance / total))

def get_leaf_index(data, model_path):
    '''
    Get the leaf indices of each sample and one-hot encode them.
    '''
    gbm = lgb.Booster(model_file=model_path)
    ypred = gbm.predict(data, pred_leaf=True)

    one_hot_encoder = OneHotEncoder()
    x_one_hot = one_hot_encoder.fit_transform(ypred)
    print(x_one_hot.toarray()[0])

if __name__ == '__main__':
    model_path = "path to save the model"

    if len(sys.argv) != 2:
        print("Usage: python lgb.py [-process | -train | -predict | -ndcg | -feature | -leaf]")
        sys.exit(0)

    if sys.argv[1] == '-process':
        # The training data has the same format as ranklib's, but it must be
        # converted into the format LightGBM's ranker needs: the sample features
        # and the group sizes are saved as separate txt files, i.e.:
        '''
        feats:
        1 1:0.2 2:0.4 ...
        2 1:0.2 2:0.4 ...
        1 1:0.2 2:0.4 ...
        3 1:0.2 2:0.4 ...
        group:
        2
        2
        here the first 2 in group means the first 2 rows belong to one qid,
        and the second 2 means the last 2 rows belong to another qid
        '''
        raw_data_path = 'path to the training data'
        data_feats = 'path to save the features'
        data_group = 'path to save the groups'
        process_data_format(raw_data_path, data_feats, data_group)

    elif sys.argv[1] == '-train':
        # train
        train_start = datetime.now()
        data_feats = 'path to the saved features'
        data_group = 'path to the saved groups'
        x_train, y_train, q_train = load_data(data_feats, data_group)
        train(x_train, y_train, q_train, model_path)
        train_end = datetime.now()
        consume_time = (train_end - train_start).seconds
        print("consume time : {}".format(consume_time))

    elif sys.argv[1] == '-predict':
        train_start = datetime.now()
        raw_data_path = 'path to the data to predict'  # same format as ranklib data
        test_X, test_y, test_qids, comments = load_data_from_raw(raw_data_path)
        t_results = predict(test_X, comments, model_path)
        train_end = datetime.now()
        consume_time = (train_end - train_start).seconds
        print("consume time : {}".format(consume_time))

    elif sys.argv[1] == '-ndcg':
        # evaluate the average NDCG of the test data
        test_path = 'path to the test data'
        test_data_ndcg(model_path, test_path)

    elif sys.argv[1] == '-feature':
        plot_print_feature_importance(model_path)

    elif sys.argv[1] == '-leaf':
        # use the model to get the one-hot leaf representation of the samples
        raw_data = 'path to the test data'
        with open(raw_data, 'r', encoding='utf-8') as testfile:
            test_X, test_y, test_qids, comments = letor.read_dataset(testfile)
        get_leaf_index(test_X, model_path)
```
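If you want to sanity-check the lambdarank setup before preparing real data, here is a minimal self-contained sketch on random data; all sizes, labels, and parameter values are made up for illustration and are not from the original code:

```python
import lightgbm as lgb
import numpy as np

# Synthetic ranking data: 20 queries with 10 documents each, 5 features.
rng = np.random.default_rng(42)
n_queries, docs_per_query, n_feats = 20, 10, 5
X = rng.normal(size=(n_queries * docs_per_query, n_feats))
y = rng.integers(0, 3, size=n_queries * docs_per_query)  # relevance levels 0-2
group = np.full(n_queries, docs_per_query)               # documents per query

train_set = lgb.Dataset(X, label=y, group=group)
params = {'objective': 'lambdarank', 'metric': 'ndcg', 'ndcg_at': [10],
          'num_leaves': 31, 'learning_rate': 0.05, 'verbose': -1}
booster = lgb.train(params, train_set, num_boost_round=50)

# Scores for the first query's documents; a higher score means predicted more relevant.
print(booster.predict(X[:docs_per_query]))
```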