Mainstream Machine Learning [xgb, lgb, Keras, LR]


Preprocess

# Generic preprocessing framework
import pandas as pd
import numpy as np
import scipy as sp

# File reading
def read_csv_file(f, logging=False):
    print("========== Reading data ==========")
    data = pd.read_csv(f)
    if logging:
        print(data.head(5))
        print(f, "contains the following columns")
        print(data.columns.values)
        print(data.describe())
        print(data.info())
    return data
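A hypothetical usage of the helper above ("train.csv" is only a placeholder path):

# Placeholder path; logging=True prints a preview and summary statistics
df = read_csv_file("train.csv", logging=True)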

LR

# Generic LogisticRegression framework
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# 1. load data
df_train = pd.DataFrame()
df_test = pd.DataFrame()
y_train = df_train['label'].values

# 2. process data
ss = StandardScaler()

# 3. feature engineering/encoding
# 3.1 For Labeled Feature
enc = OneHotEncoder()
feats = ["creativeID", "adID", "campaignID"]
for i, feat in enumerate(feats):
    # Fit the encoder on the training data only, then apply it to the test data,
    # so both sets share the same category-to-column mapping
    x_train = enc.fit_transform(df_train[feat].values.reshape(-1, 1))
    x_test = enc.transform(df_test[feat].values.reshape(-1, 1))
    if i == 0:
        X_train, X_test = x_train, x_test
    else:
        X_train, X_test = sparse.hstack((X_train, x_train)), sparse.hstack((X_test, x_test))

# 3.2 For Numerical Feature
# StandardScaler expects 2-D data; otherwise reshape(-1, len(feats)) is required
feats = ["price", "age"]
x_train = ss.fit_transform(df_train[feats].values)
x_test = ss.transform(df_test[feats].values)  # reuse the statistics fitted on the training set
X_train, X_test = sparse.hstack((X_train, x_train)), sparse.hstack((X_test, x_test))

# model training
lr = LogisticRegression()
lr.fit(X_train, y_train)
proba_test = lr.predict_proba(X_test)[:, 1]
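For a quick sanity check of the encoded features, cross-validated AUC on the training matrix can be computed. A minimal sketch; .tocsr() is needed because sparse.hstack returns a COO matrix, which does not support the row indexing that cross-validation performs:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated AUC over the one-hot + scaled feature matrix
scores = cross_val_score(lr, X_train.tocsr(), y_train, cv=5, scoring='roc_auc')
print("CV AUC: %.4f +/- %.4f" % (scores.mean(), scores.std()))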

LightGBM

Binary classification

import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

print("Loading Data ... ")
# Load data
train_x, train_y, test_x = load_data()

# Split the training data with sklearn.model_selection.train_test_split;
# test_size=0.05 gives a 95:5 train/validation split, adjust as needed
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.05,
    random_state=1,
    stratify=train_y  # keep the class distribution of y consistent with the original data
)
X_train = X
y_train = y
X_test = val_X
y_test = val_y

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss', 'auc'},
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,  # larger values mean stronger L2 regularization
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True
}

# train
print('Start training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000,
                valid_sets=lgb_eval,
                early_stopping_rounds=500)

print('Start predicting...')
preds = gbm.predict(test_x, num_iteration=gbm.best_iteration)  # outputs probabilities

# Export results
threshold = 0.5
results = [1 if pred > threshold else 0 for pred in preds]

# Export feature importance
importance = gbm.feature_importance()
names = gbm.feature_name()
with open('./feature_importance.txt', 'w+') as file:
    for index, im in enumerate(importance):
        string = names[index] + ', ' + str(im) + '\n'
        file.write(string)
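roc_auc_score is imported in the template but never used; a minimal sketch of validation evaluation with it (note that in this template X_test/y_test actually hold the validation split):

# Probability predictions on the held-out validation split
val_preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)
print('Validation AUC: %.5f' % roc_auc_score(y_test, val_preds))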

Multiclass classification

import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

print("Loading Data ... ")
# Load data
train_x, train_y, test_x = load_data()

# Split the training data with sklearn.model_selection.train_test_split;
# test_size=0.05 gives a 95:5 train/validation split, adjust as needed
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.05,
    random_state=1,
    stratify=train_y  # keep the class distribution of y consistent with the original data
)
X_train = X
y_train = y
X_test = val_X
y_test = val_y

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 9,
    'metric': 'multi_error',
    'num_leaves': 300,
    'min_data_in_leaf': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.4,
    'lambda_l2': 0.5,
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True
}

# train
print('Start training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000,
                valid_sets=lgb_eval,
                early_stopping_rounds=500)

print('Start predicting...')
preds = gbm.predict(test_x, num_iteration=gbm.best_iteration)  # outputs class probabilities

# Export results
results = [int(np.argmax(pred)) for pred in preds]

# Export feature importance
importance = gbm.feature_importance()
names = gbm.feature_name()
with open('./feature_importance.txt', 'w+') as file:
    for index, im in enumerate(importance):
        string = names[index] + ', ' + str(im) + '\n'
        file.write(string)
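A quick validation check for the multiclass model (a minimal sketch; X_test/y_test hold the validation split in this template):

# One row of 9 class probabilities per sample; argmax recovers the predicted class
val_preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)
val_labels = np.argmax(val_preds, axis=1)
print('Validation accuracy: %.5f' % np.mean(val_labels == np.asarray(y_test)))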

XGB

Binary classification

import numpy as np
import pandas as pd
import xgboost as xgb
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

train_x, train_y, test_x = load_data()  # build features

# Split the training data with sklearn.model_selection.train_test_split;
# test_size=0.01 gives a 99:1 train/validation split, adjust as needed
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.01,
    random_state=1,
    stratify=train_y
)

# build xgb matrices
xgb_val = xgb.DMatrix(val_X, label=val_y)
xgb_train = xgb.DMatrix(X, label=y)
xgb_test = xgb.DMatrix(test_x)

# xgboost model #####################
params = {
    'booster': 'gbtree',
    # 'objective': 'multi:softmax',   # multiclass classification
    # 'objective': 'multi:softprob',  # multiclass probabilities
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    # 'num_class': 9,        # number of classes, used together with multi:softmax
    'gamma': 0.1,            # minimum loss reduction to allow a further split; larger is more conservative, typically 0.1 or 0.2
    'max_depth': 8,          # tree depth; larger values overfit more easily
    'alpha': 0,              # L1 regularization coefficient
    'lambda': 10,            # L2 regularization on the weights; larger values make the model less prone to overfitting
    'subsample': 0.7,        # row subsampling of the training instances
    'colsample_bytree': 0.5, # column subsampling when building each tree
    'min_child_weight': 3,
    # min_child_weight defaults to 1: the minimum sum of instance hessians (h) in a leaf.
    # For an imbalanced 0-1 classification where h is around 0.01, min_child_weight=1
    # means a leaf needs roughly 100 samples. This parameter strongly affects results;
    # smaller values overfit more easily.
    'silent': 0,             # 1 suppresses training output; 0 is usually better
    'eta': 0.03,             # acts like a learning rate
    'seed': 1000,
    'nthread': -1,           # number of CPU threads
    'missing': 1,
    'scale_pos_weight': (np.sum(y == 0) / np.sum(y == 1))
    # handles class imbalance; usually sum(negative cases) / sum(positive cases)
    # 'eval_metric': 'auc'
}
plst = list(params.items())
num_rounds = 2000  # number of boosting iterations
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]

# cross validation
result = xgb.cv(plst, xgb_train, num_boost_round=200, nfold=4,
                early_stopping_rounds=200, verbose_eval=True,
                folds=StratifiedKFold(n_splits=4).split(X, y))

# Train the model and save it.
# When the iteration count is large, early_stopping_rounds stops training once the
# validation metric has not improved within the given number of rounds.
model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=200)
model.save_model('../data/model/xgb.model')  # persist the trained model
preds = model.predict(xgb_test)

# Export results
threshold = 0.5
results = [1 if pred > threshold else 0 for pred in preds]
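The xgb.cv call stores its evaluation history in result (a pandas DataFrame). With early stopping enabled the history is cut off at the best iteration, so its length can stand in for the round count; a minimal sketch, and this behavior should be verified against the installed xgboost version:

# Length of the CV history approximates the best number of boosting rounds
best_rounds = result.shape[0]
print('CV suggests about %d boosting rounds' % best_rounds)
print(result.tail(1))  # final row: mean/std of train and test logloss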

Keras

Binary classification

import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense, Activation
from keras.utils.np_utils import to_categorical
from model.util import load_data as load_data_1
from model.util_combine_train_test import load_data as load_data_2
from sklearn.preprocessing import StandardScaler  # feature standardization
from sklearn.preprocessing import Imputer

print("Loading Data ... ")
# Load data
train_x, train_y, test_x = load_data()  # build features
X_train = train_x.values
X_test = test_x.values
y = train_y

# Fill missing values with the column mean, then standardize
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_train = imp.fit_transform(X_train)

sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

model = Sequential()
model.add(Dense(256, input_shape=(X_train.shape[1],)))
model.add(Activation('tanh'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('tanh'))
model.add(Dropout(0.3))
model.add(Dense(256))
model.add(Activation('linear'))
model.add(Dense(1))  # must match the output dimension
model.add(Activation('sigmoid'))

# For a binary classification problem
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

epochs = 100
model.fit(X_train, y, epochs=epochs, batch_size=2000, validation_split=0.1, shuffle=True)

# Export results
threshold = 0.5
for index, case in enumerate(X_test):
    case = np.array([case])
    prediction_prob = model.predict(case)
    prediction = 1 if prediction_prob[0][0] > threshold else 0
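matplotlib is imported in the template but never used; a minimal sketch that captures the History object returned by model.fit and plots the loss curves (replace the plain fit call above with this one):

# Capture per-epoch metrics by keeping the return value of fit
history = model.fit(X_train, y, epochs=epochs, batch_size=2000,
                    validation_split=0.1, shuffle=True)

plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='validation loss')  # from validation_split
plt.xlabel('epoch')
plt.ylabel('binary cross-entropy')
plt.legend()
plt.show()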

Multiclass classification

import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense, Activation
from keras.utils.np_utils import to_categorical
from model.util import load_data as load_data_1
from model.util_combine_train_test import load_data as load_data_2
from sklearn.preprocessing import StandardScaler  # feature standardization
from sklearn.preprocessing import Imputer

print("Loading Data ... ")
# Load data
train_x, train_y, test_x = load_data()  # build features
X_train = train_x.values
X_test = test_x.values
y = train_y

# Feature processing
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
y = to_categorical(y)  # important: multiclass labels must be one-hot encoded

model = Sequential()
model.add(Dense(256, input_shape=(X_train.shape[1],)))
model.add(Activation('tanh'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('tanh'))
model.add(Dropout(0.3))
model.add(Dense(256))
model.add(Activation('linear'))
model.add(Dense(9))  # must match the output dimension
model.add(Activation('softmax'))

# For a multi-class classification problem
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

epochs = 200
model.fit(X_train, y, epochs=epochs, batch_size=200, validation_split=0.1, shuffle=True)

# Export results
for index, case in enumerate(X_test):
    case = np.array([case])
    prediction_prob = model.predict(case)
    prediction = np.argmax(prediction_prob)
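Predicting one sample at a time in a Python loop is slow; the same predictions can be produced with a single batched call and then written out (a minimal sketch; "submission.csv" is only a placeholder file name):

# One batched forward pass over the whole test set
prediction_probs = model.predict(X_test)
predictions = np.argmax(prediction_probs, axis=1)

# Hypothetical export; 'submission.csv' is only a placeholder
pd.DataFrame({'prediction': predictions}).to_csv('submission.csv', index=False)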



