Stacking methods for Kaggle House Prices (top 10%)


A write-up consolidating code from several notebooks.

Hide code cells

from IPython.display import HTML
from IPython.display import Image
import sys  
sys.path.append('.')  


HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')



Import Python data-processing and visualization packages

"""
pandas, numpy : data processing
matplotlib, seaborn : data visualization
warnings : silence warnings emitted by packages
"""
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn 
# data process
import pandas as pd
import numpy as np
# data visualization
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns
# set options for visualization
color = sns.color_palette()
sns.set_style('darkgrid')
#sns.set(style='white', context='notebook', palette='deep')
"""
# ignore annoying warnings from sklearn, seaborn, xgboost, lightgbm and other packages
# we can use !ls or !pip install package_name to run shell commands from the notebook
"""
# Set visualisation colours
mycols = ["#66c2ff", "#5cd6d6", "#00cc99", "#85e085", "#ffd966", "#ffb366", "#ffb3b3", "#dab3ff", "#c2c2d6"]
sns.set_palette(palette = mycols, n_colors = 4)
#or sns.set(style='white', context='notebook', palette='deep') 
print('Data Manipulation, Mathematical Computation and Visualisation packages imported!')

Data Manipulation, Mathematical Computation and Visualisation packages imported!

Import statistics tools

"""
Statistical packages used for transformations
stats: statistical functions in scipy
skew: skewness coefficient of a (skewed, non-normal) distribution
boxcox1p: transform a feature toward a normal distribution
(https://blog.csdn.net/u012735708/article/details/84755595, on estimating the lambda value)
pearsonr: Pearson correlation coefficient
"""
from scipy import stats 
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats.stats import pearsonr
print('Statistical packages imported!')
Statistical packages imported!
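Before applying these tools to the competition data, here is a minimal sketch on synthetic data (not part of the original notebook) of what skew and boxcox1p do:

# A minimal sketch on synthetic data: skew() measures asymmetry, and
# boxcox1p(x, lam) applies the Box-Cox transform to (1 + x), pulling a
# right-skewed variable toward normality.
import numpy as np
from scipy.stats import skew
from scipy.special import boxcox1p

rng = np.random.RandomState(0)
x = rng.lognormal(mean=0.0, sigma=1.0, size=1000)  # strongly right-skewed sample
print(skew(x))                   # large positive skewness
print(skew(boxcox1p(x, 0.15)))   # much closer to 0 after the transform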

Import regression algorithms


"""
ElasticNet: elastic net (combined L1/L2 regularization)
Lasso: L1-regularized linear regression
BayesianRidge: Bayesian ridge regression
Common linear regression models: http://blog.sina.com.cn/s/blog_62970c250102xfgb.html (LassoLarsIC is the one I know least)
ensemble methods: random forest regression, GBDT regression, XGBoost regression, LightGBM regression
Fix for "numpy.dtype size changed, may indicate binary incompatibility": the numpy version is too new; downgrade numpy
"""
# Algorithms used for modeling
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
import lightgbm as lgb
print('Algorithm packages imported!')    

Algorithm packages imported!

Import data preprocessing tools


"""
make_pipeline: construct a pipeline for processing data
RobustScaler: sometimes a dataset contains outliers, and Z-score standardization gives unsatisfactory results
because the outliers lose their outlier character after standardization. RobustScaler standardizes with statistics
that are robust to outliers and gives stronger control over the scaling of the centered data.
StandardScaler (Z-score): new = (original - mean) / std
Min-Max normalization: new = (original - min) / (max - min)
"""
# Pipeline and scaling preprocessing will be used for models that are sensitive
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
#     from sklearn.feature_selection import SelectFromModel
#     from sklearn.feature_selection import SelectKBest
#     from sklearn.feature_selection import chi2
#     (feature-selection modules are used less often here)
print('Pipeline and preprocessing packages imported!')
Pipeline and preprocessing packages imported!
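As a quick illustration on toy data (not from the notebook), the three scalings described above react very differently to an outlier:

# A minimal sketch: the outlier at 100 inflates StandardScaler's mean/std,
# while RobustScaler's median/IQR statistics barely move.
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

X = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])  # one extreme outlier
print(StandardScaler().fit_transform(X).ravel())  # (x - mean) / std
print(RobustScaler().fit_transform(X).ravel())    # (x - median) / IQR
print(MinMaxScaler().fit_transform(X).ravel())    # (x - min) / (max - min)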

Import model selection and tuning packages


# Model select packages used for sampling dataset and optimising parameters
"""
KFold: splits the data into K folds; each fold serves once as the validation set while the remaining K-1 folds form the training set, yielding K fitted models
cross_val_score: evaluation scores from cross-validation
train_test_split: split the data into a training set and a test (validation) set
GridSearchCV: grid search over parameters for model selection
ShuffleSplit: cross-validation over repeated random shuffled splits (like train_test_split with shuffle=True, repeated)
"""
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
print('Model selection packages imported!')
Model selection packages imported!
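A minimal sketch on toy data (not in the original notebook) of how KFold and cross_val_score fit together; the rmsle_cv function defined later uses the same pattern on the real data:

# KFold yields K train/validation index pairs; cross_val_score runs the
# fit/score loop over them.
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Lasso

X = np.arange(20, dtype=float).reshape(10, 2)
y = X.sum(axis=1)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, valid_idx in kf.split(X):
    print(len(train_idx), len(valid_idx))  # 8 train / 2 validation per fold
scores = cross_val_score(Lasso(alpha=0.1), X, y, cv=kf,
                         scoring='neg_mean_squared_error')
print(np.sqrt(-scores))  # negate and take the square root to get RMSE per fold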

from subprocess import check_output
print(check_output(['ls']).decode("utf8"))  # check the files available in the directory
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))  # set pandas numeric display to 3 decimal places
1-house-prices-solution-top-1.ipynb
Stacked Regressions _ Top 4% on LeaderBoard.ipynb
__pycache__
concat_kaggle_house_price.ipynb
data_description.txt
data_description.zip
final_submission.csv
input
kaggle house price.ipynb
laod_Algorithms.py
stacking-house-prices-walkthrough-to-top-5.ipynb
submission.csv

Load the data

def load_data():
    #Now let's import and put the train and test datasets in  pandas dataframe

    train = pd.read_csv('input/train.csv')
    test = pd.read_csv('input/test.csv')
    return train, test
train,test = load_data()
train.head()
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.000 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.000 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.000 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.000 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.000 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

test.head()
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1461 20 RH 80.000 11622 Pave NaN Reg Lvl AllPub ... 120 0 NaN MnPrv NaN 0 6 2010 WD Normal
1 1462 20 RL 81.000 14267 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN Gar2 12500 6 2010 WD Normal
2 1463 60 RL 74.000 13830 Pave NaN IR1 Lvl AllPub ... 0 0 NaN MnPrv NaN 0 3 2010 WD Normal
3 1464 60 RL 78.000 9978 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 6 2010 WD Normal
4 1465 120 RL 43.000 5005 Pave NaN IR1 HLS AllPub ... 144 0 NaN NaN NaN 0 1 2010 WD Normal

5 rows × 80 columns

train_ID = train['Id']
test_ID = test['Id']

# drop the Id column, since this feature carries no information
train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)

plt.subplots(figsize=(12,6))  # set the figure size
plt.subplot(1,2,1)  # first plot in a 1-row, 2-column layout
g = sns.regplot(x=train['GrLivArea'], y=train['SalePrice'], fit_reg=False).set_title('Before')
plt.subplot(1,2,2)  # second plot in a 1-row, 2-column layout
train = train.drop(train[train['GrLivArea']>4000].index)  # drop samples with GrLivArea > 4000 (axis=0 is the default)
g = sns.regplot(x=train['GrLivArea'], y=train['SalePrice'], fit_reg=False).set_title('After')

[figure: GrLivArea vs SalePrice scatter plots, before and after outlier removal]

"""
A P-P plot plots a variable's cumulative probabilities against the cumulative probabilities of a specified
theoretical distribution; it is used to visually check whether sample data follow that distribution.
If the data match the specified distribution, the points should lie roughly on the diagonal representing
the theoretical distribution.
"""

plt.subplots(figsize=(15,6))
plt.subplot(1,2,1)
g=sns.distplot(train['SalePrice'],fit=norm)

mu, sigma = norm.fit(train['SalePrice'])  # mean, standard deviation
skew_co = train['SalePrice'].skew()  # skewness coefficient
g.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} ),skew{:.2f}'.format(mu, sigma,skew_co)],
            loc='best')
plt.subplot(1,2,2)
g = stats.probplot(train['SalePrice'], plot=plt)

[figure: SalePrice distribution with fitted normal curve, and probability plot]

  • The target variable is right-skewed. As (linear) models love normally distributed data, we need to transform this variable and make it more normally distributed.
  • Most statistical theory and parametric tests are derived under normality, so it makes sense to transform the right-skewed target toward a normal distribution.
#We use the numpy fuction log1p which  applies log(1+x) to all elements of the column
train["SalePrice"] = np.log1p(train["SalePrice"])

#Check the new distribution 
sns.distplot(train['SalePrice'] , fit=norm);

# Get the fitted parameters used by the function
mu, sigma = norm.fit(train['SalePrice'])  # mean, standard deviation
skew_co = train['SalePrice'].skew()  # skewness coefficient

#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} ) skew{:.2f}'.format(mu, sigma,skew_co)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

[figure: transformed SalePrice distribution with fitted normal curve]

[figure: probability plot of the transformed SalePrice]

  • The skew now seems corrected and the data appears more normally distributed.
  • The probability plot shows that after the log1p transform the target variable is approximately normal.

Feature engineering

ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.SalePrice.values
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)
print("all_data size is : {}".format(all_data.shape))
all_data size is : (2915, 79)

Missing values

all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)
Missing Ratio
PoolQC 99.726
MiscFeature 96.398
Alley 93.208
Fence 80.446
FireplaceQu 48.714
LotFrontage 16.672
GarageQual 5.455
GarageCond 5.455
GarageFinish 5.455
GarageYrBlt 5.455
GarageType 5.386
BsmtExposure 2.813
BsmtCond 2.813
BsmtQual 2.779
BsmtFinType2 2.744
BsmtFinType1 2.710
MasVnrType 0.823
MasVnrArea 0.789
MSZoning 0.137
BsmtFullBath 0.069
plt.subplots(figsize=(12,5))  # set the figure size
sns.barplot(x=all_data_na.index, y=all_data_na)
plt.xticks(rotation='90')  # rotate the x-axis labels
plt.ylabel('percentage',fontsize=15)
plt.xlabel('feature',fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)
Text(0.5, 1.0, 'Percent missing data by feature')

[figure: bar chart of percent missing data by feature]

all_data["PoolQC"] = all_data["PoolQC"].fillna("None")
# per the data description, NA means no pool, so fill with "None"
all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")
all_data["Alley"] = all_data["Alley"].fillna("None")
all_data["Fence"] = all_data["Fence"].fillna("None")
all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)
    
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
all_data = all_data.drop(['Utilities'], axis=1)
all_data["Functional"] = all_data["Functional"].fillna("Typ")

all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")
#Check remaining missing values if any 
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head()
Missing Ratio

Categorical features: label encoding

from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[c].values)) 
    all_data[c] = lbl.transform(list(all_data[c].values))

# shape        
print('Shape all_data: {}'.format(all_data.shape))
Shape all_data: (2915, 78)
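One caveat: LabelEncoder assigns integer codes in alphabetical order, which does not match the semantic quality order (Ex > Gd > TA > Fa > Po). A minimal sketch of an explicit ordinal mapping as an alternative (the qual_map below is hypothetical and not used in this notebook; tree-based models are largely insensitive to this, linear models less so):

# A hypothetical alternative (not applied here): map quality codes explicitly
# so the integer order matches the semantic order.
import pandas as pd
qual_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
s = pd.Series(['Ex', 'TA', 'None', 'Gd'])
print(s.map(qual_map).tolist())  # [5, 3, 0, 4]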
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

Box-Cox transform

numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(20)
Skew in numerical features: 
Skew
MiscVal 21.932
PoolArea 18.702
LotArea 13.124
LowQualFinSF 12.080
3SsnPorch 11.368
LandSlope 4.971
KitchenAbvGr 4.299
BsmtFinSF2 4.143
EnclosedPorch 4.001
ScreenPorch 3.944
BsmtHalfBath 3.943
MasVnrArea 2.601
OpenPorchSF 2.529
WoodDeckSF 1.848
1stFlrSF 1.253
LotFrontage 1.093
GrLivArea 0.978
BsmtFinSF1 0.974
TotalSF 0.936
BsmtUnfSF 0.920
skewness = skewness[abs(skewness['Skew']) > 0.75]  # select the Skew column; masking the whole DataFrame would keep every row
print("There are {} highly skewed numerical features to Box Cox transform".format(skewness.shape[0]))

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #all_data[feat] += 1
    all_data[feat] = boxcox1p(all_data[feat], lam)
    
#all_data[skewed_features] = np.log1p(all_data[skewed_features])
There are 59 highly skewed numerical features to Box Cox transform
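The lambda above is fixed at 0.15 for every feature. As an alternative sketch (not part of the original notebook), scipy can estimate a per-feature lambda instead; since boxcox1p(x, lam) transforms 1 + x, the estimate is computed on the shifted values, assuming the features are non-negative:

# A minimal sketch replacing the loop above: estimate lambda per feature
# instead of fixing lam = 0.15.
from scipy.stats import boxcox_normmax
from scipy.special import boxcox1p

for feat in skewed_features:
    lam_hat = boxcox_normmax(all_data[feat] + 1)  # maximize normality of 1 + x
    all_data[feat] = boxcox1p(all_data[feat], lam_hat)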

One-hot encode categorical features


all_data = pd.get_dummies(all_data)
print(all_data.shape)
(2915, 220)

Data correlation

def get_data_coorelation(data):
    corr = data.corr()
    plt.subplots(figsize=(30,30))
    cmap = sns.diverging_palette(150, 250, as_cmap=True)  # a reusable template, especially for correlation matrices
    sns.heatmap(corr, cmap=cmap, vmax=1, vmin=-0.6, center=0.2, square=True, linewidths=0, cbar_kws={"shrink": .5}, annot=True)
get_data_coorelation(train)

[figure: correlation heatmap of the training data]

train = all_data[:ntrain]
test = all_data[ntrain:]

Modeling

from sklearn.metrics import mean_squared_error
# for alg in models:
#     model_name = alg.__class__.__name__
#     before_model_compare.loc[row_index,'Name'] = model_name
#     before_model_compare.loc[row_index,'Parameters'] = str(alg.get_params())
#     alg.fit(X_train,Y_train)
#     # cross_val_score returns negative values, so we convert them back to positive (mean squared error)
#     training_results = np.sqrt((-cross_val_score(alg,X_train,Y_train,cv=shuff,scoring='neg_mean_squared_error')).mean())
#     test_results = np.sqrt(((Y_test-alg.predict(X_test))**2).mean())
#     before_model_compare.loc[row_index,"Train mean_squared_error"] = training_results*100
#     before_model_compare.loc[row_index,'Test mean_squared_error'] = test_results*100
#     row_index+=1
#     print(row_index,model_name,"trained>>>>")
#Validation function
n_folds = 5

def rmsle_cv(model):
    # pass the KFold splitter itself; calling .get_n_splits() here would reduce
    # cv to a plain integer and silently drop the shuffling and random_state
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf))
    return rmse

Base models

LASSO Regression

This model may be very sensitive to outliers, so we need to make it more robust to them. For that we use sklearn's RobustScaler() in a pipeline.

lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
# how is alpha chosen? These values come from hyperparameter tuning
### Elastic Net Regression, likewise made robust to outliers via the same pipeline
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
###Kernel Ridge Regression :

KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
###Gradient Boosting Regression :
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
###XGBoost :
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)


####LightGBM :
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

Initial model evaluation

score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Lasso score: 0.1112 (0.0071)
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
ElasticNet score: 0.1112 (0.0072)
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Kernel Ridge score: 0.1152 (0.0071)
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Gradient Boosting score: 0.1163 (0.0085)

score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Xgboost score: 0.1161 (0.0051)
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))
LGBM score: 0.1154 (0.0052)

Stacking models

from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]

        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self

    # Now we do the predictions for the cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([model.predict(X) for model in self.models_])
        return np.mean(predictions, axis=1)  # return the mean of all model predictions

Averaging the base models' results

# average five base models: ENet, GBoost, KRR, lasso, model_lgb
averaged_models = AveragingModels(models = (ENet, GBoost, KRR, lasso,model_lgb))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
 Averaged base models score: 0.1082 (0.0068)

Adding a meta-model

print("""
Split the total training set into two disjoint sets (here train and .holdout )
1. 將原始的train set 分割為兩部分,train1,驗證集(其對應的目標數據為y-valid)
Train several base models on the first part (train)
2. 用基模型在train1訓練得到不同的基模型M1,M2,M3,M4...
Test these base models on the second part (holdout)
3. 用上面的模型預測 驗證集,得到rsult1,result2,reuslt3,result4...
Use the predictions from 3) (called out-of-folds predictions) as the inputs, 
將result1,result2,result3,result4....組成新的訓練集作為輸入  train2
and the correct responses (target variable) 驗證集的y-valid as the outputs to train a higher level learner called meta-model.
y-valid 作為輸出,然后可以訓練得到一個更加高級的模型
""")
# 前三步 一般是迭代而來,如果采用5fold的stacking,就需要先將訓練集train分割為5份,然后重復5次,得到模型對於整個訓練集的預測結果
# 將原始的訓練集組變成一個新的訓練集的一個特征M1r,
# 將不同的模型訓練得到的結果排列成新的輸入集[Mr1,Mr2,Mr3,Mr4....],將整個的訓練集的y值作為out-put,得到新的metal model
# 下圖的上層描述的是單獨一個模型的的5-flod過程,然后獲得該模型處理訓練數據之后的New_feature,
# 然后分別獲得不同模型的上述特征,組成新的輸入,訓練得到metal model
# 下層是我們上面用的平均的方法,獲得不同的結果,然后取平均
Split the total training set into two disjoint sets (here train and .holdout )
1. 將原始的train set 分割為兩部分,train1,驗證集(其對應的目標數據為y-valid)
Train several base models on the first part (train)
2. 用基模型在train1訓練得到不同的基模型M1,M2,M3,M4...
Test these base models on the second part (holdout)
3. 用上面的模型預測 驗證集,得到rsult1,result2,reuslt3,result4...
Use the predictions from 3) (called out-of-folds predictions) as the inputs, 
將result1,result2,result3,result4....組成新的訓練集作為輸入  train2
and the correct responses (target variable) 驗證集的y-valid as the outputs to train a higher level learner called meta-model.
y-valid 作為輸出,然后可以訓練得到一個更加高級的模型


(Image taken from Faron)

print("""
On this gif, the base models are algorithms 0, 1, 2 and the meta-model is algorithm 3. 
The entire training dataset is A+B (target variable y known) that we can split into train part (A) and holdout part (B). 
And the test dataset is C.

B1 (which is the prediction from the holdout part) is the new feature used to train the meta-model 3 
and C1 (which is the prediction from the test dataset) is the meta-feature on which the final prediction is done.

""")
On this gif, the base models are algorithms 0, 1, 2 and the meta-model is algorithm 3. 
The entire training dataset is A+B (target variable y known) that we can split into train part (A) and holdout part (B). 
And the test dataset is C.

B1 (which is the prediction from the holdout part) is the new feature used to train the meta-model 3 
and C1 (which is the prediction from the test dataset) is the meta-feature on which the final prediction is done.

class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_fold=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_fold = n_fold

    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]  # empty lists holding the per-fold fitted clones of each base model
        self.meta_model_ = clone(self.meta_model)
        k_fold = KFold(n_splits=self.n_fold, shuffle=True, random_state=43)

        # out-of-fold predictions: one column per base model, used as meta-features
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, hold_index in k_fold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[hold_index])
                out_of_fold_predictions[hold_index, i] = y_pred

        # train the meta-model on the out-of-fold predictions
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        # average the per-fold clones of each base model, then feed the
        # resulting meta-features to the trained meta-model
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)

Stacking averaged models score

stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, KRR),
                                                 meta_model = lasso)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
Stacking Averaged models score: 0.1081 (0.0069)

Ensembling the stacked regressor with XGBoost and LightGBM

def rmsle(y,y_pred):
    return np.sqrt(mean_squared_error(y,y_pred))

stacked_averaged_models.fit(train.values,y_train)
stacked_train_pred = stacked_averaged_models.predict(train.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
print(rmsle(y_train,stacked_train_pred))
0.07662229886245185
model_xgb.fit(train.values,y_train)
xgb_train_pred = model_xgb.predict(train.values)
xgb_pred = np.expm1(model_xgb.predict(test.values))
print(rmsle(y_train,xgb_train_pred))
0.07978944418551953
model_lgb.fit(train.values,y_train)
# these model parameters were all obtained via grid search
LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
              boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.2319, feature_fraction_seed=9,
              importance_type='split', learning_rate=0.05, max_bin=55,
              max_depth=-1, min_child_samples=20, min_child_weight=0.001,
              min_data_in_leaf=6, min_split_gain=0.0,
              min_sum_hessian_in_leaf=11, n_estimators=720, n_jobs=-1,
              num_leaves=5, objective='regression', random_state=None,
              reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)
lgb_train_pred= model_lgb.predict(train.values)
lgb_pred = np.expm1(model_lgb.predict(test.values))
print(rmsle(y_train,lgb_train_pred))
0.07145250287861045
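The comment above mentions grid search, but the notebook never shows that step. A minimal sketch of what it could look like for the lasso alpha (the grid below is illustrative, not the one actually used):

# A minimal sketch of the tuning step referenced above (illustrative grid only).
from sklearn.model_selection import GridSearchCV

param_grid = {'lasso__alpha': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(make_pipeline(RobustScaler(), Lasso(random_state=1)),
                    param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(train.values, y_train)
print(grid.best_params_, np.sqrt(-grid.best_score_))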
'''RMSE on the entire Train data when averaging'''

print('RMSLE score on train data:')
print(rmsle(y_train,stacked_train_pred*0.70 +
               xgb_train_pred*0.15 + lgb_train_pred*0.15 ))
RMSLE score on train data:
0.07431573219850335
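The 0.70/0.15/0.15 weights are hand-picked. A minimal sketch (not in the original notebook) of a coarse grid over weights summing to 1; it scores on training predictions, so it can overfit and serves only as a sanity check of the hand-picked values:

# Coarse search over ensemble weights that sum to 1.
best = (None, np.inf)
for w1 in np.arange(0.0, 1.01, 0.05):
    for w2 in np.arange(0.0, 1.01 - w1, 0.05):
        w3 = 1.0 - w1 - w2
        err = rmsle(y_train, w1*stacked_train_pred + w2*xgb_train_pred + w3*lgb_train_pred)
        if err < best[1]:
            best = ((round(w1, 2), round(w2, 2), round(w3, 2)), err)
print(best)  # compare against the hand-picked (0.70, 0.15, 0.15)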
ensemble = stacked_pred*0.70 + xgb_pred*0.15 + lgb_pred*0.15
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = ensemble
sub.to_csv('submission.csv',index=False)

GitHub repository: https://github.com/point6013/essay_for_kaggle_test

