Kaggle 預測房價的代碼步驟


# -*- coding: utf-8 -*-
"""
Created on Sat Oct 20 14:03:05 2018

@author: 12958
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 忽略警告
import warnings
warnings.filterwarnings('ignore')
# Load the training and test sets from CSV files in the working directory.
train = pd.read_csv('train.csv')
train_len = len(train)
test = pd.read_csv('test.csv')

# Stack train and test so every feature transformation below is applied to
# both sets identically.
all_data = pd.concat([train, test], axis=0, ignore_index=True)
# Drop the target, and also the "Id" column: it is the row identifier written
# to the submission file, not a property of the house, so it must not be fed
# to the models as a feature.
all_data.drop(labels=["SalePrice", "Id"], axis=1, inplace=True)

# Plot the training sale-price distribution: raw prices on the left,
# log1p-transformed prices on the right. Each legend shows the skewness of
# the plotted values.
fig = plt.figure(figsize=(12, 5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
g1 = sns.distplot(
    train['SalePrice'],
    hist=True,
    label='skewness:{:.2f}'.format(train['SalePrice'].skew()),
    ax=ax1,
)
g1.legend()
g1.set(xlabel='Price')
g2 = sns.distplot(
    np.log1p(train['SalePrice']),
    hist=True,
    label='skewness:{:.2f}'.format(np.log1p(train['SalePrice']).skew()),
    ax=ax2,
)
g2.legend()
g2.set(xlabel='log(Price+1)')

plt.show()

# The sale price is skewed, so the models are trained on log1p(price).
train['SalePrice'] = np.log1p(train['SalePrice'])

# Log-transform every non-object (numeric) feature whose skewness exceeds 0.75.
# NOTE: skewness is measured on the combined train+test frame.
num_features_list = list(all_data.dtypes[all_data.dtypes != "object"].index)

for feature in num_features_list:
    if all_data[feature].dropna().skew() > 0.75:
        all_data[feature] = np.log1p(all_data[feature])

# One-hot encode the categorical columns.
all_data = pd.get_dummies(all_data)

# Report the number of missing values per column ...
print(all_data.isnull().sum())
# ... and fill them with the column mean.
all_data = all_data.fillna(all_data.mean())

# Split the combined frame back into train and test rows; the first
# ``train_len`` rows came from the training file.
X_train = all_data[:train_len]
X_test = all_data[train_len:]
Y_train = train['SalePrice']
from sklearn.linear_model import Ridge, LassoCV
from sklearn.model_selection import cross_val_score

# Cross-validation helper: model fit is judged by root-mean-squared error.
def rmse_cv(model):
    """Return the RMSE of ``model`` for each fold of a 5-fold cross-validation.

    The model is evaluated on the module-level ``X_train`` / ``Y_train``.
    The scorer used here yields negated mean squared errors, so the scores
    are negated again before taking the square root.
    """
    neg_mse_scores = cross_val_score(
        model,
        X_train,
        Y_train,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    return np.sqrt(-neg_mse_scores)
# Ridge model: compute the mean cross-validated RMSE for a grid of alphas.
model_ridge = Ridge()
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha=a)).mean() for a in alphas]
cv_ridge = pd.Series(cv_ridge, index=alphas)
# Print explicitly: a bare expression shows nothing when run as a script.
print(cv_ridge)

# Visualise the cross-validation score against alpha.
fig = plt.figure(figsize=(8, 5))
cv_ridge.plot(title='Cross Validation Score with Model Ridge')
plt.xlabel("alpha")
plt.ylabel("rmse")
plt.show()

# Report the best alpha and its RMSE from the data rather than hard-coding it
# (the original author's note states the minimum is reached at alpha = 10).
print("Best Ridge alpha: {}, mean CV RMSE: {:.5f}".format(cv_ridge.idxmin(), cv_ridge.min()))

# Lasso model. The original author's note states that its mean RMSE is lower
# than Ridge's, which is why Lasso is used for the final predictions.
model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X_train, Y_train)
print("Lasso mean CV RMSE: {:.5f}".format(rmse_cv(model_lasso).mean()))

# Inspect the coefficients: features with a zero coefficient have been
# dropped by the Lasso penalty.
coef = pd.Series(model_lasso.coef_, index=X_train.columns)
print("Lasso picked {} variables and eliminated the other {} variables".format((coef != 0).sum(), (coef == 0).sum()))

# Plot the 10 most negative and 10 most positive coefficients.
# (Original author's note: GrLivArea is the strongest positive feature.)
sorted_coef = coef.sort_values()
imp_coef = pd.concat([sorted_coef.head(10), sorted_coef.tail(10)])
fig = plt.figure(figsize=(6, 8))
imp_coef.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")
plt.show()

# Residual plot on the training set: residual = true - estimated.
est = pd.DataFrame({"est": model_lasso.predict(X_train), "true": Y_train})
plt.rcParams["figure.figsize"] = [6, 6]
est["resi"] = est["true"] - est["est"]
est.plot(x="est", y="resi", kind="scatter")
plt.show()



# XGBoost model
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test)

# Cross-validate to see how train/test RMSE evolve with the boosting rounds.
params = {"max_depth": 2, "eta": 0.1}
cv_xgb = xgb.cv(params, dtrain, num_boost_round=500, early_stopping_rounds=100)
# Start the plot at row label 30 of the CV results.
cv_xgb.loc[30:, ["test-rmse-mean", "train-rmse-mean"]].plot()
plt.show()

# Train the final XGBoost regressor.
model_xgb = xgb.XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1)
model_xgb.fit(X_train, Y_train)

# Notebook output of the fit call above, kept for reference:
# XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
#        max_depth=2, min_child_weight=1, missing=None, n_estimators=360,
#        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
#        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
#        silent=True, subsample=1)

# Predict with both models. The models were fitted on log1p(price), so expm1
# maps the predictions back to the price scale.
lasso_preds = np.expm1(model_lasso.predict(X_test))
xgb_preds = np.expm1(model_xgb.predict(X_test))
predictions = pd.DataFrame({"xgb": xgb_preds, "lasso": lasso_preds})
predictions.plot(x="xgb", y="lasso", kind="scatter")
plt.show()

# The final prediction is a weighted average of the two models.
preds = 0.7*lasso_preds + 0.3*xgb_preds
# The submission header must be "Id,SalePrice"; the original wrote the
# identifier column as lowercase "id".
result = pd.DataFrame({"Id": test.Id, "SalePrice": preds})
result.to_csv('result.csv', index=False)

需要實驗數據的請留言哦

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM