# -*- coding: utf-8 -*-
"""
House-price regression script: feature preprocessing and model evaluation.

Created on Sat Oct 20 14:03:05 2018

@author: 12958
"""
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LassoCV, Ridge
from sklearn.model_selection import cross_val_score

# Suppress library warnings so they do not clutter the script output.
warnings.filterwarnings('ignore')

# Load the training and test sets.
train = pd.read_csv('train.csv')
train_len = len(train)
test = pd.read_csv('test.csv')

# Stack the train and test rows so that every feature transform below is
# applied to both in one pass; the target column is removed so that only
# features remain.
# NOTE(review): only "SalePrice" is dropped here. `test.Id` is read later when
# the submission is written, so an identifier column appears to stay in the
# feature matrix -- confirm that this is intended.
all_data = pd.concat([train, test], axis=0, ignore_index=True).drop(
    columns=["SalePrice"]
)

# Compare the sale-price distribution of the training set before (left) and
# after (right) the log1p transform; each legend shows the sample skewness.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the figure is unchanged.
raw_price = train['SalePrice']
log_price = np.log1p(raw_price)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
g1 = sns.distplot(
    raw_price,
    hist=True,
    label='skewness:{:.2f}'.format(raw_price.skew()),
    ax=ax1,
)
g1.legend()
g1.set(xlabel='Price')
g2 = sns.distplot(
    log_price,
    hist=True,
    label='skewness:{:.2f}'.format(log_price.skew()),
    ax=ax2,
)
g2.legend()
g2.set(xlabel='log(Price+1)')
plt.show()

# The sale price is skewed, so the models are trained on log1p(SalePrice).
train['SalePrice'] = log_price

# Apply the same log1p transform to every non-object feature whose skewness
# (computed with missing values ignored) exceeds 0.75.
num_features_list = list(all_data.dtypes[all_data.dtypes != "object"].index)
feature_skew = all_data[num_features_list].skew()
skewed_features = feature_skew[feature_skew > 0.75].index
all_data[skewed_features] = np.log1p(all_data[skewed_features])

# One-hot encode the categorical features.
all_data = pd.get_dummies(all_data)

# Report the number of missing values per column.
print(all_data.isnull().sum())

# Fill the remaining missing values with the mean of their column.
all_data = all_data.fillna(all_data.mean())

# Split the combined frame back into its training and test parts.
X_train = all_data.iloc[:train_len]
X_test = all_data.iloc[train_len:]
Y_train = train['SalePrice']


def rmse_cv(model):
    """Return the per-fold RMSE of *model* under 5-fold cross-validation.

    The model is scored on the module-level ``X_train`` / ``Y_train``, so the
    error is expressed in log1p(SalePrice) units. The
    'neg_mean_squared_error' scorer yields negated MSE values, hence the sign
    flip before the square root.

    Args:
        model: Estimator accepted by ``cross_val_score``.

    Returns:
        Array holding one RMSE value per fold.
    """
    neg_mse = cross_val_score(
        model,
        X_train,
        Y_train,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    return np.sqrt(-neg_mse)


# Ridge model: mean cross-validated RMSE for each candidate alpha.
model_ridge = Ridge()
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = pd.Series(
    [rmse_cv(Ridge(alpha=a)).mean() for a in alphas],
    index=alphas,
)
# A bare `cv_ridge` expression shows nothing when run as a script, so the
# table is printed explicitly.
print(cv_ridge)

# Open the figure on which the cross-validation curve is drawn next.
fig = plt.figure(figsize=(8, 5))
# Plot the mean cross-validated RMSE of the Ridge model against alpha.
cv_ridge.plot(title='Cross Validation Score with Model Ridge')
plt.xlabel("alpha")
plt.ylabel("rmse")
plt.show()

# Author's observation: the RMSE is lowest at alpha = 10.
# Printed explicitly, because a bare expression shows nothing in a script.
print(cv_ridge.min())

# Lasso model. Author's observation: its mean RMSE is lower than Ridge's, so
# Lasso is the linear model used for the final prediction.
model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X_train, Y_train)
print(rmse_cv(model_lasso).mean())

# Inspect the coefficients: Lasso performs feature selection by driving the
# coefficients of unimportant features to exactly zero.
coef = pd.Series(model_lasso.coef_, index=X_train.columns)
print("Lasso picked {} variables and eliminated the other {} variables".format(
    (coef != 0).sum(), (coef == 0).sum()))

# Show the ten most negative and the ten most positive coefficients.
# Author's observation: GrLivArea (above-ground living area) is the most
# important positively correlated feature.
sorted_coef = coef.sort_values()
imp_coef = pd.concat([sorted_coef.head(10), sorted_coef.tail(10)])
fig = plt.figure(figsize=(6, 8))
imp_coef.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")
plt.show()

# Residual plot of the Lasso model on the training data.
est = pd.DataFrame({"est": model_lasso.predict(X_train), "true": Y_train})
plt.rcParams["figure.figsize"] = [6, 6]
est["resi"] = est["true"] - est["est"]
est.plot(x="est", y="resi", kind="scatter")
plt.show()

# XGBoost model.
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test)

# Cross-validate and plot the train/test RMSE from boosting round 30 onwards.
params = {"max_depth": 2, "eta": 0.1}
cv_xgb = xgb.cv(
    params,
    dtrain,
    num_boost_round=500,
    early_stopping_rounds=100,
)
cv_xgb.loc[30:, ["test-rmse-mean", "train-rmse-mean"]].plot()
plt.show()

# Train the final XGBoost regressor.
model_xgb = xgb.XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1)
model_xgb.fit(X_train, Y_train)
# Estimator repr recorded by the author after fitting:
#   XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#                colsample_bytree=1, gamma=0, learning_rate=0.1,
#                max_delta_step=0, max_depth=2, min_child_weight=1,
#                missing=None, n_estimators=360, n_jobs=1, nthread=None,
#                objective='reg:linear', random_state=0, reg_alpha=0,
#                reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
#                subsample=1)

# Compare the predictions of the two models. Both were trained on
# log1p(SalePrice), so expm1 maps the outputs back to the price scale.
lasso_preds = np.expm1(model_lasso.predict(X_test))
xgb_preds = np.expm1(model_xgb.predict(X_test))
predictions = pd.DataFrame({"xgb": xgb_preds, "lasso": lasso_preds})
predictions.plot(x="xgb", y="lasso", kind="scatter")
plt.show()

# The final result is a weighted average of the two models' predictions,
# written out as the submission file.
# NOTE(review): the identifier is read from `test.Id` but written under the
# lowercase header "id" -- confirm that the consumer of result.csv expects
# this spelling.
preds = 0.7 * lasso_preds + 0.3 * xgb_preds
result = pd.DataFrame({"id": test.Id, "SalePrice": preds})
result.to_csv('result.csv', index=False)
# Note from the original post: leave a comment if you need the experiment data.