1. Plotting the distribution of the model's residuals
#!/usr/bin/python
import pandas as pd
import numpy as np
import csv as csv
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

# Load the train file into a dataframe
train = pd.read_csv('train.csv', header=0)

# One-hot encode the feature columns (skip the id column and the target) and fill missing values with column means
df = pd.get_dummies(train.iloc[:, 1:-1])
df = df.fillna(df.mean())
X_train = df
y = train.price

# Helper: cross-validated RMSE for a given model
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv=3))
    return rmse

# Call LassoCV over the candidate alphas and fit with cross-validation (default cv=3)
model_lasso = LassoCV(alphas=[0.1, 1, 0.001, 0.0005]).fit(X_train, y)

matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)

# Put the model's predictions and the true values into a DataFrame as two columns
preds = pd.DataFrame({"preds": model_lasso.predict(X_train), "true": y})

# The difference between the true value and the prediction becomes a new "residuals" column
preds["residuals"] = preds["true"] - preds["preds"]
print(preds)

# Scatter plot with predictions on the x-axis and residuals on the y-axis
preds.plot(x="preds", y="residuals", kind="scatter")
plt.show()
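Note that the rmse_cv helper is defined above but never called in this snippet. A minimal sketch of how it could be used, reusing the X_train, y, and model_lasso objects already created (the Ridge alpha below is purely illustrative, not taken from the original example):

from sklearn.linear_model import Lasso, Ridge

# Evaluate the alpha chosen by LassoCV and a plain Ridge model for comparison
lasso_rmse = rmse_cv(Lasso(alpha=model_lasso.alpha_))  # alpha_ selected by LassoCV
ridge_rmse = rmse_cv(Ridge(alpha=10))                  # illustrative alpha only

print("Lasso CV RMSE: %.4f (+/- %.4f)" % (lasso_rmse.mean(), lasso_rmse.std()))
print("Ridge CV RMSE: %.4f (+/- %.4f)" % (ridge_rmse.mean(), ridge_rmse.std()))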
Note: this sample is only meant to illustrate the idea; only a few rows of data are used for prediction and plotting.
Normally, for a good model the residuals should be tightly clustered and fluctuate only slightly around 0, indicating that the residuals are all small.
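Besides eyeballing the scatter plot, a rough numerical check can back this up. A minimal sketch reusing the preds DataFrame built above: the residuals of a well-behaved model should average close to 0 and have a small spread relative to the spread of the target itself.

# Quick sanity check on the residuals from the preds DataFrame above
print("residual mean: %.4f" % preds["residuals"].mean())
print("residual std : %.4f" % preds["residuals"].std())
print("target std   : %.4f" % preds["true"].std())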