一.過擬合
建模的目的是讓模型學習到數據的一般性規律,但有時候可能會學過頭,學到一些噪聲數據的特性,雖然模型可以在訓練集上取得好的表現,但在測試集上結果往往會變差,這時稱模型陷入了過擬合,接下來造一些偽數據進行演示:
import os
os.chdir('../')
from ml_models.linear_model import *
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Synthetic data: 100 evenly spaced points on the noiseless line y = 3*x + 2.
# The second column of X is the constant 1 that multiplies the intercept in w.
w = np.asarray([3, 2])
X = np.column_stack([np.linspace(0, 100, 100), np.ones(100)]).astype('float')
Y = X.dot(w).astype('float').reshape(100, 1)
# Add Gaussian noise (scaled by 3) to the feature column only; the targets
# were computed from the clean values above.
X[:, 0] += np.random.normal(size=X[:, 0].shape) * 3

# Fit on the feature column alone (the constant column is dropped) and plot.
lr = LinearRegression()
lr.fit(X[:, :-1], Y)
lr.plot_fit_boundary(X[:, :-1], Y)
目前看起來效果還是可以的,但如果加入幾個異常點,再看看效果如何:
# Append five outliers at x = 100..104 whose targets (3000..3900) lie far
# above the y = 3*x + 2 trend used to generate the rest of the data.
X = np.vstack([X, [[100, 1], [101, 1], [102, 1], [103, 1], [104, 1]]])
Y = np.vstack([Y, [[3000], [3300], [3600], [3800], [3900]]])

# Refit a fresh, unregularised model on the contaminated data and plot it.
lr = LinearRegression()
lr.fit(X[:, :-1], Y)
lr.plot_fit_boundary(X[:, :-1], Y)
二.正則化
可以看到,僅僅加入了幾個很離譜的異常點,就會對預測產生很大的影響,且偏離很遠,這在實際情況中是很常見的;通常可以通過對模型參數添加正則化約束來避免這種情況,使其不會太“飄”,做法是在loss函數中為權重\(w\)添加\(L_1\)或者\(L_2\)約束,借用上一節的公式推導,直接推出loss部分:
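1.線性回歸中添加\(L_1\)約束稱為Lasso回歸,其損失函數如下(其中\(f(x_i)\)為模型對第\(i\)個樣本的預測值,\(\lambda\)為\(L_1\)正則項的權重):
\[L(w)=\sum_{i=1}^{m}\left(y_i-f(x_i)\right)^2+\lambda\|w\|_1\]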
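2.線性回歸中添加\(L_2\)約束稱為Ridge回歸,其損失函數如下(其中\(f(x_i)\)為模型對第\(i\)個樣本的預測值,\(\alpha\)為\(L_2\)正則項的權重):
\[L(w)=\sum_{i=1}^{m}\left(y_i-f(x_i)\right)^2+\alpha\|w\|_2^2\]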
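3.如果不太確定用\(L_1\)好,還是\(L_2\)好,可以用它們的組合,稱作ElasticNet,損失函數如下(其中\(f(x_i)\)為模型對第\(i\)個樣本的預測值,\(\lambda\)、\(\alpha\)分別為\(L_1\)、\(L_2\)正則項的權重):
\[L(w)=\sum_{i=1}^{m}\left(y_i-f(x_i)\right)^2+\lambda\|w\|_1+\alpha\|w\|_2^2\]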
可以發現通過調整超參,可以控制\(w\)的大小:如果\(\lambda\)或\(\alpha\)設置得很大,\(w\)會被約束得很小;而如果\(\lambda\)和\(\alpha\)都設置為0,則等價於原始的不帶正則項的線性回歸。通常可以通過交叉驗證,根據驗證集上的表現來設置一個合適的超參。接下來在上一節線性回歸代碼的基礎上實現Lasso、Ridge、ElasticNet模型,另外設置兩個參數l1_ratio以及l2_ratio,分別用來控制\(L_1\)和\(L_2\)的loss部分的權重。
三.代碼實現
class LinearRegression(object):
    """Linear regression with optional L1 / L2 penalties on the weights.

    Setting only ``l1_ratio`` gives Lasso, only ``l2_ratio`` gives Ridge and
    setting both gives ElasticNet. When ``fit_intercept`` is true a constant
    column is appended to the inputs and its weight (the bias) is stored as
    the last row of ``self.w``; the bias is never penalised.
    """

    def __init__(self, fit_intercept=True, solver='sgd', if_standard=True,
                 epochs=10, eta=1e-2, batch_size=1,
                 l1_ratio=None, l2_ratio=None):
        """
        :param fit_intercept: whether to train a bias term
        :param solver: ``'sgd'`` (mini-batch gradient descent) or
            ``'closed_form'``
        :param if_standard: whether to standardise the features before fitting
        :param epochs: number of passes over the data for the SGD solver
        :param eta: learning rate for the SGD solver
        :param batch_size: number of samples per SGD update
        :param l1_ratio: weight of the L1 penalty, or None to disable it
        :param l2_ratio: weight of the L2 penalty, or None to disable it
        """
        self.w = None
        self.fit_intercept = fit_intercept
        self.solver = solver
        self.if_standard = if_standard
        # Always defined so attribute access is safe; only filled in by fit()
        # when if_standard is true.
        self.feature_mean = None
        self.feature_std = None
        self.epochs = epochs
        self.eta = eta
        self.batch_size = batch_size
        self.l1_ratio = l1_ratio
        self.l2_ratio = l2_ratio
        # Element-wise version of the scalar sign helper, used for the L1
        # (sub)gradient. `utils` must be in scope where this class is defined.
        self.sign_func = np.vectorize(utils.sign)

    def init_params(self, n_features):
        """
        Randomly initialise the weight column vector.

        :param n_features: number of rows of ``self.w`` (including the bias
            row when an intercept column has been appended)
        :return: None
        """
        self.w = np.random.random(size=(n_features, 1))

    def _penalty_mask(self, n_params):
        """Return an (n_params, 1) column of ones, with a zero in the last row
        when that row is the bias, so the bias is excluded from the penalty."""
        mask = np.ones(shape=(n_params, 1))
        if self.fit_intercept:
            mask[-1, 0] = 0
        return mask

    def _fit_closed_form_solution(self, x, y):
        """
        Solve for the weights directly.

        Without any penalty this is the pseudo-inverse least-squares solution;
        with only an L2 penalty it is the ridge normal-equation solution. As
        soon as an L1 penalty is requested the method falls back to SGD.

        :param x: design matrix (bias column already appended if used)
        :param y: targets
        :return: None
        """
        if self.l1_ratio is None and self.l2_ratio is None:
            self.w = np.linalg.pinv(x).dot(y)
        elif self.l1_ratio is None and self.l2_ratio is not None:
            # Penalise only real feature weights: the diagonal entry for the
            # bias column is zero, matching what the SGD solver does.
            penalty = self.l2_ratio * np.diag(self._penalty_mask(x.shape[1]).ravel())
            self.w = np.linalg.inv(x.T.dot(x) + penalty).dot(x.T).dot(y)
        else:
            self._fit_sgd(x, y)

    def _fit_sgd(self, x, y):
        """
        Fit the weights with mini-batch stochastic gradient descent.

        Uses ``self.epochs``, ``self.eta`` and ``self.batch_size``;
        ``self.w`` must already be initialised.

        :param x: design matrix (bias column already appended if used)
        :param y: targets, one value per row of ``x``
        :return: None
        """
        # Keep features and targets side by side so one shuffle permutes both.
        x_y = np.c_[x, y]
        n_samples = x_y.shape[0]
        mask = self._penalty_mask(x.shape[1])
        for _ in range(self.epochs):
            np.random.shuffle(x_y)
            # Step through every sample, including a trailing partial batch.
            for start in range(0, n_samples, self.batch_size):
                batch_x_y = x_y[start:start + self.batch_size]
                batch_x = batch_x_y[:, :-1]
                batch_y = batch_x_y[:, -1:]
                n_batch = batch_x_y.shape[0]
                # Gradient of the mean squared error over the batch.
                dw = -2 * batch_x.T.dot(batch_y - batch_x.dot(self.w)) / n_batch
                # Gradient of the L1 / L2 penalties; the mask zeroes the bias row.
                dw_reg = np.zeros(shape=(x.shape[1], 1))
                if self.l1_ratio is not None:
                    dw_reg += self.l1_ratio * self.sign_func(self.w) * mask / n_batch
                if self.l2_ratio is not None:
                    dw_reg += 2 * self.l2_ratio * self.w * mask / n_batch
                dw += dw_reg
                self.w = self.w - self.eta * dw

    def fit(self, x, y):
        """
        Train the model.

        :param x: feature matrix, one row per sample
        :param y: targets, one value per sample
        :return: None
        :raises ValueError: if ``self.solver`` is not a supported solver name
        """
        if self.solver not in ('closed_form', 'sgd'):
            raise ValueError("unknown solver: %r" % (self.solver,))
        # Optionally standardise the features.
        if self.if_standard:
            self.feature_mean = np.mean(x, axis=0)
            # Small constant keeps the division finite for constant features.
            self.feature_std = np.std(x, axis=0) + 1e-8
            x = (x - self.feature_mean) / self.feature_std
        # Optionally append the constant column whose weight is the bias.
        if self.fit_intercept:
            x = np.c_[x, np.ones_like(y)]
        # Initialise the parameters.
        self.init_params(x.shape[1])
        # Train.
        if self.solver == 'closed_form':
            self._fit_closed_form_solution(x, y)
        else:
            self._fit_sgd(x, y)

    def get_params(self):
        """
        Return the coefficients expressed on the original (unstandardised)
        feature scale.

        :return: w, b
        """
        if self.fit_intercept:
            w = self.w[:-1]
            b = self.w[-1]
        else:
            w = self.w
            b = 0
        if self.if_standard:
            # Undo the standardisation: ((x - mean) / std).w + b
            #   == x.(w / std) + (b - mean.(w / std))
            w = w / self.feature_std.reshape(-1, 1)
            b = b - w.T.dot(self.feature_mean.reshape(-1, 1))
        return w.reshape(-1), b

    def predict(self, x):
        """
        :param x: ndarray of shape m x n
        :return: m x 1
        """
        if self.if_standard:
            x = (x - self.feature_mean) / self.feature_std
        if self.fit_intercept:
            x = np.c_[x, np.ones(shape=x.shape[0])]
        return x.dot(self.w)

    def plot_fit_boundary(self, x, y):
        """
        Plot the samples (first feature against target) together with the
        fitted predictions drawn in red.

        :param x: feature matrix
        :param y: targets
        :return: None
        """
        plt.scatter(x[:, 0], y)
        plt.plot(x[:, 0], self.predict(x), 'r')
Lasso
# Lasso: refit on the outlier-contaminated data with an L1 penalty only.
lasso = LinearRegression(l1_ratio=100)
lasso.fit(X[:, :-1], Y)  # drop the constant column before fitting
lasso.plot_fit_boundary(X[:, :-1], Y)
Ridge
# Ridge: refit on the outlier-contaminated data with an L2 penalty only.
ridge = LinearRegression(l2_ratio=10)
ridge.fit(X[:, :-1], Y)  # drop the constant column before fitting
ridge.plot_fit_boundary(X[:, :-1], Y)
ElasticNet
# ElasticNet: refit with both the L1 and the L2 penalty switched on.
elastic = LinearRegression(l1_ratio=100, l2_ratio=10)
elastic.fit(X[:, :-1], Y)  # drop the constant column before fitting
elastic.plot_fit_boundary(X[:, :-1], Y)
將sign函數整理到ml_models.utils中