import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Create a 2x2 ndarray and view it as an np.matrix.
a = np.array([[1, 2], [3, 4]])
# np.asmatrix is the supported spelling; the np.mat alias was removed in
# NumPy 2.0. Like np.mat, it does not copy the underlying data.
m = np.asmatrix(a)
m

matrix([[1, 2],
        [3, 4]])

# Matrix operations recap
# Transpose
m.T
# Matrix multiplication: np.matrix overloads * as the matrix product ...
m * m
# ... whereas * on a plain ndarray multiplies element-wise.
a * a
# Determinant
np.linalg.det(m)
# Inverse
m.I
# Convert to a plain ndarray
m.A
# Flatten. An np.matrix always stays 2-D, so this yields a 1 x N matrix;
# use m.A1 when a true 1-D ndarray is needed.
m.flatten()

matrix([[-2. ,  1. ],
        [ 1.5, -0.5]])

假設輸入數據為DataFrame格式，最後一列為標簽值，在此基礎上編寫線性回歸自定義函數（最小二乘）

#矩陣公式
w = (X.T * X).I * X.T * y

#根據最小二乘法推導得 w = (X.T * X).I * X.T * y。注：如果 (X.T * X) 不滿足可逆性，那麼最小二乘無解；另外，若不滿足凸函數，也無解
#又因為特征矩陣在存在多重共線性的情況下，(X.T * X) 不滿足可逆性，所以在做回歸之前，需要先消除多重共線性
def standRegres(dataSet):
    """Fit ordinary least-squares weights with the normal equation.

    Computes ``w = (X^T X)^{-1} X^T y`` where ``X`` is every column of
    ``dataSet`` except the last and ``y`` is the last column. No intercept
    column is added here; include a constant column of ones in ``dataSet``
    if the model should have an intercept term.

    Args:
        dataSet: pandas DataFrame whose last column holds the label values
            and whose remaining columns hold the features.

    Returns:
        An ``np.matrix`` of shape ``(n_features, 1)`` with the weights, or
        ``None`` (after printing a message) when ``X^T X`` is not full rank
        and therefore cannot be inverted.
    """
    # Go DataFrame -> ndarray -> matrix so that * below means matrix product.
    # np.asmatrix replaces the np.mat alias that NumPy 2.0 removed.
    xMat = np.asmatrix(dataSet.iloc[:, :-1].values)
    yMat = np.asmatrix(dataSet.iloc[:, -1].values).T
    xTx = xMat.T * xMat
    # Test the rank rather than ``det == 0``: an exact floating-point
    # comparison misses numerically singular matrices whose determinant is
    # merely tiny, and inverting those yields meaningless weights.
    if np.linalg.matrix_rank(xTx) < xTx.shape[0]:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws
#這裡需要注意的是，當使用矩陣分解來求解多元線性回歸時，必須添加一列全為1的列，用於表征線性方程截距b。

# Load the sample data (tab-delimited, no header row) and fit the weights.
ex0 = pd.read_csv('ex0.txt', sep='\t', header=None)

ws = standRegres(ex0)
ws
# The result holds one weight per feature column. The first column of the
# data set is all ones, so the first component is the intercept.

matrix([[3.00774324],
        [1.69532264]])

# Visualize the fit: raw samples as dots, model predictions as a line.
features = ex0.iloc[:, :-1].values
# ws is presumably the np.matrix returned by standRegres, so * here is a
# matrix product rather than an element-wise one.
yhat = features * ws
x_values = ex0.iloc[:, 1]
plt.plot(x_values, ex0.iloc[:, 2], 'o')
plt.plot(x_values, yhat)

[<matplotlib.lines.Line2D at 0x215fd3146a0>]

模型評價指標：殘差平方和SSE

# Residual sum of squares between the predictions and the true labels.
yhat = yhat.flatten()
y = ex0.iloc[:, -1].values
# Element-wise square; yhat is presumably still an np.matrix here, for which
# the ** operator would mean matrix power instead.
rss = np.square(yhat - y).sum()
rss

1.3552490816814904

#將SSE做一個封裝
def sseCal(dataSet, regres):
    """Return the sum of squared errors (SSE) of a fitted linear model.

    Args:
        dataSet: pandas DataFrame whose last column holds the label values
            and whose remaining columns hold the features.
        regres: callable taking ``dataSet`` and returning the weight vector,
            either as an ``np.matrix`` / ndarray column of shape
            ``(n_features, 1)`` or as a 1-D ndarray of length ``n_features``.

    Returns:
        The scalar ``sum((X @ w - y) ** 2)``.

    Raises:
        TypeError: if ``regres`` returns ``None`` (for example because the
            design matrix was singular), so no predictions can be made.
    """
    y = dataSet.iloc[:, -1].values
    ws = regres(dataSet)
    if ws is None:
        raise TypeError(
            "regres returned None (singular design matrix?); cannot compute SSE"
        )
    X = dataSet.iloc[:, :-1].values
    # Work on plain ndarrays with an explicit matrix product so the result
    # does not depend on np.matrix overloading the * operator.
    weights = np.asarray(ws).reshape(-1)
    yhat = X @ weights
    rss = np.power(yhat - y, 2).sum()
    return rss

# SSE of the least-squares fit on ex0, via the wrapper defined above.
sseCal(ex0,standRegres)

1.3552490816814904

模型評價指標：決定系數R_square。決定系數分布在[0, 1]區間內，且越趨近於1，表明擬合程度越好。

# Coefficient of determination: R^2 = 1 - SSE / SST, where SST is the total
# sum of squares of the labels around their mean.
y = ex0.iloc[:, -1].values
sst = np.square(y - y.mean()).sum()
sse = sseCal(ex0, standRegres)
1 - sse / sst

0.9731300889856916

#封裝 R²（決定系數）
def rSquare(dataSet, regres):
    """Return the coefficient of determination of a fitted linear model.

    Args:
        dataSet: pandas DataFrame whose last column holds the label values.
        regres: regression callable, passed through to ``sseCal``.

    Returns:
        ``1 - SSE / SST``, where SSE comes from ``sseCal`` and SST is the
        sum of squared deviations of the labels from their mean.
    """
    residual_ss = sseCal(dataSet, regres)
    labels = dataSet.iloc[:, -1].values
    deviations = labels - labels.mean()
    total_ss = np.power(deviations, 2).sum()
    return 1 - residual_ss / total_ss

# R^2 of the least-squares fit on ex0, via the wrapper defined above.
rSquare(ex0, standRegres)

0.9731300889856916

線性回歸的Scikit-Learn實現

from sklearn import linear_model

# Same feature / label split as the hand-written version: every column but
# the last is a feature, the last column is the label.
X_train = ex0.iloc[:, :-1].values
y_train = ex0.iloc[:, -1].values
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

reg.coef_  # returns the coefficients
Out[61]:
array([0.        , 1.69532264])
In [62]:
reg.intercept_  # returns the intercept
Out[62]:
3.0077432426975905

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Predict with the fitted scikit-learn model, then score the predictions
# against the true labels using mean squared error.
yhat = reg.predict(ex0.iloc[:, :-1])
mean_squared_error(y, yhat)

0.006776245408407454

# MSE multiplied by the number of rows gives back the sum of squared errors.
mean_squared_error(y, yhat)*ex0.shape[0]

1.3552490816814908

# Coefficient of determination (R^2) as computed by scikit-learn.
r2_score(y, yhat)

0.9731300889856916

	0	1	2
0	1.0	0.067732	3.176513
1	1.0	0.427810	3.816464
2	1.0	0.995731	4.550095
3	1.0	0.738336	4.256571
4	1.0	0.981083	4.560815

A--多元線性回歸的python實現

假設輸入數據為DataFrame格式，最后一列為標簽值，在此基礎上編寫線性回歸自定義函數(最小二乘）

模型評價指標殘差平⽅和SSE

模型評價指標決定系數R_square,決定系數分布在[0, 1]區間內，且越趨近於1，表明擬合程度越好。

線性回歸的Scikit-Learn實現

免責聲明！

A--多元線性回歸的python實現

假設輸入數據為DataFrame格式，最后一列為標簽值，在此基礎上編寫線性回歸自定義函數(最小二乘）

模型評價指標殘差平⽅和SSE

模型評價指標 決定系數R_square,決定系數分布在[0, 1]區間內，且越趨近於1，表明擬合程度越好。

線性回歸的Scikit-Learn實現

免責聲明！

模型評價指標決定系數R_square,決定系數分布在[0, 1]區間內，且越趨近於1，表明擬合程度越好。