多元回歸
# 導⼊模塊
from sklearn import model_selection
# Load the raw data set from the Excel workbook.
Profit = pd.read_excel(r'Predict to Profit.xlsx')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
train, test = model_selection.train_test_split(
    Profit, test_size=0.2, random_state=1234
)

# Fit an OLS model on the training rows. C(State) tells the formula parser to
# treat State as a categorical predictor.
formula = 'Profit ~ RD_Spend+Administration+Marketing_Spend+C(State)'
model = sm.formula.ols(formula, data=train).fit()
# The fitted coefficients are available as model.params if needed.

# Predict on the test rows using only the predictors (response column removed).
test_X = test.drop(columns='Profit')
pred = model.predict(test_X)

# Show predicted and actual profit side by side.
comparison = pd.DataFrame({'Prediction': pred, 'Real': test.Profit})
print('對⽐預測值和實際值的差異:\n', comparison)
由於地區自變量(State)轉換成的啞變量之間存在完全多重共線性,所以公式中的 C(State) 會自動刪除其中一個類別作為參照組;當然也可以像下面這樣自行指定參照組(此處為 New York)。
# Build dummy (indicator) columns from the State variable.
# dtype=int forces 0/1 integer columns: pandas >= 2.0 returns bool columns by
# default, and the formula parser treats bool columns as categorical, which
# would rename the terms to e.g. `Florida[T.True]`.
dummies = pd.get_dummies(Profit.State, dtype=int)

# Append the dummies to the original data, then drop the State column (now
# encoded by the dummies) and the 'New York' dummy, which serves as the
# reference group.
Profit_New = pd.concat([Profit, dummies], axis=1).drop(
    columns=['State', 'New York']
)

# Split the encoded data set, using the same test fraction and seed as the
# earlier split of the raw data.
train, test = model_selection.train_test_split(
    Profit_New, test_size=0.2, random_state=1234
)

# Fit the model with the remaining state dummies as explicit predictors.
model2 = sm.formula.ols(
    'Profit~RD_Spend+Administration+Marketing_Spend+Florida+California',
    data=train,
).fit()
print('模型的偏回歸系數分別為:\n', model2.params)