【R語言學習筆記】8. 非線性模型及交叉檢驗

本文轉載自查看原文 2020-01-30 12:41 974 交叉檢驗/ R語言/ 非線性模型

1. 目的：通過案例介紹R語言實現交叉檢驗的方法，構建非線性回歸模型，並比較不同模型的准確性。

2. 數據來源：Datacamp

https://assets.datacamp.com/production/repositories/894/datasets/6f144237ef9d7da94b2c84aa8eccc519bae4b300/houseprice.rds

3. 數據介紹

數據中含有40個觀測值以及兩個變量：房子大小以及房子價格。本文希望通過利用房子大小信息來預測房價信息。

houseprice <- readRDS(gzcon(url("https://assets.datacamp.com/production/repositories/894/datasets/6f144237ef9d7da94b2c84aa8eccc519bae4b300/houseprice.rds")))
head(houseprice)
summary(houseprice)

4. 應用

4.1 繪制直方圖及箱線圖查看變量分布及異常值

# explore the data
library(ggplot2)

# size
ggplot(houseprice, aes(y = size)) + geom_boxplot(outlier.colour = 'darkblue', outlier.shape = 5, outlier.size = 3)
ggplot(houseprice, aes(x = size)) + geom_histogram(aes(y = ..density..), binwidth = 10)

# price
ggplot(houseprice, aes(y = price)) + geom_boxplot(outlier.colour = 'darkblue', outlier.shape = 5, outlier.size = 3)
diff(range(houseprice$price))
ggplot(houseprice, aes(x = price)) + geom_histogram(aes(y = ..density..), binwidth = 50)

# realtionship b/w size and price
ggplot(houseprice, aes(x = size, y = price)) + geom_point() # non-linear relationship

4.2 構建模型

Model1: Linear Regression Model

# Fit a model of price as a linear function of size
model_lin <- lm(price ~ size, houseprice)
summary(model_lin)

Model2: Quadratic Model

# Fit a model of price as a function of squared size 
model_sqr <- lm(price ~ I(size^2), houseprice)
summary(model_sqr)

Model3: Generalized Additive Model

library(mgcv)
model_gam <- gam(price ~ s(size), data = houseprice, family = 'gaussian')
summary(model_gam)

plot(model_gam)

Model comparison

library(dplyr)
library(tidyr)
houseprice %>% 
  mutate(pred_lin = predict(model_lin),       
         pred_sqr = predict(model_sqr),     
         pred_gam = predict(model_gam)) %>%    
  gather(key = modeltype, value = pred, pred_lin, pred_sqr, pred_gam) %>%
  ggplot(aes(x = size)) + 
  geom_point(aes(y = price)) +                   # actual prices
  geom_line(aes(y = pred, color = modeltype)) + # the predictions
  scale_color_brewer(palette = "Dark2")

4.3 樣本內模型准確性比較(In-Sample RMSE)

# Compare in-sample RMSE
houseprice %>% 
  mutate(pred_lin = predict(model_lin),       
         pred_sqr = predict(model_sqr),     
         pred_gam = predict(model_gam)) %>%    
  gather(key = modeltype, value = pred, pred_lin, pred_sqr, pred_gam) %>%
  mutate(residual = price - pred) %>%
  group_by(modeltype) %>%
  summarize(rmse = sqrt(mean(residual^2)))

4.4 樣本外模型准確性比較(Out-of-Sample RMSE + Cross-Validation)

Cross-Validation Method1

# Create a splitting plan for 3-fold cross validation
library(vtreat)
set.seed(34245)  # set the seed for reproducibility
splitPlan <- kWayCrossValidation(nrow(houseprice), 3, NULL, NULL)

# get cross-validation predictions for price ~ size
houseprice$pred_lin2 <- 0  # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_lin2 <- lm(price ~ size, data = houseprice[split$train,])
  houseprice$pred_lin2[split$app] <- predict(model_lin2, newdata = houseprice[split$app,])
}

# Get cross-validation predictions for price as a function of size^2 
houseprice$pred_sqr2 <- 0 # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_sqr2 <- lm(price ~ I(size^2), data = houseprice[split$train, ])
  houseprice$pred_sqr2[split$app] <- predict(model_sqr2, newdata = houseprice[split$app, ])
}

# Get cross-valalidation predictions for price as a function of GAM 
houseprice$pred_gam2 <- 0 # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_gam2 <- gam(price ~ s(size), data = houseprice[split$train, ])
  houseprice$pred_gam2[split$app] <- predict(model_gam2, newdata = houseprice[split$app, ])
}

Cross-Validation Method2

# alternative for cross validation
library(caret)
myControl <- trainControl(method = "cv", 
                          number = 3,
                          verboseIter = T)
model_lin3 <- train(
  price ~ size, 
  houseprice,
  method = "lm",
  trControl = myControl)

model_sqr3 <- train(
  price ~ I(size^2), 
  houseprice,
  method = "lm",
  trControl = myControl)

model_gam3 <- train(
  price ~ size, 
  houseprice,
  method = "gam",
  trControl = myControl)

model_lin3$results$RMSE
model_sqr3$results$RMSE
model_gam3$results$RMSE

Out-of-Sample RMSE

# Gather the predictions and calculate RMSE
houseprice %>%
  gather(key = modeltype, value = pred2, pred_lin2, pred_sqr2, pred_gam2) %>%
  mutate(residuals = price - pred2) %>% 
  group_by(modeltype) %>% # group by modeltype
  summarise(rmse = sqrt(mean(residuals^2))

Compare the predictions against actual prices on the data

houseprice %>%
  gather(key = modeltype, value = pred2, pred_lin2, pred_sqr2, pred_gam2) %>%
  ggplot(aes(x = size)) +                          # the column for the x axis
  geom_point(aes(y = price)) +                     # the y-column for the scatterplot
  geom_point(aes(y = pred2, color = modeltype)) +  # the y-column for the point-and-line plot
  geom_line(aes(y = pred2, color = modeltype, linetype = modeltype)) + # the y-column for the point-and-line plot
  scale_color_brewer(palette = "Dark2")

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 【R語言學習筆記】9. 線性回歸及其假設檢驗 R語言學習筆記（二） R語言學習筆記（三） R語言學習筆記（四） R語言學習筆記（五） R語言學習筆記之九 R語言學習筆記 R語言學習筆記之四 R語言學習筆記之五【R語言學習筆記】11. 線性回歸中的交互效應(interaction)