1. 目的:通過案例介紹R語言實現交叉檢驗的方法,構建非線性回歸模型,並比較不同模型的准確性。
2. 數據來源:Datacamp
https://assets.datacamp.com/production/repositories/894/datasets/6f144237ef9d7da94b2c84aa8eccc519bae4b300/houseprice.rds
3. 數據介紹
數據中含有40個觀測值以及兩個變量:房子大小以及房子價格。本文希望通過利用房子大小信息來預測房價信息。
houseprice <- readRDS(gzcon(url("https://assets.datacamp.com/production/repositories/894/datasets/6f144237ef9d7da94b2c84aa8eccc519bae4b300/houseprice.rds")))
head(houseprice)
summary(houseprice)

4. 應用
4.1 繪制直方圖及箱線圖查看變量分布及異常值
# explore the data library(ggplot2) # size ggplot(houseprice, aes(y = size)) + geom_boxplot(outlier.colour = 'darkblue', outlier.shape = 5, outlier.size = 3) ggplot(houseprice, aes(x = size)) + geom_histogram(aes(y = ..density..), binwidth = 10)


# price ggplot(houseprice, aes(y = price)) + geom_boxplot(outlier.colour = 'darkblue', outlier.shape = 5, outlier.size = 3) diff(range(houseprice$price)) ggplot(houseprice, aes(x = price)) + geom_histogram(aes(y = ..density..), binwidth = 50)


# realtionship b/w size and price ggplot(houseprice, aes(x = size, y = price)) + geom_point() # non-linear relationship

4.2 構建模型
Model1: Linear Regression Model
# Fit a model of price as a linear function of size model_lin <- lm(price ~ size, houseprice)
summary(model_lin)

Model2: Quadratic Model
# Fit a model of price as a function of squared size model_sqr <- lm(price ~ I(size^2), houseprice) summary(model_sqr)

Model3: Generalized Additive Model
library(mgcv) model_gam <- gam(price ~ s(size), data = houseprice, family = 'gaussian') summary(model_gam)

plot(model_gam)

Model comparison
library(dplyr)
library(tidyr)
houseprice %>%
mutate(pred_lin = predict(model_lin),
pred_sqr = predict(model_sqr),
pred_gam = predict(model_gam)) %>%
gather(key = modeltype, value = pred, pred_lin, pred_sqr, pred_gam) %>%
ggplot(aes(x = size)) +
geom_point(aes(y = price)) + # actual prices
geom_line(aes(y = pred, color = modeltype)) + # the predictions
scale_color_brewer(palette = "Dark2")

4.3 樣本內模型准確性比較(In-Sample RMSE)
# Compare in-sample RMSE
houseprice %>%
mutate(pred_lin = predict(model_lin),
pred_sqr = predict(model_sqr),
pred_gam = predict(model_gam)) %>%
gather(key = modeltype, value = pred, pred_lin, pred_sqr, pred_gam) %>%
mutate(residual = price - pred) %>%
group_by(modeltype) %>%
summarize(rmse = sqrt(mean(residual^2)))

4.4 樣本外模型准確性比較(Out-of-Sample RMSE + Cross-Validation)
Cross-Validation Method1
# Create a splitting plan for 3-fold cross validation
library(vtreat)
set.seed(34245) # set the seed for reproducibility
splitPlan <- kWayCrossValidation(nrow(houseprice), 3, NULL, NULL)
# get cross-validation predictions for price ~ size
houseprice$pred_lin2 <- 0 # initialize the prediction vector
for(i in 1:3) {
split <- splitPlan[[i]]
model_lin2 <- lm(price ~ size, data = houseprice[split$train,])
houseprice$pred_lin2[split$app] <- predict(model_lin2, newdata = houseprice[split$app,])
}
# Get cross-validation predictions for price as a function of size^2
houseprice$pred_sqr2 <- 0 # initialize the prediction vector
for(i in 1:3) {
split <- splitPlan[[i]]
model_sqr2 <- lm(price ~ I(size^2), data = houseprice[split$train, ])
houseprice$pred_sqr2[split$app] <- predict(model_sqr2, newdata = houseprice[split$app, ])
}
# Get cross-valalidation predictions for price as a function of GAM
houseprice$pred_gam2 <- 0 # initialize the prediction vector
for(i in 1:3) {
split <- splitPlan[[i]]
model_gam2 <- gam(price ~ s(size), data = houseprice[split$train, ])
houseprice$pred_gam2[split$app] <- predict(model_gam2, newdata = houseprice[split$app, ])
}
Cross-Validation Method2
# alternative for cross validation
library(caret)
myControl <- trainControl(method = "cv",
number = 3,
verboseIter = T)
model_lin3 <- train(
price ~ size,
houseprice,
method = "lm",
trControl = myControl)
model_sqr3 <- train(
price ~ I(size^2),
houseprice,
method = "lm",
trControl = myControl)
model_gam3 <- train(
price ~ size,
houseprice,
method = "gam",
trControl = myControl)
model_lin3$results$RMSE
model_sqr3$results$RMSE
model_gam3$results$RMSE
Out-of-Sample RMSE
# Gather the predictions and calculate RMSE houseprice %>% gather(key = modeltype, value = pred2, pred_lin2, pred_sqr2, pred_gam2) %>% mutate(residuals = price - pred2) %>% group_by(modeltype) %>% # group by modeltype summarise(rmse = sqrt(mean(residuals^2))

Compare the predictions against actual prices on the data
houseprice %>% gather(key = modeltype, value = pred2, pred_lin2, pred_sqr2, pred_gam2) %>% ggplot(aes(x = size)) + # the column for the x axis geom_point(aes(y = price)) + # the y-column for the scatterplot geom_point(aes(y = pred2, color = modeltype)) + # the y-column for the point-and-line plot geom_line(aes(y = pred2, color = modeltype, linetype = modeltype)) + # the y-column for the point-and-line plot scale_color_brewer(palette = "Dark2")

