# Custom summary function for caret: returns RMSE, R-squared (computed by
# regressing predictions on observations), MAE, MSE and RSS for each resample
custom_regression_metrics <- function(data, lev = NULL, model = NULL) {
  c(RMSE = sqrt(mean((data$obs - data$pred)^2)),
    Rsquared = summary(lm(pred ~ obs, data))$r.squared,
    MAE = mean(abs(data$obs - data$pred)),
    MSE = mean((data$obs - data$pred)^2),
    RSS = sum((data$obs - data$pred)^2))
}
ctrl <- trainControl(method = "cv", number = 10)

# 10-fold cross-validation RMSE for polynomial degrees 1 to 10
CV_RMSE <- c()
set.seed(123)
for (i in 1:10) {
  model_temp <- train(y = Wage$wage,
                      x = poly(Wage$age, i, raw = TRUE, simple = TRUE),
                      method = "lm",
                      metric = "RMSE",
                      trControl = ctrl)
  CV_RMSE[i] <- model_temp$results$RMSE
}
data.frame(degree = 1:10, CV_RMSE = CV_RMSE) %>%
  mutate(min_CV_RMSE = as.numeric(min(CV_RMSE) == CV_RMSE)) %>%
  ggplot(aes(x = degree, y = CV_RMSE)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_CV_RMSE))) +
  scale_x_continuous(breaks = seq(1, 10), minor_breaks = NULL) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme(legend.position = "none") +
  labs(title = "Wage Dataset - Polynomial Regression",
       subtitle = "Selecting the 'age' polynomial degree with cross-validation",
       x = "Degree",
       y = "CV RMSE")
Using ANOVA as opposed to cross-validation could result in selecting either the third-order or the fourth-order polynomial of age.
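For reference, that ANOVA comparison amounts to sequential F-tests between nested polynomial fits; a minimal sketch (the choice of degrees 1 to 5 is mine):

# Sequential F-tests between nested polynomial fits of increasing degree
fits <- lapply(1:5, function(d) lm(wage ~ poly(age, d), data = Wage))
do.call(anova, fits)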
I plot the third-order polynomial fit, which certainly looks reasonable:
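The code producing that figure is not shown above; a minimal sketch, reusing the plotting pattern from the step-function figure below:

ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = "y ~ poly(x, 3)") +
  labs(title = "Wage Dataset - Polynomial Regression",
       subtitle = "Predicting 'wage' with a third-order polynomial of 'age'")

Turning to step functions, I next plot the fit that cuts age into 12 intervals: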
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = "y ~ cut(x, 12)") +
  labs(title = "Wage Dataset - Step Function",
       subtitle = "Predicting 'wage' with a 12-interval step function of 'age'")
If we applied the 1 s.e. rule to the cross-validation results, we would end up choosing the step function with 7 intervals, which is probably what I would select in practice.
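The cross-validation behind that statement is not reproduced above; a minimal sketch mirroring the polynomial loop (the 2-15 interval range and the use of the formula interface are my assumptions):

# Hypothetical re-creation: 10-fold CV RMSE for step functions of 'age'
# with 2 to 15 intervals (range chosen for illustration)
CV_RMSE_step <- c()
set.seed(123)
for (i in 2:15) {
  Wage_step <- data.frame(wage = Wage$wage, age_cut = cut(Wage$age, i))
  model_temp <- train(wage ~ age_cut,
                      data = Wage_step,
                      method = "lm",
                      metric = "RMSE",
                      trControl = ctrl)
  CV_RMSE_step[i - 1] <- model_temp$results$RMSE
}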
set.seed(1)
# 70/30 split of the College dataset into train and test sets
train_index <- sample(1:nrow(College), round(nrow(College) * 0.7))
train <- College[train_index, ]
nrow(train) / nrow(College)
## [1] 0.7001287
test <- College[-train_index, ]
nrow(test) / nrow(College)
## [1] 0.2998713
# 10-fold CV with the custom metrics; the one-standard-error rule picks the final model
ctrl <- trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 1,
                     summaryFunction = custom_regression_metrics,
                     selectionFunction = "oneSE")
set.seed(11)
model_forward <- train(Outstate ~ .,
                       data = train,
                       method = "leapForward",
                       metric = "MSE",
                       maximize = FALSE,
                       trControl = ctrl,
                       tuneGrid = data.frame(nvmax = 1:17))
model_forward
## Linear Regression with Forward Selection
##
## 544 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 489, 490, 492, 491, 489, 488, ...
## Resampling results across tuning parameters:
##
## nvmax RMSE Rsquared MAE MSE RSS
## 1 3116.676 0.5047899 2480.277 9924143 537869955
## 2 2670.511 0.6141343 2018.854 7253757 393129955
## 3 2394.728 0.6806436 1847.491 5787625 314365711
## 4 2223.172 0.7179308 1740.158 4974734 270218937
## 5 2160.560 0.7329846 1701.678 4701270 255744224
## 6 2117.879 0.7416528 1665.929 4519142 245832663
## 7 2123.836 0.7408057 1670.549 4548786 247459481
## 8 2119.735 0.7406173 1674.291 4533400 246643018
## 9 2155.722 0.7293720 1691.149 4710988 256372597
## 10 2157.944 0.7291680 1680.271 4726621 257176853
## 11 2113.974 0.7416482 1663.049 4513600 245493174
## 12 2096.665 0.7459114 1653.184 4443008 241535828
## 13 2086.080 0.7482922 1647.345 4399171 239126796
## 14 2082.560 0.7487630 1644.411 4383112 238253554
## 15 2090.142 0.7471492 1651.776 4414671 239941188
## 16 2088.295 0.7476358 1651.515 4404923 239413821
## 17 2088.644 0.7475671 1651.796 4406352 239491261
##
## MSE was used to select the optimal model using the one SE rule.
## The final value used for the model was nvmax = 6.
# Compute the 1 s.e. band around each CV MSE estimate and flag the minimum
model_forward_SE <- model_forward$results %>%
  mutate(MSE_SE_low = MSE - (MSESD / sqrt(model_forward$control$number * model_forward$control$repeats)),
         MSE_SE_high = MSE + (MSESD / sqrt(model_forward$control$number * model_forward$control$repeats)),
         min_CV_MSE = as.numeric(min(MSE) == MSE))
# 1 s.e. threshold: upper bound of the band around the minimum-MSE model.
# Extracting it outside aes() avoids ggplot2's "use of `$` is discouraged" warnings.
MSE_1se_threshold <- model_forward_SE$MSE_SE_high[model_forward_SE$min_CV_MSE == 1]

ggplot(model_forward_SE, aes(x = nvmax, y = MSE)) +
  geom_line(col = "grey55") +
  geom_hline(aes(yintercept = MSE_1se_threshold, col = "1 s.e. model")) +
  geom_vline(aes(xintercept = model_forward$bestTune$nvmax, col = "mediumseagreen")) +
  geom_errorbar(aes(ymin = MSE_SE_low, ymax = MSE_SE_high),
                colour = "grey55",
                width = 0.4) +
  geom_point(size = 2, aes(col = factor(min_CV_MSE))) +
  scale_color_manual(values = c("deepskyblue3", "green", "mediumseagreen", "mediumseagreen")) +
  scale_x_continuous(breaks = 1:17, minor_breaks = NULL) +
  theme(legend.position = "none") +
  labs(title = "College Dataset - Linear Model",
       subtitle = "Selecting number of predictors with forward selection (cross-validation MSE, 1 s.e. rule)",
       x = "Number of Predictors",
       y = "Cross-Validation MSE")
coef(model_forward$finalModel, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3764.3413062 2793.2069104 0.9703210 38.2157650 59.0358377
## Expend Grad.Rate
## 0.2031532 28.6548780
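The six predictors chosen by forward selection are the ones I carry into the GAM below, using smoothing splines for the quantitative variables: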
model_gam_1 <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                     s(Expend) + s(Grad.Rate), data = train)
par(mfrow = c(2, 3))
plot(model_gam_1, se = TRUE, col = "blue")
Interpreting these plots using the logic outlined on page 284, there is clear evidence of a non-linear effect of Expend, whereas a linear relationship seems more reasonable for perc.alumni. For the remaining terms there is moderate evidence of non-linearity, though it is harder to judge by eye.
# Test MSE of the GAM
mean((predict(model_gam_1, newdata = test) - test$Outstate)^2)
## [1] 3170227
# Test R-squared of the GAM
test_TSS <- sum((test$Outstate - mean(test$Outstate))^2)
test_RSS <- sum((predict(model_gam_1, newdata = test) - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7712156
# Test MSE and R-squared of the forward-selection linear model, for comparison
mean((predict(model_forward, newdata = test) - test$Outstate)^2)
## [1] 3616631
test_RSS <- sum((predict(model_forward, newdata = test) - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7390001
The GAM clearly has better test performance (test MSE of 3,170,227 vs. 3,616,631; test R-squared of 0.771 vs. 0.739), so it would seem that the non-linear relationships it found (displayed in the previous plots) were not spurious and generalized well to unseen data.
summary(model_gam_1)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate), data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7511.91 -1172.59 38.71 1267.44 7827.85
##
## (Dispersion Parameter for gaussian family taken to be 3617918)
##
## Null Deviance: 9263945675 on 543 degrees of freedom
## Residual Deviance: 1888551843 on 521.9997 degrees of freedom
## AIC: 9782.515
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2402224634 2402224634 663.980 < 2.2e-16 ***
## s(Room.Board) 1 1721929895 1721929895 475.945 < 2.2e-16 ***
## s(PhD) 1 620590394 620590394 171.532 < 2.2e-16 ***
## s(perc.alumni) 1 456743095 456743095 126.245 < 2.2e-16 ***
## s(Expend) 1 746743434 746743434 206.401 < 2.2e-16 ***
## s(Grad.Rate) 1 99786036 99786036 27.581 2.197e-07 ***
## Residuals 522 1888551843 3617918
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 2.680 0.04629 *
## s(PhD) 3 1.179 0.31718
## s(perc.alumni) 3 0.937 0.42233
## s(Expend) 3 32.503 < 2e-16 ***
## s(Grad.Rate) 3 1.995 0.11380
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
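Based on the ANOVA for nonparametric effects, only Room.Board and Expend show significant evidence of non-linearity, so I refit the GAM with the remaining quantitative predictors entered as linear terms: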
model_gam_2 <- gam(Outstate ~ Private + s(Room.Board) + PhD + perc.alumni + s(Expend) + Grad.Rate, data = train)
mean((predict(model_gam_2, newdata = test) - test$Outstate)^2)
## [1] 3226447
test_RSS <- sum((predict(model_gam_2, newdata = test) - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7671584
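Finally, I also fit a version with Expend entered linearly, to see how much the non-linear Expend term contributes: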
model_gam_3 <- gam(Outstate ~ Private + s(Room.Board) + PhD + perc.alumni + Expend + Grad.Rate, data = train)
mean((predict(model_gam_3, newdata = test) - test$Outstate)^2)
## [1] 3610592
test_RSS <- sum((predict(model_gam_3, newdata = test) - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.739436
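Dropping the smooths for PhD, perc.alumni and Grad.Rate barely changes the test performance (R-squared of 0.767 vs. 0.771 for the full GAM), whereas additionally forcing Expend to be linear brings performance back down to essentially the linear model's level (0.739). This is consistent with the ANOVA for nonparametric effects above: Expend is the term that genuinely requires a non-linear fit.

The three nested GAMs can also be compared with a formal F-test; a minimal sketch, assuming the anova method from the gam package:

anova(model_gam_3, model_gam_2, model_gam_1, test = "F")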