custom_regression_metrics <- function (data, lev = NULL, model = NULL) {
  c(RMSE = sqrt(mean((data$obs-data$pred)^2)),
    Rsquared = summary(lm(pred ~ obs, data))$r.squared,
    MAE = mean(abs(data$obs-data$pred)), 
    MSE = mean((data$obs-data$pred)^2),
    RSS = sum((data$obs-data$pred)^2))
}
ctrl <- trainControl(method = "cv", number = 10)

CV_RMSE <- c()

set.seed(123)

for (i in 1:10) {
  model_temp <- train(y = Wage$wage,
                      x = poly(Wage$age, i, raw = T, simple = T),
                      method = "lm",
                      metric = "RMSE",
                      trControl = ctrl)
  CV_RMSE[i] <- model_temp$results$RMSE
}


data.frame(degree = 1:10, CV_RMSE = CV_RMSE) %>%
  mutate(min_CV_RMSE = as.numeric(min(CV_RMSE) == CV_RMSE)) %>%
  ggplot(aes(x = degree, y = CV_RMSE)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_CV_RMSE))) +
  scale_x_continuous(breaks = seq(1, 10), minor_breaks = NULL) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme(legend.position = "none") +
  labs(title = "Wage Dataset - Polynomial Regression",
       subtitle = "Selecting the 'age' polynomial degree with cross-validation",
       x = "Degree",
       y = "CV RMSE")

Using ANOVA as apposed to cross-validation could result in selecting either the third-order or fourth-order polynomials of age.

I plot the third-order polynomial fit, which certainly looks reasonable:

ggplot(Wage, aes(x = age, y = wage)) + 
  geom_point(alpha = 0.3) + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 12)") + 
  labs(title = "Wage Dataset - Step Function",
       subtitle = "Predicting 'wage' with a 12-interval step function of 'age'")

If we were using the 1 s.e. rule for cross-validation we would probably end up choosing the step function with 7 intervals, which is probably what I would select in practice.

Question 10

set.seed(1)
train_index <- sample(1:nrow(College), round(nrow(College) * 0.7))

train <- College[train_index, ]
nrow(train) / nrow(College)
## [1] 0.7001287
test <- College[-train_index, ]
nrow(test) / nrow(College)
## [1] 0.2998713
ctrl <- trainControl(method = "repeatedcv", 
                     number = 10, 
                     repeats = 1, 
                     summaryFunction = custom_regression_metrics,
                     selectionFunction = "oneSE")

set.seed(11)

model_forward <- train(Outstate ~ .,
                       data = train,
                       method = "leapForward",
                       metric = "MSE",
                       maximize = F,
                       trControl = ctrl,
                       tuneGrid = data.frame(nvmax = 1:17))

model_forward
## Linear Regression with Forward Selection 
## 
## 544 samples
##  17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 489, 490, 492, 491, 489, 488, ... 
## Resampling results across tuning parameters:
## 
##   nvmax  RMSE      Rsquared   MAE       MSE      RSS      
##    1     3116.676  0.5047899  2480.277  9924143  537869955
##    2     2670.511  0.6141343  2018.854  7253757  393129955
##    3     2394.728  0.6806436  1847.491  5787625  314365711
##    4     2223.172  0.7179308  1740.158  4974734  270218937
##    5     2160.560  0.7329846  1701.678  4701270  255744224
##    6     2117.879  0.7416528  1665.929  4519142  245832663
##    7     2123.836  0.7408057  1670.549  4548786  247459481
##    8     2119.735  0.7406173  1674.291  4533400  246643018
##    9     2155.722  0.7293720  1691.149  4710988  256372597
##   10     2157.944  0.7291680  1680.271  4726621  257176853
##   11     2113.974  0.7416482  1663.049  4513600  245493174
##   12     2096.665  0.7459114  1653.184  4443008  241535828
##   13     2086.080  0.7482922  1647.345  4399171  239126796
##   14     2082.560  0.7487630  1644.411  4383112  238253554
##   15     2090.142  0.7471492  1651.776  4414671  239941188
##   16     2088.295  0.7476358  1651.515  4404923  239413821
##   17     2088.644  0.7475671  1651.796  4406352  239491261
## 
## MSE was used to select the optimal model using  the one SE rule.
## The final value used for the model was nvmax = 6.
model_forward_SE <- model_forward$results %>%
  mutate(MSE_SE_low = MSE - (MSESD / sqrt(model_forward$control$number * model_forward$control$repeats)),
         MSE_SE_high = MSE + (MSESD / sqrt(model_forward$control$number * model_forward$control$repeats)), 
         min_CV_MSE = as.numeric(min(MSE) == MSE))

ggplot(model_forward_SE, aes(x = nvmax, y = MSE)) +
  geom_line(col = "grey55") +
  geom_hline(aes(yintercept = model_forward_SE$MSE_SE_high[model_forward_SE$min_CV_MSE == 1], col = "1 s.e. model")) +
  geom_vline(aes(xintercept = model_forward$bestTune$nvmax, col = "mediumseagreen")) +
  geom_errorbar(aes(ymin = MSE_SE_low, ymax = MSE_SE_high), 
                colour = "grey55",
                width = 0.4) +
  geom_point(size = 2, aes(col = factor(min_CV_MSE))) +
  scale_color_manual(values = c("deepskyblue3", "green", "mediumseagreen", "mediumseagreen")) +
  scale_x_continuous(breaks = 1:17, minor_breaks = NULL) +
  theme(legend.position = "none") +
  labs(title = "College Dataset - Linear Model",
       subtitle = "Selecting number of predictors with forward selection (cross-validation MSE, 1 s.e. rule)",
       x = "Number of Predictors", 
       y = "Cross-Validation MSE")
## Warning: Use of `model_forward_SE$MSE_SE_high` is discouraged. Use `MSE_SE_high`
## instead.
## Warning: Use of `model_forward_SE$min_CV_MSE` is discouraged. Use `min_CV_MSE`
## instead.

coef(model_forward$finalModel, id = 6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3764.3413062  2793.2069104     0.9703210    38.2157650    59.0358377 
##        Expend     Grad.Rate 
##     0.2031532    28.6548780
model_gam_1 <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + s(Expend) + s(Grad.Rate), data = train)

par(mfrow = c(2, 3))
plot(model_gam_1, se = T, col = "blue")

Interpreting these plots using the logic outlined on page 284, it seems that there is obvious evidence of a nonlinear effect of Expend, whereas a linear relationship seems more reasonable for perc.alumni. As for the other terms, it seems there is moderate evidence of non-linearity, but it is harder for me to say.

mean((predict(model_gam_1, newdata = test) - test$Outstate)^2)
## [1] 3170227
test_TSS <- sum((test$Outstate - mean(test$Outstate))^2)
test_RSS <- sum((predict(model_gam_1, newdata = test) - test$Outstate)^2)

1 - test_RSS/test_TSS
## [1] 0.7712156
mean((predict(model_forward, newdata = test) - test$Outstate)^2)
## [1] 3616631
test_RSS <- sum((predict(model_forward, newdata = test) - test$Outstate)^2)

1 - test_RSS/test_TSS
## [1] 0.7390001

The GAM model clearly has better test performance, so it would seem that the non-linear relationships it found (displayed in the previous plot) were not spurious and generalized well to unseen data.

summary(model_gam_1)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + 
##     s(Expend) + s(Grad.Rate), data = train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7511.91 -1172.59    38.71  1267.44  7827.85 
## 
## (Dispersion Parameter for gaussian family taken to be 3617918)
## 
##     Null Deviance: 9263945675 on 543 degrees of freedom
## Residual Deviance: 1888551843 on 521.9997 degrees of freedom
## AIC: 9782.515 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                 Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private          1 2402224634 2402224634 663.980 < 2.2e-16 ***
## s(Room.Board)    1 1721929895 1721929895 475.945 < 2.2e-16 ***
## s(PhD)           1  620590394  620590394 171.532 < 2.2e-16 ***
## s(perc.alumni)   1  456743095  456743095 126.245 < 2.2e-16 ***
## s(Expend)        1  746743434  746743434 206.401 < 2.2e-16 ***
## s(Grad.Rate)     1   99786036   99786036  27.581 2.197e-07 ***
## Residuals      522 1888551843    3617918                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df Npar F   Pr(F)    
## (Intercept)                              
## Private                                  
## s(Room.Board)        3  2.680 0.04629 *  
## s(PhD)               3  1.179 0.31718    
## s(perc.alumni)       3  0.937 0.42233    
## s(Expend)            3 32.503 < 2e-16 ***
## s(Grad.Rate)         3  1.995 0.11380    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model_gam_2 <- gam(Outstate ~ Private + s(Room.Board) + PhD + perc.alumni + s(Expend) + Grad.Rate, data = train)

mean((predict(model_gam_2, newdata = test) - test$Outstate)^2)
## [1] 3226447
test_RSS <- sum((predict(model_gam_2, newdata = test) - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7671584
model_gam_3 <- gam(Outstate ~ Private + s(Room.Board) + PhD + perc.alumni + Expend + Grad.Rate, data = train)

mean((predict(model_gam_3, newdata = test) - test$Outstate)^2)
## [1] 3610592
test_RSS <- sum((predict(model_gam_3, newdata = test) - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.739436