Question 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

a: Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

#performing K-fold CV with K = 10
rm(list = ls())
set.seed(9)
library(ISLR)
library(boot)
cv.MSE <- rep(NA, 15)  # storage for the CV error of each polynomial degree
for (i in 1:15) {
  glm.fit <-  glm(wage ~ poly(age, i), data = Wage)
  cv.MSE[i] <-  cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
cv.MSE
##  [1] 1677.010 1600.554 1594.409 1594.565 1594.294 1594.970 1591.626 1595.563
##  [9] 1595.249 1593.224 1593.898 1598.134 1598.566 1597.780 1598.661
#plotting
plot( x = 1:15, y = cv.MSE, xlab = "degrees", ylab = "CV error", 
      type = "b", pch = 19, lwd = 2, bty = "n", 
      ylim = c(min(cv.MSE) - sd(cv.MSE), max(cv.MSE) + sd(cv.MSE)))

# horizontal line
abline(h = min(cv.MSE) + sd(cv.MSE) , lty = "dotted")

# where is the minimum
points(x = which.min(cv.MSE), y = min(cv.MSE), col = "red", pch = "X", cex = 1.5)

Comment: Cross-validation selects a degree-7 polynomial (the minimum CV error), although the errors for degrees 2 through 7 are nearly identical; the dotted line marks one standard deviation above the minimum.
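
To make the selection explicit, a small illustrative sketch (d.min and d.1se are just ad-hoc names) can report both the minimising degree and the smallest degree whose CV error lies below the dotted line:

# degree with the lowest CV error, and the smallest degree within one SD of it
d.min <- which.min(cv.MSE)
d.1se <- min(which(cv.MSE <= min(cv.MSE) + sd(cv.MSE)))
c(minimum = d.min, within.one.sd = d.1se)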

# container for the models we will fit
models <- vector("list", length(cv.MSE))
for( a in 1:length(cv.MSE)){
  models[[a]] <- glm(wage ~ poly(age, a), data = Wage)
}

# f-test
anova(models[[1]], models[[2]], models[[3]], models[[4]], models[[5]], models[[6]],
      models[[7]], models[[8]], models[[9]], models[[10]], models[[11]], models[[12]],
      models[[13]], models[[14]], models[[15]], test = "F")
## Analysis of Deviance Table
## 
## Model  1: wage ~ poly(age, a)
## Model  2: wage ~ poly(age, a)
## Model  3: wage ~ poly(age, a)
## Model  4: wage ~ poly(age, a)
## Model  5: wage ~ poly(age, a)
## Model  6: wage ~ poly(age, a)
## Model  7: wage ~ poly(age, a)
## Model  8: wage ~ poly(age, a)
## Model  9: wage ~ poly(age, a)
## Model 10: wage ~ poly(age, a)
## Model 11: wage ~ poly(age, a)
## Model 12: wage ~ poly(age, a)
## Model 13: wage ~ poly(age, a)
## Model 14: wage ~ poly(age, a)
## Model 15: wage ~ poly(age, a)
##    Resid. Df Resid. Dev Df Deviance        F    Pr(>F)    
## 1       2998    5022216                                   
## 2       2997    4793430  1   228786 143.5637 < 2.2e-16 ***
## 3       2996    4777674  1    15756   9.8867  0.001681 ** 
## 4       2995    4771604  1     6070   3.8090  0.051070 .  
## 5       2994    4770322  1     1283   0.8048  0.369731    
## 6       2993    4766389  1     3932   2.4675  0.116329    
## 7       2992    4763834  1     2555   1.6034  0.205515    
## 8       2991    4763707  1      127   0.0795  0.778016    
## 9       2990    4756703  1     7004   4.3952  0.036124 *  
## 10      2989    4756701  1        3   0.0017  0.967552    
## 11      2988    4756597  1      103   0.0648  0.799144    
## 12      2987    4756591  1        7   0.0043  0.947923    
## 13      2986    4756401  1      190   0.1189  0.730224    
## 14      2985    4756158  1      243   0.1522  0.696488    
## 15      2984    4755364  1      795   0.4986  0.480151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
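
Because poly() builds orthogonal polynomials, the sequential F-tests above can also be read off the coefficient t-statistics of a single high-degree fit (each F-statistic equals the square of the corresponding t-statistic); the line below is only an illustrative cross-check.

# t-tests from one degree-15 fit reproduce the sequential ANOVA F-tests
round(coef(summary(lm(wage ~ poly(age, 15), data = Wage))), 4)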

Comment: The ANOVA F-tests show that the quadratic and cubic terms are clearly significant, the quartic term is borderline (p ≈ 0.05), and higher-order terms add little, so hypothesis testing favours a degree-3 (or at most degree-4) polynomial, whereas cross-validation nominally picked degree 7. Since the CV errors are essentially flat beyond the low degrees, a low-degree fit is reasonable; below, a quadratic fit is plotted with approximate 95% uncertainty bands.

# plotting the resulting polynomial fit
plot(wage ~ age, data = Wage, col = "darkgrey",  bty = "n")
agelims <-  range(Wage$age)
age.grid <-  seq(from = agelims[1], to = agelims[2])
lm.fit <-  lm(wage ~ poly(age, 2), data = Wage)
lm.pred <-  predict(lm.fit, data.frame(age = age.grid), se = TRUE)

# mean prediction
lines(x = age.grid , y = lm.pred$fit, col = "blue", lwd = 2)

# uncertainty bands
matlines( x = age.grid, y = cbind( lm.pred$fit + 2*lm.pred$se.fit, lm.pred$fit - 2*lm.pred$se.fit),
          lty = "dashed", col = "blue")


b: Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

cv.error <- rep(NA, 15)
# performing 10-fold cross-validation for each cut
for (i in 2:15) {
  Wage$age.cut <- cut(Wage$age, i)
  lm.fit <-  glm(wage ~ age.cut, data = Wage)
  cv.error[i] <-  cv.glm(Wage, lm.fit, K = 10)$delta[1]
}

# the first element of cv.error is NA because we started our loop at 2
plot(2:15, cv.error[-1], xlab = "Number of cuts", ylab = "CV error", 
     type = "b", pch = 19, lwd = 2, bty ="n")

# horizontal line for 1se to less complexity
abline(h = min(cv.error, na.rm = TRUE) + sd(cv.error, na.rm = TRUE) , lty = "dotted")

# highlight minimum
points( x = which.min(cv.error), y = min(cv.error, na.rm = TRUE), col = "red", pch = "X", cex = 1.5 )
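
As in part (a), a small illustrative sketch can report both the number of cuts that minimises the CV error and the smallest number of cuts within one standard deviation of that minimum:

# selected number of cuts: outright minimum and the one-SD alternative
cuts.min <- which.min(cv.error)
cuts.1se <- min(which(cv.error <= min(cv.error, na.rm = TRUE) +
                        sd(cv.error, na.rm = TRUE)))
c(minimum = cuts.min, within.one.sd = cuts.1se)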

# fit a step function with 4 cuts on the full data and plot it with uncertainty bands
lm.fit <- glm(wage ~ cut(age, 4), data = Wage)
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
lm.pred <- predict(lm.fit, data.frame(age = age.grid), se = TRUE)
plot(wage ~ age, data = Wage, col = "darkgrey", bty = "n")
lines(age.grid, lm.pred$fit, col = "red", lwd = 2)
matlines(age.grid, cbind( lm.pred$fit + 2* lm.pred$se.fit,
                          lm.pred$fit - 2* lm.pred$se.fit),
         col = "blue", lty ="dashed")


Question 10

This question relates to the College data set.

a: Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

set.seed(9)
library(leaps)  # provides regsubsets()
# random 70/30 train/test split
train <- sample(nrow(College), floor(nrow(College) * 0.7))
train_set <- College[train, ]
test_set <- College[-train, ]
forward_subset <- regsubsets(Outstate ~ ., data = train_set, nvmax = ncol(College)-1, method = "forward")
model_summary <- summary(forward_subset)

plot_metric <- function(metric, yaxis_label, reverse = FALSE) {
  plot(metric, xlab = "Number of Variables", ylab = yaxis_label, xaxt = "n", type = "l")
  axis(side = 1, at = 1:length(metric))
  
  if (reverse) {
    metric_1se <- max(metric) - (sd(metric) / sqrt(length(metric)))
    min_subset <- which(metric > metric_1se)
  } else {
    metric_1se <- min(metric) + (sd(metric) / sqrt(length(metric)))
    min_subset <- which(metric < metric_1se)
  }
  
  abline(h = metric_1se, col = "red", lty = 2)
  abline(v = min_subset[1], col = "green", lty = 2)
}

par(mfrow=c(1, 3))
  
plot_metric(model_summary$cp, "Cp")
plot_metric(model_summary$bic, "BIC")
plot_metric(model_summary$adjr2, "Adjusted R2", reverse = TRUE)

Comment: In each plot, the red dotted line marks the one-standard-error threshold (the best value of the criterion plus or minus one standard error) and the green dotted line marks the smallest subset size whose criterion falls within that threshold. By this rule, Cp and adjusted R2 point to a 7-variable subset and BIC to a 6-variable subset; we proceed with the 6-variable model below.
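
For comparison, the subset sizes that optimise each criterion outright (before applying the one-standard-error heuristic) can be read from the regsubsets summary; this is only an illustrative check:

# raw optima of each criterion (adjusted R^2 is maximised, the others minimised)
c(cp = which.min(model_summary$cp),
  bic = which.min(model_summary$bic),
  adjr2 = which.max(model_summary$adjr2))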

coef(forward_subset, 6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3769.0587788  2748.6944010     0.8999634    38.5143460    44.4889713 
##        Expend     Grad.Rate 
##     0.2543900    31.2043096

b: Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

library(gam)  # provides gam() and s()
gam_model <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(perc.alumni, df = 2) +
                   s(PhD, df = 2) + s(Expend, df = 2) + s(Grad.Rate, df = 2), data = train_set)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
par(mfrow=c(2, 3))
plot(gam_model, se=TRUE, col="red")


c: Evaluate the model obtained on the test set, and explain the results obtained.

gam.pred <- predict(gam_model, test_set)
gam.err <- mean((test_set$Outstate - gam.pred)^2)
gam.err
## [1] 3937782
lm.pred <- predict(lm(Outstate~Private+Room.Board+PhD+perc.alumni+Expend+Grad.Rate, data = train_set), test_set)
lm.err <- mean((test_set$Outstate - lm.pred)^2)
lm.err
## [1] 4200277

Comment: The ordinary linear model fitted with the same six predictors has a higher test MSE than the GAM, so allowing non-linear terms improves out-of-sample performance.
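
The size of the improvement follows directly from the two test errors reported above:

# relative reduction in test MSE of the GAM versus the linear model (about 6%)
(lm.err - gam.err) / lm.err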


d: For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gam_model)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(perc.alumni, 
##     df = 2) + s(PhD, df = 2) + s(Expend, df = 2) + s(Grad.Rate, 
##     df = 2), data = train_set)
## Deviance Residuals:
##       Min        1Q    Median        3Q       Max 
## -7164.185 -1192.389     9.746  1195.918  8668.434 
## 
## (Dispersion Parameter for gaussian family taken to be 3515849)
## 
##     Null Deviance: 8614032615 on 542 degrees of freedom
## Residual Deviance: 1866916246 on 531.0002 degrees of freedom
## AIC: 9739.358 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                  1 1968096326 1968096326 559.778 < 2.2e-16 ***
## s(Room.Board, df = 2)    1 1852547004 1852547004 526.913 < 2.2e-16 ***
## s(perc.alumni, df = 2)   1  739433651  739433651 210.314 < 2.2e-16 ***
## s(PhD, df = 2)           1  408910513  408910513 116.305 < 2.2e-16 ***
## s(Expend, df = 2)        1  669471699  669471699 190.415 < 2.2e-16 ***
## s(Grad.Rate, df = 2)     1  105813593  105813593  30.096 6.372e-08 ***
## Residuals              531 1866916246    3515849                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df Npar F     Pr(F)    
## (Intercept)                                        
## Private                                            
## s(Room.Board, df = 2)        1  5.624 0.0180708 *  
## s(perc.alumni, df = 2)       1  0.517 0.4725226    
## s(PhD, df = 2)               1 11.780 0.0006455 ***
## s(Expend, df = 2)            1 70.804 4.441e-16 ***
## s(Grad.Rate, df = 2)         1  3.159 0.0761031 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Comment: The ANOVA for nonparametric effects gives very strong evidence of a non-linear relationship between Outstate and Expend (p ≈ 4e-16), strong evidence for PhD (p ≈ 0.0006), some evidence for Room.Board (p ≈ 0.018), only weak evidence for Grad.Rate (p ≈ 0.076), and no evidence for perc.alumni.
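
As an illustrative follow-up (a sketch, not required by the question), the absence of non-linear evidence for perc.alumni suggests entering it as a linear term and comparing test error:

# refit with perc.alumni linear, keeping smoothing splines for the other terms
gam_refit <- gam(Outstate ~ Private + s(Room.Board, df = 2) + perc.alumni +
                   s(PhD, df = 2) + s(Expend, df = 2) + s(Grad.Rate, df = 2),
                 data = train_set)
mean((test_set$Outstate - predict(gam_refit, test_set))^2)  # test MSE of the refit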