Chapter 7

Author

Justin Pons

library(ISLR2)
library(tidyverse)
library(boot)
library(splines)
library(gam)
library(leaps)

6A

Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

data("Wage")

Cross-validation is used to compare polynomial degrees 1 through 5.

n <- 5

cv.error <- rep(0,n)

set.seed(111)

for (i in 1:n) {
    # Fit a degree-i polynomial and record its 10-fold CV error estimate
    lm_poly_fit <- glm(wage ~ poly(age, i), data = Wage)
    cv.error[i] <- cv.glm(Wage, lm_poly_fit, K = 10)$delta[1]
}

error <- data.frame(Polynomial = seq_len(n), CvError = cv.error)

minCV <- error |> 
  filter(CvError == min(CvError))

The minimum CV error is extracted and highlighted in red in the plot below. The degree-4 polynomial has the lowest CV error.

error |> 
  ggplot(aes(x = Polynomial, y = CvError)) +
  geom_point() +
  geom_point(data = minCV,
             aes(x = Polynomial, y = CvError),
             color = "red") +
  geom_line()
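
The chosen degree can also be read off programmatically rather than from the plot (a convenience check, not in the original):

# Row index equals the polynomial degree here, so this returns the chosen degree
which.min(error$CvError)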

The null hypothesis for each ANOVA comparison is that the simpler model M1 is sufficient to explain the data; the alternative is that the more complex model M2 is required. The quadratic and cubic terms are clearly significant, the quartic term is borderline (p ≈ 0.051), and the quintic term adds nothing, so the ANOVA supports a degree-3 or degree-4 polynomial, broadly agreeing with the cross-validation choice of degree 4.

fit.1 <- lm(wage ~ age , data = Wage)
fit.2 <- lm(wage ~ poly(age , 2), data = Wage)
fit.3 <- lm(wage ~ poly(age , 3), data = Wage)
fit.4 <- lm(wage ~ poly(age , 4), data = Wage)
fit.5 <- lm(wage ~ poly(age , 5), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
Analysis of Variance Table

Model 1: wage ~ age
Model 2: wage ~ poly(age, 2)
Model 3: wage ~ poly(age, 3)
Model 4: wage ~ poly(age, 4)
Model 5: wage ~ poly(age, 5)
  Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
1   2998 5022216                                    
2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
3   2996 4777674  1     15756   9.8888  0.001679 ** 
4   2995 4771604  1      6070   3.8098  0.051046 .  
5   2994 4770322  1      1283   0.8050  0.369682    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
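
Because poly() builds orthogonal polynomials, the same p-values can be read directly from the coefficient t-tests of the largest model; each squared t-statistic equals the corresponding F-statistic in the table above:

coef(summary(fit.5))  # squared t values reproduce the ANOVA F statistics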

The degree-4 polynomial fit is plotted against the data.

# Prediction grid spanning the observed age range
ages <- seq(range(Wage$age)[1], range(Wage$age)[2])
new <- data.frame(age = ages)
preds <- predict(fit.4, newdata = new)
new <- cbind(new, preds)

Wage |> 
  ggplot(aes(x = age, y = wage)) +
  geom_point(alpha = .5) +
  labs(x = "Age", y = "Wage") +
  geom_line(data = new,
            aes(x = age, y = preds),
            linewidth = 1.5,
            color = "red")
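
As an optional extension (not part of the original figure; band is an illustrative name), pointwise +/-2 SE bands can be drawn around the quartic fit:

# Standard errors for the fitted curve on the prediction grid
preds_se <- predict(fit.4, newdata = new, se.fit = TRUE)
band <- data.frame(age = new$age,
                   lower = preds_se$fit - 2 * preds_se$se.fit,
                   upper = preds_se$fit + 2 * preds_se$se.fit)

Wage |> 
  ggplot(aes(x = age, y = wage)) +
  geom_point(alpha = .5) +
  geom_ribbon(data = band,
              aes(x = age, ymin = lower, ymax = upper),
              inherit.aes = FALSE, alpha = .2) +
  geom_line(data = new, aes(x = age, y = preds),
            linewidth = 1.5, color = "red") +
  labs(x = "Age", y = "Wage")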

6B

Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

Using 10-fold CV over 2 through 20 cuts, the optimal number of cuts is 16.

cv.error <- vector()

set.seed(111)

for (d in 2:20) {
  # Step function with d bins; record the bias-corrected 10-fold CV error
  Wage$age_cut <- cut(Wage$age, d)
  fit <- glm(wage ~ age_cut, data = Wage)
  cv.error[d-1] <- cv.glm(Wage, fit, K = 10)$delta[2]
}

cv.error <- data.frame(CvError = cv.error, Cuts = 2:20)

OptimalCut <- cv.error |> 
  filter(CvError == min(CvError))
# Refit using the CV-selected number of cuts
fit <- glm(wage ~ cut(age, 16), data = Wage)
ages <- seq(range(Wage$age)[1], range(Wage$age)[2])
new <- data.frame(age = ages)
preds <- predict(fit, newdata = new)
new <- cbind(new, preds)

Wage |> 
  ggplot(aes(x = age, y = wage)) +
  geom_point(alpha = .5) +
  labs(x = "Age", y = "Wage") +
  geom_line(data = new,
            aes(x = age, y = preds),
            linewidth = 1.5,
            color = "red")
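
To see exactly which age intervals the step function uses, the bins and their counts can be inspected directly (a quick check, not in the original):

table(cut(Wage$age, 16))  # interval boundaries and observations per bin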

10A

Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

data(College)
set.seed(111)  # seed added for reproducibility, consistent with earlier sections
idx <- sample(nrow(College), .8 * nrow(College), replace = FALSE)  # 80/20 split
train <- College[idx, ]
test <- College[-idx, ]

With five variables, forward stepwise selection chooses Private, Room.Board, PhD, perc.alumni, and Expend; see the sketch after the output for one way to confirm that five variables is a reasonable stopping point.

best <- regsubsets(Outstate ~ ., data = train, nvmax = 10, method = 'forward')
summary(best)
Subset selection object
Call: regsubsets.formula(Outstate ~ ., data = train, nvmax = 10, method = "forward")
17 Variables  (and intercept)
            Forced in Forced out
PrivateYes      FALSE      FALSE
Apps            FALSE      FALSE
Accept          FALSE      FALSE
Enroll          FALSE      FALSE
Top10perc       FALSE      FALSE
Top25perc       FALSE      FALSE
F.Undergrad     FALSE      FALSE
P.Undergrad     FALSE      FALSE
Room.Board      FALSE      FALSE
Books           FALSE      FALSE
Personal        FALSE      FALSE
PhD             FALSE      FALSE
Terminal        FALSE      FALSE
S.F.Ratio       FALSE      FALSE
perc.alumni     FALSE      FALSE
Expend          FALSE      FALSE
Grad.Rate       FALSE      FALSE
1 subsets of each size up to 10
Selection Algorithm: forward
          PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
7  ( 1 )  "*"        " "  "*"    " "    " "       " "       " "        
8  ( 1 )  "*"        " "  "*"    "*"    " "       " "       " "        
9  ( 1 )  "*"        "*"  "*"    "*"    " "       " "       " "        
10  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
          P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
5  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
6  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
7  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
8  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
9  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
10  ( 1 ) " "         "*"        " "   " "      "*" " "      " "      
          perc.alumni Expend Grad.Rate
1  ( 1 )  " "         "*"    " "      
2  ( 1 )  " "         "*"    " "      
3  ( 1 )  " "         "*"    " "      
4  ( 1 )  "*"         "*"    " "      
5  ( 1 )  "*"         "*"    " "      
6  ( 1 )  "*"         "*"    "*"      
7  ( 1 )  "*"         "*"    "*"      
8  ( 1 )  "*"         "*"    "*"      
9  ( 1 )  "*"         "*"    "*"      
10  ( 1 ) "*"         "*"    "*"      
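
Rather than reading the selection table by eye, a criterion such as BIC can be plotted against model size (a sketch, not part of the original write-up; best_sum is an illustrative name):

# Smaller BIC is better; the red point marks the minimizing model size
best_sum <- summary(best)
plot(best_sum$bic, type = "b", xlab = "Number of variables", ylab = "BIC")
points(which.min(best_sum$bic), min(best_sum$bic), col = "red", pch = 19)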

10B

Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

# GAM with natural splines (4 df each) on the five selected predictors
gam_model <- gam(Outstate ~ Private + ns(Room.Board, 4) + ns(PhD, 4) +
                   ns(perc.alumni, 4) + ns(Expend, 4), data = train)

plot(gam_model)
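
The five partial-effect panels are easier to read side by side, optionally with standard-error bands (a plotting convenience, not in the original):

par(mfrow = c(1, 5))  # one row of panels
plot(gam_model, se = TRUE, col = "blue")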

  1. Private universities are associated with higher tuition, holding the other predictors fixed.
  2. Expend has a concave, negative-quadratic relationship with tuition.
  3. The partial effect of PhD decreases until PhD reaches about 50 percent, then increases again.
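
These visual impressions can be checked numerically. Refitting with smoothing splines via s() makes summary() report an ANOVA for nonparametric effects, a direct test of whether each nonlinear term is warranted (gam_smooth is an illustrative name, not from the original):

gam_smooth <- gam(Outstate ~ Private + s(Room.Board, 4) + s(PhD, 4) +
                    s(perc.alumni, 4) + s(Expend, 4), data = train)
summary(gam_smooth)  # "Anova for Nonparametric Effects" tests each smooth term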

10C

Evaluate the model obtained on the test set, and explain the results obtained.

pred <- predict(gam_model, newdata = test, se.fit = TRUE)
mean((pred$fit - test$Outstate)^2)  # test MSE, GAM
[1] 4421748
lm_fit <- lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend, data = train)
pred_lm <- predict(lm_fit, newdata = test)
mean((pred_lm - test$Outstate)^2)   # test MSE, linear baseline
[1] 5412837

The test MSE for the GAM is 4,421,748, noticeably lower than the 5,412,837 from the baseline linear model, so the nonlinear terms appear to improve out-of-sample prediction.
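
For a scale-free comparison (an added check, not part of the original write-up), the same predictions can be summarized as test-set R^2:

tss <- mean((test$Outstate - mean(test$Outstate))^2)  # baseline variance on the test set
1 - mean((pred$fit - test$Outstate)^2) / tss          # test R^2, GAM
1 - mean((pred_lm - test$Outstate)^2) / tss           # test R^2, linear model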