library(ISLR2)   
library(boot)      
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

6a

set.seed(1)
cv.errors <- rep(NA, 10)

for (d in 1:10) {
  glm.fit <- glm(wage ~ poly(age, d), data = Wage)
  cv.errors[d] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}

best.degree <- which.min(cv.errors)
best.degree
## [1] 9

6b

plot(1:10, cv.errors, type = "b", xlab = "Degree", ylab = "CV Error", main = "Polynomial Degree Selection")
abline(v = best.degree, col = "red", lty = 2)

Compare with ANOVA

fit.1 <- lm(wage ~ poly(age, 1), data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)

anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
age.grid <- seq(from = min(Wage$age), to = max(Wage$age), length.out = 100)
best.fit <- lm(wage ~ poly(age, best.degree), data = Wage)
preds <- predict(best.fit, newdata = list(age = age.grid))
plot(Wage$age, Wage$wage, col = "gray", main = paste("Polynomial Degree:", best.degree))
lines(age.grid, preds, col = "blue", lwd = 2)

Step function with cross validation

set.seed(1)
cv.step <- rep(NA, 9)

for (k in 2:10) {
  Wage$age.cut <- cut(Wage$age, k)
  glm.fit <- glm(wage ~ age.cut, data = Wage)
  cv.step[k - 1] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}

best.cut <- which.min(cv.step) + 1
best.cut
## [1] 8
Wage$age.cut <- cut(Wage$age, best.cut)
fit.step <- lm(wage ~ age.cut, data = Wage)

ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(color = "gray", alpha = 0.5) +
  geom_line(aes(y = predict(fit.step)), color = "red", size = 1.2) +
  labs(title = paste("Step Function Fit with", best.cut, "cuts"),
       x = "Age", y = "Wage") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

10a

library(leaps)
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
set.seed(1)
train_index <- sample(1:nrow(College), nrow(College) * 0.7)
train_data <- College[train_index, ]
test_data <- College[-train_index, ]

# forward stepwise selection
forward_fit <- regsubsets(Outstate ~ ., data = train_data, nvmax = ncol(College) - 1, method = "forward")

forward_summary <- summary(forward_fit)

par(mfrow = c(1, 3))
plot(forward_summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "b")
plot(forward_summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "b")
plot(forward_summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted R^2", type = "b")

Best Model Size

which.min(forward_summary$bic)  
## [1] 13

10b Fit GAM

# Fit GAM model with natural splines (df = 4 is a common choice)
gam.fit <- gam(Outstate ~ Private + s(Room.Board, df = 4) + s(Grad.Rate, df = 4) + s(Enroll, df = 4), 
               data = train_data)

summary(gam.fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 4) + s(Grad.Rate, 
##     df = 4) + s(Enroll, df = 4), data = train_data)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -7784.2 -1683.5  -125.1  1568.4 12070.4 
## 
## (Dispersion Parameter for gaussian family taken to be 6362673)
## 
##     Null Deviance: 9260683704 on 542 degrees of freedom
## Residual Deviance: 3365854793 on 529.0002 degrees of freedom
## AIC: 10063.4 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                        Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                 1 2910395019 2910395019 457.417 < 2.2e-16 ***
## s(Room.Board, df = 4)   1 2049646194 2049646194 322.136 < 2.2e-16 ***
## s(Grad.Rate, df = 4)    1  551973881  551973881  86.752 < 2.2e-16 ***
## s(Enroll, df = 4)       1   72166969   72166969  11.342 0.0008128 ***
## Residuals             529 3365854793    6362673                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                       Npar Df Npar F    Pr(F)   
## (Intercept)                                     
## Private                                         
## s(Room.Board, df = 4)       3 3.4040 0.017539 * 
## s(Grad.Rate, df = 4)        3 4.0390 0.007413 **
## s(Enroll, df = 4)           3 3.5328 0.014739 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

All predictors are statistically significant.

par(mfrow = c(2, 2))
plot(gam.fit, se = TRUE, col = "blue")

Being a private college has a large positive effect on out-of-state tuition compared to public colleges. Tuition increases non-linearity with room an dboard costs. Grad.Rate: slight nonlinear increase especially after 60% Enroll: somewhat non-linear effect with diminishing returns

gam.pred <- predict(gam.fit, newdata = test_data)

rmse <- sqrt(mean((test_data$Outstate - gam.pred)^2))
rmse
## [1] 2301.316

On average, the GAM’s predicted out-of-state tuition differs from the actual values by about $2301

All three numeric predictors (Room.board, Grad.Rate, and Enroll) showed statisticallly significant non-linear relationships with the response (Outstate)