Problem # 6
In this exercise, you will further analyze the Wage data set considered throughout this chapter.
# Load the libraries
library(ISLR) # for the Wage data
library(boot) # for cv.glm()
data(Wage)
set.seed(42) # for reproducibility
max.degree <- 10
cv.error <- numeric(max.degree)
for (d in 1:max.degree) {
  glm.fit <- glm(wage ~ poly(age, d), data = Wage)
  cv.out <- cv.glm(Wage, glm.fit, K = 10)
  cv.error[d] <- cv.out$delta[1] # the raw CV estimate
}
best.d <- which.min(cv.error)
best.d
## [1] 9
plot(1:max.degree, cv.error, type = "b",
     xlab = "Polynomial degree (d)",
     ylab = "10-fold CV error",
     main = "CV Error for wage ~ poly(age, d)")
abline(v = best.d, col = "red", lty = 2)
legend("topright", legend = paste("min at d =", best.d), col = "red", lty = 2, bty = "n")
fits <- lapply(1:6, function(d) lm(wage ~ poly(age, d), data = Wage))
anova(fits[[1]], fits[[2]], fits[[3]], fits[[4]], fits[[5]], fits[[6]])
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, d)
## Model 2: wage ~ poly(age, d)
## Model 3: wage ~ poly(age, d)
## Model 4: wage ~ poly(age, d)
## Model 5: wage ~ poly(age, d)
## Model 6: wage ~ poly(age, d)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.6636 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8936 0.001675 **
## 4 2995 4771604 1 6070 3.8117 0.050989 .
## 5 2994 4770322 1 1283 0.8054 0.369565
## 6 2993 4766389 1 3932 2.4692 0.116201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
final.fit <- lm(wage ~ poly(age, best.d), data = Wage)
age.grid <- seq(min(Wage$age), max(Wage$age), length.out = 200)
preds <- predict(final.fit, newdata = list(age = age.grid))
plot(Wage$age, Wage$wage,
     col = "gray40",
     pch = 19,
     cex = 0.5,
     main = paste("Polynomial degree", best.d, "fit"),
     xlab = "Age",
     ylab = "Wage")
lines(age.grid, preds, col = "blue", lwd = 2)
ANOVA table (sequential F-tests):
- Adding the quadratic term (age²) to the linear model yields ΔRSS ≈ 228,786 with F = 143.7 and p < 2.2 × 10⁻¹⁶: highly significant.
- Adding the cubic term (age³) gives ΔRSS ≈ 15,756 with F = 9.89 and p = 0.0017: significant.
- The 4th-degree term is borderline (ΔRSS ≈ 6,070; F = 3.81; p = 0.051).
- The 5th- and 6th-degree terms have p = 0.37 and p = 0.12, respectively: not significant.
- Interpretation: the sequential tests support a polynomial of degree 3–4; higher-order terms do not reliably improve the fit. (The p-values can also be extracted programmatically; see the sketch below.)
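As a minimal sketch (using only the fits list created above), the sequential p-values can be pulled directly out of the anova object rather than read off the printout:
# Extract the sequential F-test p-values from the ANOVA comparison
aov.tab <- anova(fits[[1]], fits[[2]], fits[[3]], fits[[4]], fits[[5]], fits[[6]])
p.vals <- aov.tab[["Pr(>F)"]] # NA for the degree-1 baseline, then one p-value per added term
names(p.vals) <- paste0("degree_", 1:6)
round(p.vals, 4)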
10-fold CV error plot:
- The CV MSE drops sharply from d = 1 to d = 2, then decreases only gradually thereafter.
- The minimum CV error occurs at degree 9 (best.d above), though degrees 5–10 differ by only ~1–2 MSE units.
- Interpretation: purely in terms of estimated test error the 9th-degree polynomial is "optimal", but the differences among the higher degrees are small, so a one-standard-error argument would likely favour a lower degree (see the sketch below).
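cv.glm() reports only the average CV error, so to judge how close the higher degrees really are one can redo the cross-validation by hand and keep the per-fold errors. A hedged sketch (the fold assignment and the objects folds, cv.mean, cv.se are new here, not part of the original analysis):
# Manual 10-fold CV so that a standard error is available for each degree
set.seed(42)
folds <- sample(rep(1:10, length.out = nrow(Wage))) # random fold labels
cv.mean <- cv.se <- numeric(max.degree)
for (d in 1:max.degree) {
  fold.mse <- sapply(1:10, function(k) {
    fit <- lm(wage ~ poly(age, d), data = Wage[folds != k, ])
    mean((Wage$wage[folds == k] - predict(fit, Wage[folds == k, ]))^2)
  })
  cv.mean[d] <- mean(fold.mse)
  cv.se[d] <- sd(fold.mse) / sqrt(10)
}
# One-standard-error rule: smallest degree within one SE of the minimum CV error
min(which(cv.mean <= min(cv.mean) + cv.se[which.min(cv.mean)]))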
Degree-9 fitted curve:
- Wage rises steeply from age 18 to roughly 35–45, plateaus through mid-career, then declines gently after about age 60.
- The high-order terms introduce only very mild "wiggles", so the overall shape remains smooth.
library(ISLR) # Wage data
library(boot) # cv.glm()
data(Wage)
set.seed(42)
# Define the range of k to try (number of intervals)
k.values <- 2:10
n.k <- length(k.values)
# Pre-allocate
cv.err <- numeric(n.k)
breaks.lst <- vector("list", n.k)
# CV loop: for each k, compute equally spaced breaks, fit, and CV-evaluate
for (i in seq_along(k.values)) {
  k <- k.values[i]
  brk <- seq(min(Wage$age), max(Wage$age), length.out = k + 1)
  breaks.lst[[i]] <- brk
  fit <- glm(wage ~ cut(age, breaks = brk, include.lowest = TRUE),
             data = Wage)
  cv.err[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
# Pick the k with the smallest CV error
best.idx <- which.min(cv.err)
best.k <- k.values[best.idx]
cat("CV-optimal number of intervals (cuts) =", best.k, "\n")
## CV-optimal number of intervals (cuts) = 8
# Refit the final model with those exact breaks
my.breaks <- breaks.lst[[best.idx]]
final.sf <- lm(wage ~ cut(age, breaks = my.breaks, include.lowest = TRUE),
               data = Wage)
# Predict on a fine grid (the same breaks are used internally)
age.grid <- seq(min(Wage$age), max(Wage$age), length.out = 200)
preds <- predict(final.sf, newdata = data.frame(age = age.grid))
# Plot CV error vs. k
plot(k.values, cv.err, type = "b",
     xlab = "Number of intervals (k)",
     ylab = "10-fold CV MSE",
     main = "CV Error for step(age, k)")
abline(v = best.k, col = "red", lty = 2)
legend("topright", legend = paste("min at k =", best.k),
col = "red", lty = 2, bty = "n")
# Plot the final step function
plot(Wage$age, Wage$wage,
col = "gray60", pch = 19, cex = 0.5,
xlab = "Age", ylab = "Wage",
main = paste("Step function with", best.k, "intervals"))
lines(age.grid, preds, col = "blue", lwd = 2)
Interpretation: CV selects 8 equally spaced intervals for age as the number that minimizes the estimated test MSE. The resulting piecewise-constant fit rises from roughly 65 in the youngest group to roughly 130 in mid-career, then tapers off to about 90 by age 80 (wage is recorded in thousands of dollars). The step heights are simply the mean wage within each bin; see the sketch below.
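Because the step-function model contains only an intercept and a factor, the fitted level in each interval equals the average wage of the observations falling in that bin. A quick sketch to recover the step heights from the breaks chosen above (the object bins is new, for illustration only):
# Recover the fitted step heights as within-bin mean wages
bins <- cut(Wage$age, breaks = my.breaks, include.lowest = TRUE)
round(tapply(Wage$wage, bins, mean), 1)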
Problem # 10
This question relates to the College data set. (a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
library(leaps)
set.seed(1)
attach(College)
train <- sample(length(Outstate), length(Outstate) / 2)
test <- -train
College.train <- College[train, ]
College.test <- College[test, ]
fit <- regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")
fit.summary <- summary(fit)
par(mfrow = c(1, 3))
plot(fit.summary$cp, xlab = "Number of variables", ylab = "Cp", type = "l")
min.cp <- min(fit.summary$cp)
std.cp <- sd(fit.summary$cp)
abline(h = min.cp + 0.2 * std.cp, col = "red", lty = 2)
abline(h = min.cp - 0.2 * std.cp, col = "red", lty = 2)
plot(fit.summary$bic, xlab = "Number of variables", ylab = "BIC", type='l')
min.bic <- min(fit.summary$bic)
std.bic <- sd(fit.summary$bic)
abline(h = min.bic + 0.2 * std.bic, col = "red", lty = 2)
abline(h = min.bic - 0.2 * std.bic, col = "red", lty = 2)
plot(fit.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R2", type = "l", ylim = c(0.4, 0.84))
max.adjr2 <- max(fit.summary$adjr2)
std.adjr2 <- sd(fit.summary$adjr2)
abline(h = max.adjr2 + 0.2 * std.adjr2, col = "red", lty = 2)
abline(h = max.adjr2 - 0.2 * std.adjr2, col = "red", lty = 2)
# Cp, BIC, and adjusted R² all indicate that 6 is the smallest subset size whose score is within 0.2 standard deviations of the optimum.
# Coefficients of the 6-variable model selected on the training set
coeffs <- coef(fit, id = 6)
names(coeffs)
## [1] "(Intercept)" "PrivateYes" "Room.Board" "Terminal" "perc.alumni"
## [6] "Expend" "Grad.Rate"
Interpretation:
All three forward-stepwise criteria point to a 6-variable model as the simplest one whose performance is essentially indistinguishable (within ±0.2 SD of the optimum) from larger models:
- Cp drops sharply as predictors are added, then flattens out; any model with ≥6 variables lies within the red ±0.2 SD band around the minimum Cp.
- BIC attains its lowest values around 6 predictors (again within the dashed tolerance band).
- Adjusted R² rises quickly up to 6 variables and shows only negligible gains thereafter.
In other words, out-of-state tuition is most parsimoniously predicted by whether a school is private, its room and board cost, the percentage of faculty with terminal degrees, the percentage of alumni who donate, instructional expenditure, and graduation rate. (The same selection can be reproduced programmatically; see the sketch below.)
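A small sketch, using only the objects already computed above, that applies the ±0.2 SD tolerance numerically instead of reading it off the plots (the names size.cp, size.bic, size.adjr2 are new, for illustration):
# Smallest model size within 0.2 SD of the optimum for each criterion
size.cp <- min(which(fit.summary$cp <= min.cp + 0.2 * std.cp))
size.bic <- min(which(fit.summary$bic <= min.bic + 0.2 * std.bic))
size.adjr2 <- min(which(fit.summary$adjr2 >= max.adjr2 - 0.2 * std.adjr2))
c(Cp = size.cp, BIC = size.bic, AdjR2 = size.adjr2)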
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
gam1 <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) +
              s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2),
            data = College.train)
par(mfrow = c(2, 3))
plot(gam1, se = T, col = "blue")
Interpretation:
Judging from the fitted smooths and the nonparametric ANOVA reported in the summary below, the effects of Room.Board, PhD, perc.alumni, and Grad.Rate on out-of-state tuition are close to linear, whereas Expend shows a clearly non-linear relationship. (Note that the GAM uses PhD in place of the stepwise-selected Terminal; both measure faculty qualifications.)
# Predictors selected by forward stepwise (used in the linear fit below)
sel <- c("Private", "Room.Board", "Terminal", "perc.alumni", "Expend", "Grad.Rate")
# Fit an ordinary linear model on the train set
lm.fit <- lm(Outstate ~ Private + Room.Board + Terminal + perc.alumni + Expend + Grad.Rate,
             data = College.train)
preds <- predict(lm.fit, newdata = College.test)
# Helper: a predict() method for regsubsets objects
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]]) # recover the model formula from the original call
  mm <- model.matrix(form, newdata)
  coefs <- coef(object, id = id)
  mm[, names(coefs)] %*% coefs
}
preds <- predict.regsubsets(fit, newdata = College.test, id = 6) # overwrites the lm predictions above
err <- mean((College.test$Outstate - preds)^2)
tss <- mean((College.test$Outstate - mean(College.test$Outstate))^2)
R2 <- 1 - err/tss
print(head(preds)) # first 6 predicted tuitions
## [,1]
## Agnes Scott College 15446.224
## Albertson College 8759.787
## Albertus Magnus College 13120.733
## Albion College 13888.712
## Albright College 12266.663
## Alderson-Broaddus College 7618.211
cat("Test MSE =", err, "\n")
## Test MSE = 3844857
cat("Test R2 =", R2, "\n")
## Test R2 = 0.7313788
Interpretation:
The 6-variable linear model explains about 73.1% of the variance in out-of-state tuition on the held-out test set (test MSE ≈ 3.84 million).
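It is natural to ask whether the GAM fitted above improves on this out of sample. A hedged sketch, not part of the original write-up, that applies the fitted GAM to the same test split (gam.preds, gam.err, gam.R2 are new names):
# Evaluate the GAM on the same held-out test set
gam.preds <- predict(gam1, newdata = College.test)
gam.err <- mean((College.test$Outstate - gam.preds)^2)
gam.R2 <- 1 - gam.err / tss
c(Test.MSE = gam.err, Test.R2 = gam.R2)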
summary(gam1)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD,
## df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate,
## df = 2), data = College.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7402.89 -1114.45 -12.67 1282.69 7470.60
##
## (Dispersion Parameter for gaussian family taken to be 3711182)
##
## Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1384271126 on 373 degrees of freedom
## AIC: 6987.021
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1778718277 1778718277 479.286 < 2.2e-16 ***
## s(Room.Board, df = 2) 1 1577115244 1577115244 424.963 < 2.2e-16 ***
## s(PhD, df = 2) 1 322431195 322431195 86.881 < 2.2e-16 ***
## s(perc.alumni, df = 2) 1 336869281 336869281 90.771 < 2.2e-16 ***
## s(Expend, df = 5) 1 530538753 530538753 142.957 < 2.2e-16 ***
## s(Grad.Rate, df = 2) 1 86504998 86504998 23.309 2.016e-06 ***
## Residuals 373 1384271126 3711182
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 2) 1 1.9157 0.1672
## s(PhD, df = 2) 1 0.9699 0.3253
## s(perc.alumni, df = 2) 1 0.1859 0.6666
## s(Expend, df = 5) 4 20.5075 2.665e-15 ***
## s(Grad.Rate, df = 2) 1 0.5702 0.4506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretation:
- Model fit: the residual deviance is ≈ 1.38 billion versus a null deviance of ≈ 6.99 billion, so the GAM removes roughly 80% of the unexplained variation.
- All six terms are highly significant in the parametric ANOVA (every p < 10⁻⁵).
- Non-linear evidence (nonparametric ANOVA): only Expend shows strong non-linearity (p ≈ 2.7 × 10⁻¹⁵); Room.Board (p ≈ 0.17), PhD (p ≈ 0.33), Grad.Rate (p ≈ 0.45), and perc.alumni (p ≈ 0.67) have no significant non-linear component, so linear terms would suffice for them (see the sketch below for a direct test on Expend).
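To confirm that the smooth term for Expend is doing real work, one can refit the GAM with Expend entered linearly and compare the two fits with an F-test; a hedged sketch (the model gam.lin is new here, for illustration):
# Refit with a linear Expend term and compare against the smooth version
gam.lin <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) +
                 s(perc.alumni, df = 2) + Expend + s(Grad.Rate, df = 2),
               data = College.train)
anova(gam.lin, gam1, test = "F") # tests the extra non-linear degrees of freedom for Expend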