library(ISLR2)
library(boot)
library(splines)
library(gam)
library(leaps)
library(MASS)
library(ggplot2)
set.seed(1)
Wage
Goal. Fit a step function for
wage ~ age, choose the number of cuts by 10-fold
cross-validation, and plot the resulting fit.
attach(Wage)
max_cuts <- 10
cv_err <- rep(NA, max_cuts)
for (k in 2:max_cuts) {
  # Precompute the cut factor on the full data so every CV fold shares the
  # same break points and factor levels.
  Wage$age.cut <- cut(Wage$age, k)
  fit <- glm(wage ~ age.cut, data = Wage)
  cv_err[k] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
cv_err
## [1] NA 1734.489 1684.271 1635.552 1632.080 1623.415 1614.996 1601.318
## [9] 1613.954 1606.331
best_k <- which.min(cv_err)
best_k
## [1] 8
The CV curve is minimized at best_k cuts. Now refit with
the chosen number of cuts and plot.
plot(2:max_cuts, cv_err[-1], type = "b", pch = 19,
xlab = "Number of cuts", ylab = "10-fold CV MSE",
main = "CV error vs. number of cuts")
abline(v = best_k, lty = 2, col = "red")
# cut() is re-evaluated on newdata at predict time; this works here only
# because age_grid spans exactly the same range as Wage$age.
fit_best <- glm(wage ~ cut(age, best_k), data = Wage)
age_grid <- seq(min(Wage$age), max(Wage$age), length.out = 200)
preds <- predict(fit_best, newdata = list(age = age_grid))
plot(Wage$age, Wage$wage, col = "darkgrey", cex = 0.5,
xlab = "Age", ylab = "Wage",
main = paste("Step function fit with", best_k, "cuts"))
lines(age_grid, preds, lwd = 2, col = "blue")
detach(Wage)
Comment. The CV error drops sharply from 2 to ~4
cuts, then plateaus. After roughly 4–8 cuts the additional flexibility
yields diminishing returns; the chosen best_k gives a
reasonable bias–variance trade-off.
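cv.glm() does not report fold-level standard errors, so the usual one-standard-error rule is not directly available. A rough stand-in, sketched below, is to take the smallest number of cuts whose CV error is within a small tolerance of the minimum; the 1% tolerance is my own choice, not part of the exercise.
tol <- 1.01 * min(cv_err, na.rm = TRUE)        # within 1% of the best CV error
parsimonious_k <- min(which(cv_err <= tol))    # smallest such number of cuts
parsimonious_k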
Wage predictors with non-linear methods
The Wage data contains several factors not used in the
chapter examples: maritl, jobclass,
race, education, health,
health_ins, plus the continuous year and
age. Let's explore them.
par(mfrow = c(2, 3))
plot(Wage$maritl, Wage$wage, main = "Marital status", col = "lightblue")
plot(Wage$jobclass, Wage$wage, main = "Job class", col = "lightblue")
plot(Wage$race, Wage$wage, main = "Race", col = "lightblue")
plot(Wage$education, Wage$wage, main = "Education", col = "lightblue")
plot(Wage$health, Wage$wage, main = "Health", col = "lightblue")
plot(Wage$health_ins, Wage$wage, main = "Health insurance", col = "lightblue")
par(mfrow = c(1, 1))
Observations. Wage is noticeably higher for workers with more education and
for those with health insurance; married workers and Information-sector jobs
also show higher median wages, while differences across race are smaller.
gam_base <- gam(wage ~ s(age, 5) + s(year, 4), data = Wage)
gam_m1 <- gam(wage ~ s(age, 5) + s(year, 4) + education, data = Wage)
gam_m2 <- gam(wage ~ s(age, 5) + s(year, 4) + education + jobclass, data = Wage)
gam_m3 <- gam(wage ~ s(age, 5) + s(year, 4) + education + jobclass + maritl, data = Wage)
gam_m4 <- gam(wage ~ s(age, 5) + s(year, 4) + education + jobclass + maritl + health + health_ins,
data = Wage)
anova(gam_base, gam_m1, gam_m2, gam_m3, gam_m4, test = "F")
## Analysis of Deviance Table
##
## Model 1: wage ~ s(age, 5) + s(year, 4)
## Model 2: wage ~ s(age, 5) + s(year, 4) + education
## Model 3: wage ~ s(age, 5) + s(year, 4) + education + jobclass
## Model 4: wage ~ s(age, 5) + s(year, 4) + education + jobclass + maritl
## Model 5: wage ~ s(age, 5) + s(year, 4) + education + jobclass + maritl +
## health + health_ins
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 2990 4738489
## 2 2986 3689770 4 1048718 230.217 < 2.2e-16 ***
## 3 2985 3677553 1 12218 10.728 0.001067 **
## 4 2981 3581781 4 95772 21.024 < 2.2e-16 ***
## 5 2979 3392602 2 189179 83.058 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Each successive predictor is highly significant — every term improves
the fit beyond what a smooth in age and year
already explains.
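As a side check (not part of the model sequence above), we can ask whether year needs a smooth at all by comparing a linear year term against s(year, 4), keeping the other gam_m4 terms fixed:
gam_year_lin <- gam(wage ~ s(age, 5) + year + education + jobclass + maritl +
                      health + health_ins, data = Wage)
anova(gam_year_lin, gam_m4, test = "F")  # smooth vs. linear year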
par(mfrow = c(2, 4))  # gam_m4 has seven terms, so a 2 x 3 layout would overflow
plot(gam_m4, se = TRUE, col = "blue")
par(mfrow = c(1, 1))
Findings. The partial plots show the familiar hump-shaped effect of age, a
mild increase over year, and clear upward shifts for more education,
Information-sector jobs, married workers, very good health, and having
health insurance.
Auto
Question. Is there evidence for non-linear
relationships in the Auto data?
auto <- na.omit(Auto)
pairs(auto[, c("mpg", "horsepower", "weight", "displacement", "acceleration")],
pch = ".", main = "Auto: pairwise relationships")
mpg vs. horsepower, weight,
and displacement all show clear curvature. Focus on
mpg ~ horsepower.
fit1 <- lm(mpg ~ poly(horsepower, 1), data = auto)
fit2 <- lm(mpg ~ poly(horsepower, 2), data = auto)
fit3 <- lm(mpg ~ poly(horsepower, 3), data = auto)
fit4 <- lm(mpg ~ poly(horsepower, 4), data = auto)
fit5 <- lm(mpg ~ poly(horsepower, 5), data = auto)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
##
## Model 1: mpg ~ poly(horsepower, 1)
## Model 2: mpg ~ poly(horsepower, 2)
## Model 3: mpg ~ poly(horsepower, 3)
## Model 4: mpg ~ poly(horsepower, 4)
## Model 5: mpg ~ poly(horsepower, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 390 9385.9
## 2 389 7442.0 1 1943.89 103.8767 < 2.2e-16 ***
## 3 388 7426.4 1 15.59 0.8333 0.361897
## 4 387 7399.5 1 26.91 1.4382 0.231169
## 5 386 7223.4 1 176.15 9.4131 0.002306 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The quadratic term is overwhelmingly significant (p < 2e-16), while the cubic
and quartic terms are not (p = 0.36 and p = 0.23). The degree-5 term is
nominally significant (p = 0.002), but the cross-validation below shows it
buys almost nothing in prediction error; a quadratic captures the bulk of the
non-linearity.
cv_poly <- rep(NA, 10)
for (d in 1:10) {
fit <- glm(mpg ~ poly(horsepower, d), data = auto)
cv_poly[d] <- cv.glm(auto, fit, K = 10)$delta[1]
}
plot(1:10, cv_poly, type = "b", pch = 19,
xlab = "Polynomial degree", ylab = "10-fold CV MSE",
main = "mpg ~ poly(horsepower, d)")
which.min(cv_poly)
## [1] 7
Although which.min() lands on degree 7 for this particular fold split, the CV
curve drops sharply from degree 1 to 2 and is essentially flat afterwards; the
gain beyond the quadratic is negligible, so degree 2 remains the sensible
choice.
hp_grid <- seq(min(auto$horsepower), max(auto$horsepower), length.out = 200)
fit_bs <- lm(mpg ~ bs(horsepower, df = 4), data = auto)
fit_ns <- lm(mpg ~ ns(horsepower, df = 4), data = auto)
fit_q <- lm(mpg ~ poly(horsepower, 2), data = auto)
plot(auto$horsepower, auto$mpg, col = "grey", pch = 19, cex = 0.6,
xlab = "Horsepower", ylab = "MPG",
main = "Auto: non-linear fits of mpg on horsepower")
lines(hp_grid, predict(fit_q, newdata = list(horsepower = hp_grid)), col = "blue", lwd = 2)
lines(hp_grid, predict(fit_bs, newdata = list(horsepower = hp_grid)), col = "red", lwd = 2, lty = 2)
lines(hp_grid, predict(fit_ns, newdata = list(horsepower = hp_grid)), col = "darkgreen", lwd = 2, lty = 3)
legend("topright", c("Quadratic", "B-spline (df=4)", "Natural spline (df=4)"),
col = c("blue", "red", "darkgreen"), lty = c(1, 2, 3), lwd = 2, bty = "n")
gam_auto <- gam(mpg ~ s(horsepower, 4) + s(weight, 4) + s(displacement, 4)
+ s(acceleration, 4) + factor(origin),
data = auto)
par(mfrow = c(2, 3))  # gam_auto has five terms to plot, so 1 x 4 would overflow
plot(gam_auto, se = TRUE, col = "blue")
par(mfrow = c(1, 1))
summary(gam_auto)
##
## Call: gam(formula = mpg ~ s(horsepower, 4) + s(weight, 4) + s(displacement,
## 4) + s(acceleration, 4) + factor(origin), data = auto)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -11.5252 -2.2879 -0.2182 1.8221 16.4252
##
## (Dispersion Parameter for gaussian family taken to be 13.9528)
##
## Null Deviance: 23818.99 on 391 degrees of freedom
## Residual Deviance: 5204.398 on 373.0002 degrees of freedom
## AIC: 2166.159
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(horsepower, 4) 1 15675.1 15675.1 1123.4355 < 2.2e-16 ***
## s(weight, 4) 1 1088.1 1088.1 77.9821 < 2.2e-16 ***
## s(displacement, 4) 1 53.7 53.7 3.8495 0.0505048 .
## s(acceleration, 4) 1 155.9 155.9 11.1737 0.0009136 ***
## factor(origin) 2 213.3 106.6 7.6436 0.0005580 ***
## Residuals 373 5204.4 14.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(horsepower, 4) 3 15.3602 1.92e-09 ***
## s(weight, 4) 3 0.7805 0.505404
## s(displacement, 4) 3 5.6932 0.000806 ***
## s(acceleration, 4) 3 2.5391 0.056280 .
## factor(origin)
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Conclusion. Yes, there is strong evidence for
non-linearity, most visibly in horsepower,
weight, and displacement in the pairwise plots.
The non-parametric ANOVA in summary(gam_auto) rejects
linearity for horsepower and displacement (and
marginally for acceleration); with the other smooths in the model,
weight looks adequately linear. A quadratic in
horsepower already captures most of the curvature; splines and GAMs offer
modest additional flexibility.
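To put a single number on "strong evidence for non-linearity", a supplementary F test (my own addition, not required by the exercise) compares an all-linear fit against the smooth GAM:
gam_lin <- gam(mpg ~ horsepower + weight + displacement + acceleration +
                 factor(origin), data = auto)   # same predictors, linear terms
anova(gam_lin, gam_auto, test = "F")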
Boston data: nox ~ dis
data(Boston, package = "MASS")
boston <- Boston
fit_poly3 <- lm(nox ~ poly(dis, 3), data = boston)
summary(fit_poly3)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
dis_grid <- seq(min(boston$dis), max(boston$dis), length.out = 200)
preds <- predict(fit_poly3, newdata = list(dis = dis_grid), se = TRUE)
plot(boston$dis, boston$nox, col = "darkgrey", pch = 19, cex = 0.5,
xlab = "dis", ylab = "nox",
main = "Cubic polynomial fit: nox ~ poly(dis, 3)")
lines(dis_grid, preds$fit, col = "blue", lwd = 2)
lines(dis_grid, preds$fit + 2 * preds$se, col = "blue", lwd = 1, lty = 2)
lines(dis_grid, preds$fit - 2 * preds$se, col = "blue", lwd = 1, lty = 2)
All three polynomial terms are highly significant. The fit captures a
clear decreasing trend with curvature: nox falls sharply as
dis increases from low values, then levels off.
rss_poly <- rep(NA, 10)
plot(boston$dis, boston$nox, col = "darkgrey", pch = 19, cex = 0.5,
xlab = "dis", ylab = "nox",
main = "Polynomial fits, degrees 1 to 10")
cols <- rainbow(10)
for (d in 1:10) {
fit <- lm(nox ~ poly(dis, d), data = boston)
rss_poly[d] <- sum(resid(fit)^2)
lines(dis_grid, predict(fit, newdata = list(dis = dis_grid)),
col = cols[d], lwd = 1.5)
}
legend("topright", legend = paste("d =", 1:10),
col = cols, lwd = 1.5, cex = 0.7, bty = "n", ncol = 2)
data.frame(degree = 1:10, RSS = round(rss_poly, 4))
## degree RSS
## 1 1 2.7686
## 2 2 2.0353
## 3 3 1.9341
## 4 4 1.9330
## 5 5 1.9153
## 6 6 1.8783
## 7 7 1.8495
## 8 8 1.8356
## 9 9 1.8333
## 10 10 1.8322
RSS strictly decreases with degree (training-set guarantee), but high-degree fits start to wiggle near the data boundaries — a sign of overfitting.
cv_poly_b <- rep(NA, 10)
for (d in 1:10) {
fit <- glm(nox ~ poly(dis, d), data = boston)
cv_poly_b[d] <- cv.glm(boston, fit, K = 10)$delta[1]
}
plot(1:10, cv_poly_b, type = "b", pch = 19,
xlab = "Polynomial degree", ylab = "10-fold CV MSE",
main = "CV error: nox ~ poly(dis, d)")
which.min(cv_poly_b)
## [1] 3
CV bottoms out around degree 3 or 4 and rises again for higher degrees — overfitting. A cubic is a good, parsimonious choice.
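As a cross-check outside the polynomial family (again my own addition), a smoothing spline with its effective degrees of freedom chosen by generalized cross-validation gives another data-driven estimate of how much flexibility the relationship supports:
ss <- smooth.spline(boston$dis, boston$nox)  # df chosen by GCV, the default
ss$df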
Regression spline with bs()
fit_bs4 <- lm(nox ~ bs(dis, df = 4), data = boston)
summary(fit_bs4)
##
## Call:
## lm(formula = nox ~ bs(dis, df = 4), data = boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.124622 -0.039259 -0.008514 0.020850 0.193891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73447 0.01460 50.306 < 2e-16 ***
## bs(dis, df = 4)1 -0.05810 0.02186 -2.658 0.00812 **
## bs(dis, df = 4)2 -0.46356 0.02366 -19.596 < 2e-16 ***
## bs(dis, df = 4)3 -0.19979 0.04311 -4.634 4.58e-06 ***
## bs(dis, df = 4)4 -0.38881 0.04551 -8.544 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared: 0.7164, Adjusted R-squared: 0.7142
## F-statistic: 316.5 on 4 and 501 DF, p-value: < 2.2e-16
attr(bs(boston$dis, df = 4), "knots") # how R chose the interior knots
## [1] 3.20745
When you specify df = 4 for a cubic B-spline (the
default degree = 3), R places one interior knot at
the median of dis: with the default intercept = FALSE,
df = degree + number of interior knots, so df = 4 means a single interior
knot, placed at the 50th percentile. I let R choose the knots by quantiles of
the data; this is the standard approach when there is no domain reason to
prefer specific cut points.
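Equivalently, the knot can be passed explicitly. A quick sketch (purely illustrative): bs(dis, df = 4) and bs(dis, knots = median(dis)) should describe the same basis here, since both place the single interior knot at the median.
fit_bs4_knots <- lm(nox ~ bs(dis, knots = median(dis)), data = boston)
all.equal(fitted(fit_bs4), fitted(fit_bs4_knots))  # expected to be TRUE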
pred_bs <- predict(fit_bs4, newdata = list(dis = dis_grid), se = TRUE)
plot(boston$dis, boston$nox, col = "darkgrey", pch = 19, cex = 0.5,
xlab = "dis", ylab = "nox",
main = "Regression spline (bs, df = 4)")
lines(dis_grid, pred_bs$fit, col = "red", lwd = 2)
lines(dis_grid, pred_bs$fit + 2 * pred_bs$se, col = "red", lwd = 1, lty = 2)
lines(dis_grid, pred_bs$fit - 2 * pred_bs$se, col = "red", lwd = 1, lty = 2)
rss_bs <- rep(NA, 13)
plot(boston$dis, boston$nox, col = "darkgrey", pch = 19, cex = 0.5,
xlab = "dis", ylab = "nox",
main = "Regression splines, df = 3 to 15")
cols2 <- rainbow(13)
for (i in seq_along(cols2)) {
df_i <- i + 2 # df = 3..15
fit <- lm(nox ~ bs(dis, df = df_i), data = boston)
rss_bs[i] <- sum(resid(fit)^2)
lines(dis_grid, predict(fit, newdata = list(dis = dis_grid)),
col = cols2[i], lwd = 1.2)
}
legend("topright", legend = paste("df =", 3:15),
col = cols2, lwd = 1.2, cex = 0.7, bty = "n", ncol = 2)
data.frame(df = 3:15, RSS = round(rss_bs, 4))
## df RSS
## 1 3 1.9341
## 2 4 1.9228
## 3 5 1.8402
## 4 6 1.8340
## 5 7 1.8299
## 6 8 1.8170
## 7 9 1.8257
## 8 10 1.7925
## 9 11 1.7970
## 10 12 1.7890
## 11 13 1.7824
## 12 14 1.7818
## 13 15 1.7828
Training RSS decreases monotonically with df, as expected. Visually,
fits beyond df ≈ 8 start to wiggle in regions with few observations
(large dis).
cv_bs <- rep(NA, 13)
for (i in seq_along(cv_bs)) {
df_i <- i + 2
fit <- glm(nox ~ bs(dis, df = df_i), data = boston)
cv_bs[i] <- cv.glm(boston, fit, K = 10)$delta[1]
}
plot(3:15, cv_bs, type = "b", pch = 19,
xlab = "Degrees of freedom", ylab = "10-fold CV MSE",
main = "CV error: bs(dis, df)")
best_df <- which.min(cv_bs) + 2
best_df
## [1] 13
CV selects df = 13 with this seed (best_df above), but the curve is fairly
flat from roughly df = 8 onward and the minimizer moves around with the
random fold split, so the exact choice is not very sensitive; anything in the
8–13 range gives a similar fit.
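Since the minimizer depends on the fold assignment, a quick stability check (seeds chosen arbitrarily, not part of the exercise) is to repeat the 10-fold CV a few times and record the selected df each time:
sapply(1:5, function(s) {
  set.seed(s)
  err <- sapply(3:15, function(df_i) {
    fit <- glm(nox ~ bs(dis, df = df_i), data = boston)
    cv.glm(boston, fit, K = 10)$delta[1]
  })
  (3:15)[which.min(err)]   # df chosen under this seed
})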
College
Goal. Split College into training/test
sets; with Outstate as the response and the others as
predictors, use forward stepwise selection to find a satisfactory subset
model.
set.seed(1)
n <- nrow(College)
train <- sample(seq_len(n), size = floor(0.7 * n))
test <- setdiff(seq_len(n), train)
college_train <- College[train, ]
college_test <- College[test, ]
Forward stepwise selection with regsubsets()
fwd <- regsubsets(Outstate ~ ., data = college_train,
                  nvmax = 17, method = "forward")
fwd_summary <- summary(fwd)
par(mfrow = c(1, 3))
plot(fwd_summary$cp, type = "b", pch = 19,
xlab = "Number of variables", ylab = "Cp")
plot(fwd_summary$bic, type = "b", pch = 19,
xlab = "Number of variables", ylab = "BIC")
plot(fwd_summary$adjr2, type = "b", pch = 19,
xlab = "Number of variables", ylab = "Adj R^2")
par(mfrow = c(1, 1))
c(Cp_best = which.min(fwd_summary$cp),
BIC_best = which.min(fwd_summary$bic),
AdjR2_best = which.max(fwd_summary$adjr2))
## Cp_best BIC_best AdjR2_best
## 13 13 13
With this split, Cp, BIC, and adjusted R² all reach their optimum at 13
variables (BIC, which penalizes model size more aggressively, often picks
fewer on other splits). The criteria improve quickly up to about 6 variables
and then plateau, so a parsimonious 6-variable model is a defensible
compromise; the validation-error sketch that follows makes the same point.
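To see the plateau on held-out data, compute the validation-set MSE for every model size. leaps has no predict() method for regsubsets objects, so the sketch below uses the usual hand-written helper (the helper name is my own):
predict_regsubsets <- function(object, newdata, id) {
  form  <- as.formula(object$call[[2]])   # recover the model formula
  mat   <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
val_err <- sapply(1:17, function(k)
  mean((college_test$Outstate - predict_regsubsets(fwd, college_test, k))^2))
plot(1:17, val_err, type = "b", pch = 19,
     xlab = "Number of variables", ylab = "Validation MSE")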
chosen_k <- 6
coef(fwd, chosen_k)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3782.6544026 2808.0522815 0.9722009 38.0615210 59.1918276
## Expend Grad.Rate
## 0.2032837 28.6598253
These six predictors (Private,
Room.Board, PhD,
perc.alumni, Expend, Grad.Rate)
tend to be picked up early by the forward search: they cover the major axes
of sector, cost of attendance, faculty quality, alumni engagement,
spending, and outcomes.
test_mat <- model.matrix(Outstate ~ ., data = college_test)
coefs <- coef(fwd, id = chosen_k)
preds <- test_mat[, names(coefs)] %*% coefs
test_mse <- mean((college_test$Outstate - preds)^2)
test_mse
## [1] 3635355
sqrt(test_mse)
## [1] 1906.661
The resulting RMSE gives a tangible sense of out-of-state-tuition
prediction error in dollars; you can compare this against the response’s
standard deviation (sd(College$Outstate) ≈ 4000) to judge
fit quality.
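Another way to put the test MSE on a unitless scale (a small addition of mine) is a test-set R², i.e. the proportional reduction in squared error relative to always predicting the training mean:
test_r2 <- 1 - test_mse /
  mean((college_test$Outstate - mean(college_train$Outstate))^2)
test_r2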