library(ISLR2)
library(boot)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
set.seed(1)
cv.errors <- rep(NA, 10)
for (d in 1:10) {
glm.fit <- glm(wage ~ poly(age, d), data = Wage)
cv.errors[d] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
best.degree <- which.min(cv.errors)
best.degree
## [1] 9
plot(1:10, cv.errors, type = "b", xlab = "Degree", ylab = "CV Error", main = "Polynomial Degree Selection")
abline(v = best.degree, col = "red", lty = 2)
Compare with ANOVA
fit.1 <- lm(wage ~ poly(age, 1), data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
age.grid <- seq(from = min(Wage$age), to = max(Wage$age), length.out = 100)
best.fit <- lm(wage ~ poly(age, best.degree), data = Wage)
preds <- predict(best.fit, newdata = list(age = age.grid))
plot(Wage$age, Wage$wage, col = "gray", main = paste("Polynomial Degree:", best.degree))
lines(age.grid, preds, col = "blue", lwd = 2)
Step function with cross validation
set.seed(1)
cv.step <- rep(NA, 9)
for (k in 2:10) {
Wage$age.cut <- cut(Wage$age, k)
glm.fit <- glm(wage ~ age.cut, data = Wage)
cv.step[k - 1] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
best.cut <- which.min(cv.step) + 1
best.cut
## [1] 8
Wage$age.cut <- cut(Wage$age, best.cut)
fit.step <- lm(wage ~ age.cut, data = Wage)
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(color = "gray", alpha = 0.5) +
geom_line(aes(y = predict(fit.step)), color = "red", size = 1.2) +
labs(title = paste("Step Function Fit with", best.cut, "cuts"),
x = "Age", y = "Wage") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(leaps)
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
set.seed(1)
train_index <- sample(1:nrow(College), nrow(College) * 0.7)
train_data <- College[train_index, ]
test_data <- College[-train_index, ]
# forward stepwise selection
forward_fit <- regsubsets(Outstate ~ ., data = train_data, nvmax = ncol(College) - 1, method = "forward")
forward_summary <- summary(forward_fit)
par(mfrow = c(1, 3))
plot(forward_summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "b")
plot(forward_summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "b")
plot(forward_summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted R^2", type = "b")
Best Model Size
which.min(forward_summary$bic)
## [1] 13
# Fit GAM model with natural splines (df = 4 is a common choice)
gam.fit <- gam(Outstate ~ Private + s(Room.Board, df = 4) + s(Grad.Rate, df = 4) + s(Enroll, df = 4),
data = train_data)
summary(gam.fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 4) + s(Grad.Rate,
## df = 4) + s(Enroll, df = 4), data = train_data)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7784.2 -1683.5 -125.1 1568.4 12070.4
##
## (Dispersion Parameter for gaussian family taken to be 6362673)
##
## Null Deviance: 9260683704 on 542 degrees of freedom
## Residual Deviance: 3365854793 on 529.0002 degrees of freedom
## AIC: 10063.4
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2910395019 2910395019 457.417 < 2.2e-16 ***
## s(Room.Board, df = 4) 1 2049646194 2049646194 322.136 < 2.2e-16 ***
## s(Grad.Rate, df = 4) 1 551973881 551973881 86.752 < 2.2e-16 ***
## s(Enroll, df = 4) 1 72166969 72166969 11.342 0.0008128 ***
## Residuals 529 3365854793 6362673
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 4) 3 3.4040 0.017539 *
## s(Grad.Rate, df = 4) 3 4.0390 0.007413 **
## s(Enroll, df = 4) 3 3.5328 0.014739 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
All predictors are statistically significant.
par(mfrow = c(2, 2))
plot(gam.fit, se = TRUE, col = "blue")
Being a private college has a large positive effect on out-of-state
tuition compared to public colleges. Tuition increases non-linearity
with room an dboard costs. Grad.Rate: slight nonlinear increase
especially after 60% Enroll: somewhat non-linear effect with diminishing
returns
gam.pred <- predict(gam.fit, newdata = test_data)
rmse <- sqrt(mean((test_data$Outstate - gam.pred)^2))
rmse
## [1] 2301.316
On average, the GAM’s predicted out-of-state tuition differs from the actual values by about $2301
All three numeric predictors (Room.board, Grad.Rate, and Enroll) showed statisticallly significant non-linear relationships with the response (Outstate)