In this exercise, you will further analyze the Wage data set considered throughout this chapter.
Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
library(ISLR2)
library(boot)
library(ggplot2)
# Remove rows with missing values
Wage <- na.omit(Wage)
set.seed(1)
CrossVali_errors <- rep(NA, 10)
for (d in 1:10) {
fit <- glm(wage ~ poly(age, d), data = Wage)
CrossVali_errors[d] <- cv.glm(Wage, fit, K = 10)$delta[1]
cat(sprintf("Degree %2d CV error: %7.2f\n", d, CrossVali_errors[d]))
}
## Degree 1 CV error: 1676.83
## Degree 2 CV error: 1600.76
## Degree 3 CV error: 1598.40
## Degree 4 CV error: 1595.65
## Degree 5 CV error: 1594.98
## Degree 6 CV error: 1596.06
## Degree 7 CV error: 1594.30
## Degree 8 CV error: 1598.13
## Degree 9 CV error: 1593.91
## Degree 10 CV error: 1595.95
optimal_degree <- which.min(CrossVali_errors)
cat(sprintf("\n>> Optimal degree by 10-fold CV: %d\n\n", optimal_degree))
##
## >> Optimal degree by 10-fold CV: 9
# Visualize the errors by degree
cv_df <- data.frame(
Degree = 1:10,
CV_Error = CrossVali_errors
)
ggplot(cv_df, aes(x = Degree, y = CV_Error)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red", size = 2) +
labs(
title = "10-fold CV Error by Polynomial Degree",
x = "Polynomial Degree",
y = "Cross-Validation Error"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# ANOVA: compare nested models for degree = 1~5
fit_list <- lapply(1:5, function(degree) lm(wage ~ poly(age, degree), data = Wage))
ANOVA_Res <- do.call(anova, fit_list)
print(ANOVA_Res)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, degree)
## Model 2: wage ~ poly(age, degree)
## Model 3: wage ~ poly(age, degree)
## Model 4: wage ~ poly(age, degree)
## Model 5: wage ~ poly(age, degree)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.3) +
stat_smooth(
method = "lm",
formula = y ~ poly(x, optimal_degree),
se = FALSE,
color = "blue",
size = 1
) +
labs(
title = paste("Polynomial Regression of Wage on Age (Best Degree =", optimal_degree, ")"),
x = "Age",
y = "Wage (USD)"
) +
theme_minimal()
Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
set.seed(1)
CrossVali_Errors <- numeric(9) # K
for (K in 2:10) {
brks <- seq(min(Wage$age), max(Wage$age), length.out = K + 1)
Wage$ageGrp <- cut(Wage$age,
breaks = brks,
include.lowest = TRUE,
right = TRUE)
Wage$ageGrp <- factor(Wage$ageGrp, levels = levels(Wage$ageGrp))
fit_step <- glm(wage ~ ageGrp, data = Wage)
CrossVali_Errors[K - 1] <- cv.glm(Wage, fit_step)$delta[1]
cat(sprintf("Cuts = %2d CV error = %7.2f\n", K, CrossVali_Errors[K - 1]))
}
## Cuts = 2 CV error = 1734.06
## Cuts = 3 CV error = 1682.76
## Cuts = 4 CV error = 1635.89
## Cuts = 5 CV error = 1631.45
## Cuts = 6 CV error = 1623.29
## Cuts = 7 CV error = 1611.60
## Cuts = 8 CV error = 1601.01
## Cuts = 9 CV error = 1610.21
## Cuts = 10 CV error = 1604.80
cuts_vec <- 2:10
bestK <- cuts_vec[ which.min(CrossVali_Errors) ]
cat("\n>> Optimal cuts by CV:", bestK, "\n")
##
## >> Optimal cuts by CV: 8
## Final Optimal MOdel with best K
best_brks <- seq(min(Wage$age), max(Wage$age), length.out = bestK + 1)
Wage$ageGrp <- cut(Wage$age,
breaks = best_brks,
include.lowest = TRUE,
right = TRUE)
Wage$ageGrp <- factor(Wage$ageGrp, levels = levels(Wage$ageGrp))
final_fit <- lm(wage ~ ageGrp, data = Wage)
#Visualize Result
Wage$wage_hat <- predict(final_fit)
ord <- order(Wage$age)
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.3, color = "gray50") +
geom_step(
aes(x = age[ord], y = wage_hat[ord]),
color = "blue", size = 1
) +
labs(
title = paste("Step-Function Fit with", bestK, "Cuts"),
x = "Age", y = "Wage"
) +
theme_minimal()
This question relates to the College data set.
Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
library(ISLR2)
library(leaps) # regsubsets()
head(College)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
set.seed(1)
n <- nrow(College)
train_idx <- sample(seq_len(n), size = 0.7 * n)
College.train <- College[train_idx, ] #Train 70%
College.test <- College[-train_idx, ] #Test 30%
# Forward Stepwise Selection
fwd.fit <- regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")
# Summary
fwd.sum <- summary(fwd.fit)
fwd.sum
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College.train, nvmax = 17,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## 9 ( 1 ) "*" " " " " " " "*" " " " "
## 10 ( 1 ) "*" " " " " " " "*" " " " "
## 11 ( 1 ) "*" " " "*" " " "*" " " " "
## 12 ( 1 ) "*" "*" "*" " " "*" " " " "
## 13 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 14 ( 1 ) "*" "*" "*" " " "*" "*" "*"
## 15 ( 1 ) "*" "*" "*" " " "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" " " "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" "*" " "
## 9 ( 1 ) " " "*" " " "*" "*" "*" " "
## 10 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 11 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 12 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 13 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 14 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 15 ( 1 ) "*" "*" " " "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
plot(fwd.sum$bic, type = "b", xlab = "Number of Predictors", ylab = "BIC")
# Best Size
best_size <- which.min(fwd.sum$bic)
cat("Lowest‐BIC model size:", best_size, "\n")
## Lowest‐BIC model size: 13
# Coeff of a model
best_coefs <- coef(fwd.fit, best_size)
print(best_coefs)
## (Intercept) PrivateYes Apps Accept Top10perc
## -1739.5725417 2276.7996721 -0.3358567 0.7814587 28.9687655
## F.Undergrad Room.Board Personal PhD Terminal
## -0.1559550 0.9134134 -0.3484815 11.8113175 24.9233138
## S.F.Ratio perc.alumni Expend Grad.Rate
## -55.0649149 48.6046652 0.1744677 20.9498491
Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings. Answer: Apps and Expend have significant non-linear effects on out-of-state tuition, while Top10perc and PhD do not. Private is a strong predictor.
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
best_size <- which.min(fwd.sum$bic) # check the best size
selected_vars <- names(coef(fwd.fit, best_size))[-1]
print(selected_vars)
## [1] "PrivateYes" "Apps" "Accept" "Top10perc" "F.Undergrad"
## [6] "Room.Board" "Personal" "PhD" "Terminal" "S.F.Ratio"
## [11] "perc.alumni" "Expend" "Grad.Rate"
# GAM formula
gam_formula <- as.formula("Outstate ~ Private + s(Apps) + s(Top10perc) + s(PhD) + s(Expend)")
# fit GAM model
gam.fit <- gam(gam_formula, data = College.train)
# model Summary
summary(gam.fit)
##
## Call: gam(formula = gam_formula, data = College.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7445.4 -1101.0 237.1 1382.3 7794.1
##
## (Dispersion Parameter for gaussian family taken to be 4440828)
##
## Null Deviance: 9260683704 on 542 degrees of freedom
## Residual Deviance: 2331432484 on 524.9995 degrees of freedom
## AIC: 9872.011
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2550500698 2550500698 574.330 < 2.2e-16 ***
## s(Apps) 1 771472280 771472280 173.723 < 2.2e-16 ***
## s(Top10perc) 1 1068974759 1068974759 240.715 < 2.2e-16 ***
## s(PhD) 1 182409160 182409160 41.075 3.263e-10 ***
## s(Expend) 1 814435659 814435659 183.397 < 2.2e-16 ***
## Residuals 525 2331432484 4440828
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Apps) 3 9.311 5.243e-06 ***
## s(Top10perc) 3 0.622 0.6013
## s(PhD) 3 0.942 0.4199
## s(Expend) 3 43.132 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot the result
par(mfrow = c(2, 2))
plot(gam.fit, se = TRUE, col = "blue")
Evaluate the model obtained on the test set, and explain the results obtained. Answer: this valuse(= 3,918,303) shows that the GAM makes accurate predictions.
# Predict with Test set
gam.pred <- predict(gam.fit, newdata = College.test)
# Calc MSE
mse.test <- mean((College.test$Outstate - gam.pred)^2)
cat("Test MSE:", round(mse.test, 2), "\n")
## Test MSE: 3918303
For which variables, if any, is there evidence of a non-linear relationship with the response?
Answer: non-linear relationship between the response (Outstate) and the variables Apps and Expend => their smooth terms had highly significant p-values (p < 0.001).