library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(boot)
## Warning: package 'boot' was built under R version 4.4.3
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.2
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(gratia)
## Warning: package 'gratia' was built under R version 4.4.3
attach(Wage)
attach(College)
### In this exercise, you will further analyze the Wage data set considered throughout this chapter.
(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
set.seed(123)
# 10-fold CV error for polynomial degrees 1 through 10
cv_results <- lapply(1:10, function(d) {
  glm_fit <- glm(wage ~ poly(age, d), data = Wage)
  cv_error <- cv.glm(Wage, glm_fit, K = 10)$delta[1]
  data.frame(degree = d, cv_error = cv_error)
})
cv_df <- bind_rows(cv_results)
best_degree <- cv_df |>
  arrange(cv_error) |>
  slice(1)
print(best_degree)
## degree cv_error
## 1 10 1593.154
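Before turning to the ANOVA comparison, a quick sketch (using the cv_df data frame built above) plots the CV error against the polynomial degree:
# Plot the 10-fold CV error for each degree; the dashed line marks the CV-selected degree
ggplot(cv_df, aes(x = degree, y = cv_error)) +
  geom_line(color = "steelblue") +
  geom_point() +
  geom_vline(xintercept = best_degree$degree, linetype = "dashed") +
  scale_x_continuous(breaks = 1:10) +
  labs(title = "10-fold CV Error by Polynomial Degree",
       x = "Degree", y = "CV error") +
  theme_minimal()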
# Nested polynomial fits for hypothesis testing with ANOVA F-tests
fit_1 <- lm(wage ~ poly(age, 1), data = Wage)
fit_2 <- lm(wage ~ poly(age, 2), data = Wage)
fit_3 <- lm(wage ~ poly(age, 3), data = Wage)
fit_4 <- lm(wage ~ poly(age, 4), data = Wage)
fit_5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit_1, fit_2, fit_3, fit_4, fit_5)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Cross-validation selected degree 10 (the lowest CV error). The ANOVA comparisons tell a more parsimonious story: the quadratic and cubic terms are clearly significant, the quartic term is only marginal (p ≈ 0.05), and the fifth-degree term is not, so a degree-3 or degree-4 polynomial would already be adequate. The plot below uses the CV-selected degree.
# Fit the CV-selected polynomial and plot it over a fine grid of ages
best_model <- lm(wage ~ poly(age, best_degree$degree), data = Wage)
age_grid <- data.frame(age = seq(min(Wage$age), max(Wage$age), length.out = 100))
wage_preds <- predict(best_model, newdata = age_grid)
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.5, color = "gray") +
  geom_line(data = data.frame(age = age_grid$age, wage = wage_preds),
            aes(x = age, y = wage), color = "blue", linewidth = 1.2) +
  labs(title = paste("Polynomial Degree", best_degree$degree),
       x = "Age", y = "Wage") +
  theme_minimal()
(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
set.seed(123)
# 10-fold CV error for step functions with 2 to 10 equal-width cuts of age
cv_results <- lapply(2:10, function(k) {
  Wage <- Wage |>
    mutate(age_cut = cut(age, breaks = k))
  glm_fit <- glm(wage ~ age_cut, data = Wage)
  cv_error <- boot::cv.glm(Wage, glm_fit, K = 10)$delta[1]
  data.frame(cuts = k, cv_error = cv_error)
})
cv_df <- bind_rows(cv_results)
best_cut <- cv_df |> arrange(cv_error) |> slice(1)
print(best_cut)
## cuts cv_error
## 1 8 1603.495
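As with the polynomial fit, a quick sketch (using the cv_df and best_cut objects just computed) shows how the chosen number of cuts compares with its neighbours:
# Plot the 10-fold CV error for each number of cuts; the dashed line marks the minimum
ggplot(cv_df, aes(x = cuts, y = cv_error)) +
  geom_line(color = "steelblue") +
  geom_point() +
  geom_vline(xintercept = best_cut$cuts, linetype = "dashed") +
  scale_x_continuous(breaks = 2:10) +
  labs(title = "10-fold CV Error by Number of Cuts",
       x = "Number of cuts", y = "CV error") +
  theme_minimal()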
best_k <- best_cut$cuts
# Refit the step function with the CV-selected number of cuts
Wage <- Wage |>
  mutate(age_cut = cut(age, breaks = best_k))
final_step_fit <- glm(wage ~ age_cut, data = Wage)
# Prediction grid: it spans the same age range, so cut() with the same number of
# equal-width breaks reproduces the factor levels used in the fit
age_grid <- data.frame(age = seq(min(Wage$age), max(Wage$age), length.out = 100))
age_grid$age_cut <- cut(age_grid$age, breaks = best_k)
age_grid$wage_pred <- predict(final_step_fit, newdata = age_grid)
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.4, color = "gray") +
  geom_line(data = age_grid, aes(x = age, y = wage_pred), color = "darkred", linewidth = 1.2) +
  labs(
    title = paste("Step Function Fit: Wage ~ Age with", best_k, "cuts"),
    x = "Age", y = "Wage"
  ) +
  theme_minimal()
### This question relates to the College data set.
(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
set.seed(123)
# 80/20 train-test split of the College data
sample_indices <- sample(1:nrow(College), size = floor(0.8 * nrow(College)))
college_train <- College[sample_indices, ]
college_test <- College[-sample_indices, ]
# Forward stepwise selection with out-of-state tuition as the response
forward_fit <- regsubsets(
  Outstate ~ ., data = college_train, nvmax = ncol(college_train) - 1, method = "forward")
fit_summary <- summary(forward_fit)
fit_summary$adjr2
## [1] 0.4391229 0.5941741 0.6690075 0.7052218 0.7250374 0.7346054 0.7369384
## [8] 0.7399431 0.7455402 0.7477156 0.7511284 0.7519828 0.7524343 0.7526841
## [15] 0.7523417 0.7519748 0.7515665
fit_summary$bic
## [1] -347.2352 -542.7495 -663.9014 -730.4342 -768.2260 -784.7995 -784.8635
## [8] -786.5799 -794.6757 -794.5933 -797.6387 -794.3635 -790.0859 -785.3054
## [15] -779.0406 -772.7171 -766.2935
which.max(fit_summary$adjr2)
## [1] 14
which.min(fit_summary$bic)
## [1] 11
best_size <- which.min(fit_summary$bic)
best_vars <- coef(forward_fit, id = best_size)
print(best_vars)
## (Intercept) PrivateYes Apps Accept Enroll
## -2002.7518586 2604.8342824 -0.2518536 0.8483927 -1.1478290
## Top10perc Room.Board Personal PhD perc.alumni
## 25.2899335 0.8327798 -0.2759877 27.9273632 45.7642632
## Expend Grad.Rate
## 0.2112021 21.8924587
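Since regsubsets() does not provide a predict() method, one way to work further with the BIC-selected specification is to refit it with lm(); the sketch below (the object name lm_forward is introduced here for illustration) uses the eleven predictors listed in the coefficients above:
# Refit the BIC-selected model on the training set
lm_forward <- lm(
  Outstate ~ Private + Apps + Accept + Enroll + Top10perc + Room.Board +
    Personal + PhD + perc.alumni + Expend + Grad.Rate,
  data = college_train
)
summary(lm_forward)$adj.r.squared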
(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
# GAM on the training data: Private enters as a parametric term, while Room.Board,
# Grad.Rate, and PhD (a subset of the forward-selected predictors) get smooth terms
gam_fit <- gam(
  Outstate ~ Private + s(Room.Board) + s(Grad.Rate) + s(PhD), data = college_train
)
summary(gam_fit)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Outstate ~ Private + s(Room.Board) + s(Grad.Rate) + s(PhD)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7721.3 197.0 39.19 <2e-16 ***
## PrivateYes 3678.0 243.1 15.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Room.Board) 6.227 7.436 15.58 <2e-16 ***
## s(Grad.Rate) 4.998 6.148 10.12 <2e-16 ***
## s(PhD) 4.288 5.293 31.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.696 Deviance explained = 70.4%
## GCV = 5.0536e+06 Scale est. = 4.911e+06 n = 621
# Partial-effect plots of the smooth terms, with standard error bands
plot(gam_fit, se = TRUE, col = "blue")
# Equivalent ggplot-based plots from gratia
draw(gam_fit)
The GAM reveals non-linear relationships that a standard linear model would miss: the smooth terms for Room.Board, Grad.Rate, and PhD all have estimated degrees of freedom well above 1 and are highly significant, so out-of-state tuition does not vary with these predictors in a simple straight-line fashion. The parametric Private term shows that, holding the smooths fixed, private colleges charge roughly $3,700 more on average, regardless of the other features.
(c) Evaluate the model obtained on the test set, and explain the results obtained.
gam_preds <- predict(gam_fit, newdata = college_test)
actual <- college_test$Outstate
# Test-set mean squared error and R^2 (R^2 = 1 - SS_resid / SS_total)
mse_test <- mean((actual - gam_preds)^2)
ss_total <- sum((actual - mean(actual))^2)
ss_resid <- sum((actual - gam_preds)^2)
r2_test <- 1 - ss_resid / ss_total
mse_test
## [1] 4796623
r2_test
## [1] 0.7035854
On the training data the adjusted R^2 is about 0.70, so the model explains roughly 70% of the variance in out-of-state tuition. On the test set the R^2 is also about 0.70, and the test MSE of roughly 4.8 million (an RMSE of about $2,200) is close to the training scale estimate of about 4.9 million, so the model generalizes well to new data rather than overfitting.
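For context, a rough baseline (a sketch; lm_baseline is a name introduced here) is the test MSE of a purely linear model with the same four predictors, which can be compared against the GAM's test MSE above:
# Linear model with the same predictors, evaluated on the same test set
lm_baseline <- lm(Outstate ~ Private + Room.Board + Grad.Rate + PhD,
                  data = college_train)
lm_preds <- predict(lm_baseline, newdata = college_test)
mean((college_test$Outstate - lm_preds)^2)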
(d) For which variables, if any, is there evidence of a non-linear relationship with the response?
There is strong evidence of a non-linear relationship between out-of-state tuition and all three variables given smooth terms: Room.Board, Grad.Rate, and PhD each have estimated degrees of freedom well above 1 (roughly 6.2, 5.0, and 4.3) with very small p-values. Reading the smooth plots, tuition rises slowly and then sharply with Room.Board, increases with Grad.Rate before flattening at high graduation rates, and is roughly U-shaped in PhD, with colleges at both very low and very high percentages of PhD faculty tending to charge more.
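One way to back up this reading with a formal comparison (a sketch; lin_gam is a name introduced here) is to fit the same model with the three terms entered linearly and compare information criteria; a noticeably lower AIC for the smooth model would support the non-linearity seen in the edf values and plots:
# Same predictors without smooths, compared to the fitted GAM by AIC
lin_gam <- gam(Outstate ~ Private + Room.Board + Grad.Rate + PhD,
               data = college_train)
AIC(lin_gam, gam_fit)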