Problem 6.
a.
# Load necessary libraries
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
library(boot)
library(ggplot2)
# Set seed for reproducibility
set.seed(123)
# Attach the Wage dataset
data(Wage)
attach(Wage)
Wage <- na.omit(Wage) # Remove missing values to prevent errors
# (a) Perform polynomial regression with cross-validation
# Initialize variables
max_degree <- 10
cv_errors <- numeric(max_degree)
# Perform k-fold cross-validation (k=10) for each polynomial degree
for (d in 1:max_degree) {
# Fit polynomial regression model
glm_fit <- glm(wage ~ poly(age, d), data = Wage)
# Compute cross-validation error
cv_errors[d] <- cv.glm(Wage, glm_fit, K = 10)$delta[1]
}
# Find the degree with minimum CV error
optimal_degree <- which.min(cv_errors)
min_cv_error <- cv_errors[optimal_degree]
# Print the optimal degree and CV errors
cat("Optimal polynomial degree:", optimal_degree, "\n")
## Optimal polynomial degree: 10
cat("Cross-validation errors for degrees 1 to", max_degree, ":\n")
## Cross-validation errors for degrees 1 to 10 :
print(cv_errors)
## [1] 1676.988 1602.473 1597.036 1594.582 1594.424 1594.157 1596.532 1593.351
## [9] 1593.285 1593.154
# Fit the polynomial model with the optimal degree
fit_optimal <- lm(wage ~ poly(age, optimal_degree), data = Wage)
# Perform ANOVA to compare polynomial models
fit_1 <- lm(wage ~ poly(age, 1), data = Wage)
fit_2 <- lm(wage ~ poly(age, 2), data = Wage)
fit_3 <- lm(wage ~ poly(age, 3), data = Wage)
fit_4 <- lm(wage ~ poly(age, 4), data = Wage)
fit_5 <- lm(wage ~ poly(age, 5), data = Wage)
anova_result <- anova(fit_1, fit_2, fit_3, fit_4, fit_5)
print("ANOVA results:")
## [1] "ANOVA results:"
print(anova_result)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot the polynomial fit
# Create a sequence of age values for prediction
age_grid <- seq(min(age), max(age), length.out = 100)
preds <- predict(fit_optimal, newdata = data.frame(age = age_grid))
# Create a data frame for plotting
plot_data <- data.frame(age = age_grid, wage = preds)
# Plot the data points and the polynomial fit
ggplot() +
geom_point(aes(x = age, y = wage), data = Wage, alpha = 0.5) +
geom_line(aes(x = age, y = wage), data = plot_data, color = "blue", size = 1) +
labs(title = paste("Polynomial Regression (Degree =", optimal_degree, ")"),
x = "Age", y = "Wage") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

b.
# Initialize variables
max_cuts <- 10 # Maximum number of age groups
cv_errors_step <- numeric(max_cuts)
# Perform k-fold cross-validation (k=10) for each step function model
for (cuts in 2:max_cuts) { # Start at 2 cuts to avoid invalid cases
Wage$age_cut <- cut(Wage$age, breaks = cuts) # Create age groups
glm_fit_step <- glm(wage ~ age_cut, data = Wage) # Fit model
cv_errors_step[cuts] <- cv.glm(Wage, glm_fit_step, K = 10)$delta[1] # Compute CV error
}
# Find optimal number of cuts
optimal_cuts <- max(2, which.min(cv_errors_step)) # Ensure at least 2 bins
cat("Optimal number of cuts:", optimal_cuts, "\n")
## Optimal number of cuts: 2
# Create breakpoints to ensure a valid step function
breaks_seq <- seq(min(Wage$age) - 1, max(Wage$age) + 1, length.out = optimal_cuts + 1)
Wage$age_cut <- cut(Wage$age, breaks = breaks_seq)
# Fit the final model with optimal cuts
fit_optimal_step <- lm(wage ~ age_cut, data = Wage)
# Generate predictions for plotting
Wage$step_preds <- predict(fit_optimal_step, newdata = Wage)
# Plot the step function fit with corrected scaling
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.5) +
geom_step(aes(x = age, y = step_preds), color = "red", size = 1.2) +
labs(title = paste("Step Function Regression with", optimal_cuts, "Cuts"),
x = "Age", y = "Wage") +
theme_minimal() +
xlim(min(Wage$age), max(Wage$age)) # Ensure proper scaling

Problem 10.
a.
# Load necessary libraries
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.2
# Set seed for reproducibility
set.seed(123)
# Load and attach the College dataset
data(College)
attach(College)
# (a) Split the data into training and test sets
n <- nrow(College)
train_idx <- sample(1:n, size = floor(0.7 * n)) # 70% for training
train_data <- College[train_idx, ]
test_data <- College[-train_idx, ]
# Perform forward stepwise selection on the training set
# Use regsubsets with forward stepwise method
fwd_model <- regsubsets(Outstate ~ ., data = train_data, method = "forward", nvmax = ncol(train_data) - 1)
# Summarize the forward stepwise selection results
fwd_summary <- summary(fwd_model)
# Identify a satisfactory model based on adjusted R^2, Cp, and BIC
# Extract metrics
adj_r2 <- fwd_summary$adjr2
cp <- fwd_summary$cp
bic <- fwd_summary$bic
# Find the model with maximum adjusted R^2
best_adj_r2_idx <- which.max(adj_r2)
cat("Model with highest adjusted R^2:", best_adj_r2_idx, "variables, Adj R^2 =", adj_r2[best_adj_r2_idx], "\n")
## Model with highest adjusted R^2: 15 variables, Adj R^2 = 0.7562066
# Find the model with minimum Cp (close to number of predictors)
best_cp_idx <- which.min(abs(cp - (1:ncol(train_data))))
## Warning in cp - (1:ncol(train_data)): longer object length is not a multiple of
## shorter object length
cat("Model with best Cp:", best_cp_idx, "variables, Cp =", cp[best_cp_idx], "\n")
## Model with best Cp: 16 variables, Cp = 16.06192
# Find the model with minimum BIC
best_bic_idx <- which.min(bic)
cat("Model with best BIC:", best_bic_idx, "variables, BIC =", bic[best_bic_idx], "\n")
## Model with best BIC: 9 variables, BIC = -698.5627
# Print the predictors for the model with the best BIC
best_model_vars <- names(coef(fwd_model, id = best_bic_idx))[-1] # Exclude intercept
# Replace 'PrivateYes' with 'Private' to match dataset column
best_model_vars <- gsub("PrivateYes", "Private", best_model_vars)
cat("Predictors in the best BIC model (", best_bic_idx, "variables):\n", paste(best_model_vars, collapse = ", "), "\n")
## Predictors in the best BIC model ( 9 variables):
## Private, Accept, Top25perc, F.Undergrad, Room.Board, PhD, perc.alumni, Expend, Grad.Rate
# Fit the selected model on the training set
formula_best <- as.formula(paste("Outstate ~", paste(best_model_vars, collapse = "+")))
best_lm <- lm(formula_best, data = train_data)
# Print the summary of the selected model
cat("\nSummary of the selected model:\n")
##
## Summary of the selected model:
print(summary(best_lm))
##
## Call:
## lm(formula = formula_best, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6498.2 -1335.8 88.8 1302.1 10460.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.675e+03 5.465e+02 -4.894 1.31e-06 ***
## PrivateYes 2.915e+03 3.017e+02 9.660 < 2e-16 ***
## Accept 4.835e-01 9.908e-02 4.880 1.40e-06 ***
## Top25perc 1.595e+01 6.125e+00 2.604 0.00948 **
## F.Undergrad -2.291e-01 5.196e-02 -4.409 1.25e-05 ***
## Room.Board 7.857e-01 1.002e-01 7.844 2.39e-14 ***
## PhD 3.346e+01 7.369e+00 4.541 6.93e-06 ***
## perc.alumni 4.758e+01 9.112e+00 5.221 2.56e-07 ***
## Expend 1.909e-01 2.161e-02 8.834 < 2e-16 ***
## Grad.Rate 1.847e+01 6.719e+00 2.748 0.00619 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2042 on 533 degrees of freedom
## Multiple R-squared: 0.754, Adjusted R-squared: 0.7499
## F-statistic: 181.5 on 9 and 533 DF, p-value: < 2.2e-16
# Evaluate the model on the test set (mean squared error)
test_preds <- predict(best_lm, newdata = test_data)
test_mse <- mean((test_data$Outstate - test_preds)^2)
cat("Test set MSE for the selected model:", test_mse, "\n")
## Test set MSE for the selected model: 3793829
b.
# Load necessary library for GAM
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
# (b) Fit a GAM on the training data using the selected predictors
# Ensure best_model_vars from previous step is available (from forward stepwise selection)
# If not running sequentially, assume best_model_vars is defined (e.g., from BIC model)
# For reproducibility, let's redefine the context if needed
if (!exists("best_model_vars")) {
# Re-run key parts of previous code to ensure continuity
library(ISLR2)
library(leaps)
set.seed(123)
data(College)
n <- nrow(College)
train_idx <- sample(1:n, size = floor(0.7 * n))
train_data <- College[train_idx, ]
test_data <- College[-train_idx, ]
fwd_model <- regsubsets(Outstate ~ ., data = train_data, method = "forward", nvmax = ncol(train_data) - 1)
fwd_summary <- summary(fwd_model)
best_bic_idx <- which.min(fwd_summary$bic)
best_model_vars <- names(coef(fwd_model, id = best_bic_idx))[-1]
best_model_vars <- gsub("PrivateYes", "Private", best_model_vars)
}
# Fit GAM with smoothing splines for numeric predictors and factor for Private
# Identify numeric vs. factor predictors
numeric_vars <- best_model_vars[!best_model_vars %in% "Private"]
factor_vars <- best_model_vars[best_model_vars %in% "Private"]
# Construct the GAM formula: s() for numeric variables, factor() for Private
gam_terms <- c(
if (length(numeric_vars) > 0) paste0("s(", numeric_vars, ")"),
if (length(factor_vars) > 0) paste0("factor(", factor_vars, ")")
)
gam_formula <- as.formula(paste("Outstate ~", paste(gam_terms, collapse = "+")))
# Fit the GAM model
gam_model <- gam(gam_formula, data = train_data, method = "REML")
# Print summary of the GAM model
cat("\nSummary of the GAM model:\n")
##
## Summary of the GAM model:
print(summary(gam_model))
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Outstate ~ s(Accept) + s(Top25perc) + s(F.Undergrad) + s(Room.Board) +
## s(PhD) + s(perc.alumni) + s(Expend) + s(Grad.Rate) + factor(Private)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8660.4 239.1 36.223 < 2e-16 ***
## factor(Private)Yes 2417.5 314.7 7.682 7.91e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Accept) 3.954 4.974 5.637 5.00e-05 ***
## s(Top25perc) 1.000 1.001 0.782 0.377040
## s(F.Undergrad) 7.299 8.284 3.571 0.000264 ***
## s(Room.Board) 1.755 2.222 21.521 < 2e-16 ***
## s(PhD) 1.520 1.895 1.651 0.161962
## s(perc.alumni) 1.000 1.001 18.028 2.62e-05 ***
## s(Expend) 5.676 6.811 27.360 < 2e-16 ***
## s(Grad.Rate) 2.719 3.463 3.936 0.007185 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.802 Deviance explained = 81.1%
## -REML = 4806.3 Scale est. = 3.3006e+06 n = 543
# Plot the GAM results (partial effect plots for each predictor)
par(mfrow = c(2, ceiling(length(best_model_vars)/2))) # Arrange plots in a grid
plot(gam_model, shade = TRUE, se = TRUE, rug = TRUE,
main = "Partial Effects of Predictors on Outstate Tuition",
xlab = NULL, ylab = "Partial Effect")
par(mfrow = c(1, 1)) # Reset plot layout

# Evaluate the GAM model on the test set (mean squared error)
test_preds_gam <- predict(gam_model, newdata = test_data)
test_mse_gam <- mean((test_data$Outstate - test_preds_gam)^2)
cat("Test set MSE for the GAM model:", test_mse_gam, "\n")
## Test set MSE for the GAM model: 3068829
# Compare with the linear model from previous step
cat("Test set MSE for the linear model (from forward stepwise):", test_mse, "\n")
## Test set MSE for the linear model (from forward stepwise): 3793829
c.
# (c) Evaluate the GAM model on the test set
# Predict on the test set using the GAM model
test_preds_gam <- predict(gam_model, newdata = test_data)
# Compute performance metrics
# Mean Squared Error (MSE)
test_mse_gam <- mean((test_data$Outstate - test_preds_gam)^2)
# Root Mean Squared Error (RMSE)
test_rmse_gam <- sqrt(test_mse_gam)
# Mean Absolute Error (MAE)
test_mae_gam <- mean(abs(test_data$Outstate - test_preds_gam))
# R-squared on test set
sst <- sum((test_data$Outstate - mean(test_data$Outstate))^2) # Total sum of squares
sse <- sum((test_data$Outstate - test_preds_gam)^2) # Residual sum of squares
test_r2_gam <- 1 - sse / sst
# Print GAM test set metrics
cat("\nGAM Model Test Set Performance:\n")
##
## GAM Model Test Set Performance:
cat("MSE:", test_mse_gam, "\n")
## MSE: 3068829
cat("RMSE:", test_rmse_gam, "\n")
## RMSE: 1751.807
cat("MAE:", test_mae_gam, "\n")
## MAE: 1313.2
cat("R-squared:", test_r2_gam, "\n")
## R-squared: 0.7960297
# Compare with the linear model from forward stepwise selection
# Recompute linear model test metrics for consistency
test_preds_lm <- predict(best_lm, newdata = test_data)
test_mse_lm <- mean((test_data$Outstate - test_preds_lm)^2)
test_rmse_lm <- sqrt(test_mse_lm)
test_mae_lm <- mean(abs(test_data$Outstate - test_preds_lm))
sst_lm <- sum((test_data$Outstate - mean(test_data$Outstate))^2)
sse_lm <- sum((test_data$Outstate - test_preds_lm)^2)
test_r2_lm <- 1 - sse_lm / sst_lm
# Print linear model test set metrics
cat("\nLinear Model (Forward Stepwise) Test Set Performance:\n")
##
## Linear Model (Forward Stepwise) Test Set Performance:
cat("MSE:", test_mse_lm, "\n")
## MSE: 3793829
cat("RMSE:", test_rmse_lm, "\n")
## RMSE: 1947.775
cat("MAE:", test_mae_lm, "\n")
## MAE: 1474.372
cat("R-squared:", test_r2_lm, "\n")
## R-squared: 0.7478424
# Compare the models
cat("\nComparison:\n")
##
## Comparison:
cat("GAM MSE is", ifelse(test_mse_gam < test_mse_lm, "lower", "higher"), "than Linear MSE\n")
## GAM MSE is lower than Linear MSE
cat("GAM R-squared is", ifelse(test_r2_gam > test_r2_lm, "higher", "lower"), "than Linear R-squared\n")
## GAM R-squared is higher than Linear R-squared
d.
# (d) Analyze non-linear relationships in the GAM model
# Print the GAM model summary to inspect smooth terms
cat("\nGAM Model Summary for Non-Linearity Analysis:\n")
##
## GAM Model Summary for Non-Linearity Analysis:
print(summary(gam_model))
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Outstate ~ s(Accept) + s(Top25perc) + s(F.Undergrad) + s(Room.Board) +
## s(PhD) + s(perc.alumni) + s(Expend) + s(Grad.Rate) + factor(Private)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8660.4 239.1 36.223 < 2e-16 ***
## factor(Private)Yes 2417.5 314.7 7.682 7.91e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Accept) 3.954 4.974 5.637 5.00e-05 ***
## s(Top25perc) 1.000 1.001 0.782 0.377040
## s(F.Undergrad) 7.299 8.284 3.571 0.000264 ***
## s(Room.Board) 1.755 2.222 21.521 < 2e-16 ***
## s(PhD) 1.520 1.895 1.651 0.161962
## s(perc.alumni) 1.000 1.001 18.028 2.62e-05 ***
## s(Expend) 5.676 6.811 27.360 < 2e-16 ***
## s(Grad.Rate) 2.719 3.463 3.936 0.007185 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.802 Deviance explained = 81.1%
## -REML = 4806.3 Scale est. = 3.3006e+06 n = 543
# Extract smooth terms from the GAM summary
gam_summary <- summary(gam_model)
smooth_terms <- gam_summary$s.table
# Analyze numeric predictors for non-linearity
cat("\nEvidence of Non-Linearity for Numeric Predictors:\n")
##
## Evidence of Non-Linearity for Numeric Predictors:
for (i in 1:nrow(smooth_terms)) {
predictor <- rownames(smooth_terms)[i]
edf <- smooth_terms[i, "edf"]
p_value <- smooth_terms[i, "p-value"]
cat("Predictor:", predictor, "\n")
cat(" Effective Degrees of Freedom (edf):", edf, "\n")
cat(" P-value:", p_value, "\n")
if (edf > 1.5 && p_value < 0.05) {
cat(" --> Strong evidence of non-linear relationship\n")
} else if (edf > 1.2 && p_value < 0.05) {
cat(" --> Moderate evidence of non-linear relationship\n")
} else if (edf <= 1.2) {
cat(" --> Relationship is nearly linear\n")
} else {
cat(" --> No significant non-linear relationship (check p-value)\n")
}
cat("\n")
}
## Predictor: s(Accept)
## Effective Degrees of Freedom (edf): 3.954348
## P-value: 4.999039e-05
## --> Strong evidence of non-linear relationship
##
## Predictor: s(Top25perc)
## Effective Degrees of Freedom (edf): 1.000362
## P-value: 0.3770405
## --> Relationship is nearly linear
##
## Predictor: s(F.Undergrad)
## Effective Degrees of Freedom (edf): 7.299149
## P-value: 0.0002638275
## --> Strong evidence of non-linear relationship
##
## Predictor: s(Room.Board)
## Effective Degrees of Freedom (edf): 1.755398
## P-value: 0
## --> Strong evidence of non-linear relationship
##
## Predictor: s(PhD)
## Effective Degrees of Freedom (edf): 1.520082
## P-value: 0.161962
## --> No significant non-linear relationship (check p-value)
##
## Predictor: s(perc.alumni)
## Effective Degrees of Freedom (edf): 1.000303
## P-value: 2.621062e-05
## --> Relationship is nearly linear
##
## Predictor: s(Expend)
## Effective Degrees of Freedom (edf): 5.676215
## P-value: 0
## --> Strong evidence of non-linear relationship
##
## Predictor: s(Grad.Rate)
## Effective Degrees of Freedom (edf): 2.719137
## P-value: 0.007184555
## --> Strong evidence of non-linear relationship
# Plot partial effects to visually inspect non-linearity
par(mfrow = c(2, ceiling(length(best_model_vars)/2))) # Arrange plots in a grid
plot(gam_model, shade = TRUE, se = TRUE, rug = TRUE,
main = "Partial Effects of Predictors on Outstate Tuition",
xlab = NULL, ylab = "Partial Effect")
par(mfrow = c(1, 1)) # Reset plot layout
