Problem 6.

a.

# Load necessary libraries
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
library(boot)
library(ggplot2)

# Set seed for reproducibility
set.seed(123)

# Load the Wage dataset (attach() is avoided; columns are referenced
# explicitly below to prevent masking problems after Wage is modified)
data(Wage)

Wage <- na.omit(Wage)  # Remove missing values to prevent errors

# (a) Perform polynomial regression with cross-validation
# Initialize variables
max_degree <- 10
cv_errors <- numeric(max_degree)

# Perform k-fold cross-validation (k=10) for each polynomial degree
for (d in 1:max_degree) {
  # Fit polynomial regression model
  glm_fit <- glm(wage ~ poly(age, d), data = Wage)
  # Compute cross-validation error
  cv_errors[d] <- cv.glm(Wage, glm_fit, K = 10)$delta[1]
}

# Find the degree with minimum CV error
optimal_degree <- which.min(cv_errors)
min_cv_error <- cv_errors[optimal_degree]

# Print the optimal degree and CV errors
cat("Optimal polynomial degree:", optimal_degree, "\n")
## Optimal polynomial degree: 10
cat("Cross-validation errors for degrees 1 to", max_degree, ":\n")
## Cross-validation errors for degrees 1 to 10 :
print(cv_errors)
##  [1] 1676.988 1602.473 1597.036 1594.582 1594.424 1594.157 1596.532 1593.351
##  [9] 1593.285 1593.154
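
The CV curve is nearly flat beyond degree 3 or 4, so the nominal winner (degree 10) barely improves on much simpler fits. A common follow-up is the one-standard-error rule: choose the smallest degree whose CV error is within one standard error of the minimum. Since cv.glm() does not return per-fold errors, the sketch below estimates the SE by repeating 10-fold CV a few times per degree (n_reps is an illustrative choice, not part of the original analysis).

# One-standard-error rule (supplementary sketch)
set.seed(123)
n_reps <- 5  # repeated 10-fold CV runs per degree (illustrative)
cv_mat <- sapply(1:max_degree, function(d) {
  fit <- glm(wage ~ poly(age, d), data = Wage)
  replicate(n_reps, cv.glm(Wage, fit, K = 10)$delta[1])
})
cv_mean <- colMeans(cv_mat)
cv_se <- apply(cv_mat, 2, sd) / sqrt(n_reps)
# Smallest degree whose mean CV error is within one SE of the best mean
one_se_degree <- min(which(cv_mean <= min(cv_mean) + cv_se[which.min(cv_mean)]))
one_se_degree
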
# Fit the polynomial model with the optimal degree
fit_optimal <- lm(wage ~ poly(age, optimal_degree), data = Wage)

# Perform ANOVA to compare polynomial models
fit_1 <- lm(wage ~ poly(age, 1), data = Wage)
fit_2 <- lm(wage ~ poly(age, 2), data = Wage)
fit_3 <- lm(wage ~ poly(age, 3), data = Wage)
fit_4 <- lm(wage ~ poly(age, 4), data = Wage)
fit_5 <- lm(wage ~ poly(age, 5), data = Wage)

anova_result <- anova(fit_1, fit_2, fit_3, fit_4, fit_5)
print("ANOVA results:")
## [1] "ANOVA results:"
print(anova_result)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
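
Because poly() builds orthogonal polynomials, the ANOVA above is equivalent to the t-tests in the largest model: each F-statistic is the square of the corresponding t-statistic. The same conclusion (a cubic or quartic fit suffices; degree 5 adds nothing) can therefore be read directly from the degree-5 coefficients:

# t-statistics from the degree-5 fit; their squares equal the F-statistics above
coef(summary(fit_5))
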
# Plot the polynomial fit
# Create a sequence of age values for prediction
age_grid <- seq(min(Wage$age), max(Wage$age), length.out = 100)
preds <- predict(fit_optimal, newdata = data.frame(age = age_grid))

# Create a data frame for plotting
plot_data <- data.frame(age = age_grid, wage = preds)

# Plot the data points and the polynomial fit
ggplot() +
  geom_point(aes(x = age, y = wage), data = Wage, alpha = 0.5) +
  geom_line(aes(x = age, y = wage), data = plot_data, color = "blue", linewidth = 1) +
  labs(title = paste("Polynomial Regression (Degree =", optimal_degree, ")"),
       x = "Age", y = "Wage") +
  theme_minimal()
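
As an optional extension (a sketch, not part of the required answer), approximate 95% confidence bands can be added using the standard errors from predict(..., se.fit = TRUE):

# Approximate 95% confidence bands around the polynomial fit
pred_se <- predict(fit_optimal, newdata = data.frame(age = age_grid), se.fit = TRUE)
band <- data.frame(age = age_grid,
                   lo = pred_se$fit - 2 * pred_se$se.fit,
                   hi = pred_se$fit + 2 * pred_se$se.fit)
ggplot() +
  geom_point(aes(x = age, y = wage), data = Wage, alpha = 0.5) +
  geom_ribbon(aes(x = age, ymin = lo, ymax = hi), data = band, alpha = 0.2) +
  geom_line(aes(x = age, y = wage), data = plot_data, color = "blue", linewidth = 1) +
  labs(x = "Age", y = "Wage") +
  theme_minimal()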

b.

# Initialize variables
max_cuts <- 10  # Maximum number of age groups
cv_errors_step <- rep(NA_real_, max_cuts)  # NA entries so unused slots are ignored

# Perform k-fold cross-validation (k=10) for each step function model
for (cuts in 2:max_cuts) {  # Start at 2 cuts to avoid invalid cases
  Wage$age_cut <- cut(Wage$age, breaks = cuts)  # Create age groups
  glm_fit_step <- glm(wage ~ age_cut, data = Wage)  # Fit model
  cv_errors_step[cuts] <- cv.glm(Wage, glm_fit_step, K = 10)$delta[1]  # Compute CV error
}

# Find optimal number of cuts (which.min() skips the NA in the unused slot)
optimal_cuts <- which.min(cv_errors_step)
cat("Optimal number of cuts:", optimal_cuts, "\n")
## Optimal number of cuts: 2
# Recreate the age bins exactly as in the CV loop so the final fit is the
# model that cross-validation selected
Wage$age_cut <- cut(Wage$age, breaks = optimal_cuts)

# Fit the final model with optimal cuts
fit_optimal_step <- lm(wage ~ age_cut, data = Wage)

# Generate predictions for plotting
Wage$step_preds <- predict(fit_optimal_step, newdata = Wage)

# Plot the step function fit with corrected scaling
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.5) +
  geom_step(aes(x = age, y = step_preds), data = Wage[order(Wage$age), ],
            color = "red", linewidth = 1.2) +
  labs(title = paste("Step Function Regression with", optimal_cuts, "Cuts"),
       x = "Age", y = "Wage") +
  theme_minimal() +
  xlim(min(Wage$age), max(Wage$age))  # Ensure proper scaling
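
Before settling on the selected model, it is worth checking how flat the CV curve is across bin counts (a quick supplementary plot):

# Visualize the 10-fold CV error as a function of the number of bins
plot(2:max_cuts, cv_errors_step[2:max_cuts], type = "b",
     xlab = "Number of cuts", ylab = "10-fold CV error",
     main = "Step Function: CV Error by Number of Cuts")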

Problem 10.

a.

# Load necessary libraries
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.2
# Set seed for reproducibility
set.seed(123)

# Load the College dataset (again avoiding attach(); all later code uses
# data = ... or $ to reference columns)
data(College)

# (a) Split the data into training and test sets
n <- nrow(College)
train_idx <- sample(1:n, size = floor(0.7 * n))  # 70% for training
train_data <- College[train_idx, ]
test_data <- College[-train_idx, ]

# Perform forward stepwise selection on the training set
# Use regsubsets with forward stepwise method
fwd_model <- regsubsets(Outstate ~ ., data = train_data, method = "forward", nvmax = ncol(train_data) - 1)

# Summarize the forward stepwise selection results
fwd_summary <- summary(fwd_model)

# Identify a satisfactory model based on adjusted R^2, Cp, and BIC
# Extract metrics
adj_r2 <- fwd_summary$adjr2
cp <- fwd_summary$cp
bic <- fwd_summary$bic

# Find the model with maximum adjusted R^2
best_adj_r2_idx <- which.max(adj_r2)
cat("Model with highest adjusted R^2:", best_adj_r2_idx, "variables, Adj R^2 =", adj_r2[best_adj_r2_idx], "\n")
## Model with highest adjusted R^2: 15 variables, Adj R^2 = 0.7562066
# Find the model whose Cp is closest to its size (rule of thumb: Cp ~ p);
# comparing each Cp to its own model size avoids a recycling error
best_cp_idx <- which.min(abs(cp - seq_along(cp)))
cat("Model with best Cp:", best_cp_idx, "variables, Cp =", cp[best_cp_idx], "\n")
## Model with best Cp: 16 variables, Cp = 16.06192
# Find the model with minimum BIC
best_bic_idx <- which.min(bic)
cat("Model with best BIC:", best_bic_idx, "variables, BIC =", bic[best_bic_idx], "\n")
## Model with best BIC: 9 variables, BIC = -698.5627
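
A quick way to see how the three criteria trade off fit against model size is to plot each across the candidate models, marking the chosen optimum (a supplementary sketch):

# Plot each selection criterion across model sizes
par(mfrow = c(1, 3))
plot(adj_r2, type = "b", xlab = "Number of variables", ylab = "Adjusted R^2")
points(best_adj_r2_idx, adj_r2[best_adj_r2_idx], col = "red", pch = 19)
plot(cp, type = "b", xlab = "Number of variables", ylab = "Cp")
points(best_cp_idx, cp[best_cp_idx], col = "red", pch = 19)
plot(bic, type = "b", xlab = "Number of variables", ylab = "BIC")
points(best_bic_idx, bic[best_bic_idx], col = "red", pch = 19)
par(mfrow = c(1, 1))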
# Print the predictors for the model with the best BIC
best_model_vars <- names(coef(fwd_model, id = best_bic_idx))[-1]  # Exclude intercept
# Replace 'PrivateYes' with 'Private' to match dataset column
best_model_vars <- gsub("PrivateYes", "Private", best_model_vars)
cat("Predictors in the best BIC model (", best_bic_idx, "variables):\n", paste(best_model_vars, collapse = ", "), "\n")
## Predictors in the best BIC model ( 9 variables):
##  Private, Accept, Top25perc, F.Undergrad, Room.Board, PhD, perc.alumni, Expend, Grad.Rate
# Fit the selected model on the training set
formula_best <- as.formula(paste("Outstate ~", paste(best_model_vars, collapse = "+")))
best_lm <- lm(formula_best, data = train_data)

# Print the summary of the selected model
cat("\nSummary of the selected model:\n")
## 
## Summary of the selected model:
print(summary(best_lm))
## 
## Call:
## lm(formula = formula_best, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6498.2 -1335.8    88.8  1302.1 10460.2 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.675e+03  5.465e+02  -4.894 1.31e-06 ***
## PrivateYes   2.915e+03  3.017e+02   9.660  < 2e-16 ***
## Accept       4.835e-01  9.908e-02   4.880 1.40e-06 ***
## Top25perc    1.595e+01  6.125e+00   2.604  0.00948 ** 
## F.Undergrad -2.291e-01  5.196e-02  -4.409 1.25e-05 ***
## Room.Board   7.857e-01  1.002e-01   7.844 2.39e-14 ***
## PhD          3.346e+01  7.369e+00   4.541 6.93e-06 ***
## perc.alumni  4.758e+01  9.112e+00   5.221 2.56e-07 ***
## Expend       1.909e-01  2.161e-02   8.834  < 2e-16 ***
## Grad.Rate    1.847e+01  6.719e+00   2.748  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2042 on 533 degrees of freedom
## Multiple R-squared:  0.754,  Adjusted R-squared:  0.7499 
## F-statistic: 181.5 on 9 and 533 DF,  p-value: < 2.2e-16
# Evaluate the model on the test set (mean squared error)
test_preds <- predict(best_lm, newdata = test_data)
test_mse <- mean((test_data$Outstate - test_preds)^2)
cat("Test set MSE for the selected model:", test_mse, "\n")
## Test set MSE for the selected model: 3793829

b.

# Load necessary library for GAM
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
# (b) Fit a GAM on the training data using the selected predictors
# The objects from part (a) (best_model_vars, train_data, test_data) are
# assumed to be in scope; if this chunk is run standalone, recreate them:
if (!exists("best_model_vars")) {
  # Re-run key parts of previous code to ensure continuity
  library(ISLR2)
  library(leaps)
  set.seed(123)
  data(College)
  n <- nrow(College)
  train_idx <- sample(1:n, size = floor(0.7 * n))
  train_data <- College[train_idx, ]
  test_data <- College[-train_idx, ]
  fwd_model <- regsubsets(Outstate ~ ., data = train_data, method = "forward", nvmax = ncol(train_data) - 1)
  fwd_summary <- summary(fwd_model)
  best_bic_idx <- which.min(fwd_summary$bic)
  best_model_vars <- names(coef(fwd_model, id = best_bic_idx))[-1]
  best_model_vars <- gsub("PrivateYes", "Private", best_model_vars)
}

# Fit GAM with smoothing splines for numeric predictors and factor for Private
# Identify numeric vs. factor predictors
numeric_vars <- best_model_vars[!best_model_vars %in% "Private"]
factor_vars <- best_model_vars[best_model_vars %in% "Private"]

# Construct the GAM formula: s() for numeric variables, factor() for Private
gam_terms <- c(
  if (length(numeric_vars) > 0) paste0("s(", numeric_vars, ")"),
  if (length(factor_vars) > 0) paste0("factor(", factor_vars, ")")
)
gam_formula <- as.formula(paste("Outstate ~", paste(gam_terms, collapse = "+")))

# Fit the GAM model
gam_model <- gam(gam_formula, data = train_data, method = "REML")

# Print summary of the GAM model
cat("\nSummary of the GAM model:\n")
## 
## Summary of the GAM model:
print(summary(gam_model))
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Outstate ~ s(Accept) + s(Top25perc) + s(F.Undergrad) + s(Room.Board) + 
##     s(PhD) + s(perc.alumni) + s(Expend) + s(Grad.Rate) + factor(Private)
## 
## Parametric coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          8660.4      239.1  36.223  < 2e-16 ***
## factor(Private)Yes   2417.5      314.7   7.682 7.91e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df      F  p-value    
## s(Accept)      3.954  4.974  5.637 5.00e-05 ***
## s(Top25perc)   1.000  1.001  0.782 0.377040    
## s(F.Undergrad) 7.299  8.284  3.571 0.000264 ***
## s(Room.Board)  1.755  2.222 21.521  < 2e-16 ***
## s(PhD)         1.520  1.895  1.651 0.161962    
## s(perc.alumni) 1.000  1.001 18.028 2.62e-05 ***
## s(Expend)      5.676  6.811 27.360  < 2e-16 ***
## s(Grad.Rate)   2.719  3.463  3.936 0.007185 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.802   Deviance explained = 81.1%
## -REML = 4806.3  Scale est. = 3.3006e+06  n = 543
# Plot the GAM results (partial effect plots for each predictor)
par(mfrow = c(2, ceiling(length(best_model_vars)/2)))  # Arrange plots in a grid
plot(gam_model, shade = TRUE, se = TRUE, rug = TRUE, 
     main = "Partial Effects of Predictors on Outstate Tuition",
     xlab = NULL, ylab = "Partial Effect")
par(mfrow = c(1, 1))  # Reset plot layout
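
Before relying on the fit, mgcv's built-in diagnostics are worth a look; gam.check() produces residual plots and reports whether the default basis dimensions (k) appear adequate:

# Residual diagnostics and basis-dimension checks for the fitted GAM
gam.check(gam_model)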

# Evaluate the GAM model on the test set (mean squared error)
test_preds_gam <- predict(gam_model, newdata = test_data)
test_mse_gam <- mean((test_data$Outstate - test_preds_gam)^2)
cat("Test set MSE for the GAM model:", test_mse_gam, "\n")
## Test set MSE for the GAM model: 3068829
# Compare with the linear model from previous step
cat("Test set MSE for the linear model (from forward stepwise):", test_mse, "\n")
## Test set MSE for the linear model (from forward stepwise): 3793829
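
To put the comparison in relative terms, a one-line check using the quantities already computed:

# Percentage reduction in test MSE from the linear fit to the GAM
cat("GAM reduces test MSE by",
    round(100 * (test_mse - test_mse_gam) / test_mse, 1), "%\n")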

c.

# (c) Evaluate the GAM model on the test set
# Predict on the test set using the GAM model
test_preds_gam <- predict(gam_model, newdata = test_data)

# Compute performance metrics
# Mean Squared Error (MSE)
test_mse_gam <- mean((test_data$Outstate - test_preds_gam)^2)

# Root Mean Squared Error (RMSE)
test_rmse_gam <- sqrt(test_mse_gam)

# Mean Absolute Error (MAE)
test_mae_gam <- mean(abs(test_data$Outstate - test_preds_gam))

# R-squared on test set
sst <- sum((test_data$Outstate - mean(test_data$Outstate))^2)  # Total sum of squares
sse <- sum((test_data$Outstate - test_preds_gam)^2)  # Residual sum of squares
test_r2_gam <- 1 - sse / sst

# Print GAM test set metrics
cat("\nGAM Model Test Set Performance:\n")
## 
## GAM Model Test Set Performance:
cat("MSE:", test_mse_gam, "\n")
## MSE: 3068829
cat("RMSE:", test_rmse_gam, "\n")
## RMSE: 1751.807
cat("MAE:", test_mae_gam, "\n")
## MAE: 1313.2
cat("R-squared:", test_r2_gam, "\n")
## R-squared: 0.7960297
# Compare with the linear model from forward stepwise selection
# Recompute linear model test metrics for consistency
test_preds_lm <- predict(best_lm, newdata = test_data)
test_mse_lm <- mean((test_data$Outstate - test_preds_lm)^2)
test_rmse_lm <- sqrt(test_mse_lm)
test_mae_lm <- mean(abs(test_data$Outstate - test_preds_lm))
sst_lm <- sum((test_data$Outstate - mean(test_data$Outstate))^2)
sse_lm <- sum((test_data$Outstate - test_preds_lm)^2)
test_r2_lm <- 1 - sse_lm / sst_lm

# Print linear model test set metrics
cat("\nLinear Model (Forward Stepwise) Test Set Performance:\n")
## 
## Linear Model (Forward Stepwise) Test Set Performance:
cat("MSE:", test_mse_lm, "\n")
## MSE: 3793829
cat("RMSE:", test_rmse_lm, "\n")
## RMSE: 1947.775
cat("MAE:", test_mae_lm, "\n")
## MAE: 1474.372
cat("R-squared:", test_r2_lm, "\n")
## R-squared: 0.7478424
# Compare the models
cat("\nComparison:\n")
## 
## Comparison:
cat("GAM MSE is", ifelse(test_mse_gam < test_mse_lm, "lower", "higher"), "than Linear MSE\n")
## GAM MSE is lower than Linear MSE
cat("GAM R-squared is", ifelse(test_r2_gam > test_r2_lm, "higher", "lower"), "than Linear R-squared\n")
## GAM R-squared is higher than Linear R-squared

d.

# (d) Analyze non-linear relationships in the GAM model
# Print the GAM model summary to inspect smooth terms
cat("\nGAM Model Summary for Non-Linearity Analysis:\n")
## 
## GAM Model Summary for Non-Linearity Analysis:
print(summary(gam_model))
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Outstate ~ s(Accept) + s(Top25perc) + s(F.Undergrad) + s(Room.Board) + 
##     s(PhD) + s(perc.alumni) + s(Expend) + s(Grad.Rate) + factor(Private)
## 
## Parametric coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          8660.4      239.1  36.223  < 2e-16 ***
## factor(Private)Yes   2417.5      314.7   7.682 7.91e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df      F  p-value    
## s(Accept)      3.954  4.974  5.637 5.00e-05 ***
## s(Top25perc)   1.000  1.001  0.782 0.377040    
## s(F.Undergrad) 7.299  8.284  3.571 0.000264 ***
## s(Room.Board)  1.755  2.222 21.521  < 2e-16 ***
## s(PhD)         1.520  1.895  1.651 0.161962    
## s(perc.alumni) 1.000  1.001 18.028 2.62e-05 ***
## s(Expend)      5.676  6.811 27.360  < 2e-16 ***
## s(Grad.Rate)   2.719  3.463  3.936 0.007185 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.802   Deviance explained = 81.1%
## -REML = 4806.3  Scale est. = 3.3006e+06  n = 543
# Extract smooth terms from the GAM summary
gam_summary <- summary(gam_model)
smooth_terms <- gam_summary$s.table

# Analyze numeric predictors for non-linearity.
# Note: edf measures wiggliness (edf near 1 means essentially linear), while
# the p-value tests whether the smooth differs from zero; both are needed to
# separate "non-linear" from merely "significant".
cat("\nEvidence of Non-Linearity for Numeric Predictors:\n")
## 
## Evidence of Non-Linearity for Numeric Predictors:
for (i in 1:nrow(smooth_terms)) {
  predictor <- rownames(smooth_terms)[i]
  edf <- smooth_terms[i, "edf"]
  p_value <- smooth_terms[i, "p-value"]
  cat("Predictor:", predictor, "\n")
  cat("  Effective Degrees of Freedom (edf):", edf, "\n")
  cat("  P-value:", p_value, "\n")
  if (edf > 1.5 && p_value < 0.05) {
    cat("  --> Strong evidence of non-linear relationship\n")
  } else if (edf > 1.2 && p_value < 0.05) {
    cat("  --> Moderate evidence of non-linear relationship\n")
  } else if (edf <= 1.2) {
    cat("  --> Relationship is nearly linear\n")
  } else {
    cat("  --> No significant non-linear relationship (check p-value)\n")
  }
  cat("\n")
}
## Predictor: s(Accept) 
##   Effective Degrees of Freedom (edf): 3.954348 
##   P-value: 4.999039e-05 
##   --> Strong evidence of non-linear relationship
## 
## Predictor: s(Top25perc) 
##   Effective Degrees of Freedom (edf): 1.000362 
##   P-value: 0.3770405 
##   --> Relationship is nearly linear
## 
## Predictor: s(F.Undergrad) 
##   Effective Degrees of Freedom (edf): 7.299149 
##   P-value: 0.0002638275 
##   --> Strong evidence of non-linear relationship
## 
## Predictor: s(Room.Board) 
##   Effective Degrees of Freedom (edf): 1.755398 
##   P-value: 0 
##   --> Strong evidence of non-linear relationship
## 
## Predictor: s(PhD) 
##   Effective Degrees of Freedom (edf): 1.520082 
##   P-value: 0.161962 
##   --> No significant non-linear relationship (check p-value)
## 
## Predictor: s(perc.alumni) 
##   Effective Degrees of Freedom (edf): 1.000303 
##   P-value: 2.621062e-05 
##   --> Relationship is nearly linear
## 
## Predictor: s(Expend) 
##   Effective Degrees of Freedom (edf): 5.676215 
##   P-value: 0 
##   --> Strong evidence of non-linear relationship
## 
## Predictor: s(Grad.Rate) 
##   Effective Degrees of Freedom (edf): 2.719137 
##   P-value: 0.007184555 
##   --> Strong evidence of non-linear relationship
# Plot partial effects to visually inspect non-linearity
par(mfrow = c(2, ceiling(length(best_model_vars)/2)))  # Arrange plots in a grid
plot(gam_model, shade = TRUE, se = TRUE, rug = TRUE, 
     main = "Partial Effects of Predictors on Outstate Tuition",
     xlab = NULL, ylab = "Partial Effect")
par(mfrow = c(1, 1))  # Reset plot layout
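
As a complementary, more formal check (a sketch assuming numeric_vars and train_data from part (b) are still in scope), the same model can be refit with every predictor entering linearly and compared to the smooth fit by AIC; a substantially lower AIC for the GAM would corroborate the edf-based evidence of non-linearity above.

# Refit with all predictors linear and compare by AIC
lin_formula <- as.formula(paste("Outstate ~",
                                paste(c(numeric_vars, "Private"), collapse = "+")))
gam_linear <- gam(lin_formula, data = train_data, method = "REML")
AIC(gam_linear, gam_model)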