if (!require(leaps)) {
  install.packages("leaps")
  library(leaps)  # attach after a fresh install
}
## Loading required package: leaps
if (!require(pls)) {
  install.packages("pls")
  library(pls)  # attach after a fresh install
}
## Loading required package: pls
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
if (!require(glmnet)) {
  install.packages("glmnet")
  library(glmnet)  # attach after a fresh install
}
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 4.1-8
set.seed(1)
n <- 100
X <- rnorm(n)
epsilon <- rnorm(n)
beta_0 <- 2
beta_1 <- 3
beta_2 <- -2
beta_3 <- 1
Y <- beta_0 + beta_1*X + beta_2*X^2 + beta_3*X^3 + epsilon
library(leaps)
# Create data frame with predictors X^1 to X^10
predictors <- data.frame(
  Y = Y,
  X1 = X,
  X2 = X^2,
  X3 = X^3,
  X4 = X^4,
  X5 = X^5,
  X6 = X^6,
  X7 = X^7,
  X8 = X^8,
  X9 = X^9,
  X10 = X^10
)
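As a side note, the same ten raw polynomial columns can be built more compactly with poly(); a minimal sketch of the equivalent construction (predictors_alt is a name introduced here only for illustration):
# Sketch: equivalent, more compact construction of the raw polynomial terms
predictors_alt <- data.frame(Y = Y, poly(X, 10, raw = TRUE))
names(predictors_alt) <- c("Y", paste0("X", 1:10))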
best_fit <- regsubsets(Y ~ ., data = predictors, nvmax = 10)
best_summary <- summary(best_fit)
# Plot model selection criteria
par(mfrow = c(2, 2))
plot(best_summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "b")
points(which.min(best_summary$cp), min(best_summary$cp), col = "red", pch = 19)
plot(best_summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "b")
points(which.min(best_summary$bic), min(best_summary$bic), col = "blue", pch = 19)
plot(best_summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted R2", type = "b")
points(which.max(best_summary$adjr2), max(best_summary$adjr2), col = "green", pch = 19)
# Extract coefficients of best model (based on BIC for example)
coef(best_fit, which.min(best_summary$bic))
## (Intercept) X1 X2 X3
## 2.061507 2.975280 -2.123791 1.017639
Based on the BIC, the best model includes the predictors X, X^2, and X^3, and the estimated coefficients (2.06, 2.98, -2.12, 1.02) are close to the true values (2, 3, -2, 1). This is consistent with the data-generating process, showing that best subset selection recovers the underlying structure of the model.
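To see at a glance whether the three criteria agree on model size, the optimum of each can be tabulated directly from the summary object computed above; a small sketch:
# Sketch: model size preferred by each criterion (from best_summary above)
c(Cp    = which.min(best_summary$cp),
  BIC   = which.min(best_summary$bic),
  AdjR2 = which.max(best_summary$adjr2))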
forward_fit <- regsubsets(Y ~ ., data = predictors, nvmax = 10, method = "forward")
backward_fit <- regsubsets(Y ~ ., data = predictors, nvmax = 10, method = "backward")
# Compare results
summary(forward_fit)$which[which.min(summary(forward_fit)$bic), ]
## (Intercept) X1 X2 X3 X4 X5
## TRUE TRUE TRUE TRUE FALSE FALSE
## X6 X7 X8 X9 X10
## FALSE FALSE FALSE FALSE FALSE
summary(backward_fit)$which[which.min(summary(backward_fit)$bic), ]
## (Intercept) X1 X2 X3 X4 X5
## TRUE TRUE TRUE FALSE FALSE TRUE
## X6 X7 X8 X9 X10
## FALSE FALSE FALSE FALSE FALSE
Forward stepwise selection agrees with best subset selection, keeping X, X^2, and X^3 at its BIC-optimal size. Backward stepwise selection, however, keeps X, X^2, and X^5 instead, so the two greedy approaches do not fully agree here, and neither is guaranteed to recover the model found by exhaustive search; the sketch below pinpoints the disagreement.
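A quick way to locate where the two stepwise solutions differ is to compare the variable sets they keep at their BIC-optimal sizes, using the fits above:
# Sketch: variables kept by each stepwise method at its BIC-optimal size
fwd_vars <- names(which(summary(forward_fit)$which[which.min(summary(forward_fit)$bic), ]))
bwd_vars <- names(which(summary(backward_fit)$which[which.min(summary(backward_fit)$bic), ]))
setdiff(union(fwd_vars, bwd_vars), intersect(fwd_vars, bwd_vars))  # variables not shared by both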
library(glmnet)
# Prepare design matrix
x_mat <- model.matrix(Y ~ . - 1, data = predictors)
cv_lasso <- cv.glmnet(x_mat, Y, alpha = 1)
plot(cv_lasso)
coef(cv_lasso, s = "lambda.min")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 2.04230565
## X1 3.27797188
## X2 -2.10819881
## X3 0.64713641
## X4 .
## X5 0.06304737
## X6 .
## X7 .
## X8 .
## X9 .
## X10 .
The lasso retains X, X^2, and X^3 (with the X^3 coefficient shrunk toward zero) plus a small spurious X^5 term, while driving the remaining coefficients exactly to zero. This is broadly consistent with the true model and with best subset selection, and shows how the lasso performs variable selection and regularization simultaneously, at the cost of some shrinkage bias.
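Since lambda.min retained a small X^5 term, it is worth also checking the one-standard-error rule, which typically yields a sparser fit (the exact result depends on the random CV folds):
# Sketch: lasso coefficients under the one-standard-error rule; usually sparser than lambda.min
coef(cv_lasso, s = "lambda.1se")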
beta_7 <- 5
Y2 <- beta_0 + beta_7*X^7 + epsilon
predictors2 <- predictors
predictors2$Y <- Y2
best_fit2 <- regsubsets(Y ~ ., data = predictors2, nvmax = 10)
summary(best_fit2)
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = predictors2, nvmax = 10)
## 10 Variables (and intercept)
## Forced in Forced out
## X1 FALSE FALSE
## X2 FALSE FALSE
## X3 FALSE FALSE
## X4 FALSE FALSE
## X5 FALSE FALSE
## X6 FALSE FALSE
## X7 FALSE FALSE
## X8 FALSE FALSE
## X9 FALSE FALSE
## X10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " "*" " " " " " "
## 3 ( 1 ) " " "*" " " " " "*" " " "*" " " " " " "
## 4 ( 1 ) "*" "*" "*" " " " " " " "*" " " " " " "
## 5 ( 1 ) "*" "*" "*" "*" " " " " "*" " " " " " "
## 6 ( 1 ) "*" " " "*" " " " " "*" "*" "*" " " "*"
## 7 ( 1 ) "*" " " "*" " " "*" "*" "*" "*" " " "*"
## 8 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*" " " "*"
## 9 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
coef(best_fit2, which.min(summary(best_fit2)$bic))
## (Intercept) X7
## 1.95894 5.00077
x_mat2 <- model.matrix(Y ~ . - 1, data = predictors2)
cv_lasso2 <- cv.glmnet(x_mat2, Y2, alpha = 1)
plot(cv_lasso2)
coef(cv_lasso2, s = "lambda.min")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 2.574163
## X1 .
## X2 .
## X3 .
## X4 .
## X5 .
## X6 .
## X7 4.854995
## X8 .
## X9 .
## X10 .
In this case, both best subset selection and the lasso correctly identify X^7 as the only relevant predictor. Best subset selection estimates its coefficient at essentially the true value of 5, whereas the lasso shrinks it to about 4.85 and compensates through an inflated intercept. Best subset selection therefore gives the less biased and more interpretable sparse fit when the true subset is small and well separated, while the lasso trades a little bias for its built-in regularization.
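A direct side-by-side of the X^7 estimates makes the shrinkage bias visible; a sketch built from the two fits above:
# Sketch: X7 estimate from best subset vs. the lasso (true value is 5)
c(best_subset = unname(coef(best_fit2, which.min(summary(best_fit2)$bic))["X7"]),
  lasso       = coef(cv_lasso2, s = "lambda.min")["X7", 1])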
library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
##
## The following object is masked from 'package:pls':
##
## R2
library(glmnet)
library(pls)
# College data read from a local CSV; the same data set is also available as ISLR2::College
college <- read.csv("/Users/saransh/Downloads/Statistical_Learning_Resources/College.csv", row.names = 1)
set.seed(1)
# Create training and test split
train_index <- createDataPartition(college$Apps, p = 0.7, list = FALSE)
train <- college[train_index, ]
test <- college[-train_index, ]
# Prepare response and predictor matrices
x_train <- model.matrix(Apps ~ ., data = train)[, -1]
y_train <- train$Apps
x_test <- model.matrix(Apps ~ ., data = test)[, -1]
y_test <- test$Apps
lm_fit <- lm(Apps ~ ., data = train)
lm_pred <- predict(lm_fit, newdata = test)
lm_mse <- mean((y_test - lm_pred)^2)
lm_mse
## [1] 921637.5
The test MSE using linear regression is approximately 921,638. This serves as a baseline for comparing other models.
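For scale, the raw MSE can be translated into a test R^2 against the naive baseline of predicting the training mean; a quick sketch using the quantities already computed:
# Sketch: test R^2 of the linear model relative to predicting mean(y_train)
1 - lm_mse / mean((y_test - mean(y_train))^2)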
ridge_cv <- cv.glmnet(x_train, y_train, alpha = 0)
ridge_best_lambda <- ridge_cv$lambda.min
ridge_pred <- predict(ridge_cv, s = ridge_best_lambda, newx = x_test)
ridge_mse <- mean((y_test - ridge_pred)^2)
ridge_mse
## [1] 1032561
The ridge regression model gives a test MSE of about 1,032,561, somewhat worse than least squares on this split. Ridge shrinks all coefficients toward zero, which can reduce overfitting, but it offers no advantage here.
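The shrinkage behind this fit is easiest to see on the coefficient path; a sketch that refits the full ridge path and plots the coefficients against log(lambda):
# Sketch: ridge coefficient paths; all coefficients shrink smoothly toward zero as lambda grows
ridge_path <- glmnet(x_train, y_train, alpha = 0)
plot(ridge_path, xvar = "lambda", label = TRUE)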
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1)
lasso_best_lambda <- lasso_cv$lambda.min
lasso_pred <- predict(lasso_cv, s = lasso_best_lambda, newx = x_test)
lasso_mse <- mean((y_test - lasso_pred)^2)
lasso_coef <- coef(lasso_cv, s = "lambda.min")
non_zero_coef <- sum(lasso_coef != 0) - 1 # excluding intercept
lasso_mse
## [1] 945037.8
non_zero_coef
## [1] 13
Lasso achieves a test MSE of approximately 945,038 while retaining 13 of the 17 predictors, so it comes close to the least-squares error with a somewhat simpler model, striking a good balance between performance and interpretability.
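To see which predictors the lasso keeps at lambda.min (the exact set depends on the CV folds fixed by set.seed above), the nonzero rows of the coefficient matrix can be listed:
# Sketch: names of the terms with nonzero lasso coefficients (intercept included)
rownames(lasso_coef)[as.vector(as.matrix(lasso_coef) != 0)]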
pcr_fit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
summary(pcr_fit)
## Data: X dimension: 545 17
## Y dimension: 545 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4012 3965 2198 2222 1852 1769 1760
## adjCV 4012 3965 2195 2222 1840 1764 1754
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1742 1735 1641 1645 1652 1655 1661
## adjCV 1732 1729 1634 1639 1646 1650 1656
## 14 comps 15 comps 16 comps 17 comps
## CV 1651 1546 1266 1236
## adjCV 1648 1515 1255 1226
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 30.774 56.86 63.89 69.96 75.50 80.34 83.92 87.39
## Apps 3.484 71.35 71.53 80.87 82.05 82.35 83.27 83.42
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.39 92.72 94.89 96.66 97.87 98.70 99.34
## Apps 84.72 84.91 84.95 84.97 84.97 85.21 91.17
## 16 comps 17 comps
## X 99.85 100.00
## Apps 92.67 92.95
# Choose optimal M
validationplot(pcr_fit, val.type = "MSEP")
pcr_pred <- predict(pcr_fit, test, ncomp = which.min(pcr_fit$validation$PRESS))
pcr_mse <- mean((y_test - pcr_pred)^2)
pcr_mse
## [1] 921637.5
Choosing M by the minimum of PRESS selects all 17 components, so the PCR fit coincides with ordinary least squares and its test MSE matches the linear-regression baseline exactly. With this selection rule, PCR provides no dimension reduction for these data.
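The pls package also provides selectNcomp(), which applies a one-standard-error or randomization rule and usually prefers far fewer components than the raw minimum of PRESS; a sketch (the result depends on the random CV segments):
# Sketch: component count under the one-sigma rule, typically much smaller than 17
selectNcomp(pcr_fit, method = "onesigma", plot = FALSE)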
pls_fit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
summary(pls_fit)
## Data: X dimension: 545 17
## Y dimension: 545 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4012 2018 1676 1610 1535 1389 1300
## adjCV 4012 2012 1668 1602 1515 1365 1286
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1279 1268 1261 1258 1256 1251 1249
## adjCV 1266 1256 1249 1247 1245 1240 1238
## 14 comps 15 comps 16 comps 17 comps
## CV 1249 1249 1249 1249
## adjCV 1238 1238 1238 1238
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 26.22 38.96 62.40 65.22 67.99 73.15 77.62 81.12
## Apps 76.71 84.67 86.89 90.38 92.42 92.75 92.81 92.85
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.38 86.63 89.56 90.79 92.46 94.70 96.64
## Apps 92.89 92.92 92.93 92.95 92.95 92.95 92.95
## 16 comps 17 comps
## X 98.78 100.00
## Apps 92.95 92.95
# Choose optimal M
validationplot(pls_fit, val.type = "MSEP")
pls_pred <- predict(pls_fit, test, ncomp = which.min(pls_fit$validation$PRESS))
pls_mse <- mean((y_test - pls_pred)^2)
pls_mse
## [1] 922149.4
PLS attains essentially the same test error as PCR and least squares. Because its components are constructed to maximize covariance with the response rather than just the variance of the predictors, its cross-validation error flattens after far fewer components than PCR requires, which can make PLS more effective in settings where only a few directions matter.
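Since the CV curve flattens well before 17 components, it is worth checking the test error at a smaller M, for example 12, a value read off the RMSEP table above rather than formally selected:
# Sketch: test MSE of PLS with 12 components (12 chosen by eye from the RMSEP table)
mean((y_test - predict(pls_fit, test, ncomp = 12))^2)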
All five models (linear regression, ridge, lasso, PCR, and PLS) yield test errors in the same ballpark, indicating reasonably good predictive accuracy across the board. Lasso's feature selection makes it the most interpretable and parsimonious of the fits.
PCR and PLS address multicollinearity and overfitting through dimension reduction, while ridge handles multicollinearity at the cost of interpretability. Since no model drastically outperforms the others, model choice should weigh prediction error against context-specific goals such as simplicity and interpretability.
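Collecting the test MSEs computed above into a single table makes the comparison explicit:
# Sketch: summary of the test MSEs from the five fits above
data.frame(model    = c("OLS", "Ridge", "Lasso", "PCR", "PLS"),
           test_mse = c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse))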