library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.3
library(pls)
## Warning: package 'pls' was built under R version 4.4.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(123)
n <- 100
X <- rnorm(n)
epsilon <- rnorm(n)
beta0 <- 1
beta1 <- 2
beta2 <- -3
beta3 <- 4
Y <- beta0 + beta1 * X + beta2 * X^2 + beta3 * X^3 + epsilon
data <- data.frame(Y, X, X2 = X^2, X3 = X^3, X4 = X^4, X5 = X^5,
X6 = X^6, X7 = X^7, X8 = X^8, X9 = X^9, X10 = X^10)
best_subset <- regsubsets(Y ~ ., data = data, nvmax = 10)
best_summary <- summary(best_subset)
# Find the best model based on Cp, BIC, and adjusted R2
best_cp <- which.min(best_summary$cp)
best_bic <- which.min(best_summary$bic)
best_adjR2 <- which.max(best_summary$adjr2)
# Plot model selection criteria
par(mfrow = c(1, 3))
plot(best_summary$cp, type = "b", xlab = "Number of Predictors", ylab = "Cp", main = "Cp")
points(best_cp, best_summary$cp[best_cp], col = "red", pch = 19)
plot(best_summary$bic, type = "b", xlab = "Number of Predictors", ylab = "BIC", main = "BIC")
points(best_bic, best_summary$bic[best_bic], col = "red", pch = 19)
plot(best_summary$adjr2, type = "b", xlab = "Number of Predictors", ylab = "Adjusted R2", main = "Adjusted R2")
points(best_adjR2, best_summary$adjr2[best_adjR2], col = "red", pch = 19)
# Report coefficients of the best model
coef(best_subset, best_cp)
## (Intercept) X X2 X3
## 0.9703939 1.9204462 -3.0915431 4.0204363
The red points indicate the optimal number of predictors chosen by each criterion. The reported coefficients show that the best model includes X, X^2, and X^3, which aligns with the true data-generating process Y = β0 + β1 X + β2 X^2 + β3 X^3 + ϵ.
This result confirms that best subset selection successfully identified the correct model.
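As a further check, the model size could also be chosen by cross-validation rather than by in-sample criteria alone. A minimal sketch follows; `predict_regsubsets()` is a small helper defined here (leaps provides no predict method for regsubsets objects), the fold assignment is arbitrary, and running it advances the RNG stream relative to the chunks above.
# Helper defined here: predictions from a regsubsets fit of a given size
predict_regsubsets <- function(object, newdata, id, formula = Y ~ .) {
  mat <- model.matrix(formula, newdata)
  coefi <- coef(object, id = id)
  drop(mat[, names(coefi)] %*% coefi)
}
# 5-fold cross-validation error for each model size
k <- 5
folds <- sample(rep(1:k, length.out = nrow(data)))
cv_errors <- matrix(NA, k, 10)
for (j in 1:k) {
  fit <- regsubsets(Y ~ ., data = data[folds != j, ], nvmax = 10)
  for (i in 1:10) {
    pred <- predict_regsubsets(fit, data[folds == j, ], id = i)
    cv_errors[j, i] <- mean((data$Y[folds == j] - pred)^2)
  }
}
which.min(colMeans(cv_errors))  # model size with the lowest CV error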
fwd_stepwise <- regsubsets(Y ~ ., data = data, nvmax = 10, method = "forward")
bwd_stepwise <- regsubsets(Y ~ ., data = data, nvmax = 10, method = "backward")
# Compare results
coef(fwd_stepwise, best_cp)
## (Intercept) X X2 X3
## 0.9703939 1.9204462 -3.0915431 4.0204363
coef(bwd_stepwise, best_cp)
## (Intercept) X3 X4 X6
## 0.3316351 4.6354211 -1.9009201 0.2690623
Forward Stepwise Selection: It follows a sequential approach, adding at each step the predictor that most improves the model. Here it correctly selected X, X^2, and X^3, which aligns with the true data-generating process.
Backward Stepwise Selection: It starts with all predictors and removes the least useful ones. At the same model size it instead retained X^3, X^4, and X^6, likely because of strong correlations among the higher-degree polynomial terms.
Since the backward stepwise model differs from the true model, it is harder to interpret and likely less favorable in terms of the bias-variance tradeoff.
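To see where the two search paths diverge, the selection matrices can be inspected directly. A short sketch using the fits above (the exact pattern depends on the simulated data):
# Which variables each method includes at sizes 1 through 4
summary(fwd_stepwise)$which[1:4, ]
summary(bwd_stepwise)$which[1:4, ]
# Correlations among the higher-degree terms retained by backward selection
round(cor(data[, c("X3", "X4", "X6")]), 2)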
X_matrix <- model.matrix(Y ~ ., data = data)[, -1] # Remove intercept column
lasso_model <- cv.glmnet(X_matrix, Y, alpha = 1, standardize = TRUE)
plot(lasso_model)
# Report coefficients at optimal lambda
best_lambda <- lasso_model$lambda.min
lasso_coeffs <- coef(lasso_model, s = best_lambda)
print(lasso_coeffs)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.9303729
## X 1.8820954
## X2 -3.0349793
## X3 4.0069679
## X4 .
## X5 .
## X6 .
## X7 .
## X8 .
## X9 .
## X10 .
The coefficient matrix shows that the lasso selected X, X^2, and X^3 while shrinking all other coefficients exactly to zero.
This aligns with the true model Y = β0 + β1 X + β2 X^2 + β3 X^3 + ϵ, demonstrating that the lasso performs effective feature selection.
Compared with best subset selection and forward stepwise selection, the lasso arrived at essentially the same model but enforced sparsity through shrinkage, keeping the fit interpretable.
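One detail worth noting: `lambda.min` is the penalty with the lowest cross-validated error, while `lambda.1se` applies the one-standard-error rule and typically yields an even sparser model. A brief sketch (the resulting coefficients depend on the CV folds):
# Coefficients under the one-standard-error rule
coef_1se <- coef(lasso_model, s = lasso_model$lambda.1se)
coef_1se
# Non-zero coefficient counts (excluding the intercept) under each rule
c(lambda_min = sum(lasso_coeffs != 0) - 1,
  lambda_1se = sum(coef_1se != 0) - 1)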
Y_new <- beta0 + 5 * X^7 + epsilon # Chose β7 = 5
data_new <- data.frame(Y = Y_new, X, X2 = X^2, X3 = X^3, X4 = X^4, X5 = X^5,
X6 = X^6, X7 = X^7, X8 = X^8, X9 = X^9, X10 = X^10)
# Best subset selection on new data
best_subset_new <- regsubsets(Y ~ ., data = data_new, nvmax = 10)
coef(best_subset_new, which.max(summary(best_subset_new)$adjr2))
## (Intercept) X2 X4 X6 X7 X8
## 0.79358642 2.30456710 -4.10732161 2.27693175 5.00151236 -0.50477246
## X10
## 0.03899314
# Lasso on new data
X_matrix_new <- model.matrix(Y ~ ., data = data_new)[, -1]
lasso_model_new <- cv.glmnet(X_matrix_new, Y_new, alpha = 1, standardize = TRUE)
best_lambda_new <- lasso_model_new$lambda.min
lasso_coeffs_new <- coef(lasso_model_new, s = best_lambda_new)
print(lasso_coeffs_new)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.286795
## X .
## X2 .
## X3 .
## X4 .
## X5 .
## X6 .
## X7 4.853960
## X8 .
## X9 .
## X10 .
Best Subset Selection Results:
- It selected multiple predictors (X^2, X^4, X^6, X^7, X^8, X^10) under the adjusted R^2 criterion.
- This suggests collinearity among the polynomial terms and some overfitting, since it kept extra terms beyond X^7 (see the sketch below for a comparison with the BIC choice).
Lasso Regression Results:
- It **correctly identified** X^7 as the only relevant predictor, shrinking all other coefficients to zero.
- This shows that **lasso is better at variable selection** here, enforcing sparsity and avoiding overfitting.
In short, best subset selection can overfit by picking unnecessary predictors when judged by a weak penalty, whereas the lasso performs better feature selection when the true model is sparse, as in this case.
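To check whether the extra terms chosen by best subset selection are an artifact of the weak adjusted-R^2 penalty, the same fit can be re-examined under BIC, which penalizes model size more heavily. A quick sketch (whether it recovers the single X^7 term depends on the simulated data):
# Compare the adjusted-R^2 choice with the more conservative BIC choice
new_summary <- summary(best_subset_new)
coef(best_subset_new, which.min(new_summary$bic))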
# Load the data
data(College)
set.seed(1)
train_indices <- sample(1:nrow(College), 0.7*nrow(College))
train <- College[train_indices, ]
test <- College[-train_indices, ]
lm_fit <- lm(Apps ~ ., data = train)
lm_pred <- predict(lm_fit, newdata = test)
lm_mse <- mean((lm_pred - test$Apps)^2)
lm_mse
## [1] 1261630
x_train <- model.matrix(Apps ~ ., train)[,-1]
y_train <- train$Apps
x_test <- model.matrix(Apps ~ ., test)[,-1]
ridge_cv <- cv.glmnet(x_train, y_train, alpha = 0)
ridge_pred <- predict(ridge_cv, s = "lambda.min", newx = x_test)
ridge_mse <- mean((ridge_pred - test$Apps)^2)
ridge_mse
## [1] 1121034
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1)
lasso_pred <- predict(lasso_cv, s = "lambda.min", newx = x_test)
lasso_coef <- coef(lasso_cv, s = "lambda.min")
num_nonzero <- sum(lasso_coef != 0) - 1 # Exclude intercept
lasso_mse <- mean((lasso_pred - test$Apps)^2)
num_nonzero
## [1] 17
lasso_mse
## [1] 1254408
pcr_fit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
pcr_pred <- predict(pcr_fit, newdata = test, ncomp = which.min(pcr_fit$validation$adj))
pcr_mse <- mean((pcr_pred - test$Apps)^2)
optimal_m_pcr <- which.min(pcr_fit$validation$adj)
pcr_mse
## [1] 1261630
optimal_m_pcr
## [1] 17
pls_fit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
pls_pred <- predict(pls_fit, newdata = test, ncomp = which.min(pls_fit$validation$adj))
pls_mse <- mean((pls_pred - test$Apps)^2)
optimal_m_pls <- which.min(pls_fit$validation$adj)
pls_mse
## [1] 1261630
optimal_m_pls
## [1] 17
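Note that the component counts above are chosen by minimizing `pcr_fit$validation$adj` / `pls_fit$validation$adj`; in the pls package that slot holds adjustment values used to compute the bias-corrected MSEP, not the CV error curve itself. A more conventional way to pick the number of components is to inspect the cross-validated MSEP directly, sketched below (the MSEP curve starts at the 0-component intercept-only model, hence the `- 1`):
# Cross-validated MSEP curves for the dimension-reduction fits
validationplot(pcr_fit, val.type = "MSEP")
validationplot(pls_fit, val.type = "MSEP")
# Component count minimizing the adjusted-CV MSEP (position 1 is the 0-component model)
which.min(MSEP(pcr_fit)$val["adjCV", 1, ]) - 1
which.min(MSEP(pls_fit)$val["adjCV", 1, ]) - 1
# Or use the parsimonious one-sigma rule built into pls
selectNcomp(pcr_fit, method = "onesigma", plot = FALSE)
selectNcomp(pls_fit, method = "onesigma", plot = FALSE)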
results <- data.frame(
  Method = c("Linear Regression", "Ridge", "Lasso", "PCR", "PLS"),
  MSE = round(c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse), 1),
  NonZero_Coeff = c(NA, NA, num_nonzero, NA, NA),  # non-zero count belongs to the Lasso row
  Optimal_M = c(NA, NA, NA, optimal_m_pcr, optimal_m_pls)
)
print(results)
##              Method     MSE NonZero_Coeff Optimal_M
## 1 Linear Regression 1261630            NA        NA
## 2             Ridge 1121034            NA        NA
## 3             Lasso 1254408            17        NA
## 4               PCR 1261630            NA        17
## 5               PLS 1261630            NA        17
The lasso kept all 17 predictors at lambda.min, so it performed essentially no variable selection here, and its test MSE is close to that of OLS.
PCR needed all 17 principal components for its best cross-validated fit, which makes it equivalent to OLS and explains why their test MSEs match.
PLS likewise used 17 components and performed the same as OLS.
Ridge regression achieved the lowest test MSE of the five methods, a modest improvement over the other linear fits (OLS, Lasso).
Overall, dimension reduction (PCR/PLS) offered no advantage over OLS on this split, while ridge's shrinkage helped slightly.
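Finally, to put these MSE values on a more interpretable scale, one can compare test-set R^2 across the methods; a short sketch using the objects above:
# Test-set R^2 = 1 - MSE / (variance of Apps in the test set)
tss <- mean((test$Apps - mean(test$Apps))^2)
round(1 - c(OLS = lm_mse, Ridge = ridge_mse, Lasso = lasso_mse,
            PCR = pcr_mse, PLS = pls_mse) / tss, 3)
Because all five test MSEs are of similar magnitude, the resulting R^2 values will be close to one another, mirroring the comparison in the table above.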