set.seed(100)
X <- rnorm(100, mean = 10, sd = 4)
epsilon <- rnorm(100, mean = 0, sd = 2)
We’re simulating a normally distributed predictor X and a Gaussian noise term epsilon; fixing the seed makes the randomness reproducible, so model fitting can be tested in a controlled setting.
beta0 <- 2
beta1 <- 1
beta2 <- 0.4
beta3 <- 0.6
Y <- beta0 + beta1 * X + beta2 * X^2 + beta3 * X^3 + epsilon
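As a quick sanity check (not part of the original answer), the simulated response can be plotted against the predictor:
plot(X, Y, pch = 19, xlab = "X", ylab = "Y") # the points should trace a smooth cubic trend plus noise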
This gives Y a non-linear (cubic) relationship with X, which mimics real-world data better than a straight line.
(c) Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors X, X^2, …, X^10. What is the best model obtained according to Cp, BIC, and adjusted R^2? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both X and Y.
library(leaps)
data_df <- data.frame(
Y = Y,
X = X,
X2 = X^2, X3 = X^3, X4 = X^4, X5 = X^5,
X6 = X^6, X7 = X^7, X8 = X^8, X9 = X^9, X10 = X^10
)
subset_model <- regsubsets(Y ~ ., data = data_df, nvmax = 10)
subset_summary <- summary(subset_model)
par(mfrow = c(1, 3))
plot(subset_summary$cp, type = 'l', xlab = "Number of Predictors", ylab = "Cp")
plot(subset_summary$bic, type = 'l', xlab = "Number of Predictors", ylab = "BIC")
plot(subset_summary$adjr2, type = 'l', xlab = "Number of Predictors", ylab = "Adjusted R^2")
which.min(subset_summary$bic) # Model size (number of predictors) that minimizes BIC
## [1] 3
coef(subset_model, which.min(subset_summary$bic))
## (Intercept) X2 X3 X6
## 4.496369e+00 5.370959e-01 5.936070e-01 2.342936e-07
Best subset selection evaluates all possible combinations of predictors and identifies, for each model size, the subset that explains the response best; here BIC points to the three-variable model containing X2, X3, and X6 (whose coefficient is essentially zero).
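The same check can be applied to the other two criteria (minimize Cp, maximize adjusted R^2); a minimal sketch whose output is not shown in the original:
which.min(subset_summary$cp) # model size that minimizes Mallows' Cp
which.max(subset_summary$adjr2) # model size that maximizes adjusted R^2
coef(subset_model, which.min(subset_summary$cp)) # coefficients of the Cp-selected model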
# Forward
forward_model <- regsubsets(Y ~ ., data = data_df, nvmax = 10, method = "forward")
summary(forward_model)$which
## (Intercept) X X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 3 TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## 4 TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## 5 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## 6 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE TRUE
## 7 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE
## 8 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## 9 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## 10 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Backward
backward_model <- regsubsets(Y ~ ., data = data_df, nvmax = 10, method = "backward")
summary(backward_model)$which
## (Intercept) X X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 3 TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## 4 TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## 5 TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
## 6 TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## 7 TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 8 TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 9 TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 10 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
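For a direct comparison of the stepwise searches, the BIC-chosen model from each can be extracted (a short sketch; its output is not shown in the original):
coef(forward_model, which.min(summary(forward_model)$bic)) # forward stepwise pick
coef(backward_model, which.min(summary(backward_model)$bic)) # backward stepwise pick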
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Prepare matrix inputs for glmnet
x_matrix <- model.matrix(Y ~ . -1, data = data_df) # Drop intercept since glmnet adds it
y_vector <- data_df$Y
# Cross-validation for lasso (alpha = 1)
set.seed(100)
cv_lasso <- cv.glmnet(x_matrix, y_vector, alpha = 1)
# Plot CV Error vs Lambda
plot(cv_lasso)
# Optimal lambda and coefficients
best_lambda <- cv_lasso$lambda.min
lasso_model <- glmnet(x_matrix, y_vector, alpha = 1, lambda = best_lambda)
coef(lasso_model)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 36.1796050
## X .
## X2 0.4012477
## X3 0.5839164
## X4 .
## X5 .
## X6 .
## X7 .
## X8 .
## X9 .
## X10 .
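lambda.min is used above; a more conservative choice is the one-standard-error rule that cv.glmnet also reports. A minimal sketch (output not shown):
coef(cv_lasso, s = "lambda.1se") # usually an even sparser set of non-zero coefficients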
# New response where only X^7 matters
Y2 <- 1 + 0.5 * X^7 + epsilon
data_df$Y2 <- Y2
# Best subset for the new target. Note: data_df still contains the original response Y,
# so Y appears as a candidate predictor in the output below; Y2 ~ . - Y would exclude it.
subset_model2 <- regsubsets(Y2 ~ ., data = data_df, nvmax = 10)
summary(subset_model2)
## Subset selection object
## Call: regsubsets.formula(Y2 ~ ., data = data_df, nvmax = 10)
## 11 Variables (and intercept)
## Forced in Forced out
## Y FALSE FALSE
## X FALSE FALSE
## X2 FALSE FALSE
## X3 FALSE FALSE
## X4 FALSE FALSE
## X5 FALSE FALSE
## X6 FALSE FALSE
## X7 FALSE FALSE
## X8 FALSE FALSE
## X9 FALSE FALSE
## X10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
## Y X X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " "*" " " " " " "
## 3 ( 1 ) "*" " " " " "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " "*" "*" " " " " " " "*" " " " " " "
## 5 ( 1 ) "*" "*" "*" "*" " " " " " " "*" " " " " " "
## 6 ( 1 ) "*" "*" "*" "*" " " " " " " "*" " " " " "*"
## 7 ( 1 ) "*" "*" "*" "*" " " " " " " "*" " " "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" " " " " " " "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
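Since the one-variable model already selects X7, its coefficients can be reported directly (a sketch; output not shown in the original):
coef(subset_model2, 1) # intercept and X7 coefficient of the best single-predictor model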
# Lasso on new response
cv_lasso2 <- cv.glmnet(x_matrix, Y2, alpha = 1)
plot(cv_lasso2)
coef(cv_lasso2, s = cv_lasso2$lambda.min)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.083879e+06
## X .
## X2 .
## X3 .
## X4 .
## X5 .
## X6 2.032462e-01
## X7 4.742215e-01
## X8 3.940584e-05
## X9 .
## X10 .
library(ISLR2)
set.seed(2025) # Consistent results
# Randomly splitting into 50% train/test
train_idx <- sample(1:nrow(College), size = floor(0.5 * nrow(College)))
test_idx <- setdiff(1:nrow(College), train_idx)
# Fit linear model with all predictors
lm_full <- lm(Apps ~ ., data = College, subset = train_idx)
# Predict on test data
lm_preds <- predict(lm_full, newdata = College[test_idx,])
lm_mse <- mean((College$Apps[test_idx] - lm_preds)^2)
lm_mse
## [1] 1302978
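To put this error on a more interpretable scale, the test-set R^2 can also be computed; a minimal sketch (y_test_apps is just an ad hoc name, and the quantity is not reported in the original):
y_test_apps <- College$Apps[test_idx]
1 - lm_mse / mean((y_test_apps - mean(y_test_apps))^2) # share of test-set variance explained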
library(glmnet)
# Prepare matrices
x_train <- model.matrix(Apps ~ ., data = College)[train_idx, -1]
x_test <- model.matrix(Apps ~ ., data = College)[test_idx, -1]
y_train <- College$Apps[train_idx]
y_test <- College$Apps[test_idx]
# Ridge regression (alpha = 0)
set.seed(2025)
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0,
lambda = 10^seq(5, -5, length.out = 100))
best_lambda_ridge <- cv_ridge$lambda.min
ridge_preds <- predict(cv_ridge, newx = x_test, s = best_lambda_ridge)
ridge_mse <- mean((y_test - ridge_preds)^2)
ridge_mse
## [1] 1429693
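The ridge CV curve can be inspected just as the lasso's was; a one-line sketch (plot not shown in the original):
plot(cv_ridge) # CV error versus log(lambda) for ridge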
# Lasso regression (alpha = 1)
set.seed(2025)
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1,
lambda = 10^seq(10, -5, length.out = 100))
best_lambda_lasso <- cv_lasso$lambda.min
lasso_preds <- predict(cv_lasso, newx = x_test, s = best_lambda_lasso)
lasso_mse <- mean((y_test - lasso_preds)^2)
lasso_mse
## [1] 1344052
# Selected features: print the lasso coefficients; the non-zero entries are counted in the sketch below
coef(cv_lasso, s = best_lambda_lasso)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -1.356997e+03
## PrivateYes -4.067197e+02
## Accept 1.225951e+00
## Enroll .
## Top10perc 3.110121e+01
## Top25perc -6.542472e+00
## F.Undergrad 6.381524e-02
## P.Undergrad .
## Outstate -4.203134e-02
## Room.Board 1.738223e-01
## Books 3.347535e-01
## Personal .
## PhD -1.029699e+01
## Terminal -8.208409e-01
## S.F.Ratio 1.676598e+01
## perc.alumni -4.929396e+00
## Expend 1.110444e-01
## Grad.Rate 8.435889e+00
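As noted above, the selected features are the non-zero entries; they can be counted directly (a sketch that excludes the intercept):
lasso_coefs <- coef(cv_lasso, s = best_lambda_lasso)
sum(lasso_coefs != 0) - 1 # number of predictors with non-zero coefficients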
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(2025)
pcr_fit <- pcr(Apps ~ ., data = College, subset = train_idx, scale = TRUE, validation = "CV")
# Find optimal number of components visually
validationplot(pcr_fit, val.type = "MSEP")
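Instead of reading the component count off the plot, the pls package's selectNcomp() can choose it from the cross-validation results; a minimal sketch assuming the one-sigma heuristic is acceptable (not part of the original answer):
ncomp_pcr <- selectNcomp(pcr_fit, method = "onesigma") # smallest model within one SE of the CV optimum
ncomp_pcr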
# Fit the final PCR model with the number of components suggested by the CV plot (here all 17)
pcr_final <- pcr(Apps ~ ., data = College, subset = train_idx,
scale = TRUE, ncomp = 17)
# Predict and calculate error. Note: without an ncomp argument, predict() returns predictions
# for 1 through 17 components, so this MSE averages over all of those component counts.
pcr_preds <- predict(pcr_final, newdata = College[test_idx,])
pcr_mse <- mean((y_test - pcr_preds)^2)
pcr_mse
## [1] 3957560
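If the quantity of interest is the test MSE of the 17-component fit itself, the number of components should be pinned down inside predict(); a sketch whose output is not shown here:
pcr_preds_17 <- predict(pcr_final, newdata = College[test_idx, ], ncomp = 17)
mean((y_test - pcr_preds_17)^2) # test MSE for exactly 17 components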
# Fit PLS with cross-validation
set.seed(2025)
pls_fit <- plsr(Apps ~ ., data = College, subset = train_idx,
scale = TRUE, validation = "CV")
# Visualize component selection
validationplot(pls_fit, val.type = "MSEP")
# Get predictions using 17 components; with all 17 retained, PLS reproduces the least squares fit, hence the matching test MSE
pls_preds <- predict(pls_fit, newdata = College[test_idx,], ncomp = 17)
pls_mse <- mean((y_test - pls_preds)^2)
pls_mse
## [1] 1302978
data.frame(
Model = c("Linear", "Ridge", "LASSO", "PCR", "PLSR"),
Test_MSE = c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse)
)