In this lecture, we apply Best Subset Selection, Lasso Regression,
and Ridge Regression to build predictive models. Following modern
R development practices, we implement these using the
standard tidyverse-compliant frameworks:
tidymodels for the shrinkage methods and
modelr for the subset selection resamples. We use
ggplot2 for visualizing tuning curves and provide code for
DT to generate interactive tables.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
In this lecture, we apply Best Subset Selection, Lasso Regression,
and Ridge Regression to build predictive models. This lecture focuses on
the regression task of predicting the per capita crime rate
(crim) using the classic Boston
housing dataset.
# Install missing packages if necessary
# install.packages(c("tidymodels", "modelr", "ISLR2", "DT", "leaps", "glmnet","plotly"))
# Load libraries
library(tidymodels)
library(modelr)
library(ISLR2)
library(DT)
library(ggplot2)
library(leaps)
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
library(plotly)
# Load the classic Boston dataset
data(Boston)
# Set seed for reproducibility and establish 10-fold CV splits
set.seed(1)
boston_folds <- vfold_cv(Boston, v = 10)
# Create a preprocessing recipe (normalizing features is mandatory for Lasso/Ridge)
boston_recipe <- recipe(crim ~ ., data = Boston) %>%
step_normalize(all_predictors())
We will cover three foundational methodologies for feature selection and regularization:
Following modern R development practices, we implement
these using tidyverse-compliant frameworks: tidymodels for
the shrinkage methods and modelr for the subset selection
resamples. Visualizations are built using ggplot2, and
interactive data representations are generated via DT.
Seeks to minimize the Residual Sum of Squares (RSS) subject to a constraint on the total number of predictors \(p\):
\[\min_{\beta} \sum_{i=1}^n \left(y_i - \beta_0 - \sum_{j=1}^m \beta_j x_{ij}\right)^2 \quad \text{subject to} \quad \sum_{j=1}^m I(\beta_j \neq 0) \le p\]
Where \(I(\cdot)\) is an indicator function. This is a discrete optimization problem.
# Robust custom prediction helper for regsubsets
predict.regsubsets <- function(object, newdata, id, ...) {
form <- as.formula(object$call[[2]])
mat <- model.matrix(form, newdata)
# Explicitly force evaluation of id to bypass environment masking issues
subset_id <- as.integer(id)
coefi <- coef(object, id = subset_id)
xvars <- names(coefi)
mat[, xvars, drop = FALSE] %*% coefi
}
# Define 10 cross-validation folds using modelr
set.seed(1)
cv_folds <- Boston %>% crossv_kfold(k = 10)
# Evaluate all valid subset sizes across all 10 folds
cv_subset_errors <- map_df(1:10, function(fold_idx) {
train_df <- as.data.frame(cv_folds$train[[fold_idx]])
test_df <- as.data.frame(cv_folds$test[[fold_idx]])
# Fit best subsets on training fold
fit_subsets <- regsubsets(crim ~ ., data = train_df, nvmax = 13)
# Determine how many models were ACTUALLY generated for this fold
# (Prevents subscript out of bounds if it stops early)
num_models_built <- length(fit_subsets$rss) - 1 # Subtract 1 because index 1 is the intercept-only model
if(num_models_built < 1) return(NULL) # Defensive escape if no models built
# Build the model matrix for the test data once per fold
test_mat <- model.matrix(crim ~ ., data = test_df)
actual <- test_df$crim
# Map safely only over the valid indices that were built
map_df(1:num_models_built, function(i) {
# Extract coefficients safely for subset size 'i'
coefi <- coef(fit_subsets, id = i)
xvars <- names(coefi)
# Compute predictions manually via matrix multiplication
preds <- test_mat[, xvars, drop = FALSE] %*% coefi
rss <- sum((actual - preds)^2)
tss <- sum((actual - mean(actual))^2)
rsq_val <- 1 - (rss / tss)
tibble(
fold = fold_idx,
size = i,
rmse = sqrt(mean((actual - preds)^2)),
mae = mean(abs(actual - preds)),
rsq = rsq_val
)
})
})
# Summarize metrics across folds for each subset size
subset_cv_summary <- cv_subset_errors %>%
group_by(size) %>%
summarize(
mean_rmse = mean(rmse),
mean_mae = mean(mae),
mean_rsq = mean(rsq)
)
# Extract metrics for the optimal subset size (minimizing RMSE)
best_subset_size <- subset_cv_summary %>%
filter(mean_rmse == min(mean_rmse)) %>%
pull(size)
best_subset_metrics <- subset_cv_summary %>%
filter(size == best_subset_size) %>%
pivot_longer(cols = starts_with("mean_"), names_to = "metric", values_to = "estimate") %>%
mutate(
metric = str_replace(metric, "mean_", ""),
model = "Best Subset Selection"
) %>%
select(metric, estimate, model)
# Highlight the optimal size
optimal_size <- subset_cv_summary %>%
filter(mean_rmse == min(mean_rmse)) %>%
pull(size)
subset_curve_gg <- ggplot(subset_cv_summary, aes(x = size, y = mean_rmse,
text = paste("Subset Size:", size,
"<br>CV RMSE:", round(mean_rmse, 4)))) +
geom_line(color = "darkgreen", size = 1) +
geom_point(color = "darkgreen", size = 2.5) +
geom_vline(xintercept = optimal_size, linetype = "dashed", color = "grey40") +
scale_x_continuous(breaks = 1:13) +
labs(
title = "Best Subset Selection: Cross-Validation Profile",
x = "Number of Predictors (Subset Size)",
y = "Mean Cross-Validated RMSE"
) +
theme_minimal()
ggplotly(subset_curve_gg, tooltip = "text")
This plot shows how the 10-fold cross-validated RMSE drops as variables are added, highlighting the mathematically “optimal” model size with a dashed line.
full_subsets <- regsubsets(crim ~ ., data = Boston, nvmax = 13)
subsets_summary <- summary(full_subsets)
inclusion_matrix <- as.data.frame(subsets_summary$which) %>%
mutate(subset_size = row_number()) %>%
pivot_longer(cols = -subset_size, names_to = "Variable", values_to = "Included") %>%
filter(Variable != "(Intercept)") %>%
mutate(Status = if_else(Included, "Selected", "Excluded"))
matrix_gg <- ggplot(inclusion_matrix, aes(x = factor(subset_size), y = Variable, fill = Status,
text = paste("Subset Size (p):", subset_size,
"<br>Variable:", Variable,
"<br>Status:", Status))) +
geom_tile(color = "white", size = 0.5) +
scale_fill_manual(values = c("Selected" = "darkblue", "Excluded" = "grey90")) +
labs(
title = "Best Subset Selection: Variable Inclusion Matrix",
x = "Model Subset Size (p)",
y = "Predictor Variable",
fill = "Status"
) +
theme_minimal() +
theme(panel.grid = element_blank())
ggplotly(matrix_gg, tooltip = "text")
Let’s compare these results to the full linear regression model
full_lm_spec <- linear_reg() %>% set_engine("lm")
full_lm_wf <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(full_lm_spec)
# Force the use of yardstick metrics to bypass modelr namespace masking
full_lm_results <- fit_resamples(
full_lm_wf,
resamples = boston_folds,
metrics = metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae)
)
full_lm_metrics <- collect_metrics(full_lm_results) %>%
select(metric = .metric, estimate = mean) %>%
mutate(model = "Full Linear Regression")
# 1. Properly reshape the long metrics into a single row wide-format table
full_lm_cleaned <- full_lm_metrics %>%
select(metric, estimate) %>%
pivot_wider(names_from = metric, values_from = estimate) %>%
mutate(Model = "Full Linear Regression") %>%
select(Model, RMSE = rmse, MAE = mae, `R-squared` = rsq)
# 2. View it using DT::datatable
datatable(full_lm_cleaned, options = list(dom = 't'))
Introduces a continuous penalty based on the absolute magnitudes of the coefficients:
\[\min_{\beta} \sum_{i=1}^n \left(y_i - \beta_0 - \sum_{j=1}^m \beta_j x_{ij}\right)^2 + \lambda \sum_{j=1}^m |\beta_j|\]
# --- Lasso Specification ---
lasso_spec <- linear_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet")
lasso_wf <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(lasso_spec)
# Define a grid of penalty parameters to test (10^-4 to 10^2)
penalty_grid <- grid_regular(penalty(range = c(-4, 2)), levels = 100)
# Run hyperparameter tuning via Cross-Validation
lasso_results <- tune_grid(
lasso_wf,
resamples = boston_folds,
grid = penalty_grid,
metrics = metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae)
)
# Extract best hyperparameters based on minimizing RMSE
best_lasso <- select_best(lasso_results, metric = "rmse")
# Collect cross-validated performance metrics for the best models
lasso_cv_metrics <- collect_metrics(lasso_results) %>%
filter(penalty == best_lasso$penalty) %>%
select(metric = .metric, estimate = mean) %>%
mutate(model = "Lasso Regression")
Introduces a continuous penalty based on the squared magnitudes of the coefficients:
\[\min_{\beta} \sum_{i=1}^n \left(y_i - \beta_0 - \sum_{j=1}^m \beta_j x_{ij}\right)^2 + \lambda \sum_{j=1}^m \beta_j^2\]
# --- Ridge Specification ---
ridge_spec <- linear_reg(penalty = tune(), mixture = 0) %>%
set_engine("glmnet")
ridge_wf <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(ridge_spec)
# Define a grid of penalty parameters to test (10^-4 to 10^2)
penalty_grid <- grid_regular(penalty(range = c(-4, 2)), levels = 100)
# Run hyperparameter tuning via Cross-Validation
ridge_results <- tune_grid(
ridge_wf,
resamples = boston_folds,
grid = penalty_grid,
metrics = metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae)
)
# Extract best hyperparameters based on minimizing RMSE
best_ridge <- select_best(ridge_results, metric = "rmse")
# Collect cross-validated performance metrics for the best models
ridge_cv_metrics <- collect_metrics(ridge_results) %>%
filter(penalty == best_ridge$penalty) %>%
select(metric = .metric, estimate = mean) %>%
mutate(model = "Ridge Regression")
Cross-validation evaluates the out-of-sample RMSE across a grid of penalty hyperparameters (\(\lambda\)) for Ridge and Lasso.
# 1. Extract and combine the tuning datasets, adding a identifying column
lasso_plot_data <- collect_metrics(lasso_results) %>%
filter(.metric == "rmse") %>%
mutate(Model = "Lasso (L1)")
ridge_plot_data <- collect_metrics(ridge_results) %>%
filter(.metric == "rmse") %>%
mutate(Model = "Ridge (L2)")
combined_tuning_data <- bind_rows(lasso_plot_data, ridge_plot_data)
# 2. Build the combined ggplot object with custom tooltip text
combined_gg <- ggplot(combined_tuning_data,
aes(x = penalty, y = mean, color = Model, group = Model,
text = paste("Model:", Model,
"<br>Lambda:", round(penalty, 5),
"<br>10-Fold CV RMSE:", round(mean, 4)))) +
geom_line(size = 1) +
scale_x_log10() +
scale_color_manual(values = c("Lasso (L1)" = "darkred", "Ridge (L2)" = "navy")) +
# Add individual vertical dashed lines for each optimized penalty
geom_vline(xintercept = best_lasso$penalty, linetype = "dashed", color = "darkred", alpha = 0.7) +
geom_vline(xintercept = best_ridge$penalty, linetype = "dashed", color = "navy", alpha = 0.7) +
labs(
title = "Regularization Parameter Tuning Paths Matrix",
x = "Penalty Parameter (Lambda on Log10 Scale)",
y = "10-Fold CV RMSE",
color = "Regularization Type"
) +
theme_minimal()
# 3. Convert to an interactive plotly chart, specifying our custom text tooltip
ggplotly(combined_gg, tooltip = "text")
| Feature / Dimension | Best Subset Selection | Lasso Regression (\(L_1\)) | Ridge Regression (\(L_2\)) |
|---|---|---|---|
| Optimization Type | Discrete combinatorial optimization (\(2^m\) possible models). | Continuous convex optimization. | Continuous convex optimization. |
| Computational Complexity | Extremely high; becomes NP-hard and computationally intractable for large \(m\). | Efficiently solved via coordinate descent (e.g.,
glmnet). |
Efficiently solved via closed-form matrix solutions or coordinate descent. |
| Sparsity & Variable Selection | Yields a truly sparse model by explicitly dropping predictors. | Yields a sparse model; forces coefficients to exactly zero at high \(\lambda\). | Non-sparse; shrinks coefficients asymptotically toward zero but never excludes them entirely. |
| Handling Multicollinearity | Can pick one arbitrary predictor from a group of highly correlated variables. | Selects one variable from a correlated cluster and zeroes out the others; can be unstable. | Shrinks correlated coefficients together, distributing weight across them stably. |
Using 10-fold cross-validation to estimate the
out-of-sample error across the entire dataset, we obtain the following
performance metrics when predicting the per capita crime rate
(crim):
# 1. Clean and pivot Lasso metrics
lasso_cleaned <- lasso_cv_metrics %>%
select(metric, estimate) %>%
pivot_wider(names_from = metric, values_from = estimate) %>%
mutate(
Model = "Lasso Regression",
Hyperparameters = paste0("Penalty (λ) ≈ ", round(best_lasso$penalty, 4), ", scaled")
)
# 2. Clean and pivot Ridge metrics
ridge_cleaned <- ridge_cv_metrics %>%
select(metric, estimate) %>%
pivot_wider(names_from = metric, values_from = estimate) %>%
mutate(
Model = "Ridge Regression",
Hyperparameters = paste0("Penalty (λ) ≈ ", round(best_ridge$penalty, 4), ", scaled")
)
# 3. Clean and pivot Full Linear Regression metrics
full_lm_cleaned <- full_lm_metrics %>%
select(metric, estimate) %>%
pivot_wider(names_from = metric, values_from = estimate) %>%
mutate(
Model = "Full Linear Regression",
Hyperparameters = "Baseline (all 13 predictors)"
)
# 4. Clean and pivot Best Subset metrics
# (Best subset already has 'model' instead of 'Model', let's align it)
subset_cleaned <- best_subset_metrics %>%
pivot_wider(names_from = metric, values_from = estimate) %>%
mutate(
Model = "Best Subset Selection",
Hyperparameters = paste0("Optimal Subset Size p = ", best_subset_size)
)
# 5. Bind all datasets dynamically into a single master summary data frame
cv_master_comparison <- bind_rows(
full_lm_cleaned,
lasso_cleaned,
ridge_cleaned,
subset_cleaned
) %>%
# Standardize column arrangements and capitalize metric headers
select(Model, RMSE = rmse, MAE = mae, `R-squared` = rsq, Hyperparameters)
# 6. Render the live interactive DT Table
datatable(
cv_master_comparison,
extensions = 'Responsive',
options = list(
pageLength = 4,
dom = 't',
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color': '#2c3e50', 'color': '#fff'});",
"}"
)
),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left; color: #2c3e50; font-weight: bold; font-size: 16px;',
'Dynamic 10-Fold Cross-Validated Performance Matrix (Target Variable: crim)'
)
) %>%
# Apply professional rounding to the active columns dynamically
formatRound(columns = c('RMSE', 'MAE', 'R-squared'), digits = 4)
When selecting a model for production or scientific reporting, relying on a single metric is often misleading. The choice of RMSE, MAE, or \(R^2\) as your primary decision criterion depends entirely on the specific real-world scenario and mathematical objectives.
\[RMSE = \sqrt{\frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)^2} \quad \text{vs.} \quad MAE = \frac{1}{n} \sum_{i=1}^n |y_i - \hat{y}_i|\]
\[R^2 = 1 - \frac{\sum (y_i - \hat{y}_i)^2}{\sum (y_i - \bar{y})^2}\]