The lasso is less flexible than least squares: its L1 penalty shrinks the coefficient estimates and sets some exactly to zero. This reduces variance at the cost of added bias, so prediction accuracy improves relative to least squares when the increase in bias is smaller than the decrease in variance.
Ridge regression also reduces model flexibility, through an L2 penalty. It shrinks coefficients toward zero without setting any exactly to zero, again trading a small increase in bias for a reduction in variance. As with the lasso, ridge outperforms least squares when the bias added is less than the variance removed.
Non-linear methods (splines, GAMs, trees) are more flexible than least squares and can capture complex patterns. They typically reduce bias but increase variance, so prediction improves when the reduction in bias outweighs the increase in variance. The sketch below illustrates the shrinkage side of this trade-off.
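A minimal simulated illustration of the lasso/ridge contrast, assuming glmnet is available; the data, seed, and lambda value are arbitrary and purely for demonstration. At a moderate lambda the lasso path zeros out the weak coefficients, while the ridge path keeps every coefficient nonzero.
library(glmnet)
set.seed(42)
n <- 100; p <- 10
x <- matrix(rnorm(n * p), n, p)
beta <- c(3, -2, 1.5, rep(0, p - 3))  # only the first three predictors matter
y <- x %*% beta + rnorm(n)
lasso.path <- glmnet(x, y, alpha = 1)  # L1 penalty
ridge.path <- glmnet(x, y, alpha = 0)  # L2 penalty
sum(coef(lasso.path, s = 0.5) != 0)  # lasso: weak coefficients typically dropped at this lambda
sum(coef(ridge.path, s = 0.5) != 0)  # ridge: all p + 1 = 11 terms remain nonzero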
library(ISLR2)   # College data
library(glmnet)  # ridge and the lasso
library(pls)     # PCR and PLS
set.seed(1)      # make the validation split reproducible
data(College)
train_idx <- sample(1:nrow(College), nrow(College) / 2)
train <- College[train_idx, ]
test <- College[-train_idx, ]
# Least squares on the training half
lm.fit <- lm(Apps ~ ., data = train)
lm.pred <- predict(lm.fit, newdata = test)
lm.mse <- mean((lm.pred - test$Apps)^2)
lm.mse
## [1] 1135758
# Design matrices for glmnet; drop the intercept column
x_train <- model.matrix(Apps ~ ., data = train)[, -1]
y_train <- train$Apps
x_test <- model.matrix(Apps ~ ., data = test)[, -1]
y_test <- test$Apps
# Ridge regression (alpha = 0); lambda chosen by cross-validation
cv.ridge <- cv.glmnet(x_train, y_train, alpha = 0)
best_lambda_ridge <- cv.ridge$lambda.min
ridge.pred <- predict(cv.ridge, s = best_lambda_ridge, newx = x_test)
ridge.mse <- mean((ridge.pred - y_test)^2)
ridge.mse
## [1] 976261.5
# The lasso (alpha = 1)
cv.lasso <- cv.glmnet(x_train, y_train, alpha = 1)
best_lambda_lasso <- cv.lasso$lambda.min
lasso.pred <- predict(cv.lasso, s = best_lambda_lasso, newx = x_test)
lasso.mse <- mean((lasso.pred - y_test)^2)
lasso.mse
## [1] 1115901
lasso.model <- glmnet(x_train, y_train, alpha = 1)
# 18 rows: the intercept plus 17 predictors
lasso.coef <- predict(lasso.model, type = "coefficients", s = best_lambda_lasso)[1:18, ]
num_nonzero <- sum(lasso.coef != 0)
num_nonzero
## [1] 18
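At lambda.min the lasso therefore retains all 18 coefficients and performs no variable selection on this split. cv.glmnet also reports lambda.1se, the largest lambda within one standard error of the minimum CV error; the coefficients at that value are typically sparser, at the cost of slightly higher CV error. A quick check, using the cv.lasso fit from above:
# One-standard-error rule: usually a sparser model than lambda.min
lasso.coef.1se <- predict(cv.lasso, s = cv.lasso$lambda.1se, type = "coefficients")
sum(lasso.coef.1se != 0)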
# Principal components regression; M chosen by cross-validated PRESS
pcr.fit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")
best_M_pcr <- which.min(pcr.fit$validation$PRESS)
pcr.pred <- predict(pcr.fit, newdata = test, ncomp = best_M_pcr)
pcr.mse <- mean((pcr.pred - test$Apps)^2)
pcr.mse
## [1] 1135758
best_M_pcr
## [1] 17
# Partial least squares; M chosen the same way
pls.fit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pls.fit, val.type = "MSEP")
best_M_pls <- which.min(pls.fit$validation$PRESS)
pls.pred <- predict(pls.fit, newdata = test, ncomp = best_M_pls)
pls.mse <- mean((pls.pred - test$Apps)^2)
pls.mse
## [1] 1135758
best_M_pls
## [1] 17
Both PCR and PLS select M = 17, the full set of predictors, so each reduces to the least squares fit; this is why their test MSEs match lm.mse exactly.
results <- data.frame(
  Method = c("Least Squares", "Ridge", "Lasso", "PCR", "PLS"),
  Test_MSE = c(lm.mse, ridge.mse, lasso.mse, pcr.mse, pls.mse)
)
results
## Method Test_MSE
## 1 Least Squares 1135758.3
## 2 Ridge 976261.5
## 3 Lasso 1115900.6
## 4 PCR 1135758.3
## 5 PLS 1135758.3
library(MASS)  # Boston data
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
data("Boston")
set.seed(1)
train_idx <- sample(1:nrow(Boston), nrow(Boston)/2)
train <- Boston[train_idx, ]
test <- Boston[-train_idx, ]
x_train <- model.matrix(crim ~ ., data = train)[, -1]
y_train <- train$crim
x_test <- model.matrix(crim ~ ., data = test)[, -1]
y_test <- test$crim
# install.packages("leaps")  # run once if leaps is not installed
library(leaps)
# Best subset selection; score every model size on the held-out half
regfit.full <- regsubsets(crim ~ ., data = train, nvmax = 13)
test.mat <- model.matrix(crim ~ ., data = test)
val.errors <- rep(NA, 13)
for (i in 1:13) {
  coefi <- coef(regfit.full, id = i)          # best i-variable model
  pred <- test.mat[, names(coefi)] %*% coefi  # manual prediction (leaps has no predict method for regsubsets)
  val.errors[i] <- mean((test$crim - pred)^2)
}
best_subset_mse <- min(val.errors)
best_subset_vars <- which.min(val.errors)
best_subset_mse
## [1] 40.14557
best_subset_vars
## [1] 1
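The validation-set approach favours a single-predictor model here. To see which predictor that is (its name is not shown above), inspect the coefficients of the selected model from the regfit.full object fitted earlier:
# Coefficients of the best one-variable model
coef(regfit.full, id = best_subset_vars)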
# Ridge regression on the Boston data
cv.ridge <- cv.glmnet(x_train, y_train, alpha = 0)
ridge.pred <- predict(cv.ridge, s = cv.ridge$lambda.min, newx = x_test)
ridge.mse <- mean((ridge.pred - y_test)^2)
ridge.mse
## [1] 40.92777
# The lasso on the Boston data
cv.lasso <- cv.glmnet(x_train, y_train, alpha = 1)
lasso.pred <- predict(cv.lasso, s = cv.lasso$lambda.min, newx = x_test)
lasso.mse <- mean((lasso.pred - y_test)^2)
lasso.mse
## [1] 40.90173
lasso.coef <- predict(cv.lasso, s = cv.lasso$lambda.min, type = "coefficients")
sum(lasso.coef != 0)
## [1] 12
# PCR on the Boston data; M again chosen by cross-validated PRESS
pcr.fit <- pcr(crim ~ ., data = train, scale = TRUE, validation = "CV")
best_M_pcr <- which.min(pcr.fit$validation$PRESS)
pcr.pred <- predict(pcr.fit, newdata = test, ncomp = best_M_pcr)
pcr.mse <- mean((pcr.pred - test$crim)^2)
pcr.mse
## [1] 41.54639
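The number of components selected for the Boston fit is not printed above; it can be inspected just as in the College analysis:
best_M_pcr  # M chosen by cross-validated PRESS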
boston_results <- data.frame(
  Method = c("Best Subset", "Ridge", "Lasso", "PCR"),
  Test_MSE = c(best_subset_mse, ridge.mse, lasso.mse, pcr.mse)
)
boston_results
## Method Test_MSE
## 1 Best Subset 40.14557
## 2 Ridge 40.92777
## 3 Lasso 40.90173
## 4 PCR 41.54639
Based on test MSE the four methods perform similarly. Ridge or lasso regression is likely the best choice, with the lasso offering the added benefit of variable selection. Best subset attains the lowest test error on this particular split, but because the model size was chosen on the same validation set, that result may overfit slightly.
# Nonzero lasso coefficients at lambda.min (sparse-matrix subsetting drops the names)
lasso.coef[lasso.coef != 0]
## [1] 17.65005513 0.03516255 -0.11838293 -0.43135144 -7.19578180 0.04271112
## [7] -0.76801501 0.52430211 -0.35072332 -0.01307754 0.25559458 -0.14805010
The lasso automatically selects a subset of predictors, which aids interpretability and reduces overfitting. Ridge, by contrast, shrinks every coefficient but retains all of the features, as the check below confirms.
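A quick check using the Boston cv.ridge fit from above; since ridge never sets coefficients exactly to zero, this should count the full set of terms (the intercept plus all 13 predictors):
# Ridge keeps every term, only shrinking the estimates
ridge.coef <- predict(cv.ridge, s = cv.ridge$lambda.min, type = "coefficients")
sum(ridge.coef != 0)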