The lasso adds an L1 penalty to the loss function that can shrink some coefficients exactly to zero, reducing model flexibility compared to ordinary least squares (OLS). OLS has higher variance because it fits the training data closely, especially when predictors are highly correlated or when the number of predictors is large relative to the sample size. The lasso introduces some bias but reduces variance.
Ridge regression adds an L2 penalty, shrinking coefficients toward zero but never exactly to zero. Like the lasso, it is less flexible than OLS. Ridge introduces bias but reduces variance, which is especially valuable when multicollinearity is present.
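For reference, the two penalized least-squares criteria can be written out explicitly (standard notation, not taken from the code below; lambda is the tuning parameter):

$$
\text{lasso: } \min_{\beta}\; \sum_{i=1}^{n}\Big(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\Big)^{2}+\lambda\sum_{j=1}^{p}|\beta_j|,
\qquad
\text{ridge: } \min_{\beta}\; \sum_{i=1}^{n}\Big(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\Big)^{2}+\lambda\sum_{j=1}^{p}\beta_j^{2}.
$$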
Non-linear methods are generally more flexible than linear methods such as OLS, so they tend to have lower bias but higher variance. Their predictions improve over OLS only when the decrease in bias outweighs the increase in variance.
library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library(MASS)
library(leaps)
set.seed(123)
data(College)
train_index <- sample(1:nrow(College), nrow(College) / 2)
train <- College[train_index, ]
test <- College[-train_index, ]
x_train <- model.matrix(Apps ~ ., data = train)[, -1]
y_train <- train$Apps
x_test <- model.matrix(Apps ~ ., data = test)[, -1]
y_test <- test$Apps
lm_fit <- lm(Apps ~ ., data = train)
lm_pred <- predict(lm_fit, newdata = test)
lm_mse <- mean((lm_pred - y_test)^2)
cat("(b) Test MSE for linear regression:", lm_mse, "\n")
## (b) Test MSE for linear regression: 1373995
set.seed(123)
grid <- 10^seq(10, -2, length = 100)
ridge_mod <- cv.glmnet(x_train, y_train, alpha = 0, lambda = grid, standardize = TRUE)
ridge_best_lambda <- ridge_mod$lambda.min
cat("(c) Best lambda for ridge regression:", ridge_best_lambda, "\n")
## (c) Best lambda for ridge regression: 24.77076
ridge_pred <- predict(ridge_mod, s = ridge_best_lambda, newx = x_test)
ridge_mse <- mean((ridge_pred - y_test)^2)
cat("(c) Test MSE for ridge regression:", ridge_mse, "\n")
## (c) Test MSE for ridge regression: 1451546
set.seed(123)
lasso_mod <- cv.glmnet(x_train, y_train, alpha = 1, lambda = grid, standardize = TRUE)
lasso_best_lambda <- lasso_mod$lambda.min
cat("(c) Best lambda for lasso:", lasso_best_lambda, "\n")
## (c) Best lambda for lasso: 14.17474
lasso_pred <- predict(lasso_mod, s = lasso_best_lambda, newx = x_test)
lasso_mse <- mean((lasso_pred - y_test)^2)
cat("Test MSE for lasso:", lasso_mse, "\n")
## Test MSE for lasso: 1390403
lasso_coefs <- predict(lasso_mod, s = lasso_best_lambda, type = "coefficients")
non_zero <- sum(lasso_coefs != 0) - 1
cat("Number of non-zero coefficients in lasso:", non_zero, "\n")
## Number of non-zero coefficients in lasso: 12
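To see which predictors the lasso actually retains at the selected lambda, the non-zero entries of the coefficient vector could be listed. This is a sketch reusing the lasso_coefs object created above; the helper name kept is just for illustration, and the exact set of variables depends on the random split.
kept <- rownames(lasso_coefs)[as.vector(lasso_coefs) != 0]  # names of non-zero coefficients
setdiff(kept, "(Intercept)")                                # drop the intercept from the list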
set.seed(123)
pcr_fit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pcr_fit, val.type = "MSEP")
pcr_best_m <- which.min(pcr_fit$validation$PRESS)
pcr_pred <- predict(pcr_fit, newdata = test, ncomp = pcr_best_m)
pcr_mse <- mean((pcr_pred - y_test)^2)
cat("Test MSE for PCR:", pcr_mse, "with M =", pcr_best_m, "\n")
## Test MSE for PCR: 1373995 with M = 17
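Because cross-validation selects M = 17, PCR uses every principal component, so the fit is equivalent to the full least-squares model; this is why its test MSE matches the linear regression MSE reported above.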
set.seed(123)
pls_fit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pls_fit, val.type = "MSEP")
pls_best_m <- which.min(pls_fit$validation$PRESS)
pls_pred <- predict(pls_fit, newdata = test, ncomp = pls_best_m)
pls_mse <- mean((pls_pred - y_test)^2)
cat("Test MSE for PLS:", pls_mse, "with M =", pls_best_m, "\n")
## Test MSE for PLS: 1384151 with M = 10
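On this split the five approaches give similar test errors: PCR with M = 17 reproduces least squares (about 1.37 million), PLS and the lasso are close behind, and ridge is slightly worse (about 1.45 million).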
set.seed(123)
data(Boston)
# regsubsets objects have no predict() method, so define one:
# rebuild the model matrix for the new data and multiply by the
# coefficients of the model of size `id`.
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefs <- coef(object, id = id)
  vars <- names(coefs)
  mat[, vars] %*% coefs
}
# 10-fold cross-validation over all subset sizes 1..p
k <- 10
p <- ncol(Boston) - 1
folds <- sample(1:k, nrow(Boston), replace = TRUE)
cv.errors <- matrix(NA, k, p)
for (j in 1:k) {
  best.fit <- regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = p)
  for (i in 1:p) {
    pred <- predict.regsubsets(best.fit, Boston[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Boston$crim[folds == j] - pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
best.nvars <- which.min(mean.cv.errors)
cat("Best subset selection chose", best.nvars, "variables.\n")
## Best subset selection chose 8 variables.
plot(mean.cv.errors, pch = 19, type = "b")
set.seed(123)
x <- model.matrix(crim ~ ., Boston)[, -1]
y <- Boston$crim
cv.ridge <- cv.glmnet(x, y, alpha = 0, type.measure = "mse")
plot(cv.ridge)
# cvm stores MSE; take the square root to report RMSE at lambda.1se
ridge_rmse <- sqrt(cv.ridge$cvm[cv.ridge$lambda == cv.ridge$lambda.1se])
cat("Ridge Regression CV RMSE:", ridge_rmse, "\n")
## Ridge Regression CV RMSE: 7.398632
cv.lasso <- cv.glmnet(x, y, alpha = 1, type.measure = "mse")
plot(cv.lasso)
lasso_rmse <- sqrt(cv.lasso$cvm[cv.lasso$lambda == cv.lasso$lambda.1se])
cat("Lasso Regression CV RMSE:", lasso_rmse, "\n")
## Lasso Regression CV RMSE: 7.572919
pcr.fit <- pcr(crim ~ ., data = Boston, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "RMSEP")
# RMSEP() already reports root-mean-squared error; take the minimum over component counts
pcr_rmse <- min(RMSEP(pcr.fit)$val[1, 1, ])
cat("PCR CV RMSE:", pcr_rmse, "\n")
## PCR CV RMSE: 6.514599
summary(pcr.fit)
## Data: X dimension: 506 13
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.61 7.199 7.202 6.765 6.753 6.756 6.749
## adjCV 8.61 7.197 7.199 6.761 6.746 6.752 6.744
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.746 6.618 6.632 6.622 6.613 6.586 6.515
## adjCV 6.741 6.612 6.626 6.615 6.607 6.578 6.506
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.70 60.36 69.67 76.45 82.99 88.00 91.14 93.45
## crim 30.69 30.87 39.27 39.61 39.61 39.86 40.14 42.47
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.40 97.04 98.46 99.52 100.0
## crim 42.55 42.78 43.04 44.13 45.4
PCR achieves the lowest cross-validated RMSE (about 6.51), but only when all 13 components are used, so it offers little dimension reduction over the full least-squares fit. Even so, among the methods compared here, the PCR model performs best according to cross-validation.
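For a compact side-by-side view, the cross-validated RMSEs computed above could be gathered in one place. This is a sketch reusing existing objects (the cv_rmse name is just for illustration); the best-subset errors are stored as MSEs, so the square root is taken, and the folds differ across methods, so the comparison is only approximate.
cv_rmse <- c(
  best_subset = sqrt(min(mean.cv.errors)),  # best-subset CV errors were MSEs
  ridge = ridge_rmse,
  lasso = lasso_rmse,
  pcr = pcr_rmse
)
sort(cv_rmse)  # smallest cross-validated RMSE first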