The correct answer is iii. The lasso adds an L1 penalty to the least squares objective, making it less flexible than least squares; it improves prediction accuracy when the resulting increase in bias is smaller than the decrease in variance.
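In symbols, the lasso minimizes the residual sum of squares plus an L1 penalty on the coefficients, and a larger $\lambda$ forces more coefficients exactly to zero:

$$\min_{\beta}\; \sum_{i=1}^{n}\Big(y_i - \beta_0 - \sum_{j=1}^{p}\beta_j x_{ij}\Big)^2 + \lambda\sum_{j=1}^{p}|\beta_j|$$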
The correct answer is iii. Ridge regression adds an L2 penalty to the least squares objective, so it too is less flexible than least squares and improves prediction accuracy when the increase in bias is smaller than the decrease in variance. Unlike the lasso, it shrinks coefficients toward zero but never exactly to zero.
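Ridge regression uses the same criterion with a squared (L2) penalty instead, which is why coefficients shrink toward zero but never reach it exactly:

$$\min_{\beta}\; \sum_{i=1}^{n}\Big(y_i - \beta_0 - \sum_{j=1}^{p}\beta_j x_{ij}\Big)^2 + \lambda\sum_{j=1}^{p}\beta_j^2$$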
The correct answer is i. Non-linear methods are more flexible than linear models, so they have lower bias but higher variance; they improve prediction accuracy when the decrease in bias exceeds the increase in variance.
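This trade-off is captured by the expected test MSE decomposition at a point $x_0$:

$$E\big[(y_0 - \hat{f}(x_0))^2\big] = \mathrm{Var}\big(\hat{f}(x_0)\big) + \big[\mathrm{Bias}\big(\hat{f}(x_0)\big)\big]^2 + \mathrm{Var}(\varepsilon)$$

so the extra flexibility pays off only when the drop in squared bias outweighs the rise in variance.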
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
set.seed(1)
train_index <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
train <- College[train_index, ]
test <- College[-train_index, ]
lm_fit <- lm(Apps ~ ., data = train)
lm_pred <- predict(lm_fit, newdata = test)
lm_mse <- mean((lm_pred - test$Apps)^2)
lm_mse
## [1] 921637.5
install.packages("glmnet")
## Installing package into 'C:/Users/austr/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'glmnet' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\austr\AppData\Local\Temp\RtmpMv8wNb\downloaded_packages
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
x_train <- model.matrix(Apps ~ ., data = train)[, -1]
y_train <- train$Apps
x_test <- model.matrix(Apps ~ ., data = test)[, -1]
y_test <- test$Apps
set.seed(1)
ridge_cv <- cv.glmnet(x_train, y_train, alpha = 0)
best_lambda_ridge <- ridge_cv$lambda.min
ridge_pred <- predict(ridge_cv, s = best_lambda_ridge, newx = x_test)
ridge_mse <- mean((ridge_pred - y_test)^2)
ridge_mse
## [1] 1032561
The ridge regression test MSE is 1032561, somewhat higher than the least squares fit.
set.seed(1)
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1)
best_lambda_lasso <- lasso_cv$lambda.min
lasso_pred <- predict(lasso_cv, s = best_lambda_lasso, newx = x_test)
lasso_mse <- mean((lasso_pred - y_test)^2)
lasso_mse
## [1] 924712.9
lasso_coef <- predict(lasso_cv, s = best_lambda_lasso, type = "coefficients")
nonzero_count <- sum(lasso_coef != 0) - 1
nonzero_count
## [1] 17
The lasso test MSE is 924712.9, and there are 17 non-zero coefficient estimates; at the chosen lambda the lasso keeps every predictor, so its fit is very close to least squares.
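As a quick sketch (using the lasso_coef object computed above), the retained predictors can be listed directly; here all 17 survive:

# list the predictors with non-zero lasso coefficients (intercept excluded)
coefs <- as.matrix(lasso_coef)
setdiff(rownames(coefs)[coefs != 0], "(Intercept)")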
install.packages("pls")
## Installing package into 'C:/Users/austr/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'pls' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\austr\AppData\Local\Temp\RtmpMv8wNb\downloaded_packages
library(pls)
## Warning: package 'pls' was built under R version 4.4.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
pcr_fit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
summary(pcr_fit)
## Data: X dimension: 545 17
## Y dimension: 545 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4012 3956 2197 2198 1862 1787 1774
## adjCV 4012 3956 2194 2195 1850 1777 1768
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1745 1741 1663 1653 1675 1677 1692
## adjCV 1734 1733 1657 1646 1669 1670 1685
## 14 comps 15 comps 16 comps 17 comps
## CV 1687 1572 1268 1229
## adjCV 1683 1542 1257 1220
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 30.774 56.86 63.89 69.96 75.50 80.34 83.92 87.39
## Apps 3.484 71.35 71.53 80.87 82.05 82.35 83.27 83.42
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.39 92.72 94.89 96.66 97.87 98.70 99.34
## Apps 84.72 84.91 84.95 84.97 84.97 85.21 91.17
## 16 comps 17 comps
## X 99.85 100.00
## Apps 92.67 92.95
validationplot(pcr_fit, val.type = "MSEP")
best_m_pcr <- which.min(pcr_fit$validation$PRESS)
pcr_pred <- predict(pcr_fit, newdata = test, ncomp = best_m_pcr)
pcr_mse <- mean((pcr_pred - test$Apps)^2)
pcr_mse
## [1] 921637.5
The test MSE is 921637.5, and the value of M selected by cross-validation is 17. Since M = 17 uses all of the principal components, PCR here reduces to the full least squares fit, which is why the two test MSEs are identical.
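A quick sketch to confirm that equivalence with the objects above; it should report TRUE up to floating-point tolerance:

# PCR with all 17 components should reproduce the least squares predictions
all.equal(as.numeric(pcr_pred), as.numeric(lm_pred))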
set.seed(1)
pls_fit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
summary(pls_fit)
## Data: X dimension: 545 17
## Y dimension: 545 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4012 2003 1700 1602 1510 1325 1260
## adjCV 4012 1999 1693 1594 1492 1308 1249
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1250 1245 1237 1235 1233 1230 1229
## adjCV 1240 1235 1227 1225 1223 1220 1219
## 14 comps 15 comps 16 comps 17 comps
## CV 1229 1229 1229 1229
## adjCV 1219 1220 1220 1220
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 26.22 38.96 62.40 65.22 67.99 73.15 77.62 81.12
## Apps 76.71 84.67 86.89 90.38 92.42 92.75 92.81 92.85
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.38 86.63 89.56 90.79 92.46 94.70 96.64
## Apps 92.89 92.92 92.93 92.95 92.95 92.95 92.95
## 16 comps 17 comps
## X 98.78 100.00
## Apps 92.95 92.95
validationplot(pls_fit, val.type = "MSEP")
best_m_pls <- which.min(pls_fit$validation$PRESS)
pls_pred <- predict(pls_fit, newdata = test, ncomp = best_m_pls)
pls_mse <- mean((pls_pred - test$Apps)^2)
pls_mse
## [1] 921438
The test MSE is 921438, and the value of M selected by cross-validation is 13.
data.frame(
Method = c("Least Squares", "Ridge", "Lasso", "PCR", "PLS"),
Test_MSE = c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse)
)
## Method Test_MSE
## 1 Least Squares 921637.5
## 2 Ridge 1032561.1
## 3 Lasso 924712.9
## 4 PCR 921637.5
## 5 PLS 921438.0
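To put these MSE values on a more interpretable scale, one rough check is the test-set R-squared for each model. This is a sketch using the predictions and MSE objects computed above (test_var is a new helper defined here):

# test-set R^2 = 1 - test MSE / variance of the test response
test_var <- mean((test$Apps - mean(test$Apps))^2)
1 - c(LS = lm_mse, Ridge = ridge_mse, Lasso = lasso_mse,
      PCR = pcr_mse, PLS = pls_mse) / test_var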
We can predict the number of applications reasonably well. The lowest test MSE, 921438.0, comes from PLS, with least squares and PCR following closely behind, so there is very little difference among these models; only ridge regression performs noticeably worse.

## 11.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:ISLR2':
##
## Boston
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.2
library(glmnet)
library(pls)
library(caret)
library(tidyverse)
data(Boston)
set.seed(1)
train_index <- createDataPartition(Boston$crim, p = 0.7, list = FALSE)
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
regfit_full <- regsubsets(crim ~ ., data = train, nvmax = 13)
reg_summary <- summary(regfit_full)
which.min(reg_summary$bic)
## [1] 2
test_mat <- model.matrix(crim ~ ., data = test)
coef_best <- coef(regfit_full, id = which.min(reg_summary$bic))
pred_best <- test_mat[, names(coef_best)] %*% coef_best
mse_best <- mean((test$crim - pred_best)^2)
mse_best
## [1] 22.15037
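The coef_best object computed above already holds the BIC-selected model, so printing it shows which two predictors survive best subset selection:

# coefficients of the two-variable model chosen by BIC
coef_best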
x_train <- model.matrix(crim ~ ., train)[, -1]
y_train <- train$crim
x_test <- model.matrix(crim ~ ., test)[, -1]
y_test <- test$crim
ridge_cv <- cv.glmnet(x_train, y_train, alpha = 0)
ridge_lambda <- ridge_cv$lambda.min
ridge_pred <- predict(ridge_cv, s = ridge_lambda, newx = x_test)
ridge_mse <- mean((ridge_pred - y_test)^2)
ridge_mse
## [1] 20.51066
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1)
lasso_lambda <- lasso_cv$lambda.min
lasso_pred <- predict(lasso_cv, s = lasso_lambda, newx = x_test)
lasso_mse <- mean((lasso_pred - y_test)^2)
lasso_coef <- predict(lasso_cv, s = lasso_lambda, type = "coefficients")
lasso_nonzero <- sum(lasso_coef != 0) - 1
lasso_mse
## [1] 20.45048
lasso_nonzero
## [1] 12
pcr_fit <- pcr(crim ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pcr_fit, val.type = "MSEP")
best_m_pcr <- which.min(pcr_fit$validation$PRESS)
pcr_pred <- predict(pcr_fit, test, ncomp = best_m_pcr)
pcr_mse <- mean((pcr_pred - test$crim)^2)
pcr_mse
## [1] 21.8159
data.frame(
Method = c("Best Subset", "Ridge", "Lasso", "PCR"),
Test_MSE = c(mse_best, ridge_mse, lasso_mse, pcr_mse),
Nonzero_Coefficients = c(NA, 13, lasso_nonzero, best_m_pcr)
)
## Method Test_MSE Nonzero_Coefficients
## 1 Best Subset 22.15037 NA
## 2 Ridge 20.51066 13
## 3 Lasso 20.45048 12
## 4 PCR 21.81590 13
Out of all the models, the lasso has the best performance, with the lowest test MSE of 20.45. It also reduces the number of predictors from 13 to 12, which makes the model simpler and easier to interpret. (In the table above, the PCR entry reports the number of components used rather than non-zero coefficients, and the BIC-selected best subset model keeps 2 predictors.)
The final model does not use all the features. Since I picked the lasso, the model performs variable selection by shrinking some coefficients exactly to zero, which guards against overfitting and improves interpretability.
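As a sketch of that variable selection (using the lasso_coef object from the Boston fit above; which predictor gets dropped can vary with the CV folds):

# predictor(s) whose lasso coefficient is exactly zero
coefs <- as.matrix(lasso_coef)
rownames(coefs)[coefs == 0]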