In this exercise, we generate simulated data and then use it to perform best subset selection.
options(repos = c(CRAN = "https://cran.rstudio.com/"))
install.packages("leaps")
install.packages("pls")
library(ISLR2)
library(glmnet)
library(leaps)
library(pls)
set.seed(42)
x <- rnorm(100)   # predictor
ep <- rnorm(100)  # noise term
y <- 2 + 3 * x - 2 * x^2 + 0.5 * x^3 + ep  # true model: cubic in x
dat <- data.frame(x, y)
summary(regsubsets(y ~ poly(x, 10, raw = TRUE), data = dat))
## Subset selection object
## Call: regsubsets.formula(y ~ poly(x, 10, raw = TRUE), data = dat)
## 10 Variables (and intercept)
## Forced in Forced out
## poly(x, 10, raw = TRUE)1 FALSE FALSE
## poly(x, 10, raw = TRUE)2 FALSE FALSE
## poly(x, 10, raw = TRUE)3 FALSE FALSE
## poly(x, 10, raw = TRUE)4 FALSE FALSE
## poly(x, 10, raw = TRUE)5 FALSE FALSE
## poly(x, 10, raw = TRUE)6 FALSE FALSE
## poly(x, 10, raw = TRUE)7 FALSE FALSE
## poly(x, 10, raw = TRUE)8 FALSE FALSE
## poly(x, 10, raw = TRUE)9 FALSE FALSE
## poly(x, 10, raw = TRUE)10 FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## poly(x, 10, raw = TRUE)1 poly(x, 10, raw = TRUE)2
## 1 ( 1 ) " " " "
## 2 ( 1 ) "*" "*"
## 3 ( 1 ) "*" "*"
## 4 ( 1 ) "*" "*"
## 5 ( 1 ) "*" "*"
## 6 ( 1 ) "*" "*"
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## poly(x, 10, raw = TRUE)3 poly(x, 10, raw = TRUE)4
## 1 ( 1 ) "*" " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) "*" " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) " " " "
## 6 ( 1 ) " " "*"
## 7 ( 1 ) " " " "
## 8 ( 1 ) "*" " "
## poly(x, 10, raw = TRUE)5 poly(x, 10, raw = TRUE)6
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## poly(x, 10, raw = TRUE)7 poly(x, 10, raw = TRUE)8
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## poly(x, 10, raw = TRUE)9 poly(x, 10, raw = TRUE)10
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" " "
## 8 ( 1 ) "*" " "
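The table above only shows which variables enter the best model of each size. To settle on a single size, we can compare the fit statistics stored in the summary object; a minimal sketch reusing the fit from above:
reg_summary <- summary(regsubsets(y ~ poly(x, 10, raw = TRUE), data = dat))
which.min(reg_summary$cp)     # model size minimizing Mallows' Cp
which.min(reg_summary$bic)    # model size minimizing BIC
which.max(reg_summary$adjr2)  # model size maximizing adjusted R^2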
summary(regsubsets(y ~ poly(x, 10, raw = TRUE), data = dat, method = "forward"))
## Subset selection object
## Call: regsubsets.formula(y ~ poly(x, 10, raw = TRUE), data = dat, method = "forward")
## 10 Variables (and intercept)
## Forced in Forced out
## poly(x, 10, raw = TRUE)1 FALSE FALSE
## poly(x, 10, raw = TRUE)2 FALSE FALSE
## poly(x, 10, raw = TRUE)3 FALSE FALSE
## poly(x, 10, raw = TRUE)4 FALSE FALSE
## poly(x, 10, raw = TRUE)5 FALSE FALSE
## poly(x, 10, raw = TRUE)6 FALSE FALSE
## poly(x, 10, raw = TRUE)7 FALSE FALSE
## poly(x, 10, raw = TRUE)8 FALSE FALSE
## poly(x, 10, raw = TRUE)9 FALSE FALSE
## poly(x, 10, raw = TRUE)10 FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## poly(x, 10, raw = TRUE)1 poly(x, 10, raw = TRUE)2
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " "*"
## 3 ( 1 ) "*" "*"
## 4 ( 1 ) "*" "*"
## 5 ( 1 ) "*" "*"
## 6 ( 1 ) "*" "*"
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## poly(x, 10, raw = TRUE)3 poly(x, 10, raw = TRUE)4
## 1 ( 1 ) "*" " "
## 2 ( 1 ) "*" " "
## 3 ( 1 ) "*" " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## poly(x, 10, raw = TRUE)5 poly(x, 10, raw = TRUE)6
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" " "
## 8 ( 1 ) "*" " "
## poly(x, 10, raw = TRUE)7 poly(x, 10, raw = TRUE)8
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" " "
## 8 ( 1 ) "*" " "
## poly(x, 10, raw = TRUE)9 poly(x, 10, raw = TRUE)10
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" " "
## 8 ( 1 ) "*" "*"
summary(regsubsets(y ~ poly(x, 10, raw = TRUE), data = dat, method = "backward"))
## Subset selection object
## Call: regsubsets.formula(y ~ poly(x, 10, raw = TRUE), data = dat, method = "backward")
## 10 Variables (and intercept)
## Forced in Forced out
## poly(x, 10, raw = TRUE)1 FALSE FALSE
## poly(x, 10, raw = TRUE)2 FALSE FALSE
## poly(x, 10, raw = TRUE)3 FALSE FALSE
## poly(x, 10, raw = TRUE)4 FALSE FALSE
## poly(x, 10, raw = TRUE)5 FALSE FALSE
## poly(x, 10, raw = TRUE)6 FALSE FALSE
## poly(x, 10, raw = TRUE)7 FALSE FALSE
## poly(x, 10, raw = TRUE)8 FALSE FALSE
## poly(x, 10, raw = TRUE)9 FALSE FALSE
## poly(x, 10, raw = TRUE)10 FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: backward
## poly(x, 10, raw = TRUE)1 poly(x, 10, raw = TRUE)2
## 1 ( 1 ) "*" " "
## 2 ( 1 ) "*" "*"
## 3 ( 1 ) "*" "*"
## 4 ( 1 ) "*" "*"
## 5 ( 1 ) "*" "*"
## 6 ( 1 ) "*" "*"
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## poly(x, 10, raw = TRUE)3 poly(x, 10, raw = TRUE)4
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## 6 ( 1 ) " " " "
## 7 ( 1 ) " " " "
## 8 ( 1 ) " " " "
## poly(x, 10, raw = TRUE)5 poly(x, 10, raw = TRUE)6
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) "*" " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" "*"
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## poly(x, 10, raw = TRUE)7 poly(x, 10, raw = TRUE)8
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## poly(x, 10, raw = TRUE)9 poly(x, 10, raw = TRUE)10
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" " "
## 8 ( 1 ) "*" "*"
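Forward selection and the exhaustive search both recover the true three-variable model (\(x\), \(x^2\), \(x^3\)), while backward selection substitutes \(x^5\) for \(x^3\) at size three. A minimal sketch for extracting the size-three coefficients from each method (refitting the objects, since only their summaries were printed above):
fit_full <- regsubsets(y ~ poly(x, 10, raw = TRUE), data = dat)
fit_bwd <- regsubsets(y ~ poly(x, 10, raw = TRUE), data = dat, method = "backward")
coef(fit_full, 3)  # exhaustive search: terms in x, x^2, x^3
coef(fit_bwd, 3)   # backward selection: terms in x, x^2, x^5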
res <- cv.glmnet(poly(dat$x, 10, raw = TRUE), dat$y, alpha = 1)  # lasso (alpha = 1) with CV over lambda
(best <- res$lambda.min)  # lambda minimizing the CV error
## [1] 0.09804219
plot(res)
out <- glmnet(poly(dat$x, 10, raw = TRUE), dat$y, alpha = 1, lambda = best)
predict(out, type = "coefficients", s = best)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.8457308
## 1 2.9092918
## 2 -1.9287428
## 3 0.5161012
## 4 .
## 5 .
## 6 .
## 7 .
## 8 .
## 9 .
## 10 .
With the CV-selected \(\lambda\), the lasso retains exactly three predictors, \(x\), \(x^2\), and \(x^3\), matching the true model used in the simulation. The estimated coefficients (2.9, -1.9, and 0.5) are close to the true values (3, -2, and 0.5).
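As an informal check, we can overlay the lasso fit on the true cubic; a minimal sketch using the single-lambda fit out from above (the grid xs is illustrative):
xs <- seq(min(dat$x), max(dat$x), length.out = 200)
plot(dat$x, dat$y, col = "grey", xlab = "x", ylab = "y")
lines(xs, 2 + 3 * xs - 2 * xs^2 + 0.5 * xs^3, col = "blue")  # true curve
lines(xs, as.numeric(predict(out, poly(xs, 10, raw = TRUE))), col = "red")  # lasso fit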
dat$y <- 2 - 2 * x^2 + 0.2 * x^7 + ep  # new response: only the degree-2 and degree-7 terms
summary(regsubsets(y ~ poly(x, 10, raw = TRUE), data = dat))
## Subset selection object
## Call: regsubsets.formula(y ~ poly(x, 10, raw = TRUE), data = dat)
## 10 Variables (and intercept)
## Forced in Forced out
## poly(x, 10, raw = TRUE)1 FALSE FALSE
## poly(x, 10, raw = TRUE)2 FALSE FALSE
## poly(x, 10, raw = TRUE)3 FALSE FALSE
## poly(x, 10, raw = TRUE)4 FALSE FALSE
## poly(x, 10, raw = TRUE)5 FALSE FALSE
## poly(x, 10, raw = TRUE)6 FALSE FALSE
## poly(x, 10, raw = TRUE)7 FALSE FALSE
## poly(x, 10, raw = TRUE)8 FALSE FALSE
## poly(x, 10, raw = TRUE)9 FALSE FALSE
## poly(x, 10, raw = TRUE)10 FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## poly(x, 10, raw = TRUE)1 poly(x, 10, raw = TRUE)2
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " "*"
## 3 ( 1 ) " " "*"
## 4 ( 1 ) " " "*"
## 5 ( 1 ) " " "*"
## 6 ( 1 ) " " "*"
## 7 ( 1 ) " " "*"
## 8 ( 1 ) " " "*"
## poly(x, 10, raw = TRUE)3 poly(x, 10, raw = TRUE)4
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) "*" " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" "*"
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" " "
## 8 ( 1 ) "*" "*"
## poly(x, 10, raw = TRUE)5 poly(x, 10, raw = TRUE)6
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" "*"
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## poly(x, 10, raw = TRUE)7 poly(x, 10, raw = TRUE)8
## 1 ( 1 ) "*" " "
## 2 ( 1 ) "*" " "
## 3 ( 1 ) "*" " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## 6 ( 1 ) " " "*"
## 7 ( 1 ) " " "*"
## 8 ( 1 ) " " "*"
## poly(x, 10, raw = TRUE)9 poly(x, 10, raw = TRUE)10
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
res <- cv.glmnet(poly(dat$x, 10, raw = TRUE), dat$y, alpha = 1)
(best <- res$lambda.min)
## [1] 1.126906
plot(res)
out <- glmnet(poly(dat$x, 10, raw = TRUE), dat$y, alpha = 1, lambda = best)
predict(out, type = "coefficients", s = best)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.061389580
## 1 .
## 2 -0.883080980
## 3 .
## 4 -0.121018425
## 5 0.028984084
## 6 -0.009540039
## 7 0.188796928
## 8 .
## 9 .
## 10 .
Here the lasso does not recover the simulated model exactly: in addition to \(x^2\) and \(x^7\), it retains small coefficients on \(x^4\), \(x^5\), and \(x^6\), none of which appear in the data-generating model, and the retained coefficients are shrunk toward zero relative to the true values.
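A sparser fit can often be obtained with the one-standard-error rule, which selects the largest \(\lambda\) whose CV error is within one standard error of the minimum; a minimal sketch using the cv.glmnet object res from above:
predict(res, type = "coefficients", s = res$lambda.1se)  # coefficients under the 1-SE rule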
In this exercise, we will predict the number of applications received using the other variables in the College data set.
set.seed(42)
train <- sample(nrow(College), nrow(College) * 2 / 3)  # two-thirds training split
test <- setdiff(seq_len(nrow(College)), train)
mse <- list()  # collects the test MSE of each method
fit <- lm(Apps ~ ., data = College[train, ])  # baseline: least squares on all predictors
(mse$lm <- mean((predict(fit, College[test, ]) - College$Apps[test])^2))
## [1] 1695269
mm <- model.matrix(Apps ~ ., data = College[train, ])  # includes an intercept column; glmnet fits its own intercept, so that column gets a zero coefficient
fit2 <- cv.glmnet(mm, College$Apps[train], alpha = 0)  # ridge (alpha = 0)
p <- predict(fit2, model.matrix(Apps ~ ., data = College[test, ]), s = fit2$lambda.min)
(mse$ridge <- mean((p - College$Apps[test])^2))
## [1] 2804369
fit3 <- cv.glmnet(mm, College$Apps[train], alpha = 1)  # lasso (alpha = 1), reusing mm from above
p <- predict(fit3, model.matrix(Apps ~ ., data = College[test, ]), s = fit3$lambda.min)
(mse$lasso <- mean((p - College$Apps[test])^2))
## [1] 1822322
fit4 <- pcr(Apps ~ ., data = College[train, ], scale = TRUE, validation = "CV")  # principal components regression
validationplot(fit4, val.type = "MSEP")
p <- predict(fit4, College[test, ], ncomp = 17)  # all 17 components
(mse$pcr <- mean((p - College$Apps[test])^2))
## [1] 1695269
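Note that ncomp = 17 retains every principal component, in which case PCR spans the same column space as the unpenalized linear model and should reproduce its test MSE; a quick check (assuming agreement up to the default numerical tolerance):
all.equal(mse$pcr, mse$lm)  # PCR with all components is equivalent to least squares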
fit5 <- plsr(Apps ~ ., data = College[train, ], scale = TRUE, validation = "CV")  # partial least squares
validationplot(fit5, val.type = "MSEP")
p <- predict(fit5, College[test, ], ncomp = 12)  # 12 components, chosen from the validation plot
(mse$pls <- mean((p - College$Apps[test])^2))
## [1] 1696902
barplot(unlist(mse), ylab = "Test MSE", horiz = TRUE)
The unpenalized linear model and PCR give the lowest test errors here (identical, as noted above), PLS with 12 components is nearly as good, the lasso does somewhat worse, and ridge regression produces the highest test error in this specific case with this particular seed.
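To put the MSE differences on a more interpretable scale, we can also express each model's performance as test \(R^2\); a minimal sketch (sst here is the test-set variance around its own mean):
sst <- mean((College$Apps[test] - mean(College$Apps[test]))^2)
sapply(mse, function(m) 1 - m / sst)  # test R^2 for each method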