library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(leaps)
## Warning: package 'leaps' was built under R version 4.3.3
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8

Lab 7

8. In this exercise, we will generate simulated data, and will then use this data to perform best subset selection. (a) Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector ϵ of length n = 100.

set.seed(1)
x <- rnorm(100)
epsilon <- rnorm(100)
  1. Generate a response vector Y of length n = 100 according to the model Y = β0 + β1X + β2X2 + β3X3 + ϵ, where β0, β1, β2, and β3 are constants of your choice.
beta0 <- 4
beta1 <- 2
beta2 <- 1
beta3 <- -0.5

Y <- beta0 + (beta1 * x) + (beta2 * x^2) + (beta3 * x^3) + epsilon 
  1. Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors X, X2,…,X10. What is the best model obtained according to Cp, BIC, and adjusted R2? Show some plots to provide evidence for your answer, and report the coeffcients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both X and Y .
data.full <- data.frame(y = Y, x = x)
regfit.full <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10)
reg.summary <- summary(regfit.full)

par(mfrow = c(2, 2))
plot(reg.summary$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red", cex = 2, pch = 20)
mtext("Plots of C_p, BIC, and adjusted R^2 for best subset selection", side = 3, line = -2, outer = TRUE)

##We find that with C_p and adjusted r^2 we would pick the 4-variable model, but with BIC we would pick the 3-variable model.The best model has 4 variables.

coef(regfit.full, which.max(reg.summary$adjr2))
## (Intercept)           x      I(x^2)      I(x^3)      I(x^5) 
##  4.07200775  2.38745596  0.84575641 -0.94202574  0.08072292
  1. Repeat (c), using forward stepwise selection and also using backwards stepwise selection. How does your answer compare to the results in (c)?
regfit.fwd <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10, method = "forward")
reg.summary.fwd <- summary(regfit.fwd)

par(mfrow = c(2, 2))
plot(reg.summary.fwd$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary.fwd$cp), reg.summary.fwd$cp[which.min(reg.summary.fwd$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary.fwd$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary.fwd$bic), reg.summary.fwd$bic[which.min(reg.summary.fwd$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary.fwd$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary.fwd$adjr2), reg.summary.fwd$adjr2[which.max(reg.summary.fwd$adjr2)], col = "red", cex = 2, pch = 20)
mtext("Plots of C_p, BIC and adjusted R^2 for forward stepwise selection", side = 3, line = -2, outer = TRUE)

##We obtain the same results with forward stepwise selection. The model with 4 variables is the best fit.

regfit.bwd <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10, method = "backward")
reg.summary.bwd <- summary(regfit.bwd)

par(mfrow = c(2, 2))
plot(reg.summary.bwd$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary.bwd$cp), reg.summary.bwd$cp[which.min(reg.summary.bwd$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary.bwd$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary.bwd$bic), reg.summary.bwd$bic[which.min(reg.summary.bwd$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary.bwd$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary.bwd$adjr2), reg.summary.bwd$adjr2[which.max(reg.summary.bwd$adjr2)], col = "red", cex = 2, pch = 20)
mtext("Plots of C_p, BIC and adjusted R^2 for backward stepwise selection", side = 3, line = -2, outer = TRUE)

  1. Now fit a lasso model to the simulated data, again using X, X2, …,X10 as predictors. Use cross-validation to select the optimal value of λ. Create plots of the cross-validation error as a function of λ. Report the resulting coeffcient estimates, and discuss the results obtained.
xmat <- model.matrix(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full)[, -1]
cv.lasso <- cv.glmnet(xmat, Y, alpha = 1)
plot(cv.lasso)

bestlam <- cv.lasso$lambda.min
bestlam
## [1] 0.02263167
fit.lasso <- glmnet(xmat, Y, alpha = 1)
predict(fit.lasso, s = bestlam, type = "coefficients")[1:11, ]
##   (Intercept)             x        I(x^2)        I(x^3)        I(x^4) 
##  4.1487840948  1.8829647714  0.7143433309 -0.4586384985  0.0000000000 
##        I(x^5)        I(x^6)        I(x^7)        I(x^8)        I(x^9) 
##  0.0000000000  0.0042021942  0.0000000000  0.0006359117  0.0000000000 
##       I(x^10) 
##  0.0000000000
  1. Now generate a response vector Y according to the model Y = β0 + β7X7 + ϵ, and perform best subset selection and the lasso. Discuss the results obtained.
beta7 <- 7
y <- beta0 + beta7 * x^7 + epsilon
data.full <- data.frame(y = y, x = x)
regfit.full <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10)
reg.summary <- summary(regfit.full)
par(mfrow = c(2, 2))
plot(reg.summary$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red", cex = 2, pch = 20)

xmat <- model.matrix(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full)[, -1]
cv.lasso <- cv.glmnet(xmat, y, alpha = 1)
bestlam <- cv.lasso$lambda.min
bestlam
## [1] 12.36884
fit.lasso <- glmnet(xmat, y, alpha = 1)
predict(fit.lasso, s = bestlam, type = "coefficients")[1:11, ]
## (Intercept)           x      I(x^2)      I(x^3)      I(x^4)      I(x^5) 
##    4.820215    0.000000    0.000000    0.000000    0.000000    0.000000 
##      I(x^6)      I(x^7)      I(x^8)      I(x^9)     I(x^10) 
##    0.000000    6.796694    0.000000    0.000000    0.000000

9. In this exercise, we will predict the number of applications received using the other variables in the College data set. (a) Split the data set into a training set and a test set.

set.seed(10)
train_set <- sample(1:nrow(College), nrow(College)/2)
test_set <- (-train_set)

train <- College[train_set, ]
test <- College[test_set, ]
  1. Fit a linear model using least squares on the training set, and report the test error obtained.
lm <- lm(Apps ~ ., data = train)
pred_lm <- predict(lm, test)
mean((pred_lm - test$Apps)^2)
## [1] 1020100
train_matrix <- model.matrix(Apps ~ ., data = train)
test_matrix <- model.matrix(Apps ~ ., data = test)
grid <- 10 ^ seq(4, -2, length = 100)
fit_ridge <- glmnet(train_matrix, train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
cv_ridge <- cv.glmnet(train_matrix, train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
bestlam <- cv_ridge$lambda.min 
bestlam
## [1] 0.01
pred_ridge <- predict(fit_ridge, s = bestlam, newx = test_matrix)
mean((pred_ridge - test$Apps)^2)
## [1] 1020090
fit_lasso <- glmnet(train_matrix, train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
cv_lasso <- cv.glmnet(train_matrix, train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
bestlam_lasso <- cv_lasso$lambda.min
bestlam_lasso
## [1] 0.01
pred_lasso <- predict(fit_lasso, s = bestlam_lasso, newx = test_matrix)
mean((pred_lasso - test$Apps)^2)
## [1] 1020097
library(pls)
## Warning: package 'pls' was built under R version 4.3.3
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
fit_pcr <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(fit_pcr, val.type = "MSEP")

pred_pcr <- predict(fit_pcr, test, ncomp = 10)
mean((pred_pcr - test$Apps)^2)
## [1] 1422699
fit_pls <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(fit_pls, val.type = "MSEP")

pred_pls <- predict(fit_pls, test, ncomp = 10)
mean((pred_pls - test$Apps)^2)
## [1] 1029442
test.avg <- mean(test$Apps)
lm.r2 <- 1 - mean((pred_lm - test$Apps)^2) / mean((test.avg - test$Apps)^2)
ridge.r2 <- 1 - mean((pred_ridge - test$Apps)^2) / mean((test.avg - test$Apps)^2)
lasso.r2 <- 1 - mean((pred_lasso - test$Apps)^2) / mean((test.avg - test$Apps)^2)
pcr.r2 <- 1 - mean((pred_pcr - test$Apps)^2) / mean((test.avg - test$Apps)^2)
pls.r2 <- 1 - mean((pred_pls - test$Apps)^2) / mean((test.avg - test$Apps)^2)
print(lm.r2)
## [1] 0.9076134
print(ridge.r2)
## [1] 0.9076143
print(lasso.r2)
## [1] 0.9076137
print(pcr.r2)
## [1] 0.8711516
print(pls.r2)
## [1] 0.9067674

All models except for PCR predict college applications with over 90% accuracy. The difference is small between the other four model approaches.