Lab 7

8. In this exercise, we will generate simulated data, and will then use this data to perform best subset selection. (a) Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector ϵ of length n = 100.

set.seed(1)
x <- rnorm(100)
epsilon <- rnorm(100)

Generate a response vector Y of length n = 100 according to the model Y = β0 + β1X + β2X2 + β3X3 + ϵ, where β0, β1, β2, and β3 are constants of your choice.

beta0 <- 4
beta1 <- 2
beta2 <- 1
beta3 <- -0.5

Y <- beta0 + (beta1 * x) + (beta2 * x^2) + (beta3 * x^3) + epsilon

Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors X, X2,…,X10. What is the best model obtained according to Cp, BIC, and adjusted R2? Show some plots to provide evidence for your answer, and report the coeffcients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both X and Y .

data.full <- data.frame(y = Y, x = x)
regfit.full <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10)
reg.summary <- summary(regfit.full)

par(mfrow = c(2, 2))
plot(reg.summary$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red", cex = 2, pch = 20)
mtext("Plots of C_p, BIC, and adjusted R^2 for best subset selection", side = 3, line = -2, outer = TRUE)

##We find that with C_p and adjusted r^2 we would pick the 4-variable model, but with BIC we would pick the 3-variable model.The best model has 4 variables.

coef(regfit.full, which.max(reg.summary$adjr2))

## (Intercept)           x      I(x^2)      I(x^3)      I(x^5) 
##  4.07200775  2.38745596  0.84575641 -0.94202574  0.08072292

Repeat (c), using forward stepwise selection and also using backwards stepwise selection. How does your answer compare to the results in (c)?

regfit.fwd <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10, method = "forward")
reg.summary.fwd <- summary(regfit.fwd)

par(mfrow = c(2, 2))
plot(reg.summary.fwd$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary.fwd$cp), reg.summary.fwd$cp[which.min(reg.summary.fwd$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary.fwd$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary.fwd$bic), reg.summary.fwd$bic[which.min(reg.summary.fwd$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary.fwd$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary.fwd$adjr2), reg.summary.fwd$adjr2[which.max(reg.summary.fwd$adjr2)], col = "red", cex = 2, pch = 20)
mtext("Plots of C_p, BIC and adjusted R^2 for forward stepwise selection", side = 3, line = -2, outer = TRUE)

##We obtain the same results with forward stepwise selection. The model with 4 variables is the best fit.

regfit.bwd <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10, method = "backward")
reg.summary.bwd <- summary(regfit.bwd)

par(mfrow = c(2, 2))
plot(reg.summary.bwd$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary.bwd$cp), reg.summary.bwd$cp[which.min(reg.summary.bwd$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary.bwd$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary.bwd$bic), reg.summary.bwd$bic[which.min(reg.summary.bwd$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary.bwd$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary.bwd$adjr2), reg.summary.bwd$adjr2[which.max(reg.summary.bwd$adjr2)], col = "red", cex = 2, pch = 20)
mtext("Plots of C_p, BIC and adjusted R^2 for backward stepwise selection", side = 3, line = -2, outer = TRUE)

Now fit a lasso model to the simulated data, again using X, X2, …,X10 as predictors. Use cross-validation to select the optimal value of λ. Create plots of the cross-validation error as a function of λ. Report the resulting coeffcient estimates, and discuss the results obtained.

xmat <- model.matrix(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full)[, -1]
cv.lasso <- cv.glmnet(xmat, Y, alpha = 1)
plot(cv.lasso)

bestlam <- cv.lasso$lambda.min
bestlam

## [1] 0.02263167

fit.lasso <- glmnet(xmat, Y, alpha = 1)
predict(fit.lasso, s = bestlam, type = "coefficients")[1:11, ]

##   (Intercept)             x        I(x^2)        I(x^3)        I(x^4) 
##  4.1487840948  1.8829647714  0.7143433309 -0.4586384985  0.0000000000 
##        I(x^5)        I(x^6)        I(x^7)        I(x^8)        I(x^9) 
##  0.0000000000  0.0042021942  0.0000000000  0.0006359117  0.0000000000 
##       I(x^10) 
##  0.0000000000

Now generate a response vector Y according to the model Y = β0 + β7X7 + ϵ, and perform best subset selection and the lasso. Discuss the results obtained.

beta7 <- 7
y <- beta0 + beta7 * x^7 + epsilon
data.full <- data.frame(y = y, x = x)
regfit.full <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10)
reg.summary <- summary(regfit.full)
par(mfrow = c(2, 2))
plot(reg.summary$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red", cex = 2, pch = 20)

xmat <- model.matrix(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full)[, -1]
cv.lasso <- cv.glmnet(xmat, y, alpha = 1)
bestlam <- cv.lasso$lambda.min
bestlam

## [1] 12.36884

fit.lasso <- glmnet(xmat, y, alpha = 1)
predict(fit.lasso, s = bestlam, type = "coefficients")[1:11, ]

## (Intercept)           x      I(x^2)      I(x^3)      I(x^4)      I(x^5) 
##    4.820215    0.000000    0.000000    0.000000    0.000000    0.000000 
##      I(x^6)      I(x^7)      I(x^8)      I(x^9)     I(x^10) 
##    0.000000    6.796694    0.000000    0.000000    0.000000

9. In this exercise, we will predict the number of applications received using the other variables in the College data set. (a) Split the data set into a training set and a test set.

set.seed(10)
train_set <- sample(1:nrow(College), nrow(College)/2)
test_set <- (-train_set)

train <- College[train_set, ]
test <- College[test_set, ]

Fit a linear model using least squares on the training set, and report the test error obtained.

lm <- lm(Apps ~ ., data = train)
pred_lm <- predict(lm, test)
mean((pred_lm - test$Apps)^2)

## [1] 1020100

The test error obtained is 1.0201 x 10^6.
1. Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

train_matrix <- model.matrix(Apps ~ ., data = train)
test_matrix <- model.matrix(Apps ~ ., data = test)
grid <- 10 ^ seq(4, -2, length = 100)
fit_ridge <- glmnet(train_matrix, train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
cv_ridge <- cv.glmnet(train_matrix, train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
bestlam <- cv_ridge$lambda.min 
bestlam

## [1] 0.01

pred_ridge <- predict(fit_ridge, s = bestlam, newx = test_matrix)
mean((pred_ridge - test$Apps)^2)

## [1] 1020090

The test error is smaller for ridge regression than least squares at 1.02009 x 10^6.
1. Fit a lasso model on the training set, with λ chosen by crossvalidation. Report the test error obtained, along with the number of non-zero coeffcient estimates.

fit_lasso <- glmnet(train_matrix, train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
cv_lasso <- cv.glmnet(train_matrix, train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
bestlam_lasso <- cv_lasso$lambda.min
bestlam_lasso

## [1] 0.01

pred_lasso <- predict(fit_lasso, s = bestlam_lasso, newx = test_matrix)
mean((pred_lasso - test$Apps)^2)

## [1] 1020097

The test error is smaller for lasso than for least squares at 1.007079 x 10^6.
1. Fit a PCR model on the training set, with M chosen by crossvalidation. Report the test error obtained, along with the value of M selected by cross-validation.

library(pls)

## Warning: package 'pls' was built under R version 4.3.3

## 
## Attaching package: 'pls'

## The following object is masked from 'package:stats':
## 
##     loadings

fit_pcr <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(fit_pcr, val.type = "MSEP")

pred_pcr <- predict(fit_pcr, test, ncomp = 10)
mean((pred_pcr - test$Apps)^2)

## [1] 1422699

The test error is higher than that of least squares at 1.422699 x 10^6.
1. Fit a PLS model on the training set, with M chosen by crossvalidation. Report the test error obtained, along with the value of M selected by cross-validation.

fit_pls <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(fit_pls, val.type = "MSEP")

pred_pls <- predict(fit_pls, test, ncomp = 10)
mean((pred_pls - test$Apps)^2)

## [1] 1029442

The test error is higher than that of least squares at 1.029442 x 10^6.
1. Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these fve approaches?

test.avg <- mean(test$Apps)
lm.r2 <- 1 - mean((pred_lm - test$Apps)^2) / mean((test.avg - test$Apps)^2)
ridge.r2 <- 1 - mean((pred_ridge - test$Apps)^2) / mean((test.avg - test$Apps)^2)
lasso.r2 <- 1 - mean((pred_lasso - test$Apps)^2) / mean((test.avg - test$Apps)^2)
pcr.r2 <- 1 - mean((pred_pcr - test$Apps)^2) / mean((test.avg - test$Apps)^2)
pls.r2 <- 1 - mean((pred_pls - test$Apps)^2) / mean((test.avg - test$Apps)^2)
print(lm.r2)

## [1] 0.9076134

print(ridge.r2)

## [1] 0.9076143

print(lasso.r2)

## [1] 0.9076137

print(pcr.r2)

## [1] 0.8711516

print(pls.r2)

## [1] 0.9067674

All models except for PCR predict college applications with over 90% accuracy. The difference is small between the other four model approaches.

Lab 7

Brianna Stopher

2025-03-31

Lab 7