Question 8

In this exercise, we will generate simulated data, and will then use this data to perform best subset selection.

Part A.

Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector ϵ of length n = 100.

set.seed(1)
n <- 100
X <- rnorm(n)
epsilon <- rnorm(n)

Part B.

Generate a response vector Y of length n = 100 according to the model Y = β0 + β1X + β2X² + β3X³ + ϵ, where β0, β1, β2, and β3 are constants of your choice.

# Choose arbitrary coefficients
beta0 <- 1
beta1 <- 2
beta2 <- 3
beta3 <- 4

Y <- beta0 + beta1*X + beta2*X^2 + beta3*X^3 + epsilon

Part C.

Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors X, X², …, X¹⁰. What is the best model obtained according to Cp, BIC, and adjusted R²? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both X and Y.

library(leaps)
# Create dataset with polynomial terms up to X^10
data <- data.frame(Y = Y, X = X)
for (i in 2:10) {
  data[[paste0("X", i)]] <- X^i
}

# Run best subset selection
regfit.full <- regsubsets(Y ~ ., data = data, nvmax = 10)

# Summary statistics
reg.summary <- summary(regfit.full)

# Plots
par(mfrow = c(2, 2))
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "b")
which.min(reg.summary$cp)
## [1] 4
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "b")
which.min(reg.summary$bic)
## [1] 3
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted R^2", type = "b")
which.max(reg.summary$adjr2)
## [1] 4
# Best model coefficients
coef(regfit.full, which.max(reg.summary$adjr2))
## (Intercept)           X          X2          X3          X5 
##  1.07200775  2.38745596  2.84575641  3.55797426  0.08072292

Cp and adjusted R² both point to the four-variable model shown above (X, X2, X3, X5), while BIC prefers a three-variable model. The estimates for X, X2, and X3 are close to the true values of 2, 3, and 4, and the spurious X5 term receives a very small coefficient.
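
For completeness, the coefficients of the BIC-preferred model can be extracted the same way (output not rerun here; based on the selection above it should retain X, X2, and X3):

# Coefficients of the three-variable model preferred by BIC
coef(regfit.full, which.min(reg.summary$bic))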

Part D.

Repeat (c), using forward stepwise selection and also using backward stepwise selection. How does your answer compare to the results in (c)?

Forward and backward stepwise selection give essentially the same answer as best subset selection in part (c): Cp and adjusted R² again select the four-variable model, and BIC again selects the three-variable model, as the outputs below confirm.

# Forward Stepwise Selection
regfit.fwd <- regsubsets(Y ~ ., data = data, nvmax = 10, method = "forward")
fwd.summary <- summary(regfit.fwd)

# Backward Stepwise Selection
regfit.bwd <- regsubsets(Y ~ ., data = data, nvmax = 10, method = "backward")
bwd.summary <- summary(regfit.bwd)

# Plotting: Cp, BIC, Adjusted R² for forward and backward
par(mfrow = c(3, 2))  # 3 rows, 2 columns

# Forward selection plots
plot(fwd.summary$cp, xlab = "Number of Variables", ylab = "Cp", main = "Forward Stepwise - Cp", type = "b")
plot(fwd.summary$bic, xlab = "Number of Variables", ylab = "BIC", main = "Forward Stepwise - BIC", type = "b")
plot(fwd.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted R²", main = "Forward Stepwise - Adjusted R²", type = "b")

# Backward selection plots
plot(bwd.summary$cp, xlab = "Number of Variables", ylab = "Cp", main = "Backward Stepwise - Cp", type = "b")
plot(bwd.summary$bic, xlab = "Number of Variables", ylab = "BIC", main = "Backward Stepwise - BIC", type = "b")
plot(bwd.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted R²", main = "Backward Stepwise - Adjusted R²", type = "b")

which.min(fwd.summary$cp)
## [1] 4
which.min(fwd.summary$bic)
## [1] 3
which.max(fwd.summary$adjr2)
## [1] 4
which.min(bwd.summary$cp)
## [1] 4
which.min(bwd.summary$bic)
## [1] 3
which.max(bwd.summary$adjr2)
## [1] 4
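
Since all three methods agree on the model sizes, it is worth confirming that they also choose the same variables; the coefficient estimates can be compared directly (a quick check, output not rerun here):

# Compare the four-variable models chosen by each method
coef(regfit.full, 4)
coef(regfit.fwd, 4)
coef(regfit.bwd, 4)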

Part E.

Now fit a lasso model to the simulated data, again using X, X², …, X¹⁰ as predictors. Use cross-validation to select the optimal value of λ. Create plots of the cross-validation error as a function of λ. Report the resulting coefficient estimates, and discuss the results obtained.

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Create model matrix
X.mat <- model.matrix(Y ~ . - 1, data = data)

# Lasso with cross-validation
cv.lasso <- cv.glmnet(X.mat, Y, alpha = 1)

# Plot CV error vs lambda
plot(cv.lasso)

# Coefficients
coef(cv.lasso, s = "lambda.min")
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                      s1
## (Intercept) 1.168794337
## X           2.164793590
## X2          2.639485133
## X3          3.800683773
## X4          0.041512567
## X5          0.014068421
## X6          .          
## X7          0.004039751
## X8          .          
## X9          .          
## X10         .

The lasso recovers the true structure well: the estimates for X, X2, and X3 are close to the true values of 2, 3, and 4 (shrunk somewhat toward zero), while the remaining terms are either exactly zero or negligible (X4, X5, X7). In other words, the lasso performs variable selection automatically, at the cost of some shrinkage bias in the retained coefficients.
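
Using the one-standard-error rule instead of lambda.min typically yields an even sparser model (a quick check; output not rerun here):

# Sparser fit under the one-standard-error rule
coef(cv.lasso, s = "lambda.1se")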

Part F.

Now generate a response vector Y according to the model Y = β0 + β7X⁷ + ϵ, and perform best subset selection and the lasso. Discuss the results obtained.

# Generate new Y with beta7 = 7
beta7 <- 7
Y_new <- beta0 + beta7 * X^7 + epsilon
data$Y_new <- Y_new

# Best Subset
regfit.new <- regsubsets(Y_new ~ . - Y, data = data, nvmax = 10)
summary(regfit.new)
## Subset selection object
## Call: regsubsets.formula(Y_new ~ . - Y, data = data, nvmax = 10)
## 10 Variables  (and intercept)
##     Forced in Forced out
## X       FALSE      FALSE
## X2      FALSE      FALSE
## X3      FALSE      FALSE
## X4      FALSE      FALSE
## X5      FALSE      FALSE
## X6      FALSE      FALSE
## X7      FALSE      FALSE
## X8      FALSE      FALSE
## X9      FALSE      FALSE
## X10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
##           X   X2  X3  X4  X5  X6  X7  X8  X9  X10
## 1  ( 1 )  " " " " " " " " " " " " "*" " " " " " "
## 2  ( 1 )  " " "*" " " " " " " " " "*" " " " " " "
## 3  ( 1 )  " " "*" " " " " "*" " " "*" " " " " " "
## 4  ( 1 )  "*" "*" "*" " " " " " " "*" " " " " " "
## 5  ( 1 )  "*" "*" "*" "*" " " " " "*" " " " " " "
## 6  ( 1 )  "*" " " "*" " " " " "*" "*" "*" " " "*"
## 7  ( 1 )  "*" " " "*" " " "*" "*" "*" "*" " " "*"
## 8  ( 1 )  "*" "*" "*" "*" " " "*" "*" "*" " " "*"
## 9  ( 1 )  "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
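The table shows that X7 enters first, but the criterion values are not printed above. A quick check of BIC would confirm the preferred model size (output not rerun here; given the sparse true model, the one-variable model containing only X7 is the expected winner):

# Model size preferred by BIC for the new response
reg.summary.new <- summary(regfit.new)
which.min(reg.summary.new$bic)
coef(regfit.new, which.min(reg.summary.new$bic))
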
# Lasso
cv.lasso.new <- cv.glmnet(X.mat, Y_new, alpha = 1)
plot(cv.lasso.new)

coef(cv.lasso.new, s = "lambda.min")
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                   s1
## (Intercept) 1.820215
## X           .       
## X2          .       
## X3          .       
## X4          .       
## X5          .       
## X6          .       
## X7          6.796694
## X8          .       
## X9          .       
## X10         .

Both methods identify the true model: best subset selection chooses X7 as the single best one-variable model, and the lasso zeroes out every coefficient except X7, whose estimate (6.80) is close to the true value of 7, again shrunk slightly toward zero. With a sparse true model, the lasso's variable selection works exactly as intended.

Question 9

In this exercise, we will predict the number of applications received using the other variables in the College data set.

Part A.

Split the data set into a training set and a test set.

library(ISLR2)
set.seed(1)

train <- sample(1:nrow(College), nrow(College) / 2)
test <- -train

train_data <- College[train, ]
test_data <- College[test, ]

Part B.

Fit a linear model using least squares on the training set, and report the test error obtained.

lm.fit <- lm(Apps ~ ., data = train_data)
lm.pred <- predict(lm.fit, newdata = test_data)
mean((lm.pred - test_data$Apps)^2)  # Test MSE
## [1] 1135758

Part C.

Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

# Design matrices (model.matrix() includes an intercept column; glmnet fits
# its own intercept and leaves the constant column at zero)
train.mat <- model.matrix(Apps ~ ., data = train_data)
test.mat <- model.matrix(Apps ~ ., data = test_data)

ridge.cv <- cv.glmnet(train.mat, train_data$Apps, alpha = 0)
ridge.pred <- predict(ridge.cv, s = ridge.cv$lambda.min, newx = test.mat)
mean((ridge.pred - test_data$Apps)^2)
## [1] 976261.5
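
The λ selected by cross-validation can be inspected directly (value not shown in the original run):

# Shrinkage parameter chosen by cross-validation
ridge.cv$lambda.min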

Part D.

Fit a lasso model on the training set, with λ chosen by cross validation. Report the test error obtained, along with the number of non-zero coefficient estimates.

lasso.cv <- cv.glmnet(train.mat, train_data$Apps, alpha = 1)
lasso.pred <- predict(lasso.cv, s = lasso.cv$lambda.min, newx = test.mat)
mean((lasso.pred - test_data$Apps)^2)
## [1] 1115901
# Number of non-zero coefficients
sum(coef(lasso.cv, s = lasso.cv$lambda.min) != 0)
## [1] 18
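
The count above includes the intercept. To see which predictors the lasso actually kept, the non-zero rows of the coefficient vector can be listed (a quick sketch; output not rerun here):

# Names of the predictors with non-zero lasso coefficients
lasso.coef <- coef(lasso.cv, s = lasso.cv$lambda.min)
rownames(lasso.coef)[as.vector(lasso.coef != 0)]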

Part E.

Fit a PCR model on the training set, with M chosen by cross validation. Report the test error obtained, along with the value of M selected by cross-validation.

library(pls)

# Fit PCR with CV
pcr.fit <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Extract MSE: MSEP()$val is a 3-D array (estimate x response x components)
mse_obj <- MSEP(pcr.fit)$val

# Check structure of MSE
print(dim(mse_obj))
## [1]  2  1 18
# CV error across components; the components dimension starts at the
# 0-component (intercept-only) model
cv_mse <- mse_obj["CV", 1, ]

# Find best number of components (subtract 1 because index 1 is 0 components)
opt_ncomp <- which.min(cv_mse) - 1

# Predict on test data
pcr.pred <- predict(pcr.fit, newdata = test_data, ncomp = opt_ncomp)

# Compute test MSE
test_mse <- mean((pcr.pred - test_data$Apps)^2)

# Output results
cat("Optimal number of components:", opt_ncomp, "\n")
cat("Test MSE for PCR:", round(test_mse, 2), "\n")
# Plot MSE
validationplot(pcr.fit, val.type = "MSEP", main = "PCR Cross-Validation Error")
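
As an alternative to taking the global CV minimum, the pls package offers selectNcomp(), which applies a randomization test or a one-sigma rule to choose a more parsimonious M (a sketch; output not rerun here):

# Parsimonious choice of M via the one-sigma heuristic
selectNcomp(pcr.fit, method = "onesigma", plot = FALSE)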

Part F.

Fit a PLS model on the training set, with M chosen by cross validation. Report the test error obtained, along with the value of M selected by cross-validation.

library(pls)

# Fit PLS model with cross-validation
pls.fit <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Plot CV MSEP
validationplot(pls.fit, val.type = "MSEP", main = "PLS Cross-Validation Error")

# Extract MSE (3-D array: estimate x response x components)
mse_obj <- MSEP(pls.fit)$val
print(dim(mse_obj))
## [1]  2  1 18
# CV error across components (index 1 is the 0-component model)
cv_mse <- mse_obj["CV", 1, ]

# Find optimal number of components (subtract 1 for the 0-component model)
opt_ncomp <- which.min(cv_mse) - 1

# Predict on test data
pls.pred <- predict(pls.fit, newdata = test_data, ncomp = opt_ncomp)

# Compute test MSE
test_mse <- mean((pls.pred - test_data$Apps)^2)

# Output results
cat("Optimal number of components (PLS):", opt_ncomp, "\n")
cat("Test MSE for PLS:", round(test_mse, 2), "\n")

Part G.

Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

Test errors from the runs above:

- Least squares: 1,135,758 (baseline model using all predictors)
- Ridge regression: 976,261.5 (lowest test error; shrinkage improves on least squares)
- Lasso: 1,115,901 (similar to least squares; 18 non-zero coefficients, including the intercept)

Ridge performs best here. A test MSE near 976,000 corresponds to a root-mean-squared error of just under 1,000 applications, against a mean of roughly 3,000 applications in the College data, so the models predict reasonably accurately but far from exactly. The differences among these three approaches are modest. The PCR and PLS test errors depend on the number of components M selected by cross-validation: with too few components both underfit badly, while with most of the components retained they approach the least-squares fit.