Question 2: Conceptual Bias-Variance Tradeoff

(a) Lasso vs. Least Squares

Lasso is less flexible than least squares because it shrinks some coefficients exactly to zero via L1 regularization. This reduces variance but increases bias. Prediction improves if the bias increase is less than the variance reduction.
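To see this shrinkage concretely, here is a minimal sketch on simulated data (not part of the assignment; it assumes the glmnet package is installed):

library(glmnet)
set.seed(1)
x_toy <- matrix(rnorm(100 * 5), 100, 5)        # 5 predictors, only 2 truly matter
y_toy <- x_toy[, 1] + 0.5 * x_toy[, 2] + rnorm(100)
toy.lasso <- glmnet(x_toy, y_toy, alpha = 1)   # alpha = 1 fits the lasso
coef(toy.lasso, s = c(0.05, 0.5))              # larger lambda drives more coefficients to exactly zero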

(b) Ridge Regression vs. Least Squares

Ridge regression also reduces model flexibility through L2 regularization. It doesn’t set coefficients exactly to zero, but still reduces variance at the cost of some bias. Like lasso, performance improves if the bias added is less than the variance removed.
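Continuing the toy sketch from part (a), ridge (alpha = 0 in glmnet) shrinks the same coefficients toward zero but leaves them all nonzero:

toy.ridge <- glmnet(x_toy, y_toy, alpha = 0)   # alpha = 0 fits ridge regression
coef(toy.ridge, s = c(0.05, 5))                # coefficients shrink but never hit exactly zero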

(c) Non-linear Methods vs. Least Squares

Non-linear methods (like splines, GAMs, trees) are more flexible, capturing complex patterns. They usually reduce bias but increase variance. Prediction improves when bias reduction outweighs variance increase.
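As a rough illustration (again on simulated data, base R only), a smoothing spline with generous degrees of freedom can track a curved signal that a straight-line fit misses; the price is higher variance if the flexibility is overdone:

set.seed(1)
xs <- sort(runif(100, 0, 10))
ys <- sin(xs) + rnorm(100, sd = 0.3)
fit.linear <- lm(ys ~ xs)                       # rigid fit: low variance, high bias here
fit.spline <- smooth.spline(xs, ys, df = 10)    # flexible fit: lower bias, more variance
mean((fitted(fit.linear) - sin(xs))^2)          # squared error against the true signal
mean((predict(fit.spline, xs)$y - sin(xs))^2)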


Question 9: Predicting College Applications

(a) Data Splitting

library(ISLR2)   # College data
library(glmnet)  # ridge and lasso, parts (c) and (d)
library(pls)     # PCR and PLS, parts (e) and (f)

data(College)
train_idx <- sample(1:nrow(College), nrow(College)/2)
train <- College[train_idx, ]
test <- College[-train_idx, ]

(b) Linear Model (Least Squares)

lm.fit <- lm(Apps ~ ., data = train)
lm.pred <- predict(lm.fit, newdata = test)
lm.mse <- mean((lm.pred - test$Apps)^2)
lm.mse
## [1] 1135758

(c) Ridge Regression (CV-selected λ)

x_train <- model.matrix(Apps ~ ., data = train)[, -1]
y_train <- train$Apps
x_test <- model.matrix(Apps ~ ., data = test)[, -1]
y_test <- test$Apps

cv.ridge <- cv.glmnet(x_train, y_train, alpha = 0)
best_lambda_ridge <- cv.ridge$lambda.min

ridge.pred <- predict(cv.ridge, s = best_lambda_ridge, newx = x_test)
ridge.mse <- mean((ridge.pred - y_test)^2)
ridge.mse
## [1] 976261.5

(d) Lasso Regression (CV-selected λ)

cv.lasso <- cv.glmnet(x_train, y_train, alpha = 1)
best_lambda_lasso <- cv.lasso$lambda.min

lasso.pred <- predict(cv.lasso, s = best_lambda_lasso, newx = x_test)
lasso.mse <- mean((lasso.pred - y_test)^2)
lasso.mse
## [1] 1115901
lasso.model <- glmnet(x_train, y_train, alpha = 1)
lasso.coef <- predict(lasso.model, type = "coefficients", s = best_lambda_lasso)[1:18, ]
num_nonzero <- sum(lasso.coef != 0)
num_nonzero
## [1] 18
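The count of 18 includes the intercept, so at lambda.min the lasso keeps every predictor here. To see its selection effect, one could look at the more heavily penalized lambda.1se, which typically zeroes out more coefficients (a quick check using the objects above):

coef.1se <- predict(lasso.model, type = "coefficients", s = cv.lasso$lambda.1se)
sum(coef.1se != 0)   # usually fewer nonzero coefficients than at lambda.min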

(e) Principal Component Regression (PCR)

pcr.fit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")

best_M_pcr <- which.min(pcr.fit$validation$PRESS)
pcr.pred <- predict(pcr.fit, newdata = test, ncomp = best_M_pcr)
pcr.mse <- mean((pcr.pred - test$Apps)^2)
pcr.mse
## [1] 1135758
best_M_pcr
## [1] 17
With all 17 components retained, PCR reproduces the full least squares fit, which is why its test MSE matches part (b).

(f) Partial Least Squares (PLS)

pls.fit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pls.fit, val.type = "MSEP")

best_M_pls <- which.min(pls.fit$validation$PRESS)
pls.pred <- predict(pls.fit, newdata = test, ncomp = best_M_pls)
pls.mse <- mean((pls.pred - test$Apps)^2)
pls.mse
## [1] 1135758
best_M_pls
## [1] 17
Likewise, PLS with all 17 components is equivalent to least squares, so its test MSE also matches part (b).

(g) Comparison

results <- data.frame(
  Method = c("Least Squares", "Ridge", "Lasso", "PCR", "PLS"),
  Test_MSE = c(lm.mse, ridge.mse, lasso.mse, pcr.mse, pls.mse)
)
results
##          Method  Test_MSE
## 1 Least Squares 1135758.3
## 2         Ridge  976261.5
## 3         Lasso 1115900.6
## 4           PCR 1135758.3
## 5           PLS 1135758.3
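To put these errors on an interpretable scale, one could also report a test R^2, i.e. the fraction of the variance in Apps explained on the held-out data; a short sketch for the ridge fit, using the objects defined above:

tss <- mean((y_test - mean(y_test))^2)   # variance of Apps in the test set
1 - ridge.mse / tss                      # test R^2 for the ridge model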

Question 11: Predicting Crime Rate in Boston Data

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
data("Boston")

(a) Try Regression Methods

set.seed(1)


train_idx <- sample(1:nrow(Boston), nrow(Boston)/2)
train <- Boston[train_idx, ]
test <- Boston[-train_idx, ]


x_train <- model.matrix(crim ~ ., data = train)[, -1]
y_train <- train$crim
x_test <- model.matrix(crim ~ ., data = test)[, -1]
y_test <- test$crim

Best Subset Selection

# install.packages("leaps")  # run once if leaps is not already installed
library(leaps)
regfit.full <- regsubsets(crim ~ ., data = train, nvmax = 13)
test.mat <- model.matrix(crim ~ ., data = test)

val.errors <- rep(NA, 13)
for (i in 1:13) {
  coefi <- coef(regfit.full, id = i)
  pred <- test.mat[, names(coefi)] %*% coefi
  val.errors[i] <- mean((test$crim - pred)^2)
}
best_subset_mse <- min(val.errors)
best_subset_vars <- which.min(val.errors)
best_subset_mse
## [1] 40.14557
best_subset_vars
## [1] 1
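The minimum validation error occurs at a one-variable model. To see which predictor it uses, one can inspect that model's coefficients (a quick check, not shown in the original output):

coef(regfit.full, id = best_subset_vars)   # intercept plus the single selected predictor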

Ridge Regression

cv.ridge <- cv.glmnet(x_train, y_train, alpha = 0)
ridge.pred <- predict(cv.ridge, s = cv.ridge$lambda.min, newx = x_test)
ridge.mse <- mean((ridge.pred - y_test)^2)
ridge.mse
## [1] 40.92777

Lasso

cv.lasso <- cv.glmnet(x_train, y_train, alpha = 1)
lasso.pred <- predict(cv.lasso, s = cv.lasso$lambda.min, newx = x_test)
lasso.mse <- mean((lasso.pred - y_test)^2)
lasso.mse
## [1] 40.90173
lasso.coef <- predict(cv.lasso, s = cv.lasso$lambda.min, type = "coefficients")
sum(lasso.coef != 0)
## [1] 12

PCR

pcr.fit <- pcr(crim ~ ., data = train, scale = TRUE, validation = "CV")
best_M_pcr <- which.min(pcr.fit$validation$PRESS)
pcr.pred <- predict(pcr.fit, newdata = test, ncomp = best_M_pcr)
pcr.mse <- mean((pcr.pred - test$crim)^2)
pcr.mse
## [1] 41.54639

(b) Best Performing Model

boston_results <- data.frame(
  Method = c("Best Subset", "Ridge", "Lasso", "PCR"),
  Test_MSE = c(best_subset_mse, ridge.mse, lasso.mse, pcr.mse)
)
boston_results
##        Method Test_MSE
## 1 Best Subset 40.14557
## 2       Ridge 40.92777
## 3       Lasso 40.90173
## 4         PCR 41.54639

All four methods give similar test errors (roughly 40 to 42). Based on test MSE, ridge or lasso regression is a reasonable choice, with lasso offering the added benefit of variable selection. Best subset selection attains the lowest error on this particular split, but it does so with a single predictor, so its edge may not hold up across other splits or under cross-validation.

(c) Feature Inclusion Discussion

lasso.coef[lasso.coef != 0]   # nonzero coefficients (sparse-matrix subsetting drops the names)
##  [1] 17.65005513  0.03516255 -0.11838293 -0.43135144 -7.19578180  0.04271112
##  [7] -0.76801501  0.52430211 -0.35072332 -0.01307754  0.25559458 -0.14805010

The lasso automatically selects a subset of predictors: at the chosen lambda it keeps 12 nonzero coefficients (including the intercept) and sets the rest to zero, which aids interpretability and guards against overfitting. Ridge regression, by contrast, retains all features, shrinking their coefficients toward zero but never setting any exactly to zero.
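As a check on that last point, printing the ridge coefficients at the selected lambda should show every predictor retaining a nonzero, if heavily shrunken, coefficient (using the objects fit above):

ridge.coef <- predict(cv.ridge, type = "coefficients", s = cv.ridge$lambda.min)
sum(ridge.coef != 0)   # all coefficients, including the intercept, remain nonzero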