The lasso adds an L1 penalty to the loss function that can shrink some coefficients exactly to zero, reducing model flexibility compared to ordinary least squares (OLS). OLS has higher variance because it fits the training data closely, especially when predictors are highly correlated or when the number of predictors is large relative to the sample size. The lasso introduces some bias but reduces variance.
Ridge regression adds an L2 penalty, shrinking coefficients toward zero but never exactly to zero. Like the lasso, it is less flexible than OLS. Ridge introduces bias but reduces variance, which is especially valuable when multicollinearity is present.
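For reference, the two penalized least-squares criteria can be written out explicitly (standard notation, not taken from the code below; lambda is the tuning parameter):

$$
\text{lasso: } \min_{\beta}\; \sum_{i=1}^{n}\Big(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\Big)^{2}+\lambda\sum_{j=1}^{p}|\beta_j|,
\qquad
\text{ridge: } \min_{\beta}\; \sum_{i=1}^{n}\Big(y_i-\beta_0-\sum_{j=1}^{p}\beta_j x_{ij}\Big)^{2}+\lambda\sum_{j=1}^{p}\beta_j^{2}.
$$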
Non-linear methods are generally more flexible than linear methods such as OLS, so they tend to have lower bias but higher variance. Their predictions improve over OLS only when the decrease in bias outweighs the increase in variance.
library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library(MASS)
library(leaps)
set.seed(123)
data(College)
train_index <- sample(1:nrow(College), nrow(College) / 2)
train <- College[train_index, ]
test <- College[-train_index, ]
x_train <- model.matrix(Apps ~ ., data = train)[, -1]
y_train <- train$Apps
x_test <- model.matrix(Apps ~ ., data = test)[, -1]
y_test <- test$Apps
lm_fit <- lm(Apps ~ ., data = train)
lm_pred <- predict(lm_fit, newdata = test)
lm_mse <- mean((lm_pred - y_test)^2)
cat("(b) Test MSE for linear regression:", lm_mse, "\n")
## (b) Test MSE for linear regression: 1373995
set.seed(123)
grid <- 10^seq(10, -2, length = 100)
ridge_mod <- cv.glmnet(x_train, y_train, alpha = 0, lambda = grid, standardize = TRUE)
ridge_best_lambda <- ridge_mod$lambda.min
cat("(c) Best lambda for ridge regression:", ridge_best_lambda, "\n")
## (c) Best lambda for ridge regression: 24.77076
ridge_pred <- predict(ridge_mod, s = ridge_best_lambda, newx = x_test)
ridge_mse <- mean((ridge_pred - y_test)^2)
cat("(c) Test MSE for ridge regression:", ridge_mse, "\n")
## (c) Test MSE for ridge regression: 1451546
set.seed(123)
lasso_mod <- cv.glmnet(x_train, y_train, alpha = 1, lambda = grid, standardize = TRUE)
lasso_best_lambda <- lasso_mod$lambda.min
cat("(c) Best lambda for lasso:", lasso_best_lambda, "\n")
## (c) Best lambda for lasso: 14.17474
lasso_pred <- predict(lasso_mod, s = lasso_best_lambda, newx = x_test)
lasso_mse <- mean((lasso_pred - y_test)^2)
cat("Test MSE for lasso:", lasso_mse, "\n")
## Test MSE for lasso: 1390403
lasso_coefs <- predict(lasso_mod, s = lasso_best_lambda, type = "coefficients")
non_zero <- sum(lasso_coefs != 0) - 1
cat("Number of non-zero coefficients in lasso:", non_zero, "\n")
## Number of non-zero coefficients in lasso: 12
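To see which predictors the lasso actually retains at the selected lambda, the non-zero entries of the coefficient vector could be listed. This is a sketch reusing the lasso_coefs object created above; the helper name kept is just for illustration, and the exact set of variables depends on the random split.
kept <- rownames(lasso_coefs)[as.vector(lasso_coefs) != 0]  # names of non-zero coefficients
setdiff(kept, "(Intercept)")                                # drop the intercept from the list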
set.seed(123)
pcr_fit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pcr_fit, val.type = "MSEP")
pcr_best_m <- which.min(pcr_fit$validation$PRESS)
pcr_pred <- predict(pcr_fit, newdata = test, ncomp = pcr_best_m)
pcr_mse <- mean((pcr_pred - y_test)^2)
cat("Test MSE for PCR:", pcr_mse, "with M =", pcr_best_m, "\n")
## Test MSE for PCR: 1373995 with M = 17
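Because cross-validation selects M = 17, PCR uses every principal component, so the fit is equivalent to the full least-squares model; this is why its test MSE matches the linear regression MSE reported above.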
set.seed(123)
pls_fit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pls_fit, val.type = "MSEP")
pls_best_m <- which.min(pls_fit$validation$PRESS)
pls_pred <- predict(pls_fit, newdata = test, ncomp = pls_best_m)
pls_mse <- mean((pls_pred - y_test)^2)
cat("Test MSE for PLS:", pls_mse, "with M =", pls_best_m, "\n")
## Test MSE for PLS: 1384151 with M = 10
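On this split the five approaches give similar test errors: PCR with M = 17 reproduces least squares (about 1.37 million), PLS and the lasso are close behind, and ridge is slightly worse (about 1.45 million).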
set.seed(123)
data(Boston)
# regsubsets objects have no predict() method, so define one:
# rebuild the model matrix for the new data and multiply by the
# coefficients of the model of size `id`.
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefs <- coef(object, id = id)
  vars <- names(coefs)
  mat[, vars] %*% coefs
}
# 10-fold cross-validation over all subset sizes 1..p
k <- 10
p <- ncol(Boston) - 1
folds <- sample(1:k, nrow(Boston), replace = TRUE)
cv.errors <- matrix(NA, k, p)
for (j in 1:k) {
  best.fit <- regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = p)
  for (i in 1:p) {
    pred <- predict.regsubsets(best.fit, Boston[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Boston$crim[folds == j] - pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
best.nvars <- which.min(mean.cv.errors)
cat("Best subset selection chose", best.nvars, "variables.\n")
## Best subset selection chose 8 variables.
plot(mean.cv.errors, pch = 19, type = "b")
set.seed(123)
x <- model.matrix(crim ~ ., Boston)[, -1]
y <- Boston$crim
cv.ridge <- cv.glmnet(x, y, alpha = 0, type.measure = "mse")
plot(cv.ridge)
# cvm stores MSE; take the square root to report RMSE at lambda.1se
ridge_rmse <- sqrt(cv.ridge$cvm[cv.ridge$lambda == cv.ridge$lambda.1se])
cat("Ridge Regression CV RMSE:", ridge_rmse, "\n")
## Ridge Regression CV RMSE: 7.398632
cv.lasso <- cv.glmnet(x, y, alpha = 1, type.measure = "mse")
plot(cv.lasso)
lasso_rmse <- sqrt(cv.lasso$cvm[cv.lasso$lambda == cv.lasso$lambda.1se])
cat("Lasso Regression CV RMSE:", lasso_rmse, "\n")
## Lasso Regression CV RMSE: 7.572919
pcr.fit <- pcr(crim ~ ., data = Boston, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "RMSEP")
# RMSEP() already reports root-mean-squared error; take the minimum over component counts
pcr_rmse <- min(RMSEP(pcr.fit)$val[1, 1, ])
cat("PCR CV RMSE:", pcr_rmse, "\n")
## PCR CV RMSE: 6.514599
summary(pcr.fit)
## Data: X dimension: 506 13
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.61 7.199 7.202 6.765 6.753 6.756 6.749
## adjCV 8.61 7.197 7.199 6.761 6.746 6.752 6.744
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.746 6.618 6.632 6.622 6.613 6.586 6.515
## adjCV 6.741 6.612 6.626 6.615 6.607 6.578 6.506
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.70 60.36 69.67 76.45 82.99 88.00 91.14 93.45
## crim 30.69 30.87 39.27 39.61 39.61 39.86 40.14 42.47
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.40 97.04 98.46 99.52 100.0
## crim 42.55 42.78 43.04 44.13 45.4
PCR achieves the lowest cross-validated RMSE (about 6.51), but only when all 13 components are used, so it offers little dimension reduction over the full least-squares fit. Even so, among the methods compared here, the PCR model performs best according to cross-validation.
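For a compact side-by-side view, the cross-validated RMSEs computed above could be gathered in one place. This is a sketch reusing existing objects (the cv_rmse name is just for illustration); the best-subset errors are stored as MSEs, so the square root is taken, and the folds differ across methods, so the comparison is only approximate.
cv_rmse <- c(
  best_subset = sqrt(min(mean.cv.errors)),  # best-subset CV errors were MSEs
  ridge = ridge_rmse,
  lasso = lasso_rmse,
  pcr = pcr_rmse
)
sort(cv_rmse)  # smallest cross-validated RMSE first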