2

    (a) Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. The lasso adds an L1 penalty that shrinks some coefficients exactly to zero, making the model less flexible than least squares. This increases bias but reduces variance, so prediction improves when the variance reduction outweighs the increase in bias (illustrated in the sketch below).
    (b) Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. Ridge regression applies an L2 penalty that shrinks all coefficients but keeps them nonzero. It is less flexible than least squares, with higher bias and lower variance; prediction improves when the drop in variance is greater than the increase in bias.
    (c) More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias. Non-linear methods are more flexible than least squares, allowing them to capture complex patterns. They tend to have lower bias but higher variance, so prediction improves when the decrease in bias is greater than the increase in variance.
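
A minimal simulation sketch (synthetic data, assuming the glmnet package is available; not part of the exercise answer) illustrating the trade-off for the lasso: as the penalty lambda grows the fit becomes less flexible, the estimate of the one truly nonzero coefficient is pulled toward zero (bias grows), and its variance across resampled training sets falls.

library(glmnet)
set.seed(1)
n <- 50; p <- 10
beta <- c(3, rep(0, p - 1))                  # only the first predictor matters
lambdas <- c(0.01, 0.1, 1)
est <- sapply(lambdas, function(lam) {
  replicate(200, {
    X <- matrix(rnorm(n * p), n, p)
    y <- X %*% beta + rnorm(n)
    coef(glmnet(X, y, alpha = 1), s = lam)[2]   # lasso estimate of beta_1
  })
})
colnames(est) <- paste0("lambda=", lambdas)
colMeans(est) - beta[1]   # bias: grows in magnitude with lambda
apply(est, 2, var)        # variance: shrinks with lambda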

9

# (a)
library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-9
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
library(MASS) 
library(leaps)

data(College)
set.seed(1)
train <- sample(1:nrow(College), nrow(College)/2)
test <- setdiff(1:nrow(College), train)

x <- model.matrix(Apps ~ ., data = College)[, -1]
y <- College$Apps
# (b)
lm.fit <- lm(Apps ~ ., data = College, subset = train)
lm.pred <- predict(lm.fit, College[test, ])
mean((College$Apps[test] - lm.pred)^2)  
## [1] 1135758
# (c)
grid <- 10^seq(10, -2, length = 100)
ridge.mod <- cv.glmnet(x[train, ], y[train], alpha = 0, lambda = grid)
ridge.pred <- predict(ridge.mod, s = ridge.mod$lambda.min, newx = x[test, ])
mean((y[test] - ridge.pred)^2)
## [1] 1134677
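# (sketch, not part of the original answer) inspect the ridge CV results directly:
# lambda.min is the penalty used above; lambda.1se is a more regularized alternative
plot(ridge.mod)        # CV error as a function of log(lambda)
ridge.mod$lambda.min
ridge.mod$lambda.1se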
# (d)
lasso.mod <- cv.glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
lasso.pred <- predict(lasso.mod, s = lasso.mod$lambda.min, newx = x[test, ])
mean((y[test] - lasso.pred)^2)  
## [1] 1133422
coef(lasso.mod, s = lasso.mod$lambda.min)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                        s0
## (Intercept) -7.931498e+02
## PrivateYes  -3.078903e+02
## Accept       1.777242e+00
## Enroll      -1.450532e+00
## Top10perc    6.659456e+01
## Top25perc   -2.221506e+01
## F.Undergrad  8.983869e-02
## P.Undergrad  1.005260e-02
## Outstate    -1.082871e-01
## Room.Board   2.118762e-01
## Books        2.922508e-01
## Personal     6.234085e-03
## PhD         -1.542914e+01
## Terminal     6.364841e+00
## S.F.Ratio    2.284667e+01
## perc.alumni  1.114025e+00
## Expend       4.861825e-02
## Grad.Rate    7.466015e+00
sum(coef(lasso.mod, s = lasso.mod$lambda.min) != 0)
## [1] 18
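# (sketch) the count of 18 above includes the intercept; counting only the predictors
# retained by the lasso at lambda.min:
lasso.coef <- coef(lasso.mod, s = lasso.mod$lambda.min)
sum(lasso.coef[-1, 1] != 0)   # non-zero predictor coefficients (17 here, i.e. none dropped)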
# (e)
pcr.fit <- pcr(Apps ~ ., data = College, subset = train, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP") 

pcr.pred <- predict(pcr.fit, College[test, ], ncomp = 10)  
mean((pcr.pred - y[test])^2)
## [1] 1723100
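# (sketch) rather than reading ncomp off the validation plot, the CV results can be
# queried directly; selectNcomp() (pls >= 2.5) applies a one-standard-error rule
selectNcomp(pcr.fit, method = "onesigma", plot = FALSE)
which.min(MSEP(pcr.fit)$val["adjCV", 1, ]) - 1   # ncomp minimizing adjusted CV MSEP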
# (f)
pls.fit <- plsr(Apps ~ ., data = College, subset = train, scale = TRUE, validation = "CV")
validationplot(pls.fit, val.type = "MSEP")

pls.pred <- predict(pls.fit, College[test, ], ncomp = 10)
mean((pls.pred - y[test])^2)
## [1] 1131661
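# (sketch) side-by-side comparison of the five test MSEs computed above
test.mse <- c(OLS   = mean((y[test] - lm.pred)^2),
              Ridge = mean((y[test] - ridge.pred)^2),
              Lasso = mean((y[test] - lasso.pred)^2),
              PCR   = mean((pcr.pred - y[test])^2),
              PLS   = mean((pls.pred - y[test])^2))
sort(test.mse)   # all but PCR are close; PCR with 10 components lags well behind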

11

data(Boston)
set.seed(1)
train = sample(1:nrow(Boston), nrow(Boston)/2)
test = setdiff(1:nrow(Boston), train)

x = model.matrix(crim ~ ., Boston)[, -1]
y = Boston$crim
# (a)
regfit.full = regsubsets(crim ~ ., data = Boston, subset = train, nvmax = 13)
test.mat = model.matrix(crim ~ ., data = Boston[test, ])

val.errors = rep(NA, 13)
for (i in 1:13) {
  coefi = coef(regfit.full, id = i)
  pred = test.mat[, names(coefi)] %*% coefi
  val.errors[i] = mean((Boston$crim[test] - pred)^2)
}

which.min(val.errors)
## [1] 1
min(val.errors) 
## [1] 40.14557
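# (sketch) the manual loop above can be wrapped in a reusable predict() method for
# regsubsets objects -- a common helper, not part of the leaps package itself
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])       # recover the model formula
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
pred.best <- predict(regfit.full, Boston[test, ], id = which.min(val.errors))
mean((Boston$crim[test] - pred.best)^2)      # should reproduce min(val.errors)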
grid = 10^seq(10, -2, length = 100)
ridge.mod = glmnet(x[train, ], y[train], alpha = 0, lambda = grid)  # full ridge coefficient path; lambda is chosen by CV below
ridge.cv = cv.glmnet(x[train, ], y[train], alpha = 0)
ridge.pred = predict(ridge.cv, s = ridge.cv$lambda.min, newx = x[test, ])
mean((y[test] - ridge.pred)^2)
## [1] 40.92777
lasso.mod = glmnet(x[train, ], y[train], alpha = 1, lambda = grid)  # full lasso coefficient path; lambda is chosen by CV below
lasso.cv = cv.glmnet(x[train, ], y[train], alpha = 1)
lasso.pred = predict(lasso.cv, s = lasso.cv$lambda.min, newx = x[test, ])
mean((y[test] - lasso.pred)^2)
## [1] 40.90173
coef(lasso.cv, s = lasso.cv$lambda.min)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## (Intercept) 17.65005513
## zn           0.03516255
## indus       -0.11838293
## chas        -0.43135144
## nox         -7.19578180
## rm           0.04271112
## age          .         
## dis         -0.76801501
## rad          0.52430211
## tax          .         
## ptratio     -0.35072332
## black       -0.01307754
## lstat        0.25559458
## medv        -0.14805010
pcr.fit = pcr(crim ~ ., data = Boston, subset = train, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")  

pcr.pred = predict(pcr.fit, Boston[test, ], ncomp = 5)  
mean((pcr.pred - y[test])^2)
## [1] 44.23119
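# (sketch) test MSEs for the four Boston models above, gathered for the comparison in (b)
boston.mse <- c(BestSubset = min(val.errors),
                Ridge      = mean((y[test] - ridge.pred)^2),
                Lasso      = mean((y[test] - lasso.pred)^2),
                PCR        = mean((pcr.pred - y[test])^2))
sort(boston.mse)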
# (b) Best-subset selection had the lowest test MSE (40.15), but its chosen model kept only a single predictor. The lasso was nearly as accurate (test MSE 40.90) while retaining several predictors, and ridge (40.93) was comparable; PCR with 5 components (44.23) did noticeably worse. On balance I would choose the lasso.
# (c) No, the lasso model does not use all of the variables: it drops age and tax by shrinking their coefficients exactly to zero.
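# (sketch) the dropped variables can be read off the coefficient matrix directly
lasso.coef.b <- coef(lasso.cv, s = lasso.cv$lambda.min)
rownames(lasso.coef.b)[lasso.coef.b[, 1] == 0]   # "age" "tax", matching the printout above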