2
- Less flexible and hence will give improved prediction accuracy when
its increase in bias is less than its decrease in variance. Lasso
regression adds an L1 penalty that shrinks some coefficients exactly to
zero, making the model less flexible than least squares. This increases
bias but reduces variance, so prediction improves when the variance
reduction outweighs the added bias (the penalized objectives are
sketched after this list).
- Less flexible and hence will give improved prediction accuracy when
its increase in bias is less than its decrease in variance. Ridge
regression applies an L2 penalty that shrinks all coefficients but keeps
them nonzero. It is less flexible than least squares, with higher bias
and lower variance. Prediction improves when the drop in variance is
greater than the increase in bias.
- More flexible and hence will give improved prediction accuracy when
its increase in variance is less than its decrease in bias. Non-linear
methods are more flexible than least squares, allowing them to capture
complex patterns. They tend to have lower bias but higher variance, so
prediction improves when the decrease in bias outweighs the increase in
variance.
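For reference, the two shrinkage criteria in their standard (ISLR) form;
lambda >= 0 controls the penalty, and only the L1 version can set
coefficients exactly to zero:

$$\hat{\beta}^{\mathrm{ridge}} = \arg\min_{\beta} \sum_{i=1}^{n} \Big( y_i - \beta_0 - \sum_{j=1}^{p} \beta_j x_{ij} \Big)^2 + \lambda \sum_{j=1}^{p} \beta_j^2$$

$$\hat{\beta}^{\mathrm{lasso}} = \arg\min_{\beta} \sum_{i=1}^{n} \Big( y_i - \beta_0 - \sum_{j=1}^{p} \beta_j x_{ij} \Big)^2 + \lambda \sum_{j=1}^{p} |\beta_j|$$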
9
# (a)
library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-9
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library(MASS)
library(leaps)
data(College)
set.seed(1)
train <- sample(1:nrow(College), nrow(College)/2)
test <- setdiff(1:nrow(College), train)
x <- model.matrix(Apps ~ ., data = College)[, -1]
y <- College$Apps
# (b)
lm.fit <- lm(Apps ~ ., data = College, subset = train)
lm.pred <- predict(lm.fit, College[test, ])
mean((College$Apps[test] - lm.pred)^2)
## [1] 1135758
# (c)
grid <- 10^seq(10, -2, length = 100)
ridge.mod <- cv.glmnet(x[train, ], y[train], alpha = 0, lambda = grid)
ridge.pred <- predict(ridge.mod, s = ridge.mod$lambda.min, newx = x[test, ])
mean((y[test] - ridge.pred)^2)
## [1] 1134677
# (d)
lasso.mod <- cv.glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
lasso.pred <- predict(lasso.mod, s = lasso.mod$lambda.min, newx = x[test, ])
mean((y[test] - lasso.pred)^2)
## [1] 1133422
coef(lasso.mod, s = lasso.mod$lambda.min)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -7.931498e+02
## PrivateYes -3.078903e+02
## Accept 1.777242e+00
## Enroll -1.450532e+00
## Top10perc 6.659456e+01
## Top25perc -2.221506e+01
## F.Undergrad 8.983869e-02
## P.Undergrad 1.005260e-02
## Outstate -1.082871e-01
## Room.Board 2.118762e-01
## Books 2.922508e-01
## Personal 6.234085e-03
## PhD -1.542914e+01
## Terminal 6.364841e+00
## S.F.Ratio 2.284667e+01
## perc.alumni 1.114025e+00
## Expend 4.861825e-02
## Grad.Rate 7.466015e+00
sum(coef(lasso.mod, s = lasso.mod$lambda.min) != 0)
## [1] 18
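At lambda.min the lasso keeps every predictor (18 nonzero entries,
counting the intercept). If a sparser model were wanted, cv.glmnet also
stores the one-standard-error choice lambda.1se, which applies a heavier
penalty and typically zeroes more coefficients:
sum(coef(lasso.mod, s = lasso.mod$lambda.1se) != 0)  # usually fewer nonzeros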
# (e)
pcr.fit <- pcr(Apps ~ ., data = College, subset = train, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")

pcr.pred <- predict(pcr.fit, College[test, ], ncomp = 10)
mean((pcr.pred - y[test])^2)
## [1] 1723100
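The choice ncomp = 10 was read off the validation plot; the pls package
can also select the number of components automatically via a
one-standard-error rule (the result depends on the CV folds):
selectNcomp(pcr.fit, method = "onesigma")  # CV-based choice of ncomp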
# (f)
pls.fit <- plsr(Apps ~ ., data = College, subset = train, scale = TRUE, validation = "CV")
validationplot(pls.fit, val.type = "MSEP")

pls.pred <- predict(pls.fit, College[test, ], ncomp = 10)
mean((pls.pred - y[test])^2)
## [1] 1131661
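For a side-by-side view, the five test MSEs can be collected from the
predictions computed above:
c(lm    = mean((y[test] - lm.pred)^2),
  ridge = mean((y[test] - ridge.pred)^2),
  lasso = mean((y[test] - lasso.pred)^2),
  pcr   = mean((pcr.pred - y[test])^2),
  pls   = mean((pls.pred - y[test])^2))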
11
data(Boston)
set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston)/2)
test <- setdiff(1:nrow(Boston), train)
x <- model.matrix(crim ~ ., Boston)[, -1]
y <- Boston$crim
# (a)
regfit.full <- regsubsets(crim ~ ., data = Boston, subset = train, nvmax = 13)
test.mat <- model.matrix(crim ~ ., data = Boston[test, ])
val.errors <- rep(NA, 13)
for (i in 1:13) {
  coefi <- coef(regfit.full, id = i)
  pred <- test.mat[, names(coefi)] %*% coefi
  val.errors[i] <- mean((Boston$crim[test] - pred)^2)
}
which.min(val.errors)
## [1] 1
min(val.errors)
## [1] 40.14557
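Since regsubsets has no predict method, the loop above builds the
predictions by hand. A reusable helper in the style of the ISLR lab
would look like this:
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])  # recover the fitting formula
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}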
grid <- 10^seq(10, -2, length = 100)
ridge.mod <- glmnet(x[train, ], y[train], alpha = 0, lambda = grid)
ridge.cv <- cv.glmnet(x[train, ], y[train], alpha = 0)
ridge.pred <- predict(ridge.cv, s = ridge.cv$lambda.min, newx = x[test, ])
mean((y[test] - ridge.pred)^2)
## [1] 40.92777
lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
lasso.cv <- cv.glmnet(x[train, ], y[train], alpha = 1)
lasso.pred <- predict(lasso.cv, s = lasso.cv$lambda.min, newx = x[test, ])
mean((y[test] - lasso.pred)^2)
## [1] 40.90173
coef(lasso.cv, s = lasso.cv$lambda.min)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 17.65005513
## zn 0.03516255
## indus -0.11838293
## chas -0.43135144
## nox -7.19578180
## rm 0.04271112
## age .
## dis -0.76801501
## rad 0.52430211
## tax .
## ptratio -0.35072332
## black -0.01307754
## lstat 0.25559458
## medv -0.14805010
pcr.fit <- pcr(crim ~ ., data = Boston, subset = train, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")

pcr.pred <- predict(pcr.fit, Boston[test, ], ncomp = 5)
mean((pcr.pred - y[test])^2)
## [1] 44.23119
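Collecting the validation-set errors from the objects above before
comparing:
c(best.subset = min(val.errors),
  ridge = mean((y[test] - ridge.pred)^2),
  lasso = mean((y[test] - lasso.pred)^2),
  pcr   = mean((pcr.pred - y[test])^2))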
# (b) Best subset selection had the lowest validation error (40.15), but its chosen model uses only a single predictor. The lasso was nearly as accurate (40.90) while retaining several predictors, so it is the model I would propose.
# (c) No, the lasso model does not use all the variables: it removes some predictors by shrinking their coefficients exactly to zero (here age and tax, shown as "." in the coefficient table above).
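A quick way to list the dropped variables from the fitted lasso.cv
object:
b <- as.matrix(coef(lasso.cv, s = lasso.cv$lambda.min))  # dense copy of the sparse coefficients
rownames(b)[b == 0]  # predictors the lasso set to zero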