Question 2a

  1. The lasso, relative to least squares, is less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.

Question 2b

  1. Ridge regression, relative to least squares, is less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.

Question 2c

  1. Non-linear methods, relative to least squares, are more flexible and hence will give improved prediction accuracy when their increase in variance is less than their decrease in bias (a small simulation below illustrates this bias-variance trade-off).
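
As a small illustration of the trade-off behind these answers (a sketch only; the sample size, number of predictors, signal strength, and noise level below are arbitrary assumptions, not part of the exercise), ridge regression on noisy data with many weak predictors typically reaches its lowest test MSE at a penalty larger than zero: the less flexible fit wins because the drop in variance outweighs the added bias.

library(glmnet)
set.seed(1)
n <- 50; p <- 30
X <- matrix(rnorm(n * p), n, p)
beta <- rnorm(p, sd = 0.1)                 # weak true signal (illustrative assumption)
y <- drop(X %*% beta + rnorm(n, sd = 2))   # noisy training response
X.new <- matrix(rnorm(1000 * p), 1000, p)  # large test set from the same model
y.new <- X.new %*% beta + rnorm(1000, sd = 2)
fit <- glmnet(X, y, alpha = 0, lambda = 10^seq(2, -3, length = 50))
test.mse <- colMeans((predict(fit, newx = X.new) - as.vector(y.new))^2)
plot(log(fit$lambda), test.mse, type = "b", xlab = "log(lambda)", ylab = "Test MSE")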

Question 9a

library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-2
attach(College)
set.seed(11)
index = sample(1:nrow(College), round(nrow(College)*.7))
train = College[index,]
test = College[-index,]

Question 9b

lm.fit = lm(Apps ~ ., data = train)

# predict on the test set and compute the test MSE
lm.pred = predict(lm.fit, test)
lm.error = mean((lm.pred - test$Apps)^2)
print(lm.error)
## [1] 682474.3

The test MSE for least squares is 682474.3.

Question 9c

# matrix for test and training
train.mat <- model.matrix(Apps ~ ., data = train)
test.mat <- model.matrix(Apps ~ ., data = test)

grid <- 10 ^ seq(4, -2, length = 100)

fit.ridge <- glmnet(train.mat, train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
cv.ridge <- cv.glmnet(train.mat, train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)

bestlam.ridge <- cv.ridge$lambda.min

pred.ridge <- predict(fit.ridge, s = bestlam.ridge, newx = test.mat)
mean((pred.ridge - test$Apps)^2)
## [1] 682427.7

The test MSE for ridge regression (682427.7) is essentially the same as that of least squares.
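
Plotting the CV curve and printing the selected lambda (a quick check, not required by the question) shows how much shrinkage cross-validation actually chose; a value near the bottom of the grid would mean the ridge fit is essentially unpenalized, which would explain why it matches least squares so closely.

plot(cv.ridge)      # cross-validation error as a function of log(lambda)
bestlam.ridge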

Question 9d

fit.lasso <- glmnet(train.mat, train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
cv.lasso <- cv.glmnet(train.mat, train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
bestlam.lasso <- cv.lasso$lambda.min
bestlam.lasso
## [1] 0.01
pred.lasso <- predict(fit.lasso, s = bestlam.lasso, newx = test.mat)
mean((pred.lasso - test$Apps)^2)
## [1] 682353.9

The lasso test MSE is slightly lower than that of least squares. Note that the selected lambda (0.01) is the smallest value in the grid, so the lasso applies very little shrinkage here, which is consistent with the fit being so close to least squares.
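
As a quick sketch (not required by the question), the coefficients at the chosen lambda can be inspected to see how many, if any, are driven exactly to zero.

lasso.coef <- as.matrix(predict(fit.lasso, s = bestlam.lasso, type = "coefficients"))
lasso.coef
sum(lasso.coef == 0)   # number of coefficients shrunk exactly to zero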

Question 9e

library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
fit.pcr <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(fit.pcr, val.type = "MSEP")

pred.pcr <- predict(fit.pcr, test, ncomp = 10)
mean((pred.pcr - test$Apps)^2)
## [1] 979550.1

With 10 components, the PCR test MSE is noticeably higher than that of least squares.
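
Ten components were chosen from the validation plot by eye; as a sketch, the cross-validated errors can also be pulled out directly with the pls package's RMSEP accessor to see which component count cross-validation favours.

# CV RMSEP for each model size; the first entry is the intercept-only model, hence the "- 1"
cv.rmsep <- RMSEP(fit.pcr)$val["CV", 1, ]
which.min(cv.rmsep) - 1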

Question 9f

fit.pls <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(fit.pls, val.type = "MSEP")

pred.pls <- predict(fit.pls, test, ncomp = 10)
mean((pred.pls - test$Apps)^2)
## [1] 660366.4

The test MSE for PLS is lower than that of least squares.

test.avg <- mean(test$Apps)
lm.r2 <- 1 - mean((lm.pred - test$Apps)^2) / mean((test.avg - test$Apps)^2)
ridge.r2 <- 1 - mean((pred.ridge - test$Apps)^2) / mean((test.avg - test$Apps)^2)
lasso.r2 <- 1 - mean((pred.lasso - test$Apps)^2) / mean((test.avg - test$Apps)^2)
pcr.r2 <- 1 - mean((pred.pcr - test$Apps)^2) / mean((test.avg - test$Apps)^2)
pls.r2 <- 1 - mean((pred.pls - test$Apps)^2) / mean((test.avg - test$Apps)^2)
lm.r2
## [1] 0.9318928
ridge.r2
## [1] 0.9318975
lasso.r2
## [1] 0.9319048
pcr.r2
## [1] 0.9022463
pls.r2
## [1] 0.9340991

All five models predict the number of college applications well: every test R² is above 0.90, with only about a three-point spread between the highest (PLS, 0.934) and the lowest (PCR, 0.902).
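
To make the comparison easier to read, the test R² values can be collected into a single named vector and sorted (a small convenience sketch; the object name r2 is our own).

r2 <- c(OLS = lm.r2, Ridge = ridge.r2, Lasso = lasso.r2, PCR = pcr.r2, PLS = pls.r2)
round(sort(r2, decreasing = TRUE), 4)
barplot(r2, ylim = c(0, 1), ylab = "Test R-squared")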

Question 10a

library(leaps)
set.seed(1)
x <- matrix(rnorm(1000 * 20), 1000, 20)
b <- rnorm(20)
b[c(3, 4, 9, 10, 19)] <- 0   # set five of the true coefficients to zero
eps <- rnorm(1000)
y <- x %*% b + eps

Question 10b

train <- sample(seq(1000), 100, replace = FALSE)
test <- -train
x.train <- x[train, ]
x.test <- x[test, ]
y.train <- y[train]
y.test <- y[test]

Question 10c

data.train <- data.frame(y = y.train, x = x.train)
regfit.full <- regsubsets(y ~ ., data = data.train, nvmax = 20)
train.mat <- model.matrix(y ~ ., data = data.train)
val.errors <- rep(NA, 20)
for (i in 1:20) {
    coefi <- coef(regfit.full, id = i)
    pred <- train.mat[, names(coefi)] %*% coefi
    val.errors[i] <- mean((pred - y.train)^2)
}
plot(val.errors, xlab = "Number of predictors", ylab = "Training MSE", pch = 19, type = "b")

Question 10d

data.test <- data.frame(y = y.test, x = x.test)
test.mat <- model.matrix(y ~ ., data = data.test)
val.errors <- rep(NA, 20)
for (i in 1:20) {
    coefi <- coef(regfit.full, id = i)
    pred <- test.mat[, names(coefi)] %*% coefi
    val.errors[i] <- mean((pred - y.test)^2)
}
plot(val.errors, xlab = "Number of predictors", ylab = "Test MSE", pch = 19, type = "b")

Question 10e

It looks like the model with 15 predictors has the lowest test MSE, but we can confirm with which.min().

which.min(val.errors)
## [1] 15

The model with 15 predictors has the smallest test MSE.

Question 10f

coef(regfit.full, which.min(val.errors))
##  (Intercept)          x.2          x.4          x.5          x.6          x.7 
## -0.003933937  0.359127426  0.202707344  1.036265913 -0.253843053 -1.282753293 
##          x.8         x.11         x.12         x.13         x.14         x.15 
##  0.691581077  0.895769881  0.526887865 -0.207638251 -0.507929833 -0.892604795 
##         x.16         x.17         x.18         x.20 
## -0.343062241  0.184479252  1.646950451 -1.060191640

The selected model drops four of the five predictors whose true coefficients were set to zero (x.3, x.9, x.10, x.19), but it keeps x.4 (true coefficient zero) and drops x.1, so it recovers the true sparsity pattern closely though not perfectly.
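
A quick programmatic check of that claim (a sketch; zero.idx just lists the indices zeroed out in part (a)):

zero.idx <- c(3, 4, 9, 10, 19)                       # predictors with true coefficient zero
selected <- names(coef(regfit.full, which.min(val.errors)))
setdiff(paste0("x.", zero.idx), selected)            # truly-zero predictors the model excludes
intersect(paste0("x.", zero.idx), selected)          # truly-zero predictors the model still keeps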

Question 10g

val.errors <- rep(NA, 20)
x_cols <- colnames(x, do.NULL = FALSE, prefix = "x.")
for (i in 1:20) {
    coefi <- coef(regfit.full, id = i)
    # root of the squared error between estimated and true coefficients,
    # treating excluded predictors as estimates of zero
    val.errors[i] <- sqrt(sum((b[x_cols %in% names(coefi)] - coefi[names(coefi) %in% x_cols])^2) +
                          sum(b[!(x_cols %in% names(coefi))]^2))
}
plot(val.errors, xlab = "Number of coefficients", ylab = "Error between estimated and true coefficients", pch = 19, type = "b")

The error between the estimated and true coefficients is not perfectly correlated with the test error: a model whose coefficients are closer to the true values does not necessarily have the lowest test MSE.
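
To make that comparison concrete, a small sketch (the vector names coef.errors and test.errors are our own) keeps the two error curves in separate objects and checks where each is minimized; the test MSEs from part (d) are recomputed because val.errors was overwritten above.

coef.errors <- val.errors                  # coefficient errors computed just above
test.errors <- sapply(1:20, function(i) {
    coefi <- coef(regfit.full, id = i)
    mean((test.mat[, names(coefi)] %*% coefi - y.test)^2)
})
which.min(coef.errors)
which.min(test.errors)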