#2 (A.) (iii) Less flexible, and it will give improved prediction accuracy when its increase in bias is less than its decrease in variance. As lambda increases, the flexibility of the fit decreases, and the estimated coefficients shrink, with some becoming exactly zero. This leads to a substantial decrease in the variance of the predictions for a small increase in bias.
(B.) (iii) Less flexible, and it will give improved prediction accuracy when its increase in bias is less than its decrease in variance. As lambda increases, the flexibility of the fit decreases, and the estimated coefficients shrink toward zero, again trading a small increase in bias for a substantial decrease in the variance of the predictions. However, unlike the lasso, every variable retains a non-zero coefficient.
(C.) (ii) Non-linear methods are generally more flexible, so their predictions tend to have higher variance and lower bias. Predictions will therefore improve when the increase in variance is smaller than the decrease in bias (the bias-variance trade-off).
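#A minimal sketch (beyond the assignment) of the ridge/lasso contrast above: on
#simulated data where only the first two predictors matter, ridge (alpha = 0)
#shrinks all ten coefficients but leaves every one non-zero, while the lasso
#(alpha = 1) sets most of them exactly to zero at the same lambda.
library(glmnet)
set.seed(1)
x.sim <- matrix(rnorm(100 * 10), 100, 10)
y.sim <- x.sim[, 1] - 2 * x.sim[, 2] + rnorm(100)
coef(glmnet(x.sim, y.sim, alpha = 0), s = 1)  #ridge: every coefficient non-zero
coef(glmnet(x.sim, y.sim, alpha = 1), s = 1)  #lasso: most coefficients exactly zero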
#Start Question9
library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
#(A.)
set.seed(11)
train = sample(1:dim(College)[1], dim(College)[1] / 2)
test <- -train
College.train <- College[train, ]
College.test <- College[test, ]
#(B.)
fit.lm <- lm(Apps ~ ., data = College.train)
pred.lm <- predict(fit.lm, College.test)
mean((pred.lm - College.test$Apps)^2)
## [1] 1026096
#The test MSE for least squares is 1,026,096.
#(C.)
train.mat <- model.matrix(Apps ~ ., data = College.train)
test.mat <- model.matrix(Apps ~ ., data = College.test)
grid <- 10 ^ seq(4, -2, length = 100)
fit.ridge <- glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
cv.ridge <- cv.glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
bestlam.ridge <- cv.ridge$lambda.min
bestlam.ridge
## [1] 0.01
pred.ridge <- predict(fit.ridge, s = bestlam.ridge, newx = test.mat)
mean((pred.ridge - College.test$Apps)^2)
## [1] 1026069
#The test MSE for ridge regression is 1,026,069, essentially identical to least squares. Note that the chosen lambda (0.01) is the smallest value on the grid, so cross-validation is effectively selecting almost no shrinkage here.
#(D.)
fit.lasso <- glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
cv.lasso <- cv.glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
bestlam.lasso <- cv.lasso$lambda.min
bestlam.lasso
## [1] 0.01
pred.lasso <- predict(fit.lasso, s = bestlam.lasso, newx = test.mat)
mean((pred.lasso - College.test$Apps)^2)
## [1] 1026036
predict(fit.lasso, s = bestlam.lasso, type = "coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 37.86520037
## (Intercept) .
## PrivateYes -551.14946609
## Accept 1.74980812
## Enroll -1.36005786
## Top10perc 65.55655577
## Top25perc -22.52640339
## F.Undergrad 0.10181853
## P.Undergrad 0.01789131
## Outstate -0.08706371
## Room.Board 0.15384585
## Books -0.12227313
## Personal 0.16194591
## PhD -14.29638634
## Terminal -1.03118224
## S.F.Ratio 4.47956819
## perc.alumni -0.45456280
## Expend 0.05618050
## Grad.Rate 9.07242834
#The test MSE for the lasso is 1,026,036, again essentially identical to least squares. At this small lambda the lasso keeps every predictor; no coefficient is driven to zero.
#(E.)
fit.pcr <- pcr(Apps ~ ., data = College.train, scale = TRUE, validation = "CV")
validationplot(fit.pcr, val.type = "MSEP")
pred.pcr <- predict(fit.pcr, College.test, ncomp = 10)
mean((pred.pcr - College.test$Apps)^2)
## [1] 1867486
#The test MSE for PCR with 10 components is 1,867,486, noticeably higher than the other methods.
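#Instead of choosing the number of components by eye from the validation plot, the
#pls package's selectNcomp() can pick it automatically; a sketch (the one-sigma
#rule may choose a different ncomp than the 10 used above):
selectNcomp(fit.pcr, method = "onesigma", plot = TRUE)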
#(F.)
fit.pls <- plsr(Apps ~ ., data = College.train, scale = TRUE, validation = "CV")
validationplot(fit.pls, val.type = "MSEP")
pred.pls <- predict(fit.pls, College.test, ncomp = 10)
mean((pred.pls - College.test$Apps)^2)
## [1] 1031287
#The test MSE for PLS with 10 components is 1,031,287, close to least squares.
#(G.)
test.avg <- mean(College.test$Apps)
lm.r2 <- 1 - mean((pred.lm - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
ridge.r2 <- 1 - mean((pred.ridge - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
lasso.r2 <- 1 - mean((pred.lasso - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
pcr.r2 <- 1 - mean((pred.pcr - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
pls.r2 <- 1 - mean((pred.pls - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
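#Display the test R-squared values side by side. Because all five share the same
#denominator, their ordering follows the test MSEs above: least squares, ridge,
#the lasso, and PLS are essentially tied, while PCR is noticeably lower.
c(lm = lm.r2, ridge = ridge.r2, lasso = lasso.r2, pcr = pcr.r2, pls = pls.r2)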
#End Question9
#Start Question11
library(MASS)
library(leaps)
library(glmnet)
library(pls)
#(A.)
#Best Subset Selection:
#Prediction helper for regsubsets objects: rebuild the model matrix for the new
#data from the formula stored in the call, then multiply by the coefficients of
#the id-th model.
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])
  mat = model.matrix(form, newdata)
  coefi = coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
k = 10
p = ncol(Boston) - 1
folds = sample(rep(1:k, length = nrow(Boston)))
cv.errors = matrix(NA, k, p)
for (i in 1:k) {
  best.fit = regsubsets(crim ~ ., data = Boston[folds != i, ], nvmax = p)
  for (j in 1:p) {
    pred = predict(best.fit, Boston[folds == i, ], id = j)
    cv.errors[i, j] = mean((Boston$crim[folds == i] - pred)^2)
  }
}
rmse.cv = sqrt(apply(cv.errors, 2, mean))
plot(rmse.cv, pch = 19, type = "b")
which.min(rmse.cv)
## [1] 12
rmse.cv[which.min(rmse.cv)]
## [1] 6.54371
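#To see which predictor the 12-variable model drops, one could refit best subset
#selection on the full data set and inspect the selected coefficients (a sketch,
#not run above):
coef(regsubsets(crim ~ ., data = Boston, nvmax = p), id = 12)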
#The lasso, which can give a sparse model:
x = model.matrix(crim ~ . - 1, data = Boston)
y = Boston$crim
cv.lasso = cv.glmnet(x, y, type.measure = "mse")
plot(cv.lasso)
coef(cv.lasso)  #coefficients at lambda.1se, the default for cv.glmnet objects
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.0894283
## zn .
## indus .
## chas .
## nox .
## rm .
## age .
## dis .
## rad 0.2643196
## tax .
## ptratio .
## black .
## lstat .
## medv .
sqrt(cv.lasso$cvm[cv.lasso$lambda == cv.lasso$lambda.1se])  #CV RMSE at lambda.1se
## [1] 7.437989
#Ridge Regression
x = model.matrix(crim ~ . - 1, data = Boston)
y = Boston$crim
cv.ridge = cv.glmnet(x, y, type.measure = "mse", alpha = 0)
plot(cv.ridge)
coef(cv.ridge)  #coefficients at lambda.1se
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.523899548
## zn -0.002949852
## indus 0.029276741
## chas -0.166526006
## nox 1.874769661
## rm -0.142852604
## age 0.006207995
## dis -0.094547258
## rad 0.045932737
## tax 0.002086668
## ptratio 0.071258052
## black -0.002605281
## lstat 0.035745604
## medv -0.023480540
sqrt(cv.ridge$cvm[cv.ridge$lambda == cv.ridge$lambda.1se])  #CV RMSE at lambda.1se
## [1] 7.665736
#PCR
pcr.fit = pcr(crim ~ ., data = Boston, scale = TRUE, validation = "CV")
summary(pcr.fit)
## Data: X dimension: 506 13
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.61 7.201 7.204 6.765 6.774 6.775 6.818
## adjCV 8.61 7.198 7.201 6.761 6.767 6.770 6.810
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.804 6.664 6.691 6.692 6.689 6.658 6.586
## adjCV 6.796 6.656 6.682 6.682 6.678 6.646 6.574
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.70 60.36 69.67 76.45 82.99 88.00 91.14 93.45
## crim 30.69 30.87 39.27 39.61 39.61 39.86 40.14 42.47
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.40 97.04 98.46 99.52 100.0
## crim 42.55 42.78 43.04 44.13 45.4
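#Gathering the cross-validated RMSEs computed above for a side-by-side look (PCR's
#13-component CV RMSE, about 6.59, is read off the summary rather than recomputed):
c(best.subset = rmse.cv[which.min(rmse.cv)],
  lasso = sqrt(cv.lasso$cvm[cv.lasso$lambda == cv.lasso$lambda.1se]),
  ridge = sqrt(cv.ridge$cvm[cv.ridge$lambda == cv.ridge$lambda.1se]))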
#(B.) & (C.)
#I would choose the best subset model: cross-validation selects the 12-predictor model with the lowest RMSE (about 6.54), which edges out 13-component PCR (about 6.59) and clearly beats the lasso (about 7.44) and ridge (about 7.67), while remaining a plain linear model. This chosen model does not involve all of the features; it drops one of the 13 predictors.
#End Question11