For parts (a) through (c), indicate which of i. through iv. is correct. Justify your answer.
i. More flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
ii. More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.
iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
iv. Less flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.
(a) The lasso, relative to least squares, is less flexible (iii.): it will give improved prediction accuracy when its increase in bias is less than its decrease in variance. As the penalty grows, variance falls while bias rises; the test MSE decreases as long as the variance reduction outweighs the added bias and rises again once the bias dominates, producing the familiar U-shaped test error curve.
(b) Ridge regression, relative to least squares, is likewise less flexible (iii.) and improves prediction accuracy under the same condition: shrinking the coefficients trades a small increase in bias for a larger decrease in variance, again yielding a U-shaped test error curve.
(c) Non-linear methods, relative to least squares, are more flexible (ii.) and hence will give improved prediction accuracy when their increase in variance is less than their decrease in bias.
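To make the U-shaped curve in (a) and (b) concrete, here is a small illustrative simulation (not part of the original exercise; all names below are our own): ridge fits over a grid of penalties show the test MSE falling while the variance reduction dominates and rising once bias takes over.
library(glmnet)
set.seed(1)
n = 100; p = 40
x = matrix(rnorm(n * p), n, p)
beta = rnorm(p)
y = x %*% beta + rnorm(n, sd = 5)
x_new = matrix(rnorm(n * p), n, p)         # independent test set
y_new = x_new %*% beta + rnorm(n, sd = 5)
lams = 10 ^ seq(3, -3, length = 50)        # decreasing penalty grid
ridge_fits = glmnet(x, y, alpha = 0, lambda = lams)
preds = predict(ridge_fits, newx = x_new)  # one column of predictions per lambda
test_mse = colMeans((preds - as.vector(y_new))^2)
plot(log10(lams), test_mse, type = "b", xlab = "log10(lambda)", ylab = "Test MSE")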
In this exercise, we will predict the number of applications received using the other variables in the College data set.
library(ISLR)
data(College)
set.seed(100)
# Split the data into equal-sized training and test sets
trainID = sample(1:nrow(College), nrow(College) / 2)
train = College[trainID, ]
test = College[-trainID, ]
### Least Squares
# Fit least squares on the training set and compute the test MSE
lm_fit = lm(Apps ~ ., data = train)
lm_preds = predict(lm_fit, test)
lm_error = mean((test$Apps - lm_preds)^2)
lm_error
## [1] 1504500
### Ridge Regression
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-2
# Build model matrices for glmnet, dropping the intercept column
# that model.matrix adds (glmnet fits its own intercept)
train_X = model.matrix(Apps ~ ., data = train)[, -1]
train_Y = train[, "Apps"]
test_X = model.matrix(Apps ~ ., data = test)[, -1]
test_Y = test[, "Apps"]
# Choose lambda by cross-validation over a wide grid; note that
# lambda.min lives on the cv.glmnet fit, not the glmnet fit
grid = 10 ^ seq(4, -2, length = 100)
ridge_model = glmnet(train_X, train_Y, alpha = 0, lambda = grid, thresh = 1e-12)
ridge_cross_val = cv.glmnet(train_X, train_Y, alpha = 0, lambda = grid, thresh = 1e-12)
best_lam_ridge = ridge_cross_val$lambda.min
ridge_preds = predict(ridge_model, newx = test_X, s = best_lam_ridge)
ridge_error = mean((test_Y - ridge_preds)^2)
ridge_error
## [1] 2251839
### Lasso
# Same grid; lambda again chosen by cross-validation
lasso_mod = glmnet(train_X, train_Y, alpha = 1, lambda = grid, thresh = 1e-12)
lasso_cross_val = cv.glmnet(train_X, train_Y, alpha = 1, lambda = grid, thresh = 1e-12)
best_lam_lasso = lasso_cross_val$lambda.min
lasso_preds = predict(lasso_mod, newx = test_X, s = best_lam_lasso)
lasso_error = mean((test_Y - lasso_preds)^2)
lasso_error
## [1] 1515826
predict(lasso_mod, s = best_lam_lasso, type="coefficients")
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -1.227716e+03
## PrivateYes -5.479428e+02
## Accept 1.154348e+00
## Enroll .
## Top10perc 5.534285e+01
## Top25perc -1.831892e+01
## F.Undergrad 6.548539e-02
## P.Undergrad .
## Outstate -2.653510e-02
## Room.Board 2.148067e-01
## Books .
## Personal 1.049757e-01
## PhD -6.468985e+00
## Terminal -5.767750e+00
## S.F.Ratio 1.820440e+01
## perc.alumni -9.225205e+00
## Expend 7.750439e-02
## Grad.Rate 1.256597e+01
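As a quick check on the sparsity, we can count the nonzero coefficients directly from the sparse matrix; from the table above, the lasso zeroes out Enroll, P.Undergrad, and Books. (A small sketch; output not shown.)
lasso_coefs = predict(lasso_mod, s = best_lam_lasso, type = "coefficients")
sum(lasso_coefs != 0) - 1  # nonzero predictors, excluding the intercept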
### PCR
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
# Principal components regression with 10-fold CV on the training set
pcr_model = pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pcr_model, val.type = "MSEP")
summary(pcr_model)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3373 3346 1694 1699 1681 1302 1287
## adjCV 3373 3349 1692 1700 1714 1299 1285
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1242 1214 1205 1197 1195 1210 1222
## adjCV 1240 1211 1203 1195 1192 1207 1218
## 14 comps 15 comps 16 comps 17 comps
## CV 1223 1232 1097 1099
## adjCV 1219 1228 1093 1094
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 30.926 57.63 64.72 70.66 76.23 80.63 84.43 87.76
## Apps 4.911 75.42 75.68 75.89 85.82 86.28 87.22 87.79
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.78 93.23 95.17 97.01 98.10 98.88 99.45
## Apps 87.93 88.26 88.35 88.35 88.38 88.38 88.39
## 16 comps 17 comps
## X 99.88 100.00
## Apps 90.82 90.98
# The CV error has largely leveled off by 10 components, so we use ncomp = 10
pcr_preds = predict(pcr_model, test, ncomp = 10)
pcr_error = mean((test$Apps - pcr_preds)^2)
pcr_error
## [1] 3017368
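As a sanity check on that choice, the test MSE can be traced across every number of components (a sketch; output not shown):
pcr_test_mse = sapply(1:17, function(m)
  mean((test$Apps - predict(pcr_model, test, ncomp = m))^2))
plot(pcr_test_mse, type = "b", xlab = "Number of components", ylab = "Test MSE")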
### PLS
# Partial least squares, again predicting with ncomp = 10
pls_model = plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pls_model, val.type = "MSEP")
pls_preds = predict(pls_model, test, ncomp = 10)
pls_error = mean((test$Apps - pls_preds)^2)
pls_error
## [1] 1509808
### Comparing the Models
# Test R^2 for each model: 1 - RSS/TSS on the test set
test_avg = mean(test$Apps)
test_tss = mean((test_avg - test$Apps)^2)
lm_r_sq = 1 - lm_error / test_tss
ridge_r_sq = 1 - ridge_error / test_tss
lasso_r_sq = 1 - lasso_error / test_tss
pcr_r_sq = 1 - pcr_error / test_tss
pls_r_sq = 1 - pls_error / test_tss
all_r_sq = c(lm_r_sq, ridge_r_sq, lasso_r_sq, pcr_r_sq, pls_r_sq)
names(all_r_sq) = c("lm", "ridge", "lasso", "pcr", "pls")
barplot(all_r_sq)
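For reference, the raw test MSEs can also be compared side by side (output not shown):
all_errors = c(lm = lm_error, ridge = ridge_error, lasso = lasso_error,
               pcr = pcr_error, pls = pls_error)
sort(all_errors)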
Judging from the bar plot, least squares, the lasso, and PLS all achieve high test R^2, whereas ridge regression and PCR are somewhat lower (around 0.8).
We will now try to predict per capita crime rate in the Boston data set.
### Subset Selection
library(ISLR)
library(MASS)
library(leaps)
library(glmnet)
library(pls)
data(Boston)
set.seed(100)
# predict() method for regsubsets objects, which lack one by default
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])  # recover the formula from the original call
  mat = model.matrix(form, newdata)
  coefi = coef(object, id = id)        # coefficients of the id-variable model
  xvars = names(coefi)
  mat[, xvars] %*% coefi
}
# 10-fold cross-validation of best subset selection over all model sizes
k = 10
folds = sample(1:k, nrow(Boston), replace = TRUE)
cv_errors = matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
for (j in 1:k) {
  best_ss_model = regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 13)
  for (i in 1:13) {
    preds = predict(best_ss_model, Boston[folds == j, ], id = i)
    cv_errors[j, i] = mean((Boston$crim[folds == j] - preds)^2)
  }
}
mean_of_cv_errors = apply(cv_errors, 2, mean)
plot(mean_of_cv_errors, type = "b", xlab = "Number of variables", ylab = "CV error")
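Rather than reading the minimum off the plot, it can be located programmatically (output not shown):
which.min(mean_of_cv_errors)  # model size with the lowest CV error
min(mean_of_cv_errors)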
# Refit on the full data and inspect the selected 12-variable model
best_ss_reg_model = regsubsets(crim ~ ., data = Boston, nvmax = 13)
coef(best_ss_reg_model, 12)
## (Intercept) zn indus chas nox
## 16.985713928 0.044673247 -0.063848469 -0.744367726 -10.202169211
## rm dis rad tax ptratio
## 0.439588002 -0.993556631 0.587660185 -0.003767546 -0.269948860
## black lstat medv
## -0.007518904 0.128120290 -0.198877768
### Lasso
model_x = model.matrix(crim ~ ., Boston)[, -1]
model_y = Boston$crim
cross_val_out = cv.glmnet(model_x, model_y, alpha = 1, type.measure = "mse")
plot(cross_val_out)
cross_val_out
##
## Call: cv.glmnet(x = model_x, y = model_y, type.measure = "mse", alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.0426 53 43.07 13.89 12
## 1se 3.0758 7 55.14 17.34 1
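Since the Nonzero column reports 12 of the 13 predictors at lambda.min, we can extract the coefficients from the cv.glmnet fit to see which one is dropped (output not shown):
predict(cross_val_out, type = "coefficients", s = "lambda.min")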
### Ridge Regression
rr_cv_out = cv.glmnet(model_x, model_y, alpha = 0, type.measure = "mse")
rr_cv_out
##
## Call: cv.glmnet(x = model_x, y = model_y, type.measure = "mse", alpha = 0)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.54 100 43.80 11.78 13
## 1se 56.31 50 55.57 13.99 13
### PCR
pcr_model = pcr(crim ~ ., data = Boston, scale = TRUE, validation = "CV")
validationplot(pcr_model, val.type = "MSEP")
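The CV-minimizing number of components can also be read off programmatically; this sketch assumes the MSEP object's usual layout with an intercept-only entry first (output not shown):
pcr_msep = MSEP(pcr_model)
which.min(pcr_msep$val["CV", 1, ]) - 1  # subtract 1 for the intercept-only entry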
Comparing the cross-validated errors of these models, best subset selection gives the best results. Its CV error curve drops sharply in going from 1 to 2 variables and bottoms out at the 12-variable model, so we would keep 12 of the 13 predictors; the selected model shown above drops age.
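To put the selected model to use, one could refit it with lm for interpretable coefficients and standard errors (a sketch; the 12-variable subset above excludes age):
final_model = lm(crim ~ . - age, data = Boston)
summary(final_model)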