knitr::opts_chunk$set(echo = TRUE)
library(ISLR2)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(e1071)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(pls)
##
## Attaching package: 'pls'
##
## The following object is masked from 'package:stats':
##
## loadings
For parts (a) through (c), indicate which of i. through iv. is correct. Justify your answer.

i. More flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
ii. More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.
iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
iv. Less flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.
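Before answering, it can help to see the trade-off numerically. The sketch below is not part of the exercise; the function f, the evaluation point, and the sample sizes are all made up for illustration. It fits a rigid model and a flexible model to 200 simulated training sets and estimates the squared bias and variance of each model's prediction at a single point.

set.seed(23)
f = function(x) sin(2 * x)   # assumed "true" regression function for the demo
x0 = data.frame(x = 0.5)     # point at which bias and variance are measured
preds = replicate(200, {
  x = runif(50, 0, 3)
  y = f(x) + rnorm(50, sd = 0.5)
  c(rigid = predict(lm(y ~ x), x0),               # inflexible: straight line
    flexible = predict(lm(y ~ poly(x, 10)), x0))  # flexible: degree-10 polynomial
})
# squared bias and variance of each model's prediction at x0
apply(preds, 1, function(p) c(bias_sq = (mean(p) - f(0.5))^2, variance = var(p)))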
In this exercise, we will predict the number of applications received using the other variables in the College data set.
College = na.omit(College)
set.seed(23)
train_index = sample(1:nrow(College), round(nrow(College) * 0.7))
train = College[train_index, ]
nrow(train)/nrow(College)
## [1] 0.7001287
test = College[-train_index, ]
nrow(test)/nrow(College)
## [1] 0.2998713
lm1 = lm(Apps ~ ., data = train)
summary(lm1)
##
## Call:
## lm(formula = Apps ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4208.4 -425.3 -33.0 333.6 6690.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -281.91608 512.77657 -0.550 0.582701
## PrivateYes -539.22843 179.89789 -2.997 0.002851 **
## Accept 1.69570 0.04968 34.130 < 2e-16 ***
## Enroll -1.20775 0.23009 -5.249 2.22e-07 ***
## Top10perc 56.39038 6.92669 8.141 2.86e-15 ***
## Top25perc -19.15110 5.69876 -3.361 0.000834 ***
## F.Undergrad 0.09308 0.04209 2.212 0.027427 *
## P.Undergrad -0.02399 0.04947 -0.485 0.627905
## Outstate -0.10720 0.02352 -4.557 6.46e-06 ***
## Room.Board 0.15524 0.05899 2.632 0.008748 **
## Books 0.02254 0.29756 0.076 0.939644
## Personal 0.06588 0.08240 0.800 0.424334
## PhD -6.51968 6.24054 -1.045 0.296627
## Terminal -7.69474 6.65873 -1.156 0.248375
## S.F.Ratio 12.80536 16.31116 0.785 0.432768
## perc.alumni -0.21073 5.31056 -0.040 0.968362
## Expend 0.10206 0.01648 6.191 1.20e-09 ***
## Grad.Rate 11.14624 3.70542 3.008 0.002755 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1078 on 526 degrees of freedom
## Multiple R-squared: 0.9282, Adjusted R-squared: 0.9259
## F-statistic: 400 on 17 and 526 DF, p-value: < 2.2e-16
pred_lm1 = predict(lm1, test)
(mse_lm1 = mean((pred_lm1 - test$Apps)^2))
## [1] 1081431
set.seed(23)
x = model.matrix(Apps ~ ., data = College)[, -1]
y = College$Apps
grid = 10^seq(10, -2, length = 100)
ridge_model = glmnet(x[train_index, ], y[train_index], alpha = 0, lambda = grid)
dim(coef(ridge_model))
## [1] 18 100
set.seed(23)
cv_output = cv.glmnet(x[train_index, ], y[train_index], alpha = 0)
plot(cv_output)
best_lambda = cv_output$lambda.min
best_lambda
## [1] 370.1777
ridge_pred = predict(ridge_model, s = best_lambda, newx = x[-train_index, ])
(mse_ridge = mean((ridge_pred - y[-train_index])^2))
## [1] 905098.6
lasso_model = glmnet(x[train_index, ], y[train_index], alpha = 1, lambda = grid)
plot(lasso_model)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
set.seed(23)
cv_output = cv.glmnet(x[train_index, ], y[train_index], alpha = 1)
plot(cv_output)
best_lambda = cv_output$lambda.min
best_lambda
## [1] 15.29579
lasso_pred = predict(lasso_model, s = best_lambda, newx = x[-train_index, ])
(mse_lasso = mean((lasso_pred - y[-train_index])^2))
## [1] 980202.4
The lasso's test MSE is slightly higher than ridge regression's, but it has the advantage of using fewer predictors.
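To see that sparsity concretely, a quick sketch listing the lasso coefficients that remain nonzero at the chosen lambda, using the lasso_model and best_lambda objects above:

lasso_coef = predict(lasso_model, s = best_lambda, type = "coefficients")
lasso_coef[lasso_coef[, 1] != 0, , drop = FALSE]  # nonzero coefficients only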
set.seed(23)
pcr_model = pcr(Apps ~ ., data = College, scale = TRUE, validation = "CV")
summary(pcr_model)
## Data: X dimension: 777 17
## Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3873 3836 2031 2033 1752 1569 1576
## adjCV 3873 3835 2029 2032 1658 1558 1573
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1576 1535 1498 1496 1501 1499 1506
## adjCV 1574 1531 1494 1493 1498 1496 1503
## 14 comps 15 comps 16 comps 17 comps
## CV 1507 1457 1190 1147
## adjCV 1504 1443 1183 1140
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 31.670 57.30 64.30 69.90 75.39 80.38 83.99 87.40
## Apps 2.316 73.06 73.07 82.08 84.08 84.11 84.32 85.18
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.50 92.91 95.01 96.81 97.9 98.75 99.36
## Apps 85.88 86.06 86.06 86.10 86.1 86.13 90.32
## 16 comps 17 comps
## X 99.84 100.00
## Apps 92.52 92.92
validationplot(pcr_model, val.type = "MSEP")
set.seed(23)
pcr_model = pcr(Apps ~ ., data = train, scale = T, validation = "CV")
validationplot(pcr_model, val.type = "MSEP")
pcr_pred = predict(pcr_model, x[-train_index, ], ncomp = 16)
(mse_pcr = mean((pcr_pred - y[-train_index])^2))
## [1] 1026276
pcr_model = pcr(y ~ x, scale = T, ncomp = 16)
summary(pcr_model)
## Data: X dimension: 777 17
## Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 16
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 31.670 57.30 64.30 69.90 75.39 80.38 83.99 87.40
## y 2.316 73.06 73.07 82.08 84.08 84.11 84.32 85.18
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.50 92.91 95.01 96.81 97.9 98.75 99.36
## y 85.88 86.06 86.06 86.10 86.1 86.13 90.32
## 16 comps
## X 99.84
## y 92.52
set.seed(23)
pls_model = plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
summary(pls_model)
## Data: X dimension: 544 17
## Y dimension: 544 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3962 2000 1747 1578 1502 1417 1304
## adjCV 3962 1995 1743 1570 1482 1393 1290
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1281 1262 1252 1244 1240 1237 1237
## adjCV 1268 1250 1241 1233 1230 1227 1227
## 14 comps 15 comps 16 comps 17 comps
## CV 1238 1239 1239 1239
## adjCV 1228 1228 1228 1228
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 26.23 43.75 62.91 65.69 69.03 73.79 77.56 80.84
## Apps 76.32 83.22 86.85 90.11 91.95 92.48 92.58 92.66
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.31 86.56 88.95 91.68 92.78 94.43 97.01
## Apps 92.73 92.77 92.80 92.81 92.82 92.82 92.82
## 16 comps 17 comps
## X 98.53 100.00
## Apps 92.82 92.82
pls_pred = predict(pls_model, x[-train_index, ], ncomp = 6)
(mse_pls = mean((pls_pred - y[-train_index])^2))
## [1] 1039176
pls_model = plsr(y ~ x, scale = T, ncomp = 6)
summary(pls_model)
## Data: X dimension: 777 17
## Y dimension: 777 1
## Fit method: kernelpls
## Number of components considered: 6
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## X 25.76 40.33 62.59 64.97 66.87 71.33
## y 78.01 85.14 87.67 90.73 92.63 92.72
results <- tibble(
  Model = c("Linear", "Ridge", "Lasso", "PCR", "PLS"),
  MSE = c(mse_lm1, mse_ridge, mse_lasso, mse_pcr, mse_pls)
)
print(results)
## # A tibble: 5 × 2
## Model MSE
## <chr> <dbl>
## 1 Linear 1081431.
## 2 Ridge 905099.
## 3 Lasso 980202.
## 4 PCR 1026276.
## 5 PLS 1039176.
Estimated accuracy of the ridge model (one minus the relative mean absolute error):
1 - mean(abs(y[-train_index] - ridge_pred))/mean(y[-train_index])
## [1] 0.8025739
Using the ridge regression model, we can predict the number of applications a college will receive with roughly 80% accuracy by this relative-error measure.
Ridge regression performed the best while ordinary linear regression performed the worst. The lasso was second to ridge and has the advantage of a sparser model. The test MSEs span roughly 905,000 (ridge) to 1,081,000 (linear), so there is a meaningful difference among the five approaches.
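For a scale-free comparison, we can also convert each test MSE to a test-set R-squared (one minus the MSE over the variance of the test response); a quick sketch using the mse_* objects computed above:

test_var = mean((y[-train_index] - mean(y[-train_index]))^2)  # variance of test Apps
round(1 - c(Linear = mse_lm1, Ridge = mse_ridge, Lasso = mse_lasso,
            PCR = mse_pcr, PLS = mse_pls) / test_var, 3)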
We will now try to predict per capita crime rate in the Boston data set.
library(leaps)
boston = na.omit(Boston)
names(boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "lstat" "medv"
dim(boston)
## [1] 506 13
bestsub_full = regsubsets(crim ~ ., data = boston, nvmax = 12)
summary(bestsub_full)
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = boston, nvmax = 12)
## 12 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: exhaustive
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" " "
## 3 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" "*"
## 4 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " "*" "*" " " " " " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" " " "*"
## 7 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" "*"
## 8 ( 1 ) "*" "*" " " "*" " " " " "*" "*" " " "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" " " "*" "*" "*"
## 10 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
sum_bestsub_full = summary(bestsub_full)
sum_bestsub_full$rsq
## [1] 0.3912567 0.4207965 0.4244920 0.4334892 0.4378328 0.4421077 0.4451514
## [8] 0.4470236 0.4482891 0.4487917 0.4493353 0.4493378
par(mfrow = c(2,2))
plot(sum_bestsub_full$rss, xlab = "Number of Variables", ylab = "RSS", type = "b")
plot(sum_bestsub_full$adjr2, xlab = "Number of Variables", ylab = "Adj Rsq", type = "b")
which.max(sum_bestsub_full$adjr2)
## [1] 9
points(9, sum_bestsub_full$adjr2[9], col = "red", cex = 2, pch = 20)
plot(sum_bestsub_full$cp, xlab = "Number of Variables", ylab = "Cp", type = "b")
which.min(sum_bestsub_full$cp)
## [1] 7
points(7, sum_bestsub_full$cp[7], col = "red", cex = 2, pch = 20)
which.min(sum_bestsub_full$bic)
## [1] 2
plot(sum_bestsub_full$bic, xlab = "Number of Variables", ylab = "BIC", type = "b")
points(2, sum_bestsub_full$bic[2], col = "red", cex = 2, pch = 20)
plot(bestsub_full, scale = "r2")
plot(bestsub_full, scale = "adjr2")
plot(bestsub_full, scale = "Cp")
plot(bestsub_full, scale = "bic")
bestsub_fwd = regsubsets(crim ~ ., data = boston, nvmax = 12, method = "forward")
summary(bestsub_fwd)
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = boston, nvmax = 12, method = "forward")
## 12 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: forward
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" " "
## 3 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" "*"
## 4 ( 1 ) " " " " " " " " " " " " " " "*" " " "*" "*" "*"
## 5 ( 1 ) " " " " " " " " "*" " " " " "*" " " "*" "*" "*"
## 6 ( 1 ) " " " " " " "*" "*" " " " " "*" " " "*" "*" "*"
## 7 ( 1 ) " " " " " " "*" "*" " " "*" "*" " " "*" "*" "*"
## 8 ( 1 ) "*" " " " " "*" "*" " " "*" "*" " " "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" " " "*" "*" "*"
## 10 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
bestsub_bwd = regsubsets(crim ~ ., data = boston, nvmax = 12, method = "backward")
summary(bestsub_bwd)
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = boston, nvmax = 12, method = "backward")
## 12 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: backward
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " " " "*" "*" " " " " " " "*"
## 4 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " " " "*"
## 5 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " " " " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" " " "*"
## 7 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" "*"
## 8 ( 1 ) "*" " " " " "*" "*" " " "*" "*" " " "*" "*" "*"
## 9 ( 1 ) "*" " " " " "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
Validation-set approach and cross-validation
set.seed(23)
bos_trainindex = sample(1:nrow(boston), round(nrow(boston) * 0.7))
bos_train = boston[bos_trainindex, ]
bos_test = boston[-bos_trainindex, ]
best_full = regsubsets(crim ~ ., data = bos_train, nvmax = 12)
bos_test_mat = model.matrix(crim ~ ., data = bos_test)
val_errors = rep(NA, 12)
for (i in 1:12) {
  coefi = coef(best_full, id = i)
  pred = bos_test_mat[, names(coefi)] %*% coefi
  val_errors[i] = mean((bos_test$crim - pred)^2)
}
val_errors
## [1] 71.82721 67.73263 67.91258 66.18935 65.72125 65.06243 65.08777 64.58699
## [9] 64.42018 64.30347 64.22391 64.22645
which.min(val_errors)
## [1] 11
coef(best_full, 11)
## (Intercept) zn indus chas nox rm
## 10.015776003 0.042874577 -0.034525266 -0.593856820 -8.910659924 0.861207153
## dis rad tax ptratio lstat medv
## -0.930892054 0.569182556 -0.003715808 -0.215010916 0.104085416 -0.215429594
# regsubsets has no predict() method, so we write one: rebuild the design
# matrix from the formula stored in the call and multiply by the coefficients
# of the size-id model.
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])  # formula from the original regsubsets() call
  mat = model.matrix(form, newdata)    # design matrix for the new data
  coefi = coef(object, id = id)        # coefficients of the id-variable model
  xvars = names(coefi)                 # columns used by that model
  mat[, xvars] %*% coefi               # predictions
}
best_full = regsubsets(crim ~ ., data = boston, nvmax = 12)
coef(best_full, 11)
## (Intercept) zn indus chas nox
## 13.801555544 0.045818028 -0.058345869 -0.828283847 -10.022404078
## rm dis rad tax ptratio
## 0.623650191 -1.008539667 0.612822152 -0.003782956 -0.304784434
## lstat medv
## 0.137698956 -0.220092318
k = 10
n = nrow(boston)
set.seed(23)
folds = sample(rep(1:k, length = n))
cv_errors = matrix(NA, k, 12, dimnames = list(NULL, paste(1:12)))
# 10-fold cross-validation: for each fold j, refit best subset selection on
# the remaining folds and record the held-out MSE for every model size.
for (j in 1:k) {
  best_fit = regsubsets(crim ~ ., data = boston[folds != j, ], nvmax = 12)
  for (i in 1:12) {
    pred = predict.regsubsets(best_fit, boston[folds == j, ], id = i)
    cv_errors[j, i] = mean((boston$crim[folds == j] - pred)^2)
  }
}
mean_cv_errors = apply(cv_errors, 2, mean)
mean_cv_errors
## 1 2 3 4 5 6 7 8
## 45.75570 43.92465 44.00890 43.72869 43.78591 43.39965 42.97987 42.79437
## 9 10 11 12
## 42.65616 42.68130 42.84731 42.85968
par(mfrow = c(1, 1))
plot(mean_cv_errors, type = "b")
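To highlight the selected model size, we can mark the minimum on the plot, following the same pattern as the points() calls used with the full-data subset plots above:

points(which.min(mean_cv_errors), min(mean_cv_errors), col = "red", cex = 2, pch = 20)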
Cross-validation selects a 9-variable model.
best_full = regsubsets(crim ~ ., data = boston, nvmax = 12)
coef(best_full, 9)
## (Intercept) zn indus nox rm dis
## 13.18247199 0.04305936 -0.08820604 -10.46823468 0.63510592 -1.00637216
## rad ptratio lstat medv
## 0.56096588 -0.30423849 0.14040855 -0.22012525
Minimum cross-validated MSE, with the models fit on the full data set:
mean_cv_errors[which.min(mean_cv_errors)]
## 9
## 42.65616
Test MSE from the validation-set approach, with the model fit on the training set:
mse_bestsub = val_errors[which.min(val_errors)]
mse_bestsub
## [1] 64.22391
set.seed(23)
x = model.matrix(crim ~ ., data = boston)[, -1]
y = boston$crim
grid = 10^seq(10, -2, length = 100)
ridge_model = glmnet(x[bos_trainindex, ], y[bos_trainindex], alpha = 0, lambda = grid)
dim(coef(ridge_model))
## [1] 13 100
set.seed(23)
cv_output = cv.glmnet(x[bos_trainindex, ], y[bos_trainindex], alpha = 0, lambda = grid, thresh = 1e-12)
plot(cv_output)
best_lambda = cv_output$lambda.min
best_lambda
## [1] 0.3764936
ridge_pred = predict(cv_output, s = best_lambda, newx = x[-bos_trainindex, ])
(mse_ridge = mean((ridge_pred - y[-bos_trainindex])^2))
## [1] 65.23585
bos_lasso_model = glmnet(x[bos_trainindex, ], y[bos_trainindex], alpha = 1, lambda = grid)
plot(bos_lasso_model)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
set.seed(23)
cv_output = cv.glmnet(x[bos_trainindex, ], y[bos_trainindex], alpha = 1, lambda = grid)
plot(cv_output)
best_lambda = cv_output$lambda.min
best_lambda
## [1] 0.1629751
bos_lasso_pred = predict(bos_lasso_model, s = best_lambda, newx = x[-bos_trainindex, ])
(mse_lasso = mean((bos_lasso_pred - y[-bos_trainindex])^2))
## [1] 66.95691
set.seed(23)
pcr_model = pcr(crim ~ ., data = boston, scale = TRUE, validation = "CV")
summary(pcr_model)
## Data: X dimension: 506 12
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 12
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.61 7.282 7.278 6.885 6.863 6.813 6.805
## adjCV 8.61 7.278 7.275 6.879 6.861 6.808 6.800
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps
## CV 6.687 6.704 6.693 6.689 6.673 6.593
## adjCV 6.679 6.697 6.685 6.681 6.662 6.581
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 49.93 63.64 72.94 80.21 86.83 90.26 92.79 94.99
## crim 29.39 29.55 37.39 37.85 38.85 39.23 41.73 41.82
## 9 comps 10 comps 11 comps 12 comps
## X 96.78 98.33 99.48 100.00
## crim 42.12 42.43 43.58 44.93
validationplot(pcr_model, val.type = "MSEP")
set.seed(23)
pcr_model = pcr(crim ~ ., data = bos_train, scale = T, validation = "CV")
validationplot(pcr_model, val.type = "MSEP")
pcr_pred = predict(pcr_model, x[-bos_trainindex, ], ncomp = 12)
(mse_pcr = mean((pcr_pred - y[-bos_trainindex])^2))
## [1] 64.22645
pcr_model = pcr(y ~ x, scale = T, ncomp = 12)
summary(pcr_model)
## Data: X dimension: 506 12
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 12
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 49.93 63.64 72.94 80.21 86.83 90.26 92.79 94.99
## y 29.39 29.55 37.39 37.85 38.85 39.23 41.73 41.82
## 9 comps 10 comps 11 comps 12 comps
## X 96.78 98.33 99.48 100.00
## y 42.12 42.43 43.58 44.93
set.seed(23)
pls_model = plsr(crim ~ ., data = bos_train, scale = TRUE, validation = "CV")
summary(pls_model)
## Data: X dimension: 354 12
## Y dimension: 354 1
## Fit method: kernelpls
## Number of components considered: 12
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 7.725 6.245 5.877 5.822 5.815 5.820 5.829
## adjCV 7.725 6.242 5.870 5.816 5.806 5.809 5.813
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps
## CV 5.819 5.802 5.798 5.799 5.800 5.800
## adjCV 5.805 5.789 5.785 5.786 5.787 5.787
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 49.33 59.31 67.36 74.81 82.51 84.49 87.63 91.91
## crim 35.67 44.27 45.90 46.89 47.25 47.78 47.86 47.89
## 9 comps 10 comps 11 comps 12 comps
## X 94.90 96.82 98.48 100.00
## crim 47.91 47.91 47.91 47.91
pls_pred = predict(pls_model, x[-bos_trainindex, ], ncomp = 4)
(mse_pls = mean((pls_pred - y[-bos_trainindex])^2))
## [1] 64.89516
pls_model = plsr(y ~ x, scale = T, ncomp = 4)
summary(pls_model)
## Data: X dimension: 506 12
## Y dimension: 506 1
## Fit method: kernelpls
## Number of components considered: 4
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps
## X 49.49 58.85 64.82 74.54
## y 33.00 41.20 43.32 44.04
coef(pls_model)
## , , 4 comps
##
## y
## zn 0.6988784546
## indus -1.2842038256
## chas -0.1727892697
## nox -0.5630332959
## rm 0.3995676697
## age -0.0006796827
## dis -1.4386617072
## rad 3.8109052016
## tax 1.1424698640
## ptratio -0.5901572705
## lstat 1.6847596784
## medv -1.4764633453
results <- tibble(
  Model = c("Best Subset", "Ridge", "Lasso", "PCR", "PLS"),
  MSE = round(c(mse_bestsub, mse_ridge, mse_lasso, mse_pcr, mse_pls), 2)
)
print(results)
## # A tibble: 5 × 2
## Model MSE
## <chr> <dbl>
## 1 Best Subset 64.2
## 2 Ridge 65.2
## 3 Lasso 67.0
## 4 PCR 64.2
## 5 PLS 64.9
The best subset model performed the best, though PCR with 12 components performed equally well. PLS came close while using far fewer components (4).
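A side note on why the PCR result ties the subset numbers: using all 12 principal components reproduces the ordinary least squares fit on the training set, so its test MSE should match the full 12-variable model's validation error (64.22645 above). A quick check:

lm_full = lm(crim ~ ., data = bos_train)  # full OLS fit on the training split
mean((predict(lm_full, bos_test) - bos_test$crim)^2)  # should match mse_pcr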
Based on the seed we used (23), I would pick the best subset model and the PLS model because they both had low MSEs, as evaluated by the validation-set approach or cross-validation. The advantage of PLS is that it needs fewer components; the advantage of best subset is its simplicity. The best subset model used 9 predictors, while PLS used 4 components built from combinations of all 12 predictors.
The best subset model did not involve all of the features, since it selected only 9 of them. PLS, by contrast, drew on all 12 features through its components.
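To see what those 4 components are made of, one can inspect the PLS loading weights, the linear combinations of the 12 scaled predictors that define each component; a quick sketch using the final pls_model:

round(pls_model$loading.weights, 2)  # 12 predictors x 4 components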