## Question 2: For parts (a) through (c), indicate which of i. through iv. is correct. Justify your answer.
using the other variables in the College data set.
library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(pls)
##
## Attaching package: 'pls'
##
## The following object is masked from 'package:caret':
##
## R2
##
## The following object is masked from 'package:stats':
##
## loadings
library(leaps)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## The following object is masked from 'package:ISLR2':
##
## Boston
data("College")
attach(College)
set.seed(2)
train <- sample(1:nrow(College), nrow(College)/2)
test <- -train
y.test <- Apps[test]
lm.fit <- lm(Apps ~., data=College, subset = train)
lm.pred <- predict(lm.fit, College[test,])
lm.error <- mean((lm.pred - y.test)^2)
lm.error
## [1] 1093608
model_mat <- model.matrix(Apps ~ ., data = College[train, ])
summary(model_mat)
## (Intercept) PrivateYes Accept Enroll
## Min. :1 Min. :0.0000 Min. : 72.0 Min. : 51.0
## 1st Qu.:1 1st Qu.:0.0000 1st Qu.: 561.5 1st Qu.: 239.0
## Median :1 Median :1.0000 Median : 1218.5 Median : 438.0
## Mean :1 Mean :0.7216 Mean : 2191.4 Mean : 835.5
## 3rd Qu.:1 3rd Qu.:1.0000 3rd Qu.: 2669.0 3rd Qu.: 947.0
## Max. :1 Max. :1.0000 Max. :26330.0 Max. :6180.0
## Top10perc Top25perc F.Undergrad P.Undergrad
## Min. : 1.00 Min. : 9.00 Min. : 139.0 Min. : 1.00
## 1st Qu.:16.00 1st Qu.: 42.00 1st Qu.: 970.2 1st Qu.: 98.75
## Median :25.00 Median : 55.50 Median : 1696.5 Median : 390.00
## Mean :28.07 Mean : 57.01 Mean : 4011.0 Mean : 991.18
## 3rd Qu.:36.00 3rd Qu.: 70.25 3rd Qu.: 4598.0 3rd Qu.: 1107.25
## Max. :95.00 Max. :100.00 Max. :28938.0 Max. :21836.00
## Outstate Room.Board Books Personal
## Min. : 2340 Min. :2146 Min. : 120.0 Min. : 300
## 1st Qu.: 7344 1st Qu.:3600 1st Qu.: 498.8 1st Qu.: 850
## Median :10280 Median :4232 Median : 519.0 Median :1230
## Mean :10545 Mean :4402 Mean : 547.2 Mean :1373
## 3rd Qu.:13035 3rd Qu.:5088 3rd Qu.: 600.0 3rd Qu.:1700
## Max. :21700 Max. :8124 Max. :1300.0 Max. :6800
## PhD Terminal S.F.Ratio perc.alumni
## Min. : 16.00 Min. : 25.00 Min. : 2.90 Min. : 2.00
## 1st Qu.: 63.00 1st Qu.: 72.00 1st Qu.:11.30 1st Qu.:14.00
## Median : 76.00 Median : 83.00 Median :13.50 Median :21.00
## Mean : 73.19 Mean : 80.47 Mean :13.91 Mean :23.18
## 3rd Qu.: 86.00 3rd Qu.: 92.00 3rd Qu.:16.43 3rd Qu.:31.00
## Max. :103.00 Max. :100.00 Max. :27.60 Max. :63.00
## Expend Grad.Rate
## Min. : 3480 Min. : 10.00
## 1st Qu.: 6864 1st Qu.: 54.00
## Median : 8604 Median : 65.00
## Mean : 9687 Mean : 65.95
## 3rd Qu.:10932 3rd Qu.: 78.00
## Max. :45702 Max. :118.00
ridge <- cv.glmnet(model_mat, College$Apps[train], thresh = 1e-12)
p <- predict(ridge, model.matrix(Apps ~ ., data = College[test, ]), s = ridge$lambda.min)
(mean((p - College$Apps[test])^2))
## [1] 1082899
x <- model.matrix(Apps~., data=College)[, -1]
x.train <- x[train,]
y <- College$Apps
y.train <- y[train]
set.seed(42)
train <- sample(nrow(College), nrow(College) * .9)
test <- setdiff(seq_len(nrow(College)), train)
mse <- list()
fit <- lm(Apps ~ ., data = College[train, ])
summary(fit)
##
## Call:
## lm(formula = Apps ~ ., data = College[train, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -4933.5 -423.1 -45.0 328.9 7817.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -562.12834 449.82870 -1.250 0.211857
## PrivateYes -526.51151 152.06294 -3.462 0.000569 ***
## Accept 1.58621 0.04385 36.174 < 2e-16 ***
## Enroll -0.90689 0.19489 -4.653 3.92e-06 ***
## Top10perc 52.95003 5.99742 8.829 < 2e-16 ***
## Top25perc -16.36255 4.81822 -3.396 0.000724 ***
## F.Undergrad 0.06341 0.03513 1.805 0.071535 .
## P.Undergrad 0.04177 0.03394 1.231 0.218877
## Outstate -0.08228 0.02068 -3.978 7.69e-05 ***
## Room.Board 0.17247 0.05231 3.297 0.001028 **
## Books 0.05421 0.25740 0.211 0.833262
## Personal 0.02549 0.06766 0.377 0.706431
## PhD -10.33329 5.20327 -1.986 0.047443 *
## Terminal -2.22350 5.73238 -0.388 0.698223
## S.F.Ratio 19.26847 14.26146 1.351 0.177116
## perc.alumni -0.50686 4.60830 -0.110 0.912450
## Expend 0.07885 0.01311 6.013 2.97e-09 ***
## Grad.Rate 9.06841 3.22532 2.812 0.005071 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1069 on 681 degrees of freedom
## Multiple R-squared: 0.93, Adjusted R-squared: 0.9283
## F-statistic: 532.4 on 17 and 681 DF, p-value: < 2.2e-16
mean((predict(fit, College[test, ]) - College$Apps[test])^2)
## [1] 588882.2
set.seed(42)
model_mat <- model.matrix(Apps ~ ., data = College[train, ])
summary(model_mat)
## (Intercept) PrivateYes Accept Enroll Top10perc
## Min. :1 Min. :0.0000 Min. : 90 Min. : 35.0 Min. : 1.00
## 1st Qu.:1 1st Qu.:0.0000 1st Qu.: 615 1st Qu.: 245.0 1st Qu.:15.00
## Median :1 Median :1.0000 Median : 1113 Median : 438.0 Median :24.00
## Mean :1 Mean :0.7268 Mean : 2046 Mean : 786.8 Mean :27.85
## 3rd Qu.:1 3rd Qu.:1.0000 3rd Qu.: 2413 3rd Qu.: 902.5 3rd Qu.:35.50
## Max. :1 Max. :1.0000 Max. :26330 Max. :6392.0 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.00 Min. : 199 Min. : 1.0 Min. : 2580
## 1st Qu.: 41.50 1st Qu.: 1002 1st Qu.: 94.0 1st Qu.: 7392
## Median : 55.00 Median : 1715 Median : 350.0 Median :10194
## Mean : 56.23 Mean : 3732 Mean : 873.6 Mean :10537
## 3rd Qu.: 69.00 3rd Qu.: 4056 3rd Qu.: 976.0 3rd Qu.:13015
## Max. :100.00 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1880 Min. : 96.0 Min. : 250 Min. : 8.0
## 1st Qu.:3598 1st Qu.: 450.0 1st Qu.: 850 1st Qu.: 62.0
## Median :4210 Median : 500.0 Median :1200 Median : 75.0
## Mean :4374 Mean : 549.9 Mean :1339 Mean : 72.9
## 3rd Qu.:5056 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 86.0
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.0
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 25.00 Min. : 2.90 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.00 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6780
## Median : 82.00 Median :13.50 Median :21.00 Median : 8471
## Mean : 79.95 Mean :14.08 Mean :22.97 Mean : 9739
## 3rd Qu.: 92.00 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10890
## Max. :100.00 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.0
## 1st Qu.: 54.0
## Median : 66.0
## Mean : 65.9
## 3rd Qu.: 78.0
## Max. :118.0
lasso <- cv.glmnet(model_mat, College$Apps[train], alpha = 1, thresh = 1e-12)
p <- predict(lasso, model.matrix(Apps ~ ., data = College[test, ]), s = lasso$lambda.min)
coef_values <- coef(lasso)
non_zero_count <- sum(coef_values != 0)
print(non_zero_count)
## [1] 4
(lasso <- mean((p - College$Apps[test])^2))
## [1] 538462.5
library(pls)
library(ISLR2)
attach(College)
## The following objects are masked from College (pos = 3):
##
## Accept, Apps, Books, Enroll, Expend, F.Undergrad, Grad.Rate,
## Outstate, P.Undergrad, perc.alumni, Personal, PhD, Private,
## Room.Board, S.F.Ratio, Terminal, Top10perc, Top25perc
x=model.matrix(Apps~.,College)[,-1]
y=College$Apps
set.seed(10)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
College.train = College[train, ]
College.test = College[test, ]
y.test=y[test]
pcr.college=pcr(Apps~., data=College.train,scale=TRUE,validation="CV")
summary(pcr.college)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4347 4334 2384 2404 2015 1961 1889
## adjCV 4347 4335 2379 2404 1994 1953 1883
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1891 1862 1852 1851 1857 1864 1889
## adjCV 1885 1845 1846 1844 1850 1857 1883
## 14 comps 15 comps 16 comps 17 comps
## CV 1920 1694 1308 1279
## adjCV 1988 1643 1296 1266
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 32.6794 56.94 64.38 70.61 76.27 80.97 84.48 87.54
## Apps 0.9148 71.17 71.36 79.85 81.49 82.73 82.79 83.70
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.50 92.89 94.96 96.81 97.97 98.73 99.39
## Apps 83.86 84.08 84.11 84.11 84.16 84.28 93.08
## 16 comps 17 comps
## X 99.86 100.00
## Apps 93.71 93.95
validationplot(pcr.college, val.type="MSEP")
pcr.pred=predict(pcr.college,x[test,],ncomp=10)
mean((pcr.pred-y.test)^2)
## [1] 1422699
pls.college=plsr(Apps~., data=College.train,scale=TRUE, validation="CV")
validationplot(pls.college, val.type="MSEP")
summary(pls.college)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4347 2172 1874 1760 1623 1473 1358
## adjCV 4347 2167 1865 1750 1595 1444 1341
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1342 1333 1328 1320 1319 1320 1320
## adjCV 1327 1319 1314 1305 1305 1306 1306
## 14 comps 15 comps 16 comps 17 comps
## CV 1320 1320 1320 1320
## adjCV 1305 1306 1305 1305
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 24.27 38.72 62.64 65.26 69.01 73.96 78.86 82.18
## Apps 76.96 84.31 86.80 91.48 93.37 93.75 93.81 93.84
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 85.35 87.42 89.18 91.41 92.70 94.58 97.16
## Apps 93.88 93.91 93.93 93.94 93.95 93.95 93.95
## 16 comps 17 comps
## X 98.15 100.00
## Apps 93.95 93.95
pls.pred=predict(pls.college,x[test,],ncomp=9)
mean((pls.pred-y.test)^2)
## [1] 1049868
# Load libraries
library(MASS) # For Boston dataset
library(ggplot2) # For plotting
library(GGally) # For correlation matrix plots
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
data(Boston)
attach(Boston)
pairs(Boston)
ggpairs(Boston[, c("crim", "rm", "age", "dis", "tax", "medv")])
cor(Boston)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## black lstat medv
## crim -0.38506394 0.4556215 -0.3883046
## zn 0.17552032 -0.4129946 0.3604453
## indus -0.35697654 0.6037997 -0.4837252
## chas 0.04878848 -0.0539293 0.1752602
## nox -0.38005064 0.5908789 -0.4273208
## rm 0.12806864 -0.6138083 0.6953599
## age -0.27353398 0.6023385 -0.3769546
## dis 0.29151167 -0.4969958 0.2499287
## rad -0.44441282 0.4886763 -0.3816262
## tax -0.44180801 0.5439934 -0.4685359
## ptratio -0.17738330 0.3740443 -0.5077867
## black 1.00000000 -0.3660869 0.3334608
## lstat -0.36608690 1.0000000 -0.7376627
## medv 0.33346082 -0.7376627 1.0000000
set.seed(42)
train <- sample(1:nrow(Boston), nrow(Boston)*0.70)
test <- -train
y.test <- crim[test]
lm.fit <- lm(crim~., data=Boston, subset=train)
summary(lm.fit)
##
## Call:
## lm(formula = crim ~ ., data = Boston, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.598 -2.475 -0.342 1.139 73.341
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.813866 9.711813 2.040 0.04210 *
## zn 0.052731 0.026025 2.026 0.04353 *
## indus -0.066322 0.104379 -0.635 0.52560
## chas -0.809503 1.652821 -0.490 0.62461
## nox -12.608230 6.983619 -1.805 0.07190 .
## rm 0.876485 0.757534 1.157 0.24807
## age -0.009146 0.022781 -0.401 0.68832
## dis -1.219266 0.387667 -3.145 0.00181 **
## rad 0.688041 0.115323 5.966 6.1e-09 ***
## tax -0.007097 0.006754 -1.051 0.29411
## ptratio -0.373803 0.244075 -1.532 0.12657
## black -0.007415 0.005466 -1.357 0.17582
## lstat 0.186500 0.096357 1.936 0.05376 .
## medv -0.246119 0.076966 -3.198 0.00152 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.036 on 340 degrees of freedom
## Multiple R-squared: 0.4392, Adjusted R-squared: 0.4178
## F-statistic: 20.48 on 13 and 340 DF, p-value: < 2.2e-16
lm.pred <- predict(lm.fit, Boston[test,])
lm.error <- mean((lm.pred - y.test)^2)
lm.error
## [1] 25.13525
# Choose the model using cross-validation and the best subset method
set.seed(1)
predict.regsubsets <- function(object, newdata, id, ...) {
form <- as.formula(object$call[[2]])
mat <- model.matrix(form, newdata)
coefi <- coef(object, id = id)
xvars <- names(coefi)
mat[, xvars] %*% coefi
}
k = 10
folds <- sample(1:k, nrow(Boston), replace = TRUE)
cv.errors <- matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
for (j in 1:k) {
best.fit <- regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 13)
for (i in 1:13) {
pred <- predict(best.fit, Boston[folds == j, ], id = i)
cv.errors[j, i] <- mean((Boston$crim[folds == j] - pred)^2)
}
}
mean.cv.errors <- apply(cv.errors, 2, mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 45.44573 43.87260 43.94979 44.02424 43.96415 43.96199 42.96268 42.66948
## 9 10 11 12 13
## 42.53822 42.73416 42.52367 42.46014 42.50125
plot(mean.cv.errors, type="b")
points(which.min(mean.cv.errors), mean.cv.errors[12], pch=20, col='red', cex=2)
min(mean.cv.errors)
## [1] 42.46014
x <- model.matrix(crim~., Boston)[, -1]
y <- Boston$crim
lasso.fit <- glmnet(x[train, ], y[train], alpha=1)
cv.lasso.fit <- cv.glmnet(x[train, ], y[train], alpha=1)
plot(cv.lasso.fit)
bestlam.lasso <- cv.lasso.fit$lambda.min
bestlam.lasso
## [1] 0.0157584
lasso.pred <- predict(lasso.fit, s=bestlam.lasso, newx=x[test,])
lasso.error <- mean((lasso.pred-y[test])^2)
lasso.error
## [1] 24.67009
ridge.fit <- glmnet(x[train, ], y[train], alpha=0)
cv.ridge.fit <- cv.glmnet(x[train, ], y[train], alpha=0)
plot(cv.ridge.fit)
bestlam.ridge <- cv.ridge.fit$lambda.min
bestlam.ridge
## [1] 0.5533077
ridge.pred <- predict(ridge.fit, s=bestlam.ridge, newx=x[test,])
ridge.error <- mean((ridge.pred-y[test])^2)
ridge.error
## [1] 23.17563
set.seed(4)
pcr.fit <- pcr(crim~., data=Boston, subset=train, scale=TRUE, validation ="CV")
summary(pcr.fit)
## Data: X dimension: 354 13
## Y dimension: 354 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 9.235 7.894 7.896 7.478 7.501 7.515 7.525
## adjCV 9.235 7.888 7.890 7.467 7.493 7.506 7.515
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 7.496 7.383 7.422 7.395 7.405 7.345 7.220
## adjCV 7.485 7.366 7.408 7.380 7.389 7.328 7.203
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 45.82 59.29 68.21 75.55 82.5 87.54 90.79 93.16
## crim 28.55 28.90 36.20 36.29 36.4 36.69 37.53 39.76
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.27 97.04 98.46 99.53 100.00
## crim 40.07 40.57 40.74 42.06 43.92
validationplot(pcr.fit, val.type = "MSEP")
pcr.pred <- predict(pcr.fit,x[test,], ncomp=13)
pcr.error <- mean((pcr.pred-y.test)^2)
pcr.error
## [1] 25.13525
errors <- c(lm.error, ridge.error, lasso.error, pcr.error,min(mean.cv.errors))
names(errors) <- c("lm", "ridge", "lasso", "pcr", "cross+best")
print(sort(errors))
## ridge lasso pcr lm cross+best
## 23.17563 24.67009 25.13525 25.13525 42.46014
#(b) Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, crossvalidation, or some other reasonable alternative, as opposed to using training error.
#I would suggest that the optimal model is the linear model with the best subset of 3 predictors based on the information in part (a) above. Despite having less test error than the three variable linear model, the Lasso and Ridge models I’ve shown above still have 12 and 13 predictors, respectively. I find it difficult to justify the reduction in test error when compared to adding 9–10 extra variables.
#(c) Does your chosen model involve all of the features in the data set? Why or why not?
#No, not all feature is involved. This method of selection uses the best subset. It contains three of the 13 potential predictors, as was already mentioned.