Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector ϵ of length n = 100.
Generate a response vector Y of length n = 100 according to the model Y = β0 + β1X + β2X² + β3X³ + ϵ, where β0, β1, β2, and β3 are constants of your choice.
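The code below relies on packages that are presumably loaded earlier in the full document; for a self-contained run, something like the following would be needed (the exact list is an assumption on my part):

# Assumed prerequisites (loaded earlier in the full document):
library(leaps)   # regsubsets()
library(glmnet)  # glmnet(), cv.glmnet()
library(pls)     # pcr(), plsr()
library(ISLR2)   # College data used below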
set.seed(1)
x<-rnorm(100)
esp<-rnorm(100)
bo<-3
b1<-2.5
b2<-2
b3<-3
y<-bo + b1*x + b2*x^2 + b3*x^3 + esp
data<-data.frame(x, x^2, x^3, x^4, x^5, x^6, x^7, x^8, x^9, x^10, y)
fit<- regsubsets(y ~ ., data = data, nvmax = 10)
fit_summary<-summary(fit)
min_bic_index<-which.min(fit_summary$bic)
min_cp_index<-which.min(fit_summary$cp)
max_adjr2_index<-which.max(fit_summary$adjr2)
# Plot BIC, Cp, ADJR2
par(mfrow = c(2, 2))
plot(fit_summary$bic, type = "l", xlab = "Number of Variables", ylab = "BIC", main = "BIC vs Number of Variables")
points(min_bic_index, fit_summary$bic[min_bic_index], col = "red", pch = 16)
plot(fit_summary$cp, type = "l", xlab = "Number of Variables", ylab = "Cp", main = "Cp vs Number of Variables")
points(min_cp_index, fit_summary$cp[min_cp_index], col = "red", pch = 16)
plot(fit_summary$adjr2, type = "l", xlab = "Number of Variables", ylab = "Adjusted R-squared", main = "Adjusted R-squared vs Number of Variables")
points(max_adjr2_index, fit_summary$adjr2[max_adjr2_index], col = "red", pch = 16)
Model performance improves dramatically once the third predictor is included. Depending on the criterion, the optimal model size falls between 3 and 8 variables, but the gains beyond three variables are marginal compared with the initial improvement at the third variable.
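As a quick sanity check (not part of the original output), the coefficients of the three-variable model can be compared with the true values β0 = 3, β1 = 2.5, β2 = 2, β3 = 3:

# coefficients of the best three-variable subset (x, x^2, x^3 expected)
coef(fit, 3)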
regfit.fwd <- regsubsets(y ~ ., data = data, nvmax = 10, method = "forward")
fwd_summary<-summary(regfit.fwd)
fwd_min_bic <- which.min(fwd_summary$bic)
fwd_min_cp<- which.min(fwd_summary$cp)
fwd_max_adjr2<- which.max(fwd_summary$adjr2)
# Plot BIC, Cp, Adjusted R-squared
par(mfrow = c(2, 2))
plot(fwd_summary$bic, type = "b", xlab = "Number of Variables", ylab = "BIC", main = "BIC vs Number of Variables")
points(fwd_min_bic, fwd_summary$bic[fwd_min_bic], col = "red", pch = 16)
plot(fwd_summary$cp, type = "b", xlab = "Number of Variables", ylab = "Cp", main = "Cp vs Number of Variables")
points(fwd_min_cp, fwd_summary$cp[fwd_min_cp], col = "red", pch = 16)
plot(fwd_summary$adjr2, type = "b", xlab = "Number of Variables", ylab = "Adjusted R-squared", main = "Adjusted R-squared vs Number of Variables")
points(fwd_max_adjr2, fwd_summary$adjr2[fwd_max_adjr2], col = "red", pch = 16)
regfit.bwd <- regsubsets(y ~ ., data = data, nvmax = 10, method = "backward")
bwd.summary<-summary(regfit.bwd)
bwd_min_bic <-which.min(bwd.summary$bic)
bwd_min_cp<- which.min(bwd.summary$cp)
bwd_max_adjr2<-which.max(bwd.summary$adjr2)
# Plot BIC, Cp, Adjusted R-squared
par(mfrow = c(2, 2))
plot(bwd.summary$bic, type = "b", xlab = "Number of Variables", ylab = "BIC", main = "BIC vs Number of Variables")
points(bwd_min_bic, bwd.summary$bic[bwd_min_bic], col = "red", pch = 16)
plot(bwd.summary$cp, type = "b", xlab = "Number of Variables", ylab = "Cp", main = "Cp vs Number of Variables")
points(bwd_min_cp, bwd.summary$cp[bwd_min_cp], col = "red", pch = 16)
plot(bwd.summary$adjr2, type = "b", xlab = "Number of Variables", ylab = "Adjusted R-squared", main = "Adjusted R-squared vs Number of Variables")
points(bwd_max_adjr2, bwd.summary$adjr2[bwd_max_adjr2], col = "red", pch = 16)
The results are fairly consistent across the three selection methods, with the optimal size ranging from 3 to 6 variables depending on the criterion; in every case the largest improvement comes from adding the third variable.
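To make that comparison explicit, the criterion-chosen sizes for the three search methods can be gathered into one small table (a sketch; the values depend on the fits above):

# selected model size by criterion and search method
data.frame(criterion  = c("BIC", "Cp", "AdjR2"),
           exhaustive = c(min_bic_index, min_cp_index, max_adjr2_index),
           forward    = c(fwd_min_bic, fwd_min_cp, fwd_max_adjr2),
           backward   = c(bwd_min_bic, bwd_min_cp, bwd_max_adjr2))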
set.seed(123)
train.x<-as.matrix(data[,-11])
train.y<-as.matrix(data[,11])
lasso<-glmnet(train.x, train.y, alpha = 1)
plot(lasso)
cv.out<-cv.glmnet(train.x, train.y, alpha = 1)
plot(cv.out)
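The coefficients at the cross-validated lambda can then be pulled out to see which polynomial terms the lasso keeps (a sketch; output not shown here):

# lasso coefficients at the CV-selected lambda
predict(lasso, s = cv.out$lambda.min, type = "coefficients")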
b7<-2
new_y<-bo + b7*x^7 + esp
data2<-data.frame(x, x^2, x^3, x^4, x^5, x^6, x^7, x^8, x^9, x^10, new_y)
regfit2 <- regsubsets(new_y ~ ., data = data2, nvmax = 10)
new_summary<-summary(regfit2)
new_min_bic<-which.min(new_summary$bic)
new_min_cp<-which.min(new_summary$cp)
new_max_adjr2<-which.max(new_summary$adjr2)
#coefficients
coef(regfit2, new_min_bic)
## (Intercept) x.7
## 2.95894 2.00077
coef(regfit2, new_min_cp)
## (Intercept) x.2 x.7
## 3.0704904 -0.1417084 2.0015552
coef(regfit2, new_max_adjr2)
## (Intercept) x x.2 x.3 x.7
## 3.0762524 0.2914016 -0.1617671 -0.2526527 2.0091338
# Plot BIC, Cp, Adjusted R-squared
par(mfrow = c(2, 2))
plot(new_summary$bic, type = "b", xlab = "Number of Variables", ylab = "BIC", main = "BIC vs Number of Variables")
points(new_min_bic, new_summary$bic[new_min_bic], col = "red", pch = 16)
plot(new_summary$cp, type = "b", xlab = "Number of Variables", ylab = "Cp", main = "Cp vs Number of Variables")
points(new_min_cp, new_summary$cp[new_min_cp], col = "red", pch = 16)
plot(new_summary$adjr2, type = "b", xlab = "Number of Variables", ylab = "Adjusted R-squared", main = "Adjusted R-squared vs Number of Variables")
points(new_max_adjr2, new_summary$adjr2[new_max_adjr2], col = "red", pch = 16)
set.seed(123)
train.x2<-as.matrix(data2[,-11])
train.y2<-as.matrix(data2[,11])
lasso2<-glmnet(train.x2, train.y2, alpha = 1)
cv.out2<-cv.glmnet(train.x2, train.y2, alpha = 1)
plot(cv.out2)
best_lambda = cv.out2$lambda.min
predict(lasso2, s=best_lambda, type="coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.205086
## x .
## x.2 .
## x.3 .
## x.4 .
## x.5 .
## x.6 .
## x.7 1.942447
## x.8 .
## x.9 .
## x.10 .
To recap, β0 was set to 3 and β7 to 2, and the response was defined using only those two terms; these are exactly the terms the lasso selects. The lasso intercept estimate (about 3.21) is somewhat above the true β0, while its β7 estimate (about 1.94) is quite close to the true value.
The best subset approach selects between one and four predictors depending on the criterion (one under BIC, two under Cp, four under adjusted R²). Its intercept estimates are generally closer to the true intercept, and its β7 estimates are also very close to the true value.
attach(College)
x <-model.matrix(Apps ~ ., College)[, -1]
y <-College$Apps
set.seed (1)
train <-sample(1: nrow(College), nrow(College)/2)
test<-(-train)
y.test<-Apps[test]
col_lm<-lm(Apps ~ ., data = College, subset = train)
grid<-10^seq(10, -2, length = 100)
summary(col_lm)
##
## Call:
## lm(formula = Apps ~ ., data = College, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5741.2 -479.5 15.3 359.6 7258.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.902e+02 6.381e+02 -1.238 0.216410
## PrivateYes -3.070e+02 2.006e+02 -1.531 0.126736
## Accept 1.779e+00 5.420e-02 32.830 < 2e-16 ***
## Enroll -1.470e+00 3.115e-01 -4.720 3.35e-06 ***
## Top10perc 6.673e+01 8.310e+00 8.030 1.31e-14 ***
## Top25perc -2.231e+01 6.533e+00 -3.415 0.000708 ***
## F.Undergrad 9.269e-02 5.529e-02 1.676 0.094538 .
## P.Undergrad 9.397e-03 5.493e-02 0.171 0.864275
## Outstate -1.084e-01 2.700e-02 -4.014 7.22e-05 ***
## Room.Board 2.115e-01 7.224e-02 2.928 0.003622 **
## Books 2.912e-01 3.985e-01 0.731 0.465399
## Personal 6.133e-03 8.803e-02 0.070 0.944497
## PhD -1.548e+01 6.681e+00 -2.316 0.021082 *
## Terminal 6.415e+00 7.290e+00 0.880 0.379470
## S.F.Ratio 2.283e+01 2.047e+01 1.115 0.265526
## perc.alumni 1.134e+00 6.083e+00 0.186 0.852274
## Expend 4.857e-02 1.619e-02 2.999 0.002890 **
## Grad.Rate 7.490e+00 4.397e+00 1.703 0.089324 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1083 on 370 degrees of freedom
## Multiple R-squared: 0.9389, Adjusted R-squared: 0.9361
## F-statistic: 334.3 on 17 and 370 DF, p-value: < 2.2e-16
pred<-predict(col_lm, College[test,])
mean((pred - y.test)^2)
## [1] 1135758
The least squares test MSE, shown above, is 1135758.
set.seed (1)
ridge.mod<- glmnet(x[train , ], y[train], alpha = 0, lambda = grid , thresh = 1e-12)
cv.out <- cv.glmnet(x[train , ], y[train], alpha = 0)
plot(cv.out)
bestlam <- cv.out$lambda.min
bestlam
## [1] 405.8404
ridge.pred <- predict(ridge.mod , s = bestlam ,
newx = x[test , ])
mean((ridge.pred - y.test)^2)
## [1] 976647.8
The best lambda, reported above, is approximately 406, and the resulting ridge test MSE of 976648 is somewhat smaller than the least squares test error of 1135758.
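Since ridge keeps every predictor, its coefficients at the selected lambda can be inspected directly if desired (a sketch, not in the original output):

# ridge coefficients at the CV-selected lambda; all 17 predictors remain non-zero
predict(ridge.mod, s = bestlam, type = "coefficients")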
lasso.mod <- glmnet(x[train , ], y[train], alpha = 1,
lambda = grid)
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
set.seed (1)
cv.out <- cv.glmnet(x[train , ], y[train], alpha = 1)
plot(cv.out)
bestlam <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod , s = bestlam ,
newx = x[test , ])
mean ((lasso.pred - y.test)^2)
## [1] 1116252
The lasso test MSE of 1116252 is higher than that of ridge regression but marginally lower than that of least squares.
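To see how much variable selection the lasso actually performs here, the coefficients at the chosen lambda could be listed; zeroed entries print as "." (a sketch, not in the original output):

# lasso coefficients at the CV-selected lambda
predict(lasso.mod, s = bestlam, type = "coefficients")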
set.seed (2)
pcr.fit <- pcr(Apps ~ ., data = College , scale = TRUE ,
validation = "CV")
summary(pcr.fit)
## Data: X dimension: 777 17
## Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3873 3831 2030 2037 1720 1595 1594
## adjCV 3873 3831 2028 2037 1688 1586 1590
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1583 1548 1507 1508 1516 1516 1520
## adjCV 1584 1543 1503 1505 1512 1512 1516
## 14 comps 15 comps 16 comps 17 comps
## CV 1523 1416 1169 1146
## adjCV 1519 1403 1163 1139
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 31.670 57.30 64.30 69.90 75.39 80.38 83.99 87.40
## Apps 2.316 73.06 73.07 82.08 84.08 84.11 84.32 85.18
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.50 92.91 95.01 96.81 97.9 98.75 99.36
## Apps 85.88 86.06 86.06 86.10 86.1 86.13 90.32
## 16 comps 17 comps
## X 99.84 100.00
## Apps 92.52 92.92
set.seed (2)
pcr.fit <- pcr(Apps ~ ., data = College , subset = train ,
scale = TRUE , validation = "CV")
validationplot(pcr.fit , val.type = "MSEP")
pcr.pred <- predict(pcr.fit , x[test , ], ncomp = 17)
mean(( pcr.pred - y.test)^2)
## [1] 1135758
With all 17 components retained, PCR reproduces the full least squares fit, so its test error (1135758) matches least squares and is higher than that of ridge and the lasso.
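A fairer comparison would use the number of components suggested by cross-validation rather than all 17. A sketch, assuming the pls package's MSEP() helper:

# pick M with the smallest cross-validated MSEP (index 1 is the intercept-only model)
cv.msep <- MSEP(pcr.fit, estimate = "CV")
best.M  <- which.min(cv.msep$val) - 1
pcr.pred2 <- predict(pcr.fit, x[test, ], ncomp = best.M)
mean((pcr.pred2 - y.test)^2)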
set.seed (1)
pls.fit <- plsr(Apps ~ ., data = College, subset = train ,
scale = TRUE , validation = "CV")
summary(pls.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4288 2217 2019 1761 1630 1533 1347
## adjCV 4288 2211 2012 1749 1605 1510 1331
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1309 1303 1286 1283 1283 1277 1271
## adjCV 1296 1289 1273 1270 1270 1264 1258
## 14 comps 15 comps 16 comps 17 comps
## CV 1270 1270 1270 1270
## adjCV 1258 1257 1257 1257
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 27.21 50.73 63.06 65.52 70.20 74.20 78.62 80.81
## Apps 75.39 81.24 86.97 91.14 92.62 93.43 93.56 93.68
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.29 87.17 89.15 91.37 92.58 94.42 96.98
## Apps 93.76 93.79 93.83 93.86 93.88 93.89 93.89
## 16 comps 17 comps
## X 98.78 100.00
## Apps 93.89 93.89
pls.pred <- predict(pls.fit , x[test , ], ncomp = 15)
mean (( pls.pred - y.test)^2)
## [1] 1135806
pls.fit <- plsr(Apps ~ ., data = College, scale = TRUE ,
ncomp = 1)
summary(pls.fit)
## Data: X dimension: 777 17
## Y dimension: 777 1
## Fit method: kernelpls
## Number of components considered: 1
## TRAINING: % variance explained
## 1 comps
## X 25.76
## Apps 78.01
SS_residual<- sum((y.test - ridge.pred)^2)
SS_total<- sum((y.test - mean(y.test))^2)
R_squared<- 1 - (SS_residual / SS_total)
R_squared
## [1] 0.9153346
Ridge regression performed best of all the models, with the lowest test error of those reported above (the PLS error of 1135806 is essentially the same as least squares). The test R² of about 0.92 indicates the ridge predictions capture roughly 91.5% of the variance in the held-out Apps values.
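For a single side-by-side view (not part of the original analysis), the test R² of each method can be computed with a small helper, reusing the hold-out predictions already in scope; test.r2 is a hypothetical helper name:

# test-set R^2 for each College model, using the predictions computed above
test.r2 <- function(p) 1 - sum((y.test - p)^2) / sum((y.test - mean(y.test))^2)
sapply(list(lm = pred, ridge = ridge.pred, lasso = lasso.pred,
            pcr = pcr.pred, pls = pls.pred), test.r2)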
set.seed(123)
p <- 20
n <- 1000
X <- matrix(rnorm(n * p), n, p)
beta <- rep(0, p)
beta[c(1, 5, 10, 19)] <- c(7, 15, 4, 9)
epsilon <- rnorm(n)
Y <- X %*% beta + epsilon
set.seed(2)
train<-sample(seq(n), 100, replace = FALSE)
test <- -train
x.train<-X[train, ]
x.test<-X[test, ]
y.train<-Y[train]
y.test<-Y[test]
train_data<-data.frame(x=x.train, y=y.train)
train.mat <- model.matrix(y ~ ., data = train_data)
regfit.best <- regsubsets(y ~ ., data = train_data, nvmax = p)
# training set MSE for the best subset model of each size
val.errors <- rep(NA, 20)
for (i in 1:20) {
coefi <- coef(regfit.best , id = i)
pred <- train.mat[, names(coefi)] %*% coefi
val.errors[i] <- mean ((y.train - pred)^2)
}
plot(val.errors, xlab = "count predictors", ylab = "MSE", pch = 19, type = "b")
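Because the best subset of size k + 1 always has training RSS no larger than the best subset of size k, the training MSE should be minimized at the full 20-variable model; this can be confirmed directly:

# training MSE is smallest for the largest model, as expected
which.min(val.errors)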
(d) Plot the test set MSE associated with the best model of each
size.
# creating the test set data frame and model matrix
test_data<-data.frame(y=y.test, x=x.test)
test.mat<-model.matrix(y ~ ., data = test_data)
# loop over model sizes, computing the test MSE of the best model of each size
test.val.errors<-rep(NA, 20)
for (i in 1:20) {
coefi<-coef(regfit.best, id = i)
pred<-test.mat[, names(coefi)]%*%coefi
test.val.errors[i] <- mean((pred - y.test)^2)
}
plot(test.val.errors, xlab = "count predictors", ylab = "MSE", pch = 19, type = "b")
(e) For which model size does the test set MSE take on its minimum
value? Comment on your results. If it takes on its minimum value for a
model containing only an intercept or a model containing all of the
features, then play around with the way that you are generating the data
in (a) until you come up with a scenario in which the test set MSE is
minimized for an intermediate model size.
which.min(test.val.errors)
## [1] 4
The test MSE is minimized by a four-variable model, which matches the number of non-zero coefficients used to generate the data.
coef(regfit.best, which.min(test.val.errors))
## (Intercept) x.1 x.5 x.10 x.19
## 0.04906815 7.32822930 14.98255669 3.89655475 8.96235082
The selected model contains exactly the true non-zero variables (x.1, x.5, x.10, x.19), and its coefficient estimates are close to the true values of 7, 15, 4, and 9.
(g) Create a plot displaying \[ \sqrt{\sum_{j=1}^{p} (\beta_{j} - \hat{\beta}^{r}_{j})^2} \] for a range of values of r, where \( \hat{\beta}^{r}_{j} \) is the jth coefficient estimate for the best model containing r coefficients. Comment on what you observe. How does this compare to the test MSE plot from (d)?
val.errors = rep(NA, p)
x_cols = colnames(X, do.NULL = FALSE, prefix = "x.")
for (i in 1:20) {
  coefi = coef(regfit.best, id = i)
  # included terms contribute (beta_j - estimate)^2; excluded terms contribute beta_j^2
  val.errors[i] = sqrt(sum((beta[x_cols %in% names(coefi)] - coefi[names(coefi) %in% x_cols])^2) +
                         sum(beta[!(x_cols %in% names(coefi))]^2))
}
plot(val.errors, xlab = "coefficients", ylab = "Error", pch = 19, type = "b")
The coefficient error is also minimized at around four variables, consistent with the test MSE plot from (d).
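The location of that minimum can be checked the same way as in (d):

# model size at which the coefficient error is smallest
which.min(val.errors)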
# Trying this exercise following the book's validation set approach
set.seed(123)
train <- sample(c(TRUE , FALSE), nrow(Boston),
replace = TRUE)
test <- (!train)
# Best subset on training data
regfit.best <- regsubsets(crim ~ .,data = Boston[train , ], nvmax = 12)
test.mat <- model.matrix(crim ~ ., data = Boston[test , ])
val.errors <- rep(NA, 12)
for (i in 1:12) {
coefi <- coef(regfit.best , id = i)
pred <- test.mat[, names(coefi)] %*% coefi
val.errors[i] <- mean ((Boston$crim[test] - pred)^2)
}
plot(val.errors, type = "b")
which.min(val.errors)
## [1] 2
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:ISLR2':
##
## Boston
library(leaps)
library(glmnet)
data(Boston)
set.seed (1)
# predict() method for regsubsets objects (leaps does not supply one)
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])  # recover the formula used in the fit
  mat = model.matrix(form, newdata)    # design matrix for the new data
  coefi = coef(object, id = id)        # coefficients of the size-id model
  xvars = names(coefi)
  mat[, xvars] %*% coefi
}
# preparing cross validation
k <- 10
n <- nrow(Boston)
# loop for cross validation
folds <- sample(rep(1:k, length = n))
cv.errors<-matrix(NA, k, 12, dimnames = list(NULL , paste (1:12)))
for (j in 1:k) {
  best.fit = regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 12)
  for (i in 1:12) {
    pred = predict(best.fit, Boston[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Boston$crim[folds == j] - pred)^2)
  }
}
# extracting mean error
mean.cv.errors<-apply(cv.errors , 2, mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 46.00617 44.22854 43.80757 44.63674 44.37501 44.10329 43.38296 43.18012
## 9 10 11 12
## 42.81453 43.01895 43.03912 42.88730
# plotting, minimum error, coefficients of best model
par(mfrow = c(1, 1))
plot(mean.cv.errors , type = "b")
which.min(mean.cv.errors)
## 9
## 9
coef(best.fit, 10)
## (Intercept) zn indus nox rm dis
## 11.888303433 0.038959165 -0.090953359 -9.133040291 0.753924930 -0.843086409
## rad ptratio black lstat medv
## 0.533033181 -0.261373448 -0.006864546 0.185125289 -0.189245945
# Coefficients for best model
reg.best <- regsubsets(crim ~ ., data = Boston, nvmax = 12)
coef(reg.best , 11)
## (Intercept) zn indus nox rm
## 17.096652918 0.044858511 -0.069176572 -10.458590328 0.445708393
## dis rad tax ptratio black
## -0.997154027 0.583934313 -0.003454533 -0.265327998 -0.007599276
## lstat medv
## 0.127214918 -0.204431117
# preparing for ridge
x <- model.matrix(crim ~ ., Boston)[, -1]
y <- Boston$crim
grid <- 10^seq(10, -2, length = 100)
# splitting data
set.seed (1)
train <- sample (1: nrow(x), nrow(x) / 2)
test <- (-train)
y.test <- y[test]
# Ridge Model and Cross Validation
ridge.mod<- glmnet(x[train , ], y[train], alpha = 0, lambda = grid , thresh = 1e-12)
cv.out <- cv.glmnet(x[train , ], y[train], alpha = 0)
plot(cv.out)
# extracting best lambda value
bestlam <- cv.out$lambda.min
bestlam
## [1] 0.5919159
# prediction with best lam value
ridge.pred <- predict(ridge.mod, s = bestlam ,
newx = x[test , ])
mean((ridge.pred-y.test)^2)
## [1] 40.92116
# coefficients at best lam
out <- glmnet(x, y, alpha = 0)
predict(out , type = "coefficients", s = bestlam)[1:13, ]
## (Intercept) zn indus chas nox rm
## 8.602062567 0.032328926 -0.081145060 -0.740075545 -5.084980838 0.327881992
## age dis rad tax ptratio black
## 0.002079295 -0.683124956 0.413934152 0.003705697 -0.127315475 -0.008534520
## lstat
## 0.142710234
The best lambda value is about 0.59, which gives a test MSE of about 40.9. None of the coefficients were zeroed out, which is characteristic of ridge regression (i.e. no variable selection).
# lasso fit and plotting
lasso.mod <- glmnet(x[train , ], y[train], alpha = 1,
lambda = grid)
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
set.seed (1)
cv.out <- cv.glmnet(x[train , ], y[train], alpha = 1)
plot(cv.out)
bestlam <- cv.out$lambda.min
bestlam
## [1] 0.05148183
lasso.pred <- predict(lasso.mod , s = bestlam, newx = x[test , ])
mean((lasso.pred - y.test)^2)
## [1] 40.99066
out<-glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out , type = "coefficients",
s = bestlam)[1:13, ]
lasso.coef
## (Intercept) zn indus chas nox
## 1.269174e+01 3.628905e-02 -7.012417e-02 -5.842484e-01 -6.989138e+00
## rm age dis rad tax
## 2.308650e-01 0.000000e+00 -7.886241e-01 5.146154e-01 -2.235567e-05
## ptratio black lstat
## -1.884305e-01 -7.544153e-03 1.248260e-01
The best lambda for the lasso was about 0.05, giving a test MSE of about 41.0; the coefficient on age was zeroed out.
#fitting pcr
set.seed (1)
pcr.fit <- pcr(crim ~ ., data = Boston, subset = train ,
scale = TRUE , validation = "CV")
validationplot(pcr.fit , val.type = "MSEP")
#prediction pcr/ calculating error
pcr.pred <- predict(pcr.fit , x[test , ], ncomp = 12)
mean (( pcr.pred - y.test)^2)
## [1] 42.5644
# fitting model with best M
pcr.fit <- pcr(y ~ x, scale = TRUE , ncomp = 12)
summary(pcr.fit)
## Data: X dimension: 506 13
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 12
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.70 60.36 69.67 76.45 82.99 88.00 91.14 93.45
## y 30.69 30.87 39.27 39.61 39.61 39.86 40.14 42.47
## 9 comps 10 comps 11 comps 12 comps
## X 95.40 97.04 98.46 99.52
## y 42.55 42.78 43.04 44.13
PCR gives a test MSE of about 42.6 at M = 12.
Ridge regression performed best, with the lowest test MSE at about 40.9, with the lasso trailing closely behind at 41.0.
It does: since ridge regression performs no variable selection, the chosen model involves all of the features, which was to be expected.
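As a compact recap of the Boston comparison (a sketch using the objects defined above; the best subset validation error would need a separate refit and is omitted):

# hold-out test MSE for each penalized / dimension-reduction fit above
data.frame(method   = c("ridge", "lasso", "pcr"),
           test.MSE = c(mean((ridge.pred - y.test)^2),
                        mean((lasso.pred - y.test)^2),
                        mean((pcr.pred  - y.test)^2)))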