(a) The lasso, relative to least squares, is: iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
(b) Ridge regression, relative to least squares, is: iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
(c) Non-linear methods, relative to least squares, are: ii. More flexible and hence will give improved prediction accuracy when their increase in variance is less than their decrease in bias.
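All three answers rest on the bias-variance decomposition of the expected test error at a point $x_0$, stated here for reference (with $\varepsilon$ the irreducible error):

$$
E\left[(y_0 - \hat{f}(x_0))^2\right] = \mathrm{Var}\left(\hat{f}(x_0)\right) + \left[\mathrm{Bias}\left(\hat{f}(x_0)\right)\right]^2 + \mathrm{Var}(\varepsilon)
$$

A less flexible method trades an increase in the squared-bias term for a decrease in the variance term, and improves prediction accuracy whenever that trade is favorable; a more flexible method trades in the opposite direction.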
library(ISLR)
library(ModelMetrics) # for mse()
library(glmnet)       # for ridge regression and the lasso
library(pls)          # for pcr() and plsr()
data("College")
College[1:3,]
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
# Split the College data set (777 rows) roughly in half
set.seed(1)
split = sample(777, 389)
# training set
train = College[split,]
# testing set
test = College[-split,]
# linear model using least squares on training set
lm.fit = lm(Apps ~ ., data = train)
lm.preds = predict(lm.fit, newdata = test)
# test error
error = mse(test$Apps, lm.preds)
error
## [1] 1138680
The test error of the linear model is approximately 1138680.
set.seed(1)
# model matrices for the training and test sets (drop the intercept column)
x.train = model.matrix(Apps ~ ., data = train)[, -1]
x.test = model.matrix(Apps ~ ., data = test)[, -1]
y.train = train$Apps
y.test = test$Apps
# choose λ using cross-validation
cv.out.ridge = cv.glmnet(x.train, y.train, alpha = 0)
ridge.bestlam = cv.out.ridge$lambda.min
# ridge regression model on training set, with λ chosen by cross-validation
ridge.fit = glmnet(x.train, y.train, alpha = 0, lambda = ridge.bestlam)
ridge.pred = predict(ridge.fit, s = ridge.bestlam, newx = x.test)
# test error
error = mse(test$Apps, ridge.pred)
error
## [1] 979026.3
The test error of the ridge regression model is approximately 979026.3.
cv.out.lasso = cv.glmnet(x.train, y.train, alpha = 1)
lasso.bestlam = cv.out.lasso$lambda.min
# lasso model on training set
lasso.fit = glmnet(x.train, y.train, alpha = 1, lambda = lasso.bestlam)
lasso.pred = predict(lasso.fit, s = lasso.bestlam, newx = x.test)
# test error
error = mse(test$Apps, lasso.pred)
error
## [1] 1119383
# non-zero coefficient estimates (take the full coefficient column, including the intercept)
lasso.coef = predict(lasso.fit, type = "coefficients", s = lasso.bestlam)[, 1]
lasso.coef[lasso.coef != 0]
## (Intercept) PrivateYes Accept Enroll Top10perc
## -7.632005e+02 -3.138095e+02 1.763133e+00 -1.321329e+00 6.499414e+01
## Top25perc F.Undergrad P.Undergrad Outstate Room.Board
## -2.094179e+01 7.172773e-02 1.195662e-02 -1.048984e-01 2.090564e-01
## Books Personal PhD Terminal S.F.Ratio
## 2.924628e-01 3.587029e-03 -1.449167e+01 5.340248e+00 2.168324e+01
## perc.alumni Expend
## 5.184239e-01 4.812161e-02
The test error of the lasso model is approximately 1119383.
# PCR model on training set
pcr.fit = pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
# value of M ($ncomp is the number of components fitted, here the maximum of 17)
M = pcr.fit$ncomp
M
## [1] 17
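Note that $ncomp reports how many components were fitted (the maximum), not a value chosen by the cross-validation. A minimal sketch of selecting M from the CV results instead, using validationplot() and selectNcomp() from the pls package (M.cv is an illustrative name; the same idea applies to the PLS fit below):

# inspect the CV curve, then pick M by the one-sigma rule
validationplot(pcr.fit, val.type = "MSEP")
M.cv = selectNcomp(pcr.fit, method = "onesigma")
M.cv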
pcr.pred = predict(pcr.fit, newdata = test, ncomp = 17)
# test error
error = mse(test$Apps, pcr.pred)
error
## [1] 1138680
With M = 17 the PCR model uses all of the components, so it is equivalent to the full least squares fit and its test error is approximately 1138680.
# PLS model on training set
pls.fit = plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
# value of M ($ncomp is the number of components fitted, here the maximum of 17)
M = pls.fit$ncomp
M
## [1] 17
pls.pred = predict(pls.fit, newdata = test, ncomp = 17)
# test error
error = mse(test$Apps, pls.pred)
error
## [1] 1138680
With M = 17 the PLS model likewise uses all of the components, so its test error matches the least squares fit at approximately 1138680.
The ridge regression model has the lowest test error; the lasso does only slightly better than least squares, and the PCR and PLS fits (using all 17 components) simply reproduce the least squares fit. So the ridge regression model gives the most accurate predictions of the number of college applications received.
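Gathering the test errors reported above makes the comparison explicit (the PCR and PLS entries equal the least squares error because both fits use all 17 components):

# test errors reported above for the College models
errors = c(lm = 1138680, ridge = 979026.3, lasso = 1119383,
           pcr = 1138680, pls = 1138680)
sort(errors)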
library(MASS)
data("Boston")
Boston[1:3,]
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## medv
## 1 24.0
## 2 21.6
## 3 34.7
# Split the Boston data set (506 rows) in half
set.seed(1)
split2 = sample(506, 253)
# training set
train2 = Boston[split2,]
# testing set
test2 = Boston[-split2,]
k = 10
# assign each observation to one of k folds; observations with folds == j
# form the test set and the rest form the training set
folds <- sample(1:k, nrow(Boston), replace = TRUE)
cv.errors <- matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
library(leaps)
# predict method for regsubsets objects
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])
  mat = model.matrix(form, newdata)
  coefi = coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
# perform cross-validation over the k folds and all 13 model sizes
for (j in 1:k) {
  best.fit = regsubsets(crim ~ ., data = Boston[folds != j,], nvmax = 13)
  for (i in 1:13) {
    pred = predict(best.fit, Boston[folds == j,], id = i)
    cv.errors[j, i] <- mse(Boston$crim[folds == j], pred)
  }
}
mean.cv.errors = apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b", xlab = "Number of variables", ylab = "CV error")
Cross-validation selects a 12-variable model; the mean cross-validation error of the best subset model is approximately 40.29928.
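The selected model size and its error can be read off programmatically (a short sketch; best.size is just an illustrative name):

# model size with the smallest mean CV error, and that error
best.size = which.min(mean.cv.errors)
best.size
mean.cv.errors[best.size]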
# model matrices for the training and test sets (drop the intercept column)
x.train2 = model.matrix(crim ~ ., data = train2)[, -1]
x.test2 = model.matrix(crim ~ ., data = test2)[, -1]
y.train2 = train2$crim
y.test2 = test2$crim
# choose λ using cross-validation
cv.out.ridge2 = cv.glmnet(x.train2, y.train2, alpha = 0)
ridge.bestlam2 = cv.out.ridge2$lambda.min
# ridge regression model on training set, with λ chosen by cross-validation
ridge.fit2 = glmnet(x.train2, y.train2, alpha = 0, lambda = ridge.bestlam2)
ridge.pred2 = predict(ridge.fit2, s = ridge.bestlam2, newx = x.test2)
# test error
error = mse(test2$crim, ridge.pred2)
error
## [1] 40.9232
The test error of the ridge regression model is approximately 40.9232.
cv.out.lasso2 = cv.glmnet(x.train2, y.train2, alpha = 1)
lasso.bestlam2 = cv.out.lasso2$lambda.min
lasso.bestlam2
## [1] 0.06805595
# lasso model on training set
lasso.fit2 = glmnet(x.train2, y.train2, alpha = 1, lambda = lasso.bestlam2)
lasso.pred2 = predict(lasso.fit2, s = lasso.bestlam2, newx = x.test2)
# test error
error = mse(test2$crim, lasso.pred2)
error
## [1] 40.89965
# non-zero coefficient estimates (take the full coefficient column, including the intercept)
lasso.coef2 = predict(lasso.fit2, type = "coefficients", s = lasso.bestlam2)[, 1]
lasso.coef2[lasso.coef2 != 0]
## (Intercept) zn indus chas nox rm
## 17.66443063 0.03537765 -0.11874369 -0.43271606 -7.17230268 0.03843416
## dis rad ptratio black lstat
## -0.77046617 0.52419170 -0.34966634 -0.01308509 0.25510388
The test error of the lasso model is approximately 40.89965.

### Test Error and Value of M of a PCR Model
# PCR model on training set
pcr.fit2 = pcr(crim ~ ., data = train2, scale = TRUE, validation = "CV")
# value of M ($ncomp is the number of components fitted, here the maximum of 13)
M = pcr.fit2$ncomp
M
## [1] 13
pcr.pred2 = predict(pcr.fit2, newdata = test2, ncomp = 13)
# test error
error = mse(test2$crim, pcr.pred2)
error
With M = 13 the PCR model uses all of the components, so it reduces to the least squares fit on the training set and its test error matches that of an unpenalized linear model.
The best subset model attains the lowest error (approximately 40.29928, versus 40.9232 for ridge and 40.89965 for the lasso), so it performs the best here, with the caveat that its figure is a cross-validation error rather than a held-out test error.
The model chosen by best subset also involves only 12 of the 13 predictors.
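Gathering the figures reported above for a side-by-side look (a sketch; the best subset entry is a cross-validation error while the others are test-set errors, so the comparison is indicative rather than exact):

# errors reported above for the Boston models
errors2 = c(best.subset.cv = 40.29928, ridge = 40.9232, lasso = 40.89965)
sort(errors2)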