For parts (a) through (c), indicate which of i. through iv. is correct.
(a) iii. is correct. The lasso, relative to least squares, is less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
(b) iii. is correct. Ridge regression, relative to least squares, is likewise less flexible and will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
(c) ii. is correct. Non-linear methods, relative to least squares, are more flexible and hence will give improved prediction accuracy when their increase in variance is less than their decrease in bias.
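These statements can be checked empirically. Below is a minimal simulation sketch (an illustration added here, not part of the original answer): training sets are repeatedly drawn from a sparse linear model, and the average test MSE of least squares is compared with that of the lasso, whose extra bias is more than offset by its reduction in variance.
require(glmnet)
set.seed(1)
n <- 50; p <- 20
beta <- c(rep(2, 3), rep(0, p - 3))          # sparse truth: 3 active predictors
x.test <- matrix(rnorm(1000 * p), 1000, p)   # large fixed test set
y.test <- drop(x.test %*% beta + rnorm(1000, sd = 3))
mse <- replicate(100, {
  x <- matrix(rnorm(n * p), n, p)
  y <- drop(x %*% beta + rnorm(n, sd = 3))
  pred.ls <- cbind(1, x.test) %*% coef(lm(y ~ x))    # least squares
  pred.la <- predict(cv.glmnet(x, y, alpha = 1),     # lasso, CV-tuned lambda
                     newx = x.test, s = "lambda.min")
  c(ls = mean((y.test - pred.ls)^2), lasso = mean((y.test - pred.la)^2))
})
rowMeans(mse)   # average test MSE; the lasso should come out lower here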
In this exercise, we will predict the number of applications received using the other variables in the College data set.
require(ISLR)
## Loading required package: ISLR
data(College)
set.seed(1)
trainid <- sample(1:nrow(College), nrow(College)/2)
data.train <- College[trainid, ]
data.test <- College[-trainid, ]
fit.lm <- lm(Apps~., data=data.train)
pred.lm <- predict(fit.lm, data.test)
mean((data.test$Apps - pred.lm)^2)
## [1] 1108531
Test Error = 1108531
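For context (an added check, not in the original), the MSE can be put back on the scale of the response via the RMSE, and compared against the total variation in the test set:
sqrt(mean((data.test$Apps - pred.lm)^2))   # test RMSE, in numbers of applications
# Test-set R^2 relative to always predicting the training mean
1 - mean((data.test$Apps - pred.lm)^2) /
  mean((data.test$Apps - mean(data.train$Apps))^2)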
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
xmat.train <- model.matrix(Apps~., data=data.train)[,-1]
xmat.test <- model.matrix(Apps~., data=data.test)[,-1]
fit.ridge <- cv.glmnet(xmat.train, data.train$Apps, alpha=0)
(lambda <- fit.ridge$lambda.min)
## [1] 450.7435
Lambda = 450.7435
pred.ridge <- predict(fit.ridge, s=lambda, newx=xmat.test)
mean((data.test$Apps - pred.ridge)^2)
## [1] 1037616
Test Error = 1037616
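To see how this lambda was chosen, the cross-validation curve stored in the cv.glmnet object can be plotted (a quick sketch; this plot was not in the original output):
plot(fit.ridge)   # CV MSE vs log(lambda); dotted lines mark lambda.min and lambda.1se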
fit.lasso <- cv.glmnet(xmat.train, data.train$Apps, alpha=1)
(lambda <- fit.lasso$lambda.min)
## [1] 29.65591
Lambda = 29.65591
pred.lasso <- predict(fit.lasso, s=lambda, newx=xmat.test)
mean((data.test$Apps - pred.lasso)^2)
## [1] 1025248
Test Error = 1025248
coef.lasso <- predict(fit.lasso, type="coefficients", s=lambda)[1:ncol(College),]
coef.lasso[coef.lasso != 0]
## (Intercept) PrivateYes Accept Enroll Top10perc
## -4.514223e+02 -4.814352e+02 1.535012e+00 -4.035455e-01 4.668170e+01
## Top25perc F.Undergrad Outstate Room.Board PhD
## -7.091364e+00 -3.426988e-03 -5.020064e-02 1.851610e-01 -3.849885e+00
## Terminal perc.alumni Expend Grad.Rate
## -3.443747e+00 -2.117870e+00 3.192846e-02 2.695730e+00
length(coef.lasso[coef.lasso != 0])
## [1] 14
Number of non-zero coefficient estimates = 14
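As an aside, cv.glmnet also stores lambda.1se, the largest lambda whose CV error is within one standard error of the minimum. A sketch of using it instead, which would typically trade a little test accuracy for a sparser model:
(lambda.1se <- fit.lasso$lambda.1se)
pred.1se <- predict(fit.lasso, s = lambda.1se, newx = xmat.test)
mean((data.test$Apps - pred.1se)^2)   # usually somewhat higher than at lambda.min
coef.1se <- predict(fit.lasso, type = "coefficients", s = lambda.1se)
sum(coef.1se != 0)                    # usually fewer nonzero coefficients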
require(pls)
## Loading required package: pls
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
fit.pcr <- pcr(Apps~., data=data.train, scale=TRUE, validation="CV")
validationplot(fit.pcr, val.type="MSEP")
summary(fit.pcr)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4335 4179 2364 2374 1996 1844 1845
## adjCV 4335 4182 2360 2374 1788 1831 1838
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1850 1863 1809 1809 1812 1815 1825
## adjCV 1844 1857 1801 1800 1804 1808 1817
## 14 comps 15 comps 16 comps 17 comps
## CV 1810 1823 1273 1281
## adjCV 1806 1789 1260 1268
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 31.216 57.68 64.73 70.55 76.33 81.30 85.01
## Apps 6.976 71.47 71.58 83.32 83.44 83.45 83.46
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 88.40 91.16 93.36 95.38 96.94 97.96 98.76
## Apps 83.47 84.53 84.86 84.98 84.98 84.99 85.24
## 15 comps 16 comps 17 comps
## X 99.40 99.87 100.00
## Apps 90.87 93.93 93.97
Minimum CV at M=16
pred.pcr <- predict(fit.pcr, data.test, ncomp=16)
mean((data.test$Apps - pred.pcr)^2)
## [1] 1166897
Test Error = 1166897
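Rather than reading M off the validation table by eye, the cross-validated RMSEP can be minimized programmatically; a sketch assuming the array layout documented in ?RMSEP from the pls package (the first entry is the intercept-only model):
cv.err <- RMSEP(fit.pcr)$val["adjCV", 1, ]   # adjCV RMSEP for 0..17 components
which.min(cv.err) - 1                        # subtract 1 for the intercept-only entry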
fit.pls <- plsr(Apps~., data=data.train, scale=TRUE, validation="CV")
validationplot(fit.pls, val.type="MSEP")
summary(fit.pls)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4335 2176 1888 1734 1605 1400 1311
## adjCV 4335 2171 1882 1724 1571 1373 1295
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1297 1287 1278 1278 1277 1281 1281
## adjCV 1281 1273 1265 1265 1264 1267 1268
## 14 comps 15 comps 16 comps 17 comps
## CV 1283 1284 1285 1285
## adjCV 1270 1271 1271 1272
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 26.91 43.08 63.26 65.16 68.50 73.75 76.10
## Apps 76.64 83.93 87.14 91.90 93.49 93.85 93.91
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 79.03 81.76 85.41 89.03 91.38 93.31 95.43
## Apps 93.94 93.96 93.96 93.96 93.97 93.97 93.97
## 15 comps 16 comps 17 comps
## X 97.41 98.78 100.00
## Apps 93.97 93.97 93.97
The CV error is essentially flat from M = 9 onward; its minimum (1277) is at M = 11, but M = 10 is within one unit of that, so we select the more parsimonious M = 10.
pred.pls <- predict(fit.pls, data.test, ncomp=10)
mean((data.test$Apps - pred.pls)^2)
## [1] 1134531
Test Error = 1134531
The five test errors are broadly similar. Ridge (1,037,616) and lasso (1,025,248) perform slightly better than least squares (1,108,531), while PCR (1,166,897) and PLS (1,134,531) show no improvement over the full linear regression model.
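To make that comparison explicit, the five test MSEs can be gathered from the predictions already computed (a summary sketch):
err.all <- c(lm    = mean((data.test$Apps - pred.lm)^2),
             ridge = mean((data.test$Apps - pred.ridge)^2),
             lasso = mean((data.test$Apps - pred.lasso)^2),
             pcr   = mean((data.test$Apps - pred.pcr)^2),
             pls   = mean((data.test$Apps - pred.pls)^2))
sort(err.all)   # lasso < ridge < lm < pls < pcr on this split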
We will now try to predict per capita crime rate in the Boston data set.
require(leaps)
## Loading required package: leaps
require(glmnet)
require(MASS)
## Loading required package: MASS
data(Boston)
Split the data into training and test sets
set.seed(1)
trainid <- sample(1:nrow(Boston), nrow(Boston)/2)
bost.train <- Boston[trainid,]
bost.test <- Boston[-trainid,]
xmat.train <- model.matrix(crim~., data=bost.train)[,-1]
xmat.test <- model.matrix(crim~., data=bost.test)[,-1]
Lasso Regression Model
fit.lasso <- cv.glmnet(xmat.train, bost.train$crim, alpha=1)
(lambda <- fit.lasso$lambda.min)
## [1] 0.082852
Lambda = 0.082852
pred.lasso <- predict(fit.lasso, s=lambda, newx=xmat.test)
mean((bost.test$crim - pred.lasso)^2)
## [1] 38.37664
Test Error = 38.37664
predict(fit.lasso, s=lambda, type="coefficients")
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 7.204613949
## zn 0.037084733
## indus -0.034037718
## chas -0.518884600
## nox -9.663821264
## rm 1.187198534
## age .
## dis -0.971658062
## rad 0.517060059
## tax .
## ptratio -0.220503429
## black -0.002432899
## lstat 0.176698759
## medv -0.193172676
Ridge Regression Model
fit.ridge <- cv.glmnet(xmat.train, bost.train$crim, alpha=0)
(lambda <- fit.ridge$lambda.min)
## [1] 0.8679707
Lambda = 0.8679707
pred.ridge <- predict(fit.ridge, s=lambda, newx=xmat.test)
mean((bost.test$crim - pred.ridge)^2)
## [1] 38.37876
Test Error = 38.37876
predict(fit.ridge, s=lambda, type="coefficients")
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.345518586
## zn 0.033709412
## indus -0.057715341
## chas -0.790088255
## nox -6.303985099
## rm 1.081377953
## age 0.006069496
## dis -0.784154381
## rad 0.388516755
## tax 0.004574584
## ptratio -0.118258105
## black -0.004400293
## lstat 0.189925722
## medv -0.152802441
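To compare the two shrinkage fits directly, the coefficient vectors can be laid side by side (a quick sketch built from the fits above):
coef.compare <- cbind(
  lasso = predict(fit.lasso, s = fit.lasso$lambda.min, type = "coefficients")[, 1],
  ridge = predict(fit.ridge, s = fit.ridge$lambda.min, type = "coefficients")[, 1]
)
round(coef.compare, 4)   # lasso zeros out age and tax; ridge merely shrinks them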
There is no predict() method for regsubsets objects, so we define one that rebuilds the model matrix and multiplies by the coefficients of the chosen submodel.
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])  # recover the formula from the regsubsets() call
  mat <- model.matrix(form, newdata)    # full model matrix for the new data
  coefi <- coef(object, id=id)          # coefficients of the id-th best model
  xvars <- names(coefi)
  mat[, xvars] %*% coefi                # fitted values using only the selected columns
}
Best Subset Selection
Note that regsubsets() defaults to an exhaustive (best subset) search, as the "Selection Algorithm: exhaustive" line in the output below confirms; despite the fwd variable names, this is not forward stepwise selection, which would require method="forward".
fit.fwd <- regsubsets(crim~., data=bost.train, nvmax=ncol(Boston)-1)
(fwd.summary <- summary(fit.fwd))
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = bost.train, nvmax = ncol(Boston) -
## 1)
## 13 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## zn indus chas nox rm age dis rad tax ptratio black lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " "*" " "
## 3 ( 1 ) " " " " " " " " "*" " " " " "*" " " " " " " "*" " "
## 4 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " " " " " "*"
## 5 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " " " " " " " "*"
## 6 ( 1 ) "*" " " " " "*" "*" " " "*" "*" " " " " " " " " "*"
## 7 ( 1 ) "*" " " " " "*" "*" " " "*" "*" " " " " " " "*" "*"
## 8 ( 1 ) "*" " " " " "*" "*" " " "*" "*" " " "*" " " "*" "*"
## 9 ( 1 ) "*" " " " " "*" "*" " " "*" "*" "*" "*" " " "*" "*"
## 10 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*" " " "*" "*"
## 11 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
err.fwd <- rep(NA, ncol(Boston)-1)
for (i in 1:(ncol(Boston)-1)) {
  pred.fwd <- predict(fit.fwd, bost.test, id=i)
  err.fwd[i] <- mean((bost.test$crim - pred.fwd)^2)
}
Minimum Test Error = 39.27592
plot(err.fwd, type="b", main="Test MSE for Forward Selection", xlab="Number of Predictors")
which.min(err.fwd)
## [1] 4
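The coefficients of that best-scoring model can be pulled out with regsubsets' coef method:
coef(fit.fwd, id = which.min(err.fwd))   # the 4-predictor model with the lowest test MSE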
I would probably choose the lasso model: it has the lowest test error (38.377, marginally below ridge's 38.379 and best subset's 39.276) and eliminates some predictors, reducing model complexity.
No. The chosen lasso model sets the coefficients of age and tax to zero, since not all predictors add value to the model.