PROBLEM 2

For parts (a) through (c), indicate which of i. through iv. is correct.

  1. III is correct. The lasso, relative to least squares, is less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.

  2. III is correct. Ridge regression, relative to least squares, is less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.

  3. II is correct. Non-Linear Method, relative to least squares, more flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.

PROBLEM 9

In this exercise, we will predict the number of applications received using the other variables in the College data set.

require (ISLR)
## Loading required package: ISLR
data (College)
set.seed(1)
trainid=sample(1:nrow(College), nrow(College)/2)
data.train<-College[trainid,]
data.test<-College[-trainid,]
fit.lm <- lm(Apps~., data=data.train)
pred.lm <- predict(fit.lm, data.test)
mean((data.test$Apps - pred.lm)^2)
## [1] 1108531

Test Error = 1108531

require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
xmat.train <- model.matrix(Apps~., data=data.train)[,-1]
xmat.test <- model.matrix(Apps~., data=data.test)[,-1]
fit.ridge <- cv.glmnet(xmat.train, data.train$Apps, alpha=0)
(lambda <- fit.ridge$lambda.min)
## [1] 450.7435

Lambda = 450.7435

pred.ridge <- predict(fit.ridge, s=lambda, newx=xmat.test)
mean((data.test$Apps - pred.ridge)^2)
## [1] 1037616

Test Error = 1037616

fit.lasso <- cv.glmnet(xmat.train, data.train$Apps, alpha=1)
lambda <- fit.lasso$lambda.min

Lambda = 29.65591

pred.lasso <- predict(fit.lasso, s=lambda, newx=xmat.test)
mean((data.test$Apps - pred.lasso)^2)
## [1] 1025248

Test Error = 1025248

coef.lasso <- predict(fit.lasso, type="coefficients", s=lambda)[1:ncol(College),]
coef.lasso[coef.lasso != 0]
##   (Intercept)    PrivateYes        Accept        Enroll     Top10perc 
## -4.514223e+02 -4.814352e+02  1.535012e+00 -4.035455e-01  4.668170e+01 
##     Top25perc   F.Undergrad      Outstate    Room.Board           PhD 
## -7.091364e+00 -3.426988e-03 -5.020064e-02  1.851610e-01 -3.849885e+00 
##      Terminal   perc.alumni        Expend     Grad.Rate 
## -3.443747e+00 -2.117870e+00  3.192846e-02  2.695730e+00
length(coef.lasso[coef.lasso != 0])
## [1] 14

Number of non-zero coefficient estimates = 14

require(pls)
## Loading required package: pls
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed(1)
fit.pcr <- pcr(Apps~., data=data.train, scale=TRUE, validation="CV")
validationplot(fit.pcr, val.type="MSEP")

summary(fit.pcr)
## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4335     4179     2364     2374     1996     1844     1845
## adjCV         4335     4182     2360     2374     1788     1831     1838
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1850     1863     1809      1809      1812      1815      1825
## adjCV     1844     1857     1801      1800      1804      1808      1817
##        14 comps  15 comps  16 comps  17 comps
## CV         1810      1823      1273      1281
## adjCV      1806      1789      1260      1268
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X      31.216    57.68    64.73    70.55    76.33    81.30    85.01
## Apps    6.976    71.47    71.58    83.32    83.44    83.45    83.46
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       88.40    91.16     93.36     95.38     96.94     97.96     98.76
## Apps    83.47    84.53     84.86     84.98     84.98     84.99     85.24
##       15 comps  16 comps  17 comps
## X        99.40     99.87    100.00
## Apps     90.87     93.93     93.97

Minimum CV at M=16

pred.pcr <- predict(fit.pcr, data.test, ncomp=16) 
mean((data.test$Apps - pred.pcr)^2)
## [1] 1166897

Test Error = 1166897

fit.pls <- plsr(Apps~., data=data.train, scale=TRUE, validation="CV")
validationplot(fit.pls, val.type="MSEP")

summary(fit.pls)
## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4335     2176     1888     1734     1605     1400     1311
## adjCV         4335     2171     1882     1724     1571     1373     1295
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1297     1287     1278      1278      1277      1281      1281
## adjCV     1281     1273     1265      1265      1264      1267      1268
##        14 comps  15 comps  16 comps  17 comps
## CV         1283      1284      1285      1285
## adjCV      1270      1271      1271      1272
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X       26.91    43.08    63.26    65.16    68.50    73.75    76.10
## Apps    76.64    83.93    87.14    91.90    93.49    93.85    93.91
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       79.03    81.76     85.41     89.03     91.38     93.31     95.43
## Apps    93.94    93.96     93.96     93.96     93.97     93.97     93.97
##       15 comps  16 comps  17 comps
## X        97.41     98.78    100.00
## Apps     93.97     93.97     93.97

Minimum CV at M=10

pred.pls <- predict(fit.pls, data.test, ncomp=10)  
mean ((data.test$Apps-pred.pls)^2)
## [1] 1134531

Test Error = 1134531

The test errors aren’t much different. The ridge and lasso seem to perform slightly better while the PCR/PLS don’t show any improvement from the full linear regression model.

PROBLEM 11

We will now try to predict per capita crime rate in the Boston data set.

require(leaps)
## Loading required package: leaps
require(glmnet)  
require(MASS)    
## Loading required package: MASS
data(Boston)

split data into training and test sets

set.seed(1)
trainid <- sample(1:nrow(Boston), nrow(Boston)/2)
bost.train <- Boston[trainid,]
bost.test <- Boston[-trainid,]
xmat.train <- model.matrix(crim~., data=bost.train)[,-1]
xmat.test <- model.matrix(crim~., data=bost.test)[,-1]

Lasso Regression Model

fit.lasso <- cv.glmnet(xmat.train, bost.train$crim, alpha=1)
(lambda <- fit.lasso$lambda.min)  
## [1] 0.082852

Lambda = 0.082852

pred.lasso <- predict(fit.lasso, s=lambda, newx=xmat.test)
mean((bost.test$crim - pred.lasso)^2)
## [1] 38.37664

Test Error = 38.37664

predict(fit.lasso, s=lambda, type="coefficients")
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept)  7.204613949
## zn           0.037084733
## indus       -0.034037718
## chas        -0.518884600
## nox         -9.663821264
## rm           1.187198534
## age          .          
## dis         -0.971658062
## rad          0.517060059
## tax          .          
## ptratio     -0.220503429
## black       -0.002432899
## lstat        0.176698759
## medv        -0.193172676

Ridge Regression Model

fit.ridge <- cv.glmnet(xmat.train, bost.train$crim, alpha=0)
(lambda <- fit.ridge$lambda.min)  
## [1] 0.8679707

Lambda = 0.8679707

pred.ridge <- predict(fit.ridge, s=lambda, newx=xmat.test)
mean((bost.test$crim - pred.ridge)^2)
## [1] 38.37876

Test Error = 38.37876

predict(fit.ridge, s=lambda, type="coefficients")
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept)  2.345518586
## zn           0.033709412
## indus       -0.057715341
## chas        -0.790088255
## nox         -6.303985099
## rm           1.081377953
## age          0.006069496
## dis         -0.784154381
## rad          0.388516755
## tax          0.004574584
## ptratio     -0.118258105
## black       -0.004400293
## lstat        0.189925722
## medv        -0.152802441
predict.regsubsets <- function(object, newdata, id, ...)
{
form <- as.formula(object$call[[2]])
mat <- model.matrix(form, newdata)
coefi <- coef(object, id=id)
xvars <- names(coefi)
mat[,xvars]%*%coefi
}

Forward Selection

fit.fwd <- regsubsets(crim~., data=bost.train, nvmax=ncol(Boston)-1)
(fwd.summary <- summary(fit.fwd))
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = bost.train, nvmax = ncol(Boston) - 
##     1)
## 13 Variables  (and intercept)
##         Forced in Forced out
## zn          FALSE      FALSE
## indus       FALSE      FALSE
## chas        FALSE      FALSE
## nox         FALSE      FALSE
## rm          FALSE      FALSE
## age         FALSE      FALSE
## dis         FALSE      FALSE
## rad         FALSE      FALSE
## tax         FALSE      FALSE
## ptratio     FALSE      FALSE
## black       FALSE      FALSE
## lstat       FALSE      FALSE
## medv        FALSE      FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
##           zn  indus chas nox rm  age dis rad tax ptratio black lstat medv
## 1  ( 1 )  " " " "   " "  " " " " " " " " "*" " " " "     " "   " "   " " 
## 2  ( 1 )  " " " "   " "  " " " " " " " " "*" " " " "     " "   "*"   " " 
## 3  ( 1 )  " " " "   " "  " " "*" " " " " "*" " " " "     " "   "*"   " " 
## 4  ( 1 )  "*" " "   " "  " " " " " " "*" "*" " " " "     " "   " "   "*" 
## 5  ( 1 )  "*" " "   " "  "*" " " " " "*" "*" " " " "     " "   " "   "*" 
## 6  ( 1 )  "*" " "   " "  "*" "*" " " "*" "*" " " " "     " "   " "   "*" 
## 7  ( 1 )  "*" " "   " "  "*" "*" " " "*" "*" " " " "     " "   "*"   "*" 
## 8  ( 1 )  "*" " "   " "  "*" "*" " " "*" "*" " " "*"     " "   "*"   "*" 
## 9  ( 1 )  "*" " "   " "  "*" "*" " " "*" "*" "*" "*"     " "   "*"   "*" 
## 10  ( 1 ) "*" " "   "*"  "*" "*" " " "*" "*" "*" "*"     " "   "*"   "*" 
## 11  ( 1 ) "*" " "   "*"  "*" "*" " " "*" "*" "*" "*"     "*"   "*"   "*" 
## 12  ( 1 ) "*" " "   "*"  "*" "*" "*" "*" "*" "*" "*"     "*"   "*"   "*" 
## 13  ( 1 ) "*" "*"   "*"  "*" "*" "*" "*" "*" "*" "*"     "*"   "*"   "*"
err.fwd <- rep(NA, ncol(Boston)-1)
for(i in 1:(ncol(Boston)-1)) {
  pred.fwd <- predict (fit.fwd, bost.test, id=i)
  err.fwd[i] <- mean((bost.test$crim - pred.fwd)^2)
}

Test Error = 39.27592

plot(err.fwd, type="b", main="Test MSE for Forward Selection", xlab="Number of Predictors")

which.min(err.fwd)
## [1] 4

Probably choose the lasso model because it has the least test error value and eliminates some predictors to reduce model complexity.

No, because not all predictors add value to the model.