library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
library(pls)
## Warning: package 'pls' was built under R version 4.0.4
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.4
## Loading required package: Matrix
## Loaded glmnet 4.1-1
library(MASS)
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.4
1. For parts (a) through (c), indicate which of i. through iv. is correct. Justify your answer.
    (a) The lasso, relative to least squares, is:
        i. More flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
        ii. More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.
        iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
        iv. Less flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.
    (b) Repeat (a) for ridge regression relative to least squares.
    (c) Repeat (a) for non-linear methods relative to least squares.
(a) iii. The lasso is less flexible than least squares. As λ increases from zero, variance falls faster than squared bias rises, so test MSE first decreases, reaching the low point of the familiar U-shaped curve (see the simulated sketch below).

(b) iii. The same reasoning applies: ridge regression shrinks the coefficients and is therefore less flexible, improving prediction accuracy as long as the increase in bias is smaller than the decrease in variance.

(c) ii. Non-linear methods are more flexible than least squares and improve prediction accuracy when their increase in variance is smaller than their decrease in bias.
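To make the U shape concrete, here is a minimal simulated sketch (the data-generating process and parameter values are illustrative assumptions, not part of the exercise): as λ grows, the lasso's test MSE typically falls and then rises again.

set.seed(1)
n = 100; p = 20
X = matrix(rnorm(2 * n * p), 2 * n, p)              # rows 1:n train, the rest test
beta = c(rep(1, 5), rep(0, p - 5))                  # only 5 truly non-zero coefficients
y = X %*% beta + rnorm(2 * n, sd = 2)
fit = glmnet(X[1:n, ], y[1:n], alpha = 1, lambda = 10^seq(1, -3, length = 50))
pred = predict(fit, newx = X[(n + 1):(2 * n), ])    # one column of predictions per lambda
test.mse = colMeans((y[(n + 1):(2 * n)] - pred)^2)
plot(log(fit$lambda), test.mse, type = "b")         # typically U-shaped, interior minimum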

2. In this exercise, we will predict the number of applications received using the other variables in the College data set.
    (a) Split the data set into a training set and a test set.
data(College)
set.seed(123)
trainid = sample(1:nrow(College), nrow(College)/2)
train = College[trainid,]
test = College[-trainid,]
    (b) Fit a linear model using least squares on the training set, and report the test error obtained.
lm.fit = lm(Apps~., data=train)
lm.pred = predict(lm.fit, test)
lm.err = mean((test$Apps - lm.pred)^2)
lm.err
## [1] 1373995
    (c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
train.X = model.matrix(Apps ~ ., data = train)
train.Y = train[, "Apps"]
test.X = model.matrix(Apps ~ ., data = test)
test.Y = test[, "Apps"]
grid = 10 ^ seq(4, -2, length = 100)
ridge.mod = glmnet(train.X, train.Y, alpha = 0, lambda = grid, thresh = 1e-12)
# lambda.min lives on the cv.glmnet object, not on the plain glmnet fit
ridge.cv = cv.glmnet(train.X, train.Y, alpha = 0, lambda = grid, thresh = 1e-12)
lambda.best = ridge.cv$lambda.min
ridge.pred = predict(ridge.mod, newx = test.X, s = lambda.best)
(ridge.err = mean((test.Y - ridge.pred)^2))
## [1] 2070580
    (d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
lasso.mod = glmnet(train.X, train.Y, alpha =1, lambda=grid, thresh = 1e-12)
lasso.cv = cv.glmnet(train.X, train.Y, alpha =1, lambda=grid, thresh = 1e-12)
lambda.best = lasso.cv$lambda.min
lasso.pred = predict(lasso.mod, newx= test.X, s=lambda.best)
(lasso.err = mean((test.Y - lasso.pred)^2))
## [1] 1399213
predict(lasso.mod, s = lambda.best, type="coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept) -448.89015205
## (Intercept)    .         
## PrivateYes  -707.26279699
## Accept         1.33100318
## Enroll         .         
## Top10perc     26.66029296
## Top25perc      .         
## F.Undergrad    .         
## P.Undergrad    .         
## Outstate      -0.02716977
## Room.Board     0.12159181
## Books          .         
## Personal      -0.03546138
## PhD           -2.99477563
## Terminal      -5.67422601
## S.F.Ratio      .         
## perc.alumni   -6.74346397
## Expend         0.07451472
## Grad.Rate      7.18317440
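Excluding the intercept, 11 of the coefficient estimates are non-zero. As a small convenience check (reusing the objects above), the count can be computed directly:

lasso.coef = predict(lasso.mod, s = lambda.best, type = "coefficients")
sum(lasso.coef != 0) - 1   # subtract 1 for the intercept row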
    (e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
pcr.fit = pcr(Apps~., data=train, scale=TRUE, validation="CV")
validationplot(pcr.fit, val.type="MSEP")

summary(pcr.fit)
## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3258     3228     1667     1617     1317     1325     1220
## adjCV         3258     3227     1665     1615     1304     1324     1216
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1211     1213     1159      1161      1161      1166      1173
## adjCV     1212     1213     1156      1158      1158      1163      1170
##        14 comps  15 comps  16 comps  17 comps
## CV         1174      1165     999.5      1014
## adjCV      1170      1162     996.4      1010
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X      32.969    59.69    66.68    72.25    77.31    81.82    85.18    88.34
## Apps    3.259    74.43    76.35    84.40    84.42    86.69    86.79    87.01
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       91.32     93.61     95.48     97.05     98.11     98.90     99.40
## Apps    88.38     88.39     88.47     88.49     88.50     88.51     88.74
##       16 comps  17 comps
## X        99.81    100.00
## Apps     91.48     91.58
The RMSEP table attains its minimum at M = 16, which gives almost no dimension reduction; the CV error is fairly flat from around M = 9 onward, so we use M = 10 components.

pcr.pred = predict(pcr.fit, test, ncomp = 10)

(pcr.err = mean((test$Apps - pcr.pred)^2))
## [1] 2887472
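The CV-selected M can also be read off programmatically via pls::MSEP; a convenience sketch (the validation plot above conveys the same information):

cv.msep = MSEP(pcr.fit, estimate = "CV")
which.min(cv.msep$val[1, 1, -1])   # drop the intercept-only entry before minimizing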
    (f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
pls.fit = plsr(Apps~., data=train, scale=TRUE, validation="CV")
validationplot(pls.fit, val.type="MSEP")

Guided by the validation plot, we again use M = 10 components.

pls.pred = predict(pls.fit, test, ncomp = 10)
(pls.err = mean((test$Apps - pls.pred)^2))
## [1] 1384151
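The pls package also offers an automated rule for choosing M; as a sketch (the one-sigma heuristic, not necessarily the rule behind M = 10 above):

selectNcomp(pls.fit, method = "onesigma", plot = FALSE)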
    (g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
test.avg = mean(test$Apps)
test.tss = mean((test$Apps - test.avg)^2)   # baseline: predict the mean
lm.r2    = 1 - lm.err / test.tss
ridge.r2 = 1 - ridge.err / test.tss
lasso.r2 = 1 - lasso.err / test.tss
pcr.r2   = 1 - pcr.err / test.tss
pls.r2   = 1 - pls.err / test.tss
all.r2 = c(lm.r2, ridge.r2, lasso.r2, pcr.r2, pls.r2)
names(all.r2) = c("lm", "ridge", "lasso", "pcr", "pls")
barplot(all.r2)
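For a numeric comparison alongside the barplot, the five test errors can also be printed side by side:

sort(c(lm = lm.err, ridge = ridge.err, lasso = lasso.err, pcr = pcr.err, pls = pls.err))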

The five approaches give broadly similar test R² values; linear regression, the lasso, and PLS have nearly identical test errors, while ridge and especially PCR are somewhat less accurate on this split. Overall, the number of applications can be predicted reasonably well with any of these methods.

3. We will now try to predict per capita crime rate in the Boston data set.
    (a) Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR. Present and discuss results for the approaches that you consider.

Best subset selection:
data(Boston)
set.seed(12)
# regsubsets has no predict method, so we define one: rebuild the model matrix
# from the original formula and multiply by the coefficients of the id-th best model.
predict.regsubsets = function(object, newdata, id, ...) {
    form = as.formula(object$call[[2]])
    mat = model.matrix(form, newdata)
    coefi = coef(object, id = id)
    xvars = names(coefi)
    mat[, xvars] %*% coefi
}

k = 10
folds = sample(1:k, nrow(Boston), replace = TRUE)   # assign each row to one of 10 folds
cv.errors = matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))   # fold-by-model-size errors
for (j in 1:k) {
    best.fit = regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 13)
    for (i in 1:13) {
        pred = predict(best.fit, Boston[folds == j, ], id = i)
        cv.errors[j, i] = mean((Boston$crim[folds == j] - pred)^2)
    }
}
mean.cv.errors = apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b", xlab = "Number of variables", ylab = "CV error")

min(mean.cv.errors)
## [1] 45.33533
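To see which model size attains this minimum (presumably how the 12-variable model below was chosen):

which.min(mean.cv.errors)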
regfit.best = regsubsets(crim~., data=Boston, nvmax=13)
coef(regfit.best, 12)
##   (Intercept)            zn         indus          chas           nox 
##  16.985713928   0.044673247  -0.063848469  -0.744367726 -10.202169211 
##            rm           dis           rad           tax       ptratio 
##   0.439588002  -0.993556631   0.587660185  -0.003767546  -0.269948860 
##         black         lstat          medv 
##  -0.007518904   0.128120290  -0.198877768
Lasso:
x = model.matrix(crim ~ ., Boston)[, -1]
y = Boston$crim
cv.out = cv.glmnet(x, y, alpha = 1, type.measure = "mse")
plot(cv.out)

cv.out
## 
## Call:  cv.glmnet(x = x, y = y, type.measure = "mse", alpha = 1) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure    SE Nonzero
## min 0.0563    50   43.18 12.10      11
## 1se 3.0758     7   54.76 13.98       1
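The Nonzero column shows that only a single predictor survives at the one-standard-error λ; its identity can be checked with:

coef(cv.out, s = "lambda.1se")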
Ridge:
cv.out = cv.glmnet(x, y, alpha = 0, type.measure = "mse")
cv.out
## 
## Call:  cv.glmnet(x = x, y = y, type.measure = "mse", alpha = 0) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure    SE Nonzero
## min   0.54   100   43.16 14.93      13
## 1se  74.44    47   58.00 18.58      13
PCR:

pcr.fit = pcr(crim ~ ., data = Boston, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")
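To put PCR on the same cross-validation footing as the other methods, the minimum CV error can be read off directly (a convenience sketch using pls::MSEP):

msep = MSEP(pcr.fit, estimate = "CV")
min(msep$val[1, 1, -1])          # lowest CV MSE over component counts
which.min(msep$val[1, 1, -1])    # number of components attaining it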

    (b) Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, cross-validation, or some other reasonable alternative, as opposed to using training error.

All three cross-validated approaches perform similarly: best subset selection attains a CV MSE of about 45.3 with 12 variables, while the lasso and ridge reach about 43.2 at their minimizing λ values. We propose the 12-variable best subset model: its CV error is competitive with the shrinkage methods, and it remains an ordinary, easily interpreted least squares fit.

    (c) Does your chosen model involve all of the features in the data set? Why or why not?

No. The chosen best subset model uses 12 of the 13 predictors; age is the one dropped, because cross-validation favored the 12-variable model over the full model.