Question 2

(a) The lasso, relative to least squares, is:

Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.

(b) Repeat (a) for ridge regression relative to least squares.

Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.

(c) Repeat (a) for non-linear methods relative to least squares.

More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.

Question 9

library(ISLR2)
college <- College

##(a) Split the data set into a training set and a test set.

set.seed(1)
test.idx <- sample(1:nrow(college), size = 0.3*nrow(college))
train <- college[-test.idx, ]
test <- college[test.idx, ]

##(b) Fit a linear model using least squares on the training set, and report the test error obtained.

lmfit <- lm(Apps ~ ., data = train)
lmpred <- predict(lmfit, test)
mean((lmpred - test$Apps)^2)
## [1] 1740793
# The test MSE obtained using least squares is 1740793.
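# To put this error on the scale of the response, take the square root:
sqrt(mean((lmpred - test$Apps)^2))   # root MSE, about 1319 applications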

##(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
x <- model.matrix(Apps ~ ., train)
x.te <- model.matrix(Apps ~ ., test)   # model matrix for the test set
cv.out <- cv.glmnet(x, train$Apps, alpha = 0)
bestlam <- cv.out$lambda.min
bestlam
## [1] 327.157
ridge <- glmnet(x, train$Apps, alpha = 0)
ridgepred <- predict(ridge, s = bestlam, newx = x.te)
mean((ridgepred - test$Apps)^2)
## [1] 3041114
# The ridge test MSE obtained is 3041114.

##(d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.

x.tr <- model.matrix(Apps ~., train)
y.tr <- train$Apps
x.test <- model.matrix(Apps ~., test)
y.test <- test$Apps
set.seed(1)
cv.out <- cv.glmnet(x.tr, y.tr, alpha=1)
bestlam <- cv.out$lambda.min
bestlam
## [1] 25.92663
lasso <- glmnet(x.tr, y.tr, alpha = 1)
lassopred <- predict(lasso, s = bestlam, newx = x.test)
mean((lassopred - test$Apps)^2)
## [1] 1846253
# The lasso test MSE obtained is 1846253.
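# The question also asks for the number of non-zero coefficient estimates. A
# minimal sketch, using the lasso fit above:
lasso.coef <- predict(lasso, type = "coefficients", s = bestlam)
sum(lasso.coef != 0)   # number of non-zero entries, including the intercept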

##(e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed(1)
pcrfit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
summary(pcrfit)
## Data:    X dimension: 544 17 
##  Y dimension: 544 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3514     3535     1725     1716     1415     1285     1292
## adjCV         3514     3535     1722     1723     1409     1266     1287
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1237     1214     1202      1205      1197      1209      1213
## adjCV     1232     1210     1200      1203      1193      1205      1209
##        14 comps  15 comps  16 comps  17 comps
## CV         1215      1224      1084      1091
## adjCV      1211      1223      1080      1085
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X     31.8106    57.40    64.08    69.93    75.19    80.22    84.02    87.55
## Apps   0.5794    76.64    77.24    85.07    87.46    87.56    88.33    88.85
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       90.67     93.11     95.14     96.95     98.00     98.86     99.39
## Apps    89.18     89.22     89.41     89.46     89.47     89.51     89.54
##       16 comps  17 comps
## X        99.83    100.00
## Apps     91.82     92.16
validationplot(pcrfit , val.type = "MSEP")

pcrpred <- predict(pcrfit, test, ncomp = 9)
mean((pcrpred - y.test)^2)
## [1] 4003530
# The test error obtained is 4003530 with M = 9. Note that the CV RMSEP above keeps
# improving out to M = 16; M = 9 is where the curve first levels off.
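# Rather than reading M off the validation plot, it can be chosen programmatically.
# A sketch using the pls package's MSEP() accessor:
cv.msep <- MSEP(pcrfit, estimate = "CV")
which.min(cv.msep$val[1, 1, -1])   # drop the 0-component entry, then minimize over M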

##(f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

library(pls)
set.seed(1)
plsfit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
summary(plsfit)
## Data:    X dimension: 544 17 
##  Y dimension: 544 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3514     1552     1163     1173     1158     1133     1102
## adjCV         3514     1548     1157     1171     1152     1124     1095
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1092     1091     1088      1088      1090      1091      1091
## adjCV     1086     1085     1083      1082      1085      1086      1085
##        14 comps  15 comps  16 comps  17 comps
## CV         1091      1091      1091      1091
## adjCV      1085      1085      1085      1085
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       25.51    34.63    62.83    66.57    69.80    73.56    77.21    81.02
## Apps    81.56    88.94    89.77    90.64    91.51    91.96    92.06    92.08
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       83.27     85.14     87.90     90.76     93.56     96.06     97.63
## Apps    92.11     92.14     92.15     92.16     92.16     92.16     92.16
##       16 comps  17 comps
## X        99.18    100.00
## Apps     92.16     92.16
validationplot(plsfit , val.type = "MSEP")

plspred <- predict(plsfit, test, ncomp = 10)
mean((plspred - y.test)^2)
## [1] 1750013
# The test error obtained is 1750013 with M = 10, where the CV RMSEP has leveled off.
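# The pls package also provides selectNcomp() for a more principled choice of M; a
# sketch using the "onesigma" rule (smallest M within one standard error of the minimum):
selectNcomp(plsfit, method = "onesigma", plot = FALSE)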

##(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

# Ranked from best to worst by test MSE:
# 1. linear model (1740793)
# 2. PLS (1750013)
# 3. lasso (1846253)
# 4. ridge (3041114)
# 5. PCR (4003530)
# The first three are close, but there is a sizable gap between PCR (and to a lesser
# extent ridge) and the other models. At best we predict applications with a root MSE
# of roughly 1300 applications.
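# As a quick check of scale, the root MSEs (in units of applications), built from the
# MSE values reported above:
mses <- c(lm = 1740793, pls = 1750013, lasso = 1846253, ridge = 3041114, pcr = 4003530)
sort(sqrt(mses))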

Question 11

(a) Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR. Present and discuss results for the approaches that you consider.

Ridge

boston <- Boston
set.seed(1)
test.idx <- sample(1:nrow(boston), size = 0.3*nrow(boston))
train <- boston[-test.idx, ]
test <- boston[test.idx, ]

library(glmnet)
x <- model.matrix(crim ~ ., train)
x.te <- model.matrix(crim ~ ., test)   # model matrix for the test set
cv.out <- cv.glmnet(x, train$crim, alpha = 0)
bestlam <- cv.out$lambda.min
ridge <- glmnet(x, train$crim, alpha = 0)
ridgepred <- predict(ridge, s = bestlam, newx = x.te)
MSEridge <- mean((ridgepred - test$crim)^2)
MSEridge
## [1] 55.46288
rootMSE <- sqrt(MSEridge)
rootMSE
## [1] 7.447341
plot(cv.out)

Lasso

x.tr <- model.matrix(crim ~., train)
y.tr <- train$crim
x.test <- model.matrix(crim ~., test)
y.test <- test$crim
set.seed(1)
cv.out <- cv.glmnet(x.tr, y.tr, alpha=1)
bestlam <- cv.out$lambda.min
lasso <- glmnet(x.tr, y.tr, alpha = 1)
lassopred <- predict(lasso,s=bestlam,newx = x.test) 
MSElasso <- mean((lassopred - test$crim)^2)
MSElasso
## [1] 54.74424
rootMSE <- sqrt(MSElasso)
rootMSE
## [1] 7.398935
plot(cv.out)

PCR

library(pls)
set.seed(1)
pcrfit <- pcr(crim ~ ., data = train, scale = TRUE, validation = "CV")
summary(pcrfit)
## Data:    X dimension: 355 12 
##  Y dimension: 355 1
## Fit method: svdpc
## Number of components considered: 12
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            8.08    6.796    6.789    6.413    6.414    6.362    6.342
## adjCV         8.08    6.793    6.786    6.405    6.409    6.356    6.335
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps
## CV       6.211    6.262    6.236     6.219     6.208     6.163
## adjCV    6.203    6.253    6.226     6.210     6.196     6.151
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       49.93    63.55    73.09    80.42    87.06    90.28    92.85    95.07
## crim    30.14    30.66    38.60    38.60    39.91    40.58    42.88    42.90
##       9 comps  10 comps  11 comps  12 comps
## X       96.87     98.33     99.51    100.00
## crim    43.48     43.89     44.76     45.75
validationplot(pcrfit , val.type = "MSEP")

pcrpred <- predict(pcrfit, test, ncomp = 7)   # M = 7, where the CV error levels off
MSEpcr <- mean((pcrpred - y.test)^2)
MSEpcr
## [1] 57.31212
rootMSE <- sqrt(MSEpcr)
rootMSE
## [1] 7.570477

##(b) Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, cross-validation, or some other reasonable alternative, as opposed to using training error.

The lasso model has the lowest test MSE (54.74), followed by ridge (55.46) and PCR (57.31), so the lasso performs best on the validation set. Its root MSE of about 7.4 means its predictions of the per-capita crime rate are off by roughly 7.4 units on average.
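A compact side-by-side comparison, using the MSE objects computed above:

data.frame(model = c("ridge", "lasso", "PCR"),
           MSE   = c(MSEridge, MSElasso, MSEpcr),
           RMSE  = sqrt(c(MSEridge, MSElasso, MSEpcr)))

The lasso coefficients at the chosen λ, printed below, also show which predictors the model retains.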

cv.out <- cv.glmnet(x.tr, y.tr, alpha = 1)
predict(cv.out, type = "coefficients", s = bestlam)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                      s1
## (Intercept)  6.54479074
## (Intercept)  .         
## zn           0.03536269
## indus       -0.05554954
## chas        -0.88096081
## nox         -6.24186702
## rm           0.21909432
## age          .         
## dis         -0.72136600
## rad          0.50486034
## tax          .         
## ptratio     -0.10572140
## lstat        0.15571559
## medv        -0.12416743

(c) Does your chosen model involve all of the features in the data set? Why or why not?

# The lasso model does not involve all of the features in the data set. It includes
# only 10 of the 12 predictors: the coefficients of age and tax are shrunk to exactly
# zero, as shown above. The lasso dropped these variables because the cross-validated
# error is lowest without them.
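# A quick sketch for listing the retained variables directly, using the same
# coefficient matrix as above:
coefs <- predict(cv.out, type = "coefficients", s = bestlam)[, 1]
names(coefs)[coefs != 0]   # the intercept plus the 10 retained predictors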