set.seed(1005)

Exercise 2

  1. The ridge, relative to least squares, is less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.

  2. The lasso, relative to least squares, is less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.

  3. A non-linear model, relative to least squares, is more flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.

Exercise 9

9. In this exercise, we will predict the number of applications received using the other variables in the College data set.

attach(College)
# moving the target variable to the right
college <- College %>% 
  relocate(Apps, .after = last_col())

(a) Split the data set into a training set and a test set.

train.index <- createDataPartition(Apps, p = .5, 
                                  list = FALSE, 
                                  times = 1)
test.index=(-train.index)
train.data <- college[ train.index,]
test.data  <- college[-train.index,]

(b) Fit a linear model using least squares on the training set, and report the test error obtained.

lm.fit = lm(Apps ~., data = train.data)
pred.lm = predict(lm.fit, newdata = test.data)
mean((pred.lm - test.data$Apps)^2)
## [1] 1568334

(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

x=model.matrix(Apps~.,college[,-1])
y=Apps
y.test = y[test.index]
# try different lambda, scale , plot shows best lambda
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x[train.index,],y[train.index],alpha=0, lambda= grid, thresh = 1e-12)
dim(coef(ridge.mod))
## [1]  18 100
ridge.mod$lambda[60] # lambda for obs 60
## [1] 705.4802
cv.out=cv.glmnet(x[train.index,],y[train.index],alpha=0) #cross validation
bestlam=cv.out$lambda.min #best lambda
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test.index,]) #predictions
out=glmnet(x,y,alpha=0)
predict(out,type = 'coefficients', s=bestlam,) #all the coefficients for best lambda
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -1.939082e+03
## (Intercept)  .           
## Accept       1.010301e+00
## Enroll       4.556312e-01
## Top10perc    2.560732e+01
## Top25perc    4.580389e-01
## F.Undergrad  8.239719e-02
## P.Undergrad  3.283942e-02
## Outstate    -4.394720e-02
## Room.Board   1.789143e-01
## Books        1.004418e-01
## Personal    -3.393769e-03
## PhD         -2.190755e+00
## Terminal    -2.885401e+00
## S.F.Ratio    2.097753e+01
## perc.alumni -1.017715e+01
## Expend       7.802475e-02
## Grad.Rate    1.040950e+01
mean((ridge.pred-y.test)^2) #error rate
## [1] 2363968

(d) Fit a lasso model on the training set, with λ chosen by cross- validation. Report the test error obtained, along with the number of non-zero coefficient estimates.

lasso.mod=glmnet(x[train.index,],y[train.index],alpha=1, lambda= grid, thresh = 1e-12)
dim(coef(ridge.mod))
## [1]  18 100
lasso.mod$lambda[60] # lambda for obs 60
## [1] 705.4802
cv.out=cv.glmnet(x[train.index,],y[train.index],alpha=1) #cross validation
bestlam=cv.out$lambda.min #best lambda
lasso.pred=predict(ridge.mod,s=bestlam,newx=x[test.index,]) #predictions
out=glmnet(x,y,alpha=1)
predict(out,type = 'coefficients', s=bestlam,) #all the coefficients for best lambda
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -930.64053249
## (Intercept)    .         
## Accept         1.56913333
## Enroll        -0.71693773
## Top10perc     47.03896732
## Top25perc    -12.35544301
## F.Undergrad    0.05029422
## P.Undergrad    0.04925476
## Outstate      -0.10518557
## Room.Board     0.13216688
## Books          .         
## Personal       0.02743217
## PhD           -6.33491339
## Terminal      -1.32106437
## S.F.Ratio     21.57813585
## perc.alumni   -1.43392828
## Expend         0.08064847
## Grad.Rate      7.47223018
mean((lasso.pred-y.test)^2) #error rate
## [1] 1636813
print('The LASSO model selected 15 predictors.')
## [1] "The LASSO model selected 15 predictors."

(e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

set.seed(1001)
pcr.fit = pcr(Apps ~.,data=college, scale = TRUE, validation= 'CV')
summary(pcr.fit) #return statistics
## Data:    X dimension: 777 17 
##  Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3873     3835     2029     2033     1856     1588     1585
## adjCV         3873     3835     2027     2032     1733     1583     1582
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1570     1542     1495      1495      1500      1501      1507
## adjCV     1572     1537     1492      1492      1498      1498      1504
##        14 comps  15 comps  16 comps  17 comps
## CV         1508      1441      1167      1127
## adjCV      1505      1424      1160      1121
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X      31.670    57.30    64.30    69.90    75.39    80.38    83.99    87.40
## Apps    2.316    73.06    73.07    82.08    84.08    84.11    84.32    85.18
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       90.50     92.91     95.01     96.81      97.9     98.75     99.36
## Apps    85.88     86.06     86.06     86.10      86.1     86.13     90.32
##       16 comps  17 comps
## X        99.84    100.00
## Apps     92.52     92.92
validationplot(pcr.fit, val.type = "MSEP", ylab = "Mean Squared Error", type = "b") #mean squared error

pcr.pred=predict(pcr.fit, test.data, ncomp=5)
mean((pcr.pred-y.test)^2)
## [1] 3178434

(f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

pls.fit =plsr(Apps~.,data=train.data, scale = TRUE, validation = 'CV')
summary(pls.fit)
## Data:    X dimension: 389 17 
##  Y dimension: 389 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3468     1572     1358     1173     1150     1150     1128
## adjCV         3468     1569     1364     1170     1147     1145     1121
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1124     1119     1116      1116      1117      1115      1116
## adjCV     1118     1113     1110      1111      1112      1110      1110
##        14 comps  15 comps  16 comps  17 comps
## CV         1117      1117      1116      1117
## adjCV      1111      1111      1111      1111
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       26.20    48.52    63.55    67.87    72.94    74.86    78.86    81.71
## Apps    80.33    85.46    89.47    90.30    90.68    91.20    91.32    91.38
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       85.16     87.85     90.66     92.66     93.55     95.15     97.25
## Apps    91.39     91.40     91.40     91.40     91.41     91.41     91.41
##       16 comps  17 comps
## X        98.12    100.00
## Apps     91.41     91.41
validationplot(pls.fit, val.type = "MSEP", ylab = "Mean Squared Error", type = "b")

pls.pred=predict(pls.fit, test.data, ncomp=5)
mean((pls.pred-y.test)^2)
## [1] 2364958

(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

Based on the MSE,The ridge regression model appears to be the best to predict number of college applications received.

Exercise 11

We will now try to predict per capita crime rate in the Boston data set. (a) Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR. Present and discuss results for the approaches that you consider.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(leaps)
attach(Boston)
boston <- Boston %>% 
  relocate(crim, .after = last_col())

regfit.full=regsubsets(crim~.,boston,nvmax = 13)
reg.summary =summary(regfit.full)
par(mfrow=c(2,2))

plot(reg.summary$rss, xlab="Number of Variables ", ylab="RSS", type="b")

plot(reg.summary$adjr2, xlab="Number of Variables ", ylab="Adjusted RSq",type="b")
point = which.max(reg.summary$adjr2)
points(point,reg.summary$adjr2[point], col ="red", cex=2, pch =20)

plot(reg.summary$bic, xlab="Number of Variables ", ylab="BIC", type="b")
point =which.min(reg.summary$bic)
points(point,reg.summary$bic[point], col ="red", cex=2, pch =20)


plot(reg.summary$cp, xlab="Number of Variables ", ylab="Cp",type="b")
point =which.min(reg.summary$cp)
points(point,reg.summary$cp[point], col ="red", cex=2, pch =20)

set.seed(1)
train=sample(c(TRUE ,FALSE), nrow(boston),rep=TRUE)
test=(!train)
# using validation to find the model with the lower mse
bostontest.mat <- model.matrix(crim~ ., data = boston[test,], nvmax = 13)
val.errors <- rep(NA, 13)
for (i in 1:13) {coefi <- coef(regfit.full, id = i)
    pred <- bostontest.mat[, names(coefi)] %*% coefi
    val.errors[i] <- mean((pred - boston[test,]$crim)^2)}

plot(val.errors, xlab = "Number of predictors", ylab = "Test MSE", pch = 19, type = "b",col="orange")
point =which.min(val.errors)
points(point,val.errors[point], col ="red", cex=2, pch =20)

val.errors[point]
## [1] 48.80265
# sampling
train.index <- createDataPartition(crim, p = .5, 
                                  list = FALSE, 
                                  times = 1)
test.index=(-train.index)
train.data <- boston[ train.index,]
test.data  <- boston[-train.index,]

(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

x=model.matrix(crim~.,boston[,-1])
y=crim
y.test = y[test.index]

RIDGE

# try different lambda, scale , plot shows best lambda
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x[train.index,],y[train.index],alpha=0, lambda= grid, thresh = 1e-12)
dim(coef(ridge.mod))
## [1]  14 100
cv.out=cv.glmnet(x[train.index,],y[train.index],alpha=0) #cross validation
bestlam=cv.out$lambda.min #best lambda
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test.index,]) #predictions
out=glmnet(x,y,alpha=0)
mean((ridge.pred-y.test)^2) #error rate
## [1] 36.15396

LASSO

lasso.mod=glmnet(x[train.index,],y[train.index],alpha=1, lambda= grid, thresh = 1e-12)
dim(coef(ridge.mod))
## [1]  14 100
lasso.mod$lambda[60] # lambda for obs 60
## [1] 705.4802
cv.out=cv.glmnet(x[train.index,],y[train.index],alpha=1) #cross validation
bestlam=cv.out$lambda.min #best lambda
lasso.pred=predict(ridge.mod,s=bestlam,newx=x[test.index,]) #predictions
out=glmnet(x,y,alpha=1)
predict(out,type = 'coefficients', s=bestlam,) #all the coefficients for best lambda
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept)  1.918452482
## (Intercept)  .          
## indus        .          
## chas        -0.009965673
## nox          .          
## rm           .          
## age          .          
## dis         -0.093543735
## rad          0.464809399
## tax          .          
## ptratio      .          
## black       -0.007034536
## lstat        0.123201610
## medv        -0.063806206
mean((lasso.pred-y.test)^2) #error rate
## [1] 35.78623
print('The LASSO model selected 13 predictors.')
## [1] "The LASSO model selected 13 predictors."

PCR

pcr.fit = pcr(crim ~.,data=train.data, scale = TRUE, validation= 'CV')
#summary(pcr.fit)
validationplot(pcr.fit, val.type = "MSEP", ylab = "Mean Squared Error", type = "b") #mean squared error

pcr.pred=predict(pcr.fit, test.data, ncomp=3)
mean((pcr.pred-y.test)^2)
## [1] 38.2857

PLS

pls.fit =plsr(crim~.,data=train.data, scale = TRUE, validation = 'CV')
#summary(pls.fit)
validationplot(pls.fit, val.type = "MSEP", ylab = "Mean Squared Error", type = "b")

pls.pred=predict(pls.fit, test.data, ncomp=5)
mean((pls.pred-y.test)^2)
## [1] 35.52806

(b) Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, cross-validation, or some other reasonable alternative, as opposed to using training error.

The subset model selection appears to be the best solution because it allowed me to have the lowest error rate while keeping a reasonable goodness-of-fit.

(c) Does your chosen model involve all of the features in the data set? Why or why not?

The model chosen only contains 9 predictors because it’s the set that deliver the best results overall.