Load the libraries used in this analysis

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.2
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.2
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.2.2
## Loaded glmnet 4.1-6
library(pls)
## Warning: package 'pls' was built under R version 4.2.2
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.2
library(MASS)

Import the dataset

data("College")
attach(College)

(a) Split the data set into a training set and a test set.

set.seed(2)
train <- sample(1:nrow(College), nrow(College)/2)
test <- -train
y.test <- Apps[test]
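As a quick sanity check (not part of the question itself), the split sizes can be verified; with 777 rows in College the training half has 388 observations, which matches the "X dimension: 388 17" reported by pcr and plsr below.
length(train)   # training observations (388)
length(y.test)  # held-out observations (the remaining 389)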
(b) Fit a linear model using least squares on the training set, and report the test error obtained.
lm.fit <- lm(Apps ~., data=College, subset = train)
lm.pred <- predict(lm.fit, College[test,])
lm.error <- mean((lm.pred - y.test)^2)
lm.error
## [1] 1093608
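Because Apps is a raw count of applications, the square root of the test MSE is easier to interpret than the MSE itself; for the value above it works out to roughly a thousand applications.
sqrt(lm.error)  # typical size of a prediction error, in number of applications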
(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
x <- model.matrix(Apps~., data=College)[, -1]
x.train <- x[train,]
y <- College$Apps
y.train <- y[train]
set.seed(7)
ridge.fit <- glmnet(x.train,y.train,alpha=0)
cv.ridge.fit <- cv.glmnet(x.train,y.train,alpha=0)
plot(cv.ridge.fit)

bestlam <- cv.ridge.fit$lambda.min
bestlam
## [1] 424.2704
ridge.pred <- predict(ridge.fit, s=bestlam, newx=x[test,]) 
ridge.error <- mean((ridge.pred-y.test)^2)
ridge.error
## [1] 1138030
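The error above uses lambda.min. As an optional check, cv.glmnet also reports the one-standard-error choice lambda.1se, a more heavily penalized alternative; a short sketch reusing the objects defined above:
cv.ridge.fit$lambda.1se                                    # largest lambda within 1 SE of the minimum CV error
ridge.pred.1se <- predict(cv.ridge.fit, s = "lambda.1se", newx = x[test, ])
mean((ridge.pred.1se - y.test)^2)                          # test MSE under the 1-SE rule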
(d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
set.seed(2)
lasso.fit <- glmnet(x.train,y.train,alpha=1)
cv.lasso.fit<- cv.glmnet(x.train,y.train,alpha=1)
bestlam.lasso <- cv.lasso.fit$lambda.min
bestlam.lasso
## [1] 15.97351
lasso.pred <- predict(lasso.fit, s=bestlam.lasso, newx=x[test,])
lasso.error <- mean((lasso.pred-y.test)^2)
lasso.error
## [1] 1045922
# coefficients at the lasso's own CV-chosen lambda (18 rows: intercept + 17 predictors)
lasso.c <- predict(lasso.fit, type = "coefficients", s = bestlam.lasso)[1:18, ]
length(lasso.c[lasso.c != 0])  # number of non-zero coefficient estimates
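Beyond counting them, it is useful to see which predictors the lasso keeps; the non-zero entries of the coefficient vector can be listed by name:
names(lasso.c[lasso.c != 0])  # predictors retained at the CV-chosen lambda (plus the intercept)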
(e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
set.seed(1)
pcr.fit <- pcr(Apps~., data=College, subset=train, scale=TRUE, validation ="CV")
summary(pcr.fit)
## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4440     4463     2362     2385     2092     1835     1832
## adjCV         4440     4465     2359     2384     2066     1824     1823
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1829     1821     1753      1764      1771      1772      1784
## adjCV     1818     1813     1746      1757      1764      1764      1777
##        14 comps  15 comps  16 comps  17 comps
## CV         1778      1747      1401      1338
## adjCV      1773      1701      1385      1322
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X     31.4816    57.40    64.84    70.54    75.80    80.23    84.04    87.64
## Apps   0.1398    72.45    72.50    80.45    84.74    84.77    85.06    85.16
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       90.87     93.29     95.33     96.98     98.05     98.75     99.37
## Apps    85.93     86.16     86.21     86.32     86.40     86.61     92.49
##       16 comps  17 comps
## X        99.85    100.00
## Apps     93.65     94.32
validationplot(pcr.fit, val.type = "MSEP")

pcr.pred <- predict(pcr.fit,x[test,], ncomp=17) 
pcr.error <- mean((pcr.pred-y.test)^2)
pcr.error
## [1] 1093608
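Cross-validation error is smallest at M = 17, i.e. using all 17 components, which is why the PCR test error matches the least-squares error above. The chosen M can also be read off programmatically rather than from the plot; a sketch using pls::RMSEP on the fitted object:
cv.rmsep <- RMSEP(pcr.fit, estimate = "CV")
which.min(cv.rmsep$val) - 1  # subtract 1 because the first entry is the intercept-only model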
(f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
set.seed(1)
pls.fit <- plsr(Apps~., data=College, subset=train, scale=TRUE, validation="CV")
summary(pls.fit)
## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4440     2153     1701     1711     1650     1498     1413
## adjCV         4440     2147     1677     1704     1615     1470     1392
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1378     1358     1356      1342      1338      1337      1337
## adjCV     1361     1342     1339      1326      1322      1321      1321
##        14 comps  15 comps  16 comps  17 comps
## CV         1338      1338      1338      1338
## adjCV      1322      1322      1322      1322
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       25.76    31.99    61.88    65.00    68.68    73.82    78.33    81.38
## Apps    77.92    87.80    88.38    92.06    93.59    93.92    94.01    94.09
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       83.24     85.59     87.76     90.49     92.69     94.27     97.03
## Apps    94.20     94.27     94.30     94.31     94.32     94.32     94.32
##       16 comps  17 comps
## X        99.21    100.00
## Apps     94.32     94.32
validationplot(pls.fit, val.type = "MSEP")

pls.pred <- predict(pls.fit,x[test,],ncomp=12) 
pls.error <- mean((pls.pred-y.test)^2)
pls.error
## [1] 1085346
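M = 12 is used here because the cross-validated RMSEP flattens out at roughly 1337-1338 from about 12 components onward. Recent versions of pls also provide selectNcomp, which formalizes this with a one-standard-error rule; a hedged sketch:
selectNcomp(pls.fit, method = "onesigma", plot = FALSE)  # smallest M within 1 SE of the minimum CV error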
(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
errors <- c(lm.error, ridge.error, lasso.error, pcr.error, pls.error)
names(errors) <- c("lm", "ridge", "lasso", "pcr", "pls")
barplot(sort(errors))

print(sort(errors))
##   lasso     pls      lm     pcr   ridge 
## 1045922 1085346 1093608 1093608 1138030
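All five test MSEs fall in a narrow band, from about 1.05 million (lasso) to 1.14 million (ridge), so none of the shrinkage or dimension-reduction methods improves much on plain least squares for these data. To put the accuracy on a more interpretable scale, the test R-squared (the share of variance in Apps explained on the held-out half) can be computed from the objects above:
tss <- mean((y.test - mean(y.test))^2)  # mean-only benchmark on the test set
round(1 - errors / tss, 3)              # test R-squared for each of the five models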