Import the libraries used for this workspace
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.2
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.2
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.2.2
## Loaded glmnet 4.1-6
library(pls)
## Warning: package 'pls' was built under R version 4.2.2
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.2
library(MASS)
Import the dataset
data("College")
attach(College)
(a)Split the data set into a training set and a test set.
set.seed(2)
train <- sample(1:nrow(College), nrow(College)/2)
test <- -train
y.test <- Apps[test]
lm.fit <- lm(Apps ~., data=College, subset = train)
lm.pred <- predict(lm.fit, College[test,])
lm.error <- mean((lm.pred - y.test)^2)
lm.error
## [1] 1093608
x <- model.matrix(Apps~., data=College)[, -1]
x.train <- x[train,]
y <- College$Apps
y.train <- y[train]
set.seed(7)
ridge.fit <- glmnet(x.train,y.train,alpha=0)
cv.ridge.fit <- cv.glmnet(x.train,y.train,alpha=0)
plot(cv.ridge.fit)
bestlam <- cv.ridge.fit$lambda.min
bestlam
## [1] 424.2704
ridge.pred <- predict(ridge.fit, s=bestlam, newx=x[test,])
ridge.error <- mean((ridge.pred-y.test)^2)
ridge.error
## [1] 1138030
set.seed(2)
lasso.fit <- glmnet(x.train,y.train,alpha=1)
cv.lasso.fit<- cv.glmnet(x.train,y.train,alpha=1)
bestlam.lasso <- cv.lasso.fit$lambda.min
bestlam.lasso
## [1] 15.97351
lasso.pred <- predict(lasso.fit, s=bestlam.lasso, newx=x[test,])
lasso.error <- mean((lasso.pred-y.test)^2)
lasso.error
## [1] 1045922
lasso.c <- predict(lasso.fit,type="coefficients", s=bestlam)[1:17,]
length(lasso.c[lasso.c != 0])
## [1] 3
set.seed(1)
pcr.fit <- pcr(Apps~., data=College, subset=train, scale=TRUE, validation ="CV")
summary(pcr.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4440 4463 2362 2385 2092 1835 1832
## adjCV 4440 4465 2359 2384 2066 1824 1823
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1829 1821 1753 1764 1771 1772 1784
## adjCV 1818 1813 1746 1757 1764 1764 1777
## 14 comps 15 comps 16 comps 17 comps
## CV 1778 1747 1401 1338
## adjCV 1773 1701 1385 1322
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 31.4816 57.40 64.84 70.54 75.80 80.23 84.04 87.64
## Apps 0.1398 72.45 72.50 80.45 84.74 84.77 85.06 85.16
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.87 93.29 95.33 96.98 98.05 98.75 99.37
## Apps 85.93 86.16 86.21 86.32 86.40 86.61 92.49
## 16 comps 17 comps
## X 99.85 100.00
## Apps 93.65 94.32
validationplot(pcr.fit, val.type = "MSEP")
pcr.pred <- predict(pcr.fit,x[test,], ncomp=17)
pcr.error <- mean((pcr.pred-y.test)^2)
pcr.error
## [1] 1093608
set.seed(1)
pls.fit <- plsr(Apps~., data=College, subset=train, scale=TRUE, validation="CV")
summary(pls.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4440 2153 1701 1711 1650 1498 1413
## adjCV 4440 2147 1677 1704 1615 1470 1392
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1378 1358 1356 1342 1338 1337 1337
## adjCV 1361 1342 1339 1326 1322 1321 1321
## 14 comps 15 comps 16 comps 17 comps
## CV 1338 1338 1338 1338
## adjCV 1322 1322 1322 1322
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 25.76 31.99 61.88 65.00 68.68 73.82 78.33 81.38
## Apps 77.92 87.80 88.38 92.06 93.59 93.92 94.01 94.09
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.24 85.59 87.76 90.49 92.69 94.27 97.03
## Apps 94.20 94.27 94.30 94.31 94.32 94.32 94.32
## 16 comps 17 comps
## X 99.21 100.00
## Apps 94.32 94.32
validationplot(pls.fit, val.type = "MSEP")
pls.pred <- predict(pls.fit,x[test,],ncomp=12)
pls.error <- mean((pls.pred-y.test)^2)
pls.error
## [1] 1085346
errors <- c(lm.error, ridge.error, lasso.error, pcr.error, pls.error)
names(errors) <- c("lm", "ridge", "lasso", "pcr", "pls")
barplot(sort(errors))
print(sort(errors))
## lasso pls lm pcr ridge
## 1045922 1085346 1093608 1093608 1138030