library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-2
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x tidyr::unpack() masks Matrix::unpack()
library(ISLR)
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library(leaps)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
iii - As the value of lambda increases, the coefficients are shrunk toward zero, which increases bias and decreases variance. So the optimal model is the point along the cross-validation curve where the marginal increase in bias becomes larger than the reduction in variance.
iii - The same reasoning applies here: a larger lambda shrinks the coefficients, trading higher bias for lower variance, and the optimum is again the point where the marginal increase in bias begins to outweigh the variance reduction.
ii - Non-linear models are more flexible, but as flexibility increases so does variance, and the model may end up overfitting the data. So the optimal model is the point along the cross-validation curve just before the marginal reduction in bias becomes smaller than the increase in variance.
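As a minimal illustration of this trade-off (simulated data, not part of the exercise), the cross-validation curve of a ridge fit is U-shaped in lambda: the error first falls as variance drops, then rises as bias dominates.

set.seed(42)
x.sim <- matrix(rnorm(100*20), 100, 20)          # 100 observations, 20 predictors
y.sim <- drop(x.sim %*% rnorm(20)) + rnorm(100)  # linear signal plus noise
cv.sim <- cv.glmnet(x.sim, y.sim, alpha=0)       # ridge: alpha = 0
plot(cv.sim)  # U-shaped CV error as a function of log(lambda)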
as_tibble(College)
## # A tibble: 777 x 18
## Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Yes 1660 1232 721 23 52 2885 537
## 2 Yes 2186 1924 512 16 29 2683 1227
## 3 Yes 1428 1097 336 22 50 1036 99
## 4 Yes 417 349 137 60 89 510 63
## 5 Yes 193 146 55 16 44 249 869
## 6 Yes 587 479 158 38 62 678 41
## 7 Yes 353 340 103 17 45 416 230
## 8 Yes 1899 1720 489 37 68 1594 32
## 9 Yes 1038 839 227 30 63 973 306
## 10 Yes 582 498 172 21 44 799 78
## # ... with 767 more rows, and 10 more variables: Outstate <dbl>,
## # Room.Board <dbl>, Books <dbl>, Personal <dbl>, PhD <dbl>, Terminal <dbl>,
## # S.F.Ratio <dbl>, perc.alumni <dbl>, Expend <dbl>, Grad.Rate <dbl>
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
set.seed(1)
# 'train' has length ~nrow(College)/2; R recycles this logical vector when
# subsetting rows below, selecting roughly half the rows for training
train=sample(c(TRUE,FALSE), ((nrow(College)/2)+1),rep=TRUE)
test=(!train)
col.train <- College[train,]
col.test <- College[test,]
col.lm <- lm(Apps~., data = col.train)
col.lm.pred <- predict(col.lm, col.test)
lm.MRSS <- mean( (col.test$Apps-col.lm.pred)^2)
lm.MRSS
## [1] 1077995
x.train <- model.matrix(Apps~., col.train)[,-1]
y.train <- col.train$Apps
x.test <- model.matrix(Apps~., col.test)[,-1]
y.test <- col.test$Apps
grid <- 10^seq(10,-2,length=100) # lambda grid from 10^10 down to 10^-2
col.ridge <- glmnet(x.train,y.train,alpha=0,lambda=grid)
set.seed(1)
cv.out=cv.glmnet(x.train,y.train,alpha=0)
bestlam=cv.out$lambda.min
col.ridge.pred <- predict(col.ridge,s=bestlam,newx=x.test)
ridge.MRSS <- mean((col.ridge.pred-y.test)^2)
ridge.MRSS
## [1] 930050
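To see how bestlam was chosen, the cross-validation curve stored in cv.out can be plotted directly (a one-line sketch; plot not shown):

plot(cv.out)  # CV MSE versus log(lambda); a dotted vertical line marks lambda.min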
col.lasso <- glmnet(x.train,y.train,alpha=1,lambda=grid, thresh=1e-12)
set.seed(1)
cv.out.lasso <- cv.glmnet(x.train,y.train,alpha=1)
bestlam.lasso <- cv.out.lasso$lambda.min
col.lasso.pred <- predict(col.lasso,s=bestlam.lasso,newx=x.test)
lasso.MRSS <- mean((col.lasso.pred-y.test)^2)
lasso.MRSS
## [1] 998187.6
predict(col.lasso,s=bestlam.lasso, type="coefficients")
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -493.54618485
## PrivateYes -375.96382308
## Accept 1.62547308
## Enroll -0.61576761
## Top10perc 45.86837931
## Top25perc -8.56576959
## F.Undergrad .
## P.Undergrad .
## Outstate -0.08886467
## Room.Board 0.14257229
## Books .
## Personal .
## PhD .
## Terminal -8.54009477
## S.F.Ratio 10.68567450
## perc.alumni -2.07714983
## Expend 0.06817736
## Grad.Rate 5.78392758
Five predictors have coefficients of exactly zero; the remaining twelve predictors are non-zero.
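These counts can also be computed programmatically rather than read off the printout (a short sketch; lasso.coef is just a throwaway name):

lasso.coef <- predict(col.lasso, s=bestlam.lasso, type="coefficients")[-1,]  # drop the intercept
sum(lasso.coef == 0)  # predictors shrunk exactly to zero (5 above)
sum(lasso.coef != 0)  # predictors retained (12 above)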
set.seed(1)
col.pcr<-pcr(Apps~., data=col.train,scale=TRUE, validation="CV")
validationplot(col.pcr,val.type="MSEP")
pcr.components=rep(NA,17)
for(i in 1:17){
col.pcr.pred<-predict(col.pcr,col.test,ncomp=i)
pcr.components[i]<-mean((col.pcr.pred-col.test$Apps)^2)
}
pcr.components
## [1] 10017947 3149728 3085874 2263542 1940907 1914759 1902246 1914291
## [9] 1631069 1468919 1492047 1553345 1561114 1561856 1420067 1141053
## [17] 1077995
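The raw minimum of these test errors occurs only at the full 17-component fit, which spans the same space as the original predictors and therefore reproduces the least-squares fit; a quick check (sketch):

which.min(pcr.components)               # 17, i.e. the full model
all.equal(pcr.components[17], lm.MRSS)  # all components = least squares, up to numerical tolerance

Since the full model offers no dimension reduction, ncomp = 10 is used below, around where the validation curve levels off.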
col.pcr.pred<-predict(col.pcr,col.test,ncomp=10)
pcr.MRSS<-mean((col.pcr.pred-col.test$Apps)^2)
pcr.MRSS
## [1] 1468919
set.seed(15)
col.pls<-plsr(Apps~., data=col.train,scale=TRUE, validation="CV")
validationplot(col.pls,val.type="MSEP")
pls.components=rep(NA,17)
for(i in 1:17){
col.pls.pred<-predict(col.pls,col.test,ncomp= i)
pls.components[i]<-mean((col.pls.pred - col.test$Apps)^2)
}
pls.components
## [1] 2463404 1873551 1302696 1551109 1505036 1110472 1110609 1109600 1105320
## [10] 1104187 1099639 1094506 1084774 1076663 1078026 1077927 1077995
From six components onward the test error changes very little, so ncomp = 6 is used.
col.pls.pred<-predict(col.pls,col.test,ncomp= 6)
pls.MRSS<-mean((col.pls.pred - col.test$Apps)^2)
pls.MRSS
## [1] 1110472
lm.MRSS
## [1] 1077995
ridge.MRSS
## [1] 930050
lasso.MRSS
## [1] 998187.6
pcr.MRSS
## [1] 1468919
pls.MRSS
## [1] 1110472
Comparing the results, the ridge model had the lowest test error and the PCR model the highest, so the ridge model predicts the number of applications most accurately.
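Since the tidyverse is loaded, the five test errors can also be collected into a single sorted table for easier comparison (a small sketch):

tibble(model = c("least squares","ridge","lasso","pcr","pls"),
       test.MSE = c(lm.MRSS, ridge.MRSS, lasso.MRSS, pcr.MRSS, pls.MRSS)) %>%
  arrange(test.MSE)  # ridge first, pcr last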
as_tibble(Boston)
## # A tibble: 506 x 14
## crim zn indus chas nox rm age dis rad tax ptratio black
## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296 15.3 397.
## 2 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242 17.8 397.
## 3 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242 17.8 393.
## 4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222 18.7 395.
## 5 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222 18.7 397.
## 6 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222 18.7 394.
## 7 0.0883 12.5 7.87 0 0.524 6.01 66.6 5.56 5 311 15.2 396.
## 8 0.145 12.5 7.87 0 0.524 6.17 96.1 5.95 5 311 15.2 397.
## 9 0.211 12.5 7.87 0 0.524 5.63 100 6.08 5 311 15.2 387.
## 10 0.170 12.5 7.87 0 0.524 6.00 85.9 6.59 5 311 15.2 387.
## # ... with 496 more rows, and 2 more variables: lstat <dbl>, medv <dbl>
set.seed(1)
# as before, 'train' has length ~nrow(Boston)/2 and is recycled when
# subsetting rows, selecting roughly half the rows for training
train=sample(c(TRUE,FALSE), ((nrow(Boston)/2)+1),rep=TRUE)
test=(!train)
bos.train <- Boston[train,]
bos.test <- Boston[test,]
bos.lm <- lm(crim~., data = bos.train)
bos.lm.pred <- predict(bos.lm, bos.test)
bos.lm.MRSS <- mean( (bos.test$crim-bos.lm.pred)^2)
bos.lm.MRSS
## [1] 22.04901
regfit.best=regsubsets(crim~.,data=bos.train,nvmax=13)
test.mat=model.matrix(crim~.,data=bos.test)
val.errors=rep(NA,13)
for(i in 1:13){
coefi=coef(regfit.best,id=i)
pred=test.mat[,names(coefi)]%*%coefi
val.errors[i]=mean((bos.test$crim-pred)^2)
}
which.min(val.errors)
## [1] 2
# for reference, the coefficients of the ten-variable model; the
# validation-set minimum above is the two-variable model, used below
coef(regfit.best,10)
## (Intercept) zn indus nox rm
## 23.186240114 0.059435201 -0.093162542 -19.567845950 1.515871588
## dis rad ptratio black lstat
## -1.384520186 0.588866514 -0.538677123 -0.005334238 0.163344059
## medv
## -0.362774754
coefi=coef(regfit.best,id=2)
pred=test.mat[,names(coefi)]%*%coefi
bos.subset.MRSS =mean((bos.test$crim-pred)^2)
bos.subset.MRSS
## [1] 19.89497
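regsubsets objects have no built-in predict method, which is why the loop above multiplies the test model matrix by the coefficients manually. A reusable helper in the style of the ISLR lab (a sketch, not part of the leaps package):

predict.regsubsets <- function(object, newdata, id, ...){
  form <- as.formula(object$call[[2]])  # recover the formula from the fitted call
  mat <- model.matrix(form, newdata)    # build the model matrix for the new data
  coefi <- coef(object, id=id)          # coefficients of the id-variable model
  mat[, names(coefi)] %*% coefi         # predictions
}
# equivalent to the manual computation above:
bos.subset.pred <- predict(regfit.best, bos.test, id=2)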
set.seed(1)
x.train <- model.matrix(crim~., bos.train)[,-1]
y.train <- bos.train$crim
x.test <- model.matrix(crim~., bos.test)[,-1]
y.test <- bos.test$crim
grid <- 10^seq(10,-2,length=100) # same lambda grid as for the College data
bos.ridge <- glmnet(x.train,y.train,alpha=0,lambda=grid)
bos.cv.out=cv.glmnet(x.train,y.train,alpha=0)
bos.bestlam=bos.cv.out$lambda.min
bos.ridge.pred <- predict(bos.ridge, s=bos.bestlam, newx=x.test)
bos.ridge.MRSS <- mean((bos.ridge.pred-y.test)^2)
bos.ridge.MRSS
## [1] 20.01404
bos.lasso <- glmnet(x.train,y.train,alpha=1,lambda=grid, thresh=1e-12)
bos.cv.out.lasso <- cv.glmnet(x.train,y.train,alpha=1)
bos.bestlam.lasso <- bos.cv.out.lasso$lambda.min
bos.lasso.pred <- predict(bos.lasso, s=bos.bestlam.lasso, newx=x.test)
bos.lasso.MRSS <- mean((bos.lasso.pred-y.test)^2)
bos.lasso.MRSS
## [1] 19.65453
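As with the College data, it is worth inspecting which predictors the lasso keeps, since the conclusion below turns on this sparsity (sketch; output not shown):

predict(bos.lasso, s=bos.bestlam.lasso, type="coefficients")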
bos.pcr<-pcr(crim~., data=bos.train, scale=TRUE, validation="CV")
validationplot(bos.pcr,val.type="MSEP")
bos.pcr.components=rep(NA,13)
for(i in 1:13){
bos.pcr.pred<-predict(bos.pcr,bos.test,ncomp=i)
bos.pcr.components[i]<-mean((bos.pcr.pred-bos.test$crim)^2)
}
bos.pcr.components
## [1] 27.95384 27.78242 22.15407 21.85607 22.15933 22.04936 22.44147 20.66903
## [9] 20.65891 20.45213 20.41691 22.81438 22.04901
The test error changes little beyond eight components, so ncomp = 8 is used.
bos.pcr.pred<-predict(bos.pcr,bos.test,ncomp=8)
bos.pcr.MRSS<-mean((bos.pcr.pred-bos.test$crim)^2)
bos.pcr.MRSS
## [1] 20.66903
bos.pls<-plsr(crim~., data=bos.train,scale=TRUE, validation="CV")
validationplot(bos.pls,val.type="MSEP")
bos.pls.components=rep(NA,13)
for(i in 1:13){
bos.pls.pred<-predict(bos.pls,bos.test,ncomp= i)
bos.pls.components[i]<-mean((bos.pls.pred - bos.test$crim)^2)
}
bos.pls.components
## [1] 25.75036 20.94541 21.51690 22.03018 21.62070 22.19780 22.53804 22.43431
## [9] 22.18842 22.04813 22.04890 22.04890 22.04901
bos.pls.pred<-predict(bos.pls,bos.test,ncomp= 2)
bos.pls.MRSS<-mean((bos.pls.pred - bos.test$crim)^2)
bos.pls.MRSS
## [1] 20.94541
bos.lm.MRSS
## [1] 22.04901
bos.subset.MRSS
## [1] 19.89497
bos.ridge.MRSS
## [1] 20.01404
bos.lasso.MRSS
## [1] 19.65453
bos.pcr.MRSS
## [1] 20.66903
bos.pls.MRSS
## [1] 20.94541
Comparing the results, the lasso had the lowest test error (19.65453), followed closely by the linear model chosen by best subset selection (19.89497); the full least-squares fit did worst (22.04901).
The best subset model used only 2 of the 13 predictors, and the lasso likewise shrank several coefficients to exactly zero. The strength of both models is precisely that they do not use all the predictors: they reduce variance and flexibility without a substantial increase in bias, because predictors that are not significantly correlated with crim are excluded.
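The same sorted-table sketch used for the College models confirms this ordering:

tibble(model = c("least squares","best subset","ridge","lasso","pcr","pls"),
       test.MSE = c(bos.lm.MRSS, bos.subset.MRSS, bos.ridge.MRSS,
                    bos.lasso.MRSS, bos.pcr.MRSS, bos.pls.MRSS)) %>%
  arrange(test.MSE)  # lasso first, least squares last
coef(regfit.best, id=which.min(val.errors))  # the two predictors kept by best subset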