Libraries:
library(ISLR)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## The following object is masked from 'package:ISLR2':
##
## Boston
library(leaps)
Chapter 06 (page 283): Exercises 2, 9, 11
Exercise 9 uses the College data to predict the number of applications received (Apps).
data(College)
head(College)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
set.seed(123)
# Split the colleges 50/50 into training and test sets
training = sample(1:nrow(College), nrow(College)/2)
testing = -training
train = College[training, ]
test = College[testing, ]
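A quick added sanity check (not in the original output) that the split behaved as expected:
# Each half should hold roughly 388 of the 777 colleges (added check)
dim(train)
dim(test)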
lm.model1 <- lm(Apps ~ ., data = train)
summary(lm.model1)
##
## Call:
## lm(formula = Apps ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2623.9 -472.8 -64.4 319.2 6042.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.778e+01 5.815e+02 0.134 0.89366
## PrivateYes -8.146e+02 2.002e+02 -4.069 5.77e-05 ***
## Accept 1.350e+00 7.318e-02 18.448 < 2e-16 ***
## Enroll -1.455e-01 2.627e-01 -0.554 0.58006
## Top10perc 3.784e+01 7.111e+00 5.322 1.79e-07 ***
## Top25perc -8.680e+00 5.646e+00 -1.538 0.12502
## F.Undergrad 2.157e-02 4.778e-02 0.451 0.65192
## P.Undergrad -2.767e-03 5.049e-02 -0.055 0.95632
## Outstate -5.351e-02 2.467e-02 -2.169 0.03075 *
## Room.Board 1.709e-01 6.102e-02 2.801 0.00536 **
## Books 5.341e-02 3.183e-01 0.168 0.86682
## Personal -9.869e-02 8.594e-02 -1.148 0.25157
## PhD -6.503e+00 6.215e+00 -1.046 0.29608
## Terminal -6.148e+00 7.156e+00 -0.859 0.39083
## S.F.Ratio -8.663e+00 1.953e+01 -0.443 0.65769
## perc.alumni -8.923e+00 5.544e+00 -1.610 0.10835
## Expend 7.938e-02 1.951e-02 4.068 5.80e-05 ***
## Grad.Rate 1.115e+01 3.966e+00 2.811 0.00520 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 965.7 on 370 degrees of freedom
## Multiple R-squared: 0.9158, Adjusted R-squared: 0.9119
## F-statistic: 236.6 on 17 and 370 DF, p-value: < 2.2e-16
lm.preds <- predict(lm.model1, newdata = test)
lm.MSE <- mean((test$Apps - lm.preds)^2)
lm.MSE
## [1] 1373995
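Test MSE is hard to read on the scale of squared applications; as an added check (not in the original output), the test RMSE and the share of test-set variance explained:
# Test RMSE for the linear model, in units of applications (added check)
sqrt(lm.MSE)
# Share of test-set variance in Apps explained by the linear model
1 - lm.MSE / mean((test$Apps - mean(test$Apps))^2)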
# Model matrices for glmnet (drop the intercept column)
x.train = model.matrix(Apps ~ ., data = train)[, -1]
x.test = model.matrix(Apps ~ ., data = test)[, -1]
# Lambda grid spanning 15 orders of magnitude
grid = 10^seq(10, -5, length = 1000)
# Ridge regression (alpha = 0) with 10-fold CV over the grid
ridge = cv.glmnet(x.train, train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## (warning repeated ten times)
best.lambda = ridge$lambda.min
best.lambda
## [1] 18.24993
r.preds <- predict(ridge, newx = x.test, s = best.lambda)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
r.MSE <- mean((test$Apps - r.preds)^2)
r.MSE
## [1] 1430032
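To see how much shrinkage the CV-selected lambda applies, the ridge coefficients can be compared with the OLS estimates above (an added sketch, not in the original output):
# Ridge coefficients at the CV-selected lambda (added check)
coef(ridge, s = best.lambda)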
lasso <- cv.glmnet(x.train, train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## (warning repeated ten times)
best.lambda.las <- lasso$lambda.min
best.lambda.las
## [1] 22.45698
las.preds <- predict(lasso, newx = x.test, s = best.lambda.las)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
las.MSE <- mean((test$Apps - las.preds)^2)
las.MSE
## [1] 1397791
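Unlike ridge, the lasso can set coefficients exactly to zero. An added check (not in the original output) of how sparse the selected fit is; las.coefs is introduced here for illustration:
# Count the predictors the lasso drops at the CV-selected lambda (added check)
las.coefs <- coef(lasso, s = best.lambda.las)
sum(las.coefs == 0)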
pcr.fit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")  # named pcr.fit to avoid shadowing pls::pcr
validationplot(pcr.fit, val.type = "MSEP")
best_ncomp <- which.min(pcr.fit$validation$PRESS)
best_ncomp
## [1] 16
pcr.preds <- predict(pcr.fit, test, ncomp = best_ncomp)
pcr.MSE = mean((test$Apps-pcr.preds)^2)
pcr.MSE
## [1] 1437295
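CV picks 16 of the 17 possible components, so PCR retains almost the full model here. As an added comparison (not in the original output), the test MSE with all 17 components, which should essentially match the linear model's MSE since full-component PCR reproduces least squares:
# Test MSE using every component, equivalent to the OLS fit (added check)
pcr.full.preds <- predict(pcr.fit, test, ncomp = 17)
mean((test$Apps - pcr.full.preds)^2)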
pls.fit = plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pls.fit, val.type = "MSEP")
best_ncomp2 <- which.min(pls.fit$validation$PRESS)
best_ncomp2
## [1] 11
pls.pred = predict(pls.fit, test, ncomp = best_ncomp2)
pls.MSE=mean((test$Apps - pls.pred)^2)
pls.MSE
## [1] 1382123
res <- tibble(Model = c("Linear", "Ridge", "Lasso", "PCR", "PLS"),
              MSE = c(lm.MSE, r.MSE, las.MSE, pcr.MSE, pls.MSE))
res %>%
arrange(MSE) %>%
mutate(MSE=format(MSE,big.mark=","))
## # A tibble: 5 × 2
## Model MSE
## <chr> <chr>
## 1 Linear 1,373,995
## 2 PLS 1,382,123
## 3 Lasso 1,397,791
## 4 Ridge 1,430,032
## 5 PCR 1,437,295
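All five test MSEs fall within about 5% of one another, so no method clearly dominates on this split. An added relative comparison (not in the original output):
# MSE of each model relative to the best performer (added comparison)
res %>%
  arrange(MSE) %>%
  mutate(Relative = MSE / min(MSE))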
data(Boston)
Exercise 11 asks for models of the per capita crime rate (crim) in the Boston data. First, I will try best subset selection:
predict.regsubsets = function(object, newdata, id, ...) {
  # Prediction method for regsubsets objects, which leaps does not provide
  form = as.formula(object$call[[2]])   # recover the formula from the call
  mat = model.matrix(form, newdata)     # design matrix for the new data
  coefi = coef(object, id = id)         # coefficients of the id-predictor model
  mat[, names(coefi)] %*% coefi
}
k = 10
p = ncol(Boston) - 1
# Assign each observation to one of k folds at random
# (no seed is set here, so the exact errors vary from run to run)
folds = sample(rep(1:k, length = nrow(Boston)))
cv.errors = matrix(NA, k, p)
# For each fold, fit best subsets on the other folds and record the
# held-out MSE for every model size from 1 to p predictors
for (i in 1:k) {
  best.fit = regsubsets(crim ~ ., data = Boston[folds != i, ], nvmax = p)
  for (j in 1:p) {
    pred = predict(best.fit, Boston[folds == i, ], id = j)
    cv.errors[i, j] = mean((Boston$crim[folds == i] - pred)^2)
  }
}
rmse.cv = sqrt(apply(cv.errors, 2, mean))
plot(rmse.cv, pch = 19, type = "b")
which.min(rmse.cv)
## [1] 11
best.rmse=rmse.cv[which.min(rmse.cv)]
best.rmse
## [1] 6.614246
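To see which predictors the chosen 11-variable model uses, best subsets can be refit on the full data and the coefficients extracted for the CV-selected size (an added sketch, not in the original output; full.fit is introduced here for illustration):
# Refit on all of Boston and inspect the CV-selected model size (added check)
full.fit = regsubsets(crim ~ ., data = Boston, nvmax = p)
coef(full.fit, id = which.min(rmse.cv))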
Next, I'll try the lasso:
# Full model matrix (no intercept column) and response
X = model.matrix(crim ~ . - 1, data = Boston)
Y = Boston$crim
# Lasso with 10-fold CV (alpha = 1 is the glmnet default)
lasso = cv.glmnet(X, Y, type.measure = "mse")
plot(lasso)
coef(lasso)  # defaults to s = "lambda.1se", hence the very sparse fit below
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.0894283
## zn .
## indus .
## chas .
## nox .
## rm .
## age .
## dis .
## rad 0.2643196
## tax .
## ptratio .
## black .
## lstat .
## medv .
# CV RMSE at lambda.1se, the same lambda coef() used above
lasso.rmse = sqrt(lasso$cvm[lasso$lambda == lasso$lambda.1se])
lasso.rmse
## [1] 7.410605
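The 1-SE rule behind lambda.1se yields a very sparse model (only rad survives). For comparison, an added check (not in the original output) of the CV RMSE at lambda.min, the error-minimizing choice:
# CV RMSE at the error-minimizing lambda, for comparison (added check)
sqrt(lasso$cvm[lasso$lambda == lasso$lambda.min])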
Then, I’ll try ridge regression.
ridge = cv.glmnet(X, Y, type.measure = "mse", alpha = 0)
plot(ridge)
coef(ridge)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.523899542
## zn -0.002949852
## indus 0.029276741
## chas -0.166526007
## nox 1.874769665
## rm -0.142852604
## age 0.006207995
## dis -0.094547258
## rad 0.045932737
## tax 0.002086668
## ptratio 0.071258052
## black -0.002605281
## lstat 0.035745604
## medv -0.023480540
# CV RMSE at lambda.1se, matching the ridge coefficients above
ridge.rmse = sqrt(ridge$cvm[ridge$lambda == ridge$lambda.1se])
ridge.rmse
## [1] 7.668311
Lastly, here is PCR.
pcr2 = pcr(crim ~ ., data = Boston, scale = TRUE, validation = "CV")
summary(pcr2)
## Data: X dimension: 506 13
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.61 7.202 7.198 6.766 6.759 6.765 6.777
## adjCV 8.61 7.199 7.196 6.762 6.750 6.760 6.771
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.769 6.640 6.653 6.635 6.633 6.615 6.527
## adjCV 6.762 6.633 6.646 6.627 6.625 6.605 6.518
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.70 60.36 69.67 76.45 82.99 88.00 91.14 93.45
## crim 30.69 30.87 39.27 39.61 39.61 39.86 40.14 42.47
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.40 97.04 98.46 99.52 100.0
## crim 42.55 42.78 43.04 44.13 45.4
# CV RMSEP at 13 components: PRESS is the cross-validated sum of squared
# errors, so PRESS/n is the CV MSEP (matches the CV row of the summary)
pcr.rmse = sqrt(pcr2$validation$PRESS[13] / nrow(Boston))
pcr.rmse
## [1] 6.527
models <- tibble(Model = c("Best Subsets", "Lasso", "Ridge", "PCR"),
                 RMSE = c(best.rmse, lasso.rmse, ridge.rmse, pcr.rmse))
models %>%
  arrange(RMSE)
## # A tibble: 4 × 2
## Model RMSE
## <chr> <dbl>
## 1 PCR 6.53
## 2 Best Subsets 6.61
## 3 Lasso 7.41
## 4 Ridge 7.67
Using cross-validated RMSE to measure model performance, PCR performed best, closely followed by best subset selection.
Note, however, that the selected PCR model keeps all 13 components: with no components dropped, PCR is equivalent to the full least-squares fit, so its score here reflects the full linear model rather than any real dimension reduction.
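As a quick check of that equivalence (an added sketch, not part of the original output; ols and pcr.full are introduced here for illustration), the 13-component PCR fitted values can be compared with ordinary least squares:
# 13-component PCR should reproduce the OLS fit up to numerical error (added check)
ols <- lm(crim ~ ., data = Boston)
pcr.full <- predict(pcr2, Boston, ncomp = 13)
max(abs(as.numeric(pcr.full) - fitted(ols)))  # expected to be near zero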