require(ISLR)
## Loading required package: ISLR
require(caret)
## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
require(tidyverse)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
data('College')
set.seed(1)
inTrain <- createDataPartition(College$Apps, p = 0.75, list = FALSE)
training <- College[inTrain,]
testing <- College[-inTrain,]
# Centre and scale every variable (including Apps) using training-set statistics
preObj <- preProcess(training, method = c('center', 'scale'))
training <- predict(preObj, training)
testing <- predict(preObj, testing)
y_train <- training$Apps
y_test <- testing$Apps
# Dummy-code the factor predictor (Private) so the matrix-based methods can use it
one_hot_encoding <- dummyVars(Apps ~ ., data = training)
x_train <- predict(one_hot_encoding, training)
x_test <- predict(one_hot_encoding, testing)
lin_model <- lm(Apps ~ ., data = training)
pred <- predict(lin_model, testing)
(lin_info <- postResample(pred, testing$Apps))
## RMSE Rsquared MAE
## 0.2799768 0.9201765 0.1568743
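# Quick sanity check (a sketch, not part of the original analysis): the RMSE
# reported by postResample is just the root mean squared prediction error on
# the held-out set
sqrt(mean((pred - testing$Apps)^2))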
ridge_fit <- train(x = x_train, y = y_train,
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 0,
                                          lambda = seq(0, 1e3, length.out = 20)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
(ridge_info <- postResample(predict(ridge_fit, x_test), y_test))
## RMSE Rsquared MAE
## 0.2853247 0.9211286 0.1645806
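# The penalty caret selected can be inspected directly, as is done for the
# Boston fits later on
ridge_fit$bestTune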
coef(ridge_fit$finalModel, ridge_fit$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.034871314
## Private.No 0.075423210
## Private.Yes -0.076037580
## Accept 0.665628733
## Enroll 0.090243372
## Top10perc 0.107160248
## Top25perc 0.011628030
## F.Undergrad 0.063308801
## P.Undergrad 0.017427317
## Outstate -0.028995432
## Room.Board 0.048720533
## Books 0.012799145
## Personal -0.002894430
## PhD -0.017989250
## Terminal -0.010434665
## S.F.Ratio 0.006920126
## perc.alumni -0.031683867
## Expend 0.083525070
## Grad.Rate 0.058131023
plot(ridge_fit)

plot(varImp(ridge_fit))

lasso_fit <- train(x = x_train, y = y_train,
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 1,
                                          lambda = seq(0.0001, 1, length.out = 50)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
(lasso_info <- postResample(predict(lasso_fit, x_test), y_test))
## RMSE Rsquared MAE
## 0.2914812 0.9141364 0.1511801
coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -1.470609e-02
## Private.No 5.410732e-02
## Private.Yes .
## Accept 8.883839e-01
## Enroll .
## Top10perc 1.092201e-01
## Top25perc .
## F.Undergrad .
## P.Undergrad .
## Outstate .
## Room.Board 1.483337e-04
## Books .
## Personal .
## PhD .
## Terminal .
## S.F.Ratio .
## perc.alumni .
## Expend 4.067011e-02
## Grad.Rate 4.062721e-06
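# Convenience sketch (not in the original): list the predictors the lasso
# actually retained at the chosen lambda
b <- coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)
rownames(b)[b[, 1] != 0]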
plot(lasso_fit)

plot(varImp(lasso_fit))

pcr_model <- train(x = x_train, y = y_train,
                   method = 'pcr',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(ncomp = 1:10))
(pcr_info <- postResample(predict(pcr_model, x_test), y_test))
## RMSE Rsquared MAE
## 0.3231292 0.8916531 0.1986075
coef(pcr_model$finalModel)
## , , 10 comps
##
## .outcome
## Private.No 0.031985972
## Private.Yes -0.031985972
## Accept 0.343576750
## Enroll 0.305359773
## Top10perc 0.042630417
## Top25perc 0.027790893
## F.Undergrad 0.273818439
## P.Undergrad -0.049487667
## Outstate 0.038573119
## Room.Board 0.070607615
## Books 0.016433593
## Personal -0.023529455
## PhD -0.023992433
## Terminal -0.024182230
## S.F.Ratio 0.003741623
## perc.alumni -0.070567887
## Expend 0.090126298
## Grad.Rate 0.071302714
plot(pcr_model)

plot(varImp(pcr_model))

pls_model <- train(x = x_train, y = y_train,
                   method = 'pls',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(ncomp = 1:10))
(pls_info <- postResample(predict(pls_model, x_test), y_test))
## RMSE Rsquared MAE
## 0.2792580 0.9209302 0.1572165
coef(pls_model$finalModel)
## , , 10 comps
##
## .outcome
## Private.No 0.071314109
## Private.Yes -0.071314109
## Accept 1.039263053
## Enroll -0.169855531
## Top10perc 0.235572152
## Top25perc -0.070270182
## F.Undergrad -0.022580399
## P.Undergrad 0.032522451
## Outstate -0.085919942
## Room.Board 0.036344367
## Books 0.004835496
## Personal 0.007546695
## PhD -0.044709439
## Terminal 0.002332025
## S.F.Ratio 0.010868271
## perc.alumni -0.009046689
## Expend 0.072963738
## Grad.Rate 0.037425850
plot(pls_model)

as_tibble(rbind(lin_info,
                ridge_info,
                lasso_info,
                pcr_info,
                pls_info)) %>%
  mutate(model = c('Linear', 'Ridge', 'Lasso', 'PCR', 'PLS')) %>%
  select(model, RMSE, Rsquared)
## # A tibble: 5 × 3
## model RMSE Rsquared
## <chr> <dbl> <dbl>
## 1 Linear 0.280 0.920
## 2 Ridge 0.285 0.921
## 3 Lasso 0.291 0.914
## 4 PCR 0.323 0.892
## 5 PLS 0.279 0.921
testing %>%
  summarize(sd = sd(Apps))
## sd
## 1 0.9818241
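# For context (a quick sketch): Apps was standardized on training-set
# statistics, so the test-set SD is close to 1; an RMSE near 0.28 therefore
# removes roughly 70% of the baseline error
lin_info['RMSE'] / sd(testing$Apps)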
require(ggthemes)
## Loading required package: ggthemes
# Test-set residuals (predicted minus observed); the observed values come from
# testing$Apps because the one-hot matrices passed in do not carry the outcome
residfunc <- function(fit, data) {
  predict(fit, data) - testing$Apps
}
tibble(Observed = testing$Apps,
       LM = residfunc(lin_model, testing),
       Ridge = residfunc(ridge_fit, x_test),
       Lasso = residfunc(lasso_fit, x_test),
       PCR = residfunc(pcr_model, x_test),
       PLS = residfunc(pls_model, x_test)) %>%
  gather(Model, Residuals, -Observed) %>%
  ggplot(aes(Observed, Residuals, col = Model)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = 'loess', alpha = 0.01, col = 'lightsalmon2') +
  facet_wrap(~ Model, ncol = 5) +
  theme_tufte() +
  theme(legend.position = 'top') +
  coord_flip()
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using formula 'y ~ x'

require(MASS); require(tidyverse); require(caret); require(leaps)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: leaps
set.seed(1)
data('Boston')
inTrain <- createDataPartition(Boston$crim, p = 0.6, list = FALSE)
x_train <- Boston[inTrain, -1]
y_train <- Boston[inTrain, 1]
x_test <- Boston[-inTrain, -1]
y_test <- Boston[-inTrain, 1]
best_subs <- regsubsets(x = x_train, y = y_train, nvmax = 13)
fit_summary <- summary(best_subs)
require(ggplot2); require(ggthemes)
tibble(MSE = fit_summary$rss / nrow(x_train),
       Cp = fit_summary$cp,
       BIC = fit_summary$bic,
       AdjustedR2 = fit_summary$adjr2) %>%
  mutate(id = row_number()) %>%
  gather(Metric, value, -id) %>%
  ggplot(aes(id, value, col = Metric)) +
  geom_line() + geom_point() + ylab('') +
  xlab('Number of Variables Used') +
  facet_wrap(~ Metric, scales = 'free') +
  theme_tufte() +
  scale_x_continuous(breaks = 1:13)

scales <- c('r2', 'adjr2', 'bic', 'Cp')
par(mfrow = c(2, 2))
for (scale in scales) {
  plot(best_subs, scale = scale)
}

par(mfrow = c(1,1))
test_errors <- rep(NA, 13)
test_mat <- model.matrix(crim ~ ., data = Boston[-inTrain, ])
# Test-set RMSE for each subset size chosen by regsubsets
for (i in 1:13) {
  coefs <- coef(best_subs, id = i)
  pred <- test_mat[, names(coefs)] %*% coefs
  test_errors[i] <- sqrt(mean((y_test - pred)^2))
}
tibble(RMSE = test_errors) %>%
  mutate(id = row_number()) %>%
  ggplot(aes(id, RMSE)) +
  geom_line() + geom_point() +
  xlab('Number of Variables Used') +
  ggtitle('RMSE on the testing set') +
  theme_tufte() +
  scale_x_continuous(breaks = 1:13)

(regsubset_info <- min(test_errors))
## [1] 5.811256
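# The subset size attaining this minimum (a small helper; the value depends on
# the seed and split)
which.min(test_errors)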
coef(best_subs, id = 1:5)
## [[1]]
## (Intercept) rad
## -2.4448623 0.6546826
##
## [[2]]
## (Intercept) rad medv
## 2.6574618 0.5769903 -0.1963036
##
## [[3]]
## (Intercept) rad ptratio medv
## 10.8303135 0.6042171 -0.4067664 -0.2351904
##
## [[4]]
## (Intercept) zn dis rad medv
## 5.83377235 0.05509227 -0.72924037 0.52407648 -0.21866690
##
## [[5]]
## (Intercept) zn rm dis rad medv
## -1.70826140 0.05299909 1.50466724 -0.73918276 0.51079178 -0.29652993
lasso_fit <- train(x = x_train, y = y_train,
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 1,
                                          lambda = seq(0.001, 1, length.out = 100)),
                   preProcess = c('center', 'scale'))
(lasso_info <- postResample(predict(lasso_fit, x_test), y_test))
## RMSE Rsquared MAE
## 5.793303 0.433477 2.603667
coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.7575062
## zn .
## indus .
## chas .
## nox .
## rm .
## age .
## dis -0.1856880
## rad 4.2982107
## tax .
## ptratio .
## black .
## lstat 0.1178773
## medv -1.0546456
lasso_fit$bestTune
## alpha lambda
## 84 1 0.8385455
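# At this fairly large lambda most coefficients are zeroed; counting the
# survivors (intercept excluded) matches the sparse table above
sum(coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)[-1, 1] != 0)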
plot(lasso_fit)

plot(varImp(lasso_fit))

ridge_fit <- train(x_train, y_train,
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 0,
                                          lambda = seq(0, 1e2, length.out = 50)),
                   preProcess = c('center', 'scale'))
(ridge_info <- postResample(predict(ridge_fit, x_test), y_test))
## RMSE Rsquared MAE
## 5.7137087 0.4498421 2.7943386
coef(ridge_fit$finalModel, ridge_fit$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.75750624
## zn 0.54011923
## indus -0.34230036
## chas -0.30633776
## nox 0.14724495
## rm 0.44593736
## age 0.10933511
## dis -0.98108462
## rad 2.85345034
## tax 1.16015712
## ptratio -0.09232032
## black -0.31532518
## lstat 0.64017157
## medv -1.44312967
ridge_fit$bestTune
## alpha lambda
## 2 0 2.040816
plot(ridge_fit)

plot(varImp(ridge_fit))

glmnet_fit <- train(x_train, y_train,
                    method = 'glmnet',
                    trControl = trainControl(method = 'cv', number = 10),
                    tuneGrid = expand.grid(alpha = seq(0, 1, length.out = 6),
                                           lambda = seq(0, 1e2, length.out = 20)),
                    preProcess = c('center', 'scale'))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
(glmnet_info <- postResample(predict(glmnet_fit, x_test), y_test))
## RMSE Rsquared MAE
## 5.7504399 0.4478022 2.8789035
coef(object = glmnet_fit$finalModel, s = glmnet_fit$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.75750624
## zn 0.79927708
## indus -0.56669351
## chas -0.32661514
## nox -0.32145785
## rm 0.70164835
## age 0.03721032
## dis -1.61848336
## rad 4.01878688
## tax 0.60120176
## ptratio -0.37415233
## black -0.09300755
## lstat 0.54462549
## medv -2.16987001
glmnet_fit$bestTune
## alpha lambda
## 1 0 0
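# With alpha = 0 and lambda = 0 the elastic net reduces to unpenalized least
# squares; a plain lm fit (a sketch; ols_fit is not part of the original)
# should therefore give a very similar test RMSE
ols_fit <- lm(y ~ ., data = data.frame(x_train, y = y_train))
postResample(predict(ols_fit, x_test), y_test)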
plot(glmnet_fit)

plot(varImp(glmnet_fit))

pcr_fit <- train(x_train, y_train,
                 method = 'pcr',
                 trControl = trainControl(method = 'cv', number = 10),
                 tuneGrid = expand.grid(ncomp = 1:13),
                 preProcess = c('center', 'scale'))
(pcr_info <- postResample(predict(pcr_fit, x_test), y_test))
## RMSE Rsquared MAE
## 5.8816610 0.4296175 3.0845598
pcr_fit
## Principal Component Analysis
##
## 306 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 275, 277, 276, 275, 275, 275, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 6.560574 0.4800110 3.843889
## 2 6.559955 0.4824507 3.879334
## 3 6.175577 0.5954423 3.134008
## 4 6.166002 0.6001710 3.114538
## 5 6.165989 0.6016351 3.099134
## 6 6.164470 0.5941572 3.241512
## 7 6.171809 0.5937923 3.231744
## 8 6.030915 0.6217769 3.231019
## 9 6.056110 0.6186845 3.265161
## 10 6.064727 0.6201973 3.238258
## 11 6.044505 0.6234399 3.204063
## 12 6.173310 0.5958234 3.406977
## 13 6.116109 0.6072988 3.293449
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 8.
plot(pcr_fit)

plot(varImp(pcr_fit))

pls_fit <- train(x_train, y_train,
                 method = 'pls',
                 trControl = trainControl(method = 'cv', number = 10),
                 tuneGrid = expand.grid(ncomp = 1:13),
                 preProcess = c('center', 'scale'))
(pls_info <- postResample(predict(pls_fit, x_test), y_test))
## RMSE Rsquared MAE
## 5.7361826 0.4505769 2.8789603
pls_fit
## Partial Least Squares
##
## 306 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 275, 275, 274, 275, 276, 276, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 6.201189 0.5251744 3.645062
## 2 5.818300 0.6334711 3.019228
## 3 5.818943 0.6391877 3.147983
## 4 5.820687 0.6344493 3.194376
## 5 5.855419 0.6352588 3.196379
## 6 5.864940 0.6351171 3.216697
## 7 5.867013 0.6354134 3.224509
## 8 5.848427 0.6395298 3.186069
## 9 5.844713 0.6395603 3.188456
## 10 5.846431 0.6391123 3.187532
## 11 5.845136 0.6392478 3.186003
## 12 5.845142 0.6392653 3.185977
## 13 5.845154 0.6392627 3.185996
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 2.
plot(pls_fit)

rbind(c(regsubset_info, NA, NA),
      lasso_info,
      ridge_info,
      glmnet_info,
      pcr_info,
      pls_info)
## RMSE Rsquared MAE
## 5.811256 NA NA
## lasso_info 5.793303 0.4334770 2.603667
## ridge_info 5.713709 0.4498421 2.794339
## glmnet_info 5.750440 0.4478022 2.878903
## pcr_info 5.881661 0.4296175 3.084560
## pls_info 5.736183 0.4505769 2.878960
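# Same comparison as a labelled tibble, mirroring the College summary above
# (a readability sketch; the best-subset row is omitted since only its RMSE
# was computed)
as_tibble(rbind(lasso_info, ridge_info, glmnet_info, pcr_info, pls_info)) %>%
  mutate(model = c('Lasso', 'Ridge', 'glmnet', 'PCR', 'PLS')) %>%
  arrange(RMSE)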