# load required packages and data (Carseats data)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(ISLR)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
head(Carseats)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1 9.50 138 73 11 276 120 Bad 42 17
## 2 11.22 111 48 16 260 83 Good 65 10
## 3 10.06 113 35 10 269 80 Medium 59 12
## 4 7.40 117 100 4 466 97 Medium 55 14
## 5 4.15 141 64 3 340 128 Bad 38 13
## 6 10.81 124 113 13 501 72 Bad 78 16
## Urban US
## 1 Yes Yes
## 2 Yes Yes
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes No
## 6 No Yes
# before getting the needed columns, check what's inside the data frame
names(Carseats)
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
# subset the data
Carseats_sub <- Carseats[, c("Sales", "Price", "Advertising", "Education", "Age")]
# use the ols_step_all_possible() function to list all possible combinations of predictors and plot them to see the relevant model-selection criteria
lm.fit1 <- lm(Sales ~ Price + Advertising + Education + Age, data = Carseats_sub)
k <- ols_step_all_possible(lm.fit1)
k
## Index N Predictors R-Square Adj. R-Square
## 1 1 1 Price 0.197981150 0.1959660275
## 2 2 1 Advertising 0.072633905 0.0703038396
## 4 3 1 Age 0.053738398 0.0513608563
## 3 4 1 Education 0.002699347 0.0001935666
## 5 5 2 Price Advertising 0.281855603 0.2782377474
## 7 6 2 Price Age 0.275675991 0.2720270037
## 6 7 2 Price Education 0.200165012 0.1961356167
## 9 8 2 Advertising Age 0.125805449 0.1214014467
## 8 9 2 Advertising Education 0.074476510 0.0698139236
## 10 10 2 Education Age 0.056283830 0.0515295927
## 12 11 3 Price Advertising Age 0.359549330 0.3546974308
## 11 12 3 Price Advertising Education 0.283214413 0.2777842187
## 13 13 3 Price Education Age 0.277663252 0.2721910041
## 14 14 3 Advertising Education Age 0.127524779 0.1209151186
## 15 15 4 Price Advertising Education Age 0.360753937 0.3542805595
## Mallow's Cp
## 1 99.579815
## 2 177.033811
## 4 188.709636
## 3 220.247453
## 5 49.752497
## 7 53.570975
## 6 100.230373
## 9 146.178294
## 8 177.895237
## 10 189.136774
## 12 3.744346
## 11 50.912868
## 13 54.343016
## 14 147.115893
## 15 5.000000
# plot the ols_step_all_possible() object to see the performance of all these models in terms of R^2, adjusted R^2, and AIC
plot(k)
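# As a quick numeric check (a minimal sketch, assuming the ols_step_all_possible()
# result can be treated as a data frame with columns such as "predictors", "adjr",
# and "cp", as in this olsrr version; check str(k) if yours differs), we can rank
# the candidate models by Mallow's Cp instead of reading them off the plot:
k_df <- as.data.frame(k)
head(k_df[order(k_df$cp), c("predictors", "adjr", "cp")])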
# select the subset of predictors that does best on some well-defined objective criterion, such as the largest R2 value or the smallest MSE, Mallow's Cp, or AIC:
k_best <- ols_step_best_subset(lm.fit1)
k_best
## Best Subsets Regression
## ----------------------------------------------
## Model Index Predictors
## ----------------------------------------------
## 1 Price
## 2 Price Advertising
## 3 Price Advertising Age
## 4 Price Advertising Education Age
## ----------------------------------------------
##
## Subsets Regression Summary
## -------------------------------------------------------------------------------------------------------------------------------------
## Adj. Pred
## Model R-Square R-Square R-Square C(p) AIC SBIC SBC MSEP FPE HSP APC
## -------------------------------------------------------------------------------------------------------------------------------------
## 1 0.1980 0.1960 0.19 99.5798 1882.4564 746.4597 1894.4307 2565.0698 6.4447 0.0162 0.8101
## 2 0.2819 0.2782 0.2707 49.7525 1840.2718 704.5099 1856.2376 2302.6170 5.7997 0.0145 0.7290
## 3 0.3595 0.3547 0.3462 3.7443 1796.4723 661.4073 1816.4297 2058.7030 5.1982 0.0130 0.6534
## 4 0.3608 0.3543 0.3441 5.0000 1797.7193 662.6947 1821.6681 2060.0462 5.2144 0.0131 0.6554
## -------------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria
## SBIC: Sawa's Bayesian Information Criteria
## SBC: Schwarz Bayesian Criteria
## MSEP: Estimated error of prediction, assuming multivariate normality
## FPE: Final Prediction Error
## HSP: Hocking's Sp
## APC: Amemiya Prediction Criteria
# plot out best predictors
plot(k_best)
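# Both tables point to the same winner by Mallow's Cp and AIC: the three-predictor
# model with Price, Advertising, and Age. Refitting it with plain lm() (nothing
# olsrr-specific) lets us inspect its coefficients directly:
lm.best <- lm(Sales ~ Price + Advertising + Age, data = Carseats_sub)
summary(lm.best)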
## Stepwise Forward Regression
# Build a regression model from a set of candidate variables by "entering" variables based on their p-values, step by step, until no remaining candidate meets the entry criterion.
# The model passed to ols_step_forward_p() should include all the candidate predictor variables. (If details is set to TRUE, each step is displayed.)
lm.fit2 <- lm(Sales ~ ., data = Carseats_sub)
k_forward <- ols_step_forward_p(lm.fit2)
k_forward
##
## Selection Summary
## -----------------------------------------------------------------------------
## Variable Adj.
## Step Entered R-Square R-Square C(p) AIC RMSE
## -----------------------------------------------------------------------------
## 1 Price 0.1980 0.1960 99.5798 1882.4564 2.5323
## 2 Advertising 0.2819 0.2782 49.7525 1840.2718 2.3993
## 3 Age 0.3595 0.3547 3.7443 1796.4723 2.2686
## -----------------------------------------------------------------------------
# plot them out
plot(k_forward)
# show details
ols_step_forward_p(lm.fit2, details = TRUE)
## Forward Selection Method
## ---------------------------
##
## Candidate Terms:
##
## 1. Price
## 2. Advertising
## 3. Education
## 4. Age
##
## We are selecting variables based on p value...
##
##
## Forward Selection: Step 1
##
## - Price
##
## Model Summary
## --------------------------------------------------------------
## R 0.445 RMSE 2.532
## R-Squared 0.198 Coef. Var 33.781
## Adj. R-Squared 0.196 MSE 6.413
## Pred R-Squared 0.190 MAE 2.045
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## ---------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## ---------------------------------------------------------------------
## Regression 630.030 1 630.030 98.248 0.0000
## Residual 2552.244 398 6.413
## Total 3182.275 399
## ---------------------------------------------------------------------
##
## Parameter Estimates
## ----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ----------------------------------------------------------------------------------------
## (Intercept) 13.642 0.633 21.558 0.000 12.398 14.886
## Price -0.053 0.005 -0.445 -9.912 0.000 -0.064 -0.043
## ----------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 2
##
## - Advertising
##
## Model Summary
## --------------------------------------------------------------
## R 0.531 RMSE 2.399
## R-Squared 0.282 Coef. Var 32.006
## Adj. R-Squared 0.278 MSE 5.757
## Pred R-Squared 0.271 MAE 1.890
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## ---------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## ---------------------------------------------------------------------
## Regression 896.942 2 448.471 77.907 0.0000
## Residual 2285.333 397 5.757
## Total 3182.275 399
## ---------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------
## (Intercept) 13.003 0.607 21.428 0.000 11.810 14.196
## Price -0.055 0.005 -0.458 -10.755 0.000 -0.065 -0.045
## Advertising 0.123 0.018 0.290 6.809 0.000 0.088 0.159
## -----------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 3
##
## - Age
##
## Model Summary
## --------------------------------------------------------------
## R 0.600 RMSE 2.269
## R-Squared 0.360 Coef. Var 30.263
## Adj. R-Squared 0.355 MSE 5.147
## Pred R-Squared 0.346 MAE 1.815
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## ---------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## ---------------------------------------------------------------------
## Regression 1144.185 3 381.395 74.105 0.0000
## Residual 2038.090 396 5.147
## Total 3182.275 399
## ---------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------
## (Intercept) 16.003 0.719 22.266 0.000 14.590 17.417
## Price -0.058 0.005 -0.486 -12.022 0.000 -0.068 -0.049
## Advertising 0.123 0.017 0.290 7.201 0.000 0.089 0.157
## Age -0.049 0.007 -0.280 -6.931 0.000 -0.063 -0.035
## -----------------------------------------------------------------------------------------
##
##
##
## No more variables to be added.
##
## Variables Entered:
##
## + Price
## + Advertising
## + Age
##
##
## Final Model Output
## ------------------
##
## Model Summary
## --------------------------------------------------------------
## R 0.600 RMSE 2.269
## R-Squared 0.360 Coef. Var 30.263
## Adj. R-Squared 0.355 MSE 5.147
## Pred R-Squared 0.346 MAE 1.815
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## ---------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## ---------------------------------------------------------------------
## Regression 1144.185 3 381.395 74.105 0.0000
## Residual 2038.090 396 5.147
## Total 3182.275 399
## ---------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------
## (Intercept) 16.003 0.719 22.266 0.000 14.590 17.417
## Price -0.058 0.005 -0.486 -12.022 0.000 -0.068 -0.049
## Advertising 0.123 0.017 0.290 7.201 0.000 0.089 0.157
## Age -0.049 0.007 -0.280 -6.931 0.000 -0.063 -0.035
## -----------------------------------------------------------------------------------------
##
## Selection Summary
## -----------------------------------------------------------------------------
## Variable Adj.
## Step Entered R-Square R-Square C(p) AIC RMSE
## -----------------------------------------------------------------------------
## 1 Price 0.1980 0.1960 99.5798 1882.4564 2.5323
## 2 Advertising 0.2819 0.2782 49.7525 1840.2718 2.3993
## 3 Age 0.3595 0.3547 3.7443 1796.4723 2.2686
## -----------------------------------------------------------------------------
## Stepwise Backward Regression
# Build a regression model from a set of candidate variables by "removing" variables based on their p-values, step by step, until no remaining variable meets the removal criterion.
# Again, the model should include all the candidate predictor variables. (If details is set to TRUE, each step is displayed.)
lm.fit3 <- lm(Sales ~ ., data = Carseats_sub)
k_backward <- ols_step_backward_p(lm.fit3)
k_backward
##
##
## Elimination Summary
## --------------------------------------------------------------------------
## Variable Adj.
## Step Removed R-Square R-Square C(p) AIC RMSE
## --------------------------------------------------------------------------
## 1 Education 0.3595 0.3547 3.7443 1796.4723 2.2686
## --------------------------------------------------------------------------
# plot them out
plot(k_backward)
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
# show details
ols_step_backward_p(lm.fit3, details = TRUE)
## Backward Elimination Method
## ---------------------------
##
## Candidate Terms:
##
## 1 . Price
## 2 . Advertising
## 3 . Education
## 4 . Age
##
## We are eliminating variables based on p value...
##
## - Education
##
## Backward Elimination: Step 1
##
## Variable Education Removed
##
## Model Summary
## --------------------------------------------------------------
## R 0.600 RMSE 2.269
## R-Squared 0.360 Coef. Var 30.263
## Adj. R-Squared 0.355 MSE 5.147
## Pred R-Squared 0.346 MAE 1.815
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## ---------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## ---------------------------------------------------------------------
## Regression 1144.185 3 381.395 74.105 0.0000
## Residual 2038.090 396 5.147
## Total 3182.275 399
## ---------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------
## (Intercept) 16.003 0.719 22.266 0.000 14.590 17.417
## Price -0.058 0.005 -0.486 -12.022 0.000 -0.068 -0.049
## Advertising 0.123 0.017 0.290 7.201 0.000 0.089 0.157
## Age -0.049 0.007 -0.280 -6.931 0.000 -0.063 -0.035
## -----------------------------------------------------------------------------------------
##
##
##
## No more variables satisfy the condition of p value = 0.3
##
##
## Variables Removed:
##
## - Education
##
##
## Final Model Output
## ------------------
##
## Model Summary
## --------------------------------------------------------------
## R 0.600 RMSE 2.269
## R-Squared 0.360 Coef. Var 30.263
## Adj. R-Squared 0.355 MSE 5.147
## Pred R-Squared 0.346 MAE 1.815
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## ---------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## ---------------------------------------------------------------------
## Regression 1144.185 3 381.395 74.105 0.0000
## Residual 2038.090 396 5.147
## Total 3182.275 399
## ---------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------
## (Intercept) 16.003 0.719 22.266 0.000 14.590 17.417
## Price -0.058 0.005 -0.486 -12.022 0.000 -0.068 -0.049
## Advertising 0.123 0.017 0.290 7.201 0.000 0.089 0.157
## Age -0.049 0.007 -0.280 -6.931 0.000 -0.063 -0.035
## -----------------------------------------------------------------------------------------
##
##
## Elimination Summary
## --------------------------------------------------------------------------
## Variable Adj.
## Step Removed R-Square R-Square C(p) AIC RMSE
## --------------------------------------------------------------------------
## 1 Education 0.3595 0.3547 3.7443 1796.4723 2.2686
## --------------------------------------------------------------------------
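# The printed condition "p value = 0.3" is the default removal threshold in this
# olsrr version. As a sketch, and assuming this version exposes the threshold via
# the prem argument (newer olsrr releases rename it; see ?ols_step_backward_p),
# a stricter cutoff would be requested like this:
ols_step_backward_p(lm.fit3, prem = 0.05)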
# load required packages and data (HMDA data)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
library(AER)
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
library(dplyr)
data(HMDA)
# check out what's inside the HMDA data
dim(HMDA)
## [1] 2380 14
names(HMDA)
## [1] "deny" "pirat" "hirat" "lvrat" "chist" "mhist"
## [7] "phist" "unemp" "selfemp" "insurance" "condomin" "afam"
## [13] "single" "hschool"
# check out variable class
str(HMDA)
## 'data.frame': 2380 obs. of 14 variables:
## $ deny : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ pirat : num 0.221 0.265 0.372 0.32 0.36 ...
## $ hirat : num 0.221 0.265 0.248 0.25 0.35 ...
## $ lvrat : num 0.8 0.922 0.92 0.86 0.6 ...
## $ chist : Factor w/ 6 levels "1","2","3","4",..: 5 2 1 1 1 1 1 2 2 2 ...
## $ mhist : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 1 1 2 2 2 1 ...
## $ phist : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ unemp : num 3.9 3.2 3.2 4.3 3.2 ...
## $ selfemp : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ insurance: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ condomin : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ afam : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ single : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 2 1 1 2 ...
## $ hschool : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
# convert the binary factor predictors "phist", "insurance", "condomin", "afam", "single", and "hschool" to 0/1 numeric variables (the outcome "deny" stays a factor)
HMDA$phist <- ifelse(HMDA$phist == "yes", 1, 0)
HMDA$insurance <- ifelse(HMDA$insurance == "yes", 1, 0)
HMDA$condomin <- ifelse(HMDA$condomin == "yes", 1, 0)
HMDA$afam <- ifelse(HMDA$afam == "yes", 1, 0)
HMDA$single <- ifelse(HMDA$single == "yes", 1, 0)
HMDA$hschool <- ifelse(HMDA$hschool == "yes", 1, 0)
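# For reference, the same recoding could be written in one step with dplyr
# (an equivalent alternative to the ifelse() calls above -- not run here,
# since those columns are already 0/1 at this point):
# HMDA <- HMDA %>%
#   mutate(across(c(phist, insurance, condomin, afam, single, hschool),
#                 ~ as.numeric(.x == "yes")))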
# do a 75-25 train-test split
set.seed(123)
index <- createDataPartition(HMDA$deny, p=0.75, list=FALSE)
training <- HMDA[index, ]
testing <- HMDA[-index, ]
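# createDataPartition() samples within the levels of the outcome, so the deny
# rate should be roughly the same in both splits; a quick check:
prop.table(table(training$deny))
prop.table(table(testing$deny))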
# initialize train control function
fitControl1 <- trainControl(method = "repeatedcv", number = 5, repeats = 10, classProbs = TRUE)
# train a logistic regression model and time it
library(tictoc)
tic()
logit.fit1 = train(
form = deny ~ .,
data = training,
trControl = fitControl1,
method = "glm",
family = "binomial"
)
toc()
## 1.705 sec elapsed
# using trained model to predict testing data
pred1 <- predict(logit.fit1, testing)
# show confusion table and calculate accuracy
confmat_logit1 <- table(Predicted = pred1, Actual = testing$deny)
confmat_logit1
## Actual
## Predicted no yes
## no 510 51
## yes 13 20
(confmat_logit1[1, 1] + confmat_logit1[2, 2]) / sum(confmat_logit1) * 100
## [1] 89.22559
# Do more cross-validation iterations improve prediction accuracy? Let's find out
fitControl2 <- trainControl(method = "repeatedcv", number = 10, repeats = 20, classProbs = TRUE)
# train a logistic regression model
tic()
logit.fit2 = train(
form = deny ~ .,
data = training,
trControl = fitControl2,
method = "glm",
family = "binomial"
)
toc()
## 5.079 sec elapsed
# more CV iterations certainly take more time
# using trained model to predict testing data
pred2 <- predict(logit.fit2, testing)
# show confusion table and calculate accuracy
confmat_logit2 <- table(Predicted = pred2, Actual = testing$deny)
confmat_logit2
## Actual
## Predicted no yes
## no 510 51
## yes 13 20
(confmat_logit2[1, 1] + confmat_logit2[2, 2]) / sum(confmat_logit2) * 100
## [1] 89.22559
# accuracy does not improve, so more is not necessarily better (at least in this case)
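# To see this without refitting anything, compare the resampled performance
# stored in each caret train object's $results element:
logit.fit1$results[, c("Accuracy", "Kappa")]
logit.fit2$results[, c("Accuracy", "Kappa")]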
## adding the ROC curve as an additional model-evaluation metric, let's compare the performance of logistic regression against that of a GBM
# note that the ROC curve is based on the predicted class probabilities, which are not computed automatically; the classProbs = TRUE option in trainControl() is needed to include these calculations
## logistic regression ##
# initialize train control function
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10, classProbs = TRUE, savePredictions = TRUE)
logit.fit = train(
form = deny ~ .,
data = training,
trControl = fitControl,
method = "glm",
family = "binomial",
metric = "Accuracy" # logit model uses Accuracy (ROC is not used here)
)
# we compare the performance of logistic regression against a Gradient Boosting Machine (GBM), the more sophisticated model
# manually define the hyperparameters of a GBM
hyperparams <- expand.grid(n.trees = 200, # number of trees
interaction.depth = 1, # depth of variable interactions
shrinkage = 0.1, # learning rate
n.minobsinnode = 10) # minimum number of obs per node on a tree
gbmControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10, classProbs = TRUE, savePredictions = TRUE)
# apply the hyperparameter grid to train(); we also need to load the gbm package
library(gbm)
## Loaded gbm 2.1.8
gbm.fit <- train(deny ~ .,
data = training,
method = "gbm",
trControl = gbmControl,
verbose = FALSE,
metric = "Accuracy",
tuneGrid = hyperparams)
# we will use the MLeval package to automatically generate ROC curves
# MLeval stands for "Machine Learning Model Evaluation"
library(MLeval) # for evalm() function
# create confusion matrix for the logistic regression model
logit.pred <- predict(logit.fit, testing)
confusionMatrix(data=logit.pred, testing$deny)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 510 51
## yes 13 20
##
## Accuracy : 0.8923
## 95% CI : (0.8645, 0.916)
## No Information Rate : 0.8805
## P-Value [Acc > NIR] : 0.207
##
## Kappa : 0.3341
##
## Mcnemar's Test P-Value : 3.746e-06
##
## Sensitivity : 0.9751
## Specificity : 0.2817
## Pos Pred Value : 0.9091
## Neg Pred Value : 0.6061
## Prevalence : 0.8805
## Detection Rate : 0.8586
## Detection Prevalence : 0.9444
## Balanced Accuracy : 0.6284
##
## 'Positive' Class : no
##
test = evalm(logit.fit) # this gives the ROC curve based on the cross-validated predictions saved during training (savePredictions = TRUE), not the held-out test set
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Averaging probs.
## Group 1 type: repeatedcv
## Observations: 1786
## Number of groups: 1
## Observations per group: 1786
## Positive: yes
## Negative: no
## Group: Group 1
## Positive: 214
## Negative: 1572
## ***Performance Metrics***
## Group 1 Optimal Informedness = 0.545450762169746
## Group 1 AUC-ROC = 0.83
# to evaluate the test set instead, create a prediction object of class probabilities (type = "prob") and feed it, along with the observed labels, into evalm(), which shows the ROC for each outcome (yes, no)
logit.predprob <- predict(logit.fit, testing, type = "prob")
m1 = data.frame(logit.predprob, testing$deny)
test1 <- evalm(m1)
## ***MLeval: Machine Learning Model Evaluation***
## Input: data frame of probabilities of observed labels
## Group does not exist, making column.
## Observations: 594
## Number of groups: 1
## Observations per group: 594
## Positive: yes
## Negative: no
## Group: Group1
## Positive: 71
## Negative: 523
## ***Performance Metrics***
## Group1 Optimal Informedness = 0.49403495543048
## Group1 AUC-ROC = 0.79
# create confusion matrix for the GBM model
gbm.pred <- predict(gbm.fit, testing)
confusionMatrix(data=gbm.pred, testing$deny)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 506 48
## yes 17 23
##
## Accuracy : 0.8906
## 95% CI : (0.8627, 0.9145)
## No Information Rate : 0.8805
## P-Value [Acc > NIR] : 0.2459454
##
## Kappa : 0.3592
##
## Mcnemar's Test P-Value : 0.0001984
##
## Sensitivity : 0.9675
## Specificity : 0.3239
## Pos Pred Value : 0.9134
## Neg Pred Value : 0.5750
## Prevalence : 0.8805
## Detection Rate : 0.8519
## Detection Prevalence : 0.9327
## Balanced Accuracy : 0.6457
##
## 'Positive' Class : no
##
test = evalm(gbm.fit) # again, this gives the ROC curve based on the cross-validated predictions from training, not the test set
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Averaging probs.
## Group 1 type: repeatedcv
## Observations: 1786
## Number of groups: 1
## Observations per group: 1786
## Positive: yes
## Negative: no
## Group: Group 1
## Positive: 214
## Negative: 1572
## ***Performance Metrics***
## Group 1 Optimal Informedness = 0.570135074076716
## Group 1 AUC-ROC = 0.83
# as before, create a prediction object of class probabilities (type = "prob") for the test set and feed it, along with the observed labels, into evalm()
gbm.predprob <- predict(gbm.fit, testing, type = "prob")
m2 = data.frame(gbm.predprob, testing$deny)
test2 <- evalm(m2)
## ***MLeval: Machine Learning Model Evaluation***
## Input: data frame of probabilities of observed labels
## Group does not exist, making column.
## Observations: 594
## Number of groups: 1
## Observations per group: 594
## Positive: yes
## Negative: no
## Group: Group1
## Positive: 71
## Negative: 523
## ***Performance Metrics***
## Group1 Optimal Informedness = 0.478415425632187
## Group1 AUC-ROC = 0.79
# GBM fares only marginally better than logistic regression here: slightly higher Kappa, specificity, and balanced accuracy, but essentially the same overall accuracy and the same test AUC (0.79)
# you can also compare the two models in one go by passing both prediction data frames to evalm()
compare <- evalm(list(m1, m2), gnames=c('logistic','gbm'))
## ***MLeval: Machine Learning Model Evaluation***
## Input: data frame of probabilities of observed labels
## Group does not exist, making column.
## Observations: 594
## Number of groups: 1
## Observations per group: 594
## Positive: yes
## Negative: no
## Group: Group1
## Positive: 71
## Negative: 523
## ***Performance Metrics***
## Group1 Optimal Informedness = 0.49403495543048
## Group1 AUC-ROC = 0.79
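# evalm() also returns the plots it draws; assuming the returned list stores the
# ROC comparison under $roc (check str(compare) if your MLeval version differs),
# it can be redrawn or customized like any ggplot object:
compare$roc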
## Grid vs. Random Search
grid <- expand.grid(interaction.depth=c(1, 2), # depth of variable interactions
                    n.trees=c(5, 10, 20), # number of trees to fit (5, 10, or 20)
                    shrinkage=c(0.01, 0.05, 0.1), # three values of the learning rate
                    n.minobsinnode = 20) # minimum number of obs per terminal node
# how many unique combinations of hyperparameters should we evaluate?
dim(grid)
## [1] 18 4
## train a grid-search GBM model using the setup from the last example
# Train control with grid search
fitControl <- trainControl(method = "repeatedcv",
number = 5,
repeats = 10,
search = "grid")
# fit the model
set.seed(123)
tic()
gbm.grid <- train(deny ~ .,
data = training,
method = "gbm",
trControl = fitControl,
#tuneLength = 4, # tuneLength tells caret how many values of each tuning parameter to try by default; it is ignored when tuneGrid is supplied
verbose = FALSE,
tuneGrid = grid)
toc()
## 8.838 sec elapsed
# Plot the performance of the training models
plot(gbm.grid)
# display hyperparameter metrics
res.grid <- gbm.grid$results
res.grid
## shrinkage interaction.depth n.minobsinnode n.trees Accuracy Kappa
## 1 0.01 1 20 5 0.8801804 0.00000000
## 7 0.05 1 20 5 0.8801804 0.00000000
## 13 0.10 1 20 5 0.8801804 0.00000000
## 4 0.01 2 20 5 0.8801804 0.00000000
## 10 0.05 2 20 5 0.8801804 0.00000000
## 16 0.10 2 20 5 0.8801804 0.00000000
## 2 0.01 1 20 10 0.8801804 0.00000000
## 8 0.05 1 20 10 0.8801804 0.00000000
## 14 0.10 1 20 10 0.8815242 0.02125468
## 5 0.01 2 20 10 0.8801804 0.00000000
## 11 0.05 2 20 10 0.8801804 0.00000000
## 17 0.10 2 20 10 0.8839878 0.05908329
## 3 0.01 1 20 20 0.8801804 0.00000000
## 9 0.05 1 20 20 0.8811322 0.01417699
## 15 0.10 1 20 20 0.8888602 0.13609812
## 6 0.01 2 20 20 0.8801804 0.00000000
## 12 0.05 2 20 20 0.8832591 0.04822811
## 18 0.10 2 20 20 0.8888026 0.18036163
## AccuracySD KappaSD
## 1 0.0009602064 0.00000000
## 7 0.0009602064 0.00000000
## 13 0.0009602064 0.00000000
## 4 0.0009602064 0.00000000
## 10 0.0009602064 0.00000000
## 16 0.0009602064 0.00000000
## 2 0.0009602064 0.00000000
## 8 0.0009602064 0.00000000
## 14 0.0019884765 0.02758901
## 5 0.0009602064 0.00000000
## 11 0.0009602064 0.00000000
## 17 0.0028912682 0.03917515
## 3 0.0009602064 0.00000000
## 9 0.0022007629 0.02937326
## 15 0.0049104765 0.05874387
## 6 0.0009602064 0.00000000
## 12 0.0028118936 0.03957363
## 18 0.0058082601 0.06247223
## train a "random search" GBM model from the last example; note that because tuneGrid is still supplied, search = "random" has no effect here and the same 18 combinations are evaluated
# Train control with random search
fitControl <- trainControl(method = "repeatedcv",
number = 5,
repeats = 10,
search = "random")
set.seed(123)
tic()
gbm.random <- train(deny ~ .,
data = training,
method = "gbm",
trControl = fitControl,
#tuneLength = 4,
verbose = FALSE,
tuneGrid = grid)
toc()
## 8.881 sec elapsed
# the elapsed time and the results are essentially identical to the grid search, as expected, since the same 18 combinations were evaluated; a genuine random search is sketched after the results below
# Plot the performance of the training models
plot(gbm.random)
# display hyperparameter metrics
res.random <- gbm.random$results
res.random
## shrinkage interaction.depth n.minobsinnode n.trees Accuracy Kappa
## 1 0.01 1 20 5 0.8801804 0.00000000
## 7 0.05 1 20 5 0.8801804 0.00000000
## 13 0.10 1 20 5 0.8801804 0.00000000
## 4 0.01 2 20 5 0.8801804 0.00000000
## 10 0.05 2 20 5 0.8801804 0.00000000
## 16 0.10 2 20 5 0.8801804 0.00000000
## 2 0.01 1 20 10 0.8801804 0.00000000
## 8 0.05 1 20 10 0.8801804 0.00000000
## 14 0.10 1 20 10 0.8815242 0.02125468
## 5 0.01 2 20 10 0.8801804 0.00000000
## 11 0.05 2 20 10 0.8801804 0.00000000
## 17 0.10 2 20 10 0.8839878 0.05908329
## 3 0.01 1 20 20 0.8801804 0.00000000
## 9 0.05 1 20 20 0.8811322 0.01417699
## 15 0.10 1 20 20 0.8888602 0.13609812
## 6 0.01 2 20 20 0.8801804 0.00000000
## 12 0.05 2 20 20 0.8832591 0.04822811
## 18 0.10 2 20 20 0.8888026 0.18036163
## AccuracySD KappaSD
## 1 0.0009602064 0.00000000
## 7 0.0009602064 0.00000000
## 13 0.0009602064 0.00000000
## 4 0.0009602064 0.00000000
## 10 0.0009602064 0.00000000
## 16 0.0009602064 0.00000000
## 2 0.0009602064 0.00000000
## 8 0.0009602064 0.00000000
## 14 0.0019884765 0.02758901
## 5 0.0009602064 0.00000000
## 11 0.0009602064 0.00000000
## 17 0.0028912682 0.03917515
## 3 0.0009602064 0.00000000
## 9 0.0022007629 0.02937326
## 15 0.0049104765 0.05874387
## 6 0.0009602064 0.00000000
## 12 0.0028118936 0.03957363
## 18 0.0058082601 0.06247223
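# A genuine random search lets caret sample hyperparameter combinations itself:
# drop tuneGrid and use tuneLength to set how many random combinations to try.
# A minimal sketch, with tuneLength = 5 as an arbitrary illustration value:
fitControl.rand <- trainControl(method = "repeatedcv",
                                number = 5,
                                repeats = 10,
                                search = "random")
set.seed(123)
gbm.random2 <- train(deny ~ .,
                     data = training,
                     method = "gbm",
                     trControl = fitControl.rand,
                     tuneLength = 5, # number of randomly sampled combinations
                     verbose = FALSE)
gbm.random2$bestTune # the sampled combination with the best resampled accuracy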
We use American National Election Studies (ANES) data (2012) from Steve's toy data (the stevedata package) to (1) select and import voter-level attributes, and (2) estimate their effects on voters' party identification (pid).
For pedagogical purposes, we select only variables that can be measured on a continuous, ordinal, or binary scale.
We do not have any a priori information about how these variables are distributed or how they are related to the outcome variable of interest (pid), so let's find out using the adaptive LASSO, improving model fit via bootstrap and cross-validation techniques.
# load required packages and data
install.packages(c("stevedata", "polywog"))
##
## The downloaded binary packages are in
## /var/folders/4p/njfl6g_15nb18hpr_snt5g740000gn/T//Rtmp7eR2Nl/downloaded_packages
library(stevedata)
data(anes_partytherms) # major party thermometer index data
names(anes_partytherms) # check out what's inside the data
## [1] "year" "uid" "stateabb"
## [4] "therm_dem" "therm_gop" "therm_bmp"
## [7] "mpti" "age" "educat"
## [10] "urbanism" "pid7" "incomeperc"
## [13] "race4" "unemployed" "polint"
## [16] "distrust_govt" "govt_crooked" "govt_waste"
## [19] "govt_biginterests"
# subset data to 2012 election only
anes_partytherms <- anes_partytherms[anes_partytherms$year == 2012, ]
# subset needed variables and rename the subsetted data as ANES to save on typing :-)
ANES <- anes_partytherms[, c("age", "educat", "pid7", "incomeperc", "unemployed", "distrust_govt")]
# check out df dimension
dim(ANES)
## [1] 5914 6
## What variables influence voters' party identification (pid7, scaled 1 to 7: more Democrat <--> more Republican)?
# generate log(age) because this could be how age is related to pid (first filter out cases where age == 0)
ANES <- ANES[ANES$age != 0, ]
dim(ANES)
## [1] 5854 6
ANES$lage <- log(ANES$age)
# Using "pid7" as our dependent variable, we estimate an adaptive LASSO model w/o cross-validation (cv) of polynomial degree and penalization factor
library(polywog)
## Loading required package: miscTools
polywog.fit <- polywog(pid7 ~ ., data = ANES, degree = 2, thresh = 1e-4)
summary(polywog.fit) # note that no standard errors are generated: the LASSO only estimates coefficients; bootstrap the model to get simulated s.e.
##
## Call:
## polywog(formula = pid7 ~ ., data = ANES, degree = 2, thresh = 1e-04)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 5.5455041 NA
## age -0.0157075 NA
## educat 0.0199406 NA
## incomeperc 0.0000000 NA
## unemployed 0.0000000 NA
## distrust_govt 0.0000000 NA
## lage -0.5582637 NA
## age^2 0.0002374 NA
## age.educat -0.0004647 NA
## age.incomeperc 0.0000000 NA
## age.unemployed 0.0000000 NA
## age.distrust_govt 0.0145500 NA
## age.lage -0.0020012 NA
## educat^2 0.0000000 NA
## educat.incomeperc 0.0000000 NA
## educat.unemployed 0.0000000 NA
## educat.distrust_govt 0.0000000 NA
## educat.lage 0.0039425 NA
## incomeperc^2 0.0000000 NA
## incomeperc.unemployed 0.0000000 NA
## incomeperc.distrust_govt 0.0000000 NA
## incomeperc.lage 0.0692410 NA
## unemployed.distrust_govt 0.0000000 NA
## unemployed.lage -0.0030015 NA
## distrust_govt^2 0.0000000 NA
## distrust_govt.lage 0.0000000 NA
## lage^2 -0.1452616 NA
##
## Regularization method: Adaptive LASSO
## Adaptive weights: inverse linear model coefficients
## Number of observations: 2849
## Polynomial expansion degree: 2
## Model family: gaussian
## Bootstrap iterations: 0
## Penalization parameter (lambda): 0.2468
# bootstrap the fitted model (only 5 iterations here to keep the example fast; use many more in practice)
polywog.boot <- bootPolywog(polywog.fit, nboot = 5)
summary(polywog.boot)
##
## Call:
## polywog(formula = pid7 ~ ., data = ANES, degree = 2, thresh = 1e-04)
##
## Coefficients:
## Estimate Std. Error 2.5% 97.5%
## (Intercept) 5.546e+00 1.465e+00 3.322e+00 7.024
## age -1.571e-02 9.115e-03 -1.063e-02 0.010
## educat 1.994e-02 1.307e-01 0.000e+00 0.285
## incomeperc 0.000e+00 1.045e-01 0.000e+00 0.221
## unemployed 0.000e+00 2.117e-01 -4.892e-01 -0.001
## distrust_govt 0.000e+00 2.144e-01 -2.251e-01 0.314
## lage -5.583e-01 5.381e-01 -1.565e+00 -0.287
## age^2 2.374e-04 6.159e-05 1.334e-04 0.000
## age.educat -4.647e-04 2.397e-03 -4.924e-03 0.000
## age.incomeperc 0.000e+00 5.669e-04 -1.027e-03 0.000
## age.unemployed 0.000e+00 5.861e-03 0.000e+00 0.013
## age.distrust_govt 1.455e-02 8.924e-03 7.602e-05 0.020
## age.lage -2.001e-03 1.179e-03 1.867e-04 0.003
## educat^2 0.000e+00 1.027e-02 -1.998e-02 0.000
## educat.incomeperc 0.000e+00 1.070e-03 -2.153e-03 0.000
## educat.unemployed 0.000e+00 0.000e+00 0.000e+00 0.000
## educat.distrust_govt 0.000e+00 1.625e-02 0.000e+00 0.033
## educat.lage 3.942e-03 7.600e-03 3.080e-03 0.019
## incomeperc^2 0.000e+00 4.575e-04 -9.208e-04 0.000
## incomeperc.unemployed 0.000e+00 7.271e-02 -1.463e-01 0.000
## incomeperc.distrust_govt 0.000e+00 1.201e-02 -2.417e-02 0.000
## incomeperc.lage 6.924e-02 3.491e-02 5.781e-03 0.091
## unemployed.distrust_govt 0.000e+00 4.892e-02 -9.662e-02 0.006
## unemployed.lage -3.001e-03 3.753e-03 -7.566e-03 0.000
## distrust_govt^2 0.000e+00 0.000e+00 0.000e+00 0.000
## distrust_govt.lage 0.000e+00 8.774e-02 -1.004e-02 0.194
## lage^2 -1.453e-01 8.260e-02 -2.422e-01 -0.040
##
## Regularization method: Adaptive LASSO
## Adaptive weights: inverse linear model coefficients
## Number of observations: 2849
## Polynomial expansion degree: 2
## Model family: gaussian
## Bootstrap iterations: 5
## Penalization parameter (lambda): 0.2468
# we then estimate an adaptive LASSO model w/ cross-validation (cv) of polynomial degree and penalization factor
set.seed(123)
cv.fit <- cv.polywog(pid7 ~ ., data = ANES,
                     degrees.cv = 1:2, # candidate polynomial degrees (1 and 2) to cross-validate over
                     nfolds = 5,       # 5-fold CV
                     thresh = 1e-4)
summary(cv.fit)
## Length Class Mode
## results 6 -none- numeric
## degree.min 1 -none- numeric
## polywog.fit 25 polywog list
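# The element names printed above can be inspected directly: degree.min is the
# polynomial degree that minimized the cross-validation error, and results holds
# the CV error for each candidate degree.
cv.fit$degree.min
cv.fit$results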
# Extract best model and bootstrap
fit1 <- cv.fit$polywog.fit
fit1 <- bootPolywog(fit1, nboot = 5)
summary(fit1)
##
## Call:
## polywog(formula = pid7 ~ ., data = ANES, degree = 2L, boot = 0,
## thresh = 1e-04, model = TRUE, X = FALSE, y = FALSE)
##
## Coefficients:
## Estimate Std. Error 2.5% 97.5%
## (Intercept) 5.546e+00 9.976e-01 3.808e+00 5.843
## age -1.571e-02 1.013e-02 -1.034e-02 0.014
## educat 1.994e-02 1.368e-01 1.314e-02 0.320
## incomeperc 0.000e+00 5.004e-02 2.187e-04 0.109
## unemployed 0.000e+00 1.112e-01 -2.098e-01 0.054
## distrust_govt 0.000e+00 3.846e-01 -6.165e-01 0.356
## lage -5.583e-01 2.167e-01 -6.734e-01 -0.125
## age^2 2.374e-04 1.248e-04 4.527e-05 0.000
## age.educat -4.647e-04 2.808e-03 -6.418e-03 0.000
## age.incomeperc 0.000e+00 6.075e-04 -2.253e-04 0.001
## age.unemployed 0.000e+00 6.801e-03 0.000e+00 0.015
## age.distrust_govt 1.455e-02 6.956e-03 6.897e-03 0.023
## age.lage -2.001e-03 1.896e-03 -3.191e-03 0.001
## educat^2 0.000e+00 4.839e-03 -1.254e-02 0.000
## educat.incomeperc 0.000e+00 1.947e-02 -4.124e-02 0.000
## educat.unemployed 0.000e+00 4.790e-02 -5.438e-02 0.067
## educat.distrust_govt 0.000e+00 2.886e-02 0.000e+00 0.058
## educat.lage 3.942e-03 1.456e-02 1.179e-02 0.044
## incomeperc^2 0.000e+00 2.740e-02 0.000e+00 0.055
## incomeperc.unemployed 0.000e+00 1.317e-01 -2.952e-01 0.000
## incomeperc.distrust_govt 0.000e+00 8.129e-03 -1.636e-02 0.000
## incomeperc.lage 6.924e-02 3.706e-02 1.874e-02 0.111
## unemployed.distrust_govt 0.000e+00 8.247e-03 0.000e+00 0.017
## unemployed.lage -3.001e-03 5.520e-02 -9.720e-03 0.108
## distrust_govt^2 0.000e+00 0.000e+00 0.000e+00 0.000
## distrust_govt.lage 0.000e+00 6.154e-02 -5.237e-03 0.124
## lage^2 -1.453e-01 5.900e-02 -2.967e-01 -0.157
##
## Regularization method: Adaptive LASSO
## Adaptive weights: inverse linear model coefficients
## Number of observations: 2849
## Polynomial expansion degree: 2
## Model family: gaussian
## Bootstrap iterations: 5
## Penalization parameter (lambda): 0.2468
# What would you say about the result? Which variables, and which combinations of them, should be included (or excluded)? Verdict?
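# One way to reach a verdict is to look at which terms the adaptive LASSO kept.
# A minimal sketch, assuming coef() works on polywog objects as on most fitted
# models (otherwise use fit1$coefficients): terms with non-zero estimates
# survived the penalization, while terms shrunk exactly to zero were dropped.
kept <- coef(fit1)
kept[kept != 0]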