Stepwise Variable Selection

# load required packages and data (Carseats data)
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(ISLR)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
head(Carseats)
##   Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1  9.50       138     73          11        276   120       Bad  42        17
## 2 11.22       111     48          16        260    83      Good  65        10
## 3 10.06       113     35          10        269    80    Medium  59        12
## 4  7.40       117    100           4        466    97    Medium  55        14
## 5  4.15       141     64           3        340   128       Bad  38        13
## 6 10.81       124    113          13        501    72       Bad  78        16
##   Urban  US
## 1   Yes Yes
## 2   Yes Yes
## 3   Yes Yes
## 4   Yes Yes
## 5   Yes  No
## 6    No Yes
# before subsetting the needed columns, check out what's inside the data frame
names(Carseats)
##  [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
##  [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
## [11] "US"
# subset the data

Carseats_sub <- Carseats[, c("Sales", "Price", "Advertising", "Education", "Age")]


# use the ols_step_all_possible() function to fit all possible combinations of predictors and list the relevant model-selection criteria for each
lm.fit1 <- lm(Sales ~ Price + Advertising + Education + Age, data = Carseats_sub)
k <- ols_step_all_possible(lm.fit1)
k
##    Index N                      Predictors    R-Square Adj. R-Square
## 1      1 1                           Price 0.197981150  0.1959660275
## 2      2 1                     Advertising 0.072633905  0.0703038396
## 4      3 1                             Age 0.053738398  0.0513608563
## 3      4 1                       Education 0.002699347  0.0001935666
## 5      5 2               Price Advertising 0.281855603  0.2782377474
## 7      6 2                       Price Age 0.275675991  0.2720270037
## 6      7 2                 Price Education 0.200165012  0.1961356167
## 9      8 2                 Advertising Age 0.125805449  0.1214014467
## 8      9 2           Advertising Education 0.074476510  0.0698139236
## 10    10 2                   Education Age 0.056283830  0.0515295927
## 12    11 3           Price Advertising Age 0.359549330  0.3546974308
## 11    12 3     Price Advertising Education 0.283214413  0.2777842187
## 13    13 3             Price Education Age 0.277663252  0.2721910041
## 14    14 3       Advertising Education Age 0.127524779  0.1209151186
## 15    15 4 Price Advertising Education Age 0.360753937  0.3542805595
##    Mallow's Cp
## 1    99.579815
## 2   177.033811
## 4   188.709636
## 3   220.247453
## 5    49.752497
## 7    53.570975
## 6   100.230373
## 9   146.178294
## 8   177.895237
## 10  189.136774
## 12    3.744346
## 11   50.912868
## 13   54.343016
## 14  147.115893
## 15    5.000000
# plot the ols_step_all_possible() object to see the performance of all these models in terms of criteria such as R^2, adjusted R^2, Mallow's Cp, and AIC
plot(k)

# ols_step_best_subset() selects, for each model size, the subset of predictors that best meets a well-defined objective criterion, such as the largest R^2 or the smallest MSE, Mallow's Cp, or AIC:
k_best <- ols_step_best_subset(lm.fit1)
k_best
##            Best Subsets Regression            
## ----------------------------------------------
## Model Index    Predictors
## ----------------------------------------------
##      1         Price                           
##      2         Price Advertising               
##      3         Price Advertising Age           
##      4         Price Advertising Education Age 
## ----------------------------------------------
## 
##                                                      Subsets Regression Summary                                                      
## -------------------------------------------------------------------------------------------------------------------------------------
##                        Adj.        Pred                                                                                               
## Model    R-Square    R-Square    R-Square     C(p)         AIC         SBIC         SBC         MSEP        FPE       HSP       APC  
## -------------------------------------------------------------------------------------------------------------------------------------
##   1        0.1980      0.1960        0.19    99.5798    1882.4564    746.4597    1894.4307    2565.0698    6.4447    0.0162    0.8101 
##   2        0.2819      0.2782      0.2707    49.7525    1840.2718    704.5099    1856.2376    2302.6170    5.7997    0.0145    0.7290 
##   3        0.3595      0.3547      0.3462     3.7443    1796.4723    661.4073    1816.4297    2058.7030    5.1982    0.0130    0.6534 
##   4        0.3608      0.3543      0.3441     5.0000    1797.7193    662.6947    1821.6681    2060.0462    5.2144    0.0131    0.6554 
## -------------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria 
##  SBIC: Sawa's Bayesian Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
##  MSEP: Estimated error of prediction, assuming multivariate normality 
##  FPE: Final Prediction Error 
##  HSP: Hocking's Sp 
##  APC: Amemiya Prediction Criteria
# plot out best predictors
plot(k_best)
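
# As a quick sanity check on the criteria above, you can also compare a few of the
# candidate models directly with base R's AIC(); this is only a minimal sketch that
# re-fits three of the subsets listed by ols_step_all_possible() (object names here are new)
cand <- list(
  price_only = lm(Sales ~ Price, data = Carseats_sub),
  three_vars = lm(Sales ~ Price + Advertising + Age, data = Carseats_sub),
  full_model = lm(Sales ~ Price + Advertising + Education + Age, data = Carseats_sub)
)
sapply(cand, AIC)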

## Stepwise Forward Regression
# Build a regression model from a set of candidate variables by "entering" them one at a time based on their p-values, until no remaining variable qualifies for entry.
# The fitted model passed to the function should include all candidate predictor variables. (If details is set to TRUE, each step is displayed.)

lm.fit2 <- lm(Sales ~ ., data = Carseats_sub)
k_forward <- ols_step_forward_p(lm.fit2)
k_forward
## 
##                               Selection Summary                               
## -----------------------------------------------------------------------------
##         Variable                     Adj.                                        
## Step      Entered      R-Square    R-Square     C(p)         AIC        RMSE     
## -----------------------------------------------------------------------------
##    1    Price            0.1980      0.1960    99.5798    1882.4564    2.5323    
##    2    Advertising      0.2819      0.2782    49.7525    1840.2718    2.3993    
##    3    Age              0.3595      0.3547     3.7443    1796.4723    2.2686    
## -----------------------------------------------------------------------------
# plot them out
plot(k_forward)

# show details
ols_step_forward_p(lm.fit2, details = TRUE)
## Forward Selection Method    
## ---------------------------
## 
## Candidate Terms: 
## 
## 1. Price 
## 2. Advertising 
## 3. Education 
## 4. Age 
## 
## We are selecting variables based on p value...
## 
## 
## Forward Selection: Step 1 
## 
## - Price 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.445       RMSE                2.532 
## R-Squared               0.198       Coef. Var          33.781 
## Adj. R-Squared          0.196       MSE                 6.413 
## Pred R-Squared          0.190       MAE                 2.045 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                 ANOVA                                 
## ---------------------------------------------------------------------
##                 Sum of                                               
##                Squares         DF    Mean Square      F         Sig. 
## ---------------------------------------------------------------------
## Regression     630.030          1        630.030    98.248    0.0000 
## Residual      2552.244        398          6.413                     
## Total         3182.275        399                                    
## ---------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)    13.642         0.633                 21.558    0.000    12.398    14.886 
##       Price    -0.053         0.005       -0.445    -9.912    0.000    -0.064    -0.043 
## ----------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 2 
## 
## - Advertising 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.531       RMSE                2.399 
## R-Squared               0.282       Coef. Var          32.006 
## Adj. R-Squared          0.278       MSE                 5.757 
## Pred R-Squared          0.271       MAE                 1.890 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                 ANOVA                                 
## ---------------------------------------------------------------------
##                 Sum of                                               
##                Squares         DF    Mean Square      F         Sig. 
## ---------------------------------------------------------------------
## Regression     896.942          2        448.471    77.907    0.0000 
## Residual      2285.333        397          5.757                     
## Total         3182.275        399                                    
## ---------------------------------------------------------------------
## 
##                                    Parameter Estimates                                    
## -----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -----------------------------------------------------------------------------------------
## (Intercept)    13.003         0.607                  21.428    0.000    11.810    14.196 
##       Price    -0.055         0.005       -0.458    -10.755    0.000    -0.065    -0.045 
## Advertising     0.123         0.018        0.290      6.809    0.000     0.088     0.159 
## -----------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 3 
## 
## - Age 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.600       RMSE                2.269 
## R-Squared               0.360       Coef. Var          30.263 
## Adj. R-Squared          0.355       MSE                 5.147 
## Pred R-Squared          0.346       MAE                 1.815 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                 ANOVA                                 
## ---------------------------------------------------------------------
##                 Sum of                                               
##                Squares         DF    Mean Square      F         Sig. 
## ---------------------------------------------------------------------
## Regression    1144.185          3        381.395    74.105    0.0000 
## Residual      2038.090        396          5.147                     
## Total         3182.275        399                                    
## ---------------------------------------------------------------------
## 
##                                    Parameter Estimates                                    
## -----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -----------------------------------------------------------------------------------------
## (Intercept)    16.003         0.719                  22.266    0.000    14.590    17.417 
##       Price    -0.058         0.005       -0.486    -12.022    0.000    -0.068    -0.049 
## Advertising     0.123         0.017        0.290      7.201    0.000     0.089     0.157 
##         Age    -0.049         0.007       -0.280     -6.931    0.000    -0.063    -0.035 
## -----------------------------------------------------------------------------------------
## 
## 
## 
## No more variables to be added.
## 
## Variables Entered: 
## 
## + Price 
## + Advertising 
## + Age 
## 
## 
## Final Model Output 
## ------------------
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.600       RMSE                2.269 
## R-Squared               0.360       Coef. Var          30.263 
## Adj. R-Squared          0.355       MSE                 5.147 
## Pred R-Squared          0.346       MAE                 1.815 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                 ANOVA                                 
## ---------------------------------------------------------------------
##                 Sum of                                               
##                Squares         DF    Mean Square      F         Sig. 
## ---------------------------------------------------------------------
## Regression    1144.185          3        381.395    74.105    0.0000 
## Residual      2038.090        396          5.147                     
## Total         3182.275        399                                    
## ---------------------------------------------------------------------
## 
##                                    Parameter Estimates                                    
## -----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -----------------------------------------------------------------------------------------
## (Intercept)    16.003         0.719                  22.266    0.000    14.590    17.417 
##       Price    -0.058         0.005       -0.486    -12.022    0.000    -0.068    -0.049 
## Advertising     0.123         0.017        0.290      7.201    0.000     0.089     0.157 
##         Age    -0.049         0.007       -0.280     -6.931    0.000    -0.063    -0.035 
## -----------------------------------------------------------------------------------------
## 
##                               Selection Summary                               
## -----------------------------------------------------------------------------
##         Variable                     Adj.                                        
## Step      Entered      R-Square    R-Square     C(p)         AIC        RMSE     
## -----------------------------------------------------------------------------
##    1    Price            0.1980      0.1960    99.5798    1882.4564    2.5323    
##    2    Advertising      0.2819      0.2782    49.7525    1840.2718    2.3993    
##    3    Age              0.3595      0.3547     3.7443    1796.4723    2.2686    
## -----------------------------------------------------------------------------
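
# As a cross-check on the p-value-based forward selection above, here is a minimal
# sketch of AIC-based forward selection with base R's step(): it starts from the
# intercept-only model and adds terms as long as AIC keeps decreasing
null.fit <- lm(Sales ~ 1, data = Carseats_sub)
full.fit <- lm(Sales ~ ., data = Carseats_sub)
step(null.fit,
     scope = list(lower = formula(null.fit), upper = formula(full.fit)),
     direction = "forward",
     trace = FALSE)
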
## Stepwise Backward Regression
# Build a regression model from a set of candidate variables by "removing" them one at a time based on their p-values, until no remaining variable qualifies for removal.
# Again, the fitted model passed to the function should include all candidate predictor variables. (If details is set to TRUE, each step is displayed.)

lm.fit3 <- lm(Sales ~ ., data = Carseats_sub)
k_backward <- ols_step_backward_p(lm.fit3)
k_backward
## 
## 
##                            Elimination Summary                             
## --------------------------------------------------------------------------
##         Variable                   Adj.                                       
## Step     Removed     R-Square    R-Square     C(p)        AIC        RMSE     
## --------------------------------------------------------------------------
##    1    Education      0.3595      0.3547    3.7443    1796.4723    2.2686    
## --------------------------------------------------------------------------
# plot them out
plot(k_backward)
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

# show details
ols_step_backward_p(lm.fit3, details = TRUE)
## Backward Elimination Method 
## ---------------------------
## 
## Candidate Terms: 
## 
## 1 . Price 
## 2 . Advertising 
## 3 . Education 
## 4 . Age 
## 
## We are eliminating variables based on p value...
## 
## - Education 
## 
## Backward Elimination: Step 1 
## 
##  Variable Education Removed 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.600       RMSE                2.269 
## R-Squared               0.360       Coef. Var          30.263 
## Adj. R-Squared          0.355       MSE                 5.147 
## Pred R-Squared          0.346       MAE                 1.815 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                 ANOVA                                 
## ---------------------------------------------------------------------
##                 Sum of                                               
##                Squares         DF    Mean Square      F         Sig. 
## ---------------------------------------------------------------------
## Regression    1144.185          3        381.395    74.105    0.0000 
## Residual      2038.090        396          5.147                     
## Total         3182.275        399                                    
## ---------------------------------------------------------------------
## 
##                                    Parameter Estimates                                    
## -----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -----------------------------------------------------------------------------------------
## (Intercept)    16.003         0.719                  22.266    0.000    14.590    17.417 
##       Price    -0.058         0.005       -0.486    -12.022    0.000    -0.068    -0.049 
## Advertising     0.123         0.017        0.290      7.201    0.000     0.089     0.157 
##         Age    -0.049         0.007       -0.280     -6.931    0.000    -0.063    -0.035 
## -----------------------------------------------------------------------------------------
## 
## 
## 
## No more variables satisfy the condition of p value = 0.3
## 
## 
## Variables Removed: 
## 
## - Education 
## 
## 
## Final Model Output 
## ------------------
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.600       RMSE                2.269 
## R-Squared               0.360       Coef. Var          30.263 
## Adj. R-Squared          0.355       MSE                 5.147 
## Pred R-Squared          0.346       MAE                 1.815 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                 ANOVA                                 
## ---------------------------------------------------------------------
##                 Sum of                                               
##                Squares         DF    Mean Square      F         Sig. 
## ---------------------------------------------------------------------
## Regression    1144.185          3        381.395    74.105    0.0000 
## Residual      2038.090        396          5.147                     
## Total         3182.275        399                                    
## ---------------------------------------------------------------------
## 
##                                    Parameter Estimates                                    
## -----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -----------------------------------------------------------------------------------------
## (Intercept)    16.003         0.719                  22.266    0.000    14.590    17.417 
##       Price    -0.058         0.005       -0.486    -12.022    0.000    -0.068    -0.049 
## Advertising     0.123         0.017        0.290      7.201    0.000     0.089     0.157 
##         Age    -0.049         0.007       -0.280     -6.931    0.000    -0.063    -0.035 
## -----------------------------------------------------------------------------------------
## 
## 
##                            Elimination Summary                             
## --------------------------------------------------------------------------
##         Variable                   Adj.                                       
## Step     Removed     R-Square    R-Square     C(p)        AIC        RMSE     
## --------------------------------------------------------------------------
##    1    Education      0.3595      0.3547    3.7443    1796.4723    2.2686    
## --------------------------------------------------------------------------
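
# olsrr also provides a combined forward-and-backward stepwise procedure; a minimal
# sketch, assuming the ols_step_both_p() function available in current olsrr versions
# (output omitted here)
k_both <- ols_step_both_p(lm.fit3)
k_both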

Hyperparameter Tuning using Caret

# load required packages and data (Carseats data)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
library(AER)
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
library(dplyr)
data(HMDA)

# check out what's inside the HMDA data
dim(HMDA)
## [1] 2380   14
names(HMDA)
##  [1] "deny"      "pirat"     "hirat"     "lvrat"     "chist"     "mhist"    
##  [7] "phist"     "unemp"     "selfemp"   "insurance" "condomin"  "afam"     
## [13] "single"    "hschool"
# check out variable class
str(HMDA)
## 'data.frame':    2380 obs. of  14 variables:
##  $ deny     : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ pirat    : num  0.221 0.265 0.372 0.32 0.36 ...
##  $ hirat    : num  0.221 0.265 0.248 0.25 0.35 ...
##  $ lvrat    : num  0.8 0.922 0.92 0.86 0.6 ...
##  $ chist    : Factor w/ 6 levels "1","2","3","4",..: 5 2 1 1 1 1 1 2 2 2 ...
##  $ mhist    : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 1 1 2 2 2 1 ...
##  $ phist    : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ unemp    : num  3.9 3.2 3.2 4.3 3.2 ...
##  $ selfemp  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ insurance: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ condomin : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 2 1 1 1 ...
##  $ afam     : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ single   : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 2 1 1 2 ...
##  $ hschool  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
# convert the yes/no factor predictors ("phist", "insurance", "condomin", "afam", "single", "hschool") to binary numeric (0/1); keep the outcome "deny" as a factor for caret

HMDA$phist <- ifelse(HMDA$phist == "yes", 1, 0)
HMDA$insurance <- ifelse(HMDA$insurance == "yes", 1, 0)
HMDA$condomin <- ifelse(HMDA$condomin == "yes", 1, 0)
HMDA$afam <- ifelse(HMDA$afam == "yes", 1, 0)
HMDA$single <- ifelse(HMDA$single == "yes", 1, 0)
HMDA$hschool <- ifelse(HMDA$hschool == "yes", 1, 0)


# do a 75-25 train-test split
set.seed(123)
index <- createDataPartition(HMDA$deny, p=0.75, list=FALSE)
training <- HMDA[index, ]
testing <- HMDA[-index, ]

# initialize train control function
fitControl1 <- trainControl(method = "repeatedcv", number = 5, repeats = 10, classProbs = TRUE)

# train a logistic regression model and time it

library(tictoc)

tic()
logit.fit1 = train(
  form = deny ~ .,
  data = training,
  trControl = fitControl1,
  method = "glm",
  family = "binomial"
)
toc()
## 1.705 sec elapsed
# use the trained model to predict on the testing data
pred1 <- predict(logit.fit1, testing)

# show confusion table and calculate accuracy
confmat_logit1 <- table(Predicted = pred1, Actual = testing$deny)
confmat_logit1
##          Actual
## Predicted  no yes
##       no  510  51
##       yes  13  20
(confmat_logit1[1, 1] + confmat_logit1[2, 2]) / sum(confmat_logit1) * 100
## [1] 89.22559
# Do more cross-validation iterations improve prediction accuracy? Let's find out

fitControl2 <- trainControl(method = "repeatedcv", number = 10, repeats = 20, classProbs = TRUE)

# train a logistic regression model 

tic()
logit.fit2 = train(
  form = deny ~ .,
  data = training,
  trControl = fitControl2,
  method = "glm",
  family = "binomial"
)
toc()
## 5.079 sec elapsed
# more CV iterations certainly take more time

# use the trained model to predict on the testing data
pred2 <- predict(logit.fit2, testing)

# show confusion table and calculate accuracy
confmat_logit2 <- table(Predicted = pred2, Actual = testing$deny)
confmat_logit2
##          Actual
## Predicted  no yes
##       no  510  51
##       yes  13  20
(confmat_logit2[1, 1] + confmat_logit2[2, 2]) / sum(confmat_logit2) * 100
## [1] 89.22559
# accuracy does not improve, so more is not necessarily better (at least in this case)


## adding the ROC curve as an additional model evaluation metric, let's compare the performance of logistic regression against that of GBM
# note that the ROC curve is based on predicted class probabilities, which are not computed by default; the classProbs = TRUE option tells trainControl() to calculate them (and savePredictions = TRUE keeps the resampled predictions)


## logistic regression ##
# initialize train control function
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10, classProbs = TRUE, savePredictions = TRUE)

logit.fit = train(
  form = deny ~ .,
  data = training,
  trControl = fitControl,
  method = "glm",
  family = "binomial",
  metric = "Accuracy"   # logit model uses Accuracy (ROC is not used here)
)
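
# if you instead want caret to select by ROC during resampling (rather than Accuracy),
# a minimal sketch using caret's built-in twoClassSummary() summary function; the
# object names rocControl and logit.roc are new
rocControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10,
                           classProbs = TRUE, savePredictions = TRUE,
                           summaryFunction = twoClassSummary)
logit.roc <- train(deny ~ ., data = training, trControl = rocControl,
                   method = "glm", family = "binomial", metric = "ROC")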

# we compare the performance of logistic regression against a Gradient Boosting Machine (GBM), which is the more sophisticated model
# manually define the hyperparameters of a GBM

hyperparams <- expand.grid(n.trees = 200,  # number of trees
                           interaction.depth = 1, # depth of variable interactions
                           shrinkage = 0.1,  # learning rate
                           n.minobsinnode = 10)  # minimum number of obs per node on a tree

gbmControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10, classProbs = TRUE, savePredictions = TRUE)

# apply the hyperparameter grid to train(); we also need to load the gbm package
library(gbm)
## Loaded gbm 2.1.8
gbm.fit <- train(deny ~ ., 
                   data = training, 
                   method = "gbm", 
                   trControl = gbmControl,
                   verbose = FALSE,
                   metric = "Accuracy",
                   tuneGrid = hyperparams)
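
# to see which predictors the fitted GBM relies on, caret's varImp() works on the
# train object (a quick optional check; output not shown)
varImp(gbm.fit)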


# we will use the MLeval package to automatically generate ROC curves
# MLeval stands for "Machine Learning Model Evaluation"

library(MLeval)  # for evalm() function

# create confusion matrix for the logistic regression model
logit.pred <- predict(logit.fit, testing)
confusionMatrix(data=logit.pred, testing$deny)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  510  51
##        yes  13  20
##                                          
##                Accuracy : 0.8923         
##                  95% CI : (0.8645, 0.916)
##     No Information Rate : 0.8805         
##     P-Value [Acc > NIR] : 0.207          
##                                          
##                   Kappa : 0.3341         
##                                          
##  Mcnemar's Test P-Value : 3.746e-06      
##                                          
##             Sensitivity : 0.9751         
##             Specificity : 0.2817         
##          Pos Pred Value : 0.9091         
##          Neg Pred Value : 0.6061         
##              Prevalence : 0.8805         
##          Detection Rate : 0.8586         
##    Detection Prevalence : 0.9444         
##       Balanced Accuracy : 0.6284         
##                                          
##        'Positive' Class : no             
## 
test = evalm(logit.fit)     # this gives the ROC curve based on the cross-validated training predictions (not the held-out test set)
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Averaging probs.
## Group 1 type: repeatedcv
## Observations: 1786
## Number of groups: 1
## Observations per group: 1786
## Positive: yes
## Negative: no
## Group: Group 1
## Positive: 214
## Negative: 1572
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.545450762169746
## Group 1 AUC-ROC = 0.83

# to get the ROC curve on the held-out test set, create a predicted-probability object (type = "prob") and feed it, together with the observed labels, into evalm()
logit.predprob <- predict(logit.fit, testing, type = "prob")
m1 = data.frame(logit.predprob, testing$deny)

test1 <- evalm(m1)
## ***MLeval: Machine Learning Model Evaluation***
## Input: data frame of probabilities of observed labels
## Group does not exist, making column.
## Observations: 594
## Number of groups: 1
## Observations per group: 594
## Positive: yes
## Negative: no
## Group: Group1
## Positive: 71
## Negative: 523
## ***Performance Metrics***

## Group1 Optimal Informedness = 0.49403495543048
## Group1 AUC-ROC = 0.79

# create confusion matrix for the GBM model
gbm.pred <- predict(gbm.fit, testing)
confusionMatrix(data=gbm.pred, testing$deny)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  506  48
##        yes  17  23
##                                           
##                Accuracy : 0.8906          
##                  95% CI : (0.8627, 0.9145)
##     No Information Rate : 0.8805          
##     P-Value [Acc > NIR] : 0.2459454       
##                                           
##                   Kappa : 0.3592          
##                                           
##  Mcnemar's Test P-Value : 0.0001984       
##                                           
##             Sensitivity : 0.9675          
##             Specificity : 0.3239          
##          Pos Pred Value : 0.9134          
##          Neg Pred Value : 0.5750          
##              Prevalence : 0.8805          
##          Detection Rate : 0.8519          
##    Detection Prevalence : 0.9327          
##       Balanced Accuracy : 0.6457          
##                                           
##        'Positive' Class : no              
## 
test = evalm(gbm.fit)     # this gives the ROC curve based on the cross-validated training predictions (not the held-out test set)
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Averaging probs.
## Group 1 type: repeatedcv
## Observations: 1786
## Number of groups: 1
## Observations per group: 1786
## Positive: yes
## Negative: no
## Group: Group 1
## Positive: 214
## Negative: 1572
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.570135074076716
## Group 1 AUC-ROC = 0.83

# to get the ROC curve on the held-out test set, create a predicted-probability object (type = "prob") and feed it, together with the observed labels, into evalm()
gbm.predprob <- predict(gbm.fit, testing, type = "prob")
m2 = data.frame(gbm.predprob, testing$deny)

test2 <- evalm(m2)
## ***MLeval: Machine Learning Model Evaluation***
## Input: data frame of probabilities of observed labels
## Group does not exist, making column.
## Observations: 594
## Number of groups: 1
## Observations per group: 594
## Positive: yes
## Negative: no
## Group: Group1
## Positive: 71
## Negative: 523
## ***Performance Metrics***

## Group1 Optimal Informedness = 0.478415425632187
## Group1 AUC-ROC = 0.79

# on the test set, GBM performs about on par with logistic regression (both reach an AUC-ROC of 0.79), despite being the more flexible model

# you can also compare the two models' ROC curves and metrics in one call
compare <- evalm(list(m1, m2), gnames=c('logistic','gbm')) 
## ***MLeval: Machine Learning Model Evaluation***
## Input: data frame of probabilities of observed labels
## Group does not exist, making column.
## Observations: 594
## Number of groups: 1
## Observations per group: 594
## Positive: yes
## Negative: no
## Group: Group1
## Positive: 71
## Negative: 523
## ***Performance Metrics***

## Group1 Optimal Informedness = 0.49403495543048
## Group1 AUC-ROC = 0.79
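
# the object returned by evalm() also stores the plots and metric tables; assuming the
# $roc and $stdres components available in recent MLeval versions, you can redisplay or
# inspect them directly (optional)
compare$roc       # combined ROC plot for the two models
compare$stdres    # standard performance metrics per model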

## Grid vs. Random Search

grid <- expand.grid(interaction.depth=c(1, 2), # Depth of variable interactions
                    n.trees=c(5, 10, 20),           # number of trees to fit (5, 10, or 20)
                    shrinkage=c(0.01, 0.05, 0.1),     # try three values of learning rate 
                    n.minobsinnode = 20)

# how many unique combinations of hyperparameters should we evaluate?
dim(grid)
## [1] 18  4
## train a grid-search GBM model using the grid defined above
# Train control with grid search

fitControl <- trainControl(method = "repeatedcv", 
                           number = 5, 
                           repeats = 10, 
                           search = "grid")

# fit the model

set.seed(123)

tic()
gbm.grid <- train(deny ~ ., 
                   data = training, 
                   method = "gbm", 
                   trControl = fitControl,
                   #tuneLength = 4,   # tuneLength would tell caret how many values of each tuning parameter to try; not used here because we supply tuneGrid
                   verbose = FALSE,
                   tuneGrid = grid)
toc()
## 8.838 sec elapsed
# Plot the performance of the training models
plot(gbm.grid)

# display hyperparameter metrics
res.grid <- gbm.grid$results
res.grid
##    shrinkage interaction.depth n.minobsinnode n.trees  Accuracy      Kappa
## 1       0.01                 1             20       5 0.8801804 0.00000000
## 7       0.05                 1             20       5 0.8801804 0.00000000
## 13      0.10                 1             20       5 0.8801804 0.00000000
## 4       0.01                 2             20       5 0.8801804 0.00000000
## 10      0.05                 2             20       5 0.8801804 0.00000000
## 16      0.10                 2             20       5 0.8801804 0.00000000
## 2       0.01                 1             20      10 0.8801804 0.00000000
## 8       0.05                 1             20      10 0.8801804 0.00000000
## 14      0.10                 1             20      10 0.8815242 0.02125468
## 5       0.01                 2             20      10 0.8801804 0.00000000
## 11      0.05                 2             20      10 0.8801804 0.00000000
## 17      0.10                 2             20      10 0.8839878 0.05908329
## 3       0.01                 1             20      20 0.8801804 0.00000000
## 9       0.05                 1             20      20 0.8811322 0.01417699
## 15      0.10                 1             20      20 0.8888602 0.13609812
## 6       0.01                 2             20      20 0.8801804 0.00000000
## 12      0.05                 2             20      20 0.8832591 0.04822811
## 18      0.10                 2             20      20 0.8888026 0.18036163
##      AccuracySD    KappaSD
## 1  0.0009602064 0.00000000
## 7  0.0009602064 0.00000000
## 13 0.0009602064 0.00000000
## 4  0.0009602064 0.00000000
## 10 0.0009602064 0.00000000
## 16 0.0009602064 0.00000000
## 2  0.0009602064 0.00000000
## 8  0.0009602064 0.00000000
## 14 0.0019884765 0.02758901
## 5  0.0009602064 0.00000000
## 11 0.0009602064 0.00000000
## 17 0.0028912682 0.03917515
## 3  0.0009602064 0.00000000
## 9  0.0022007629 0.02937326
## 15 0.0049104765 0.05874387
## 6  0.0009602064 0.00000000
## 12 0.0028118936 0.03957363
## 18 0.0058082601 0.06247223
## train a random-search GBM model using the same setup
# Train control with random search

fitControl <- trainControl(method = "repeatedcv",
                           number = 5,
                           repeats = 10,
                           search = "random")

set.seed(123)

tic()
gbm.random <- train(deny ~ ., 
                   data = training, 
                   method = "gbm", 
                   trControl = fitControl,
                   #tuneLength = 4,
                   verbose = FALSE,
                   tuneGrid = grid)
toc()
## 8.881 sec elapsed
# the elapsed times are essentially identical: because a fixed tuneGrid is supplied, search = "random" has nothing to randomize and the same 18 combinations are evaluated (see the sketch after the results below for a true random search)

# Plot the performance of the training models
plot(gbm.random)

# display hyperparameter metrics
res.random <- gbm.random$results
res.random
##    shrinkage interaction.depth n.minobsinnode n.trees  Accuracy      Kappa
## 1       0.01                 1             20       5 0.8801804 0.00000000
## 7       0.05                 1             20       5 0.8801804 0.00000000
## 13      0.10                 1             20       5 0.8801804 0.00000000
## 4       0.01                 2             20       5 0.8801804 0.00000000
## 10      0.05                 2             20       5 0.8801804 0.00000000
## 16      0.10                 2             20       5 0.8801804 0.00000000
## 2       0.01                 1             20      10 0.8801804 0.00000000
## 8       0.05                 1             20      10 0.8801804 0.00000000
## 14      0.10                 1             20      10 0.8815242 0.02125468
## 5       0.01                 2             20      10 0.8801804 0.00000000
## 11      0.05                 2             20      10 0.8801804 0.00000000
## 17      0.10                 2             20      10 0.8839878 0.05908329
## 3       0.01                 1             20      20 0.8801804 0.00000000
## 9       0.05                 1             20      20 0.8811322 0.01417699
## 15      0.10                 1             20      20 0.8888602 0.13609812
## 6       0.01                 2             20      20 0.8801804 0.00000000
## 12      0.05                 2             20      20 0.8832591 0.04822811
## 18      0.10                 2             20      20 0.8888026 0.18036163
##      AccuracySD    KappaSD
## 1  0.0009602064 0.00000000
## 7  0.0009602064 0.00000000
## 13 0.0009602064 0.00000000
## 4  0.0009602064 0.00000000
## 10 0.0009602064 0.00000000
## 16 0.0009602064 0.00000000
## 2  0.0009602064 0.00000000
## 8  0.0009602064 0.00000000
## 14 0.0019884765 0.02758901
## 5  0.0009602064 0.00000000
## 11 0.0009602064 0.00000000
## 17 0.0028912682 0.03917515
## 3  0.0009602064 0.00000000
## 9  0.0022007629 0.02937326
## 15 0.0049104765 0.05874387
## 6  0.0009602064 0.00000000
## 12 0.0028118936 0.03957363
## 18 0.0058082601 0.06247223
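
# for a genuine random search, drop the fixed tuneGrid and let tuneLength control how
# many random hyperparameter combinations caret samples; a minimal sketch (not run
# above, so timings and results will differ)
randControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10,
                            search = "random")
gbm.random2 <- train(deny ~ .,
                     data = training,
                     method = "gbm",
                     trControl = randControl,
                     tuneLength = 10,   # sample 10 random hyperparameter combinations
                     verbose = FALSE)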

Feature selection of voters’ party identification

Using American National Election Studies (ANES) data (2012) from Steve's stevedata toy-data package to (1) select important voter-level attributes and (2) estimate their effects on voters' party identification (pid).

For pedagogical purposes, we select only variables that are measured on a continuous, ordinal, or binary scale:

pid7
1 = Strong Democrat. 2 = Weak Democrat. 3 = Independent, lean Democrat. 4 = Independent. 5 = Independent, lean Republican. 6 = Weak Republican. 7 = Strong Republican.
educat
the education level of the respondent. 1 = 8 grades or less. 2 = high school, no diploma. 3 = high school diploma. 4 = high school "plus non-academic training". 5 = some college, no degree (includes AA holders). 6 = BA-level degree. 7 = advanced degree, including Bachelor of Laws degrees.
age
the age of the respondent.
incomeperc
respondent's household income percentile: 1 = 0-16th percentile. 2 = 17-33. 3 = 34-67. 4 = 68-95. 5 = 96-100.
unemployed
a binary numeric indicator for whether the respondent is temporarily unemployed.
distrust_govt
the respondent's self-reported (dis)trust in the federal government's ability to do what's right: 1 = Just about always (trusts the government). 2 = Most of the time. 3 = Some of the time. 4 = None of the time/never.

We do not have any a priori information about how these variables are distributed or how they are related to the outcome variable of interest (pid), so let's find out using an adaptive LASSO, improving model fit via bootstrap and cross-validation techniques.

# load required packages and data
install.packages(c("stevedata", "polywog"))
## 
## The downloaded binary packages are in
##  /var/folders/4p/njfl6g_15nb18hpr_snt5g740000gn/T//Rtmp7eR2Nl/downloaded_packages
library(stevedata)
data(anes_partytherms) # major party thermometer index data
names(anes_partytherms) # check out what's inside the data
##  [1] "year"              "uid"               "stateabb"         
##  [4] "therm_dem"         "therm_gop"         "therm_bmp"        
##  [7] "mpti"              "age"               "educat"           
## [10] "urbanism"          "pid7"              "incomeperc"       
## [13] "race4"             "unemployed"        "polint"           
## [16] "distrust_govt"     "govt_crooked"      "govt_waste"       
## [19] "govt_biginterests"
# subset data to 2012 election only
anes_partytherms <- anes_partytherms[anes_partytherms$year == 2012, ]

# subset needed variables and rename the subsetted data as ANES to save on typing :-)
ANES <- anes_partytherms[, c("age", "educat", "pid7", "incomeperc", "unemployed", "distrust_govt")]

# check out df dimension
dim(ANES)
## [1] 5914    6
## What variables influence voters' party identification (pid), measured from 1 to 7 (more Democrat <--> more Republican)?


# generate log(age), since this could be how age is related to pid (be sure to first filter out cases where age == 0)
ANES <- ANES[ANES$age != 0, ]
dim(ANES)
## [1] 5854    6
ANES$lage <- log(ANES$age)


# Using "pid7" as our dependent variable, we estimate an adaptive LASSO model w/o cross-validation (cv) of polynomial degree and penalization factor
library(polywog)
## Loading required package: miscTools
polywog.fit <- polywog(pid7 ~ ., data = ANES, degree = 2, thresh = 1e-4)
summary(polywog.fit)  # note that no standard errors are generated: the LASSO fit only produces coefficient estimates; bootstrap the model to obtain simulated s.e.
## 
## Call:
## polywog(formula = pid7 ~ ., data = ANES, degree = 2, thresh = 1e-04)
## 
## Coefficients:
##                            Estimate Std. Error
## (Intercept)               5.5455041         NA
## age                      -0.0157075         NA
## educat                    0.0199406         NA
## incomeperc                0.0000000         NA
## unemployed                0.0000000         NA
## distrust_govt             0.0000000         NA
## lage                     -0.5582637         NA
## age^2                     0.0002374         NA
## age.educat               -0.0004647         NA
## age.incomeperc            0.0000000         NA
## age.unemployed            0.0000000         NA
## age.distrust_govt         0.0145500         NA
## age.lage                 -0.0020012         NA
## educat^2                  0.0000000         NA
## educat.incomeperc         0.0000000         NA
## educat.unemployed         0.0000000         NA
## educat.distrust_govt      0.0000000         NA
## educat.lage               0.0039425         NA
## incomeperc^2              0.0000000         NA
## incomeperc.unemployed     0.0000000         NA
## incomeperc.distrust_govt  0.0000000         NA
## incomeperc.lage           0.0692410         NA
## unemployed.distrust_govt  0.0000000         NA
## unemployed.lage          -0.0030015         NA
## distrust_govt^2           0.0000000         NA
## distrust_govt.lage        0.0000000         NA
## lage^2                   -0.1452616         NA
## 
## Regularization method: Adaptive LASSO
## Adaptive weights: inverse linear model coefficients
## Number of observations: 2849
## Polynomial expansion degree: 2
## Model family: gaussian
## Bootstrap iterations: 0
## Penalization parameter (lambda): 0.2468
# Bootstrap the fitted model
polywog.boot <- bootPolywog(polywog.fit, nboot = 5)
summary(polywog.boot)
## 
## Call:
## polywog(formula = pid7 ~ ., data = ANES, degree = 2, thresh = 1e-04)
## 
## Coefficients:
##                            Estimate Std. Error       2.5%  97.5%
## (Intercept)               5.546e+00  1.465e+00  3.322e+00  7.024
## age                      -1.571e-02  9.115e-03 -1.063e-02  0.010
## educat                    1.994e-02  1.307e-01  0.000e+00  0.285
## incomeperc                0.000e+00  1.045e-01  0.000e+00  0.221
## unemployed                0.000e+00  2.117e-01 -4.892e-01 -0.001
## distrust_govt             0.000e+00  2.144e-01 -2.251e-01  0.314
## lage                     -5.583e-01  5.381e-01 -1.565e+00 -0.287
## age^2                     2.374e-04  6.159e-05  1.334e-04  0.000
## age.educat               -4.647e-04  2.397e-03 -4.924e-03  0.000
## age.incomeperc            0.000e+00  5.669e-04 -1.027e-03  0.000
## age.unemployed            0.000e+00  5.861e-03  0.000e+00  0.013
## age.distrust_govt         1.455e-02  8.924e-03  7.602e-05  0.020
## age.lage                 -2.001e-03  1.179e-03  1.867e-04  0.003
## educat^2                  0.000e+00  1.027e-02 -1.998e-02  0.000
## educat.incomeperc         0.000e+00  1.070e-03 -2.153e-03  0.000
## educat.unemployed         0.000e+00  0.000e+00  0.000e+00  0.000
## educat.distrust_govt      0.000e+00  1.625e-02  0.000e+00  0.033
## educat.lage               3.942e-03  7.600e-03  3.080e-03  0.019
## incomeperc^2              0.000e+00  4.575e-04 -9.208e-04  0.000
## incomeperc.unemployed     0.000e+00  7.271e-02 -1.463e-01  0.000
## incomeperc.distrust_govt  0.000e+00  1.201e-02 -2.417e-02  0.000
## incomeperc.lage           6.924e-02  3.491e-02  5.781e-03  0.091
## unemployed.distrust_govt  0.000e+00  4.892e-02 -9.662e-02  0.006
## unemployed.lage          -3.001e-03  3.753e-03 -7.566e-03  0.000
## distrust_govt^2           0.000e+00  0.000e+00  0.000e+00  0.000
## distrust_govt.lage        0.000e+00  8.774e-02 -1.004e-02  0.194
## lage^2                   -1.453e-01  8.260e-02 -2.422e-01 -0.040
## 
## Regularization method: Adaptive LASSO
## Adaptive weights: inverse linear model coefficients
## Number of observations: 2849
## Polynomial expansion degree: 2
## Model family: gaussian
## Bootstrap iterations: 5
## Penalization parameter (lambda): 0.2468
# we then estimate an adaptive LASSO model w/ cross-validation (cv) of polynomial degree and penalization factor

set.seed(123)

cv.fit <- cv.polywog(pid7 ~ ., data = ANES,
                     degrees.cv = 1:2, # cross-validate over polynomial degrees 1 and 2
                     nfolds = 5,  # 5-fold
                     thresh = 1e-4)
summary(cv.fit)
##             Length Class   Mode   
## results      6     -none-  numeric
## degree.min   1     -none-  numeric
## polywog.fit 25     polywog list
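# the cross-validation results for each candidate polynomial degree are stored in the
# results component listed in the summary above
cv.fit$results
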
# Extract best model and bootstrap

fit1 <- cv.fit$polywog.fit

fit1 <- bootPolywog(fit1, nboot = 5)
summary(fit1)
## 
## Call:
## polywog(formula = pid7 ~ ., data = ANES, degree = 2L, boot = 0, 
##     thresh = 1e-04, model = TRUE, X = FALSE, y = FALSE)
## 
## Coefficients:
##                            Estimate Std. Error       2.5%  97.5%
## (Intercept)               5.546e+00  9.976e-01  3.808e+00  5.843
## age                      -1.571e-02  1.013e-02 -1.034e-02  0.014
## educat                    1.994e-02  1.368e-01  1.314e-02  0.320
## incomeperc                0.000e+00  5.004e-02  2.187e-04  0.109
## unemployed                0.000e+00  1.112e-01 -2.098e-01  0.054
## distrust_govt             0.000e+00  3.846e-01 -6.165e-01  0.356
## lage                     -5.583e-01  2.167e-01 -6.734e-01 -0.125
## age^2                     2.374e-04  1.248e-04  4.527e-05  0.000
## age.educat               -4.647e-04  2.808e-03 -6.418e-03  0.000
## age.incomeperc            0.000e+00  6.075e-04 -2.253e-04  0.001
## age.unemployed            0.000e+00  6.801e-03  0.000e+00  0.015
## age.distrust_govt         1.455e-02  6.956e-03  6.897e-03  0.023
## age.lage                 -2.001e-03  1.896e-03 -3.191e-03  0.001
## educat^2                  0.000e+00  4.839e-03 -1.254e-02  0.000
## educat.incomeperc         0.000e+00  1.947e-02 -4.124e-02  0.000
## educat.unemployed         0.000e+00  4.790e-02 -5.438e-02  0.067
## educat.distrust_govt      0.000e+00  2.886e-02  0.000e+00  0.058
## educat.lage               3.942e-03  1.456e-02  1.179e-02  0.044
## incomeperc^2              0.000e+00  2.740e-02  0.000e+00  0.055
## incomeperc.unemployed     0.000e+00  1.317e-01 -2.952e-01  0.000
## incomeperc.distrust_govt  0.000e+00  8.129e-03 -1.636e-02  0.000
## incomeperc.lage           6.924e-02  3.706e-02  1.874e-02  0.111
## unemployed.distrust_govt  0.000e+00  8.247e-03  0.000e+00  0.017
## unemployed.lage          -3.001e-03  5.520e-02 -9.720e-03  0.108
## distrust_govt^2           0.000e+00  0.000e+00  0.000e+00  0.000
## distrust_govt.lage        0.000e+00  6.154e-02 -5.237e-03  0.124
## lage^2                   -1.453e-01  5.900e-02 -2.967e-01 -0.157
## 
## Regularization method: Adaptive LASSO
## Adaptive weights: inverse linear model coefficients
## Number of observations: 2849
## Polynomial expansion degree: 2
## Model family: gaussian
## Bootstrap iterations: 5
## Penalization parameter (lambda): 0.2468
# What would you say about the results? Which variables and their combinations should be included (or excluded)? What's your verdict?
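
# one way to answer these questions is to check which terms the adaptive LASSO kept:
# coefficients shrunk exactly to zero are dropped from the model, nonzero ones are
# retained (a quick sketch using the bootstrapped fit from above; "nonzero" is a new object)
nonzero <- coef(fit1)[coef(fit1) != 0]
round(nonzero, 4)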