This is a follow-up to an example of vtreat usage from the Win-Vector blog at http://www.win-vector.com/blog/2016/06/a-demonstration-of-vtreat-data-preparation/.
There are a few points that may not be clear to readers without much R experience.
First, I’ll copy the commands from the blog post to load the required libraries and set things up for parallel processing.
library(tictoc)
library(plyr)
library(tidyverse)
## ── Attaching packages ─────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 2.0.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.1 ✔ stringr 1.4.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ purrr::compact() masks plyr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
library('vtreat')
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library('gbm')
## Loaded gbm 2.1.5
library('doMC')
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
library('WVPlots') # see https://github.com/WinVector/WVPlots
# parallel for vtreat
ncores <- parallel::detectCores()
parallelCluster <- parallel::makeCluster(ncores)
# parallel for caret
registerDoMC(cores=ncores)
To get started, go to the UCI Machine Learning repository at http://archive.ics.uci.edu/ml/machine-learning-databases/adult/ to get the data. Click on the files adult.data and adult.test to put them in your downloads folder, then move them to your working directory and use RStudio to import them.
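If you prefer to script the download rather than use the browser, something like the following should work (this is not in the blog post, and it assumes the UCI URLs above are still live).
# download both files directly into the working directory
baseURL <- "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
download.file(paste0(baseURL, "adult.data"), "adult.data")
download.file(paste0(baseURL, "adult.test"), "adult.test")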
Note that the test file has an extra row at the top that needs to be eliminated.
dTrain <- read_csv("adult.data",
col_names = FALSE,
na = c('NA', '?', ''))
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## X2 = col_character(),
## X3 = col_integer(),
## X4 = col_character(),
## X5 = col_integer(),
## X6 = col_character(),
## X7 = col_character(),
## X8 = col_character(),
## X9 = col_character(),
## X10 = col_character(),
## X11 = col_integer(),
## X12 = col_integer(),
## X13 = col_integer(),
## X14 = col_character(),
## X15 = col_character()
## )
dTest <- read_csv("adult.test",
col_names = FALSE,
na = c('NA', '?', ''),
skip = 1)
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## X2 = col_character(),
## X3 = col_integer(),
## X4 = col_character(),
## X5 = col_integer(),
## X6 = col_character(),
## X7 = col_character(),
## X8 = col_character(),
## X9 = col_character(),
## X10 = col_character(),
## X11 = col_integer(),
## X12 = col_integer(),
## X13 = col_integer(),
## X14 = col_character(),
## X15 = col_character()
## )
Now add the column names using the code from the blog post.
colnames <-
c(
'age',
'workclass',
'fnlwgt',
'education',
'education-num',
'marital-status',
'occupation',
'relationship',
'race',
'sex',
'capital-gain',
'capital-loss',
'hours-per-week',
'native-country',
'class'
)
colnames(dTrain) = colnames
colnames(dTest) = colnames
table(dTrain$class)
##
## <=50K >50K
## 24720 7841
table(dTest$class)
##
## <=50K. >50K.
## 12435 3846
Note the extra "." in the dTest$class values. The values also need to be turned into legitimate R names so that caret can use them as class levels. Recode and check.
dTrain$class = ifelse(dTrain$class == "<=50K", "leq50K","gtr50K")
dTest$class = ifelse(dTest$class == "<=50K.", "leq50K","gtr50K")
dTrain$class = factor(dTrain$class)
dTest$class = factor(dTest$class)
table(dTrain$class)
##
## gtr50K leq50K
## 7841 24720
table(dTest$class)
##
## gtr50K leq50K
## 3846 12435
yName <- 'class'
yTarget <- 'gtr50K'
varNames <- setdiff(colnames,yName)
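A quick note on the next call: mkCrossFrameCExperiment both designs the variable treatments and builds a cross-validated ("cross frame") version of the training data, so the rows used to design a treatment are not the same rows it is later applied to. This avoids the over-fit that comes from naively re-using the training data for both steps.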
tic()
cd <- vtreat::mkCrossFrameCExperiment(dTrain,varNames,yName,yTarget,
parallelCluster=parallelCluster)
## [1] "vtreat 1.4.2 start initial treatment design Mon Jul 8 12:36:08 2019"
## [1] " start cross frame work Mon Jul 8 12:36:17 2019"
## [1] " vtreat::mkCrossFrameCExperiment done Mon Jul 8 12:36:22 2019"
scoreFrame <- cd$treatments$scoreFrame
dTrainTreated <- cd$crossFrame
# pick our variables
newVars <- scoreFrame$varName[scoreFrame$sig<1/nrow(scoreFrame)]
dTestTreated <- vtreat::prepare(cd$treatments,dTest,
pruneSig=NULL,varRestriction=newVars)
toc()
## 13.372 sec elapsed
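As a side check (not in the blog post), you can inspect the scoreFrame to see what the significance filter did; the exact values will vary a little from run to run because the cross frame uses random splits.
# how many derived variables survived the sig < 1/nrow(scoreFrame) cut?
length(newVars)
# the most significant derived variables
head(scoreFrame[order(scoreFrame$sig), c("varName", "sig")], 10)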
tic()
yForm <- as.formula(paste(yName,paste(newVars,collapse=' + '),sep=' ~ '))
# from: http://topepo.github.io/caret/training.html
fitControl <- trainControl(
method = "cv",
number = 3)
model <- train(yForm,
data = dTrainTreated,
method = "gbm",
trControl = fitControl,
verbose = FALSE)
print(model)
## Stochastic Gradient Boosting
##
## 32561 samples
## 59 predictor
## 2 classes: 'gtr50K', 'leq50K'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 21707, 21708, 21707
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.8473328 0.5062379
## 1 100 0.8554714 0.5555624
## 1 150 0.8574676 0.5701225
## 2 50 0.8562699 0.5630326
## 2 100 0.8605388 0.5848138
## 2 150 0.8632414 0.5965881
## 3 50 0.8595253 0.5774254
## 3 100 0.8645006 0.5999199
## 3 150 0.8669882 0.6089126
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
dTest$pred <- predict(model,newdata=dTestTreated,type='prob')[,yTarget]
toc()
## 18.859 sec elapsed
There are a few steps worth examining in the way yForm was created.
The first step is to build the right-hand side of the formula. It begins with a vector of variable names; paste() with a collapse argument turns this vector into a single long string with the variable names separated by a “+” symbol.
RHS = paste(newVars,collapse=' + ')
RHS
## [1] "age + workclass_catP + workclass_catB + education_catP + education_catB + education_minus_num + marital_minus_status_catP + marital_minus_status_catB + occupation_catP + occupation_catB + relationship_catP + relationship_catB + race_catP + race_catB + capital_minus_gain + capital_minus_loss + hours_minus_per_minus_week + native_minus_country_catP + native_minus_country_catB + workclass_lev_NA + workclass_lev_x_Federal_minus_gov + workclass_lev_x_Local_minus_gov + workclass_lev_x_Private + workclass_lev_x_Self_minus_emp_minus_inc + workclass_lev_x_Self_minus_emp_minus_not_minus_inc + workclass_lev_x_State_minus_gov + education_lev_x_10th + education_lev_x_11th + education_lev_x_Bachelors + education_lev_x_HS_minus_grad + education_lev_x_Masters + education_lev_x_Some_minus_college + marital_minus_status_lev_x_Divorced + marital_minus_status_lev_x_Married_minus_civ_minus_spouse + marital_minus_status_lev_x_Never_minus_married + marital_minus_status_lev_x_Separated + marital_minus_status_lev_x_Widowed + occupation_lev_NA + occupation_lev_x_Adm_minus_clerical + occupation_lev_x_Exec_minus_managerial + occupation_lev_x_Farming_minus_fishing + occupation_lev_x_Handlers_minus_cleaners + occupation_lev_x_Machine_minus_op_minus_inspct + occupation_lev_x_Other_minus_service + occupation_lev_x_Prof_minus_specialty + occupation_lev_x_Sales + occupation_lev_x_Tech_minus_support + occupation_lev_x_Transport_minus_moving + relationship_lev_x_Husband + relationship_lev_x_Not_minus_in_minus_family + relationship_lev_x_Other_minus_relative + relationship_lev_x_Own_minus_child + relationship_lev_x_Unmarried + relationship_lev_x_Wife + race_lev_x_Black + race_lev_x_White + sex_lev_x_Female + sex_lev_x_Male + native_minus_country_lev_x_United_minus_States"
The next step adds the left side of the formula and the “~”.
aform = paste(yName,RHS, sep = "~")
aform
## [1] "class~age + workclass_catP + workclass_catB + education_catP + education_catB + education_minus_num + marital_minus_status_catP + marital_minus_status_catB + occupation_catP + occupation_catB + relationship_catP + relationship_catB + race_catP + race_catB + capital_minus_gain + capital_minus_loss + hours_minus_per_minus_week + native_minus_country_catP + native_minus_country_catB + workclass_lev_NA + workclass_lev_x_Federal_minus_gov + workclass_lev_x_Local_minus_gov + workclass_lev_x_Private + workclass_lev_x_Self_minus_emp_minus_inc + workclass_lev_x_Self_minus_emp_minus_not_minus_inc + workclass_lev_x_State_minus_gov + education_lev_x_10th + education_lev_x_11th + education_lev_x_Bachelors + education_lev_x_HS_minus_grad + education_lev_x_Masters + education_lev_x_Some_minus_college + marital_minus_status_lev_x_Divorced + marital_minus_status_lev_x_Married_minus_civ_minus_spouse + marital_minus_status_lev_x_Never_minus_married + marital_minus_status_lev_x_Separated + marital_minus_status_lev_x_Widowed + occupation_lev_NA + occupation_lev_x_Adm_minus_clerical + occupation_lev_x_Exec_minus_managerial + occupation_lev_x_Farming_minus_fishing + occupation_lev_x_Handlers_minus_cleaners + occupation_lev_x_Machine_minus_op_minus_inspct + occupation_lev_x_Other_minus_service + occupation_lev_x_Prof_minus_specialty + occupation_lev_x_Sales + occupation_lev_x_Tech_minus_support + occupation_lev_x_Transport_minus_moving + relationship_lev_x_Husband + relationship_lev_x_Not_minus_in_minus_family + relationship_lev_x_Other_minus_relative + relationship_lev_x_Own_minus_child + relationship_lev_x_Unmarried + relationship_lev_x_Wife + race_lev_x_Black + race_lev_x_White + sex_lev_x_Female + sex_lev_x_Male + native_minus_country_lev_x_United_minus_States"
What we have now is just a string. Some modeling functions will convert a string into a formula automatically, but it is best to convert it explicitly into a formula object.
class(aform)
## [1] "character"
aform = as.formula(aform)
class(aform)
## [1] "formula"
Let’s draw the ROC curve and look at the AUC. Note that the code in the blog post was missing two required arguments.
WVPlots::ROCPlot(dTest,'pred',yName,title = 'predictions on test',truthTarget = "gtr50K")
Confusion Matrix
This is the version in the blog post.
confusionMatrix <- table(truth=dTest[[yName]],pred=dTest$pred>=0.5)
print(confusionMatrix)
## pred
## truth FALSE TRUE
## gtr50K 1440 2406
## leq50K 11732 703
Note that this is different from the blog post, largely because of the recoded class variable. There is also a small difference due to different random cross-validation splits.
I prefer the output format of the confusionMatrix function from caret.
dTest$classPred = factor(ifelse(dTest$pred >=.5, "gtr50K","leq50K"))
confusionMatrix(dTest$classPred,dTest$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction gtr50K leq50K
## gtr50K 2406 703
## leq50K 1440 11732
##
## Accuracy : 0.8684
## 95% CI : (0.8631, 0.8735)
## No Information Rate : 0.7638
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6094
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6256
## Specificity : 0.9435
## Pos Pred Value : 0.7739
## Neg Pred Value : 0.8907
## Prevalence : 0.2362
## Detection Rate : 0.1478
## Detection Prevalence : 0.1910
## Balanced Accuracy : 0.7845
##
## 'Positive' Class : gtr50K
##
tic()
fitControl <- trainControl(
method = "cv",
number = 3)
model1 <- train(yForm,
data = dTrainTreated,
method = "gbm",
trControl = fitControl,
verbose = FALSE,
tuneLength = 10)
print(model1)
## Stochastic Gradient Boosting
##
## 32561 samples
## 59 predictor
## 2 classes: 'gtr50K', 'leq50K'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 21708, 21707, 21707
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.8479163 0.5091460
## 1 100 0.8555329 0.5558312
## 1 150 0.8580512 0.5706463
## 1 200 0.8597097 0.5801216
## 1 250 0.8612760 0.5871537
## 1 300 0.8616752 0.5896155
## 1 350 0.8618901 0.5914259
## 1 400 0.8626272 0.5949158
## 1 450 0.8619823 0.5932325
## 1 500 0.8633336 0.5983401
## 2 50 0.8562699 0.5615158
## 2 100 0.8606617 0.5855945
## 2 150 0.8626886 0.5947419
## 2 200 0.8639171 0.6000594
## 2 250 0.8646849 0.6027641
## 2 300 0.8654219 0.6057825
## 2 350 0.8667733 0.6095162
## 2 400 0.8667425 0.6104620
## 2 450 0.8671418 0.6117568
## 2 500 0.8669575 0.6116073
## 3 50 0.8593411 0.5776276
## 3 100 0.8632415 0.5967408
## 3 150 0.8652991 0.6049827
## 3 200 0.8662204 0.6089815
## 3 250 0.8672953 0.6127857
## 3 300 0.8679403 0.6152513
## 3 350 0.8686466 0.6175128
## 3 400 0.8692302 0.6195859
## 3 450 0.8695066 0.6204971
## 3 500 0.8695066 0.6208689
## 4 50 0.8621052 0.5891990
## 4 100 0.8658519 0.6055269
## 4 150 0.8671725 0.6122557
## 4 200 0.8681553 0.6161575
## 4 250 0.8683702 0.6173360
## 4 300 0.8690766 0.6192936
## 4 350 0.8698137 0.6218069
## 4 400 0.8694758 0.6208449
## 4 450 0.8702437 0.6232331
## 4 500 0.8700287 0.6229679
## 5 50 0.8629650 0.5936022
## 5 100 0.8668347 0.6105603
## 5 150 0.8692916 0.6195822
## 5 200 0.8699058 0.6218010
## 5 250 0.8702129 0.6227687
## 5 300 0.8709193 0.6258818
## 5 350 0.8712878 0.6274662
## 5 400 0.8709500 0.6259851
## 5 450 0.8709192 0.6269849
## 5 500 0.8694144 0.6229953
## 6 50 0.8649921 0.6003000
## 6 100 0.8684010 0.6161311
## 6 150 0.8699365 0.6213824
## 6 200 0.8704279 0.6244469
## 6 250 0.8702129 0.6249126
## 6 300 0.8703357 0.6260539
## 6 350 0.8688309 0.6215216
## 6 400 0.8687694 0.6212273
## 6 450 0.8684931 0.6203240
## 6 500 0.8672953 0.6166515
## 7 50 0.8651763 0.6011230
## 7 100 0.8700901 0.6210773
## 7 150 0.8712264 0.6257571
## 7 200 0.8717178 0.6274508
## 7 250 0.8707964 0.6261686
## 7 300 0.8703051 0.6245890
## 7 350 0.8696909 0.6233951
## 7 400 0.8688309 0.6207646
## 7 450 0.8686774 0.6202933
## 7 500 0.8675103 0.6170713
## 8 50 0.8673261 0.6085859
## 8 100 0.8698751 0.6218713
## 8 150 0.8711650 0.6267252
## 8 200 0.8709807 0.6269851
## 8 250 0.8696601 0.6239133
## 8 300 0.8696294 0.6237753
## 8 350 0.8684623 0.6207588
## 8 400 0.8687694 0.6211153
## 8 450 0.8677866 0.6185254
## 8 500 0.8672953 0.6172502
## 9 50 0.8673875 0.6091253
## 9 100 0.8699672 0.6226623
## 9 150 0.8706736 0.6257306
## 9 200 0.8710114 0.6276635
## 9 250 0.8703358 0.6254921
## 9 300 0.8697215 0.6240693
## 9 350 0.8689230 0.6221572
## 9 400 0.8692608 0.6231535
## 9 450 0.8672339 0.6174523
## 9 500 0.8675410 0.6192085
## 10 50 0.8677867 0.6118317
## 10 100 0.8711036 0.6266889
## 10 150 0.8718099 0.6298033
## 10 200 0.8707657 0.6271908
## 10 250 0.8701208 0.6262337
## 10 300 0.8693529 0.6242952
## 10 350 0.8692608 0.6242844
## 10 400 0.8694451 0.6247471
## 10 450 0.8687080 0.6223814
## 10 500 0.8684623 0.6217944
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 10, shrinkage = 0.1 and n.minobsinnode = 10.
dTest$pred <- predict(model1,newdata=dTestTreated,type='prob')[,yTarget]
toc()
## 176.13 sec elapsed
Note that increasing the tuneLength changed the optimal values of the two hyperparameters that were actually tuned. Both of these values are interior values, not on the edges of the allowed ranges, so they can be taken seriously. However, two other hyperparameters were fixed and not tuned. To explore the possibilities, I’ll set a grid that fixes our two tuned hyperparameters and lets the others vary.
aGrid = expand.grid(n.trees = 450,
interaction.depth = 5,
shrinkage = c(.08,.09,.1,.11),
n.minobsinnode = c(8,9,10,11,12))
tic()
fitControl <- trainControl(
method = "cv",
number = 3)
model1 <- train(yForm,
data = dTrainTreated,
method = "gbm",
trControl = fitControl,
verbose = FALSE,
tuneGrid = aGrid)
print(model1)
## Stochastic Gradient Boosting
##
## 32561 samples
## 59 predictor
## 2 classes: 'gtr50K', 'leq50K'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 21708, 21707, 21707
## Resampling results across tuning parameters:
##
## shrinkage n.minobsinnode Accuracy Kappa
## 0.08 8 0.8696292 0.6218965
## 0.08 9 0.8698442 0.6220158
## 0.08 10 0.8692913 0.6205841
## 0.08 11 0.8715947 0.6279095
## 0.08 12 0.8700592 0.6244626
## 0.09 8 0.8698135 0.6223364
## 0.09 9 0.8691992 0.6211516
## 0.09 10 0.8699670 0.6237303
## 0.09 11 0.8698134 0.6221842
## 0.09 12 0.8709191 0.6260237
## 0.10 8 0.8709191 0.6262500
## 0.10 9 0.8702742 0.6247608
## 0.10 10 0.8692606 0.6209616
## 0.10 11 0.8698749 0.6228259
## 0.10 12 0.8692914 0.6211333
## 0.11 8 0.8696599 0.6229962
## 0.11 9 0.8687078 0.6197278
## 0.11 10 0.8685236 0.6202419
## 0.11 11 0.8699670 0.6240327
## 0.11 12 0.8695063 0.6224484
##
## Tuning parameter 'n.trees' was held constant at a value of 450
##
## Tuning parameter 'interaction.depth' was held constant at a value of 5
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 450,
## interaction.depth = 5, shrinkage = 0.08 and n.minobsinnode = 11.
dTest$pred <- predict(model1,newdata=dTestTreated,type='prob')[,yTarget]
toc()
## 255.296 sec elapsed
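Finally, let’s try caret’s adaptive resampling with a random hyperparameter search. Rather than evaluating every candidate combination on every resample, adaptive_cv drops clearly inferior combinations after a minimum number of resamples and spends the remaining computation on the promising ones; search = "random" draws the candidate combinations at random instead of from a regular grid.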
tic()
adaptControl <- trainControl(method = "adaptive_cv",
number = 3, repeats = 3,
adaptive = list(min = 2, alpha = 0.05,
method = "gls", complete = TRUE),
classProbs = TRUE,
summaryFunction = twoClassSummary,
search = "random")
modela <- train(yForm,
data = dTrainTreated,
method = "gbm",
trControl = adaptControl,
verbose = FALSE,
tuneLength = 5)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
print(modela)
## Stochastic Gradient Boosting
##
## 32561 samples
## 59 predictor
## 2 classes: 'gtr50K', 'leq50K'
##
## No pre-processing
## Resampling: Adaptively Cross-Validated (3 fold, repeated 3 times)
## Summary of sample sizes: 21708, 21707, 21707, 21707, 21708, 21707, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode n.trees ROC
## 0.1110241 2 16 2657 0.9219673
## 0.2132600 1 21 1769 0.9198062
## 0.3279902 10 24 1336 0.8950478
## 0.4825641 5 25 3245 0.8877469
## 0.5979855 8 11 2072 0.8712970
## Sens Spec Resamples
## 0.6445180 0.9374326 9
## 0.6272174 0.9400081 3
## 0.6454961 0.9098908 2
## 0.6449199 0.9000000 2
## 0.6332511 0.8917476 2
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 2657,
## interaction.depth = 2, shrinkage = 0.1110241 and n.minobsinnode = 16.
dTest$preda <- predict(modela,newdata=dTestTreated,type='prob')[,yTarget]
toc()
## 644.755 sec elapsed
We can compare this set of hyperparameter values with the first set using the ROCPlot function.
WVPlots::ROCPlot(dTest,'preda',yName,title = 'predictions on test',truthTarget = "gtr50K")
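To put a number on the comparison (this check is mine, not from the blog post), here is a small base-R sketch of the rank-based (Mann-Whitney) AUC; at this point dTest$pred holds the grid-search model’s predictions and dTest$preda the adaptive-search model’s.
# AUC = probability a random positive case outranks a random negative case
aucOf <- function(score, truth, target = yTarget) {
  isPos <- truth == target
  r <- rank(score)
  (sum(r[isPos]) - sum(isPos) * (sum(isPos) + 1) / 2) / (sum(isPos) * sum(!isPos))
}
aucOf(dTest$pred, dTest$class)   # grid-search gbm
aucOf(dTest$preda, dTest$class)  # adaptive random-search gbm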
The bottom line is that there is no improvement in out-of-sample prediction.
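One last piece of housekeeping: when the analysis is finished, shut down the parallel cluster that was started at the top.
# release the worker processes
parallel::stopCluster(parallelCluster)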