knitr::opts_chunk$set(echo = TRUE)

Goal is to predict attrition, employees who are likely to leave the company.

Import data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
spam <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-08-15/spam.csv')
## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore data

skimr::skim(spam)
Data summary
Name spam
Number of rows 4601
Number of columns 7
_______________________
Column type frequency:
character 1
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
yesno 0 1 1 1 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
crl.tot 0 1 283.29 606.35 1 35 95 266.00 15841.00 ▇▁▁▁▁
dollar 0 1 0.08 0.25 0 0 0 0.05 6.00 ▇▁▁▁▁
bang 0 1 0.27 0.82 0 0 0 0.32 32.48 ▇▁▁▁▁
money 0 1 0.09 0.44 0 0 0 0.00 12.50 ▇▁▁▁▁
n000 0 1 0.10 0.35 0 0 0 0.00 5.45 ▇▁▁▁▁
make 0 1 0.10 0.31 0 0 0 0.00 4.54 ▇▁▁▁▁
factors_vec <- spam %>% select(money, dollar, bang, n000, make, crl.tot) %>% names()

 data_clean <- spam %>% 
     # Convert character variables to factor 
     mutate(yesno = factor(yesno, levels = c("y", "n")))

Explore data

data_clean %>% count(yesno)
## # A tibble: 2 × 2
##   yesno     n
##   <fct> <int>
## 1 y      1813
## 2 n      2788
data_clean %>%
    ggplot(aes(yesno)) +
    geom_bar()

attrition vs. monthly income

data_clean %>%
    ggplot(aes(yesno, money)) +
    geom_boxplot()

Correlation Funnel Plot

library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
library(dplyr)

data_binarized <- data_clean %>%
    select(-crl.tot) %>%
    binarize()

data_binarized %>% glimpse()
## Rows: 4,601
## Columns: 13
## $ `dollar__-Inf_0.052` <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1…
## $ dollar__0.052_Inf    <dbl> 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0…
## $ `bang__-Inf_0.315`   <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0…
## $ bang__0.315_Inf      <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1…
## $ money__0             <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1…
## $ `money__-OTHER`      <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0…
## $ n000__0              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1…
## $ `n000__-OTHER`       <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ make__0              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1…
## $ make__0.1            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `make__-OTHER`       <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ yesno__y             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ yesno__n             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#step 2: correlation
data_correlation <- data_binarized %>%
    correlate(yesno__y)

data_correlation
## # A tibble: 13 × 3
##    feature bin        correlation
##    <fct>   <chr>            <dbl>
##  1 yesno   y               1     
##  2 yesno   n              -1     
##  3 dollar  -Inf_0.052     -0.566 
##  4 dollar  0.052_Inf       0.566 
##  5 bang    -Inf_0.315     -0.490 
##  6 bang    0.315_Inf       0.490 
##  7 money   -OTHER          0.475 
##  8 money   0              -0.475 
##  9 n000    0              -0.419 
## 10 n000    -OTHER          0.419 
## 11 make    0              -0.239 
## 12 make    -OTHER          0.223 
## 13 make    0.1             0.0803
# step 3: plot
data_correlation %>%
    correlationfunnel:: plot_correlation_funnel()

Split data

library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.8     ✔ rsample      1.3.0
## ✔ dials        1.4.0     ✔ tune         1.3.0
## ✔ infer        1.0.7     ✔ workflows    1.2.0
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.3.1     ✔ yardstick    1.3.2
## ✔ recipes      1.2.1
## Warning: package 'broom' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
set.seed(1234)
data <- spam %>% sample_n(100)

data_split <- initial_split(spam, strata = yesno)
data_train <- training(data_split)
data_test <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, strata = yesno)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [3104/346]> Fold01
##  2 <split [3105/345]> Fold02
##  3 <split [3105/345]> Fold03
##  4 <split [3105/345]> Fold04
##  5 <split [3105/345]> Fold05
##  6 <split [3105/345]> Fold06
##  7 <split [3105/345]> Fold07
##  8 <split [3105/345]> Fold08
##  9 <split [3105/345]> Fold09
## 10 <split [3106/344]> Fold10

Preprocess data

library(themis)
## Warning: package 'themis' was built under R version 4.4.3
data_rec <- recipes::recipe(yesno ~ ., data = data_train) %>%
    step_YeoJohnson(all_numeric_predictors()) %>%
    step_normalize(all_numeric_predictors())

data_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 3,450
## Columns: 7
## $ crl.tot <dbl> -0.83644653, -0.24209462, 1.96201706, -0.64492123, 0.30637742,…
## $ dollar  <dbl> -0.3093153, -0.3093153, -0.3093153, -0.3093153, -0.3093153, -0…
## $ bang    <dbl> 0.3144685, 0.8044657, -0.8250407, -0.8250407, -0.8250407, -0.8…
## $ money   <dbl> -0.2018991, -0.2018991, -0.2018991, -0.2018991, -0.2018991, -0…
## $ n000    <dbl> -0.2835952, -0.2835952, -0.2835952, -0.2835952, -0.2835952, -0…
## $ make    <dbl> -0.3438677, -0.3438677, -0.3438677, -0.3438677, -0.3438677, -0…
## $ yesno   <fct> n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,…

Specify Model

library(usemodels)
## Warning: package 'usemodels' was built under R version 4.4.3
usemodels::use_xgboost(yesno ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = yesno ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(94429)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_rec <- recipe(yesno ~ ., data = data_train) %>%
    step_YeoJohnson(all_numeric_predictors()) %>%
    step_normalize(all_numeric_predictors())
xgboost_spec <-
    boost_tree(trees = tune()) %>%
    set_mode("classification") %>%
    set_engine("xgboost")

xgboost_workflow <-
    workflow() %>%
    add_recipe(xgboost_rec) %>%  # Use the newly defined recipe
    add_model(xgboost_spec)

Tune Hyperparameters

library(doParallel)
## Warning: package 'doParallel' was built under R version 4.4.3
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.4.3
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Warning: package 'iterators' was built under R version 4.4.3
## Loading required package: parallel
doParallel::registerDoParallel()

set.seed(65743)
xgboost_tune <-
    tune_grid(xgboost_workflow,
              resamples = data_cv,
              grid = 5, 
              control = control_grid(save_pred = TRUE))
## Warning: ! tune detected a parallel backend registered with foreach but no backend
##   registered with future.
## ℹ Support for parallel processing with foreach was soft-deprecated in tune
##   1.2.1.
## ℹ See ?parallelism (`?tune::parallelism()`) to learn more.

Model evaluation

Identify optimal values for hyperparameters

collect_metrics(xgboost_tune)
## # A tibble: 15 × 7
##    trees .metric     .estimator   mean     n std_err .config             
##    <int> <chr>       <chr>       <dbl> <int>   <dbl> <chr>               
##  1    12 accuracy    binary     0.881     10 0.00428 Preprocessor1_Model1
##  2    12 brier_class binary     0.0929    10 0.00306 Preprocessor1_Model1
##  3    12 roc_auc     binary     0.923     10 0.00546 Preprocessor1_Model1
##  4   507 accuracy    binary     0.871     10 0.00453 Preprocessor1_Model2
##  5   507 brier_class binary     0.104     10 0.00292 Preprocessor1_Model2
##  6   507 roc_auc     binary     0.908     10 0.00370 Preprocessor1_Model2
##  7  1037 accuracy    binary     0.869     10 0.00254 Preprocessor1_Model3
##  8  1037 brier_class binary     0.110     10 0.00258 Preprocessor1_Model3
##  9  1037 roc_auc     binary     0.903     10 0.00399 Preprocessor1_Model3
## 10  1524 accuracy    binary     0.867     10 0.00244 Preprocessor1_Model4
## 11  1524 brier_class binary     0.113     10 0.00249 Preprocessor1_Model4
## 12  1524 roc_auc     binary     0.900     10 0.00393 Preprocessor1_Model4
## 13  1999 accuracy    binary     0.866     10 0.00287 Preprocessor1_Model5
## 14  1999 brier_class binary     0.115     10 0.00250 Preprocessor1_Model5
## 15  1999 roc_auc     binary     0.898     10 0.00406 Preprocessor1_Model5
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(yesno, .pred_y, event_level = "second") %>%
    autoplot()

##Fit the model for the last time

library(yardstick)
xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune)) %>%
    last_fit(data_split)
## Warning in select_best(xgboost_tune): No value of `metric` was given; "roc_auc"
## will be used.
## Warning: package 'xgboost' was built under R version 4.4.3
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary        0.884  Preprocessor1_Model1
## 2 roc_auc     binary        0.923  Preprocessor1_Model1
## 3 brier_class binary        0.0920 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(yesno, .pred_class) %>%
    autoplot()

Variable importance

library(vip)
## Warning: package 'vip' was built under R version 4.4.3
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()

Conclusion