The goal is to predict attrition, i.e., to identify employees who are likely to leave the company.

Import data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>

library(recipes)

## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.12     ✔ tailor       0.1.0 
## ✔ dials        1.4.2      ✔ tune         2.0.1 
## ✔ infer        1.1.0      ✔ workflows    1.3.0 
## ✔ modeldata    1.5.1      ✔ workflowsets 1.1.1 
## ✔ parsnip      1.4.1      ✔ yardstick    1.3.2 
## ✔ rsample      1.3.2      
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()

library(themis)
library(doParallel)

## Loading required package: foreach
## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loading required package: iterators
## Loading required package: parallel

# Load the raw HR attrition records; column types are guessed by readr.
data <- read_csv(file = "WA_Fn-UseC_-HR-Employee-Attrition.csv")

## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Issues with data

Missing values
Factors or numeric variables
Education, EnvironmentSatisfaction, JobInvolvement, JobSatisfaction, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance, JobLevel, StockOptionLevel
Zero variance variables
Over18, EmployeeCount, StandardHours
Character variables
Unbalanced target variable
ID variable

# Columns that were imported as numeric but are really categorical codes.
factors_vec <- c(
  "Education", "EnvironmentSatisfaction", "JobInvolvement",
  "JobSatisfaction", "PerformanceRating", "RelationshipSatisfaction",
  "WorkLifeBalance", "JobLevel", "StockOptionLevel"
)

data_clean <- data %>%
  mutate(
    # Treat the coded columns as factors
    across(all_of(factors_vec), as.factor),
    # Relabel the positive class: "Yes" becomes "Left", "No" is kept
    Attrition = if_else(Attrition == "Yes", "Left", Attrition)
  ) %>%
  # Remove the zero-variance columns
  select(-c(Over18, EmployeeCount, StandardHours))

Explore data

# Class balance of the target
count(data_clean, Attrition)

## # A tibble: 2 × 2
##   Attrition     n
##   <chr>     <int>
## 1 Left        237
## 2 No         1233

# Bar chart of the number of employees in each Attrition class
ggplot(data_clean, aes(x = Attrition)) +
  geom_bar()

Attrition vs. monthly income

# Distribution of MonthlyIncome within each Attrition class
ggplot(data_clean, aes(x = Attrition, y = MonthlyIncome)) +
  geom_boxplot()

Correlation plot

# step 1: binarize
# EmployeeNumber is an identifier, so it is dropped before binarizing.
data_binarized <- binarize(select(data_clean, -EmployeeNumber))

glimpse(data_binarized)

## Rows: 1,470
## Columns: 120
## $ `Age__-Inf_30`                       <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Age__30_36                           <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, …
## $ Age__36_43                           <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ Age__43_Inf                          <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ Attrition__Left                      <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Attrition__No                        <dbl> 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `BusinessTravel__Non-Travel`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BusinessTravel__Travel_Frequently    <dbl> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ BusinessTravel__Travel_Rarely        <dbl> 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ `DailyRate__-Inf_465`                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ DailyRate__465_802                   <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ DailyRate__802_1157                  <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ DailyRate__1157_Inf                  <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, …
## $ Department__Human_Resources          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Department__Research_&_Development` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Department__Sales                    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `DistanceFromHome__-Inf_2`           <dbl> 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__2_7                <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ DistanceFromHome__7_14               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__14_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ Education__1                         <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Education__2                         <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ Education__3                         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, …
## $ Education__4                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ Education__5                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Human_Resources      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Life_Sciences        <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ EducationField__Marketing            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Medical              <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, …
## $ EducationField__Other                <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Technical_Degree     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__1           <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ EnvironmentSatisfaction__2           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__3           <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, …
## $ EnvironmentSatisfaction__4           <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ Gender__Female                       <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ Gender__Male                         <dbl> 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ `HourlyRate__-Inf_48`                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ HourlyRate__48_66                    <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ HourlyRate__66_83.75                 <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ HourlyRate__83.75_Inf                <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ JobInvolvement__1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobInvolvement__2                    <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobInvolvement__3                    <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ JobInvolvement__4                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ JobLevel__1                          <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ JobLevel__2                          <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobLevel__3                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobLevel__4                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobLevel__5                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Healthcare_Representative   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobRole__Human_Resources             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Laboratory_Technician       <dbl> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, …
## $ JobRole__Manager                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Manufacturing_Director      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobRole__Research_Director           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Research_Scientist          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Executive             <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Representative        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobSatisfaction__1                   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ JobSatisfaction__2                   <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ JobSatisfaction__3                   <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, …
## $ JobSatisfaction__4                   <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ MaritalStatus__Divorced              <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ MaritalStatus__Married               <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, …
## $ MaritalStatus__Single                <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ `MonthlyIncome__-Inf_2911`           <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ MonthlyIncome__2911_4919             <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ MonthlyIncome__4919_8379             <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ MonthlyIncome__8379_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `MonthlyRate__-Inf_8047`             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ MonthlyRate__8047_14235.5            <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
## $ MonthlyRate__14235.5_20461.5         <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, …
## $ MonthlyRate__20461.5_Inf             <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ `NumCompaniesWorked__-Inf_1`         <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ NumCompaniesWorked__1_2              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ NumCompaniesWorked__2_4              <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ NumCompaniesWorked__4_Inf            <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ OverTime__No                         <dbl> 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ OverTime__Yes                        <dbl> 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ `PercentSalaryHike__-Inf_12`         <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__12_14             <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, …
## $ PercentSalaryHike__14_18             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__18_Inf            <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ PerformanceRating__3                 <dbl> 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, …
## $ PerformanceRating__4                 <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ RelationshipSatisfaction__1          <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ RelationshipSatisfaction__2          <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, …
## $ RelationshipSatisfaction__3          <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, …
## $ RelationshipSatisfaction__4          <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ StockOptionLevel__0                  <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ StockOptionLevel__1                  <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ StockOptionLevel__2                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ StockOptionLevel__3                  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `TotalWorkingYears__-Inf_6`          <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ TotalWorkingYears__6_10              <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ TotalWorkingYears__10_15             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ TotalWorkingYears__15_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `TrainingTimesLastYear__-Inf_2`      <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, …
## $ TrainingTimesLastYear__2_3           <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, …
## $ TrainingTimesLastYear__3_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ WorkLifeBalance__1                   <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WorkLifeBalance__2                   <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
## $ WorkLifeBalance__3                   <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, …
## $ WorkLifeBalance__4                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsAtCompany__-Inf_3`             <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsAtCompany__3_5                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsAtCompany__5_9                  <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, …
## $ YearsAtCompany__9_Inf                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsInCurrentRole__-Inf_2`         <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsInCurrentRole__2_3              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ YearsInCurrentRole__3_7              <dbl> 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, …
## $ YearsInCurrentRole__7_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsSinceLastPromotion__-Inf_1`    <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, …
## $ YearsSinceLastPromotion__1_3         <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ YearsSinceLastPromotion__3_Inf       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `YearsWithCurrManager__-Inf_2`       <dbl> 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsWithCurrManager__2_3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsWithCurrManager__3_7            <dbl> 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ YearsWithCurrManager__7_Inf          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …

# step 2: correlation
# Correlate every binary feature with the "Left" indicator of the target.
data_correlation <- correlate(data_binarized, target = Attrition__Left)

data_correlation

## # A tibble: 120 × 3
##    feature           bin       correlation
##    <fct>             <chr>           <dbl>
##  1 Attrition         Left            1    
##  2 Attrition         No             -1    
##  3 OverTime          No             -0.246
##  4 OverTime          Yes             0.246
##  5 JobLevel          1               0.213
##  6 MonthlyIncome     -Inf_2911       0.207
##  7 StockOptionLevel  0               0.195
##  8 YearsAtCompany    -Inf_3          0.183
##  9 MaritalStatus     Single          0.175
## 10 TotalWorkingYears -Inf_6          0.169
## # ℹ 110 more rows

# step 3: plot
correlationfunnel::plot_correlation_funnel(data_correlation)

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: ggrepel: 73 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model building

Split data

# Fix the RNG so the split and the folds are reproducible.
set.seed(1234)

# Train/test split, stratified on the target.
data_split <- data_clean %>% initial_split(strata = Attrition)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()

# Cross-validation folds on the training set, also stratified on the target.
data_cv <- data_train %>% vfold_cv(strata = Attrition)

Preprocess data

library(themis)

# Preprocessing recipe for the xgboost model, defined on the training set:
#   - EmployeeNumber is given the "ID" role so it is not used as a predictor
#   - nominal predictors are expanded into dummy columns
#   - numeric predictors (the new dummy columns included) are normalized
#   - SMOTE over-samples the minority class of the outcome
xgboost_rec <- recipes::recipe(Attrition ~ ., data = data_train) %>%
  update_role(EmployeeNumber, new_role = "ID") %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(all_outcomes())

# Inspect the processed training data. `juice()` is superseded in recipes;
# `bake(new_data = NULL)` returns the same prepped training set.
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()

## Rows: 1,848
## Columns: 64
## $ Age                               <dbl> 0.008807237, -0.101383303, -0.542145…
## $ DailyRate                         <dbl> 1.42650782, 1.04705665, 0.81938595, …
## $ DistanceFromHome                  <dbl> -0.90560797, -0.04468277, 0.81624244…
## $ EmployeeNumber                    <dbl> 4, 27, 33, 45, 47, 55, 58, 64, 90, 1…
## $ HourlyRate                        <dbl> 1.281526684, 0.789662751, 0.29779881…
## $ MonthlyIncome                     <dbl> -0.93899597, -0.66085114, -0.5527189…
## $ MonthlyRate                       <dbl> -1.6645786, -1.0194609, -1.3434252, …
## $ NumCompaniesWorked                <dbl> 1.3192201, 1.7195678, -0.6825182, -0…
## $ PercentSalaryHike                 <dbl> -0.06569789, 2.10974214, 1.83781213,…
## $ TotalWorkingYears                 <dbl> -0.5386623, -0.1543025, -0.1543025, …
## $ TrainingTimesLastYear             <dbl> 0.1451189, 0.9245137, 1.7039084, -0.…
## $ YearsAtCompany                    <dbl> -1.119612991, -0.323504813, 0.472603…
## $ YearsInCurrentRole                <dbl> -1.15408286, -0.33886992, -0.6106075…
## $ YearsSinceLastPromotion           <dbl> -0.67903619, -0.67903619, 1.21606861…
## $ YearsWithCurrManager              <dbl> -1.12646331, -0.29520587, 0.81313739…
## $ Attrition                         <fct> Left, Left, Left, Left, Left, Left, …
## $ BusinessTravel_Travel_Frequently  <dbl> -0.4781076, -0.4781076, 2.0896799, -…
## $ BusinessTravel_Travel_Rarely      <dbl> 0.6327712, 0.6327712, -1.5789147, 0.…
## $ Department_Research...Development <dbl> 0.7300286, -1.3685653, 0.7300286, 0.…
## $ Department_Sales                  <dbl> -0.6595963, 1.5147018, -0.6595963, -…
## $ Education_X2                      <dbl> 2.0834380, -0.4795399, -0.4795399, -…
## $ Education_X3                      <dbl> -0.8062777, -0.8062777, -0.8062777, …
## $ Education_X4                      <dbl> -0.5949133, 1.6793906, -0.5949133, -…
## $ Education_X5                      <dbl> -0.1990579, -0.1990579, -0.1990579, …
## $ EducationField_Life.Sciences      <dbl> -0.8420112, 1.1865540, 1.1865540, -0…
## $ EducationField_Marketing          <dbl> -0.3463115, -0.3463115, -0.3463115, …
## $ EducationField_Medical            <dbl> -0.6981366, -0.6981366, -0.6981366, …
## $ EducationField_Other              <dbl> 4.1634411, -0.2399678, -0.2399678, -…
## $ EducationField_Technical.Degree   <dbl> -0.2835453, -0.2835453, -0.2835453, …
## $ EnvironmentSatisfaction_X2        <dbl> -0.5079873, -0.5079873, 1.9667651, 1…
## $ EnvironmentSatisfaction_X3        <dbl> -0.6581793, 1.5179630, -0.6581793, -…
## $ EnvironmentSatisfaction_X4        <dbl> 1.5445140, -0.6468648, -0.6468648, -…
## $ Gender_Male                       <dbl> 0.8078118, 0.8078118, -1.2367877, 0.…
## $ JobInvolvement_X2                 <dbl> 1.6873239, 1.6873239, -0.5921161, -0…
## $ JobInvolvement_X3                 <dbl> -1.1755257, -1.1755257, -1.1755257, …
## $ JobInvolvement_X4                 <dbl> -0.3397008, -0.3397008, -0.3397008, …
## $ JobLevel_X2                       <dbl> -0.7535647, -0.7535647, -0.7535647, …
## $ JobLevel_X3                       <dbl> -0.4166727, -0.4166727, -0.4166727, …
## $ JobLevel_X4                       <dbl> -0.2872644, -0.2872644, -0.2872644, …
## $ JobLevel_X5                       <dbl> -0.218015, -0.218015, -0.218015, -0.…
## $ JobRole_Human.Resources           <dbl> -0.1990579, -0.1990579, -0.1990579, …
## $ JobRole_Laboratory.Technician     <dbl> 2.1411830, -0.4666074, -0.4666074, -…
## $ JobRole_Manager                   <dbl> -0.2663591, -0.2663591, -0.2663591, …
## $ JobRole_Manufacturing.Director    <dbl> -0.3380364, -0.3380364, -0.3380364, …
## $ JobRole_Research.Director         <dbl> -0.2524088, -0.2524088, -0.2524088, …
## $ JobRole_Research.Scientist        <dbl> -0.4766741, -0.4766741, 2.0959640, 2…
## $ JobRole_Sales.Executive           <dbl> -0.5319394, -0.5319394, -0.5319394, …
## $ JobRole_Sales.Representative      <dbl> -0.2483152, 4.0234821, -0.2483152, -…
## $ JobSatisfaction_X2                <dbl> -0.4866861, -0.4866861, -0.4866861, …
## $ JobSatisfaction_X3                <dbl> 1.5179630, -0.6581793, -0.6581793, -…
## $ JobSatisfaction_X4                <dbl> -0.6666917, -0.6666917, -0.6666917, …
## $ MaritalStatus_Married             <dbl> -0.9285259, -0.9285259, -0.9285259, …
## $ MaritalStatus_Single              <dbl> 1.4765158, 1.4765158, 1.4765158, -0.…
## $ OverTime_Yes                      <dbl> 1.6002434, -0.6243374, 1.6002434, 1.…
## $ PerformanceRating_X4              <dbl> -0.4360017, 2.2914860, 2.2914860, -0…
## $ RelationshipSatisfaction_X2       <dbl> 1.8831789, 1.8831789, 1.8831789, -0.…
## $ RelationshipSatisfaction_X3       <dbl> -0.6780813, -0.6780813, -0.6780813, …
## $ RelationshipSatisfaction_X4       <dbl> -0.6243374, -0.6243374, -0.6243374, …
## $ StockOptionLevel_X1               <dbl> -0.8294726, -0.8294726, -0.8294726, …
## $ StockOptionLevel_X2               <dbl> -0.3544752, -0.3544752, -0.3544752, …
## $ StockOptionLevel_X3               <dbl> -0.2420756, -0.2420756, -0.2420756, …
## $ WorkLifeBalance_X2                <dbl> -0.5571643, -0.5571643, -0.5571643, …
## $ WorkLifeBalance_X3                <dbl> 0.8062777, 0.8062777, 0.8062777, -1.…
## $ WorkLifeBalance_X4                <dbl> -0.3313295, -0.3313295, -0.3313295, …

Specify model

library(usemodels)
# Print a starter xgboost template (recipe, spec, workflow, tuning call).
usemodels::use_xgboost(formula = Attrition ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = Attrition ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(68552)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Boosted-tree classifier on the xgboost engine; the number of trees and the
# tree depth are left as tuning placeholders.
xgboost_spec <- boost_tree(
  mode = "classification",
  engine = "xgboost",
  trees = tune(),
  tree_depth = tune()
)

# Bundle the preprocessing recipe and the model specification.
xgboost_workflow <- workflow(
  preprocessor = xgboost_rec,
  spec = xgboost_spec
)

Tune hyperparameters

# Regular grid over the two tuned parameters: 5 levels each of `trees` and
# `tree_depth`, i.e. 25 candidate models.
tree_grid <- grid_regular(
  trees(),
  tree_depth(),
  levels = 5
)

# NOTE(review): recent tune releases moved parallel processing from foreach to
# future/mirai -- confirm this registration still has an effect.
doParallel::registerDoParallel()

# Tune over `tree_grid`. Previously the grid was built but never used because
# `grid = 5` was passed instead.
# NOTE(review): metrics rendered from the earlier `grid = 5` run need to be
# regenerated after this change.
set.seed(65743)
xgboost_tune <- 
  tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = tree_grid,
    control = control_grid(save_pred = TRUE)
  )

Model evaluation

Identify optimal values for hyperparameters

# Resampled performance of every candidate, averaged across folds
xgboost_tune %>% collect_metrics()

## # A tibble: 15 × 8
##    trees tree_depth .metric     .estimator   mean     n std_err .config        
##    <int>      <int> <chr>       <chr>       <dbl> <int>   <dbl> <chr>          
##  1     1          4 accuracy    binary     0.777     10 0.01000 pre0_mod1_post0
##  2     1          4 brier_class binary     0.201     10 0.00234 pre0_mod1_post0
##  3     1          4 roc_auc     binary     0.682     10 0.0197  pre0_mod1_post0
##  4   500         11 accuracy    binary     0.872     10 0.00922 pre0_mod2_post0
##  5   500         11 brier_class binary     0.108     10 0.00747 pre0_mod2_post0
##  6   500         11 roc_auc     binary     0.814     10 0.0161  pre0_mod2_post0
##  7  1000          1 accuracy    binary     0.886     10 0.00408 pre0_mod3_post0
##  8  1000          1 brier_class binary     0.0904    10 0.00307 pre0_mod3_post0
##  9  1000          1 roc_auc     binary     0.858     10 0.0112  pre0_mod3_post0
## 10  1500         15 accuracy    binary     0.872     10 0.00737 pre0_mod4_post0
## 11  1500         15 brier_class binary     0.107     10 0.00678 pre0_mod4_post0
## 12  1500         15 roc_auc     binary     0.814     10 0.0183  pre0_mod4_post0
## 13  2000          8 accuracy    binary     0.868     10 0.00996 pre0_mod5_post0
## 14  2000          8 brier_class binary     0.110     10 0.00783 pre0_mod5_post0
## 15  2000          8 roc_auc     binary     0.804     10 0.0190  pre0_mod5_post0

# Per-fold ROC curves for the selected candidate only. Without the
# `parameters` filter, the held-out predictions of every tuning candidate are
# pooled within each fold, so the curves would not describe any single model.
xgboost_tune %>%
  collect_predictions(
    parameters = select_best(xgboost_tune, metric = "accuracy")
  ) %>%
  group_by(id) %>%
  roc_curve(Attrition, .pred_Left) %>%
  autoplot()

Fit the model for the last time

# Plug the most accurate candidate's parameters into the workflow, then fit on
# the training set and evaluate once on the test set.
xgboost_last <- last_fit(
  finalize_workflow(
    xgboost_workflow,
    select_best(xgboost_tune, metric = "accuracy")
  ),
  split = data_split
)

xgboost_last %>% collect_metrics()

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config        
##   <chr>       <chr>          <dbl> <chr>          
## 1 accuracy    binary         0.843 pre0_mod0_post0
## 2 roc_auc     binary         0.792 pre0_mod0_post0
## 3 brier_class binary         0.121 pre0_mod0_post0

# Confusion matrix of the test-set predictions
autoplot(
  yardstick::conf_mat(
    collect_predictions(xgboost_last),
    truth = Attrition,
    estimate = .pred_class
  )
)

Variable importance

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

# Variable-importance plot from the underlying fitted engine object
vip(workflows::extract_fit_engine(xgboost_last))

Conclusion

The previous model had an accuracy of 0.851 and an AUC of 0.753.

Feature transformation: normalizing the numeric data resulted in a slight improvement, with an accuracy of 0.859 and an AUC of 0.770.
Feature transformation: the Yeo-Johnson transformation produced no improvement.
Feature selection: PCA did not yield an improvement.

Code Along 6

Javony Deleon

2026-03-04

Import data

Explore data

Model building

Split data

Preprocess data

Specify model

Tune hyperparameters

Model evaluation

Identify optimal values for hyperparameters

Fit the model for the last time

Variable importance

Conclusion