Goal is to predict attrition, the employees who are likely to leave the company.

#Import Data

library(tidyverse)
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ correlationfunnel Tip #3 ════════════════════════════════════════════════════
## Using `binarize()` with data containing many columns or many rows can increase dimensionality substantially.
## Try subsetting your data column-wise or row-wise to avoid creating too many columns.
## You can always make a big problem smaller by sampling. :)
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.7     ✔ rsample      1.2.1
## ✔ dials        1.4.0     ✔ tune         1.3.0
## ✔ infer        1.0.7     ✔ workflows    1.2.0
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.3.0     ✔ yardstick    1.3.2
## ✔ recipes      1.1.1
## Warning: package 'dials' was built under R version 4.4.3
## Warning: package 'infer' was built under R version 4.4.3
## Warning: package 'modeldata' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'workflowsets' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
library(rsample)
library(purrr)
library(recipes)
library(themis)
## Warning: package 'themis' was built under R version 4.4.3
library(usemodels)
## Warning: package 'usemodels' was built under R version 4.4.3
library(doParallel)
## Warning: package 'doParallel' was built under R version 4.4.3
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.4.3
## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loading required package: iterators
## Warning: package 'iterators' was built under R version 4.4.3
## Loading required package: parallel
library(workflows)

data <- read_csv("00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Clean Data

skimr::skim(data)
Data summary
Name data
Number of rows 1470
Number of columns 35
_______________________
Column type frequency:
character 9
numeric 26
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Attrition 0 1 2 3 0 2 0
BusinessTravel 0 1 10 17 0 3 0
Department 0 1 5 22 0 3 0
EducationField 0 1 5 16 0 6 0
Gender 0 1 4 6 0 2 0
JobRole 0 1 7 25 0 9 0
MaritalStatus 0 1 6 8 0 3 0
Over18 0 1 1 1 0 1 0
OverTime 0 1 2 3 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1 36.92 9.14 18 30.00 36.0 43.00 60 ▂▇▇▃▂
DailyRate 0 1 802.49 403.51 102 465.00 802.0 1157.00 1499 ▇▇▇▇▇
DistanceFromHome 0 1 9.19 8.11 1 2.00 7.0 14.00 29 ▇▅▂▂▂
Education 0 1 2.91 1.02 1 2.00 3.0 4.00 5 ▂▃▇▆▁
EmployeeCount 0 1 1.00 0.00 1 1.00 1.0 1.00 1 ▁▁▇▁▁
EmployeeNumber 0 1 1024.87 602.02 1 491.25 1020.5 1555.75 2068 ▇▇▇▇▇
EnvironmentSatisfaction 0 1 2.72 1.09 1 2.00 3.0 4.00 4 ▅▅▁▇▇
HourlyRate 0 1 65.89 20.33 30 48.00 66.0 83.75 100 ▇▇▇▇▇
JobInvolvement 0 1 2.73 0.71 1 2.00 3.0 3.00 4 ▁▃▁▇▁
JobLevel 0 1 2.06 1.11 1 1.00 2.0 3.00 5 ▇▇▃▂▁
JobSatisfaction 0 1 2.73 1.10 1 2.00 3.0 4.00 4 ▅▅▁▇▇
MonthlyIncome 0 1 6502.93 4707.96 1009 2911.00 4919.0 8379.00 19999 ▇▅▂▁▂
MonthlyRate 0 1 14313.10 7117.79 2094 8047.00 14235.5 20461.50 26999 ▇▇▇▇▇
NumCompaniesWorked 0 1 2.69 2.50 0 1.00 2.0 4.00 9 ▇▃▂▂▁
PercentSalaryHike 0 1 15.21 3.66 11 12.00 14.0 18.00 25 ▇▅▃▂▁
PerformanceRating 0 1 3.15 0.36 3 3.00 3.0 3.00 4 ▇▁▁▁▂
RelationshipSatisfaction 0 1 2.71 1.08 1 2.00 3.0 4.00 4 ▅▅▁▇▇
StandardHours 0 1 80.00 0.00 80 80.00 80.0 80.00 80 ▁▁▇▁▁
StockOptionLevel 0 1 0.79 0.85 0 0.00 1.0 1.00 3 ▇▇▁▂▁
TotalWorkingYears 0 1 11.28 7.78 0 6.00 10.0 15.00 40 ▇▇▂▁▁
TrainingTimesLastYear 0 1 2.80 1.29 0 2.00 3.0 3.00 6 ▂▇▇▂▃
WorkLifeBalance 0 1 2.76 0.71 1 2.00 3.0 3.00 4 ▁▃▁▇▂
YearsAtCompany 0 1 7.01 6.13 0 3.00 5.0 9.00 40 ▇▂▁▁▁
YearsInCurrentRole 0 1 4.23 3.62 0 2.00 3.0 7.00 18 ▇▃▂▁▁
YearsSinceLastPromotion 0 1 2.19 3.22 0 0.00 1.0 3.00 15 ▇▁▁▁▁
YearsWithCurrManager 0 1 4.12 3.57 0 2.00 3.0 7.00 17 ▇▂▅▁▁
factors_vec <- data %>% select(Education,EnvironmentSatisfaction, JobInvolvement, JobSatisfaction, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance, JobLevel, StockOptionLevel) %>% names()

data_clean <- data %>%
  
  # Address factors imported as numeric
  mutate(across(all_of(factors_vec), as.factor)) %>%
  
  # Drop zero-variance variables
  select(-c(Over18,EmployeeCount, StandardHours)) %>%
  
  # Recode Attrition
  mutate(Attrition = if_else(Attrition == "Yes", "Left", Attrition))

#Explore Data

data_clean %>% count(Attrition)
## # A tibble: 2 × 2
##   Attrition     n
##   <chr>     <int>
## 1 Left        237
## 2 No         1233
data_clean %>%
  ggplot(aes(Attrition))+
  geom_bar()

Attrition vs. Monthly Income

data_clean %>%
  ggplot(aes(Attrition, MonthlyIncome)) +
  geom_boxplot()

Correlation Plot

# Step 1: Binarize the data
data_binarized <- data_clean %>%
  select(-EmployeeNumber) %>%
  binarize()

data_binarized %>% glimpse()
## Rows: 1,470
## Columns: 120
## $ `Age__-Inf_30`                       <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Age__30_36                           <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, …
## $ Age__36_43                           <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ Age__43_Inf                          <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ Attrition__Left                      <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Attrition__No                        <dbl> 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `BusinessTravel__Non-Travel`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BusinessTravel__Travel_Frequently    <dbl> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ BusinessTravel__Travel_Rarely        <dbl> 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ `DailyRate__-Inf_465`                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ DailyRate__465_802                   <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ DailyRate__802_1157                  <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ DailyRate__1157_Inf                  <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, …
## $ Department__Human_Resources          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Department__Research_&_Development` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Department__Sales                    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `DistanceFromHome__-Inf_2`           <dbl> 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__2_7                <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ DistanceFromHome__7_14               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__14_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ Education__1                         <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Education__2                         <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ Education__3                         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, …
## $ Education__4                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ Education__5                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Human_Resources      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Life_Sciences        <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ EducationField__Marketing            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Medical              <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, …
## $ EducationField__Other                <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Technical_Degree     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__1           <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ EnvironmentSatisfaction__2           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__3           <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, …
## $ EnvironmentSatisfaction__4           <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ Gender__Female                       <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ Gender__Male                         <dbl> 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ `HourlyRate__-Inf_48`                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ HourlyRate__48_66                    <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ HourlyRate__66_83.75                 <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ HourlyRate__83.75_Inf                <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ JobInvolvement__1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobInvolvement__2                    <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobInvolvement__3                    <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ JobInvolvement__4                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ JobLevel__1                          <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ JobLevel__2                          <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobLevel__3                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobLevel__4                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobLevel__5                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Healthcare_Representative   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobRole__Human_Resources             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Laboratory_Technician       <dbl> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, …
## $ JobRole__Manager                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Manufacturing_Director      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobRole__Research_Director           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Research_Scientist          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Executive             <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Representative        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobSatisfaction__1                   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ JobSatisfaction__2                   <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ JobSatisfaction__3                   <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, …
## $ JobSatisfaction__4                   <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ MaritalStatus__Divorced              <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ MaritalStatus__Married               <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, …
## $ MaritalStatus__Single                <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ `MonthlyIncome__-Inf_2911`           <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ MonthlyIncome__2911_4919             <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ MonthlyIncome__4919_8379             <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ MonthlyIncome__8379_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `MonthlyRate__-Inf_8047`             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ MonthlyRate__8047_14235.5            <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
## $ MonthlyRate__14235.5_20461.5         <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, …
## $ MonthlyRate__20461.5_Inf             <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ `NumCompaniesWorked__-Inf_1`         <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ NumCompaniesWorked__1_2              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ NumCompaniesWorked__2_4              <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ NumCompaniesWorked__4_Inf            <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ OverTime__No                         <dbl> 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ OverTime__Yes                        <dbl> 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ `PercentSalaryHike__-Inf_12`         <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__12_14             <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, …
## $ PercentSalaryHike__14_18             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__18_Inf            <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ PerformanceRating__3                 <dbl> 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, …
## $ PerformanceRating__4                 <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ RelationshipSatisfaction__1          <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ RelationshipSatisfaction__2          <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, …
## $ RelationshipSatisfaction__3          <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, …
## $ RelationshipSatisfaction__4          <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ StockOptionLevel__0                  <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ StockOptionLevel__1                  <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ StockOptionLevel__2                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ StockOptionLevel__3                  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `TotalWorkingYears__-Inf_6`          <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ TotalWorkingYears__6_10              <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ TotalWorkingYears__10_15             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ TotalWorkingYears__15_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `TrainingTimesLastYear__-Inf_2`      <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, …
## $ TrainingTimesLastYear__2_3           <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, …
## $ TrainingTimesLastYear__3_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ WorkLifeBalance__1                   <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WorkLifeBalance__2                   <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
## $ WorkLifeBalance__3                   <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, …
## $ WorkLifeBalance__4                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsAtCompany__-Inf_3`             <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsAtCompany__3_5                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsAtCompany__5_9                  <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, …
## $ YearsAtCompany__9_Inf                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsInCurrentRole__-Inf_2`         <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsInCurrentRole__2_3              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ YearsInCurrentRole__3_7              <dbl> 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, …
## $ YearsInCurrentRole__7_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsSinceLastPromotion__-Inf_1`    <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, …
## $ YearsSinceLastPromotion__1_3         <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ YearsSinceLastPromotion__3_Inf       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `YearsWithCurrManager__-Inf_2`       <dbl> 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsWithCurrManager__2_3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsWithCurrManager__3_7            <dbl> 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ YearsWithCurrManager__7_Inf          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
# Step 2: Correlation
data_correlation <- data_binarized %>%
  correlate(Attrition__Left)

#Step #3: Plot
data_correlation %>%
  correlationfunnel::plot_correlation_funnel()
## Warning: ggrepel: 73 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#Model Building

##Split data

set.seed(1234)

#data_clean <- data_clean %>% sample_n(100)
data_clean <- data_clean %>% group_by(Attrition) %>% sample_n(237)


data_split <- initial_split(data_clean, strata = Attrition)
data_train<- training(data_split)
data_test <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, strata = Attrition)

##Preprocess Data

xgboost_recipe <- recipes::recipe(Attrition ~ ., data = data_train) %>%
  update_role(EmployeeNumber, new_role = "ID") %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(Attrition)
  

xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 354
## Columns: 64
## $ Age                               <dbl> -1.729587358, -1.628351590, -1.72958…
## $ DailyRate                         <dbl> -1.16554775, 1.35846814, -0.68648525…
## $ DistanceFromHome                  <dbl> -0.949750119, 0.008800251, 0.0088002…
## $ EmployeeNumber                    <dbl> 243, 701, 1248, 918, 1106, 1389, 110…
## $ HourlyRate                        <dbl> -0.85811592, -1.59352514, 1.54422087…
## $ MonthlyIncome                     <dbl> -1.09295197, -1.11353367, -0.9254213…
## $ MonthlyRate                       <dbl> -0.67723954, 1.85302368, -1.11794818…
## $ NumCompaniesWorked                <dbl> -0.757156895, -0.757156895, -0.75715…
## $ PercentSalaryHike                 <dbl> 1.85344007, -1.10446798, 2.66014226,…
## $ TotalWorkingYears                 <dbl> -1.19295897, -1.19295897, -1.1929589…
## $ TrainingTimesLastYear             <dbl> 0.2535142, 1.8142799, -0.5268686, 0.…
## $ YearsAtCompany                    <dbl> -0.8538020, -0.8538020, -0.8538020, …
## $ YearsInCurrentRole                <dbl> -1.06714489, -1.06714489, -0.7805217…
## $ YearsSinceLastPromotion           <dbl> -0.3905640, -0.3905640, -0.7047905, …
## $ YearsWithCurrManager              <dbl> -1.0200048, -0.7341127, -1.0200048, …
## $ Attrition                         <fct> Left, Left, Left, Left, Left, Left, …
## $ BusinessTravel_Travel_Frequently  <dbl> -0.517743, -0.517743, -0.517743, 1.9…
## $ BusinessTravel_Travel_Rarely      <dbl> 0.6396869, 0.6396869, -1.5588487, -1…
## $ Department_Research...Development <dbl> 0.7981755, 0.7981755, 0.7981755, 0.7…
## $ Department_Sales                  <dbl> -0.7241385, -0.7241385, -0.7241385, …
## $ Education_X2                      <dbl> -0.4374666, -0.4374666, -0.4374666, …
## $ Education_X3                      <dbl> 1.1919031, -0.8366243, 1.1919031, -0…
## $ Education_X4                      <dbl> -0.6178439, -0.6178439, -0.6178439, …
## $ Education_X5                      <dbl> -0.1702576, -0.1702576, -0.1702576, …
## $ EducationField_Life.Sciences      <dbl> 1.2346066, -0.8076866, -0.8076866, 1…
## $ EducationField_Marketing          <dbl> -0.3713127, -0.3713127, -0.3713127, …
## $ EducationField_Medical            <dbl> -0.683780, 1.458327, 1.458327, -0.68…
## $ EducationField_Other              <dbl> -0.1949756, -0.1949756, -0.1949756, …
## $ EducationField_Technical.Degree   <dbl> -0.3411591, -0.3411591, -0.3411591, …
## $ EnvironmentSatisfaction_X2        <dbl> 1.9591940, -0.5089721, -0.5089721, -…
## $ EnvironmentSatisfaction_X3        <dbl> -0.6484563, -0.6484563, -0.6484563, …
## $ EnvironmentSatisfaction_X4        <dbl> -0.6309372, 1.5804665, -0.6309372, 1…
## $ Gender_Male                       <dbl> 0.8317572, 0.8317572, -1.1988776, 0.…
## $ JobInvolvement_X2                 <dbl> 1.5914851, -0.6265689, 1.5914851, 1.…
## $ JobInvolvement_X3                 <dbl> -1.0871234, 0.9172603, -1.0871234, -…
## $ JobInvolvement_X4                 <dbl> -0.320177, -0.320177, -0.320177, -0.…
## $ JobLevel_X2                       <dbl> -0.6616526, -0.6616526, -0.6616526, …
## $ JobLevel_X3                       <dbl> -0.4236644, -0.4236644, -0.4236644, …
## $ JobLevel_X4                       <dbl> -0.2752693, -0.2752693, -0.2752693, …
## $ JobLevel_X5                       <dbl> -0.1702576, -0.1702576, -0.1702576, …
## $ JobRole_Human.Resources           <dbl> -0.2172638, -0.2172638, -0.2172638, …
## $ JobRole_Laboratory.Technician     <dbl> 2.0479261, -0.4869195, -0.4869195, 2…
## $ JobRole_Manager                   <dbl> -0.2507687, -0.2507687, -0.2507687, …
## $ JobRole_Manufacturing.Director    <dbl> -0.298293, -0.298293, -0.298293, -0.…
## $ JobRole_Research.Director         <dbl> -0.2100543, -0.2100543, -0.2100543, …
## $ JobRole_Research.Scientist        <dbl> -0.4646324, 2.1461592, 2.1461592, -0…
## $ JobRole_Sales.Executive           <dbl> -0.5308581, -0.5308581, -0.5308581, …
## $ JobRole_Sales.Representative      <dbl> -0.3359877, -0.3359877, -0.3359877, …
## $ JobSatisfaction_X2                <dbl> -0.4601402, -0.4601402, 2.1671117, -…
## $ JobSatisfaction_X3                <dbl> -0.7106001, 1.4032859, -0.7106001, -…
## $ JobSatisfaction_X4                <dbl> 1.7102769, -0.5830489, -0.5830489, 1…
## $ MaritalStatus_Married             <dbl> -0.9017102, -0.9017102, -0.9017102, …
## $ MaritalStatus_Single              <dbl> 1.2795129, 1.2795129, 1.2795129, -0.…
## $ OverTime_Yes                      <dbl> -0.7840262, 1.2718646, 1.2718646, 1.…
## $ PerformanceRating_X4              <dbl> 2.2794314, -0.4374666, 2.2794314, -0…
## $ RelationshipSatisfaction_X2       <dbl> -0.517743, -0.517743, 1.926004, -0.5…
## $ RelationshipSatisfaction_X3       <dbl> 1.5070977, -0.6616526, -0.6616526, -…
## $ RelationshipSatisfaction_X4       <dbl> -0.5743627, 1.7361418, -0.5743627, 1…
## $ StockOptionLevel_X1               <dbl> -0.7423412, -0.7423412, -0.7423412, …
## $ StockOptionLevel_X2               <dbl> -0.3307687, -0.3307687, -0.3307687, …
## $ StockOptionLevel_X3               <dbl> -0.2172638, -0.2172638, -0.2172638, …
## $ WorkLifeBalance_X2                <dbl> 1.8784211, -0.5308581, -0.5308581, 1…
## $ WorkLifeBalance_X3                <dbl> -1.1988776, 0.8317572, -1.1988776, -…
## $ WorkLifeBalance_X4                <dbl> -0.3254994, -0.3254994, 3.0635238, -…

##Specify Model

usemodels::use_xgboost(Attrition ~., data = data_train)
## Adding missing grouping variables: `Attrition`
## xgboost_recipe <- 
##   recipe(formula = Attrition ~ ., data = data_train) %>% 
##   step_string2factor(one_of(NA_character_)) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(11091)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_spec <- 
  boost_tree(trees = tune(), tree_depth = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

##Tune Hyperparameters

tree_grid <- grid_regular(trees(),
                          tree_depth(),
                          levels = 5)
doParallel::registerDoParallel()

set.seed(47927)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = data_cv, grid = 5,
              control = control_grid(save_pred = TRUE ))
## Warning: ! tune detected a parallel backend registered with foreach but no backend
##   registered with future.
## ℹ Support for parallel processing with foreach was soft-deprecated in tune
##   1.2.1.
## ℹ See ?parallelism (`?tune::parallelism()`) to learn more.

#Model Evaluation

Identify Optimal values for hyperparameters

collect_metrics(xgboost_tune)
## # A tibble: 15 × 8
##    trees tree_depth .metric     .estimator  mean     n std_err .config          
##    <int>      <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>            
##  1  1000          1 accuracy    binary     0.656    10 0.0286  Preprocessor1_Mo…
##  2  1000          1 brier_class binary     0.255    10 0.0249  Preprocessor1_Mo…
##  3  1000          1 roc_auc     binary     0.712    10 0.0401  Preprocessor1_Mo…
##  4     1          4 accuracy    binary     0.650    10 0.0270  Preprocessor1_Mo…
##  5     1          4 brier_class binary     0.227    10 0.00475 Preprocessor1_Mo…
##  6     1          4 roc_auc     binary     0.686    10 0.0292  Preprocessor1_Mo…
##  7  2000          8 accuracy    binary     0.673    10 0.0426  Preprocessor1_Mo…
##  8  2000          8 brier_class binary     0.252    10 0.0333  Preprocessor1_Mo…
##  9  2000          8 roc_auc     binary     0.744    10 0.0472  Preprocessor1_Mo…
## 10   500         11 accuracy    binary     0.641    10 0.0430  Preprocessor1_Mo…
## 11   500         11 brier_class binary     0.265    10 0.0324  Preprocessor1_Mo…
## 12   500         11 roc_auc     binary     0.733    10 0.0467  Preprocessor1_Mo…
## 13  1500         15 accuracy    binary     0.647    10 0.0425  Preprocessor1_Mo…
## 14  1500         15 brier_class binary     0.265    10 0.0331  Preprocessor1_Mo…
## 15  1500         15 roc_auc     binary     0.735    10 0.0470  Preprocessor1_Mo…
collect_predictions(xgboost_tune) %>%
  group_by(id)%>%
  roc_curve(Attrition, .pred_Left) %>%
  autoplot()

Fit the model for the lat time

xgboost_last <- xgboost_workflow %>%
  finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
  last_fit(data_split)
## Warning: package 'xgboost' was built under R version 4.4.3
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.708 Preprocessor1_Model1
## 2 roc_auc     binary         0.738 Preprocessor1_Model1
## 3 brier_class binary         0.249 Preprocessor1_Model1
collect_predictions(xgboost_last) %>% yardstick::conf_mat(Attrition, .pred_class) %>%
  autoplot()

Variable importance

library(vip)
## Warning: package 'vip' was built under R version 4.4.3
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgboost_last %>%
  workflows::extract_fit_engine() %>%
  vip()

#Conclusion

In the previous model, accuracy was .692 and AUC of .748

Feature transformation: normalize numeric data. It resulted in accuracy of .717 and AUC of .753. This is a slight increase in accuracy and a very slight increase in the AUC.

Feature Transformation: YeoJohnson Transformation. This resulted in an accuracy of .717 and an AUC of .753 which means it made no difference compared to the accuracy of the normalized numeric data.

Feature Selection: PCA. With a threshold of 99% the accuracy goes slightly down to 0.7 but the AUC improves to .795. When trying with the number of new features using numcomp of 3 the accuracy goes down to .608 and the AUC down to .669

Algorithm Tuning: