The goal is to predict attrition, i.e. to identify employees who are likely to leave the company.

Import data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## Warning: package 'correlationfunnel' was built under R version 4.4.2

## ══ correlationfunnel Tip #2 ════════════════════════════════════════════════════
## Clean your NA's prior to using `binarize()`.
## Missing values and cleaning data are critical to getting great correlations. :)

# Load the raw HR attrition data; readr guesses the column types.
data <- readr::read_csv(
  file = "../00_DATA/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)

## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore Data

# Summarise every column (type, completeness, distribution) to spot data issues.
skimr::skim(data)

Data summary
Name	data
Number of rows	1470
Number of columns	35
_______________________
Column type frequency:
character	9
numeric	26
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Attrition	1	2	3	2
BusinessTravel	1	10	17	3
Department	1	5	22	3
EducationField	1	5	16	6
Gender	1	4	6	2
JobRole	1	7	25	9
MaritalStatus	1	6	8	3
Over18	1	1	1	1
OverTime	1	2	3	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	1	36.92	9.14	18	30.00	36.0	43.00	60	▂▇▇▃▂
DailyRate	1	802.49	403.51	102	465.00	802.0	1157.00	1499	▇▇▇▇▇
DistanceFromHome	1	9.19	8.11	1	2.00	7.0	14.00	29	▇▅▂▂▂
Education	1	2.91	1.02	1	2.00	3.0	4.00	5	▂▃▇▆▁
EmployeeCount	1	1.00	0.00	1	1.00	1.0	1.00	1	▁▁▇▁▁
EmployeeNumber	1	1024.87	602.02	1	491.25	1020.5	1555.75	2068	▇▇▇▇▇
EnvironmentSatisfaction	1	2.72	1.09	1	2.00	3.0	4.00	4	▅▅▁▇▇
HourlyRate	1	65.89	20.33	30	48.00	66.0	83.75	100	▇▇▇▇▇
JobInvolvement	1	2.73	0.71	1	2.00	3.0	3.00	4	▁▃▁▇▁
JobLevel	1	2.06	1.11	1	1.00	2.0	3.00	5	▇▇▃▂▁
JobSatisfaction	1	2.73	1.10	1	2.00	3.0	4.00	4	▅▅▁▇▇
MonthlyIncome	1	6502.93	4707.96	1009	2911.00	4919.0	8379.00	19999	▇▅▂▁▂
MonthlyRate	1	14313.10	7117.79	2094	8047.00	14235.5	20461.50	26999	▇▇▇▇▇
NumCompaniesWorked	1	2.69	2.50	0	1.00	2.0	4.00	9	▇▃▂▂▁
PercentSalaryHike	1	15.21	3.66	11	12.00	14.0	18.00	25	▇▅▃▂▁
PerformanceRating	1	3.15	0.36	3	3.00	3.0	3.00	4	▇▁▁▁▂
RelationshipSatisfaction	1	2.71	1.08	1	2.00	3.0	4.00	4	▅▅▁▇▇
StandardHours	1	80.00	0.00	80	80.00	80.0	80.00	80	▁▁▇▁▁
StockOptionLevel	1	0.79	0.85	0	0.00	1.0	1.00	3	▇▇▁▂▁
TotalWorkingYears	1	11.28	7.78	0	6.00	10.0	15.00	40	▇▇▂▁▁
TrainingTimesLastYear	1	2.80	1.29	0	2.00	3.0	3.00	6	▂▇▇▂▃
WorkLifeBalance	1	2.76	0.71	1	2.00	3.0	3.00	4	▁▃▁▇▂
YearsAtCompany	1	7.01	6.13	0	3.00	5.0	9.00	40	▇▂▁▁▁
YearsInCurrentRole	1	4.23	3.62	0	2.00	3.0	7.00	18	▇▃▂▁▁
YearsSinceLastPromotion	1	2.19	3.22	0	0.00	1.0	3.00	15	▇▁▁▁▁
YearsWithCurrManager	1	4.12	3.57	0	2.00	3.0	7.00	17	▇▂▅▁▁

Issues with data

- Missing values: none
- Factors imported as numeric variables: Education, EnvironmentSatisfaction, JobInvolvement, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance
- Zero-variance variables: Over18, EmployeeCount, StandardHours
- Character variables: convert them to numbers
- Unbalanced target variable: Attrition
- ID variable: EmployeeNumber

# Names of the ordinal rating/level columns that are imported as numeric but
# are really categories. RelationshipSatisfaction is included so that every
# 1-4 satisfaction rating is handled the same way; previously it was the only
# one left numeric. select() fails early if a column name is misspelled.
factors_vec <- data %>%
  select(
    Education, EnvironmentSatisfaction, JobInvolvement, JobSatisfaction,
    PerformanceRating, RelationshipSatisfaction, WorkLifeBalance,
    JobLevel, StockOptionLevel
  ) %>%
  names()

data_clean <- data %>%
  # Remove the columns that hold a single value in every row.
  select(-Over18, -EmployeeCount, -StandardHours) %>%
  mutate(
    # Treat the ordinal codes named in factors_vec as categories.
    across(all_of(factors_vec), as.factor),
    # Relabel the positive class: "Yes" becomes "Left", other values are kept.
    Attrition = if_else(Attrition == "Yes", "Left", Attrition)
  )

Explore Data

# Class balance of the target variable.
count(data_clean, Attrition)

## # A tibble: 2 × 2
##   Attrition     n
##   <chr>     <int>
## 1 Left        237
## 2 No         1233

# Bar chart of the class counts to visualise the imbalance.
ggplot(data_clean, aes(x = Attrition)) +
  geom_bar()

attrition vs. monthly income

# Distribution of monthly income for each attrition class.
ggplot(data_clean, aes(x = Attrition, y = MonthlyIncome)) +
  geom_boxplot()

correlation plot

# Step 1: binarize. Drop the ID column, then turn every remaining column into
# 0/1 indicator columns (numeric columns are split into bins, categorical
# columns get one indicator per level).
data_binarized <- binarize(select(data_clean, -EmployeeNumber))

glimpse(data_binarized)

## Rows: 1,470
## Columns: 120
## $ `Age__-Inf_30`                       <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Age__30_36                           <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, …
## $ Age__36_43                           <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ Age__43_Inf                          <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ Attrition__Left                      <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Attrition__No                        <dbl> 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `BusinessTravel__Non-Travel`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BusinessTravel__Travel_Frequently    <dbl> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ BusinessTravel__Travel_Rarely        <dbl> 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ `DailyRate__-Inf_465`                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ DailyRate__465_802                   <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ DailyRate__802_1157                  <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ DailyRate__1157_Inf                  <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, …
## $ Department__Human_Resources          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Department__Research_&_Development` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Department__Sales                    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `DistanceFromHome__-Inf_2`           <dbl> 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__2_7                <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ DistanceFromHome__7_14               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__14_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ Education__1                         <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Education__2                         <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ Education__3                         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, …
## $ Education__4                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ Education__5                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Human_Resources      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Life_Sciences        <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ EducationField__Marketing            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Medical              <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, …
## $ EducationField__Other                <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Technical_Degree     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__1           <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ EnvironmentSatisfaction__2           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__3           <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, …
## $ EnvironmentSatisfaction__4           <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ Gender__Female                       <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ Gender__Male                         <dbl> 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ `HourlyRate__-Inf_48`                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ HourlyRate__48_66                    <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ HourlyRate__66_83.75                 <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ HourlyRate__83.75_Inf                <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ JobInvolvement__1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobInvolvement__2                    <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobInvolvement__3                    <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ JobInvolvement__4                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ JobLevel__1                          <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ JobLevel__2                          <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobLevel__3                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobLevel__4                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobLevel__5                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Healthcare_Representative   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobRole__Human_Resources             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Laboratory_Technician       <dbl> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, …
## $ JobRole__Manager                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Manufacturing_Director      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobRole__Research_Director           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Research_Scientist          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Executive             <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Representative        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobSatisfaction__1                   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ JobSatisfaction__2                   <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ JobSatisfaction__3                   <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, …
## $ JobSatisfaction__4                   <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ MaritalStatus__Divorced              <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ MaritalStatus__Married               <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, …
## $ MaritalStatus__Single                <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ `MonthlyIncome__-Inf_2911`           <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ MonthlyIncome__2911_4919             <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ MonthlyIncome__4919_8379             <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ MonthlyIncome__8379_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `MonthlyRate__-Inf_8047`             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ MonthlyRate__8047_14235.5            <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
## $ MonthlyRate__14235.5_20461.5         <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, …
## $ MonthlyRate__20461.5_Inf             <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ `NumCompaniesWorked__-Inf_1`         <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ NumCompaniesWorked__1_2              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ NumCompaniesWorked__2_4              <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ NumCompaniesWorked__4_Inf            <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ OverTime__No                         <dbl> 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ OverTime__Yes                        <dbl> 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ `PercentSalaryHike__-Inf_12`         <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__12_14             <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, …
## $ PercentSalaryHike__14_18             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__18_Inf            <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ PerformanceRating__3                 <dbl> 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, …
## $ PerformanceRating__4                 <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ RelationshipSatisfaction__1          <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ RelationshipSatisfaction__2          <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, …
## $ RelationshipSatisfaction__3          <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, …
## $ RelationshipSatisfaction__4          <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ StockOptionLevel__0                  <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ StockOptionLevel__1                  <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ StockOptionLevel__2                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ StockOptionLevel__3                  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `TotalWorkingYears__-Inf_6`          <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ TotalWorkingYears__6_10              <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ TotalWorkingYears__10_15             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ TotalWorkingYears__15_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `TrainingTimesLastYear__-Inf_2`      <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, …
## $ TrainingTimesLastYear__2_3           <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, …
## $ TrainingTimesLastYear__3_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ WorkLifeBalance__1                   <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WorkLifeBalance__2                   <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
## $ WorkLifeBalance__3                   <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, …
## $ WorkLifeBalance__4                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsAtCompany__-Inf_3`             <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsAtCompany__3_5                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsAtCompany__5_9                  <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, …
## $ YearsAtCompany__9_Inf                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsInCurrentRole__-Inf_2`         <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsInCurrentRole__2_3              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ YearsInCurrentRole__3_7              <dbl> 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, …
## $ YearsInCurrentRole__7_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsSinceLastPromotion__-Inf_1`    <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, …
## $ YearsSinceLastPromotion__1_3         <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ YearsSinceLastPromotion__3_Inf       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `YearsWithCurrManager__-Inf_2`       <dbl> 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsWithCurrManager__2_3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsWithCurrManager__3_7            <dbl> 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ YearsWithCurrManager__7_Inf          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …

# Step 2: correlate every indicator column with the "Left" indicator.
data_correlation <- correlate(data_binarized, Attrition__Left)

data_correlation

## # A tibble: 120 × 3
##    feature           bin       correlation
##    <fct>             <chr>           <dbl>
##  1 Attrition         Left            1    
##  2 Attrition         No             -1    
##  3 OverTime          No             -0.246
##  4 OverTime          Yes             0.246
##  5 JobLevel          1               0.213
##  6 MonthlyIncome     -Inf_2911       0.207
##  7 StockOptionLevel  0               0.195
##  8 YearsAtCompany    -Inf_3          0.183
##  9 MaritalStatus     Single          0.175
## 10 TotalWorkingYears -Inf_6          0.169
## # ℹ 110 more rows

# Step 3: draw the correlation funnel from the correlation table.
correlationfunnel::plot_correlation_funnel(data_correlation)

## Warning: ggrepel: 73 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Split Data

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.4.2

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.2
## ✔ recipes      1.1.0

## Warning: package 'dials' was built under R version 4.4.2

## Warning: package 'infer' was built under R version 4.4.2

## Warning: package 'modeldata' was built under R version 4.4.2

## Warning: package 'parsnip' was built under R version 4.4.2

## Warning: package 'tune' was built under R version 4.4.2

## Warning: package 'workflows' was built under R version 4.4.2

## Warning: package 'workflowsets' was built under R version 4.4.3

## Warning: package 'yardstick' was built under R version 4.4.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

# Fix the RNG state so the subsample, the split and the folds are reproducible.
set.seed(1234)

# NOTE(review): this overwrites the raw `data` with a 100-row subsample, but
# the split below is built from data_clean, so the subsample is not used in
# the visible code. It presumably still consumes random draws, so removing it
# would change the split -- confirm before deleting.
data <- sample_n(data_clean, 100)

# Stratified train/test split; the default 3/4 training share is made explicit.
data_split <- initial_split(data_clean, prop = 3 / 4, strata = Attrition)
data_train <- training(data_split)
data_test <- testing(data_split)

# Stratified 10-fold cross-validation on the training set.
data_cv <- rsample::vfold_cv(data_train, v = 10, strata = Attrition)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits            id    
##    <list>            <chr> 
##  1 <split [990/111]> Fold01
##  2 <split [990/111]> Fold02
##  3 <split [990/111]> Fold03
##  4 <split [990/111]> Fold04
##  5 <split [991/110]> Fold05
##  6 <split [991/110]> Fold06
##  7 <split [991/110]> Fold07
##  8 <split [992/109]> Fold08
##  9 <split [992/109]> Fold09
## 10 <split [992/109]> Fold10

Preprocess data

library(themis)

## Warning: package 'themis' was built under R version 4.4.3

# Preprocessing recipe, estimated on the training data:
#   1. keep EmployeeNumber in the data but give it the "ID" role so it is not
#      treated as a predictor;
#   2. dummy-encode all nominal predictors;
#   3. centre and scale all numeric predictors (this includes the dummy
#      columns created in step 2);
#   4. oversample the minority Attrition class with SMOTE.
data_rec <- recipes::recipe(Attrition ~ ., data = data_train) %>%
  update_role(EmployeeNumber, new_role = "ID") %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(Attrition)

# Preview the processed training set. bake(new_data = NULL) is the current
# replacement for the superseded juice() and returns the same data.
data_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()

## Rows: 1,848
## Columns: 62
## $ Age                               <dbl> -0.9841381, -0.1045553, -0.3244510, …
## $ DailyRate                         <dbl> -1.7333405, 1.0374489, -0.2522728, 0…
## $ DistanceFromHome                  <dbl> 1.77013307, -0.04281252, -0.40540164…
## $ EmployeeNumber                    <dbl> 19, 27, 31, 33, 42, 47, 55, 58, 64, …
## $ HourlyRate                        <dbl> -0.7955162930, 0.7943430317, 0.84402…
## $ MonthlyIncome                     <dbl> -0.94942556, -0.65500781, -0.7504427…
## $ MonthlyRate                       <dbl> -0.17861854, -1.01597110, 0.40504190…
## $ NumCompaniesWorked                <dbl> 0.9389874, 1.7450889, -0.2701648, -0…
## $ PercentSalaryHike                 <dbl> -0.32630180, 2.13812237, -1.14777652…
## $ RelationshipSatisfaction          <dbl> -0.6544526, -0.6544526, 0.2789053, -…
## $ TotalWorkingYears                 <dbl> -0.67952021, -0.16488444, -0.4222023…
## $ TrainingTimesLastYear             <dbl> 0.9318008, 0.9318008, -0.6368730, 1.…
## $ YearsAtCompany                    <dbl> -0.4870091001, -0.3248690882, -0.487…
## $ YearsInCurrentRole                <dbl> -0.61452407, -0.33948708, -0.6145240…
## $ YearsSinceLastPromotion           <dbl> -0.67919910, -0.67919910, -0.3622011…
## $ YearsWithCurrManager              <dbl> -0.30817101, -0.30817101, -0.3081710…
## $ Attrition                         <fct> Left, Left, Left, Left, Left, Left, …
## $ BusinessTravel_Travel_Frequently  <dbl> -0.4781076, -0.4781076, -0.4781076, …
## $ BusinessTravel_Travel_Rarely      <dbl> 0.6285524, 0.6285524, 0.6285524, -1.…
## $ Department_Research...Development <dbl> 0.716921, -1.393587, 0.716921, 0.716…
## $ Department_Sales                  <dbl> -0.6525172, 1.5311346, -0.6525172, -…
## $ Education_X2                      <dbl> -0.4824015, -0.4824015, -0.4824015, …
## $ Education_X3                      <dbl> 1.2533961, -0.7971078, -0.7971078, -…
## $ Education_X4                      <dbl> -0.6159175, 1.6221194, -0.6159175, -…
## $ Education_X5                      <dbl> -0.1701342, -0.1701342, -0.1701342, …
## $ EducationField_Life.Sciences      <dbl> 1.2090320, 1.2090320, -0.8263567, 1.…
## $ EducationField_Marketing          <dbl> -0.3512225, -0.3512225, -0.3512225, …
## $ EducationField_Medical            <dbl> -0.6938248, -0.6938248, 1.4399771, -…
## $ EducationField_Other              <dbl> -0.2462489, -0.2462489, -0.2462489, …
## $ EducationField_Technical.Degree   <dbl> -0.3071606, -0.3071606, -0.3071606, …
## $ EnvironmentSatisfaction_X2        <dbl> -0.5037416, -0.5037416, 1.9833415, 1…
## $ EnvironmentSatisfaction_X3        <dbl> 1.5179630, 1.5179630, -0.6581793, -0…
## $ EnvironmentSatisfaction_X4        <dbl> -0.6638515, -0.6638515, -0.6638515, …
## $ Gender_Male                       <dbl> 0.8216966, 0.8216966, 0.8216966, -1.…
## $ JobInvolvement_X2                 <dbl> 1.7074832, 1.7074832, -0.5851254, -0…
## $ JobInvolvement_X3                 <dbl> -1.1977214, -1.1977214, 0.8341604, -…
## $ JobInvolvement_X4                 <dbl> -0.3245396, -0.3245396, -0.3245396, …
## $ JobLevel_X2                       <dbl> -0.7565295, -0.7565295, -0.7565295, …
## $ JobLevel_X3                       <dbl> -0.4136678, -0.4136678, -0.4136678, …
## $ JobLevel_X4                       <dbl> -0.2854091, -0.2854091, -0.2854091, …
## $ JobLevel_X5                       <dbl> -0.2110723, -0.2110723, -0.2110723, …
## $ JobRole_Human.Resources           <dbl> -0.1915458, -0.1915458, -0.1915458, …
## $ JobRole_Laboratory.Technician     <dbl> 2.1022910, -0.4752395, -0.4752395, -…
## $ JobRole_Manager                   <dbl> -0.2643999, -0.2643999, -0.2643999, …
## $ JobRole_Manufacturing.Director    <dbl> -0.3380364, -0.3380364, -0.3380364, …
## $ JobRole_Research.Director         <dbl> -0.2462489, -0.2462489, -0.2462489, …
## $ JobRole_Research.Scientist        <dbl> -0.4909619, -0.4909619, 2.0349681, 2…
## $ JobRole_Sales.Executive           <dbl> -0.5277239, -0.5277239, -0.5277239, …
## $ JobRole_Sales.Representative      <dbl> -0.2441691, 4.0918020, -0.2441691, -…
## $ JobSatisfaction_X2                <dbl> -0.4781076, -0.4781076, -0.4781076, …
## $ JobSatisfaction_X3                <dbl> 1.5017810, -0.6652713, -0.6652713, -…
## $ JobSatisfaction_X4                <dbl> -0.6581793, -0.6581793, -0.6581793, …
## $ MaritalStatus_Married             <dbl> -0.9116968, -0.9116968, -0.9116968, …
## $ MaritalStatus_Single              <dbl> 1.4459585, 1.4459585, 1.4459585, 1.4…
## $ OverTime_Yes                      <dbl> 1.618434, -0.617320, -0.617320, 1.61…
## $ PerformanceRating_X4              <dbl> -0.4345264, 2.2992657, -0.4345264, 2…
## $ StockOptionLevel_X1               <dbl> -0.8435871, -0.8435871, -0.8435871, …
## $ StockOptionLevel_X2               <dbl> -0.3463115, -0.3463115, -0.3463115, …
## $ StockOptionLevel_X3               <dbl> -0.2335554, -0.2335554, -0.2335554, …
## $ WorkLifeBalance_X2                <dbl> -0.5501667, -0.5501667, -0.5501667, …
## $ WorkLifeBalance_X3                <dbl> 0.7955852, 0.7955852, 0.7955852, 0.7…
## $ WorkLifeBalance_X4                <dbl> -0.3330138, -0.3330138, -0.3330138, …

Specify Model

# Boosted-tree classifier on the xgboost engine; the number of trees and the
# tree depth are left as placeholders to be tuned.
xgboost_spec <- boost_tree(
  mode = "classification",
  engine = "xgboost",
  trees = tune(),
  tree_depth = tune()
)

# Bundle the preprocessing recipe and the model specification into a workflow.
xgboost_workflow <- workflow(preprocessor = data_rec, spec = xgboost_spec)

Tune hyperparameters

# Regular grid with 5 levels each for the number of trees and the tree depth.
# NOTE(review): tree_grid is not passed to tune_grid() in the visible code
# (which uses grid = 5 instead) -- confirm whether it is still needed.
tree_grid <- grid_regular(
  trees(),
  tree_depth(),
  levels = 5
)

# Register a doParallel backend with its default settings, presumably so the
# resampled fits during tuning can run in parallel -- TODO confirm that the
# installed tune version still uses the foreach backend.
doParallel::registerDoParallel()

# Seed the RNG so the tuning run is reproducible.
set.seed(27809)

# Evaluate 5 automatically generated hyperparameter candidates on every CV
# fold, keeping the out-of-fold predictions for later inspection.
xgboost_tune <- xgboost_workflow %>%
  tune_grid(
    resamples = data_cv,
    grid = 5,
    control = control_grid(save_pred = TRUE)
  )

Model evaluation

Identify optimal value for hyperparameters

# Cross-validated metrics for every hyperparameter candidate.
xgboost_tune %>%
  collect_metrics()

## # A tibble: 15 × 8
##    trees tree_depth .metric     .estimator  mean     n std_err .config          
##    <int>      <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>            
##  1  1003         12 accuracy    binary     0.869    10 0.00934 Preprocessor1_Mo…
##  2  1003         12 brier_class binary     0.109    10 0.00773 Preprocessor1_Mo…
##  3  1003         12 roc_auc     binary     0.793    10 0.0237  Preprocessor1_Mo…
##  4  1889         12 accuracy    binary     0.872    10 0.00908 Preprocessor1_Mo…
##  5  1889         12 brier_class binary     0.111    10 0.00760 Preprocessor1_Mo…
##  6  1889         12 roc_auc     binary     0.791    10 0.0232  Preprocessor1_Mo…
##  7  1206          4 accuracy    binary     0.869    10 0.0106  Preprocessor1_Mo…
##  8  1206          4 brier_class binary     0.114    10 0.00797 Preprocessor1_Mo…
##  9  1206          4 roc_auc     binary     0.776    10 0.0249  Preprocessor1_Mo…
## 10   388          7 accuracy    binary     0.865    10 0.00920 Preprocessor1_Mo…
## 11   388          7 brier_class binary     0.111    10 0.00821 Preprocessor1_Mo…
## 12   388          7 roc_auc     binary     0.788    10 0.0223  Preprocessor1_Mo…
## 13   622          9 accuracy    binary     0.868    10 0.0111  Preprocessor1_Mo…
## 14   622          9 brier_class binary     0.112    10 0.00868 Preprocessor1_Mo…
## 15   622          9 roc_auc     binary     0.777    10 0.0239  Preprocessor1_Mo…

# Cross-validated ROC curves, one per fold, for a single candidate model.
# The tuning results hold out-of-fold predictions for every candidate that was
# tried; without a filter they are all pooled into each fold's curve. Restrict
# them to the candidate that select_best() picks by accuracy, the same
# criterion used for the final fit.
collect_predictions(
  xgboost_tune,
  parameters = select_best(xgboost_tune, metric = "accuracy")
) %>%
  group_by(id) %>%
  roc_curve(Attrition, .pred_Left) %>%
  autoplot()

Fit the model for the last time

# Plug the best-by-accuracy hyperparameters into the workflow, then run the
# final fit and evaluation on the original train/test split.
xgboost_last <- last_fit(
  finalize_workflow(
    xgboost_workflow,
    select_best(xgboost_tune, metric = "accuracy")
  ),
  split = data_split
)

## Warning: package 'xgboost' was built under R version 4.4.2

# Metrics of the final model.
xgboost_last %>%
  collect_metrics()

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.859 Preprocessor1_Model1
## 2 roc_auc     binary         0.800 Preprocessor1_Model1
## 3 brier_class binary         0.121 Preprocessor1_Model1

# Confusion matrix of the final model's class predictions.
xgboost_last %>%
  collect_predictions() %>%
  yardstick::conf_mat(truth = Attrition, estimate = .pred_class) %>%
  autoplot()

Variable importance

library(vip)

## Warning: package 'vip' was built under R version 4.4.3

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

# Variable-importance plot from the fitted engine object of the final model.
vip(workflows::extract_fit_engine(xgboost_last))

Conclusion

The previous model had an accuracy of 0.851 and an AUC of 0.753.

Feature transformation: normalized numeric data. It resulted in a slight improvement, with an accuracy of 0.859 and an AUC of 0.800 (see the final metrics above).
Feature transformation: YeoJohnson transformation. No improvement
Feature selection: PCA didn’t make an improvement.

Code along 5

Cam Paquette

2025-03-04

Import data

Explore Data

Explore Data

Model Building

Split Data

Preprocess data

Specify Model

Tune hyperparameters

Model evaluation

Identify optimal value for hyperparameters

Fit the model for the last time

Variable importance

Conclusion