Goal is to predict attrition, employees who are likely to leave the company.

Import Data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(correlationfunnel)

## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>

data <- read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean Data

skimr::skim(data)

Data summary
Name	data
Number of rows	1470
Number of columns	35
_______________________
Column type frequency:
character	9
numeric	26
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Attrition	1	2	3	2
BusinessTravel	1	10	17	3
Department	1	5	22	3
EducationField	1	5	16	6
Gender	1	4	6	2
JobRole	1	7	25	9
MaritalStatus	1	6	8	3
Over18	1	1	1	1
OverTime	1	2	3	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	1	36.92	9.14	18	30.00	36.0	43.00	60	▂▇▇▃▂
DailyRate	1	802.49	403.51	102	465.00	802.0	1157.00	1499	▇▇▇▇▇
DistanceFromHome	1	9.19	8.11	1	2.00	7.0	14.00	29	▇▅▂▂▂
Education	1	2.91	1.02	1	2.00	3.0	4.00	5	▂▃▇▆▁
EmployeeCount	1	1.00	0.00	1	1.00	1.0	1.00	1	▁▁▇▁▁
EmployeeNumber	1	1024.87	602.02	1	491.25	1020.5	1555.75	2068	▇▇▇▇▇
EnvironmentSatisfaction	1	2.72	1.09	1	2.00	3.0	4.00	4	▅▅▁▇▇
HourlyRate	1	65.89	20.33	30	48.00	66.0	83.75	100	▇▇▇▇▇
JobInvolvement	1	2.73	0.71	1	2.00	3.0	3.00	4	▁▃▁▇▁
JobLevel	1	2.06	1.11	1	1.00	2.0	3.00	5	▇▇▃▂▁
JobSatisfaction	1	2.73	1.10	1	2.00	3.0	4.00	4	▅▅▁▇▇
MonthlyIncome	1	6502.93	4707.96	1009	2911.00	4919.0	8379.00	19999	▇▅▂▁▂
MonthlyRate	1	14313.10	7117.79	2094	8047.00	14235.5	20461.50	26999	▇▇▇▇▇
NumCompaniesWorked	1	2.69	2.50	0	1.00	2.0	4.00	9	▇▃▂▂▁
PercentSalaryHike	1	15.21	3.66	11	12.00	14.0	18.00	25	▇▅▃▂▁
PerformanceRating	1	3.15	0.36	3	3.00	3.0	3.00	4	▇▁▁▁▂
RelationshipSatisfaction	1	2.71	1.08	1	2.00	3.0	4.00	4	▅▅▁▇▇
StandardHours	1	80.00	0.00	80	80.00	80.0	80.00	80	▁▁▇▁▁
StockOptionLevel	1	0.79	0.85	0	0.00	1.0	1.00	3	▇▇▁▂▁
TotalWorkingYears	1	11.28	7.78	0	6.00	10.0	15.00	40	▇▇▂▁▁
TrainingTimesLastYear	1	2.80	1.29	0	2.00	3.0	3.00	6	▂▇▇▂▃
WorkLifeBalance	1	2.76	0.71	1	2.00	3.0	3.00	4	▁▃▁▇▂
YearsAtCompany	1	7.01	6.13	0	3.00	5.0	9.00	40	▇▂▁▁▁
YearsInCurrentRole	1	4.23	3.62	0	2.00	3.0	7.00	18	▇▃▂▁▁
YearsSinceLastPromotion	1	2.19	3.22	0	0.00	1.0	3.00	15	▇▁▁▁▁
YearsWithCurrManager	1	4.12	3.57	0	2.00	3.0	7.00	17	▇▂▅▁▁

Issues with data: - Missing values - Factors or numeric variables - Education, EnvironmentSatisfaction, JobInvolvement, JobSatisfaction, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance - Zero variance variables - Over18, EmployeeCount, StandardHours - Character Variables: Convert them to numbers in the recipe steps - Unbalanced target variable: Attrition - ID Variable: EmployeeNumber

factors_vec <- data %>% select(Education, EnvironmentSatisfaction, JobInvolvement, JobSatisfaction, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance, JobLevel, StockOptionLevel) %>% names()


data_clean <- data %>%
    
    # Address factors imported as numeric
    mutate(across(all_of(factors_vec), as.factor)) %>%
    
    # Drop zero variance variables
    select(-c(Over18, EmployeeCount, StandardHours)) %>%

    # Recode Attrition
    mutate(Attrition = if_else(Attrition == "Yes", "Left", Attrition))

Explore data

data_clean %>% count(Attrition)

## # A tibble: 2 × 2
##   Attrition     n
##   <chr>     <int>
## 1 Left        237
## 2 No         1233

data_clean %>%
    ggplot(aes(Attrition)) +
    geom_bar()

attrition vs monthly income

data_clean %>%
    ggplot(aes(Attrition, MonthlyIncome)) +
    geom_boxplot()

correlation plot

# step 1: binarize
data_binarized <- data_clean %>%
    select(-EmployeeNumber) %>%
    binarize()

data_binarized %>% glimpse()

## Rows: 1,470
## Columns: 120
## $ `Age__-Inf_30`                       <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Age__30_36                           <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, …
## $ Age__36_43                           <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ Age__43_Inf                          <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ Attrition__Left                      <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Attrition__No                        <dbl> 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `BusinessTravel__Non-Travel`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BusinessTravel__Travel_Frequently    <dbl> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ BusinessTravel__Travel_Rarely        <dbl> 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ `DailyRate__-Inf_465`                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ DailyRate__465_802                   <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ DailyRate__802_1157                  <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ DailyRate__1157_Inf                  <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, …
## $ Department__Human_Resources          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Department__Research_&_Development` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Department__Sales                    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `DistanceFromHome__-Inf_2`           <dbl> 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__2_7                <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ DistanceFromHome__7_14               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__14_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ Education__1                         <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Education__2                         <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ Education__3                         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, …
## $ Education__4                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ Education__5                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Human_Resources      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Life_Sciences        <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ EducationField__Marketing            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Medical              <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, …
## $ EducationField__Other                <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Technical_Degree     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__1           <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ EnvironmentSatisfaction__2           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__3           <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, …
## $ EnvironmentSatisfaction__4           <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ Gender__Female                       <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ Gender__Male                         <dbl> 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ `HourlyRate__-Inf_48`                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ HourlyRate__48_66                    <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ HourlyRate__66_83.75                 <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ HourlyRate__83.75_Inf                <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ JobInvolvement__1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobInvolvement__2                    <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobInvolvement__3                    <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ JobInvolvement__4                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ JobLevel__1                          <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ JobLevel__2                          <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobLevel__3                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobLevel__4                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobLevel__5                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Healthcare_Representative   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobRole__Human_Resources             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Laboratory_Technician       <dbl> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, …
## $ JobRole__Manager                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Manufacturing_Director      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobRole__Research_Director           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Research_Scientist          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Executive             <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Representative        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobSatisfaction__1                   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ JobSatisfaction__2                   <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ JobSatisfaction__3                   <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, …
## $ JobSatisfaction__4                   <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ MaritalStatus__Divorced              <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ MaritalStatus__Married               <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, …
## $ MaritalStatus__Single                <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ `MonthlyIncome__-Inf_2911`           <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ MonthlyIncome__2911_4919             <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ MonthlyIncome__4919_8379             <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ MonthlyIncome__8379_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `MonthlyRate__-Inf_8047`             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ MonthlyRate__8047_14235.5            <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
## $ MonthlyRate__14235.5_20461.5         <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, …
## $ MonthlyRate__20461.5_Inf             <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ `NumCompaniesWorked__-Inf_1`         <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ NumCompaniesWorked__1_2              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ NumCompaniesWorked__2_4              <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ NumCompaniesWorked__4_Inf            <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ OverTime__No                         <dbl> 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ OverTime__Yes                        <dbl> 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ `PercentSalaryHike__-Inf_12`         <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__12_14             <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, …
## $ PercentSalaryHike__14_18             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__18_Inf            <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ PerformanceRating__3                 <dbl> 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, …
## $ PerformanceRating__4                 <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ RelationshipSatisfaction__1          <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ RelationshipSatisfaction__2          <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, …
## $ RelationshipSatisfaction__3          <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, …
## $ RelationshipSatisfaction__4          <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ StockOptionLevel__0                  <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ StockOptionLevel__1                  <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ StockOptionLevel__2                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ StockOptionLevel__3                  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `TotalWorkingYears__-Inf_6`          <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ TotalWorkingYears__6_10              <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ TotalWorkingYears__10_15             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ TotalWorkingYears__15_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `TrainingTimesLastYear__-Inf_2`      <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, …
## $ TrainingTimesLastYear__2_3           <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, …
## $ TrainingTimesLastYear__3_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ WorkLifeBalance__1                   <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WorkLifeBalance__2                   <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
## $ WorkLifeBalance__3                   <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, …
## $ WorkLifeBalance__4                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsAtCompany__-Inf_3`             <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsAtCompany__3_5                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsAtCompany__5_9                  <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, …
## $ YearsAtCompany__9_Inf                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsInCurrentRole__-Inf_2`         <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsInCurrentRole__2_3              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ YearsInCurrentRole__3_7              <dbl> 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, …
## $ YearsInCurrentRole__7_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsSinceLastPromotion__-Inf_1`    <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, …
## $ YearsSinceLastPromotion__1_3         <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ YearsSinceLastPromotion__3_Inf       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `YearsWithCurrManager__-Inf_2`       <dbl> 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsWithCurrManager__2_3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsWithCurrManager__3_7            <dbl> 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ YearsWithCurrManager__7_Inf          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …

# step 2: correlation
data_correlation <- data_binarized %>%
    correlate(Attrition__Left)

data_correlation

## # A tibble: 120 × 3
##    feature           bin       correlation
##    <fct>             <chr>           <dbl>
##  1 Attrition         Left            1    
##  2 Attrition         No             -1    
##  3 OverTime          No             -0.246
##  4 OverTime          Yes             0.246
##  5 JobLevel          1               0.213
##  6 MonthlyIncome     -Inf_2911       0.207
##  7 StockOptionLevel  0               0.195
##  8 YearsAtCompany    -Inf_3          0.183
##  9 MaritalStatus     Single          0.175
## 10 TotalWorkingYears -Inf_6          0.169
## # ℹ 110 more rows

# step 3: plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()

## Warning: ggrepel: 72 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model building

Split Data

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.2
## ✔ recipes      1.1.0

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

set.seed(1234)
# data_clean <- data_clean %>% sample_n(100)

data_split <- initial_split(data_clean, strata = Attrition)
data_train <- training(data_split)
data_test <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, strata = Attrition)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits            id    
##    <list>            <chr> 
##  1 <split [990/111]> Fold01
##  2 <split [990/111]> Fold02
##  3 <split [990/111]> Fold03
##  4 <split [990/111]> Fold04
##  5 <split [991/110]> Fold05
##  6 <split [991/110]> Fold06
##  7 <split [991/110]> Fold07
##  8 <split [992/109]> Fold08
##  9 <split [992/109]> Fold09
## 10 <split [992/109]> Fold10

Preprocess Data

library(themis)

xgboost_rec <- recipes::recipe(Attrition ~ ., data = data_train) %>%
    update_role(EmployeeNumber, new_role = "ID") %>%
    step_dummy(all_nominal_predictors()) %>%
    step_normalize(all_numeric_predictors()) %>%
    step_smote(Attrition)

xgboost_rec %>% prep() %>% juice() %>% glimpse()

## Rows: 1,848
## Columns: 64
## $ Age                               <dbl> 0.008807237, -0.101383303, -0.542145…
## $ DailyRate                         <dbl> 1.42650782, 1.04705665, 0.81938595, …
## $ DistanceFromHome                  <dbl> -0.90560797, -0.04468277, 0.81624244…
## $ EmployeeNumber                    <dbl> 4, 27, 33, 45, 47, 55, 58, 64, 90, 1…
## $ HourlyRate                        <dbl> 1.281526684, 0.789662751, 0.29779881…
## $ MonthlyIncome                     <dbl> -0.93899597, -0.66085114, -0.5527189…
## $ MonthlyRate                       <dbl> -1.6645786, -1.0194609, -1.3434252, …
## $ NumCompaniesWorked                <dbl> 1.3192201, 1.7195678, -0.6825182, -0…
## $ PercentSalaryHike                 <dbl> -0.06569789, 2.10974214, 1.83781213,…
## $ TotalWorkingYears                 <dbl> -0.5386623, -0.1543025, -0.1543025, …
## $ TrainingTimesLastYear             <dbl> 0.1451189, 0.9245137, 1.7039084, -0.…
## $ YearsAtCompany                    <dbl> -1.119612991, -0.323504813, 0.472603…
## $ YearsInCurrentRole                <dbl> -1.15408286, -0.33886992, -0.6106075…
## $ YearsSinceLastPromotion           <dbl> -0.67903619, -0.67903619, 1.21606861…
## $ YearsWithCurrManager              <dbl> -1.12646331, -0.29520587, 0.81313739…
## $ Attrition                         <fct> Left, Left, Left, Left, Left, Left, …
## $ BusinessTravel_Travel_Frequently  <dbl> -0.4781076, -0.4781076, 2.0896799, -…
## $ BusinessTravel_Travel_Rarely      <dbl> 0.6327712, 0.6327712, -1.5789147, 0.…
## $ Department_Research...Development <dbl> 0.7300286, -1.3685653, 0.7300286, 0.…
## $ Department_Sales                  <dbl> -0.6595963, 1.5147018, -0.6595963, -…
## $ Education_X2                      <dbl> 2.0834380, -0.4795399, -0.4795399, -…
## $ Education_X3                      <dbl> -0.8062777, -0.8062777, -0.8062777, …
## $ Education_X4                      <dbl> -0.5949133, 1.6793906, -0.5949133, -…
## $ Education_X5                      <dbl> -0.1990579, -0.1990579, -0.1990579, …
## $ EducationField_Life.Sciences      <dbl> -0.8420112, 1.1865540, 1.1865540, -0…
## $ EducationField_Marketing          <dbl> -0.3463115, -0.3463115, -0.3463115, …
## $ EducationField_Medical            <dbl> -0.6981366, -0.6981366, -0.6981366, …
## $ EducationField_Other              <dbl> 4.1634411, -0.2399678, -0.2399678, -…
## $ EducationField_Technical.Degree   <dbl> -0.2835453, -0.2835453, -0.2835453, …
## $ EnvironmentSatisfaction_X2        <dbl> -0.5079873, -0.5079873, 1.9667651, 1…
## $ EnvironmentSatisfaction_X3        <dbl> -0.6581793, 1.5179630, -0.6581793, -…
## $ EnvironmentSatisfaction_X4        <dbl> 1.5445140, -0.6468648, -0.6468648, -…
## $ Gender_Male                       <dbl> 0.8078118, 0.8078118, -1.2367877, 0.…
## $ JobInvolvement_X2                 <dbl> 1.6873239, 1.6873239, -0.5921161, -0…
## $ JobInvolvement_X3                 <dbl> -1.1755257, -1.1755257, -1.1755257, …
## $ JobInvolvement_X4                 <dbl> -0.3397008, -0.3397008, -0.3397008, …
## $ JobLevel_X2                       <dbl> -0.7535647, -0.7535647, -0.7535647, …
## $ JobLevel_X3                       <dbl> -0.4166727, -0.4166727, -0.4166727, …
## $ JobLevel_X4                       <dbl> -0.2872644, -0.2872644, -0.2872644, …
## $ JobLevel_X5                       <dbl> -0.218015, -0.218015, -0.218015, -0.…
## $ JobRole_Human.Resources           <dbl> -0.1990579, -0.1990579, -0.1990579, …
## $ JobRole_Laboratory.Technician     <dbl> 2.1411830, -0.4666074, -0.4666074, -…
## $ JobRole_Manager                   <dbl> -0.2663591, -0.2663591, -0.2663591, …
## $ JobRole_Manufacturing.Director    <dbl> -0.3380364, -0.3380364, -0.3380364, …
## $ JobRole_Research.Director         <dbl> -0.2524088, -0.2524088, -0.2524088, …
## $ JobRole_Research.Scientist        <dbl> -0.4766741, -0.4766741, 2.0959640, 2…
## $ JobRole_Sales.Executive           <dbl> -0.5319394, -0.5319394, -0.5319394, …
## $ JobRole_Sales.Representative      <dbl> -0.2483152, 4.0234821, -0.2483152, -…
## $ JobSatisfaction_X2                <dbl> -0.4866861, -0.4866861, -0.4866861, …
## $ JobSatisfaction_X3                <dbl> 1.5179630, -0.6581793, -0.6581793, -…
## $ JobSatisfaction_X4                <dbl> -0.6666917, -0.6666917, -0.6666917, …
## $ MaritalStatus_Married             <dbl> -0.9285259, -0.9285259, -0.9285259, …
## $ MaritalStatus_Single              <dbl> 1.4765158, 1.4765158, 1.4765158, -0.…
## $ OverTime_Yes                      <dbl> 1.6002434, -0.6243374, 1.6002434, 1.…
## $ PerformanceRating_X4              <dbl> -0.4360017, 2.2914860, 2.2914860, -0…
## $ RelationshipSatisfaction_X2       <dbl> 1.8831789, 1.8831789, 1.8831789, -0.…
## $ RelationshipSatisfaction_X3       <dbl> -0.6780813, -0.6780813, -0.6780813, …
## $ RelationshipSatisfaction_X4       <dbl> -0.6243374, -0.6243374, -0.6243374, …
## $ StockOptionLevel_X1               <dbl> -0.8294726, -0.8294726, -0.8294726, …
## $ StockOptionLevel_X2               <dbl> -0.3544752, -0.3544752, -0.3544752, …
## $ StockOptionLevel_X3               <dbl> -0.2420756, -0.2420756, -0.2420756, …
## $ WorkLifeBalance_X2                <dbl> -0.5571643, -0.5571643, -0.5571643, …
## $ WorkLifeBalance_X3                <dbl> 0.8062777, 0.8062777, 0.8062777, -1.…
## $ WorkLifeBalance_X4                <dbl> -0.3313295, -0.3313295, -0.3313295, …

Specify Model

xgboost_spec <- 
  boost_tree(trees = tune(), tree_depth = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec)

Tune hyperparameters

tree_grid <- grid_regular(trees(), 
                          tree_depth(),
                          levels = 5)

doParallel::registerDoParallel()

set.seed(90812)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = data_cv,
            grid = 5,
            control = control_grid(save_pred = TRUE))

Model evaluation

Identify optimal values for hyperparameters

collect_metrics(xgboost_tune)

## # A tibble: 15 × 8
##    trees tree_depth .metric     .estimator   mean     n std_err .config         
##    <int>      <int> <chr>       <chr>       <dbl> <int>   <dbl> <chr>           
##  1   926          1 accuracy    binary     0.887     10 0.00519 Preprocessor1_M…
##  2   926          1 brier_class binary     0.0904    10 0.00285 Preprocessor1_M…
##  3   926          1 roc_auc     binary     0.857     10 0.0118  Preprocessor1_M…
##  4   553          5 accuracy    binary     0.877     10 0.00984 Preprocessor1_M…
##  5   553          5 brier_class binary     0.104     10 0.00699 Preprocessor1_M…
##  6   553          5 roc_auc     binary     0.803     10 0.0173  Preprocessor1_M…
##  7  1924          8 accuracy    binary     0.875     10 0.00917 Preprocessor1_M…
##  8  1924          8 brier_class binary     0.108     10 0.00707 Preprocessor1_M…
##  9  1924          8 roc_auc     binary     0.812     10 0.0155  Preprocessor1_M…
## 10  1429         11 accuracy    binary     0.882     10 0.00839 Preprocessor1_M…
## 11  1429         11 brier_class binary     0.106     10 0.00724 Preprocessor1_M…
## 12  1429         11 roc_auc     binary     0.808     10 0.0187  Preprocessor1_M…
## 13   113         13 accuracy    binary     0.873     10 0.00624 Preprocessor1_M…
## 14   113         13 brier_class binary     0.106     10 0.00645 Preprocessor1_M…
## 15   113         13 roc_auc     binary     0.816     10 0.0192  Preprocessor1_M…

collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(Attrition, .pred_Left) %>%
    autoplot()

Fit the model for the last time

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)

collect_metrics(xgboost_last)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.846 Preprocessor1_Model1
## 2 roc_auc     binary         0.799 Preprocessor1_Model1
## 3 brier_class binary         0.116 Preprocessor1_Model1

collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(Attrition, .pred_class) %>%
    autoplot()

Variable importance

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()

Conclusion

The previous model had accuracy of 0.851 and AUC of 0.753.

Feature transformation: normalized numeric data. It resulted in a slight improvement with accuracy of 0.859 and AUC of 0.770.
Feature transformation: YeoJohnson transformation. No improvement
Feature selection: PCA didnt make an improvement

Code along 6

Alyssa D’Alessio

2025-03-05