The goal is to predict attrition, i.e., to identify employees who are likely to leave the company.

Import data

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.1

## Warning: package 'dplyr' was built under R version 4.4.1

## Warning: package 'lubridate' was built under R version 4.4.1

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## Warning: package 'correlationfunnel' was built under R version 4.4.1

## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>

# Load the raw HR attrition data set (path is relative to this document)
data <- readr::read_csv(file = "../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean data

# Per-column overview (type, completeness, spread) used to spot data issues
skimr::skim(data)

Data summary
Name	data
Number of rows	1470
Number of columns	35
_______________________
Column type frequency:
character	9
numeric	26
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Attrition	1	2	3	2
BusinessTravel	1	10	17	3
Department	1	5	22	3
EducationField	1	5	16	6
Gender	1	4	6	2
JobRole	1	7	25	9
MaritalStatus	1	6	8	3
Over18	1	1	1	1
OverTime	1	2	3	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	1	36.92	9.14	18	30.00	36.0	43.00	60	▂▇▇▃▂
DailyRate	1	802.49	403.51	102	465.00	802.0	1157.00	1499	▇▇▇▇▇
DistanceFromHome	1	9.19	8.11	1	2.00	7.0	14.00	29	▇▅▂▂▂
Education	1	2.91	1.02	1	2.00	3.0	4.00	5	▂▃▇▆▁
EmployeeCount	1	1.00	0.00	1	1.00	1.0	1.00	1	▁▁▇▁▁
EmployeeNumber	1	1024.87	602.02	1	491.25	1020.5	1555.75	2068	▇▇▇▇▇
EnvironmentSatisfaction	1	2.72	1.09	1	2.00	3.0	4.00	4	▅▅▁▇▇
HourlyRate	1	65.89	20.33	30	48.00	66.0	83.75	100	▇▇▇▇▇
JobInvolvement	1	2.73	0.71	1	2.00	3.0	3.00	4	▁▃▁▇▁
JobLevel	1	2.06	1.11	1	1.00	2.0	3.00	5	▇▇▃▂▁
JobSatisfaction	1	2.73	1.10	1	2.00	3.0	4.00	4	▅▅▁▇▇
MonthlyIncome	1	6502.93	4707.96	1009	2911.00	4919.0	8379.00	19999	▇▅▂▁▂
MonthlyRate	1	14313.10	7117.79	2094	8047.00	14235.5	20461.50	26999	▇▇▇▇▇
NumCompaniesWorked	1	2.69	2.50	0	1.00	2.0	4.00	9	▇▃▂▂▁
PercentSalaryHike	1	15.21	3.66	11	12.00	14.0	18.00	25	▇▅▃▂▁
PerformanceRating	1	3.15	0.36	3	3.00	3.0	3.00	4	▇▁▁▁▂
RelationshipSatisfaction	1	2.71	1.08	1	2.00	3.0	4.00	4	▅▅▁▇▇
StandardHours	1	80.00	0.00	80	80.00	80.0	80.00	80	▁▁▇▁▁
StockOptionLevel	1	0.79	0.85	0	0.00	1.0	1.00	3	▇▇▁▂▁
TotalWorkingYears	1	11.28	7.78	0	6.00	10.0	15.00	40	▇▇▂▁▁
TrainingTimesLastYear	1	2.80	1.29	0	2.00	3.0	3.00	6	▂▇▇▂▃
WorkLifeBalance	1	2.76	0.71	1	2.00	3.0	3.00	4	▁▃▁▇▂
YearsAtCompany	1	7.01	6.13	0	3.00	5.0	9.00	40	▇▂▁▁▁
YearsInCurrentRole	1	4.23	3.62	0	2.00	3.0	7.00	18	▇▃▂▁▁
YearsSinceLastPromotion	1	2.19	3.22	0	0.00	1.0	3.00	15	▇▁▁▁▁
YearsWithCurrManager	1	4.12	3.57	0	2.00	3.0	7.00	17	▇▂▅▁▁

Employee Number is the ID variable. (The number of employees matches the number of rows in the data set.)

Issues with data

Missing values
- None (every column has a complete rate of 1)
Factors or numeric variables
- Education, Environment Satisfaction, Job Involvement, Job Satisfaction, Performance Rating, Relationship Satisfaction, Work Life Balance
Zero variance variables (drop them)
- Over 18, Employee Count, Standard Hours
Character variables
- Convert them to numbers in the recipes steps
Unbalanced target variable
- Attrition
ID variable
- Employee Number

# Rating-scale columns that were imported as numbers but are really categories.
factors_vec <- c(
  "Education",
  "EnvironmentSatisfaction",
  "JobInvolvement",
  "JobSatisfaction",
  "PerformanceRating",
  "RelationshipSatisfaction",
  "WorkLifeBalance"
)

data_clean <- data %>%
  # Treat the rating scales as factors instead of numerics
  mutate(across(all_of(factors_vec), as.factor)) %>%
  # Remove the columns that take a single value for every employee
  select(-Over18, -EmployeeCount, -StandardHours) %>%
  # Relabel the positive class: "Yes" becomes "Left", "No" is kept as is
  mutate(Attrition = if_else(Attrition == "Yes", "Left", Attrition))

Explore data

# Class balance of the target
count(data_clean, Attrition)

## # A tibble: 2 × 2
##   Attrition     n
##   <chr>     <int>
## 1 Left        237
## 2 No         1233

# Bar chart of the number of employees in each Attrition class
ggplot(data_clean, aes(x = Attrition)) +
  geom_bar()

Attrition vs. monthly income

# Distribution of monthly income within each Attrition class
ggplot(data_clean, aes(x = Attrition, y = MonthlyIncome)) +
  geom_boxplot()

Correlation plot

# Step 1: Binarize
# The ID column is removed first so it is not turned into bins.
data_binarized <- binarize(select(data_clean, -EmployeeNumber))

glimpse(data_binarized)

## Rows: 1,470
## Columns: 120
## $ `Age__-Inf_30`                       <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Age__30_36                           <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, …
## $ Age__36_43                           <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ Age__43_Inf                          <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ Attrition__Left                      <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Attrition__No                        <dbl> 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `BusinessTravel__Non-Travel`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BusinessTravel__Travel_Frequently    <dbl> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ BusinessTravel__Travel_Rarely        <dbl> 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ `DailyRate__-Inf_465`                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ DailyRate__465_802                   <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ DailyRate__802_1157                  <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ DailyRate__1157_Inf                  <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, …
## $ Department__Human_Resources          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Department__Research_&_Development` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Department__Sales                    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `DistanceFromHome__-Inf_2`           <dbl> 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__2_7                <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ DistanceFromHome__7_14               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__14_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ Education__1                         <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Education__2                         <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ Education__3                         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, …
## $ Education__4                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ Education__5                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Human_Resources      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Life_Sciences        <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ EducationField__Marketing            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Medical              <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, …
## $ EducationField__Other                <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Technical_Degree     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__1           <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ EnvironmentSatisfaction__2           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__3           <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, …
## $ EnvironmentSatisfaction__4           <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ Gender__Female                       <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ Gender__Male                         <dbl> 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ `HourlyRate__-Inf_48`                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ HourlyRate__48_66                    <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ HourlyRate__66_83.75                 <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ HourlyRate__83.75_Inf                <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ JobInvolvement__1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobInvolvement__2                    <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobInvolvement__3                    <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ JobInvolvement__4                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ JobLevel__1                          <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ JobLevel__2                          <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobLevel__3                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobLevel__4                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobLevel__5                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Healthcare_Representative   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobRole__Human_Resources             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Laboratory_Technician       <dbl> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, …
## $ JobRole__Manager                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Manufacturing_Director      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobRole__Research_Director           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Research_Scientist          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Executive             <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Representative        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobSatisfaction__1                   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ JobSatisfaction__2                   <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ JobSatisfaction__3                   <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, …
## $ JobSatisfaction__4                   <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ MaritalStatus__Divorced              <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ MaritalStatus__Married               <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, …
## $ MaritalStatus__Single                <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ `MonthlyIncome__-Inf_2911`           <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ MonthlyIncome__2911_4919             <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ MonthlyIncome__4919_8379             <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ MonthlyIncome__8379_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `MonthlyRate__-Inf_8047`             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ MonthlyRate__8047_14235.5            <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
## $ MonthlyRate__14235.5_20461.5         <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, …
## $ MonthlyRate__20461.5_Inf             <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ `NumCompaniesWorked__-Inf_1`         <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ NumCompaniesWorked__1_2              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ NumCompaniesWorked__2_4              <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ NumCompaniesWorked__4_Inf            <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ OverTime__No                         <dbl> 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ OverTime__Yes                        <dbl> 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ `PercentSalaryHike__-Inf_12`         <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__12_14             <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, …
## $ PercentSalaryHike__14_18             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__18_Inf            <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ PerformanceRating__3                 <dbl> 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, …
## $ PerformanceRating__4                 <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ RelationshipSatisfaction__1          <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ RelationshipSatisfaction__2          <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, …
## $ RelationshipSatisfaction__3          <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, …
## $ RelationshipSatisfaction__4          <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ StockOptionLevel__0                  <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ StockOptionLevel__1                  <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ StockOptionLevel__2                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ StockOptionLevel__3                  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `TotalWorkingYears__-Inf_6`          <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ TotalWorkingYears__6_10              <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ TotalWorkingYears__10_15             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ TotalWorkingYears__15_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `TrainingTimesLastYear__-Inf_2`      <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, …
## $ TrainingTimesLastYear__2_3           <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, …
## $ TrainingTimesLastYear__3_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ WorkLifeBalance__1                   <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WorkLifeBalance__2                   <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
## $ WorkLifeBalance__3                   <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, …
## $ WorkLifeBalance__4                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsAtCompany__-Inf_3`             <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsAtCompany__3_5                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsAtCompany__5_9                  <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, …
## $ YearsAtCompany__9_Inf                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsInCurrentRole__-Inf_2`         <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsInCurrentRole__2_3              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ YearsInCurrentRole__3_7              <dbl> 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, …
## $ YearsInCurrentRole__7_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsSinceLastPromotion__-Inf_1`    <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, …
## $ YearsSinceLastPromotion__1_3         <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ YearsSinceLastPromotion__3_Inf       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `YearsWithCurrManager__-Inf_2`       <dbl> 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsWithCurrManager__2_3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsWithCurrManager__3_7            <dbl> 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ YearsWithCurrManager__7_Inf          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …

# Step 2: Correlation
# Correlate every binary column with the "Left" indicator of the target.
data_correlation <- correlate(data_binarized, target = Attrition__Left)

data_correlation

## # A tibble: 120 × 3
##    feature           bin       correlation
##    <fct>             <chr>           <dbl>
##  1 Attrition         Left            1    
##  2 Attrition         No             -1    
##  3 OverTime          No             -0.246
##  4 OverTime          Yes             0.246
##  5 JobLevel          1               0.213
##  6 MonthlyIncome     -Inf_2911       0.207
##  7 StockOptionLevel  0               0.195
##  8 YearsAtCompany    -Inf_3          0.183
##  9 MaritalStatus     Single          0.175
## 10 TotalWorkingYears -Inf_6          0.169
## # ℹ 110 more rows

# Step 3: Plot
correlationfunnel::plot_correlation_funnel(data_correlation)

## Warning: ggrepel: 73 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Split Data

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.4.1

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.6      ✔ rsample      1.2.1 
## ✔ dials        1.3.0      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10

## Warning: package 'dials' was built under R version 4.4.1

## Warning: package 'infer' was built under R version 4.4.1

## Warning: package 'modeldata' was built under R version 4.4.1

## Warning: package 'parsnip' was built under R version 4.4.1

## Warning: package 'tune' was built under R version 4.4.1

## Warning: package 'workflows' was built under R version 4.4.1

## Warning: package 'workflowsets' was built under R version 4.4.1

## Warning: package 'yardstick' was built under R version 4.4.1

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org

# Seed once: the subsample, the train/test split and the CV folds below all
# draw from the same random stream, so these statements must stay in this order.
set.seed(1234)
data_clean <- sample_n(data_clean, size = 1000)

# Train/test split, stratified on the target
data_split <- data_clean %>% initial_split(strata = Attrition)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()

# Stratified cross-validation folds built from the training set only
data_cv <- data_train %>% rsample::vfold_cv(strata = Attrition)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [674/76]> Fold01
##  2 <split [674/76]> Fold02
##  3 <split [674/76]> Fold03
##  4 <split [675/75]> Fold04
##  5 <split [675/75]> Fold05
##  6 <split [675/75]> Fold06
##  7 <split [675/75]> Fold07
##  8 <split [676/74]> Fold08
##  9 <split [676/74]> Fold09
## 10 <split [676/74]> Fold10

Preprocess Data Using Recipes Package

library(themis)

## Warning: package 'themis' was built under R version 4.4.1

# Preprocessing recipe for the xgboost model:
#   - EmployeeNumber stays in the data but gets the "ID" role
#   - all nominal predictors (characters and factors) are converted to numbers
#   - step_smote() is then applied to the Attrition outcome
xgboost_rec <-
  recipes::recipe(Attrition ~ ., data = data_train) %>%
  update_role(EmployeeNumber, new_role = "ID") %>%
  step_dummy(all_nominal_predictors()) %>%
  step_smote(Attrition)

# Inspect the processed training data
glimpse(juice(prep(xgboost_rec)))

## Rows: 1,254
## Columns: 59
## $ Age                               <dbl> 35, 32, 37, 56, 31, 49, 55, 42, 55, …
## $ DailyRate                         <dbl> 622, 1259, 625, 441, 1079, 1184, 725…
## $ DistanceFromHome                  <dbl> 14, 2, 1, 14, 16, 11, 2, 19, 13, 22,…
## $ EmployeeNumber                    <dbl> 1010, 1692, 970, 161, 1761, 840, 787…
## $ HourlyRate                        <dbl> 39, 95, 46, 72, 70, 43, 78, 57, 85, …
## $ JobLevel                          <dbl> 1, 1, 3, 1, 3, 3, 5, 1, 4, 1, 1, 3, …
## $ MonthlyIncome                     <dbl> 3743, 1393, 10609, 4963, 8161, 7654,…
## $ MonthlyRate                       <dbl> 10074, 24852, 14922, 4510, 19002, 58…
## $ NumCompaniesWorked                <dbl> 1, 1, 5, 9, 2, 1, 5, 6, 6, 1, 6, 4, …
## $ PercentSalaryHike                 <dbl> 24, 12, 11, 18, 13, 18, 13, 12, 17, …
## $ StockOptionLevel                  <dbl> 1, 0, 0, 3, 3, 2, 1, 0, 0, 0, 3, 1, …
## $ TotalWorkingYears                 <dbl> 5, 1, 17, 7, 10, 9, 24, 7, 24, 1, 5,…
## $ TrainingTimesLastYear             <dbl> 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 3, …
## $ YearsAtCompany                    <dbl> 4, 1, 14, 5, 1, 9, 5, 2, 19, 1, 3, 3…
## $ YearsInCurrentRole                <dbl> 2, 0, 1, 4, 0, 8, 2, 2, 7, 0, 2, 2, …
## $ YearsSinceLastPromotion           <dbl> 0, 0, 11, 4, 0, 7, 1, 2, 3, 0, 0, 2,…
## $ YearsWithCurrManager              <dbl> 2, 0, 7, 3, 0, 7, 4, 2, 8, 0, 2, 0, …
## $ Attrition                         <fct> Left, Left, Left, Left, Left, Left, …
## $ BusinessTravel_Travel_Frequently  <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ BusinessTravel_Travel_Rarely      <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, …
## $ Department_Research...Development <dbl> 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ Department_Sales                  <dbl> 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, …
## $ Education_X2                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Education_X3                      <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, …
## $ Education_X4                      <dbl> 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ Education_X5                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField_Life.Sciences      <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField_Marketing          <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, …
## $ EducationField_Medical            <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
## $ EducationField_Other              <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField_Technical.Degree   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction_X2        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ EnvironmentSatisfaction_X3        <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction_X4        <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ Gender_Male                       <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, …
## $ JobInvolvement_X2                 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ JobInvolvement_X3                 <dbl> 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ JobInvolvement_X4                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, …
## $ JobRole_Human.Resources           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobRole_Laboratory.Technician     <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole_Manager                   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ JobRole_Manufacturing.Director    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole_Research.Director         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole_Research.Scientist        <dbl> 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ JobRole_Sales.Executive           <dbl> 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, …
## $ JobRole_Sales.Representative      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobSatisfaction_X2                <dbl> 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobSatisfaction_X3                <dbl> 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, …
## $ JobSatisfaction_X4                <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ MaritalStatus_Married             <dbl> 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, …
## $ MaritalStatus_Single              <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ OverTime_Yes                      <dbl> 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, …
## $ PerformanceRating_X4              <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ RelationshipSatisfaction_X2       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ RelationshipSatisfaction_X3       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, …
## $ RelationshipSatisfaction_X4       <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ WorkLifeBalance_X2                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ WorkLifeBalance_X3                <dbl> 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, …
## $ WorkLifeBalance_X4                <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …

Specify Model

library(usemodels)

## Warning: package 'usemodels' was built under R version 4.4.1

#usemodels::use_xgboost(Attrition ~ ., data = data_train)

# Boosted-tree classifier on the xgboost engine; the number of trees is left
# as a tuning parameter.
xgboost_spec <- boost_tree(
  mode = "classification",
  engine = "xgboost",
  trees = tune()
)

# Bundle the recipe and the model specification into one workflow
xgboost_workflow <- workflow(
  preprocessor = xgboost_rec,
  spec = xgboost_spec
)

Tune Hyperparameters

# Register a parallel backend before tuning
doParallel::registerDoParallel()

# Evaluate 5 candidate values on the CV folds, keeping the held-out
# predictions so they can be inspected afterwards
set.seed(47927)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(
    resamples = data_cv,
    grid = 5,
    control = control_grid(save_pred = TRUE)
  )

Model Evaluation

Identify Optimal Values for Hyperparameters

# Cross-validated metrics for every candidate
xgboost_tune %>% collect_metrics()

## # A tibble: 15 × 7
##    trees .metric     .estimator  mean     n std_err .config             
##    <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>               
##  1   102 accuracy    binary     0.869    10 0.00793 Preprocessor1_Model1
##  2   102 brier_class binary     0.108    10 0.00437 Preprocessor1_Model1
##  3   102 roc_auc     binary     0.800    10 0.0160  Preprocessor1_Model1
##  4   463 accuracy    binary     0.867    10 0.00645 Preprocessor1_Model2
##  5   463 brier_class binary     0.111    10 0.00428 Preprocessor1_Model2
##  6   463 roc_auc     binary     0.801    10 0.0150  Preprocessor1_Model2
##  7  1088 accuracy    binary     0.863    10 0.00579 Preprocessor1_Model3
##  8  1088 brier_class binary     0.113    10 0.00426 Preprocessor1_Model3
##  9  1088 roc_auc     binary     0.801    10 0.0158  Preprocessor1_Model3
## 10  1236 accuracy    binary     0.863    10 0.00579 Preprocessor1_Model4
## 11  1236 brier_class binary     0.113    10 0.00427 Preprocessor1_Model4
## 12  1236 roc_auc     binary     0.801    10 0.0158  Preprocessor1_Model4
## 13  1945 accuracy    binary     0.864    10 0.00570 Preprocessor1_Model5
## 14  1945 brier_class binary     0.114    10 0.00417 Preprocessor1_Model5
## 15  1945 roc_auc     binary     0.799    10 0.0158  Preprocessor1_Model5

# Per-fold ROC curves for the selected candidate only.
# Without the `parameters` filter, collect_predictions() returns the held-out
# predictions of every tuned candidate, so each fold's curve would pool rows
# that come from different models. The candidate is chosen by accuracy, the
# same criterion used for the final fit.
collect_predictions(
  xgboost_tune,
  parameters = select_best(xgboost_tune, metric = "accuracy")
) %>%
  group_by(id) %>%
  roc_curve(Attrition, .pred_Left) %>%
  autoplot()

Fit the Model for the Last Time

# Pick the candidate with the best cross-validated accuracy, plug it into the
# workflow, then fit on the training set and evaluate on the test set.
xgboost_last <- xgboost_tune %>%
  select_best(metric = "accuracy") %>%
  finalize_workflow(x = xgboost_workflow, parameters = .) %>%
  last_fit(split = data_split)

## Warning: package 'xgboost' was built under R version 4.4.1

# Test-set metrics of the final model
xgboost_last %>% collect_metrics()

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.836 Preprocessor1_Model1
## 2 roc_auc     binary         0.811 Preprocessor1_Model1
## 3 brier_class binary         0.133 Preprocessor1_Model1

# Confusion matrix of the test-set predictions
xgboost_last %>%
  collect_predictions() %>%
  yardstick::conf_mat(truth = Attrition, estimate = .pred_class) %>%
  autoplot()

Variable Importance

library(vip)

## Warning: package 'vip' was built under R version 4.4.1

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

# Variable importance, computed from the engine-level fit of the final model
vip(workflows::extract_fit_engine(xgboost_last))

Code Along 7

Sara Donahue

2024-10-24

Import data

Clean data

Employee Number is the ID variable. (The number of employees matches the number of rows in the data set.)

Explore data

Model Building

Split Data

Preprocess Data Using Recipes Package

Specify Model

Tune Hyperparameters

Model Evaluation

Identify Optimal Values for Hyperparameters

Fit the Model for the Last Time

Variable Importance