The goal is to predict attrition, i.e., to identify employees who are likely to leave the company.

Import Data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## Warning: package 'correlationfunnel' was built under R version 4.4.2

## ══ correlationfunnel Tip #3 ════════════════════════════════════════════════════
## Using `binarize()` with data containing many columns or many rows can increase dimensionality substantially.
## Try subsetting your data column-wise or row-wise to avoid creating too many columns.
## You can always make a big problem smaller by sampling. :)

# Read the employee-attrition CSV; readr guesses the column types.
data <- readr::read_csv(
  file = "../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)

## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore Data

# Summarise every column of the raw import (type, completeness, distribution).
skimr::skim(data)

Data summary
Name	data
Number of rows	1470
Number of columns	35
_______________________
Column type frequency:
character	9
numeric	26
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Attrition	1	2	3	2
BusinessTravel	1	10	17	3
Department	1	5	22	3
EducationField	1	5	16	6
Gender	1	4	6	2
JobRole	1	7	25	9
MaritalStatus	1	6	8	3
Over18	1	1	1	1
OverTime	1	2	3	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	1	36.92	9.14	18	30.00	36.0	43.00	60	▂▇▇▃▂
DailyRate	1	802.49	403.51	102	465.00	802.0	1157.00	1499	▇▇▇▇▇
DistanceFromHome	1	9.19	8.11	1	2.00	7.0	14.00	29	▇▅▂▂▂
Education	1	2.91	1.02	1	2.00	3.0	4.00	5	▂▃▇▆▁
EmployeeCount	1	1.00	0.00	1	1.00	1.0	1.00	1	▁▁▇▁▁
EmployeeNumber	1	1024.87	602.02	1	491.25	1020.5	1555.75	2068	▇▇▇▇▇
EnvironmentSatisfaction	1	2.72	1.09	1	2.00	3.0	4.00	4	▅▅▁▇▇
HourlyRate	1	65.89	20.33	30	48.00	66.0	83.75	100	▇▇▇▇▇
JobInvolvement	1	2.73	0.71	1	2.00	3.0	3.00	4	▁▃▁▇▁
JobLevel	1	2.06	1.11	1	1.00	2.0	3.00	5	▇▇▃▂▁
JobSatisfaction	1	2.73	1.10	1	2.00	3.0	4.00	4	▅▅▁▇▇
MonthlyIncome	1	6502.93	4707.96	1009	2911.00	4919.0	8379.00	19999	▇▅▂▁▂
MonthlyRate	1	14313.10	7117.79	2094	8047.00	14235.5	20461.50	26999	▇▇▇▇▇
NumCompaniesWorked	1	2.69	2.50	0	1.00	2.0	4.00	9	▇▃▂▂▁
PercentSalaryHike	1	15.21	3.66	11	12.00	14.0	18.00	25	▇▅▃▂▁
PerformanceRating	1	3.15	0.36	3	3.00	3.0	3.00	4	▇▁▁▁▂
RelationshipSatisfaction	1	2.71	1.08	1	2.00	3.0	4.00	4	▅▅▁▇▇
StandardHours	1	80.00	0.00	80	80.00	80.0	80.00	80	▁▁▇▁▁
StockOptionLevel	1	0.79	0.85	0	0.00	1.0	1.00	3	▇▇▁▂▁
TotalWorkingYears	1	11.28	7.78	0	6.00	10.0	15.00	40	▇▇▂▁▁
TrainingTimesLastYear	1	2.80	1.29	0	2.00	3.0	3.00	6	▂▇▇▂▃
WorkLifeBalance	1	2.76	0.71	1	2.00	3.0	3.00	4	▁▃▁▇▂
YearsAtCompany	1	7.01	6.13	0	3.00	5.0	9.00	40	▇▂▁▁▁
YearsInCurrentRole	1	4.23	3.62	0	2.00	3.0	7.00	18	▇▃▂▁▁
YearsSinceLastPromotion	1	2.19	3.22	0	0.00	1.0	3.00	15	▇▁▁▁▁
YearsWithCurrManager	1	4.12	3.57	0	2.00	3.0	7.00	17	▇▂▅▁▁

Issues with Data

Missing values: none (complete_rate is 1 for every variable)
Factors imported as numeric variables
- Education, EnvironmentSatisfaction, JobInvolvement, JobSatisfaction, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance, JobLevel, StockOptionLevel
Zero-variance variables
- Over18, EmployeeCount, StandardHours
Character variables: convert them to numbers in the recipe steps
Unbalanced target variable: Attrition
ID variable: EmployeeNumber

# Integer-coded rating/level columns that should be treated as categorical.
factors_vec <- c(
  "Education", "EnvironmentSatisfaction", "JobInvolvement",
  "JobSatisfaction", "PerformanceRating", "RelationshipSatisfaction",
  "WorkLifeBalance", "JobLevel", "StockOptionLevel"
)

data_clean <- data %>%
  # Address factors imported as numeric. An external character vector must be
  # wrapped in `all_of()`; passing it bare is deprecated since tidyselect 1.1.0
  # and is ambiguous if a column happens to share the vector's name.
  mutate(across(all_of(factors_vec), as.factor)) %>%
  # Drop zero-variance variables
  select(-c(Over18, EmployeeCount, StandardHours)) %>%
  # Recode Attrition: "Yes" becomes "Left", every other value is kept as is.
  mutate(Attrition = if_else(Attrition == "Yes", "Left", Attrition))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(factors_vec, as.factor)`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(factors_vec)
## 
##   # Now:
##   data %>% select(all_of(factors_vec))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

Explore Data

# Re-check the column summary after cleaning (factor conversion, dropped columns).
skimr::skim(data_clean)

Data summary
Name	data_clean
Number of rows	1470
Number of columns	32
_______________________
Column type frequency:
character	8
factor	9
numeric	15
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Attrition	1	2	4	2
BusinessTravel	1	10	17	3
Department	1	5	22	3
EducationField	1	5	16	6
Gender	1	4	6	2
JobRole	1	7	25	9
MaritalStatus	1	6	8	3
OverTime	1	2	3	2

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
Education	1	FALSE	5	3: 572, 4: 398, 2: 282, 1: 170
EnvironmentSatisfaction	1	FALSE	4	3: 453, 4: 446, 2: 287, 1: 284
JobInvolvement	1	FALSE	4	3: 868, 2: 375, 4: 144, 1: 83
JobLevel	1	FALSE	5	1: 543, 2: 534, 3: 218, 4: 106
JobSatisfaction	1	FALSE	4	4: 459, 3: 442, 1: 289, 2: 280
PerformanceRating	1	FALSE	2	3: 1244, 4: 226
RelationshipSatisfaction	1	FALSE	4	3: 459, 4: 432, 2: 303, 1: 276
StockOptionLevel	1	FALSE	4	0: 631, 1: 596, 2: 158, 3: 85
WorkLifeBalance	1	FALSE	4	3: 893, 2: 344, 4: 153, 1: 80

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	1	36.92	9.14	18	30.00	36.0	43.00	60	▂▇▇▃▂
DailyRate	1	802.49	403.51	102	465.00	802.0	1157.00	1499	▇▇▇▇▇
DistanceFromHome	1	9.19	8.11	1	2.00	7.0	14.00	29	▇▅▂▂▂
EmployeeNumber	1	1024.87	602.02	1	491.25	1020.5	1555.75	2068	▇▇▇▇▇
HourlyRate	1	65.89	20.33	30	48.00	66.0	83.75	100	▇▇▇▇▇
MonthlyIncome	1	6502.93	4707.96	1009	2911.00	4919.0	8379.00	19999	▇▅▂▁▂
MonthlyRate	1	14313.10	7117.79	2094	8047.00	14235.5	20461.50	26999	▇▇▇▇▇
NumCompaniesWorked	1	2.69	2.50	0	1.00	2.0	4.00	9	▇▃▂▂▁
PercentSalaryHike	1	15.21	3.66	11	12.00	14.0	18.00	25	▇▅▃▂▁
TotalWorkingYears	1	11.28	7.78	0	6.00	10.0	15.00	40	▇▇▂▁▁
TrainingTimesLastYear	1	2.80	1.29	0	2.00	3.0	3.00	6	▂▇▇▂▃
YearsAtCompany	1	7.01	6.13	0	3.00	5.0	9.00	40	▇▂▁▁▁
YearsInCurrentRole	1	4.23	3.62	0	2.00	3.0	7.00	18	▇▃▂▁▁
YearsSinceLastPromotion	1	2.19	3.22	0	0.00	1.0	3.00	15	▇▁▁▁▁
YearsWithCurrManager	1	4.12	3.57	0	2.00	3.0	7.00	17	▇▂▅▁▁

# Number of employees in each Attrition class.
count(data_clean, Attrition)

## # A tibble: 2 × 2
##   Attrition     n
##   <chr>     <int>
## 1 Left        237
## 2 No         1233

# Bar chart of the row count per Attrition class.
ggplot(data = data_clean, mapping = aes(x = Attrition)) +
  geom_bar()

attrition vs monthly income

# Distribution of MonthlyIncome within each Attrition class.
ggplot(data = data_clean, mapping = aes(x = Attrition, y = MonthlyIncome)) +
  geom_boxplot()

correlation plot

# step 1: binarize
# Drop the employee ID first, then convert the remaining columns into 0/1
# indicator columns with correlationfunnel.
data_binarized <- binarize(select(data_clean, -EmployeeNumber))

glimpse(data_binarized)

## Rows: 1,470
## Columns: 120
## $ `Age__-Inf_30`                       <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Age__30_36                           <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, …
## $ Age__36_43                           <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ Age__43_Inf                          <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ Attrition__Left                      <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Attrition__No                        <dbl> 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `BusinessTravel__Non-Travel`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BusinessTravel__Travel_Frequently    <dbl> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ BusinessTravel__Travel_Rarely        <dbl> 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ `DailyRate__-Inf_465`                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ DailyRate__465_802                   <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ DailyRate__802_1157                  <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ DailyRate__1157_Inf                  <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, …
## $ Department__Human_Resources          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Department__Research_&_Development` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Department__Sales                    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `DistanceFromHome__-Inf_2`           <dbl> 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__2_7                <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ DistanceFromHome__7_14               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__14_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ Education__1                         <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Education__2                         <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ Education__3                         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, …
## $ Education__4                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ Education__5                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Human_Resources      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Life_Sciences        <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ EducationField__Marketing            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Medical              <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, …
## $ EducationField__Other                <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Technical_Degree     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__1           <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ EnvironmentSatisfaction__2           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__3           <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, …
## $ EnvironmentSatisfaction__4           <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ Gender__Female                       <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ Gender__Male                         <dbl> 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ `HourlyRate__-Inf_48`                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ HourlyRate__48_66                    <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ HourlyRate__66_83.75                 <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ HourlyRate__83.75_Inf                <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ JobInvolvement__1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobInvolvement__2                    <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobInvolvement__3                    <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ JobInvolvement__4                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ JobLevel__1                          <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ JobLevel__2                          <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobLevel__3                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobLevel__4                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobLevel__5                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Healthcare_Representative   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobRole__Human_Resources             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Laboratory_Technician       <dbl> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, …
## $ JobRole__Manager                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Manufacturing_Director      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobRole__Research_Director           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Research_Scientist          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Executive             <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Representative        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobSatisfaction__1                   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ JobSatisfaction__2                   <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ JobSatisfaction__3                   <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, …
## $ JobSatisfaction__4                   <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ MaritalStatus__Divorced              <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ MaritalStatus__Married               <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, …
## $ MaritalStatus__Single                <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ `MonthlyIncome__-Inf_2911`           <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ MonthlyIncome__2911_4919             <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ MonthlyIncome__4919_8379             <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ MonthlyIncome__8379_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `MonthlyRate__-Inf_8047`             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ MonthlyRate__8047_14235.5            <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
## $ MonthlyRate__14235.5_20461.5         <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, …
## $ MonthlyRate__20461.5_Inf             <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ `NumCompaniesWorked__-Inf_1`         <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ NumCompaniesWorked__1_2              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ NumCompaniesWorked__2_4              <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ NumCompaniesWorked__4_Inf            <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ OverTime__No                         <dbl> 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ OverTime__Yes                        <dbl> 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ `PercentSalaryHike__-Inf_12`         <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__12_14             <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, …
## $ PercentSalaryHike__14_18             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__18_Inf            <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ PerformanceRating__3                 <dbl> 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, …
## $ PerformanceRating__4                 <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ RelationshipSatisfaction__1          <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ RelationshipSatisfaction__2          <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, …
## $ RelationshipSatisfaction__3          <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, …
## $ RelationshipSatisfaction__4          <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ StockOptionLevel__0                  <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ StockOptionLevel__1                  <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ StockOptionLevel__2                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ StockOptionLevel__3                  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `TotalWorkingYears__-Inf_6`          <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ TotalWorkingYears__6_10              <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ TotalWorkingYears__10_15             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ TotalWorkingYears__15_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `TrainingTimesLastYear__-Inf_2`      <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, …
## $ TrainingTimesLastYear__2_3           <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, …
## $ TrainingTimesLastYear__3_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ WorkLifeBalance__1                   <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WorkLifeBalance__2                   <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
## $ WorkLifeBalance__3                   <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, …
## $ WorkLifeBalance__4                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsAtCompany__-Inf_3`             <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsAtCompany__3_5                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsAtCompany__5_9                  <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, …
## $ YearsAtCompany__9_Inf                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsInCurrentRole__-Inf_2`         <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsInCurrentRole__2_3              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ YearsInCurrentRole__3_7              <dbl> 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, …
## $ YearsInCurrentRole__7_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsSinceLastPromotion__-Inf_1`    <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, …
## $ YearsSinceLastPromotion__1_3         <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ YearsSinceLastPromotion__3_Inf       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `YearsWithCurrManager__-Inf_2`       <dbl> 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsWithCurrManager__2_3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsWithCurrManager__3_7            <dbl> 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ YearsWithCurrManager__7_Inf          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …

# step 2: data correlation
# Correlate every binarized feature with the Attrition__Left indicator.
data_correlation <- correlate(data_binarized, target = Attrition__Left)

data_correlation

## # A tibble: 120 × 3
##    feature           bin       correlation
##    <fct>             <chr>           <dbl>
##  1 Attrition         Left            1    
##  2 Attrition         No             -1    
##  3 OverTime          No             -0.246
##  4 OverTime          Yes             0.246
##  5 JobLevel          1               0.213
##  6 MonthlyIncome     -Inf_2911       0.207
##  7 StockOptionLevel  0               0.195
##  8 YearsAtCompany    -Inf_3          0.183
##  9 MaritalStatus     Single          0.175
## 10 TotalWorkingYears -Inf_6          0.169
## # ℹ 110 more rows

# step 3: plot
# Draw the correlation funnel from the feature/target correlations.
correlationfunnel::plot_correlation_funnel(data_correlation)

## Warning: ggrepel: 73 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Split Data

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.4.2

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.2
## ✔ recipes      1.1.1

## Warning: package 'dials' was built under R version 4.4.2

## Warning: package 'infer' was built under R version 4.4.2

## Warning: package 'modeldata' was built under R version 4.4.2

## Warning: package 'parsnip' was built under R version 4.4.2

## Warning: package 'tune' was built under R version 4.4.2

## Warning: package 'workflows' was built under R version 4.4.2

## Warning: package 'workflowsets' was built under R version 4.4.2

## Warning: package 'yardstick' was built under R version 4.4.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

# Seed the RNG once; the subsample, the split and the folds below all draw
# from it in this order, so the statements must not be reordered.
set.seed(3888)

# Continue with a random subsample of 100 rows
# (presumably to keep tuning fast -- TODO confirm).
data_clean <- sample_n(data_clean, size = 100)

# Train/test split stratified on Attrition.
data_split <- initial_split(data_clean, strata = Attrition)
data_train <- training(data_split)
data_test <- testing(data_split)

# Cross-validation folds built from the training set, stratified on Attrition.
data_cv <- rsample::vfold_cv(data_train, strata = Attrition)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [65/9]> Fold01
##  2 <split [66/8]> Fold02
##  3 <split [66/8]> Fold03
##  4 <split [67/7]> Fold04
##  5 <split [67/7]> Fold05
##  6 <split [67/7]> Fold06
##  7 <split [67/7]> Fold07
##  8 <split [67/7]> Fold08
##  9 <split [67/7]> Fold09
## 10 <split [67/7]> Fold10

Preprocess Data

library(themis)

## Warning: package 'themis' was built under R version 4.4.3

# Preprocessing recipe: Attrition is the outcome, all other columns start as
# predictors.
xgboost_rec <- recipes::recipe(Attrition ~ ., data = data_train) %>%
  # Keep EmployeeNumber in the data but take it out of the predictor set.
  update_role(EmployeeNumber, new_role = "ID") %>%
  # Dummy-encode the nominal predictors.
  step_dummy(all_nominal_predictors()) %>%
  # Normalize the numeric predictors; this runs after step_dummy(), so the
  # dummy columns are normalized as well.
  step_normalize(all_numeric_predictors()) %>%
  # Oversample the minority Attrition class with SMOTE.
  step_smote(Attrition)

# Inspect the processed training set. `bake(new_data = NULL)` is the
# replacement for the superseded `juice()` and returns the same data.
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()

## Rows: 122
## Columns: 64
## $ Age                               <dbl> -0.2203084, -0.4565811, 0.2522371, -…
## $ DailyRate                         <dbl> -0.47393105, 0.78433135, 1.40924019,…
## $ DistanceFromHome                  <dbl> 0.744733895, -0.001681115, -0.250486…
## $ EmployeeNumber                    <dbl> 1052, 1467, 1649, 648, 1645, 785, 18…
## $ HourlyRate                        <dbl> 0.75212518, -0.69936820, 0.35171321,…
## $ MonthlyIncome                     <dbl> -0.27239745, -0.73703203, -0.8649620…
## $ MonthlyRate                       <dbl> -0.7306622, -1.3824702, -1.3461073, …
## $ NumCompaniesWorked                <dbl> 1.7054970, -0.6757630, 0.1179904, -0…
## $ PercentSalaryHike                 <dbl> -0.22677788, 0.07834145, -0.22677788…
## $ TotalWorkingYears                 <dbl> -0.32113232, -1.13655653, -0.2046431…
## $ TrainingTimesLastYear             <dbl> -0.08376166, -2.14988264, -0.0837616…
## $ YearsAtCompany                    <dbl> -0.9425978, -0.7664558, -0.4141717, …
## $ YearsInCurrentRole                <dbl> -1.11217415, -0.51362225, -0.5136222…
## $ YearsSinceLastPromotion           <dbl> -0.6317950, 0.1956887, -0.6317950, -…
## $ YearsWithCurrManager              <dbl> -1.1477034, -0.4867692, -0.1563020, …
## $ Attrition                         <fct> Left, Left, Left, Left, Left, Left, …
## $ BusinessTravel_Travel_Frequently  <dbl> -0.563103, -0.563103, -0.563103, 1.7…
## $ BusinessTravel_Travel_Rarely      <dbl> 0.709443, 0.709443, 0.709443, -1.390…
## $ Department_Research...Development <dbl> 0.8910675, -1.1070839, 0.8910675, -1…
## $ Department_Sales                  <dbl> -0.6669978, -0.6669978, -0.6669978, …
## $ Education_X2                      <dbl> -0.4585137, -0.4585137, -0.4585137, …
## $ Education_X3                      <dbl> 1.3495206, -0.7309903, 1.3495206, -0…
## $ Education_X4                      <dbl> -0.6460338, 1.5269890, -0.6460338, 1…
## $ Education_X5                      <dbl> -0.2673659, -0.2673659, -0.2673659, …
## $ EducationField_Life.Sciences      <dbl> -0.8433196, -0.8433196, 1.1697659, 1…
## $ EducationField_Marketing          <dbl> -0.4585137, -0.4585137, -0.4585137, …
## $ EducationField_Medical            <dbl> -0.6251976, -0.6251976, -0.6251976, …
## $ EducationField_Other              <dbl> 4.831857, -0.204163, -0.204163, -0.2…
## $ EducationField_Technical.Degree   <dbl> -0.2374251, 4.1549384, -0.2374251, -…
## $ EnvironmentSatisfaction_X2        <dbl> -0.583769, -0.583769, -0.583769, 1.6…
## $ EnvironmentSatisfaction_X3        <dbl> -0.5424161, -0.5424161, -0.5424161, …
## $ EnvironmentSatisfaction_X4        <dbl> -0.583769, -0.583769, -0.583769, -0.…
## $ Gender_Male                       <dbl> 0.8201246, -1.2028495, 0.8201246, 0.…
## $ JobInvolvement_X2                 <dbl> -0.6044546, -0.6044546, -0.6044546, …
## $ JobInvolvement_X3                 <dbl> 0.8433196, 0.8433196, 0.8433196, -1.…
## $ JobInvolvement_X4                 <dbl> -0.2673659, -0.2673659, -0.2673659, …
## $ JobLevel_X2                       <dbl> 1.390508, -0.709443, -0.709443, -0.7…
## $ JobLevel_X3                       <dbl> -0.479771, -0.479771, -0.479771, -0.…
## $ JobLevel_X4                       <dbl> -0.2673659, -0.2673659, -0.2673659, …
## $ JobLevel_X5                       <dbl> -0.1655367, -0.1655367, -0.1655367, …
## $ JobRole_Human.Resources           <dbl> -0.3457949, 2.8528080, -0.3457949, -…
## $ JobRole_Laboratory.Technician     <dbl> 2.3769470, -0.4150225, 2.3769470, -0…
## $ JobRole_Manager                   <dbl> -0.2673659, -0.2673659, -0.2673659, …
## $ JobRole_Manufacturing.Director    <dbl> -0.3457949, -0.3457949, -0.3457949, …
## $ JobRole_Research.Director         <dbl> -0.204163, -0.204163, -0.204163, -0.…
## $ JobRole_Research.Scientist        <dbl> -0.479771, -0.479771, -0.479771, -0.…
## $ JobRole_Sales.Executive           <dbl> -0.583769, -0.583769, -0.583769, -0.…
## $ JobRole_Sales.Representative      <dbl> -0.1655367, -0.1655367, -0.1655367, …
## $ JobSatisfaction_X2                <dbl> -0.4150225, -0.4150225, -0.4150225, …
## $ JobSatisfaction_X3                <dbl> 1.969817, 1.969817, -0.500801, -0.50…
## $ JobSatisfaction_X4                <dbl> -0.7973304, -0.7973304, -0.7973304, …
## $ MaritalStatus_Married             <dbl> -0.9409083, 1.0484407, -0.9409083, 1…
## $ MaritalStatus_Single              <dbl> -0.5424161, -0.5424161, 1.8186894, -…
## $ OverTime_Yes                      <dbl> -0.583769, -0.583769, 1.689858, -0.5…
## $ PerformanceRating_X4              <dbl> -0.3695814, -0.3695814, -0.3695814, …
## $ RelationshipSatisfaction_X2       <dbl> 2.3769470, -0.4150225, 2.3769470, -0…
## $ RelationshipSatisfaction_X3       <dbl> -0.7527981, -0.7527981, -0.7527981, …
## $ RelationshipSatisfaction_X4       <dbl> -0.7527981, 1.3104263, -0.7527981, -…
## $ StockOptionLevel_X1               <dbl> 0.9409083, -1.0484407, -1.0484407, 0…
## $ StockOptionLevel_X2               <dbl> -0.3210386, -0.3210386, -0.3210386, …
## $ StockOptionLevel_X3               <dbl> -0.2673659, -0.2673659, -0.2673659, …
## $ WorkLifeBalance_X2                <dbl> 1.4335900, -0.6881232, -0.6881232, -…
## $ WorkLifeBalance_X3                <dbl> -1.0484407, 0.9409083, -1.0484407, -…
## $ WorkLifeBalance_X4                <dbl> -0.3210386, -0.3210386, -0.3210386, …

Specify Model

# Boosted-tree classifier on the xgboost engine; the number of trees and the
# tree depth are left as tuning placeholders.
xgboost_spec <- boost_tree(
  mode = "classification",
  engine = "xgboost",
  trees = tune(),
  tree_depth = tune()
)

# Bundle the preprocessing recipe and the model specification.
xgboost_workflow <- workflow(
  preprocessor = xgboost_rec,
  spec = xgboost_spec
)

Tune Hyperparameters

# Register a foreach parallel backend via doParallel.
doParallel::registerDoParallel()

# Tune `trees` and `tree_depth` over the cross-validation folds.
# NOTE: a regular grid (`tree_grid`) used to be built here but was never passed
# to tune_grid(); it was dead code and has been removed. The integer `grid = 5`
# asks tune to generate 5 candidate parameter combinations itself, which is
# what the reported metrics are based on.
set.seed(65743)
xgboost_tune <-
  tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = 5,
    # Keep the out-of-fold predictions so ROC curves can be drawn later.
    control = control_grid(save_pred = TRUE)
  )

Model Evaluation

Identify optimal values for hyperparameters

# Resampled performance metrics for each tuned candidate.
xgboost_tune %>%
  collect_metrics()

## # A tibble: 15 × 8
##    trees tree_depth .metric     .estimator  mean     n std_err .config          
##    <int>      <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>            
##  1  1741          3 accuracy    binary     0.707    10  0.0389 Preprocessor1_Mo…
##  2  1741          3 brier_class binary     0.215    10  0.0228 Preprocessor1_Mo…
##  3  1741          3 roc_auc     binary     0.567    10  0.0793 Preprocessor1_Mo…
##  4   885          5 accuracy    binary     0.709    10  0.0413 Preprocessor1_Mo…
##  5   885          5 brier_class binary     0.223    10  0.0242 Preprocessor1_Mo…
##  6   885          5 roc_auc     binary     0.533    10  0.0778 Preprocessor1_Mo…
##  7   325          7 accuracy    binary     0.709    10  0.0413 Preprocessor1_Mo…
##  8   325          7 brier_class binary     0.226    10  0.0245 Preprocessor1_Mo…
##  9   325          7 roc_auc     binary     0.542    10  0.0758 Preprocessor1_Mo…
## 10  1312         12 accuracy    binary     0.709    10  0.0413 Preprocessor1_Mo…
## 11  1312         12 brier_class binary     0.225    10  0.0245 Preprocessor1_Mo…
## 12  1312         12 roc_auc     binary     0.542    10  0.0758 Preprocessor1_Mo…
## 13   555         15 accuracy    binary     0.709    10  0.0413 Preprocessor1_Mo…
## 14   555         15 brier_class binary     0.225    10  0.0245 Preprocessor1_Mo…
## 15   555         15 roc_auc     binary     0.542    10  0.0758 Preprocessor1_Mo…

# ROC curves from the saved out-of-fold predictions, one curve per fold id.
autoplot(
  roc_curve(
    group_by(collect_predictions(xgboost_tune), id),
    Attrition,
    .pred_Left
  )
)

Fit the model for the last time

# Pick the candidate with the best resampled accuracy.
best_by_accuracy <- select_best(xgboost_tune, metric = "accuracy")

# Plug those values into the workflow, fit on the training set and evaluate
# once on the test set of `data_split`.
xgboost_last <- last_fit(
  finalize_workflow(xgboost_workflow, best_by_accuracy),
  split = data_split
)

## Warning: package 'xgboost' was built under R version 4.4.2

# Test-set metrics of the final fit.
xgboost_last %>%
  collect_metrics()

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.731 Preprocessor1_Model1
## 2 roc_auc     binary         0.524 Preprocessor1_Model1
## 3 brier_class binary         0.236 Preprocessor1_Model1

# Confusion matrix of the test-set predictions, drawn as a plot.
autoplot(
  yardstick::conf_mat(
    collect_predictions(xgboost_last),
    Attrition,
    .pred_class
  )
)

Variable importance

library(vip)

## Warning: package 'vip' was built under R version 4.4.3

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

# Variable-importance plot for the underlying engine fit of the final model.
vip(workflows::extract_fit_engine(xgboost_last))

Conclusion

The previous model had an accuracy of 0.731 and an AUC of 0.419.

Feature transformation: normalized numeric data. Accuracy was unchanged, but the AUC improved and is now 0.524.
Feature transformation: Yeo-Johnson transformation. Accuracy changed to 0.654 and the AUC to 0.514, neither of which was an improvement.
Feature transformation: PCA. No improvement.

CodeAlong5

Paige Biester

2025-03-05

Import Data

Explore Data

Explore Data

Model Building

Split Data

Preprocess Data

Specify Model

Tune Hyperparameters

Model Evaluation

Identify optimal values for hyperparameters

Fit the model for the last time

Variable importance

Conclusion