library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>

Import data

data <- read_csv("../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore data

skimr::skim(data)

Data summary
Name	data
Number of rows	1470
Number of columns	35
_______________________
Column type frequency:
character	9
numeric	26
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Attrition	1	2	3	2
BusinessTravel	1	10	17	3
Department	1	5	22	3
EducationField	1	5	16	6
Gender	1	4	6	2
JobRole	1	7	25	9
MaritalStatus	1	6	8	3
Over18	1	1	1	1
OverTime	1	2	3	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	1	36.92	9.14	18	30.00	36.0	43.00	60	▂▇▇▃▂
DailyRate	1	802.49	403.51	102	465.00	802.0	1157.00	1499	▇▇▇▇▇
DistanceFromHome	1	9.19	8.11	1	2.00	7.0	14.00	29	▇▅▂▂▂
Education	1	2.91	1.02	1	2.00	3.0	4.00	5	▂▃▇▆▁
EmployeeCount	1	1.00	0.00	1	1.00	1.0	1.00	1	▁▁▇▁▁
EmployeeNumber	1	1024.87	602.02	1	491.25	1020.5	1555.75	2068	▇▇▇▇▇
EnvironmentSatisfaction	1	2.72	1.09	1	2.00	3.0	4.00	4	▅▅▁▇▇
HourlyRate	1	65.89	20.33	30	48.00	66.0	83.75	100	▇▇▇▇▇
JobInvolvement	1	2.73	0.71	1	2.00	3.0	3.00	4	▁▃▁▇▁
JobLevel	1	2.06	1.11	1	1.00	2.0	3.00	5	▇▇▃▂▁
JobSatisfaction	1	2.73	1.10	1	2.00	3.0	4.00	4	▅▅▁▇▇
MonthlyIncome	1	6502.93	4707.96	1009	2911.00	4919.0	8379.00	19999	▇▅▂▁▂
MonthlyRate	1	14313.10	7117.79	2094	8047.00	14235.5	20461.50	26999	▇▇▇▇▇
NumCompaniesWorked	1	2.69	2.50	0	1.00	2.0	4.00	9	▇▃▂▂▁
PercentSalaryHike	1	15.21	3.66	11	12.00	14.0	18.00	25	▇▅▃▂▁
PerformanceRating	1	3.15	0.36	3	3.00	3.0	3.00	4	▇▁▁▁▂
RelationshipSatisfaction	1	2.71	1.08	1	2.00	3.0	4.00	4	▅▅▁▇▇
StandardHours	1	80.00	0.00	80	80.00	80.0	80.00	80	▁▁▇▁▁
StockOptionLevel	1	0.79	0.85	0	0.00	1.0	1.00	3	▇▇▁▂▁
TotalWorkingYears	1	11.28	7.78	0	6.00	10.0	15.00	40	▇▇▂▁▁
TrainingTimesLastYear	1	2.80	1.29	0	2.00	3.0	3.00	6	▂▇▇▂▃
WorkLifeBalance	1	2.76	0.71	1	2.00	3.0	3.00	4	▁▃▁▇▂
YearsAtCompany	1	7.01	6.13	0	3.00	5.0	9.00	40	▇▂▁▁▁
YearsInCurrentRole	1	4.23	3.62	0	2.00	3.0	7.00	18	▇▃▂▁▁
YearsSinceLastPromotion	1	2.19	3.22	0	0.00	1.0	3.00	15	▇▁▁▁▁
YearsWithCurrManager	1	4.12	3.57	0	2.00	3.0	7.00	17	▇▂▅▁▁

issues with data missing values factors or numeric variables Education, EnvironmentSatisfaction, JobInvolvement, JobSatisfaction, PerformanceRating, RelationshipSatisfaction, WorklifeBalance Zero Variance variables Over18, EmployCount, StandardHours Character variables Convert to numbers in recipe step Unbalanced target variables attrition id variable EmployeeNumber

factors_vec <- data %>% select( Education, EnvironmentSatisfaction, JobInvolvement, JobSatisfaction, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance) %>% names()

data_clean <- data %>%
    # mutate(Education = Education %>% as.factor(), EnvironmentSatisfaction = EnvironmentSatisfaction %>% as.factor, JobInvolvement = JobInvolvement %>% as.factor(), JobSatisfaction = JobSatisfaction %>% as.factor(), PerformanceRating = PerformanceRating %>% as.factor(), RelationshipSatisfaction = RelationshipSatisfaction %>% as.factor(), WorkLifeBalance = WorkLifeBalance %>% as.factor()) %>%
   # address factors imported as numeric
      mutate(across(all_of(factors_vec), as.factor)) %>%
    # drop the zero variance variables
    select(-c(Over18, EmployeeCount, StandardHours))

Explore data

data_clean %>% count(Attrition)

## # A tibble: 2 × 2
##   Attrition     n
##   <chr>     <int>
## 1 No         1233
## 2 Yes         237

data_clean %>%
    ggplot(aes(Attrition)) +
    geom_bar()

attrition vs monthly income

data_clean %>%
    ggplot(aes(Attrition, MonthlyIncome)) +
    geom_boxplot()

correlation plot

# step 1: binarize
data_binarized <- data_clean %>%
    select(-EmployeeNumber) %>%
    binarize()

data_binarized %>% glimpse()

## Rows: 1,470
## Columns: 120
## $ `Age__-Inf_30`                       <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Age__30_36                           <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, …
## $ Age__36_43                           <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ Age__43_Inf                          <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ Attrition__No                        <dbl> 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Attrition__Yes                       <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `BusinessTravel__Non-Travel`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BusinessTravel__Travel_Frequently    <dbl> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ BusinessTravel__Travel_Rarely        <dbl> 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ `DailyRate__-Inf_465`                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ DailyRate__465_802                   <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ DailyRate__802_1157                  <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ DailyRate__1157_Inf                  <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, …
## $ Department__Human_Resources          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Department__Research_&_Development` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Department__Sales                    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `DistanceFromHome__-Inf_2`           <dbl> 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__2_7                <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ DistanceFromHome__7_14               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ DistanceFromHome__14_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ Education__1                         <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ Education__2                         <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ Education__3                         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, …
## $ Education__4                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ Education__5                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Human_Resources      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Life_Sciences        <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ EducationField__Marketing            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Medical              <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, …
## $ EducationField__Other                <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField__Technical_Degree     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__1           <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ EnvironmentSatisfaction__2           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction__3           <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, …
## $ EnvironmentSatisfaction__4           <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ Gender__Female                       <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ Gender__Male                         <dbl> 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ `HourlyRate__-Inf_48`                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ HourlyRate__48_66                    <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ HourlyRate__66_83.75                 <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ HourlyRate__83.75_Inf                <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ JobInvolvement__1                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobInvolvement__2                    <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobInvolvement__3                    <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ JobInvolvement__4                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ JobLevel__1                          <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ JobLevel__2                          <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobLevel__3                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobLevel__4                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobLevel__5                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Healthcare_Representative   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ JobRole__Human_Resources             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Laboratory_Technician       <dbl> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, …
## $ JobRole__Manager                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Manufacturing_Director      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobRole__Research_Director           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Research_Scientist          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Executive             <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole__Sales_Representative        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobSatisfaction__1                   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ JobSatisfaction__2                   <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ JobSatisfaction__3                   <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, …
## $ JobSatisfaction__4                   <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ MaritalStatus__Divorced              <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ MaritalStatus__Married               <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, …
## $ MaritalStatus__Single                <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ `MonthlyIncome__-Inf_2911`           <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ MonthlyIncome__2911_4919             <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ MonthlyIncome__4919_8379             <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ MonthlyIncome__8379_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `MonthlyRate__-Inf_8047`             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ MonthlyRate__8047_14235.5            <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
## $ MonthlyRate__14235.5_20461.5         <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, …
## $ MonthlyRate__20461.5_Inf             <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ `NumCompaniesWorked__-Inf_1`         <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ NumCompaniesWorked__1_2              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ NumCompaniesWorked__2_4              <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ NumCompaniesWorked__4_Inf            <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ OverTime__No                         <dbl> 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ OverTime__Yes                        <dbl> 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ `PercentSalaryHike__-Inf_12`         <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__12_14             <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, …
## $ PercentSalaryHike__14_18             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ PercentSalaryHike__18_Inf            <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ PerformanceRating__3                 <dbl> 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, …
## $ PerformanceRating__4                 <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ RelationshipSatisfaction__1          <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ RelationshipSatisfaction__2          <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, …
## $ RelationshipSatisfaction__3          <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, …
## $ RelationshipSatisfaction__4          <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ StockOptionLevel__0                  <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ StockOptionLevel__1                  <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ StockOptionLevel__2                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ StockOptionLevel__3                  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `TotalWorkingYears__-Inf_6`          <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ TotalWorkingYears__6_10              <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ TotalWorkingYears__10_15             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ TotalWorkingYears__15_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `TrainingTimesLastYear__-Inf_2`      <dbl> 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, …
## $ TrainingTimesLastYear__2_3           <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, …
## $ TrainingTimesLastYear__3_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ WorkLifeBalance__1                   <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WorkLifeBalance__2                   <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
## $ WorkLifeBalance__3                   <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, …
## $ WorkLifeBalance__4                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsAtCompany__-Inf_3`             <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsAtCompany__3_5                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsAtCompany__5_9                  <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, …
## $ YearsAtCompany__9_Inf                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsInCurrentRole__-Inf_2`         <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsInCurrentRole__2_3              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ YearsInCurrentRole__3_7              <dbl> 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, …
## $ YearsInCurrentRole__7_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `YearsSinceLastPromotion__-Inf_1`    <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, …
## $ YearsSinceLastPromotion__1_3         <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ YearsSinceLastPromotion__3_Inf       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `YearsWithCurrManager__-Inf_2`       <dbl> 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, …
## $ YearsWithCurrManager__2_3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ YearsWithCurrManager__3_7            <dbl> 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ YearsWithCurrManager__7_Inf          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …

# step 2: correlation
data_correlation <- data_binarized %>%
    correlate(Attrition__Yes)

data_correlation

## # A tibble: 120 × 3
##    feature           bin       correlation
##    <fct>             <chr>           <dbl>
##  1 Attrition         No             -1    
##  2 Attrition         Yes             1    
##  3 OverTime          Yes             0.246
##  4 OverTime          No             -0.246
##  5 JobLevel          1               0.213
##  6 MonthlyIncome     -Inf_2911       0.207
##  7 StockOptionLevel  0               0.195
##  8 YearsAtCompany    -Inf_3          0.183
##  9 MaritalStatus     Single          0.175
## 10 TotalWorkingYears -Inf_6          0.169
## # ℹ 110 more rows

data_correlation %>%
    correlationfunnel::plot_correlation_funnel()

## Warning: ggrepel: 72 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Split data

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.5     ✔ rsample      1.2.1
## ✔ dials        1.2.1     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0

## Warning: package 'modeldata' was built under R version 4.3.3

## Warning: package 'recipes' was built under R version 4.3.3

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org

set.seed(1234)
data_clean <- data_clean %>% sample_n(100)

data_split <- initial_split(data_clean, strata = Attrition)
data_train <- training(data_split)
data_test <- testing(data_split)


data_cv <- rsample::vfold_cv(data_train, strata = Attrition)

data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [66/8]> Fold01
##  2 <split [66/8]> Fold02
##  3 <split [66/8]> Fold03
##  4 <split [66/8]> Fold04
##  5 <split [67/7]> Fold05
##  6 <split [67/7]> Fold06
##  7 <split [67/7]> Fold07
##  8 <split [67/7]> Fold08
##  9 <split [67/7]> Fold09
## 10 <split [67/7]> Fold10

Preprocess data

library(themis)

xgboost_rec <- recipes::recipe(Attrition ~ ., data = data_train) %>%
    update_role(EmployeeNumber, new_role = "ID") %>%
    step_dummy(all_nominal_predictors()) %>%
    step_smote(Attrition)

xgboost_rec %>% prep() %>% juice() %>% glimpse()

## Rows: 134
## Columns: 59
## $ Age                               <dbl> 27, 39, 25, 48, 31, 28, 31, 49, 28, …
## $ DailyRate                         <dbl> 1377, 1462, 949, 715, 1222, 640, 329…
## $ DistanceFromHome                  <dbl> 11, 6, 1, 1, 11, 1, 1, 4, 4, 2, 6, 1…
## $ EmployeeNumber                    <dbl> 1434, 1588, 1415, 1263, 895, 1301, 5…
## $ HourlyRate                        <dbl> 91, 38, 81, 76, 48, 84, 98, 85, 43, …
## $ JobLevel                          <dbl> 1, 3, 1, 5, 1, 1, 1, 5, 2, 1, 1, 3, …
## $ MonthlyIncome                     <dbl> 2099, 8237, 3229, 18265, 2356, 2080,…
## $ MonthlyRate                       <dbl> 7679, 4658, 4910, 8733, 14871, 4732,…
## $ NumCompaniesWorked                <dbl> 0, 2, 4, 6, 3, 2, 1, 2, 1, 3, 0, 4, …
## $ PercentSalaryHike                 <dbl> 14, 11, 11, 12, 19, 11, 12, 13, 15, …
## $ StockOptionLevel                  <dbl> 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, …
## $ TotalWorkingYears                 <dbl> 6, 11, 7, 25, 8, 5, 4, 23, 5, 7, 16,…
## $ TrainingTimesLastYear             <dbl> 3, 3, 2, 3, 2, 2, 3, 2, 3, 3, 4, 2, …
## $ YearsAtCompany                    <dbl> 5, 7, 3, 1, 6, 3, 4, 1, 5, 3, 15, 11…
## $ YearsInCurrentRole                <dbl> 0, 6, 2, 0, 4, 2, 2, 0, 4, 2, 13, 7,…
## $ YearsSinceLastPromotion           <dbl> 1, 7, 0, 0, 0, 1, 3, 0, 0, 1, 10, 4,…
## $ YearsWithCurrManager              <dbl> 4, 6, 2, 0, 2, 2, 2, 0, 4, 2, 11, 8,…
## $ Attrition                         <fct> No, No, No, No, No, No, No, No, No, …
## $ BusinessTravel_Travel_Frequently  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BusinessTravel_Travel_Rarely      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, …
## $ Department_Research...Development <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, …
## $ Department_Sales                  <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ Education_X2                      <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ Education_X3                      <dbl> 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, …
## $ Education_X4                      <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, …
## $ Education_X5                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField_Life.Sciences      <dbl> 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, …
## $ EducationField_Marketing          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField_Medical            <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, …
## $ EducationField_Other              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ EducationField_Technical.Degree   <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ EnvironmentSatisfaction_X2        <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ EnvironmentSatisfaction_X3        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ EnvironmentSatisfaction_X4        <dbl> 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, …
## $ Gender_Male                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ JobInvolvement_X2                 <dbl> 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ JobInvolvement_X3                 <dbl> 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, …
## $ JobInvolvement_X4                 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole_Human.Resources           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole_Laboratory.Technician     <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ JobRole_Manager                   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ JobRole_Manufacturing.Director    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobRole_Research.Director         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ JobRole_Research.Scientist        <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ JobRole_Sales.Executive           <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ JobRole_Sales.Representative      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ JobSatisfaction_X2                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JobSatisfaction_X3                <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, …
## $ JobSatisfaction_X4                <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, …
## $ MaritalStatus_Married             <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, …
## $ MaritalStatus_Single              <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ OverTime_Yes                      <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, …
## $ PerformanceRating_X4              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ RelationshipSatisfaction_X2       <dbl> 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, …
## $ RelationshipSatisfaction_X3       <dbl> 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, …
## $ RelationshipSatisfaction_X4       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ WorkLifeBalance_X2                <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ WorkLifeBalance_X3                <dbl> 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, …
## $ WorkLifeBalance_X4                <dbl> 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, …

Specify model

xgboost_spec <- 
  boost_tree(trees = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec)

Tune hyperparameters

doParallel::registerDoParallel

## function (cl, cores = NULL, ...) 
## {
##     opts <- list(...)
##     optnames <- names(opts)
##     if (is.null(optnames)) 
##         optnames <- rep("", length(opts))
##     unnamed <- !nzchar(optnames)
##     if (any(unnamed)) {
##         warning("ignoring doParallel package option(s) specified with unnamed argument")
##         opts <- opts[!unnamed]
##         optnames <- optnames[!unnamed]
##     }
##     recog <- optnames %in% c("nocompile")
##     if (any(!recog)) {
##         warning(sprintf("ignoring unrecognized doParallel package option(s): %s", 
##             paste(optnames[!recog], collapse = ", ")), call. = FALSE)
##         opts <- opts[recog]
##         optnames <- optnames[recog]
##     }
##     old.optnames <- ls(.options, all.names = TRUE)
##     rm(list = old.optnames, pos = .options)
##     for (i in seq_along(opts)) {
##         assign(optnames[i], opts[[i]], pos = .options)
##     }
##     if (missing(cl) || is.numeric(cl)) {
##         if (.Platform$OS.type == "windows") {
##             if (!missing(cl) && is.numeric(cl)) {
##                 cl <- makeCluster(cl)
##             }
##             else {
##                 if (!missing(cores) && is.numeric(cores)) {
##                   cl <- makeCluster(cores)
##                 }
##                 else {
##                   cl <- makeCluster(3)
##                 }
##             }
##             assign(".revoDoParCluster", cl, pos = .options)
##             reg.finalizer(.options, function(e) {
##                 stopImplicitCluster()
##             }, onexit = TRUE)
##             setDoPar(doParallelSNOW, cl, snowinfo)
##         }
##         else {
##             if (!missing(cl) && is.numeric(cl)) {
##                 cores <- cl
##             }
##             setDoPar(doParallelMC, cores, mcinfo)
##         }
##     }
##     else {
##         setDoPar(doParallelSNOW, cl, snowinfo)
##     }
## }
## <bytecode: 0x13e5e7af8>
## <environment: namespace:doParallel>

set.seed(13500)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = data_cv,
            grid = 5)

## Warning: package 'xgboost' was built under R version 4.3.3

## → A | warning: No control observations were detected in `truth` with control level 'Yes'.

## 
There were issues with some computations   A: x1

                                                 
→ B | error:   Error in `step_smote()`:
##                Caused by error in `bake()`:
##                ! Not enough observations of 'Yes' to perform SMOTE.
## There were issues with some computations   A: x1

There were issues with some computations   A: x2   B: x1

There were issues with some computations   A: x3   B: x2

There were issues with some computations   A: x4   B: x2

There were issues with some computations   A: x5   B: x2

There were issues with some computations   A: x5   B: x2

CodeAlong6

Jordan Lanowy

2024-10-15

Import data

Explore data

Explore data

Model Building

Split data

Preprocess data

Specify model

Tune hyperparameters