library(tidyverse)
library(tidyquant)

# Import the raw IBM HR attrition data set from the project's data folder.
attrition <- readr::read_csv(
    file = "../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)

# Print a per-column summary (missingness, ranges, inline histograms).
attrition %>% skimr::skim()
Data summary
Name Piped data
Number of rows 1470
Number of columns 35
_______________________
Column type frequency:
character 9
numeric 26
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Attrition 0 1 2 3 0 2 0
BusinessTravel 0 1 10 17 0 3 0
Department 0 1 5 22 0 3 0
EducationField 0 1 5 16 0 6 0
Gender 0 1 4 6 0 2 0
JobRole 0 1 7 25 0 9 0
MaritalStatus 0 1 6 8 0 3 0
Over18 0 1 1 1 0 1 0
OverTime 0 1 2 3 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1 36.92 9.14 18 30.00 36.0 43.00 60 ▂▇▇▃▂
DailyRate 0 1 802.49 403.51 102 465.00 802.0 1157.00 1499 ▇▇▇▇▇
DistanceFromHome 0 1 9.19 8.11 1 2.00 7.0 14.00 29 ▇▅▂▂▂
Education 0 1 2.91 1.02 1 2.00 3.0 4.00 5 ▂▃▇▆▁
EmployeeCount 0 1 1.00 0.00 1 1.00 1.0 1.00 1 ▁▁▇▁▁
EmployeeNumber 0 1 1024.87 602.02 1 491.25 1020.5 1555.75 2068 ▇▇▇▇▇
EnvironmentSatisfaction 0 1 2.72 1.09 1 2.00 3.0 4.00 4 ▅▅▁▇▇
HourlyRate 0 1 65.89 20.33 30 48.00 66.0 83.75 100 ▇▇▇▇▇
JobInvolvement 0 1 2.73 0.71 1 2.00 3.0 3.00 4 ▁▃▁▇▁
JobLevel 0 1 2.06 1.11 1 1.00 2.0 3.00 5 ▇▇▃▂▁
JobSatisfaction 0 1 2.73 1.10 1 2.00 3.0 4.00 4 ▅▅▁▇▇
MonthlyIncome 0 1 6502.93 4707.96 1009 2911.00 4919.0 8379.00 19999 ▇▅▂▁▂
MonthlyRate 0 1 14313.10 7117.79 2094 8047.00 14235.5 20461.50 26999 ▇▇▇▇▇
NumCompaniesWorked 0 1 2.69 2.50 0 1.00 2.0 4.00 9 ▇▃▂▂▁
PercentSalaryHike 0 1 15.21 3.66 11 12.00 14.0 18.00 25 ▇▅▃▂▁
PerformanceRating 0 1 3.15 0.36 3 3.00 3.0 3.00 4 ▇▁▁▁▂
RelationshipSatisfaction 0 1 2.71 1.08 1 2.00 3.0 4.00 4 ▅▅▁▇▇
StandardHours 0 1 80.00 0.00 80 80.00 80.0 80.00 80 ▁▁▇▁▁
StockOptionLevel 0 1 0.79 0.85 0 0.00 1.0 1.00 3 ▇▇▁▂▁
TotalWorkingYears 0 1 11.28 7.78 0 6.00 10.0 15.00 40 ▇▇▂▁▁
TrainingTimesLastYear 0 1 2.80 1.29 0 2.00 3.0 3.00 6 ▂▇▇▂▃
WorkLifeBalance 0 1 2.76 0.71 1 2.00 3.0 3.00 4 ▁▃▁▇▂
YearsAtCompany 0 1 7.01 6.13 0 3.00 5.0 9.00 40 ▇▂▁▁▁
YearsInCurrentRole 0 1 4.23 3.62 0 2.00 3.0 7.00 18 ▇▃▂▁▁
YearsSinceLastPromotion 0 1 2.19 3.22 0 0.00 1.0 3.00 15 ▇▁▁▁▁
YearsWithCurrManager 0 1 4.12 3.57 0 2.00 3.0 7.00 17 ▇▂▅▁▁

Clean Data

Notes about the data

  • Zero variance variables - Over18, EmployeeCount, StandardHours: Remove
  • Character variables: Convert to factor
  • Some factors imported as numeric - Education, JobLevel, StockOptionLevel: Convert to factor
  • Imbalanced target variable: Rebalance with step_smote(Attrition)
# Build the cleaned analysis table from the raw import.
data <- attrition %>%
    # Drop the three constant columns; they carry no information.
    select(-c(Over18, EmployeeCount, StandardHours)) %>%
    # Recode categorical information as factors:
    #   * every text column, and
    #   * the categorical variables that were read in as numeric codes.
    mutate(
        across(where(is.character), factor),
        across(c(Education, JobLevel, StockOptionLevel), factor)
    )

Explore Data

***Identify variables that are correlated with the target variable.***

* Attrition

# Class balance of the target: number of rows per Attrition level.
count(data, Attrition)
## # A tibble: 2 × 2
##   Attrition     n
##   <fct>     <int>
## 1 No         1233
## 2 Yes         237

* Income

Monthly income looks like the best predictor among income variables.

# One box plot per pay-related variable, split by attrition status, so the
# four candidate income predictors can be compared against each other.

# Monthly income
ggplot(data, aes(x = Attrition, y = MonthlyIncome)) +
    geom_boxplot()

# Monthly rate
ggplot(data, aes(x = Attrition, y = MonthlyRate)) +
    geom_boxplot()

# Daily rate
ggplot(data, aes(x = Attrition, y = DailyRate)) +
    geom_boxplot()

# Hourly rate
ggplot(data, aes(x = Attrition, y = HourlyRate)) +
    geom_boxplot()

* Distance from home

# Commute distance distribution for leavers versus stayers.
ggplot(data, aes(x = Attrition, y = DistanceFromHome)) +
    geom_boxplot()

* Work Life Balance

# Heat map of head counts: one tile per Attrition x WorkLifeBalance
# combination, shaded by the number of employees (n) in that cell.
ggplot(
    count(data, Attrition, WorkLifeBalance),
    aes(x = Attrition, y = WorkLifeBalance, fill = n)
) +
    geom_tile()

# Continue with a random 100-row subsample of the cleaned data
# (presumably to keep the resampling and model fitting below fast).
# Seed the RNG first so the same rows are drawn on every run; without it
# every downstream result changes between renders. slice_sample() is the
# current dplyr replacement for the superseded sample_n().
set.seed(1234)
data <- slice_sample(data, n = 100)

library(tidymodels)

# Hold out a test set. The split is stratified on Attrition so both parts
# keep a similar class ratio; the seed makes the partition repeatable.
set.seed(123)
data_split <- data %>% initial_split(strata = Attrition)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()

# Stratified bootstrap resamples of the training set, seeded so the same
# resamples are regenerated on every run.
set.seed(234)
data_folds <- data_train %>% bootstraps(strata = Attrition)

# Show the resample listing.
data_folds
## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits          id         
##    <list>          <chr>      
##  1 <split [74/28]> Bootstrap01
##  2 <split [74/30]> Bootstrap02
##  3 <split [74/26]> Bootstrap03
##  4 <split [74/26]> Bootstrap04
##  5 <split [74/27]> Bootstrap05
##  6 <split [74/26]> Bootstrap06
##  7 <split [74/29]> Bootstrap07
##  8 <split [74/34]> Bootstrap08
##  9 <split [74/27]> Bootstrap09
## 10 <split [74/29]> Bootstrap10
## # ℹ 15 more rows
library(themis)

# Preprocessing recipe: predict Attrition from all remaining columns.
#
# NOTE(review): EmployeeNumber looks like a row identifier rather than a real
# predictor (the data summary shows values spanning 1-2068) -- consider
# update_role(EmployeeNumber, new_role = "id"); confirm before modelling.
data_rec <- 
    recipe(Attrition ~ ., data = data_train) %>%
    # Expand every factor predictor into 0/1 indicator columns.
    step_dummy(all_nominal_predictors()) %>%
    # Drop predictors that are constant in the data the recipe is prepped on.
    # The training set is small and some factor levels are rare, so a
    # resample can miss a level entirely; normalising such a constant column
    # would divide by a zero standard deviation and hand NaN to SMOTE.
    step_zv(all_predictors()) %>%
    # Centre and scale all numeric predictors (including the indicators).
    step_normalize(all_numeric_predictors()) %>%
    # Oversample the minority Attrition class with synthetic SMOTE rows.
    step_smote(Attrition)

# Estimate the recipe on the training data and inspect the processed result.
# bake(new_data = NULL) returns the processed training set and is the current
# replacement for the superseded juice().
data_rec %>% prep() %>% bake(new_data = NULL) %>% glimpse()
## Rows: 122
## Columns: 54
## $ Age                               <dbl> -1.5368207, 1.1735999, -1.0850840, 0…
## $ DailyRate                         <dbl> 0.65716498, 1.03008808, -0.12234789,…
## $ DistanceFromHome                  <dbl> 0.19091820, -0.95073408, -0.95073408…
## $ EmployeeNumber                    <dbl> -0.32114650, -0.55762847, 0.43623497…
## $ EnvironmentSatisfaction           <dbl> 0.2510663, 1.1800115, 1.1800115, 0.2…
## $ HourlyRate                        <dbl> -0.2217389, -1.1514090, -0.9654749, …
## $ JobInvolvement                    <dbl> -1.144000, -1.144000, 0.341193, 0.34…
## $ JobSatisfaction                   <dbl> -0.5096229, -1.4294302, 0.4101843, 0…
## $ MonthlyIncome                     <dbl> -0.788043354, 1.879046788, -0.901375…
## $ MonthlyRate                       <dbl> 1.638378424, 0.491854588, -1.0032907…
## $ NumCompaniesWorked                <dbl> -0.72227243, 1.60156060, -1.10957793…
## $ PercentSalaryHike                 <dbl> -1.1410380, -1.1410380, -1.1410380, …
## $ PerformanceRating                 <dbl> -0.4369587, -0.4369587, -0.4369587, …
## $ RelationshipSatisfaction          <dbl> 0.3767387, -1.3128774, 0.3767387, -0…
## $ TotalWorkingYears                 <dbl> -1.36965471, 1.26978406, -0.84176696…
## $ TrainingTimesLastYear             <dbl> 1.188705, 0.350951, -0.486803, 2.026…
## $ WorkLifeBalance                   <dbl> 0.2798988, -2.6790313, -1.1995663, 0…
## $ YearsAtCompany                    <dbl> -1.06903964, -0.71900011, -0.5439803…
## $ YearsInCurrentRole                <dbl> -1.2942719, -0.7102712, -0.7102712, …
## $ YearsSinceLastPromotion           <dbl> -0.6926947, -0.6926947, -0.6926947, …
## $ YearsWithCurrManager              <dbl> -1.2785445, -0.6998761, -0.6998761, …
## $ BusinessTravel_Travel_Frequently  <dbl> -0.4369587, -0.4369587, 2.2576197, -…
## $ BusinessTravel_Travel_Rarely      <dbl> -1.4335900, 0.6881232, -1.4335900, -…
## $ Department_Research...Development <dbl> 0.583769, 0.583769, 0.583769, 0.5837…
## $ Department_Sales                  <dbl> -0.5424161, -0.5424161, -0.5424161, …
## $ Education_X2                      <dbl> -0.4369587, -0.4369587, -0.4369587, …
## $ Education_X3                      <dbl> 1.2372368, -0.7973304, -0.7973304, -…
## $ Education_X4                      <dbl> -0.6669978, 1.4789951, 1.4789951, -0…
## $ Education_X5                      <dbl> -0.1162476, -0.1162476, -0.1162476, …
## $ EducationField_Life.Sciences      <dbl> 1.1378768, 1.1378768, -0.8669537, -0…
## $ EducationField_Marketing          <dbl> -0.2950304, -0.2950304, -0.2950304, …
## $ EducationField_Medical            <dbl> -0.6881232, -0.6881232, 1.4335900, 1…
## $ EducationField_Other              <dbl> -0.2374251, -0.2374251, -0.2374251, …
## $ EducationField_Technical.Degree   <dbl> -0.3210386, -0.3210386, -0.3210386, …
## $ Gender_Male                       <dbl> 0.9409083, -1.0484407, 0.9409083, -1…
## $ JobLevel_X2                       <dbl> -0.8201246, -0.8201246, -0.8201246, …
## $ JobLevel_X3                       <dbl> -0.4150225, -0.4150225, -0.4150225, …
## $ JobLevel_X4                       <dbl> -0.2673659, 3.6896495, -0.2673659, -…
## $ JobLevel_X5                       <dbl> -0.2374251, -0.2374251, -0.2374251, …
## $ JobRole_Human.Resources           <dbl> -0.1655367, -0.1655367, -0.1655367, …
## $ JobRole_Laboratory.Technician     <dbl> 2.1514874, -0.4585137, 2.1514874, -0…
## $ JobRole_Manager                   <dbl> -0.2673659, 3.6896495, -0.2673659, -…
## $ JobRole_Manufacturing.Director    <dbl> -0.3926048, -0.3926048, -0.3926048, …
## $ JobRole_Research.Director         <dbl> -0.2374251, -0.2374251, -0.2374251, …
## $ JobRole_Research.Scientist        <dbl> -0.5424161, -0.5424161, -0.5424161, …
## $ JobRole_Sales.Executive           <dbl> -0.500801, -0.500801, -0.500801, -0.…
## $ JobRole_Sales.Representative      <dbl> -0.1655367, -0.1655367, -0.1655367, …
## $ MaritalStatus_Married             <dbl> -0.9157038, 1.0772986, -0.9157038, -…
## $ MaritalStatus_Single              <dbl> -0.6669978, -0.6669978, 1.4789951, -…
## $ OverTime_Yes                      <dbl> -0.7309903, -0.7309903, 1.3495206, -…
## $ StockOptionLevel_X1               <dbl> -0.6881232, 1.4335900, -0.6881232, 1…
## $ StockOptionLevel_X2               <dbl> -0.3926048, -0.3926048, -0.3926048, …
## $ StockOptionLevel_X3               <dbl> 3.0727976, -0.3210386, -0.3210386, -…
## $ Attrition                         <fct> No, No, No, No, No, No, No, No, No, …

Build a Model