All models are wrong, but some are useful. - George Box

0.0.1 Setup

library(tidyverse)
library(odbc)
library(DBI)
library(RSQLite)
library(tvthemes)
library(ggthemes)
library(scales)


dat_new <- read_csv("./input/WA_Fn-UseC_-HR-Employee-Attrition.csv")
dat_new <- dat_new %>%
  mutate_if(is.character, as_factor) %>%
  mutate(
    EnvironmentSatisfaction = factor(EnvironmentSatisfaction, ordered = TRUE),
    StockOptionLevel = factor(StockOptionLevel, ordered = TRUE),
    JobLevel = factor(JobLevel, ordered = TRUE),
    JobInvolvement = factor(JobInvolvement, ordered = TRUE)
  ) %>%
  select(EmployeeNumber, Attrition, everything())
my_skim <- skimr::skim_with(numeric = skimr::sfl(p25 = NULL, p50 = NULL, p75 = NULL, hist = NULL))
my_skim(dat_new)
Data summary
Name dat_new
Number of rows 1470
Number of columns 35
_______________________
Column type frequency:
factor 13
numeric 22
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Attrition 0 1 FALSE 2 No: 1233, Yes: 237
BusinessTravel 0 1 FALSE 3 Tra: 1043, Tra: 277, Non: 150
Department 0 1 FALSE 3 Res: 961, Sal: 446, Hum: 63
EducationField 0 1 FALSE 6 Lif: 606, Med: 464, Mar: 159, Tec: 132
EnvironmentSatisfaction 0 1 TRUE 4 3: 453, 4: 446, 2: 287, 1: 284
Gender 0 1 FALSE 2 Mal: 882, Fem: 588
JobInvolvement 0 1 TRUE 4 3: 868, 2: 375, 4: 144, 1: 83
JobLevel 0 1 TRUE 5 1: 543, 2: 534, 3: 218, 4: 106
JobRole 0 1 FALSE 9 Sal: 326, Res: 292, Lab: 259, Man: 145
MaritalStatus 0 1 FALSE 3 Mar: 673, Sin: 470, Div: 327
Over18 0 1 FALSE 1 Y: 1470
OverTime 0 1 FALSE 2 No: 1054, Yes: 416
StockOptionLevel 0 1 TRUE 4 0: 631, 1: 596, 2: 158, 3: 85

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p100
EmployeeNumber 0 1 1024.87 602.02 1 2068
Age 0 1 36.92 9.14 18 60
DailyRate 0 1 802.49 403.51 102 1499
DistanceFromHome 0 1 9.19 8.11 1 29
Education 0 1 2.91 1.02 1 5
EmployeeCount 0 1 1.00 0.00 1 1
HourlyRate 0 1 65.89 20.33 30 100
JobSatisfaction 0 1 2.73 1.10 1 4
MonthlyIncome 0 1 6502.93 4707.96 1009 19999
MonthlyRate 0 1 14313.10 7117.79 2094 26999
NumCompaniesWorked 0 1 2.69 2.50 0 9
PercentSalaryHike 0 1 15.21 3.66 11 25
PerformanceRating 0 1 3.15 0.36 3 4
RelationshipSatisfaction 0 1 2.71 1.08 1 4
StandardHours 0 1 80.00 0.00 80 80
TotalWorkingYears 0 1 11.28 7.78 0 40
TrainingTimesLastYear 0 1 2.80 1.29 0 6
WorkLifeBalance 0 1 2.76 0.71 1 4
YearsAtCompany 0 1 7.01 6.13 0 40
YearsInCurrentRole 0 1 4.23 3.62 0 18
YearsSinceLastPromotion 0 1 2.19 3.22 0 15
YearsWithCurrManager 0 1 4.12 3.57 0 17

0.0.2 SETUP SQL Environment

con<-dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con,dat_new)

0.0.3 look at the first few rows of the data

  • to achieve this LIMIT n does the trick
dbGetQuery(con,'SELECT * 
                FROM dat_new 
                LIMIT 5')

0.0.4 How many employees were recorded

dbGetQuery(con,'SELECT COUNT(*) AS num_rows 
                FROM dat_new')

0.0.5 see if we have any missing employee numbers

dbGetQuery(con,'SELECT COUNT(*)-COUNT(EmployeeNumber) AS missing 
                FROM dat_new')

0.0.6 What was the highest ,lowest and average monthly salary at this company

dbGetQuery(con,"SELECT Attrition,
                    MAX(MonthlyIncome) AS max_income , 
                    MIN(MonthlyIncome) AS min_income , 
                    AVG(MonthlyIncome) AS avg_income
                FROM dat_new
                GROUP BY Attrition;")

0.0.7 visualise this distribution

dat_new |> 
  ggplot(aes(x=Attrition,y=MonthlyIncome ,fill=Attrition))+
  geom_boxplot(outlier.color = "blue")+
  tvthemes::scale_fill_avatar()

  • monthly salary for non-leavers is more spread out as compared to other category

0.0.8 test whether the difference in mean salary is statistically significant

t.test(MonthlyIncome~Attrition,data=dat_new,alternative="two.sided")
## 
##  Welch Two Sample t-test
## 
## data:  MonthlyIncome by Attrition
## t = -7.4826, df = 412.74, p-value = 4.434e-13
## alternative hypothesis: true difference in means between group Yes and group No is not equal to 0
## 95 percent confidence interval:
##  -2583.050 -1508.244
## sample estimates:
## mean in group Yes  mean in group No 
##          4787.093          6832.740

Comments

  • from the output of the test , t=-7.4826 represents the calculated difference in sample means in units of standard error and the negative sign suggest the first mean is smaller than the second
  • the p-value is less than 0.05 suggesting that the difference in means is statistically significant
  • the confidence interval also does not contain zero therefore providing further evidence that the difference in mean monthly income for each group is statistically significant

0.1

0.1.1 T.test for numeric variables by response

vars_numeric<-dat_new |> 
  select(-EmployeeCount,-EmployeeNumber,-StandardHours) |> 
  select_if(is.numeric) |> colnames()
km_aov <- vars_numeric %>% 
  map(~ t.test(rlang::eval_tidy(expr(!!sym(.x) ~Attrition)), data = dat_new))
km_aov %>% 
  map(~ data.frame(`test statistic` = .x$`statistic`[[1]], p.value = .x$`p.value`[[1]])) %>%
  bind_rows() %>%
  bind_cols(Attribute = vars_numeric) %>%
  select(Attribute, everything()) %>%
  mutate(`p value` = round(p.value,3)) %>%
  select(-p.value)%>%
  flextable::flextable() %>%
  flextable::autofit() %>%
  flextable::colformat_num(j = 2, digits = 2) %>%
  flextable::colformat_num(j = 3, digits = 4)

Attribute

test.statistic

p value

Age

-5.8280119

0.000

DailyRate

-2.1788817

0.030

DistanceFromHome

2.8881831

0.004

Education

-1.2177494

0.224

HourlyRate

-0.2647686

0.791

JobSatisfaction

-3.9261129

0.000

MonthlyIncome

-7.4826216

0.000

MonthlyRate

0.5755022

0.565

NumCompaniesWorked

1.5746511

0.116

PercentSalaryHike

-0.5042445

0.614

PerformanceRating

0.1099940

0.912

RelationshipSatisfaction

-1.7019371

0.090

TotalWorkingYears

-7.0191785

0.000

TrainingTimesLastYear

-2.3305223

0.020

WorkLifeBalance

-2.1741928

0.030

YearsAtCompany

-5.2825961

0.000

YearsInCurrentRole

-6.8470792

0.000

YearsSinceLastPromotion

-1.2879266

0.199

YearsWithCurrManager

-6.6333988

0.000

comments

  • the p-values that are less than 0.05 suggest that the difference in means is statistically significant
  • the above implies that the difference in means is not a result of mere chance
  • for instance the average job satisfaction for those who left the company and those who did not is expected to be significantly different

0.1.2 visualise the results above

vars_numeric<-dat_new |> 
  select(-EmployeeCount,-EmployeeNumber,-StandardHours) |> 
  select_if(is.numeric) |> 
  colnames()

subset<- dat_new |> 
  select(vars_numeric,Attrition) 

# Bring in external file for visualisations
source('functions/visualisations.R')

# Use plot function
plot <- histoplotter(subset, Attrition, 
                     chart_x_axis_lbl = "attrition status", 
                     chart_y_axis_lbl = 'Measures',
                     boxplot_color = 'navy', 
                     boxplot_fill = '#89CFF0', 
                     box_fill_transparency = 0.2) 

# Add extras to plot
plot + 
  ggthemes::theme_excel() +
  tvthemes::scale_color_attackOnTitan()+
  theme(legend.position = 'top') 

out of interest we might want to look at how Department affects each of the numeric variables through anova

0.1.3 Analysis Of Variance (ANOVA)

options(scipen=999)
vars_numeric<-dat_new |> 
   select(-EmployeeCount,-EmployeeNumber,-StandardHours) |> 
  select_if(is.numeric) |> colnames()
km_aov <- vars_numeric %>% 
  map(~ aov(rlang::eval_tidy(expr(!!sym(.x) ~Department)), data = dat_new))
km_aov %>% 
  map(anova) %>%
  map(~ data.frame(F = .x$`F value`[[1]], p = .x$`Pr(>F)`[[1]])) %>%
  bind_rows() %>%
  bind_cols(Attribute = vars_numeric) %>%
  select(Attribute, everything()) %>%
  mutate(`Pr(>F)` = round(p,2)) %>%
  select(-p) %>% 
  flextable::flextable() %>%
  flextable::autofit() %>%
  flextable::colformat_num(j = 2, digits = 2) %>%
  flextable::colformat_num(j = 3, digits = 4)

Attribute

F

Pr(>F)

Age

0.7655032

0.47

DailyRate

0.5647346

0.57

DistanceFromHome

0.2350266

0.79

Education

0.2830675

0.75

HourlyRate

0.3553375

0.70

JobSatisfaction

0.5021229

0.61

MonthlyIncome

3.2017829

0.04

MonthlyRate

0.5628353

0.57

NumCompaniesWorked

0.9516554

0.39

PercentSalaryHike

0.9243264

0.40

PerformanceRating

0.7940044

0.45

RelationshipSatisfaction

0.9023113

0.41

TotalWorkingYears

0.1824744

0.83

TrainingTimesLastYear

1.4506366

0.23

WorkLifeBalance

4.2131414

0.01

YearsAtCompany

0.7620289

0.47

YearsInCurrentRole

2.4721299

0.08

YearsSinceLastPromotion

1.2231552

0.29

YearsWithCurrManager

0.9569419

0.38

comments

  • values Pr(>F) < 0.05 indicate the difference in means per each department is not an outcome of chance alone
  • the above entails that worklifebalance and monthly income is significantly different in each department

0.2

0.2.1 freguency by status of attrition

res<-dbGetQuery(con,'SELECT Attrition ,
                     COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
                     FROM dat_new
                     GROUP BY Attrition;')

res

0.3 Visualise the results

res |> 
  ggplot(aes(x=fct_reorder(Attrition, PERC), PERC)) +
  geom_col() + 
  scale_fill_avatar()+
  theme_avatar()+
  labs(x="STATUS", 
       y="count", 
       title="") + 
  coord_flip() + 
  geom_text(aes(label=paste0(round(PERC), "% "), hjust=1), col="white")

  • above we see that about 16% of employees churned

0.3.1 freguency by status of attrition Department

res<-dbGetQuery(con,'SELECT Attrition, 
                            Department, 
                            COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
                     FROM dat_new
                     GROUP BY Attrition,Department;')

res

0.3.2 visualise the outcome

res %>% 
  ggplot(aes(Department, Attrition, fill = PERC)) +
  geom_tile(color = "white") +
  geom_text(aes(label = number(PERC, big.mark = ","))) +
  scale_fill_binned(low = "blue", high = "lightyellow",
                    label = number_format(big.mark = ",")) +
  theme(legend.position = "top",
        legend.key.width = unit(15, "mm")) +
  labs(x = "Department", 
       y = "Attrition", 
       fill = "Frequency")

0.3.3 Mean daily rate per each category

res<-dbGetQuery(con,'SELECT Attrition , 
                           AVG(DailyRate) as Daily_average,
                           AVG(MonthlyRate) as Monthly_average,
                           AVG(HourlyRate) as hourly_average,
                           AVG(YearsAtCompany) as average_number_of_years,
                           AVG(RelationshipSatisfaction) as average_satisfaction
                     FROM dat_new
                     GROUP BY Attrition;')

res
  • average daily rate of employees who churned is lower than that of those who did not

0.3.4 summary statistics by attrition status and Department

res<-dbGetQuery(con,'SELECT Attrition,Department , AVG(TotalWorkingYears) as average_workhours ,
                                                   AVG(DistanceFromHome) as average_distance ,
                                                   AVG(YearsAtCompany) as average_years
                     FROM dat_new
                     GROUP BY Attrition,Department ;')

res
  • generally , people who stay a bit far from the company are more likely to leave maybe because they would want to find employement near their place of residence
  • strangely , people with less hours of work on average tend to leave.
  • generally employees who are likely to leave are those that have less number of years at the company

0.3.5 Average monthly salary in each jobRole grouped by attrition status

res<-dbGetQuery(con,'SELECT Attrition,JobRole, AVG(MonthlyRate) as average_rate
                     FROM dat_new
                     GROUP BY Attrition,JobRole ;')

res

0.4

0.4.1 Daily rate and Attrition

dat_new %>% 
  ggplot(aes(x = DailyRate, fill =Attrition)) +
  geom_histogram(color = "white", alpha = 0.75,bins=30) +
  theme(legend.position = "top") +
  scale_x_continuous(labels = number_format(big.mark = ",")) +
  scale_y_continuous(labels = number_format(big.mark = ",")) +
  ggthemes::scale_fill_tableau()+
  labs(x = "Daily Rate", y = "Frequency",
       fill = "Attrition",
       title = "Daily Rate Distribution")

0.4.2 JobRole and Attrition

dat_new %>% 
  count(Attrition, JobRole) %>% 
  ggplot(aes(JobRole, Attrition, fill = n)) +
  geom_tile(color = "white") +
  geom_text(aes(label = number(n, big.mark = ","))) +
  scale_fill_binned(low = "blue", high = "lightyellow",
                    label = number_format(big.mark = ",")) +
  theme(legend.position = "top",
        legend.key.width = unit(15, "mm")) +
  labs(x = "JobRole", 
       y = "Attrition", 
       fill = "Frequency")+
  theme(axis.text.x = element_text(angle = 30, hjust = 1,color="blue"))

0.4.3 visualise relationships

Using binary correlation, I’ll include just the variables with a correlation coefficient of at least 0.10. For our employee attrition data set, OverTime (Y|N) has the largest correlation, JobLevel = 1, MonthlyIncome <= 2,695.80, etc.

library(plotly)
dat_new %>%
  select(-EmployeeNumber) %>%
  correlationfunnel::binarize(n_bins = 5, thresh_infreq = 0.01) %>%
  correlationfunnel::correlate(Attrition__Yes) %>%
  correlationfunnel::plot_correlation_funnel(interactive = FALSE) %>%
  ggplotly()  # Makes prettier, but drops the labels

0.5

library(rpart)
library(rattle)
library(tidymodels)

0.5.1 Modeling the data

  • Extract the training data set and store it in train.
  • Extract the testing data set and store it in test
attrition_df<-dat_new |> select(-EmployeeCount,-EmployeeNumber,-StandardHours)
# Initialize the split
split <- initial_split(attrition_df, prop = 0.8, strata = "Attrition")

# Extract training set
train <- split %>% training()

# Extract testing set
test <- split %>% testing()

0.5.2 Create a recipe-model workflow

  • Next up I Define a recipe using the train data with a :
  • step_filter_missing(),
  • step_scale(), and
  • step_nzv() to remove NAs, scale the numeric features, and remove low-variance features, respectively.
  • Use a threshold of 0.5 for step_filter_missing().
  • Define a logistic regression model using the “glm” engine.
  • Add feature_selection_recipe and lr_model to a workflow named attrition_wflow.
# Create recipe
feature_selection_recipe <- 
  recipe(Attrition ~ ., data = train) %>% 
  step_filter_missing(all_predictors(), threshold = 0.5) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  prep()

0.5.3 Create model

lr_model <- logistic_reg() %>% 
  set_engine("glm")

0.5.4 add a workflow

attrition_wflow <- workflow() %>% 
  add_recipe(feature_selection_recipe) %>% 
  add_model(lr_model)

0.5.5 Fit attrition_wflow using the training data.

  • Add the test predictions to the test data with the original Attrition values.
  • Use f_meas() to evaluate the model’s performance on the test data.
  • will use accuracy() to determine accuracy of the model
# Fit workflow to train data
attrition_fit <- 
  attrition_wflow %>% fit(data = train)

# Add the test predictions to the test data
attrition_pred_df <-predict(attrition_fit, test) %>% 
  bind_cols(test %>% select(Attrition))

# Evaluate F score
f_meas(attrition_pred_df,Attrition, .pred_class)
accuracy(attrition_pred_df, Attrition, .pred_class)

0.6

0.6.1 Decision Tree

# Create modeld
dt_model <- decision_tree() %>% 
  set_engine("rpart")%>% 
  set_mode("classification")

# Add recipe and model to a workflow
attrition_wflow <- workflow() %>% 
  add_recipe(feature_selection_recipe) %>% 
  add_model(dt_model)

0.6.2 Model evaluation

# Fit workflow to train data
attrition_fit <- 
  attrition_wflow %>% fit(data = train)

# Add the test predictions to the test data
attrition_pred_df <-predict(attrition_fit, test) %>% 
  bind_cols(test %>% select(Attrition))

# Evaluate F score
f_meas(attrition_pred_df,Attrition, .pred_class)
accuracy(attrition_pred_df, Attrition, .pred_class)