HR Analytics Using SQL and R

All models are wrong, but some are useful. - George Box

0.0.1 Setup

library(tidyverse)
library(odbc)
library(DBI)
library(RSQLite)
library(tvthemes)
library(ggthemes)
library(scales)


dat_new <- read_csv("./input/WA_Fn-UseC_-HR-Employee-Attrition.csv")
dat_new <- dat_new %>%
  mutate_if(is.character, as_factor) %>%
  mutate(
    EnvironmentSatisfaction = factor(EnvironmentSatisfaction, ordered = TRUE),
    StockOptionLevel = factor(StockOptionLevel, ordered = TRUE),
    JobLevel = factor(JobLevel, ordered = TRUE),
    JobInvolvement = factor(JobInvolvement, ordered = TRUE)
  ) %>%
  select(EmployeeNumber, Attrition, everything())
my_skim <- skimr::skim_with(numeric = skimr::sfl(p25 = NULL, p50 = NULL, p75 = NULL, hist = NULL))
my_skim(dat_new)

Data summary
Name	dat_new
Number of rows	1470
Number of columns	35
_______________________
Column type frequency:
factor	13
numeric	22
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
Attrition	1	FALSE	2	No: 1233, Yes: 237
BusinessTravel	1	FALSE	3	Tra: 1043, Tra: 277, Non: 150
Department	1	FALSE	3	Res: 961, Sal: 446, Hum: 63
EducationField	1	FALSE	6	Lif: 606, Med: 464, Mar: 159, Tec: 132
EnvironmentSatisfaction	1	TRUE	4	3: 453, 4: 446, 2: 287, 1: 284
Gender	1	FALSE	2	Mal: 882, Fem: 588
JobInvolvement	1	TRUE	4	3: 868, 2: 375, 4: 144, 1: 83
JobLevel	1	TRUE	5	1: 543, 2: 534, 3: 218, 4: 106
JobRole	1	FALSE	9	Sal: 326, Res: 292, Lab: 259, Man: 145
MaritalStatus	1	FALSE	3	Mar: 673, Sin: 470, Div: 327
Over18	1	FALSE	1	Y: 1470
OverTime	1	FALSE	2	No: 1054, Yes: 416
StockOptionLevel	1	TRUE	4	0: 631, 1: 596, 2: 158, 3: 85

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p100
EmployeeNumber	1	1024.87	602.02	1	2068
Age	1	36.92	9.14	18	60
DailyRate	1	802.49	403.51	102	1499
DistanceFromHome	1	9.19	8.11	1	29
Education	1	2.91	1.02	1	5
EmployeeCount	1	1.00	0.00	1	1
HourlyRate	1	65.89	20.33	30	100
JobSatisfaction	1	2.73	1.10	1	4
MonthlyIncome	1	6502.93	4707.96	1009	19999
MonthlyRate	1	14313.10	7117.79	2094	26999
NumCompaniesWorked	1	2.69	2.50	0	9
PercentSalaryHike	1	15.21	3.66	11	25
PerformanceRating	1	3.15	0.36	3	4
RelationshipSatisfaction	1	2.71	1.08	1	4
StandardHours	1	80.00	0.00	80	80
TotalWorkingYears	1	11.28	7.78	0	40
TrainingTimesLastYear	1	2.80	1.29	0	6
WorkLifeBalance	1	2.76	0.71	1	4
YearsAtCompany	1	7.01	6.13	0	40
YearsInCurrentRole	1	4.23	3.62	0	18
YearsSinceLastPromotion	1	2.19	3.22	0	15
YearsWithCurrManager	1	4.12	3.57	0	17

0.0.2 SETUP SQL Environment

con<-dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con,dat_new)

0.0.3 look at the first few rows of the data

to achieve this LIMIT n does the trick

dbGetQuery(con,'SELECT * 
                FROM dat_new 
                LIMIT 5')

0.0.4 How many employees were recorded

dbGetQuery(con,'SELECT COUNT(*) AS num_rows 
                FROM dat_new')

0.0.5 see if we have any missing employee numbers

dbGetQuery(con,'SELECT COUNT(*)-COUNT(EmployeeNumber) AS missing 
                FROM dat_new')

0.0.6 What was the highest ,lowest and average monthly salary at this company

dbGetQuery(con,"SELECT Attrition,
                    MAX(MonthlyIncome) AS max_income , 
                    MIN(MonthlyIncome) AS min_income , 
                    AVG(MonthlyIncome) AS avg_income
                FROM dat_new
                GROUP BY Attrition;")

0.0.7 visualise this distribution

dat_new |> 
  ggplot(aes(x=Attrition,y=MonthlyIncome ,fill=Attrition))+
  geom_boxplot(outlier.color = "blue")+
  tvthemes::scale_fill_avatar()

monthly salary for non-leavers is more spread out as compared to other category

0.0.8 test whether the difference in mean salary is statistically significant

t.test(MonthlyIncome~Attrition,data=dat_new,alternative="two.sided")

## 
##  Welch Two Sample t-test
## 
## data:  MonthlyIncome by Attrition
## t = -7.4826, df = 412.74, p-value = 4.434e-13
## alternative hypothesis: true difference in means between group Yes and group No is not equal to 0
## 95 percent confidence interval:
##  -2583.050 -1508.244
## sample estimates:
## mean in group Yes  mean in group No 
##          4787.093          6832.740

Comments

from the output of the test , t=-7.4826 represents the calculated difference in sample means in units of standard error and the negative sign suggest the first mean is smaller than the second
the p-value is less than 0.05 suggesting that the difference in means is statistically significant
the confidence interval also does not contain zero therefore providing further evidence that the difference in mean monthly income for each group is statistically significant

0.1

0.1.1 T.test for numeric variables by response

vars_numeric<-dat_new |> 
  select(-EmployeeCount,-EmployeeNumber,-StandardHours) |> 
  select_if(is.numeric) |> colnames()
km_aov <- vars_numeric %>% 
  map(~ t.test(rlang::eval_tidy(expr(!!sym(.x) ~Attrition)), data = dat_new))
km_aov %>% 
  map(~ data.frame(`test statistic` = .x$`statistic`[[1]], p.value = .x$`p.value`[[1]])) %>%
  bind_rows() %>%
  bind_cols(Attribute = vars_numeric) %>%
  select(Attribute, everything()) %>%
  mutate(`p value` = round(p.value,3)) %>%
  select(-p.value)%>%
  flextable::flextable() %>%
  flextable::autofit() %>%
  flextable::colformat_num(j = 2, digits = 2) %>%
  flextable::colformat_num(j = 3, digits = 4)

Attribute	test.statistic	p value
Age	-5.8280119	0.000
DailyRate	-2.1788817	0.030
DistanceFromHome	2.8881831	0.004
Education	-1.2177494	0.224
HourlyRate	-0.2647686	0.791
JobSatisfaction	-3.9261129	0.000
MonthlyIncome	-7.4826216	0.000
MonthlyRate	0.5755022	0.565
NumCompaniesWorked	1.5746511	0.116
PercentSalaryHike	-0.5042445	0.614
PerformanceRating	0.1099940	0.912
RelationshipSatisfaction	-1.7019371	0.090
TotalWorkingYears	-7.0191785	0.000
TrainingTimesLastYear	-2.3305223	0.020
WorkLifeBalance	-2.1741928	0.030
YearsAtCompany	-5.2825961	0.000
YearsInCurrentRole	-6.8470792	0.000
YearsSinceLastPromotion	-1.2879266	0.199
YearsWithCurrManager	-6.6333988	0.000

comments

the p-values that are less than 0.05 suggest that the difference in means is statistically significant
the above implies that the difference in means is not a result of mere chance
for instance the average job satisfaction for those who left the company and those who did not is expected to be significantly different

0.1.2 visualise the results above

vars_numeric<-dat_new |> 
  select(-EmployeeCount,-EmployeeNumber,-StandardHours) |> 
  select_if(is.numeric) |> 
  colnames()

subset<- dat_new |> 
  select(vars_numeric,Attrition) 

# Bring in external file for visualisations
source('functions/visualisations.R')

# Use plot function
plot <- histoplotter(subset, Attrition, 
                     chart_x_axis_lbl = "attrition status", 
                     chart_y_axis_lbl = 'Measures',
                     boxplot_color = 'navy', 
                     boxplot_fill = '#89CFF0', 
                     box_fill_transparency = 0.2) 

# Add extras to plot
plot + 
  ggthemes::theme_excel() +
  tvthemes::scale_color_attackOnTitan()+
  theme(legend.position = 'top')

out of interest we might want to look at how Department affects each of the numeric variables through anova

0.1.3 Analysis Of Variance (ANOVA)

options(scipen=999)
vars_numeric<-dat_new |> 
   select(-EmployeeCount,-EmployeeNumber,-StandardHours) |> 
  select_if(is.numeric) |> colnames()
km_aov <- vars_numeric %>% 
  map(~ aov(rlang::eval_tidy(expr(!!sym(.x) ~Department)), data = dat_new))
km_aov %>% 
  map(anova) %>%
  map(~ data.frame(F = .x$`F value`[[1]], p = .x$`Pr(>F)`[[1]])) %>%
  bind_rows() %>%
  bind_cols(Attribute = vars_numeric) %>%
  select(Attribute, everything()) %>%
  mutate(`Pr(>F)` = round(p,2)) %>%
  select(-p) %>% 
  flextable::flextable() %>%
  flextable::autofit() %>%
  flextable::colformat_num(j = 2, digits = 2) %>%
  flextable::colformat_num(j = 3, digits = 4)

Attribute	F	Pr(>F)
Age	0.7655032	0.47
DailyRate	0.5647346	0.57
DistanceFromHome	0.2350266	0.79
Education	0.2830675	0.75
HourlyRate	0.3553375	0.70
JobSatisfaction	0.5021229	0.61
MonthlyIncome	3.2017829	0.04
MonthlyRate	0.5628353	0.57
NumCompaniesWorked	0.9516554	0.39
PercentSalaryHike	0.9243264	0.40
PerformanceRating	0.7940044	0.45
RelationshipSatisfaction	0.9023113	0.41
TotalWorkingYears	0.1824744	0.83
TrainingTimesLastYear	1.4506366	0.23
WorkLifeBalance	4.2131414	0.01
YearsAtCompany	0.7620289	0.47
YearsInCurrentRole	2.4721299	0.08
YearsSinceLastPromotion	1.2231552	0.29
YearsWithCurrManager	0.9569419	0.38

comments

values Pr(>F) < 0.05 indicate the difference in means per each department is not an outcome of chance alone
the above entails that worklifebalance and monthly income is significantly different in each department

0.2

0.2.1 freguency by status of attrition

res<-dbGetQuery(con,'SELECT Attrition ,
                     COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
                     FROM dat_new
                     GROUP BY Attrition;')

res

0.3 Visualise the results

res |> 
  ggplot(aes(x=fct_reorder(Attrition, PERC), PERC)) +
  geom_col() + 
  scale_fill_avatar()+
  theme_avatar()+
  labs(x="STATUS", 
       y="count", 
       title="") + 
  coord_flip() + 
  geom_text(aes(label=paste0(round(PERC), "% "), hjust=1), col="white")

above we see that about 16% of employees churned

0.3.1 freguency by status of attrition Department

res<-dbGetQuery(con,'SELECT Attrition, 
                            Department, 
                            COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
                     FROM dat_new
                     GROUP BY Attrition,Department;')

res

0.3.2 visualise the outcome

res %>% 
  ggplot(aes(Department, Attrition, fill = PERC)) +
  geom_tile(color = "white") +
  geom_text(aes(label = number(PERC, big.mark = ","))) +
  scale_fill_binned(low = "blue", high = "lightyellow",
                    label = number_format(big.mark = ",")) +
  theme(legend.position = "top",
        legend.key.width = unit(15, "mm")) +
  labs(x = "Department", 
       y = "Attrition", 
       fill = "Frequency")

0.3.3 Mean daily rate per each category

res<-dbGetQuery(con,'SELECT Attrition , 
                           AVG(DailyRate) as Daily_average,
                           AVG(MonthlyRate) as Monthly_average,
                           AVG(HourlyRate) as hourly_average,
                           AVG(YearsAtCompany) as average_number_of_years,
                           AVG(RelationshipSatisfaction) as average_satisfaction
                     FROM dat_new
                     GROUP BY Attrition;')

res

average daily rate of employees who churned is lower than that of those who did not

0.3.4 summary statistics by attrition status and Department

res<-dbGetQuery(con,'SELECT Attrition,Department , AVG(TotalWorkingYears) as average_workhours ,
                                                   AVG(DistanceFromHome) as average_distance ,
                                                   AVG(YearsAtCompany) as average_years
                     FROM dat_new
                     GROUP BY Attrition,Department ;')

res

generally , people who stay a bit far from the company are more likely to leave maybe because they would want to find employement near their place of residence
strangely , people with less hours of work on average tend to leave.
generally employees who are likely to leave are those that have less number of years at the company

0.3.5 Average monthly salary in each jobRole grouped by attrition status

res<-dbGetQuery(con,'SELECT Attrition,JobRole, AVG(MonthlyRate) as average_rate
                     FROM dat_new
                     GROUP BY Attrition,JobRole ;')

res

0.4

0.4.1 Daily rate and Attrition

dat_new %>% 
  ggplot(aes(x = DailyRate, fill =Attrition)) +
  geom_histogram(color = "white", alpha = 0.75,bins=30) +
  theme(legend.position = "top") +
  scale_x_continuous(labels = number_format(big.mark = ",")) +
  scale_y_continuous(labels = number_format(big.mark = ",")) +
  ggthemes::scale_fill_tableau()+
  labs(x = "Daily Rate", y = "Frequency",
       fill = "Attrition",
       title = "Daily Rate Distribution")

0.4.2 JobRole and Attrition

dat_new %>% 
  count(Attrition, JobRole) %>% 
  ggplot(aes(JobRole, Attrition, fill = n)) +
  geom_tile(color = "white") +
  geom_text(aes(label = number(n, big.mark = ","))) +
  scale_fill_binned(low = "blue", high = "lightyellow",
                    label = number_format(big.mark = ",")) +
  theme(legend.position = "top",
        legend.key.width = unit(15, "mm")) +
  labs(x = "JobRole", 
       y = "Attrition", 
       fill = "Frequency")+
  theme(axis.text.x = element_text(angle = 30, hjust = 1,color="blue"))

0.4.3 visualise relationships

Using binary correlation, I’ll include just the variables with a correlation coefficient of at least 0.10. For our employee attrition data set, OverTime (Y|N) has the largest correlation, JobLevel = 1, MonthlyIncome <= 2,695.80, etc.

library(plotly)
dat_new %>%
  select(-EmployeeNumber) %>%
  correlationfunnel::binarize(n_bins = 5, thresh_infreq = 0.01) %>%
  correlationfunnel::correlate(Attrition__Yes) %>%
  correlationfunnel::plot_correlation_funnel(interactive = FALSE) %>%
  ggplotly()  # Makes prettier, but drops the labels

0.5

library(rpart)
library(rattle)
library(tidymodels)

0.5.1 Modeling the data

Extract the training data set and store it in train.
Extract the testing data set and store it in test

attrition_df<-dat_new |> select(-EmployeeCount,-EmployeeNumber,-StandardHours)
# Initialize the split
split <- initial_split(attrition_df, prop = 0.8, strata = "Attrition")

# Extract training set
train <- split %>% training()

# Extract testing set
test <- split %>% testing()

0.5.2 Create a recipe-model workflow

Next up I Define a recipe using the train data with a :
step_filter_missing(),
step_scale(), and
step_nzv() to remove NAs, scale the numeric features, and remove low-variance features, respectively.
Use a threshold of 0.5 for step_filter_missing().
Define a logistic regression model using the “glm” engine.
Add feature_selection_recipe and lr_model to a workflow named attrition_wflow.

# Create recipe
feature_selection_recipe <- 
  recipe(Attrition ~ ., data = train) %>% 
  step_filter_missing(all_predictors(), threshold = 0.5) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  prep()

0.5.3 Create model

lr_model <- logistic_reg() %>% 
  set_engine("glm")

0.5.4 add a workflow

attrition_wflow <- workflow() %>% 
  add_recipe(feature_selection_recipe) %>% 
  add_model(lr_model)

0.5.5 Fit attrition_wflow using the training data.

Add the test predictions to the test data with the original Attrition values.
Use f_meas() to evaluate the model’s performance on the test data.
will use accuracy() to determine accuracy of the model

# Fit workflow to train data
attrition_fit <- 
  attrition_wflow %>% fit(data = train)

# Add the test predictions to the test data
attrition_pred_df <-predict(attrition_fit, test) %>% 
  bind_cols(test %>% select(Attrition))

# Evaluate F score
f_meas(attrition_pred_df,Attrition, .pred_class)

accuracy(attrition_pred_df, Attrition, .pred_class)

0.6

0.6.1 Decision Tree

# Create modeld
dt_model <- decision_tree() %>% 
  set_engine("rpart")%>% 
  set_mode("classification")

# Add recipe and model to a workflow
attrition_wflow <- workflow() %>% 
  add_recipe(feature_selection_recipe) %>% 
  add_model(dt_model)

0.6.2 Model evaluation

# Fit workflow to train data
attrition_fit <- 
  attrition_wflow %>% fit(data = train)

# Add the test predictions to the test data
attrition_pred_df <-predict(attrition_fit, test) %>% 
  bind_cols(test %>% select(Attrition))

# Evaluate F score
f_meas(attrition_pred_df,Attrition, .pred_class)

accuracy(attrition_pred_df, Attrition, .pred_class)