HR Analytics Using SQL and R

library(tidyverse)
library(odbc)
library(DBI)
library(RSQLite)
library(tvthemes)
library(ggthemes)
library(scales)


dat_new <- read_csv("./input/WA_Fn-UseC_-HR-Employee-Attrition.csv")
dat_new <- dat_new %>%
  mutate_if(is.character, as_factor) %>%
  mutate(
    EnvironmentSatisfaction = factor(EnvironmentSatisfaction, ordered = TRUE),
    StockOptionLevel = factor(StockOptionLevel, ordered = TRUE),
    JobLevel = factor(JobLevel, ordered = TRUE),
    JobInvolvement = factor(JobInvolvement, ordered = TRUE)
  ) %>%
  select(EmployeeNumber, Attrition, everything())
my_skim <- skimr::skim_with(numeric = skimr::sfl(p25 = NULL, p50 = NULL, p75 = NULL, hist = NULL))
my_skim(dat_new)

Data summary
Name	dat_new
Number of rows	1470
Number of columns	35
_______________________
Column type frequency:
factor	13
numeric	22
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
Attrition	1	FALSE	2	No: 1233, Yes: 237
BusinessTravel	1	FALSE	3	Tra: 1043, Tra: 277, Non: 150
Department	1	FALSE	3	Res: 961, Sal: 446, Hum: 63
EducationField	1	FALSE	6	Lif: 606, Med: 464, Mar: 159, Tec: 132
EnvironmentSatisfaction	1	TRUE	4	3: 453, 4: 446, 2: 287, 1: 284
Gender	1	FALSE	2	Mal: 882, Fem: 588
JobInvolvement	1	TRUE	4	3: 868, 2: 375, 4: 144, 1: 83
JobLevel	1	TRUE	5	1: 543, 2: 534, 3: 218, 4: 106
JobRole	1	FALSE	9	Sal: 326, Res: 292, Lab: 259, Man: 145
MaritalStatus	1	FALSE	3	Mar: 673, Sin: 470, Div: 327
Over18	1	FALSE	1	Y: 1470
OverTime	1	FALSE	2	No: 1054, Yes: 416
StockOptionLevel	1	TRUE	4	0: 631, 1: 596, 2: 158, 3: 85

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p100
EmployeeNumber	1	1024.87	602.02	1	2068
Age	1	36.92	9.14	18	60
DailyRate	1	802.49	403.51	102	1499
DistanceFromHome	1	9.19	8.11	1	29
Education	1	2.91	1.02	1	5
EmployeeCount	1	1.00	0.00	1	1
HourlyRate	1	65.89	20.33	30	100
JobSatisfaction	1	2.73	1.10	1	4
MonthlyIncome	1	6502.93	4707.96	1009	19999
MonthlyRate	1	14313.10	7117.79	2094	26999
NumCompaniesWorked	1	2.69	2.50	0	9
PercentSalaryHike	1	15.21	3.66	11	25
PerformanceRating	1	3.15	0.36	3	4
RelationshipSatisfaction	1	2.71	1.08	1	4
StandardHours	1	80.00	0.00	80	80
TotalWorkingYears	1	11.28	7.78	0	40
TrainingTimesLastYear	1	2.80	1.29	0	6
WorkLifeBalance	1	2.76	0.71	1	4
YearsAtCompany	1	7.01	6.13	0	40
YearsInCurrentRole	1	4.23	3.62	0	18
YearsSinceLastPromotion	1	2.19	3.22	0	15
YearsWithCurrManager	1	4.12	3.57	0	17

0.0.1 SETUP SQL Environment

con<-dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con,dat_new)

0.0.2 look at the first few rows of the data

to achieve this LIMIT n does the trick

dbGetQuery(con,'SELECT * 
                FROM dat_new 
                LIMIT 5')

0.0.3 How many employees were recorded

dbGetQuery(con,'SELECT COUNT(*) AS num_rows 
                FROM dat_new')

0.0.4 see if we have any missing employee numbers

dbGetQuery(con,'SELECT COUNT(*)-COUNT(EmployeeNumber) AS missing 
                FROM dat_new')

0.0.5 What was the highest ,lowest and average monthly salary at this company

dbGetQuery(con,"SELECT Attrition,
                    MAX(MonthlyIncome) AS max_income , 
                    MIN(MonthlyIncome) AS min_income , 
                    AVG(MonthlyIncome) AS avg_income
                FROM dat_new
                GROUP BY Attrition;")

0.0.6 visualise this distribution

dat_new |> 
  ggplot(aes(x=Attrition,y=MonthlyIncome ,fill=Attrition))+
  geom_boxplot(outlier.color = "blue")

monthly salary for non-leavers is more spread out as compared to other category

0.0.7 test whether the difference in mean salary is statistically significant

t.test(MonthlyIncome~Attrition,data=dat_new,alternative="two.sided")

## 
##  Welch Two Sample t-test
## 
## data:  MonthlyIncome by Attrition
## t = -7.4826, df = 412.74, p-value = 4.434e-13
## alternative hypothesis: true difference in means between group Yes and group No is not equal to 0
## 95 percent confidence interval:
##  -2583.050 -1508.244
## sample estimates:
## mean in group Yes  mean in group No 
##          4787.093          6832.740

Comments

from the output of the test , t=-7.4826 represents the calculated difference in sample means in units of standard error and the negative sign suggest the first mean is smaller than the second
the p-value is less than 0.05 suggesting that the difference in means is statistically significant
the confidence interval also does not contain zero therefore providing further evidence that the difference in mean monthly income for each group is statistically significant

0.0.8 freguency by status of attrition

res<-dbGetQuery(con,'SELECT Attrition ,
                     COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
                     FROM dat_new
                     GROUP BY Attrition;')

res

0.1 Visualise the results

res |> 
  ggplot(aes(x=fct_reorder(Attrition, PERC), PERC)) +
  geom_col() + 
  scale_fill_avatar()+
  theme_avatar()+
  labs(x="STATUS", 
       y="count", 
       title="") + 
  coord_flip() + 
  geom_text(aes(label=paste0(round(PERC), "% "), hjust=1), col="white")

above we see that about 16% of employees churned

0.1.1 freguency by status of attrition Department

res<-dbGetQuery(con,'SELECT Attrition, 
                            Department, 
                            COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
                     FROM dat_new
                     GROUP BY Attrition,Department;')

res

0.1.2 visualise the outcome

res %>% 
  ggplot(aes(Department, Attrition, fill = PERC)) +
  geom_tile(color = "white") +
  geom_text(aes(label = number(PERC, big.mark = ","))) +
  scale_fill_binned(low = "blue", high = "lightyellow",
                    label = number_format(big.mark = ",")) +
  theme(legend.position = "top",
        legend.key.width = unit(15, "mm")) +
  labs(x = "Department", 
       y = "Attrition", 
       fill = "Frequency")

0.1.3 Mean daily rate per each category

res<-dbGetQuery(con,'SELECT Attrition , 
                           AVG(DailyRate) as Daily_average,
                           AVG(MonthlyRate) as Monthly_average,
                           AVG(HourlyRate) as hourly_average,
                           AVG(YearsAtCompany) as average_number_of_years,
                           AVG(RelationshipSatisfaction) as average_satisfaction
                     FROM dat_new
                     GROUP BY Attrition;')

res

average daily rate of employees who churned is lower than that of those who did not

0.1.4 summary statistics by attrition status and Department

res<-dbGetQuery(con,'SELECT Attrition,Department , AVG(TotalWorkingYears) as average_workhours ,
                                                   AVG(DistanceFromHome) as average_distance ,
                                                   AVG(YearsAtCompany) as average_years
                     FROM dat_new
                     GROUP BY Attrition,Department ;')

res

generally , people who stay a bit far from the company are more likely to leave maybe because they would want to find employement near their place of residence
strangely , people with less hours of work on average tend to leave.
generally employees who are likely to leave are those that have less number of years at the company

0.1.5 Average monthly salary in each jobRole grouped by attrition status

res<-dbGetQuery(con,'SELECT Attrition,JobRole, AVG(MonthlyRate) as average_rate
                     FROM dat_new
                     GROUP BY Attrition,JobRole ;')

res

0.2 data distribution of some variables

0.2.1 Daily rate and Attrition

dat_new %>% 
  ggplot(aes(x = DailyRate, fill =Attrition)) +
  geom_histogram(color = "white", alpha = 0.75,bins=30) +
  theme(legend.position = "top") +
  scale_x_continuous(labels = number_format(big.mark = ",")) +
  scale_y_continuous(labels = number_format(big.mark = ",")) +
  ggthemes::scale_fill_tableau()+
  labs(x = "Daily Rate", y = "Frequency",
       fill = "Attrition",
       title = "Daily Rate Distribution")

0.3 JobRole and Attrition

dat_new %>% 
  count(Attrition, JobRole) %>% 
  ggplot(aes(JobRole, Attrition, fill = n)) +
  geom_tile(color = "white") +
  geom_text(aes(label = number(n, big.mark = ","))) +
  scale_fill_binned(low = "blue", high = "lightyellow",
                    label = number_format(big.mark = ",")) +
  theme(legend.position = "top",
        legend.key.width = unit(15, "mm")) +
  labs(x = "JobRole", 
       y = "Attrition", 
       fill = "Frequency")+
  theme(axis.text.x = element_text(angle = 30, hjust = 1,color="blue"))

0.3.1 visualise relationships

Using binary correlation, I’ll include just the variables with a correlation coefficient of at least 0.10. For our employee attrition data set, OverTime (Y|N) has the largest correlation, JobLevel = 1, MonthlyIncome <= 2,695.80, etc.

library(plotly)
dat_new %>%
  select(-EmployeeNumber) %>%
  correlationfunnel::binarize(n_bins = 5, thresh_infreq = 0.01) %>%
  correlationfunnel::correlate(Attrition__Yes) %>%
  correlationfunnel::plot_correlation_funnel(interactive = FALSE) %>%
  ggplotly()  # Makes prettier, but drops the labels