library(tidyverse)
library(odbc)
library(DBI)
library(RSQLite)
library(tvthemes)
library(ggthemes)
library(scales)


dat_new <- read_csv("./input/WA_Fn-UseC_-HR-Employee-Attrition.csv")
dat_new <- dat_new %>%
  mutate_if(is.character, as_factor) %>%
  mutate(
    EnvironmentSatisfaction = factor(EnvironmentSatisfaction, ordered = TRUE),
    StockOptionLevel = factor(StockOptionLevel, ordered = TRUE),
    JobLevel = factor(JobLevel, ordered = TRUE),
    JobInvolvement = factor(JobInvolvement, ordered = TRUE)
  ) %>%
  select(EmployeeNumber, Attrition, everything())
my_skim <- skimr::skim_with(numeric = skimr::sfl(p25 = NULL, p50 = NULL, p75 = NULL, hist = NULL))
my_skim(dat_new)
Data summary
Name dat_new
Number of rows 1470
Number of columns 35
_______________________
Column type frequency:
factor 13
numeric 22
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Attrition 0 1 FALSE 2 No: 1233, Yes: 237
BusinessTravel 0 1 FALSE 3 Tra: 1043, Tra: 277, Non: 150
Department 0 1 FALSE 3 Res: 961, Sal: 446, Hum: 63
EducationField 0 1 FALSE 6 Lif: 606, Med: 464, Mar: 159, Tec: 132
EnvironmentSatisfaction 0 1 TRUE 4 3: 453, 4: 446, 2: 287, 1: 284
Gender 0 1 FALSE 2 Mal: 882, Fem: 588
JobInvolvement 0 1 TRUE 4 3: 868, 2: 375, 4: 144, 1: 83
JobLevel 0 1 TRUE 5 1: 543, 2: 534, 3: 218, 4: 106
JobRole 0 1 FALSE 9 Sal: 326, Res: 292, Lab: 259, Man: 145
MaritalStatus 0 1 FALSE 3 Mar: 673, Sin: 470, Div: 327
Over18 0 1 FALSE 1 Y: 1470
OverTime 0 1 FALSE 2 No: 1054, Yes: 416
StockOptionLevel 0 1 TRUE 4 0: 631, 1: 596, 2: 158, 3: 85

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p100
EmployeeNumber 0 1 1024.87 602.02 1 2068
Age 0 1 36.92 9.14 18 60
DailyRate 0 1 802.49 403.51 102 1499
DistanceFromHome 0 1 9.19 8.11 1 29
Education 0 1 2.91 1.02 1 5
EmployeeCount 0 1 1.00 0.00 1 1
HourlyRate 0 1 65.89 20.33 30 100
JobSatisfaction 0 1 2.73 1.10 1 4
MonthlyIncome 0 1 6502.93 4707.96 1009 19999
MonthlyRate 0 1 14313.10 7117.79 2094 26999
NumCompaniesWorked 0 1 2.69 2.50 0 9
PercentSalaryHike 0 1 15.21 3.66 11 25
PerformanceRating 0 1 3.15 0.36 3 4
RelationshipSatisfaction 0 1 2.71 1.08 1 4
StandardHours 0 1 80.00 0.00 80 80
TotalWorkingYears 0 1 11.28 7.78 0 40
TrainingTimesLastYear 0 1 2.80 1.29 0 6
WorkLifeBalance 0 1 2.76 0.71 1 4
YearsAtCompany 0 1 7.01 6.13 0 40
YearsInCurrentRole 0 1 4.23 3.62 0 18
YearsSinceLastPromotion 0 1 2.19 3.22 0 15
YearsWithCurrManager 0 1 4.12 3.57 0 17

0.0.1 SETUP SQL Environment

con<-dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con,dat_new)

0.0.2 look at the first few rows of the data

  • to achieve this LIMIT n does the trick
dbGetQuery(con,'SELECT * 
                FROM dat_new 
                LIMIT 5')

0.0.3 How many employees were recorded

dbGetQuery(con,'SELECT COUNT(*) AS num_rows 
                FROM dat_new')

0.0.4 see if we have any missing employee numbers

dbGetQuery(con,'SELECT COUNT(*)-COUNT(EmployeeNumber) AS missing 
                FROM dat_new')

0.0.5 What was the highest ,lowest and average monthly salary at this company

dbGetQuery(con,"SELECT Attrition,
                    MAX(MonthlyIncome) AS max_income , 
                    MIN(MonthlyIncome) AS min_income , 
                    AVG(MonthlyIncome) AS avg_income
                FROM dat_new
                GROUP BY Attrition;")

0.0.6 visualise this distribution

dat_new |> 
  ggplot(aes(x=Attrition,y=MonthlyIncome ,fill=Attrition))+
  geom_boxplot(outlier.color = "blue")

  • monthly salary for non-leavers is more spread out as compared to other category

0.0.7 test whether the difference in mean salary is statistically significant

t.test(MonthlyIncome~Attrition,data=dat_new,alternative="two.sided")
## 
##  Welch Two Sample t-test
## 
## data:  MonthlyIncome by Attrition
## t = -7.4826, df = 412.74, p-value = 4.434e-13
## alternative hypothesis: true difference in means between group Yes and group No is not equal to 0
## 95 percent confidence interval:
##  -2583.050 -1508.244
## sample estimates:
## mean in group Yes  mean in group No 
##          4787.093          6832.740

Comments

  • from the output of the test , t=-7.4826 represents the calculated difference in sample means in units of standard error and the negative sign suggest the first mean is smaller than the second
  • the p-value is less than 0.05 suggesting that the difference in means is statistically significant
  • the confidence interval also does not contain zero therefore providing further evidence that the difference in mean monthly income for each group is statistically significant

0.0.8 freguency by status of attrition

res<-dbGetQuery(con,'SELECT Attrition ,
                     COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
                     FROM dat_new
                     GROUP BY Attrition;')

res

0.1 Visualise the results

res |> 
  ggplot(aes(x=fct_reorder(Attrition, PERC), PERC)) +
  geom_col() + 
  scale_fill_avatar()+
  theme_avatar()+
  labs(x="STATUS", 
       y="count", 
       title="") + 
  coord_flip() + 
  geom_text(aes(label=paste0(round(PERC), "% "), hjust=1), col="white")

  • above we see that about 16% of employees churned

0.1.1 freguency by status of attrition Department

res<-dbGetQuery(con,'SELECT Attrition, 
                            Department, 
                            COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
                     FROM dat_new
                     GROUP BY Attrition,Department;')

res

0.1.2 visualise the outcome

res %>% 
  ggplot(aes(Department, Attrition, fill = PERC)) +
  geom_tile(color = "white") +
  geom_text(aes(label = number(PERC, big.mark = ","))) +
  scale_fill_binned(low = "blue", high = "lightyellow",
                    label = number_format(big.mark = ",")) +
  theme(legend.position = "top",
        legend.key.width = unit(15, "mm")) +
  labs(x = "Department", 
       y = "Attrition", 
       fill = "Frequency")

0.1.3 Mean daily rate per each category

res<-dbGetQuery(con,'SELECT Attrition , 
                           AVG(DailyRate) as Daily_average,
                           AVG(MonthlyRate) as Monthly_average,
                           AVG(HourlyRate) as hourly_average,
                           AVG(YearsAtCompany) as average_number_of_years,
                           AVG(RelationshipSatisfaction) as average_satisfaction
                     FROM dat_new
                     GROUP BY Attrition;')

res
  • average daily rate of employees who churned is lower than that of those who did not

0.1.4 summary statistics by attrition status and Department

res<-dbGetQuery(con,'SELECT Attrition,Department , AVG(TotalWorkingYears) as average_workhours ,
                                                   AVG(DistanceFromHome) as average_distance ,
                                                   AVG(YearsAtCompany) as average_years
                     FROM dat_new
                     GROUP BY Attrition,Department ;')

res
  • generally , people who stay a bit far from the company are more likely to leave maybe because they would want to find employement near their place of residence
  • strangely , people with less hours of work on average tend to leave.
  • generally employees who are likely to leave are those that have less number of years at the company

0.1.5 Average monthly salary in each jobRole grouped by attrition status

res<-dbGetQuery(con,'SELECT Attrition,JobRole, AVG(MonthlyRate) as average_rate
                     FROM dat_new
                     GROUP BY Attrition,JobRole ;')

res

0.2 data distribution of some variables

0.2.1 Daily rate and Attrition

dat_new %>% 
  ggplot(aes(x = DailyRate, fill =Attrition)) +
  geom_histogram(color = "white", alpha = 0.75,bins=30) +
  theme(legend.position = "top") +
  scale_x_continuous(labels = number_format(big.mark = ",")) +
  scale_y_continuous(labels = number_format(big.mark = ",")) +
  ggthemes::scale_fill_tableau()+
  labs(x = "Daily Rate", y = "Frequency",
       fill = "Attrition",
       title = "Daily Rate Distribution")

0.3 JobRole and Attrition

dat_new %>% 
  count(Attrition, JobRole) %>% 
  ggplot(aes(JobRole, Attrition, fill = n)) +
  geom_tile(color = "white") +
  geom_text(aes(label = number(n, big.mark = ","))) +
  scale_fill_binned(low = "blue", high = "lightyellow",
                    label = number_format(big.mark = ",")) +
  theme(legend.position = "top",
        legend.key.width = unit(15, "mm")) +
  labs(x = "JobRole", 
       y = "Attrition", 
       fill = "Frequency")+
  theme(axis.text.x = element_text(angle = 30, hjust = 1,color="blue"))

0.3.1 visualise relationships

Using binary correlation, I’ll include just the variables with a correlation coefficient of at least 0.10. For our employee attrition data set, OverTime (Y|N) has the largest correlation, JobLevel = 1, MonthlyIncome <= 2,695.80, etc.

library(plotly)
dat_new %>%
  select(-EmployeeNumber) %>%
  correlationfunnel::binarize(n_bins = 5, thresh_infreq = 0.01) %>%
  correlationfunnel::correlate(Attrition__Yes) %>%
  correlationfunnel::plot_correlation_funnel(interactive = FALSE) %>%
  ggplotly()  # Makes prettier, but drops the labels