Jacob Stoughton and Jakub Kepa

library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gmodels)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Read the HR dataset (HR_comma_sep.csv) from GitHub into a tibble named `hr`.
hr <- read_csv(
  file = "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Chi-test 1: Promotion in the last 5 years

# Cross-tabulate promotion status (rows) against whether the employee left
# (columns) and run Pearson's chi-squared test of independence.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
CrossTable(hr$promotion_last_5years, hr$left, chisq = TRUE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##                          | hr$left 
## hr$promotion_last_5years |         0 |         1 | Row Total | 
## -------------------------|-----------|-----------|-----------|
##                        0 |     11128 |      3552 |     14680 | 
##                          |     0.290 |     0.928 |           | 
##                          |     0.758 |     0.242 |     0.979 | 
##                          |     0.974 |     0.995 |           | 
##                          |     0.742 |     0.237 |           | 
## -------------------------|-----------|-----------|-----------|
##                        1 |       300 |        19 |       319 | 
##                          |    13.343 |    42.702 |           | 
##                          |     0.940 |     0.060 |     0.021 | 
##                          |     0.026 |     0.005 |           | 
##                          |     0.020 |     0.001 |           | 
## -------------------------|-----------|-----------|-----------|
##             Column Total |     11428 |      3571 |     14999 | 
##                          |     0.762 |     0.238 |           | 
## -------------------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  57.26273     d.f. =  1     p =  3.813123e-14 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  56.26163     d.f. =  1     p =  6.344155e-14 
## 
## 

The p-value is less than alpha (0.01); therefore, the association between promotions in the past 5 years and employees leaving is statistically significant.

There is an association between employees being promoted and leaving the company.

Employees who have not been promoted in the last 5 years are more likely to leave.

Visual:

# Recode the 0/1 promotion flag as a labelled factor so the plot axis shows
# readable category names.
df <- mutate(
  hr,
  promotion_last_5years = factor(
    promotion_last_5years,
    levels = c(0, 1),
    labels = c("Not Promoted", "Promoted")
  )
)

Compute % left by Promotion

# Percentage of employees who left within each promotion group.
# `left == 1` is a logical vector, so its mean is the proportion that left.
summary_df <- summarise(
  group_by(df, promotion_last_5years),
  pct_left = 100 * mean(left == 1)
)

Bar plot

# Bar chart: percentage of leavers for each promotion group.
layout(
  plot_ly(
    data = summary_df,
    x = ~promotion_last_5years,
    y = ~pct_left,
    type = "bar"
  ),
  title = "Employees Without a Recent Promotion are More Likely to Leave",
  xaxis = list(title = "Promotion/No Promotion"),
  yaxis = list(title = "Percentage Left (%)")
)

Chi-Test 2: Departments

# Cross-tabulate department (rows) against whether the employee left (columns)
# and run Pearson's chi-squared test of independence.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
CrossTable(hr$Department, hr$left, chisq = TRUE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##               | hr$left 
## hr$Department |         0 |         1 | Row Total | 
## --------------|-----------|-----------|-----------|
##    accounting |       563 |       204 |       767 | 
##               |     0.783 |     2.506 |           | 
##               |     0.734 |     0.266 |     0.051 | 
##               |     0.049 |     0.057 |           | 
##               |     0.038 |     0.014 |           | 
## --------------|-----------|-----------|-----------|
##            hr |       524 |       215 |       739 | 
##               |     2.709 |     8.670 |           | 
##               |     0.709 |     0.291 |     0.049 | 
##               |     0.046 |     0.060 |           | 
##               |     0.035 |     0.014 |           | 
## --------------|-----------|-----------|-----------|
##            IT |       954 |       273 |      1227 | 
##               |     0.391 |     1.252 |           | 
##               |     0.778 |     0.222 |     0.082 | 
##               |     0.083 |     0.076 |           | 
##               |     0.064 |     0.018 |           | 
## --------------|-----------|-----------|-----------|
##    management |       539 |        91 |       630 | 
##               |     7.250 |    23.202 |           | 
##               |     0.856 |     0.144 |     0.042 | 
##               |     0.047 |     0.025 |           | 
##               |     0.036 |     0.006 |           | 
## --------------|-----------|-----------|-----------|
##     marketing |       655 |       203 |       858 | 
##               |     0.002 |     0.008 |           | 
##               |     0.763 |     0.237 |     0.057 | 
##               |     0.057 |     0.057 |           | 
##               |     0.044 |     0.014 |           | 
## --------------|-----------|-----------|-----------|
##   product_mng |       704 |       198 |       902 | 
##               |     0.408 |     1.307 |           | 
##               |     0.780 |     0.220 |     0.060 | 
##               |     0.062 |     0.055 |           | 
##               |     0.047 |     0.013 |           | 
## --------------|-----------|-----------|-----------|
##         RandD |       666 |       121 |       787 | 
##               |     7.346 |    23.510 |           | 
##               |     0.846 |     0.154 |     0.052 | 
##               |     0.058 |     0.034 |           | 
##               |     0.044 |     0.008 |           | 
## --------------|-----------|-----------|-----------|
##         sales |      3126 |      1014 |      4140 | 
##               |     0.255 |     0.815 |           | 
##               |     0.755 |     0.245 |     0.276 | 
##               |     0.274 |     0.284 |           | 
##               |     0.208 |     0.068 |           | 
## --------------|-----------|-----------|-----------|
##       support |      1674 |       555 |      2229 | 
##               |     0.348 |     1.114 |           | 
##               |     0.751 |     0.249 |     0.149 | 
##               |     0.146 |     0.155 |           | 
##               |     0.112 |     0.037 |           | 
## --------------|-----------|-----------|-----------|
##     technical |      2023 |       697 |      2720 | 
##               |     1.178 |     3.771 |           | 
##               |     0.744 |     0.256 |     0.181 | 
##               |     0.177 |     0.195 |           | 
##               |     0.135 |     0.046 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |     11428 |      3571 |     14999 | 
##               |     0.762 |     0.238 |           | 
## --------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  86.82547     d.f. =  9     p =  7.04213e-15 
## 
## 
## 

The p-value is less than alpha (0.01); therefore, the association between department and employees leaving is statistically significant.

There is an association between department and employees leaving.

Employees in HR are the most likely to leave

Visual:

# Convert Department to a factor with an explicit level order.
df2 <- mutate(
  hr,
  Department = factor(
    Department,
    levels = c(
      "accounting", "sales", "hr", "IT", "management",
      "marketing", "product_mng", "RandD", "support", "technical"
    )
  )
)

Compute % left by Department

# Percentage of employees who left within each department.
# `left == 1` is a logical vector, so its mean is the proportion that left.
summary_df2 <- summarise(
  group_by(df2, Department),
  pct_left = 100 * mean(left == 1)
)

Bar plot

# Bar chart: percentage of leavers for each department.
layout(
  plot_ly(
    data = summary_df2,
    x = ~Department,
    y = ~pct_left,
    type = "bar"
  ),
  title = "HR Employees are Most Likely to Leave",
  xaxis = list(title = "Department"),
  yaxis = list(title = "Percentage Left (%)")
)

Chi-Test 3: Work Accidents

# Cross-tabulate work-accident status (rows) against whether the employee left
# (columns) and run Pearson's chi-squared test of independence.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
CrossTable(hr$Work_accident, hr$left, chisq = TRUE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##                  | hr$left 
## hr$Work_accident |         0 |         1 | Row Total | 
## -----------------|-----------|-----------|-----------|
##                0 |      9428 |      3402 |     12830 | 
##                  |    12.346 |    39.510 |           | 
##                  |     0.735 |     0.265 |     0.855 | 
##                  |     0.825 |     0.953 |           | 
##                  |     0.629 |     0.227 |           | 
## -----------------|-----------|-----------|-----------|
##                1 |      2000 |       169 |      2169 | 
##                  |    73.029 |   233.709 |           | 
##                  |     0.922 |     0.078 |     0.145 | 
##                  |     0.175 |     0.047 |           | 
##                  |     0.133 |     0.011 |           | 
## -----------------|-----------|-----------|-----------|
##     Column Total |     11428 |      3571 |     14999 | 
##                  |     0.762 |     0.238 |           | 
## -----------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  358.5938     d.f. =  1     p =  5.698673e-80 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  357.5624     d.f. =  1     p =  9.55824e-80 
## 
## 

The p-value is less than alpha (0.01); therefore, the association between having a work accident and leaving is statistically significant.

There is an association between having a work accident and leaving.

Employees that have a work accident are less likely to leave

Visual:

# Recode the 0/1 work-accident flag as a labelled factor so the plot axis
# shows "No"/"Yes" instead of numeric codes.
df3 <- mutate(
  hr,
  Work_accident = factor(
    Work_accident,
    levels = c(0, 1),
    labels = c("No", "Yes")
  )
)

Compute % left by Work Accident

# Percentage of employees who left within each work-accident group.
# `left == 1` is a logical vector, so its mean is the proportion that left.
summary_df3 <- summarise(
  group_by(df3, Work_accident),
  pct_left = 100 * mean(left == 1)
)

Bar plot

# Bar chart: percentage of leavers by work-accident status.
layout(
  plot_ly(
    data = summary_df3,
    x = ~Work_accident,
    y = ~pct_left,
    type = "bar"
  ),
  title = "Employees that have NOT had a Work Accident are More Likely to Leave",
  xaxis = list(title = "Work Accident"),
  yaxis = list(title = "Percentage Left (%)")
)

Chi-test 4: Salary

# Cross-tabulate salary level (rows) against whether the employee left
# (columns) and run Pearson's chi-squared test of independence.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
CrossTable(hr$salary, hr$left, chisq = TRUE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##              | hr$left 
##    hr$salary |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##         high |      1155 |        82 |      1237 | 
##              |    47.915 |   153.339 |           | 
##              |     0.934 |     0.066 |     0.082 | 
##              |     0.101 |     0.023 |           | 
##              |     0.077 |     0.005 |           | 
## -------------|-----------|-----------|-----------|
##          low |      5144 |      2172 |      7316 | 
##              |    33.200 |   106.247 |           | 
##              |     0.703 |     0.297 |     0.488 | 
##              |     0.450 |     0.608 |           | 
##              |     0.343 |     0.145 |           | 
## -------------|-----------|-----------|-----------|
##       medium |      5129 |      1317 |      6446 | 
##              |     9.648 |    30.876 |           | 
##              |     0.796 |     0.204 |     0.430 | 
##              |     0.449 |     0.369 |           | 
##              |     0.342 |     0.088 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |     11428 |      3571 |     14999 | 
##              |     0.762 |     0.238 |           | 
## -------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  381.225     d.f. =  2     p =  1.652087e-83 
## 
## 
## 

The p-value is less than alpha (0.01); therefore, the association between salary level and employees leaving is statistically significant.

There is an association between salary level and employees leaving

Employees with a high salary level are the least likely to leave (6.6%), while employees with a low salary level are the most likely to leave (29.7%).

Visual:

# Convert salary to a factor ordered low -> medium -> high so the bars appear
# in that order rather than alphabetically.
df4 <- mutate(
  hr,
  salary = factor(salary, levels = c("low", "medium", "high"))
)

Compute % left by Salary

# Percentage of employees who left within each salary level.
# `left == 1` is a logical vector, so its mean is the proportion that left.
summary_df4 <- summarise(
  group_by(df4, salary),
  pct_left = 100 * mean(left == 1)
)

Bar plot

# Bar chart: percentage of leavers for each salary level.
layout(
  plot_ly(
    data = summary_df4,
    x = ~salary,
    y = ~pct_left,
    type = "bar"
  ),
  title = "Employees that Have a Lower Salary Level are More Likely to Leave",
  xaxis = list(title = "Salary Level"),
  yaxis = list(title = "Percentage Left (%)")
)