library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(gmodels)
# Download the HR dataset (HR_comma_sep.csv) and parse it into a tibble;
# readr guesses the column types from the file contents.
hr <- read_csv(
  file = "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Recode the attrition flag as a labelled factor. The levels are spelled out
# so the value -> label mapping is explicit instead of depending on which
# values happen to be present in the column (assumes the raw column is coded
# 0 = stayed, 1 = left, matching the label order used originally).
hr$left <- factor(hr$left, levels = c(0, 1), labels = c("Stayed", "Left"))

# Chi-squared test of independence between attrition and work accidents.
chisq.test(hr$left, hr$Work_accident)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  hr$left and hr$Work_accident
## X-squared = 357.56, df = 1, p-value < 2.2e-16
# Within each attrition group, the share of employees without / with a work
# accident. Work_accident is still the raw 0/1 column here, so the mean of
# each logical comparison is the within-group proportion.
prop_data <- hr %>%
  group_by(left) %>%
  summarise(
    No = mean(Work_accident == 0),
    Yes = mean(Work_accident == 1)
  )

# Stacked bars: one bar per attrition group, split by work-accident status.
plot_ly(prop_data) %>%
  add_bars(x = ~left, y = ~Yes, name = "Yes", marker = list(color = "#1f77b4")) %>%
  add_bars(x = ~left, y = ~No, name = "No", marker = list(color = "#ff7f0e")) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Employee Status"),
    yaxis = list(title = "Proportion of Work Accident", tickformat = ",.0%"),
    title = "Employees that had work accidents \n are more likely to stay"
  )

# Convert the 0/1 flag to a labelled factor only after the numeric
# comparisons above. Explicit levels keep the 0 -> "No", 1 -> "Yes" mapping
# independent of which values are present in the data.
hr$Work_accident <- factor(
  hr$Work_accident,
  levels = c(0, 1),
  labels = c("No", "Yes")
)

# Contingency table with chi-squared statistics. TRUE is spelled out because
# the shorthand T is an ordinary variable that can be reassigned.
CrossTable(hr$left, hr$Work_accident, chisq = TRUE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##              | hr$Work_accident 
##      hr$left |        No |       Yes | Row Total | 
## -------------|-----------|-----------|-----------|
##       Stayed |      9428 |      2000 |     11428 | 
##              |    12.346 |    73.029 |           | 
##              |     0.825 |     0.175 |     0.762 | 
##              |     0.735 |     0.922 |           | 
##              |     0.629 |     0.133 |           | 
## -------------|-----------|-----------|-----------|
##         Left |      3402 |       169 |      3571 | 
##              |    39.510 |   233.709 |           | 
##              |     0.953 |     0.047 |     0.238 | 
##              |     0.265 |     0.078 |           | 
##              |     0.227 |     0.011 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |     12830 |      2169 |     14999 | 
##              |     0.855 |     0.145 |           | 
## -------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  358.5938     d.f. =  1     p =  5.698673e-80 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  357.5624     d.f. =  1     p =  9.55824e-80 
## 
## 
# Attrition rate within each work-accident group. Comparing against the
# "Left" label states the intent directly and does not depend on the
# factor's internal integer codes (the original used as.numeric(left) - 1).
summary_data <- hr %>%
  group_by(Work_accident) %>%
  summarize(proportion_left = mean(left == "Left"))

# Bar chart of the attrition rate per group, labelled with the rounded value.
plot_ly(
  summary_data,
  x = ~Work_accident,
  y = ~proportion_left,
  type = "bar",
  text = ~round(proportion_left, 2),
  textposition = "auto"
) %>%
  layout(
    title = "Employees that did not have a work accident \n are more than 3 times more likely to leave",
    xaxis = list(title = "Work Accident"),
    yaxis = list(title = "Proportion Left")
  )

# Recode the promotion flag as a labelled factor with explicit levels so the
# value -> label mapping does not depend on which values are present
# (assumes the raw column is coded 0 = no promotion, 1 = promoted).
hr$promotion_last_5years <- factor(
  hr$promotion_last_5years,
  levels = c(0, 1),
  labels = c("No", "Yes")
)

# Chi-squared test of independence between attrition and promotion.
chisq.test(hr$left, hr$promotion_last_5years)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  hr$left and hr$promotion_last_5years
## X-squared = 56.262, df = 1, p-value = 6.344e-14
# Within each attrition group, the share of employees who were not / were
# promoted in the last five years (mean of a logical is the proportion).
prop_data2 <- hr %>%
  group_by(left) %>%
  summarise(
    No = mean(promotion_last_5years == "No"),
    Yes = mean(promotion_last_5years == "Yes")
  )

# Stacked bars: one bar per attrition group, split by promotion status.
plot_ly(prop_data2) %>%
  add_bars(x = ~left, y = ~Yes, name = "Yes", marker = list(color = "#1f77b4")) %>%
  add_bars(x = ~left, y = ~No, name = "No", marker = list(color = "#ff7f0e")) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Employee Status"),
    yaxis = list(title = "Proportion of Promotions", tickformat = ",.0%"),
    title = "Employees who were not promoted \n are more likely to leave"
  )

# Contingency table with chi-squared statistics. TRUE is spelled out because
# the shorthand T is an ordinary variable that can be reassigned.
CrossTable(hr$left, hr$promotion_last_5years, chisq = TRUE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##              | hr$promotion_last_5years 
##      hr$left |        No |       Yes | Row Total | 
## -------------|-----------|-----------|-----------|
##       Stayed |     11128 |       300 |     11428 | 
##              |     0.290 |    13.343 |           | 
##              |     0.974 |     0.026 |     0.762 | 
##              |     0.758 |     0.940 |           | 
##              |     0.742 |     0.020 |           | 
## -------------|-----------|-----------|-----------|
##         Left |      3552 |        19 |      3571 | 
##              |     0.928 |    42.702 |           | 
##              |     0.995 |     0.005 |     0.238 | 
##              |     0.242 |     0.060 |           | 
##              |     0.237 |     0.001 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |     14680 |       319 |     14999 | 
##              |     0.979 |     0.021 |           | 
## -------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  57.26273     d.f. =  1     p =  3.813123e-14 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  56.26163     d.f. =  1     p =  6.344155e-14 
## 
## 
# Attrition rate within each promotion group. Comparing against the "Left"
# label avoids relying on the factor's internal integer codes (the original
# used as.numeric(left) - 1).
summary_data2 <- hr %>%
  group_by(promotion_last_5years) %>%
  summarize(proportion_left = mean(left == "Left"))

# Bar chart of the attrition rate per group, labelled with the rounded value.
plot_ly(
  summary_data2,
  x = ~promotion_last_5years,
  y = ~proportion_left,
  type = "bar",
  text = ~round(proportion_left, 2),
  textposition = "auto"
) %>%
  layout(
    title = "Employees who were not promoted \n are more likely to leave",
    xaxis = list(title = "Promotion in Last 5 Years"),
    yaxis = list(title = "Proportion Left")
  )

# Order the salary bands low -> medium -> high rather than alphabetically so
# tables and plots list them in that order.
hr$salary <- factor(hr$salary, levels = c("low", "medium", "high"))

# Chi-squared test of independence between attrition and salary band.
chisq.test(hr$left, hr$salary)
## 
##  Pearson's Chi-squared test
## 
## data:  hr$left and hr$salary
## X-squared = 381.23, df = 2, p-value < 2.2e-16
# Within each attrition group, the share of employees in each salary band
# (mean of a logical is the proportion).
prop_data3 <- hr %>%
  group_by(left) %>%
  summarise(
    Low = mean(salary == "low"),
    Medium = mean(salary == "medium"),
    High = mean(salary == "high")
  )

# Stacked bars: one bar per attrition group, split by salary band.
plot_ly(prop_data3) %>%
  add_bars(x = ~left, y = ~High, name = "High", marker = list(color = "#2ca02c")) %>%
  add_bars(x = ~left, y = ~Medium, name = "Medium", marker = list(color = "#ff7f0e")) %>%
  add_bars(x = ~left, y = ~Low, name = "Low", marker = list(color = "#d62728")) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Employee Status"),
    yaxis = list(title = "Proportion of Salary Level", tickformat = ",.0%"),
    title = "Employees with low salaries \n are more likely to leave"
  )

# Contingency table with chi-squared statistics. TRUE is spelled out because
# the shorthand T is an ordinary variable that can be reassigned.
CrossTable(hr$left, hr$salary, chisq = TRUE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##              | hr$salary 
##      hr$left |       low |    medium |      high | Row Total | 
## -------------|-----------|-----------|-----------|-----------|
##       Stayed |      5144 |      5129 |      1155 |     11428 | 
##              |    33.200 |     9.648 |    47.915 |           | 
##              |     0.450 |     0.449 |     0.101 |     0.762 | 
##              |     0.703 |     0.796 |     0.934 |           | 
##              |     0.343 |     0.342 |     0.077 |           | 
## -------------|-----------|-----------|-----------|-----------|
##         Left |      2172 |      1317 |        82 |      3571 | 
##              |   106.247 |    30.876 |   153.339 |           | 
##              |     0.608 |     0.369 |     0.023 |     0.238 | 
##              |     0.297 |     0.204 |     0.066 |           | 
##              |     0.145 |     0.088 |     0.005 |           | 
## -------------|-----------|-----------|-----------|-----------|
## Column Total |      7316 |      6446 |      1237 |     14999 | 
##              |     0.488 |     0.430 |     0.082 |           | 
## -------------|-----------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  381.225     d.f. =  2     p =  1.652087e-83 
## 
## 
## 
# Attrition rate within each salary band. Comparing against the "Left" label
# avoids relying on the factor's internal integer codes (the original used
# as.numeric(left) - 1).
summary_data3 <- hr %>%
  group_by(salary) %>%
  summarize(proportion_left = mean(left == "Left"))

# Bar chart of the attrition rate per band, labelled with the rounded value.
plot_ly(
  summary_data3,
  x = ~salary,
  y = ~proportion_left,
  type = "bar",
  text = ~round(proportion_left, 2),
  textposition = "auto"
) %>%
  layout(
    title = "Employees with low salaries \n are more likely to leave",
    xaxis = list(title = "Salary Level"),
    yaxis = list(title = "Proportion Left")
  )

# Treat department as categorical; levels take factor()'s default sorted order.
hr$Department <- factor(hr$Department)

# Chi-squared test of independence between attrition and department.
chisq.test(hr$left, hr$Department)
## 
##  Pearson's Chi-squared test
## 
## data:  hr$left and hr$Department
## X-squared = 86.825, df = 9, p-value = 7.042e-15
# Attrition rate within each department. Comparing against the "Left" label
# avoids relying on the factor's internal integer codes (the original used
# as.numeric(left) - 1).
dept_summary <- hr %>%
  group_by(Department) %>%
  summarize(proportion_left = mean(left == "Left"))

# Bar chart of the attrition rate per department, labelled with the rounded
# value.
plot_ly(
  dept_summary,
  x = ~Department,
  y = ~proportion_left,
  type = "bar",
  text = ~round(proportion_left, 2),
  textposition = "auto"
) %>%
  layout(
    title = "Attrition rates vary by department",
    xaxis = list(title = "Department"),
    yaxis = list(title = "Proportion Left")
  )

# Contingency table with chi-squared statistics. TRUE is spelled out because
# the shorthand T is an ordinary variable that can be reassigned.
CrossTable(hr$left, hr$Department, chisq = TRUE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##              | hr$Department 
##      hr$left |  accounting |          hr |          IT |  management |   marketing | product_mng |       RandD |       sales |     support |   technical |   Row Total | 
## -------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
##       Stayed |         563 |         524 |         954 |         539 |         655 |         704 |         666 |        3126 |        1674 |        2023 |       11428 | 
##              |       0.783 |       2.709 |       0.391 |       7.250 |       0.002 |       0.408 |       7.346 |       0.255 |       0.348 |       1.178 |             | 
##              |       0.049 |       0.046 |       0.083 |       0.047 |       0.057 |       0.062 |       0.058 |       0.274 |       0.146 |       0.177 |       0.762 | 
##              |       0.734 |       0.709 |       0.778 |       0.856 |       0.763 |       0.780 |       0.846 |       0.755 |       0.751 |       0.744 |             | 
##              |       0.038 |       0.035 |       0.064 |       0.036 |       0.044 |       0.047 |       0.044 |       0.208 |       0.112 |       0.135 |             | 
## -------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
##         Left |         204 |         215 |         273 |          91 |         203 |         198 |         121 |        1014 |         555 |         697 |        3571 | 
##              |       2.506 |       8.670 |       1.252 |      23.202 |       0.008 |       1.307 |      23.510 |       0.815 |       1.114 |       3.771 |             | 
##              |       0.057 |       0.060 |       0.076 |       0.025 |       0.057 |       0.055 |       0.034 |       0.284 |       0.155 |       0.195 |       0.238 | 
##              |       0.266 |       0.291 |       0.222 |       0.144 |       0.237 |       0.220 |       0.154 |       0.245 |       0.249 |       0.256 |             | 
##              |       0.014 |       0.014 |       0.018 |       0.006 |       0.014 |       0.013 |       0.008 |       0.068 |       0.037 |       0.046 |             | 
## -------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## Column Total |         767 |         739 |        1227 |         630 |         858 |         902 |         787 |        4140 |        2229 |        2720 |       14999 | 
##              |       0.051 |       0.049 |       0.082 |       0.042 |       0.057 |       0.060 |       0.052 |       0.276 |       0.149 |       0.181 |             | 
## -------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  86.82547     d.f. =  9     p =  7.04213e-15 
## 
## 
##