library(readr)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the HR CSV from GitHub into `hr`; the chi-squared tests and plots
# further down all read their columns from this object.
hr <- read_csv(
  file = "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Test 1

# Pearson chi-squared test of independence between promotion in the last
# 5 years and whether the employee left.
chisq.test(x = hr$promotion_last_5years, y = hr$left)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  hr$promotion_last_5years and hr$left
## X-squared = 56.262, df = 1, p-value = 6.344e-14
p-value interpretation: The p-value (6.344e-14) is far below 0.05, so a result this extreme would be very unlikely if promotion and leaving were independent; the result is statistically significant.
chi-square interpretation: There is a dependency between getting a promotion in the last 5 years and leaving the company.
non-technical interpretation: Employees who did not get a promotion are 4 times more likely to leave.
# For each promotion group (converted to a factor so it plots as a discrete
# category), compute the share of employees who stayed and who left.
prop_data <- hr %>%
  group_by(promotion_last_5years = as.factor(promotion_last_5years)) %>%
  summarise(
    # Order matters: `stayed` has to be computed before `left` is overwritten
    # with its own proportion on the next line.
    stayed = sum(left == 0) / n(),
    left = sum(left == 1) / n()
  )

# Stacked bar chart: one bar per promotion group, split into the stayed and
# left proportions. Both traces inherit the shared x mapping from plot_ly().
plot_ly(prop_data, x = ~promotion_last_5years) %>%
  add_bars(
    y = ~stayed,
    name = "stayed",
    marker = list(color = "#1f77b4")
  ) %>%
  add_bars(
    y = ~left,
    name = "Left",
    marker = list(color = "#ff7f0e")
  ) %>%
  layout(
    title = "Employees that did not get a promotion are 4 times more likely to leave",
    barmode = "stack",
    xaxis = list(title = "Promotion in the last 5 years"),
    # ",.0%" renders the 0-1 proportions as whole-number percentages.
    yaxis = list(title = "Proportion", tickformat = ",.0%")
  )

Test 2

# Pearson chi-squared test of independence between having had a work
# accident and whether the employee left.
chisq.test(x = hr$Work_accident, y = hr$left)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  hr$Work_accident and hr$left
## X-squared = 357.56, df = 1, p-value < 2.2e-16
p-value interpretation: The p-value (< 2.2e-16) is very small, so this test is statistically significant: there is an association between the two variables.
chi-square interpretation: There is a dependency between having a work accident and leaving the company.
non-technical interpretation: Employees who had a work accident are more likely to stay.
# For each work-accident group (converted to a factor so it plots as a
# discrete category), compute the share of employees who stayed and who left.
prop_data <- hr %>%
  group_by(Work_accident = as.factor(Work_accident)) %>%
  summarise(
    # Order matters: `stayed` has to be computed before `left` is overwritten
    # with its own proportion on the next line.
    stayed = sum(left == 0) / n(),
    left = sum(left == 1) / n()
  )

# Stacked bar chart: one bar per work-accident group, split into the stayed
# and left proportions. Both traces inherit the shared x mapping.
plot_ly(prop_data, x = ~Work_accident) %>%
  add_bars(
    y = ~stayed,
    name = "stayed",
    marker = list(color = "#ebd409")
  ) %>%
  add_bars(
    y = ~left,
    name = "Left",
    marker = list(color = "#039605")
  ) %>%
  layout(
    # Fixed: the previous title said the no-accident group was "more likely
    # to stay", contradicting the written finding that employees WITH an
    # accident are the ones more likely to stay.
    title = "Employees that did not have a work accident are 3 times more likely to leave",
    barmode = "stack",
    xaxis = list(title = "Work Accident"),
    # ",.0%" renders the 0-1 proportions as whole-number percentages.
    yaxis = list(title = "Proportion", tickformat = ",.0%")
  )

Test 3

# Pearson chi-squared test of independence between salary level and
# whether the employee left.
chisq.test(x = hr$salary, y = hr$left)
## 
##  Pearson's Chi-squared test
## 
## data:  hr$salary and hr$left
## X-squared = 381.23, df = 2, p-value < 2.2e-16
p-value interpretation: The p-value (< 2.2e-16) is very small, so a result this extreme would be very unlikely if salary and leaving were independent; the result is statistically significant.
chi-square interpretation: There is a dependency between salary level and whether employees left.
non-technical interpretation: Employees that have a low salary are 4 times more likely to leave than those with a high salary.
# For each salary level, compute the share of employees who stayed and who
# left.
# NOTE(review): as.factor() orders the levels alphabetically; set explicit
# levels if an ordinal order is wanted on the x axis.
prop_data <- hr %>%
  group_by(salary = as.factor(salary)) %>%
  summarise(
    # Order matters: `stayed` has to be computed before `left` is overwritten
    # with its own proportion on the next line.
    stayed = sum(left == 0) / n(),
    left = sum(left == 1) / n()
  )

# Stacked bar chart: one bar per salary level, split into the stayed and
# left proportions. Both traces inherit the shared x mapping.
plot_ly(prop_data, x = ~salary) %>%
  add_bars(
    y = ~stayed,
    name = "stayed",
    marker = list(color = "#940396")
  ) %>%
  add_bars(
    y = ~left,
    name = "Left",
    marker = list(color = "#00d6f7")
  ) %>%
  layout(
    # Fixed: the title literal used to span two source lines, embedding a raw
    # newline plus indentation, and misspelled "likely". Plotly text breaks
    # lines with <br>, so the break is now explicit.
    title = paste0(
      "Employees that have a low salary are 4 times more<br>",
      "likely to leave than high salary"
    ),
    barmode = "stack",
    xaxis = list(title = "Salary"),
    # ",.0%" renders the 0-1 proportions as whole-number percentages.
    yaxis = list(title = "Proportion", tickformat = ",.0%")
  )

Test 4

# Pearson chi-squared test of independence between department and whether
# the employee left.
chisq.test(x = hr$Department, y = hr$left)
## 
##  Pearson's Chi-squared test
## 
## data:  hr$Department and hr$left
## X-squared = 86.825, df = 9, p-value = 7.042e-15
p-value interpretation: The p-value (7.042e-15) is far below 0.05, so a result this extreme would be very unlikely if department and leaving were independent; the result is statistically significant.
chi-square interpretation: There is a dependency between department and leaving the company.
non-technical interpretation: Employees in the management and RandD departments are more likely to stay with the company over time, while those in the HR department are the most likely to leave.
# For each department, compute the share of employees who stayed and who
# left.
prop_data <- hr %>%
  group_by(Department = as.factor(Department)) %>%
  summarise(
    # Order matters: `stayed` has to be computed before `left` is overwritten
    # with its own proportion on the next line.
    stayed = sum(left == 0) / n(),
    left = sum(left == 1) / n()
  )

# Stacked bar chart: one bar per department, split into the stayed and left
# proportions. Both traces inherit the shared x mapping.
plot_ly(prop_data, x = ~Department) %>%
  add_bars(
    y = ~stayed,
    name = "stayed",
    marker = list(color = "#b50300")
  ) %>%
  add_bars(
    y = ~left,
    name = "Left",
    marker = list(color = "#fa2fd8")
  ) %>%
  layout(
    # Fixed: the title literal used to span two source lines, embedding a raw
    # newline plus indentation. Plotly text breaks lines with <br>, so the
    # break is now explicit. Also tidied "those in the HR".
    title = paste0(
      "Employees in Management and RandD are more likely<br>",
      "to stay, while those in HR are most likely to leave"
    ),
    barmode = "stack",
    xaxis = list(title = "Department"),
    # ",.0%" renders the 0-1 proportions as whole-number percentages.
    yaxis = list(title = "Proportion", tickformat = ",.0%")
  )