library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(gmodels)
# Load the HR dataset from its public GitHub mirror; read_csv() guesses the
# column types and reports them in a message.
hr <- read_csv(
  file = "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Recode `left` as a labelled factor. No explicit `levels` are given, so the
# labels are assigned to the sorted unique values of the column, in order.
hr$left <- factor(x = hr$left, labels = c("Stayed", "Left"))

# Chi-squared test of independence between leaving and work accidents.
chisq.test(x = hr$left, y = hr$Work_accident)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hr$left and hr$Work_accident
## X-squared = 357.56, df = 1, p-value < 2.2e-16
# For each `left` group, the share of employees without (No) and with (Yes) a
# work accident. Work_accident is compared against numeric 0/1 at this point.
prop_data <- hr %>%
  group_by(left) %>%
  summarise(
    No = sum(Work_accident == 0) / n(),
    Yes = sum(Work_accident == 1) / n()
  )

# Stacked bar chart: one bar per `left` group, split into Yes / No shares.
prop_data %>%
  plot_ly() %>%
  add_bars(
    x = ~left,
    y = ~Yes,
    name = "Yes",
    marker = list(color = "#1f77b4")
  ) %>%
  add_bars(
    x = ~left,
    y = ~No,
    name = "No",
    marker = list(color = "#ff7f0e")
  ) %>%
  layout(
    title = "Employees that had work accidents \n are more likely to stay",
    barmode = "stack",
    xaxis = list(title = "Employee Status"),
    yaxis = list(title = "Proportion of Work Accident", tickformat = ",.0%")
  )
# Recode Work_accident as a labelled factor. This happens only now because the
# proportion summary earlier in the script compares the column against
# numeric 0 and 1.
hr$Work_accident <- factor(hr$Work_accident, labels = c("No", "Yes"))

# Cross-tabulate leaving against work accidents and print the chi-squared
# tests. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
CrossTable(hr$left, hr$Work_accident, chisq = TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hr$Work_accident
## hr$left | No | Yes | Row Total |
## -------------|-----------|-----------|-----------|
## Stayed | 9428 | 2000 | 11428 |
## | 12.346 | 73.029 | |
## | 0.825 | 0.175 | 0.762 |
## | 0.735 | 0.922 | |
## | 0.629 | 0.133 | |
## -------------|-----------|-----------|-----------|
## Left | 3402 | 169 | 3571 |
## | 39.510 | 233.709 | |
## | 0.953 | 0.047 | 0.238 |
## | 0.265 | 0.078 | |
## | 0.227 | 0.011 | |
## -------------|-----------|-----------|-----------|
## Column Total | 12830 | 2169 | 14999 |
## | 0.855 | 0.145 | |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 358.5938 d.f. = 1 p = 5.698673e-80
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 357.5624 d.f. = 1 p = 9.55824e-80
##
##
# Proportion of employees who left within each Work_accident group.
# The "Left" label is compared directly instead of doing arithmetic on the
# factor's integer codes (`as.numeric(left) - 1`), which silently depends on
# the order of the factor levels.
summary_data <- hr %>%
  group_by(Work_accident) %>%
  summarise(proportion_left = mean(left == "Left"))

# Bar chart of the leaving proportion per group, labelled with the value
# rounded to two decimals.
plot_ly(
  summary_data,
  x = ~Work_accident,
  y = ~proportion_left,
  type = "bar",
  text = ~round(proportion_left, 2),
  textposition = "auto"
) %>%
  layout(
    title = "Employees that did not have a work accident \n are more than 3 times more likely to leave",
    xaxis = list(title = "Work Accident"),
    yaxis = list(title = "Proportion Left")
  )
# Recode promotion_last_5years as a labelled factor. No explicit `levels` are
# given, so the labels are assigned to the sorted unique values, in order.
hr$promotion_last_5years <- factor(
  x = hr$promotion_last_5years,
  labels = c("No", "Yes")
)

# Chi-squared test of independence between leaving and recent promotion.
chisq.test(x = hr$left, y = hr$promotion_last_5years)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hr$left and hr$promotion_last_5years
## X-squared = 56.262, df = 1, p-value = 6.344e-14
# For each `left` group, the share of employees without (No) and with (Yes) a
# promotion in the last five years.
prop_data2 <- hr %>%
  group_by(left) %>%
  summarise(
    No = sum(promotion_last_5years == "No") / n(),
    Yes = sum(promotion_last_5years == "Yes") / n()
  )

# Stacked bar chart: one bar per `left` group, split into Yes / No shares.
prop_data2 %>%
  plot_ly() %>%
  add_bars(
    x = ~left,
    y = ~Yes,
    name = "Yes",
    marker = list(color = "#1f77b4")
  ) %>%
  add_bars(
    x = ~left,
    y = ~No,
    name = "No",
    marker = list(color = "#ff7f0e")
  ) %>%
  layout(
    title = "Employees who were not promoted \n are more likely to leave",
    barmode = "stack",
    xaxis = list(title = "Employee Status"),
    yaxis = list(title = "Proportion of Promotions", tickformat = ",.0%")
  )
# Cross-tabulate leaving against recent promotion and print the chi-squared
# tests. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
CrossTable(hr$left, hr$promotion_last_5years, chisq = TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hr$promotion_last_5years
## hr$left | No | Yes | Row Total |
## -------------|-----------|-----------|-----------|
## Stayed | 11128 | 300 | 11428 |
## | 0.290 | 13.343 | |
## | 0.974 | 0.026 | 0.762 |
## | 0.758 | 0.940 | |
## | 0.742 | 0.020 | |
## -------------|-----------|-----------|-----------|
## Left | 3552 | 19 | 3571 |
## | 0.928 | 42.702 | |
## | 0.995 | 0.005 | 0.238 |
## | 0.242 | 0.060 | |
## | 0.237 | 0.001 | |
## -------------|-----------|-----------|-----------|
## Column Total | 14680 | 319 | 14999 |
## | 0.979 | 0.021 | |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 57.26273 d.f. = 1 p = 3.813123e-14
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 56.26163 d.f. = 1 p = 6.344155e-14
##
##
# Proportion of employees who left within each promotion group.
# The "Left" label is compared directly instead of doing arithmetic on the
# factor's integer codes (`as.numeric(left) - 1`), which silently depends on
# the order of the factor levels.
summary_data2 <- hr %>%
  group_by(promotion_last_5years) %>%
  summarise(proportion_left = mean(left == "Left"))

# Bar chart of the leaving proportion per group, labelled with the value
# rounded to two decimals.
plot_ly(
  summary_data2,
  x = ~promotion_last_5years,
  y = ~proportion_left,
  type = "bar",
  text = ~round(proportion_left, 2),
  textposition = "auto"
) %>%
  layout(
    title = "Employees who were not promoted \n are more likely to leave",
    xaxis = list(title = "Promotion in Last 5 Years"),
    yaxis = list(title = "Proportion Left")
  )
# Recode salary as a factor with its levels listed explicitly as
# low, medium, high (rather than the default alphabetical order).
hr$salary <- factor(
  x = hr$salary,
  levels = c("low", "medium", "high")
)

# Chi-squared test of independence between leaving and salary level.
chisq.test(x = hr$left, y = hr$salary)
##
## Pearson's Chi-squared test
##
## data: hr$left and hr$salary
## X-squared = 381.23, df = 2, p-value < 2.2e-16
# For each `left` group, the share of employees at each salary level.
prop_data3 <- hr %>%
  group_by(left) %>%
  summarise(
    Low = sum(salary == "low") / n(),
    Medium = sum(salary == "medium") / n(),
    High = sum(salary == "high") / n()
  )

# Stacked bar chart: one bar per `left` group, with traces added in the order
# High, Medium, Low.
prop_data3 %>%
  plot_ly() %>%
  add_bars(
    x = ~left,
    y = ~High,
    name = "High",
    marker = list(color = "#2ca02c")
  ) %>%
  add_bars(
    x = ~left,
    y = ~Medium,
    name = "Medium",
    marker = list(color = "#ff7f0e")
  ) %>%
  add_bars(
    x = ~left,
    y = ~Low,
    name = "Low",
    marker = list(color = "#d62728")
  ) %>%
  layout(
    title = "Employees with low salaries \n are more likely to leave",
    barmode = "stack",
    xaxis = list(title = "Employee Status"),
    yaxis = list(title = "Proportion of Salary Level", tickformat = ",.0%")
  )
# Cross-tabulate leaving against salary level and print the chi-squared test.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
CrossTable(hr$left, hr$salary, chisq = TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hr$salary
## hr$left | low | medium | high | Row Total |
## -------------|-----------|-----------|-----------|-----------|
## Stayed | 5144 | 5129 | 1155 | 11428 |
## | 33.200 | 9.648 | 47.915 | |
## | 0.450 | 0.449 | 0.101 | 0.762 |
## | 0.703 | 0.796 | 0.934 | |
## | 0.343 | 0.342 | 0.077 | |
## -------------|-----------|-----------|-----------|-----------|
## Left | 2172 | 1317 | 82 | 3571 |
## | 106.247 | 30.876 | 153.339 | |
## | 0.608 | 0.369 | 0.023 | 0.238 |
## | 0.297 | 0.204 | 0.066 | |
## | 0.145 | 0.088 | 0.005 | |
## -------------|-----------|-----------|-----------|-----------|
## Column Total | 7316 | 6446 | 1237 | 14999 |
## | 0.488 | 0.430 | 0.082 | |
## -------------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 381.225 d.f. = 2 p = 1.652087e-83
##
##
##
# Proportion of employees who left within each salary level.
# The "Left" label is compared directly instead of doing arithmetic on the
# factor's integer codes (`as.numeric(left) - 1`), which silently depends on
# the order of the factor levels.
summary_data3 <- hr %>%
  group_by(salary) %>%
  summarise(proportion_left = mean(left == "Left"))

# Bar chart of the leaving proportion per salary level, labelled with the
# value rounded to two decimals.
plot_ly(
  summary_data3,
  x = ~salary,
  y = ~proportion_left,
  type = "bar",
  text = ~round(proportion_left, 2),
  textposition = "auto"
) %>%
  layout(
    title = "Employees with low salaries \n are more likely to leave",
    xaxis = list(title = "Salary Level"),
    yaxis = list(title = "Proportion Left")
  )
# Recode Department as a factor, with levels taken from the values present.
hr$Department <- factor(x = hr$Department)

# Chi-squared test of independence between leaving and department.
chisq.test(x = hr$left, y = hr$Department)
##
## Pearson's Chi-squared test
##
## data: hr$left and hr$Department
## X-squared = 86.825, df = 9, p-value = 7.042e-15
# Proportion of employees who left within each department.
# The "Left" label is compared directly instead of doing arithmetic on the
# factor's integer codes (`as.numeric(left) - 1`), which silently depends on
# the order of the factor levels.
dept_summary <- hr %>%
  group_by(Department) %>%
  summarise(proportion_left = mean(left == "Left"))

# Bar chart of the leaving proportion per department, labelled with the value
# rounded to two decimals.
plot_ly(
  dept_summary,
  x = ~Department,
  y = ~proportion_left,
  type = "bar",
  text = ~round(proportion_left, 2),
  textposition = "auto"
) %>%
  layout(
    title = "Attrition rates vary by department",
    xaxis = list(title = "Department"),
    yaxis = list(title = "Proportion Left")
  )
# Cross-tabulate leaving against department and print the chi-squared test.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
CrossTable(hr$left, hr$Department, chisq = TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hr$Department
## hr$left | accounting | hr | IT | management | marketing | product_mng | RandD | sales | support | technical | Row Total |
## -------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## Stayed | 563 | 524 | 954 | 539 | 655 | 704 | 666 | 3126 | 1674 | 2023 | 11428 |
## | 0.783 | 2.709 | 0.391 | 7.250 | 0.002 | 0.408 | 7.346 | 0.255 | 0.348 | 1.178 | |
## | 0.049 | 0.046 | 0.083 | 0.047 | 0.057 | 0.062 | 0.058 | 0.274 | 0.146 | 0.177 | 0.762 |
## | 0.734 | 0.709 | 0.778 | 0.856 | 0.763 | 0.780 | 0.846 | 0.755 | 0.751 | 0.744 | |
## | 0.038 | 0.035 | 0.064 | 0.036 | 0.044 | 0.047 | 0.044 | 0.208 | 0.112 | 0.135 | |
## -------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## Left | 204 | 215 | 273 | 91 | 203 | 198 | 121 | 1014 | 555 | 697 | 3571 |
## | 2.506 | 8.670 | 1.252 | 23.202 | 0.008 | 1.307 | 23.510 | 0.815 | 1.114 | 3.771 | |
## | 0.057 | 0.060 | 0.076 | 0.025 | 0.057 | 0.055 | 0.034 | 0.284 | 0.155 | 0.195 | 0.238 |
## | 0.266 | 0.291 | 0.222 | 0.144 | 0.237 | 0.220 | 0.154 | 0.245 | 0.249 | 0.256 | |
## | 0.014 | 0.014 | 0.018 | 0.006 | 0.014 | 0.013 | 0.008 | 0.068 | 0.037 | 0.046 | |
## -------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## Column Total | 767 | 739 | 1227 | 630 | 858 | 902 | 787 | 4140 | 2229 | 2720 | 14999 |
## | 0.051 | 0.049 | 0.082 | 0.042 | 0.057 | 0.060 | 0.052 | 0.276 | 0.149 | 0.181 | |
## -------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 86.82547 d.f. = 9 p = 7.04213e-15
##
##
##