library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gmodels)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Load the HR dataset (CSV hosted on GitHub) with readr.
hr <- read_csv(
  "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Cross-tabulate the promotion flag against `left` and request the
# chi-squared test. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned, which would silently change the call.
CrossTable(hr$promotion_last_5years, hr$left, chisq = TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hr$left
## hr$promotion_last_5years | 0 | 1 | Row Total |
## -------------------------|-----------|-----------|-----------|
## 0 | 11128 | 3552 | 14680 |
## | 0.290 | 0.928 | |
## | 0.758 | 0.242 | 0.979 |
## | 0.974 | 0.995 | |
## | 0.742 | 0.237 | |
## -------------------------|-----------|-----------|-----------|
## 1 | 300 | 19 | 319 |
## | 13.343 | 42.702 | |
## | 0.940 | 0.060 | 0.021 |
## | 0.026 | 0.005 | |
## | 0.020 | 0.001 | |
## -------------------------|-----------|-----------|-----------|
## Column Total | 11428 | 3571 | 14999 |
## | 0.762 | 0.238 | |
## -------------------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 57.26273 d.f. = 1 p = 3.813123e-14
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 56.26163 d.f. = 1 p = 6.344155e-14
##
##
# Visual: percentage of employees who left, by promotion status.
# Recode the 0/1 promotion flag as a labelled factor for the x axis.
df <- hr %>%
  mutate(
    promotion_last_5years = factor(
      promotion_last_5years,
      levels = c(0, 1),
      labels = c("Not Promoted", "Promoted")
    )
  )

# Percentage of rows with left == 1 within each promotion group.
summary_df <- df %>%
  group_by(promotion_last_5years) %>%
  summarise(pct_left = 100 * mean(left == 1))

# Bar chart of the per-group percentages.
plot_ly(
  summary_df,
  x = ~promotion_last_5years,
  y = ~pct_left,
  type = "bar"
) %>%
  layout(
    title = "Employees Without a Recent Promotion are More Likely to Leave",
    xaxis = list(title = "Promotion/No Promotion"),
    yaxis = list(title = "Percentage Left (%)")
  )
# Cross-tabulate department against `left` and request the chi-squared test.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the call.
CrossTable(hr$Department, hr$left, chisq = TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hr$left
## hr$Department | 0 | 1 | Row Total |
## --------------|-----------|-----------|-----------|
## accounting | 563 | 204 | 767 |
## | 0.783 | 2.506 | |
## | 0.734 | 0.266 | 0.051 |
## | 0.049 | 0.057 | |
## | 0.038 | 0.014 | |
## --------------|-----------|-----------|-----------|
## hr | 524 | 215 | 739 |
## | 2.709 | 8.670 | |
## | 0.709 | 0.291 | 0.049 |
## | 0.046 | 0.060 | |
## | 0.035 | 0.014 | |
## --------------|-----------|-----------|-----------|
## IT | 954 | 273 | 1227 |
## | 0.391 | 1.252 | |
## | 0.778 | 0.222 | 0.082 |
## | 0.083 | 0.076 | |
## | 0.064 | 0.018 | |
## --------------|-----------|-----------|-----------|
## management | 539 | 91 | 630 |
## | 7.250 | 23.202 | |
## | 0.856 | 0.144 | 0.042 |
## | 0.047 | 0.025 | |
## | 0.036 | 0.006 | |
## --------------|-----------|-----------|-----------|
## marketing | 655 | 203 | 858 |
## | 0.002 | 0.008 | |
## | 0.763 | 0.237 | 0.057 |
## | 0.057 | 0.057 | |
## | 0.044 | 0.014 | |
## --------------|-----------|-----------|-----------|
## product_mng | 704 | 198 | 902 |
## | 0.408 | 1.307 | |
## | 0.780 | 0.220 | 0.060 |
## | 0.062 | 0.055 | |
## | 0.047 | 0.013 | |
## --------------|-----------|-----------|-----------|
## RandD | 666 | 121 | 787 |
## | 7.346 | 23.510 | |
## | 0.846 | 0.154 | 0.052 |
## | 0.058 | 0.034 | |
## | 0.044 | 0.008 | |
## --------------|-----------|-----------|-----------|
## sales | 3126 | 1014 | 4140 |
## | 0.255 | 0.815 | |
## | 0.755 | 0.245 | 0.276 |
## | 0.274 | 0.284 | |
## | 0.208 | 0.068 | |
## --------------|-----------|-----------|-----------|
## support | 1674 | 555 | 2229 |
## | 0.348 | 1.114 | |
## | 0.751 | 0.249 | 0.149 |
## | 0.146 | 0.155 | |
## | 0.112 | 0.037 | |
## --------------|-----------|-----------|-----------|
## technical | 2023 | 697 | 2720 |
## | 1.178 | 3.771 | |
## | 0.744 | 0.256 | 0.181 |
## | 0.177 | 0.195 | |
## | 0.135 | 0.046 | |
## --------------|-----------|-----------|-----------|
## Column Total | 11428 | 3571 | 14999 |
## | 0.762 | 0.238 | |
## --------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 86.82547 d.f. = 9 p = 7.04213e-15
##
##
##
# Visual: percentage of employees who left, by department.
# Convert Department to a factor with an explicit level order.
df2 <- hr %>%
  mutate(
    Department = factor(
      Department,
      levels = c(
        "accounting", "sales", "hr", "IT", "management",
        "marketing", "product_mng", "RandD", "support", "technical"
      )
    )
  )

# Percentage of rows with left == 1 within each department.
summary_df2 <- df2 %>%
  group_by(Department) %>%
  summarise(pct_left = 100 * mean(left == 1))

# Bar chart of the per-department percentages.
plot_ly(
  summary_df2,
  x = ~Department,
  y = ~pct_left,
  type = "bar"
) %>%
  layout(
    title = "HR Employees are Most Likely to Leave",
    xaxis = list(title = "Department"),
    yaxis = list(title = "Percentage Left (%)")
  )
# Cross-tabulate the work-accident flag against `left` and request the
# chi-squared test. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned, which would silently change the call.
CrossTable(hr$Work_accident, hr$left, chisq = TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hr$left
## hr$Work_accident | 0 | 1 | Row Total |
## -----------------|-----------|-----------|-----------|
## 0 | 9428 | 3402 | 12830 |
## | 12.346 | 39.510 | |
## | 0.735 | 0.265 | 0.855 |
## | 0.825 | 0.953 | |
## | 0.629 | 0.227 | |
## -----------------|-----------|-----------|-----------|
## 1 | 2000 | 169 | 2169 |
## | 73.029 | 233.709 | |
## | 0.922 | 0.078 | 0.145 |
## | 0.175 | 0.047 | |
## | 0.133 | 0.011 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 11428 | 3571 | 14999 |
## | 0.762 | 0.238 | |
## -----------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 358.5938 d.f. = 1 p = 5.698673e-80
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 357.5624 d.f. = 1 p = 9.55824e-80
##
##
# Visual: percentage of employees who left, by work-accident status.
# Recode the 0/1 accident flag as a "No"/"Yes" factor for the x axis.
df3 <- hr %>%
  mutate(
    Work_accident = factor(
      Work_accident,
      levels = c(0, 1),
      labels = c("No", "Yes")
    )
  )

# Percentage of rows with left == 1 within each accident group.
summary_df3 <- df3 %>%
  group_by(Work_accident) %>%
  summarise(pct_left = 100 * mean(left == 1))

# Bar chart of the per-group percentages.
plot_ly(
  summary_df3,
  x = ~Work_accident,
  y = ~pct_left,
  type = "bar"
) %>%
  layout(
    title = "Employees that have NOT had a Work Accident are More Likely to Leave",
    xaxis = list(title = "Work Accident"),
    yaxis = list(title = "Percentage Left (%)")
  )
# Cross-tabulate salary level against `left` and request the chi-squared
# test. `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the call.
CrossTable(hr$salary, hr$left, chisq = TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hr$left
## hr$salary | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## high | 1155 | 82 | 1237 |
## | 47.915 | 153.339 | |
## | 0.934 | 0.066 | 0.082 |
## | 0.101 | 0.023 | |
## | 0.077 | 0.005 | |
## -------------|-----------|-----------|-----------|
## low | 5144 | 2172 | 7316 |
## | 33.200 | 106.247 | |
## | 0.703 | 0.297 | 0.488 |
## | 0.450 | 0.608 | |
## | 0.343 | 0.145 | |
## -------------|-----------|-----------|-----------|
## medium | 5129 | 1317 | 6446 |
## | 9.648 | 30.876 | |
## | 0.796 | 0.204 | 0.430 |
## | 0.449 | 0.369 | |
## | 0.342 | 0.088 | |
## -------------|-----------|-----------|-----------|
## Column Total | 11428 | 3571 | 14999 |
## | 0.762 | 0.238 | |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 381.225 d.f. = 2 p = 1.652087e-83
##
##
##
# Visual: percentage of employees who left, by salary level.
# Order the salary factor low -> medium -> high instead of alphabetically.
df4 <- hr %>%
  mutate(
    salary = factor(salary, levels = c("low", "medium", "high"))
  )

# Percentage of rows with left == 1 within each salary level.
summary_df4 <- df4 %>%
  group_by(salary) %>%
  summarise(pct_left = 100 * mean(left == 1))

# Bar chart of the per-level percentages.
plot_ly(
  summary_df4,
  x = ~salary,
  y = ~pct_left,
  type = "bar"
) %>%
  layout(
    title = "Employees that Have a Lower Salary Level are More Likely to Leave",
    xaxis = list(title = "Salary Level"),
    yaxis = list(title = "Percentage Left (%)")
  )