library(readr)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the HR dataset CSV from GitHub into `hr`.
hr <- read_csv(
  file = "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Test 1
# Chi-squared test of independence: promotion in the last 5 years vs. leaving.
chisq.test(x = hr$promotion_last_5years, y = hr$left)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hr$promotion_last_5years and hr$left
## X-squared = 56.262, df = 1, p-value = 6.344e-14
p-value interpretation: The p-value (6.3e-14) is far below 0.05, so a
difference this large would be very unlikely if promotion and leaving
were unrelated
chi-square interpretation: There is a dependency between being promoted
in the last 5 years and leaving the company
Test 2
# Chi-squared test of independence: work accident vs. leaving.
chisq.test(x = hr$Work_accident, y = hr$left)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hr$Work_accident and hr$left
## X-squared = 357.56, df = 1, p-value < 2.2e-16
p-value interpretation: The p-value (< 2.2e-16) is very small, so the
result is statistically significant: there is an association between
work accidents and leaving
chi-square interpretation: There is a dependency between having a work
accident and whether an employee left the company
non-technical interpretation: Employees that had a work accident are
more likely to stay
# Proportion of employees who stayed vs. left, one row per Work_accident level.
prop_data <- hr %>%
  mutate(Work_accident = as.factor(Work_accident)) %>%
  group_by(Work_accident) %>%
  summarise(
    # `stayed` must come first: summarise() evaluates its expressions in
    # order, and the next line overwrites the `left` column it reads.
    stayed = mean(left == 0),
    left = mean(left == 1)
  )

# Stacked bar chart of the stayed/left proportions for each accident group.
plot_ly(prop_data) %>%
  add_bars(
    x = ~Work_accident, y = ~stayed, name = "stayed",
    marker = list(color = "#ebd409")
  ) %>%
  add_bars(
    x = ~Work_accident, y = ~left, name = "Left",
    marker = list(color = "#039605")
  ) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Work Accident"),
    yaxis = list(title = "Proportion", tickformat = ",.0%"),
    # The group WITHOUT an accident is the one that leaves more often, so the
    # headline must say "leave" (employees with an accident tend to stay).
    title = paste(
      "Employees that did not have a work accident are",
      "3 times more likely to leave"
    )
  )
Test 3
# Chi-squared test of independence: salary band vs. leaving.
chisq.test(x = hr$salary, y = hr$left)
##
## Pearson's Chi-squared test
##
## data: hr$salary and hr$left
## X-squared = 381.23, df = 2, p-value < 2.2e-16
p-value interpretation: The p-value (< 2.2e-16) is very small, so a
difference this large would be very unlikely if salary and leaving were
unrelated
chi-square interpretation: There is a dependency between salary level
and whether employees left
non-technical interpretation: Employees that have a low salary are 4
times more likely to leave than employees that have a high salary
# Proportion of employees who stayed vs. left, one row per salary level.
prop_data <- hr %>%
  mutate(salary = as.factor(salary)) %>%
  group_by(salary) %>%
  summarise(
    # `stayed` must come first: summarise() evaluates its expressions in
    # order, and the next line overwrites the `left` column it reads.
    stayed = mean(left == 0),
    left = mean(left == 1)
  )

# Stacked bar chart of the stayed/left proportions for each salary level.
plot_ly(prop_data) %>%
  add_bars(
    x = ~salary, y = ~stayed, name = "stayed",
    marker = list(color = "#940396")
  ) %>%
  add_bars(
    x = ~salary, y = ~left, name = "Left",
    marker = list(color = "#00d6f7")
  ) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Salary"),
    yaxis = list(title = "Proportion", tickformat = ",.0%"),
    # Built with paste() so the title contains a plain space rather than a
    # raw newline (plotly expects "<br>" for an actual line break).
    title = paste(
      "Employees that have a low salary are 4 times more",
      "likely to leave than high salary"
    )
  )
Test 4
# Chi-squared test of independence: department vs. leaving.
chisq.test(x = hr$Department, y = hr$left)
##
## Pearson's Chi-squared test
##
## data: hr$Department and hr$left
## X-squared = 86.825, df = 9, p-value = 7.042e-15
p-value interpretation: The p-value (7.0e-15) is far below 0.05, so
differences this large would be very unlikely if department and leaving
were unrelated
chi-square interpretation: There is a dependency between department and
leaving
non-technical interpretation: Employees that are within the
management and RandD departments are more likely to stay with the
company over time, while those in the HR department are most likely to
leave.
# Per-department share of employees who stayed and who left.
prop_data <- hr %>%
  group_by(Department = as.factor(Department)) %>%
  summarise(
    # Order matters: `left` is overwritten by the second expression.
    stayed = sum(left == 0) / n(),
    left = sum(left == 1) / n()
  )

# Stacked bars: one bar per department, split into stayed / left proportions.
plot_ly(data = prop_data) %>%
  add_bars(
    x = ~Department,
    y = ~stayed,
    name = "stayed",
    marker = list(color = "#b50300")
  ) %>%
  add_bars(
    x = ~Department,
    y = ~left,
    name = "Left",
    marker = list(color = "#fa2fd8")
  ) %>%
  layout(
    title = "Employees in Management and RandD are more likely
to stay, while those in the HR are most likely to leave",
    barmode = "stack",
    xaxis = list(title = "Department"),
    yaxis = list(title = "Proportion", tickformat = ",.0%")
  )