library(readr)
# Load the HR data set (one row per employee) directly from GitHub.
hr <- read_csv(
  file = "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Print p-values in fixed notation instead of scientific notation.
options(scipen = 999)

# Chi-squared test of independence: work accident vs. leaving the company.
chisq.test(x = hr$Work_accident, y = hr$left)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hr$Work_accident and hr$left
## X-squared = 357.56, df = 1, p-value < 0.00000000000000022
The p-value is extremely small, so an association this strong would be very unlikely to appear by chance if work accidents and leaving were independent.
There is a dependence between work accidents and employment status.
Not as many employees with accidents leave as you’d think.
# For each value of `left`, the share of employees who did / did not have a
# work accident. The two shares in a row add up to 1.
q1 <- group_by(hr, left) %>%
  summarise(
    had_accident = sum(Work_accident == 1) / n(),
    no_accident = sum(Work_accident == 0) / n()
  )
# Stacked bar chart: one bar per employment status, split into the share of
# employees with a work accident (purple) and without one (grey).
q1 %>%
  plot_ly() %>%
  add_bars(
    x = ~left,
    y = ~had_accident,
    name = "Had Work Accident",
    marker = list(color = "purple")
  ) %>%
  add_bars(
    x = ~left,
    y = ~no_accident,
    name = "No Work Accident",
    marker = list(color = "grey")
  ) %>%
  layout(
    title = "Not as many employees with accidents leave as you'd think",
    barmode = "stack",
    # Replace the raw 0/1 codes on the x axis with readable labels.
    xaxis = list(
      title = "Left the Company (1 = Yes, 0 = No)",
      tickvals = c(0, 1),
      ticktext = c("Stayed", "Left")
    ),
    # Show proportions as whole percentages.
    yaxis = list(
      title = "Proportion",
      tickformat = ",.0%"
    )
  )
# Chi-squared test of independence: promotion in the last 5 years vs. leaving.
chisq.test(x = hr$promotion_last_5years, y = hr$left)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hr$promotion_last_5years and hr$left
## X-squared = 56.262, df = 1, p-value = 0.00000000000006344
The p-value is extremely small, so an association this strong would be very unlikely to appear by chance if promotions and leaving were independent.
There is a dependence between being promoted in the last 5 years and employment status.
Employees who’ve been promoted are less likely to leave.
# For each value of `left`, the share of employees who were / were not
# promoted in the last 5 years. The two shares in a row add up to 1.
q2 <- group_by(hr, left) %>%
  summarise(
    had_promotion = sum(promotion_last_5years == 1) / n(),
    no_promotion = sum(promotion_last_5years == 0) / n()
  )
# Stacked bar chart: one bar per employment status, split into the share of
# employees with a promotion (blue) and without one (red).
q2 %>%
  plot_ly() %>%
  add_bars(
    x = ~left,
    y = ~had_promotion,
    name = "Had Promotion",
    marker = list(color = "blue")
  ) %>%
  add_bars(
    x = ~left,
    y = ~no_promotion,
    name = "No Promotion",
    marker = list(color = "red")
  ) %>%
  layout(
    title = "Employees who've been promoted are less likely to leave.",
    barmode = "stack",
    # Replace the raw 0/1 codes on the x axis with readable labels.
    xaxis = list(
      title = "Left the Company (1 = Yes, 0 = No)",
      tickvals = c(0, 1),
      ticktext = c("Stayed", "Left")
    ),
    # Show proportions as whole percentages.
    yaxis = list(
      title = "Proportion",
      tickformat = ",.0%"
    )
  )
# Chi-squared test of independence: salary level vs. leaving the company.
chisq.test(x = hr$salary, y = hr$left)
##
## Pearson's Chi-squared test
##
## data: hr$salary and hr$left
## X-squared = 381.23, df = 2, p-value < 0.00000000000000022
The p-value is extremely small, so an association this strong would be very unlikely to appear by chance if salary level and leaving were independent.
There is a dependence between salary level and employment status.
Employees who made more money were more inclined to stay.
# For each value of `left`, the share of employees in each salary band.
# The three shares in a row add up to 1 when every salary is one of
# "low", "medium" or "high".
q3 <- group_by(hr, left) %>%
  summarise(
    low_salary = sum(salary == "low") / n(),
    medium_salary = sum(salary == "medium") / n(),
    high_salary = sum(salary == "high") / n()
  )
# Stacked bar chart: one bar per employment status, split by salary band
# (low = green, medium = blue, high = yellow).
q3 %>%
  plot_ly() %>%
  add_bars(
    x = ~left,
    y = ~low_salary,
    name = "Low Salary",
    marker = list(color = "green")
  ) %>%
  add_bars(
    x = ~left,
    y = ~medium_salary,
    name = "Medium Salary",
    marker = list(color = "blue")
  ) %>%
  add_bars(
    x = ~left,
    y = ~high_salary,
    name = "High Salary",
    marker = list(color = "yellow")
  ) %>%
  layout(
    title = "Employees who made more money were more inclined to stay.",
    barmode = "stack",
    # Replace the raw 0/1 codes on the x axis with readable labels.
    xaxis = list(
      title = "Left the Company (1 = Yes, 0 = No)",
      tickvals = c(0, 1),
      ticktext = c("Stayed", "Left")
    ),
    # Show proportions as whole percentages.
    yaxis = list(
      title = "Proportion",
      tickformat = ",.0%"
    )
  )
# Chi-squared test of independence: department vs. leaving the company.
chisq.test(x = hr$Department, y = hr$left)
##
## Pearson's Chi-squared test
##
## data: hr$Department and hr$left
## X-squared = 86.825, df = 9, p-value = 0.000000000000007042
# List the distinct department labels used to build the per-department shares.
unique(hr[["Department"]])
## [1] "sales" "accounting" "hr" "technical" "support"
## [6] "management" "IT" "product_mng" "marketing" "RandD"
The p-value is extremely small, which means the association between department and whether an employee left or stayed is statistically significant.
There is a dependence between the departments and employment status.
Although the association is statistically significant, the department mix differs only slightly between employees who left and those who stayed.
# For each value of `left`, the share of employees in each department.
# Note: the summary column named `hr` is the "hr" department share; it is a
# column of `q4` and does not affect the `hr` data frame.
q4 <- group_by(hr, left) %>%
  summarise(
    sales = sum(Department == "sales") / n(),
    accounting = sum(Department == "accounting") / n(),
    hr = sum(Department == "hr") / n(),
    technical = sum(Department == "technical") / n(),
    support = sum(Department == "support") / n(),
    management = sum(Department == "management") / n(),
    IT = sum(Department == "IT") / n(),
    product_mng = sum(Department == "product_mng") / n(),
    marketing = sum(Department == "marketing") / n(),
    RandD = sum(Department == "RandD") / n()
  )
# Stacked bar chart: one bar per employment status, split into the share of
# employees in each department (one coloured segment per department).
#
# The title used to read "No difference between department and leaving.",
# which contradicts the chi-squared test on Department vs. left
# (X-squared = 86.825, df = 9, p < 0.001). The association is significant but
# small (Cramer's V = sqrt(86.825 / 14999), roughly 0.08), so the title now
# says the mix differs only slightly instead of claiming no difference.
q4 %>%
  plot_ly() %>%
  add_bars(
    x = ~left, y = ~sales, name = "Sales",
    marker = list(color = "blue")
  ) %>%
  add_bars(
    x = ~left, y = ~accounting, name = "Accounting",
    marker = list(color = "orange")
  ) %>%
  add_bars(
    x = ~left, y = ~hr, name = "HR",
    marker = list(color = "green")
  ) %>%
  add_bars(
    x = ~left, y = ~technical, name = "Technical",
    marker = list(color = "red")
  ) %>%
  add_bars(
    x = ~left, y = ~support, name = "Support",
    marker = list(color = "magenta")
  ) %>%
  add_bars(
    x = ~left, y = ~management, name = "Management",
    marker = list(color = "brown")
  ) %>%
  add_bars(
    x = ~left, y = ~IT, name = "IT",
    marker = list(color = "pink")
  ) %>%
  add_bars(
    x = ~left, y = ~product_mng, name = "Product Management",
    marker = list(color = "gray")
  ) %>%
  add_bars(
    x = ~left, y = ~marketing, name = "Marketing",
    marker = list(color = "yellow")
  ) %>%
  add_bars(
    x = ~left, y = ~RandD, name = "R&D",
    marker = list(color = "cyan")
  ) %>%
  layout(
    title = "Department mix differs only slightly between leavers and stayers.",
    barmode = "stack",
    # Replace the raw 0/1 codes on the x axis with readable labels.
    xaxis = list(
      title = "Left the Company (1 = Yes, 0 = No)",
      tickvals = c(0, 1),
      ticktext = c("Stayed", "Left")
    ),
    # Show proportions as whole percentages.
    yaxis = list(
      title = "Proportion",
      tickformat = ",.0%"
    )
  )