library(readr)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the HR dataset directly from GitHub; column types are guessed by readr
# (see the column specification message recorded below).
hr <- read_csv(
  file = "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Ensure 'left' and the other categorical variables are factors
# Convert the outcome and the categorical predictors to factors in one pass;
# across() rewrites each listed column in place under its existing name.
hr <- hr %>%
  mutate(
    across(
      c(left, salary, promotion_last_5years, Work_accident),
      as.factor
    )
  )
# Perform chi-square tests
# 1. Test between 'left' and 'promotion_last_5years'
# Cross-tabulate the two factors and run Pearson's chi-squared test of
# independence on the resulting 2x2 table (the recorded output shows that
# Yates' continuity correction was applied).
# Note: the printed "data:" line echoes the expression passed to chisq.test(),
# so storing the table in a variable first would change the printed output.
chi1 <- chisq.test(table(hr$left, hr$promotion_last_5years))
# Print a label for the result, then the test object itself.
print("Chi-Square Test between 'left' and 'promotion_last_5years':")
## [1] "Chi-Square Test between 'left' and 'promotion_last_5years':"
print(chi1)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(hr$left, hr$promotion_last_5years)
## X-squared = 56.262, df = 1, p-value = 6.344e-14
##Technical Interpretation: Reject the null hypothesis. There is a statistically significant association between whether an employee left the company and whether they were promoted in the last 5 years.
##Non-Technical Interpretation: Employees who were not promoted in the last 5 years are much more likely to leave the company. This suggests that promotions play an important role in retaining employees.
# Within each promotion group, compute the share of employees who stayed
# (left == 0) and the share who left (left == 1).
prop_promotion <- hr %>%
  group_by(promotion_last_5years) %>%
  summarise(
    stayed = sum(left == 0) / n(),
    # Assigned last: this replaces `left` in the summary, so it has to come
    # after every expression that reads the original column.
    left = sum(left == 1) / n()
  )

# Stacked bar chart of the two shares; both traces inherit the x mapping
# declared once in plot_ly().
plot_ly(prop_promotion, x = ~promotion_last_5years) %>%
  add_bars(y = ~stayed, name = "Stayed", marker = list(color = "purple")) %>%
  add_bars(y = ~left, name = "Left", marker = list(color = "yellow")) %>%
  layout(
    title = "Employees not promoted in the last 5 years are more likely to leave",
    barmode = "stack",
    xaxis = list(title = "Promotion in Last 5 Years"),
    yaxis = list(title = "Proportion", tickformat = ",.0%")
  )
# 2. Test between 'left' and 'salary'
# Cross-tabulate turnover against the salary factor and run Pearson's
# chi-squared test of independence on the table (df = 2 in the recorded
# output; no continuity correction is reported for this test).
# Note: the printed "data:" line echoes the expression passed to chisq.test(),
# so storing the table in a variable first would change the printed output.
chi2 <- chisq.test(table(hr$left, hr$salary))
# Print a label for the result, then the test object itself.
print("Chi-Square Test between 'left' and 'salary':")
## [1] "Chi-Square Test between 'left' and 'salary':"
print(chi2)
##
## Pearson's Chi-squared test
##
## data: table(hr$left, hr$salary)
## X-squared = 381.23, df = 2, p-value < 2.2e-16
##Technical Interpretation: Reject the null hypothesis. There is a statistically significant association between employee turnover and salary level.
##Non-Technical Interpretation: Salary level significantly affects whether employees leave or stay. Lower salaries are likely associated with higher turnover, indicating that pay is a key factor in employee retention.
# Share of employees who stayed (left == 0) and who left (left == 1) within
# each salary level.
prop_salary <- hr %>%
  group_by(salary) %>%
  summarise(
    stayed = sum(left == 0) / n(),
    # Assigned last: this replaces `left` in the summary, so it has to come
    # after every expression that reads the original column.
    left = sum(left == 1) / n()
  )

# Stacked bar chart of the two shares per salary level; both traces inherit
# the x mapping declared once in plot_ly().
plot_ly(prop_salary, x = ~salary) %>%
  add_bars(y = ~stayed, name = "Stayed", marker = list(color = "purple")) %>%
  add_bars(y = ~left, name = "Left", marker = list(color = "yellow")) %>%
  layout(
    title = "Employees with lower salaries are more likely to leave",
    barmode = "stack",
    xaxis = list(title = "Salary Level"),
    yaxis = list(title = "Proportion", tickformat = ",.0%")
  )
# 3. Create a new categorical variable: 'average_monthly_hours_cat'
# Bin average monthly hours into Low (0, 150], Medium (150, 200] and
# High (200, Inf]. The top bin is open-ended on purpose: with a finite upper
# break, cut() returns NA for every value above it, which silently removes
# those employees from the contingency table and adds an NA group to the
# plot data (the "Ignoring 1 observations" warnings recorded with the earlier
# cap of 250 point to exactly that).
hr <- hr %>%
  mutate(
    average_monthly_hours_cat = cut(
      average_montly_hours,
      breaks = c(0, 150, 200, Inf),
      labels = c("Low", "Medium", "High")
    )
  )
# Note: the printed "data:" line echoes the expression passed to chisq.test().
chi3 <- chisq.test(table(hr$left, hr$average_monthly_hours_cat))
print("Chi-Square Test between 'left' and 'average_monthly_hours_cat':")
## [1] "Chi-Square Test between 'left' and 'average_monthly_hours_cat':"
print(chi3)
## NOTE: the output below was recorded while the top break was 250, i.e. with
## employees above 250 hours excluded. Re-run to refresh the statistic.
##
## Pearson's Chi-squared test
##
## data: table(hr$left, hr$average_monthly_hours_cat)
## X-squared = 663.79, df = 2, p-value < 2.2e-16
##Technical Interpretation: Reject the null hypothesis. There is a statistically significant association between categorized average monthly hours and employee turnover.
##Non-Technical Interpretation: Workload intensity (average monthly hours) plays a major role in whether employees leave. Employees working too few or too many hours are more likely to leave, suggesting that achieving a balanced workload may help improve retention.
# Calculate proportions
# Share of employees who stayed (left == 0) and who left (left == 1) within
# each hours category.
prop_hours <- hr %>%
  group_by(average_monthly_hours_cat) %>%
  summarise(
    stayed = sum(left == 0) / n(),
    # Assigned last: this replaces `left` in the summary, so it has to come
    # after every expression that reads the original column.
    left = sum(left == 1) / n()
  )
# Create stacked bar chart
plot_ly(prop_hours) %>%
  add_bars(x = ~average_monthly_hours_cat, y = ~stayed, name = "Stayed",
           marker = list(color = "purple")) %>%
  add_bars(x = ~average_monthly_hours_cat, y = ~left, name = "Left",
           marker = list(color = "yellow")) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Average Monthly Hours (Categorized)"),
    yaxis = list(title = "Proportion", tickformat = ",.0%"),
    title = "Employees working too few or too many hours are more likely to leave"
  )
## NOTE: two "Warning: Ignoring 1 observations" messages were recorded here
## while the top break was 250; they came from the NA hours group and are not
## expected now that every positive hours value falls into a bin.
# 4. Test between 'left' and 'Work_accident'
# Cross-tabulate the two factors and run Pearson's chi-squared test of
# independence on the resulting 2x2 table (the recorded output shows that
# Yates' continuity correction was applied).
# Note: the printed "data:" line echoes the expression passed to chisq.test(),
# so storing the table in a variable first would change the printed output.
chi4 <- chisq.test(table(hr$left, hr$Work_accident))
# Print a label for the result, then the test object itself.
print("Chi-Square Test between 'left' and 'Work_accident':")
## [1] "Chi-Square Test between 'left' and 'Work_accident':"
print(chi4)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(hr$left, hr$Work_accident)
## X-squared = 357.56, df = 1, p-value < 2.2e-16
##Technical Interpretation: Reject the null hypothesis. There is a statistically significant association between experiencing a workplace accident and employee turnover.
##Non-Technical Interpretation: Whether an employee has had a workplace accident is strongly linked to whether they leave, but the chi-square test only shows that a link exists, not its direction. The direction has to be read from the proportions computed below. NOTE(review): in this dataset the leave rate appears to be lower, not higher, among employees with a recorded accident; confirm against prop_accident when re-running.
# Calculate proportions
# Share of employees who stayed (left == 0) and who left (left == 1) within
# each Work_accident group.
prop_accident <- hr %>%
  group_by(Work_accident) %>%
  summarise(
    stayed = sum(left == 0) / n(),
    # Assigned last: this replaces `left` in the summary, so it has to come
    # after every expression that reads the original column.
    left = sum(left == 1) / n()
  )
# Create stacked bar chart
# The title states the direction of the effect, which the chi-square test
# cannot provide; it must agree with the `left` shares in prop_accident.
plot_ly(prop_accident, x = ~Work_accident) %>%
  add_bars(y = ~stayed, name = "Stayed", marker = list(color = "purple")) %>%
  add_bars(y = ~left, name = "Left", marker = list(color = "yellow")) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Work Accident"),
    yaxis = list(title = "Proportion", tickformat = ",.0%"),
    title = "Employees who experience a work accident are less likely to leave"
  )