library(readr)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Read the HR data set (HR_comma_sep.csv) directly from its GitHub URL.
hr <- read_csv(
  file = "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Ensure ‘left’ and categorical variables are factors

# Convert the outcome ('left') and the categorical predictors to factors so
# that table()/chisq.test() and the charts treat them as discrete groups.
hr <- hr %>%
  mutate(
    left = as.factor(left),
    # salary is ordinal: fix the level order explicitly so tables and chart
    # axes read low -> medium -> high instead of as.factor()'s alphabetical
    # order (high, low, medium). The chi-square statistic is unaffected by
    # column order.
    # NOTE(review): assumes the column holds exactly "low", "medium" and
    # "high"; any other value would become NA -- confirm against the data.
    salary = factor(salary, levels = c("low", "medium", "high")),
    promotion_last_5years = as.factor(promotion_last_5years),
    Work_accident = as.factor(Work_accident)
  )

Perform chi-square tests

# 1. Test between 'left' and 'promotion_last_5years'
# Cross-tabulate the two factors and run a chi-squared test of independence.
# Both factors are binary here, so the table is 2x2 and chisq.test() applies
# Yates' continuity correction by default.
# The table() call is kept inline on purpose: chisq.test() records the
# deparsed argument and prints it as the "data:" label of the result.
chi1 <- chisq.test(table(hr$left, hr$promotion_last_5years))
print("Chi-Square Test between 'left' and 'promotion_last_5years':")
## [1] "Chi-Square Test between 'left' and 'promotion_last_5years':"
print(chi1)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(hr$left, hr$promotion_last_5years)
## X-squared = 56.262, df = 1, p-value = 6.344e-14
##Technical Interpretation: Reject the null hypothesis. There is a statistically significant association between whether an employee left the company and whether they were promoted in the last 5 years.
##Non-Technical Interpretation: Employees who were not promoted in the last 5 years are much more likely to leave the company. This suggests that promotions play an important role in retaining employees.


# Share of employees who stayed vs. left within each promotion group.
# 'stayed' must be computed before 'left': the second expression overwrites
# the 'left' column inside summarise().
prop_promotion <- summarise(
  group_by(hr, promotion_last_5years),
  stayed = sum(left == 0) / n(),
  left = sum(left == 1) / n()
)


# Stacked bar chart of the stayed/left proportions by promotion status.
# Both traces use the same x variable, so it is declared once in plot_ly()
# and inherited by each add_bars() call.
plot_ly(prop_promotion, x = ~promotion_last_5years) %>%
  add_bars(y = ~stayed, name = "Stayed", marker = list(color = "purple")) %>%
  add_bars(y = ~left, name = "Left", marker = list(color = "yellow")) %>%
  layout(
    title = "Employees not promoted in the last 5 years are more likely to leave",
    xaxis = list(title = "Promotion in Last 5 Years"),
    # Show the proportions as whole-number percentages.
    yaxis = list(title = "Proportion", tickformat = ",.0%"),
    barmode = "stack"
  )
# 2. Test between 'left' and 'salary'
# Chi-squared test of independence on the left-by-salary contingency table.
# The table() call is kept inline on purpose: chisq.test() records the
# deparsed argument and prints it as the "data:" label of the result.
chi2 <- chisq.test(table(hr$left, hr$salary))
print("Chi-Square Test between 'left' and 'salary':")
## [1] "Chi-Square Test between 'left' and 'salary':"
print(chi2)
## 
##  Pearson's Chi-squared test
## 
## data:  table(hr$left, hr$salary)
## X-squared = 381.23, df = 2, p-value < 2.2e-16
##Technical Interpretation: Reject the null hypothesis. There is a statistically significant association between employee turnover and salary level.
##Non-Technical Interpretation: Salary level significantly affects whether employees leave or stay. Lower salaries are likely associated with higher turnover, indicating that pay is a key factor in employee retention.

# Share of employees who stayed vs. left within each salary level.
# 'stayed' must be computed before 'left': the second expression overwrites
# the 'left' column inside summarise().
prop_salary <- summarise(
  group_by(hr, salary),
  stayed = sum(left == 0) / n(),
  left = sum(left == 1) / n()
)

# Stacked bar chart of the stayed/left proportions by salary level.
# The shared x variable is declared once in plot_ly() and inherited by
# both bar traces.
plot_ly(prop_salary, x = ~salary) %>%
  add_bars(y = ~stayed, name = "Stayed", marker = list(color = "purple")) %>%
  add_bars(y = ~left, name = "Left", marker = list(color = "yellow")) %>%
  layout(
    title = "Employees with lower salaries are more likely to leave",
    xaxis = list(title = "Salary Level"),
    # Show the proportions as whole-number percentages.
    yaxis = list(title = "Proportion", tickformat = ",.0%"),
    barmode = "stack"
  )
# 3. Create a new categorical variable: 'average_monthly_hours_cat'
# Bin average monthly hours into three bands:
#   Low    : hours <= 150
#   Medium : 150 < hours <= 200
#   High   : hours > 200
# The outer breaks are open-ended (-Inf / Inf) so every employee lands in a
# band. With the previous breaks c(0, 150, 200, 250), any value above 250
# fell outside the last interval and became NA; those rows were then dropped
# from the chi-square test and from the chart (the source of its
# "Ignoring 1 observations" warnings).
# NOTE: the chi-square output recorded after this block was produced with the
# old breaks and must be regenerated by re-running the document.
# ('average_montly_hours' is the column's actual spelling in the data set.)
hr <- hr %>%
  mutate(average_monthly_hours_cat = cut(average_montly_hours,
                                         breaks = c(-Inf, 150, 200, Inf),
                                         labels = c("Low", "Medium", "High")))

# Chi-squared test of independence between 'left' and the binned hours.
# table() omits NA values by default, so any row whose hours fall outside the
# cut() breaks is silently excluded from this test.
# The table() call is kept inline on purpose: chisq.test() records the
# deparsed argument and prints it as the "data:" label of the result.
chi3 <- chisq.test(table(hr$left, hr$average_monthly_hours_cat))
print("Chi-Square Test between 'left' and 'average_monthly_hours_cat':")
## [1] "Chi-Square Test between 'left' and 'average_monthly_hours_cat':"
print(chi3)
## 
##  Pearson's Chi-squared test
## 
## data:  table(hr$left, hr$average_monthly_hours_cat)
## X-squared = 663.79, df = 2, p-value < 2.2e-16
##Technical Interpretation: Reject the null hypothesis. There is a statistically significant association between categorized average monthly hours and employee turnover.
##Non-Technical Interpretation: Workload intensity (average monthly hours) plays a major role in whether employees leave. Employees working too few or too many hours are more likely to leave, suggesting that achieving a balanced workload may help improve retention.

# Share of employees who stayed vs. left within each monthly-hours band.
# 'stayed' must be computed before 'left': the second expression overwrites
# the 'left' column inside summarise().
prop_hours <- summarise(
  group_by(hr, average_monthly_hours_cat),
  stayed = sum(left == 0) / n(),
  left = sum(left == 1) / n()
)

# Stacked bar chart of the stayed/left proportions by monthly-hours band.
# The shared x variable is declared once in plot_ly() and inherited by
# both bar traces.
plot_ly(prop_hours, x = ~average_monthly_hours_cat) %>%
  add_bars(y = ~stayed, name = "Stayed", marker = list(color = "purple")) %>%
  add_bars(y = ~left, name = "Left", marker = list(color = "yellow")) %>%
  layout(
    title = "Employees working too few or too many hours are more likely to leave",
    xaxis = list(title = "Average Monthly Hours (Categorized)"),
    # Show the proportions as whole-number percentages.
    yaxis = list(title = "Proportion", tickformat = ",.0%"),
    barmode = "stack"
  )
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
# 4. Test between 'left' and 'Work_accident'
# Chi-squared test of independence on the left-by-accident contingency table.
# Both factors are binary here, so the table is 2x2 and chisq.test() applies
# Yates' continuity correction by default.
# The test only establishes that an association exists; its direction has to
# be read from the group proportions, not from the test statistic.
# The table() call is kept inline on purpose: chisq.test() records the
# deparsed argument and prints it as the "data:" label of the result.
chi4 <- chisq.test(table(hr$left, hr$Work_accident))
print("Chi-Square Test between 'left' and 'Work_accident':")
## [1] "Chi-Square Test between 'left' and 'Work_accident':"
print(chi4)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(hr$left, hr$Work_accident)
## X-squared = 357.56, df = 1, p-value < 2.2e-16
##Technical Interpretation: Reject the null hypothesis. There is a statistically significant association between experiencing a workplace accident and employee turnover.
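##Non-Technical Interpretation: Whether an employee has had a workplace accident is strongly related to whether they left the company. The test itself only shows that a relationship exists, not its direction: the proportions computed below indicate that employees who experienced an accident were less likely to leave than those who did not, so accidents do not appear to be driving turnover. This is an association, not proof of cause; other factors may explain why these employees stay.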

# Share of employees who stayed vs. left, split by work-accident status.
# 'stayed' must be computed before 'left': the second expression overwrites
# the 'left' column inside summarise().
prop_accident <- summarise(
  group_by(hr, Work_accident),
  stayed = sum(left == 0) / n(),
  left = sum(left == 1) / n()
)

# Create stacked bar chart of the stayed/left proportions by work-accident
# status.
# The headline previously claimed that accidents make employees MORE likely
# to leave. The chi-square test says nothing about direction, and in this
# data set the 'left' proportion is lower for employees with an accident,
# so the title now states the direction the bars actually show.
# NOTE(review): confirm against prop_accident when re-running the document.
plot_ly(prop_accident) %>%
  add_bars(x = ~Work_accident, y = ~stayed, name = "Stayed",
           marker = list(color = "purple")) %>%
  add_bars(x = ~Work_accident, y = ~left, name = "Left",
           marker = list(color = "yellow")) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Work Accident"),
    # Show the proportions as whole-number percentages.
    yaxis = list(title = "Proportion", tickformat = ",.0%"),
    title = "Employees who experience a work accident are less likely to leave"
  )