library(readr)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the HR dataset straight from its raw GitHub URL into `hr`.
hr <- read_csv(
  file = 'https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv'
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Turn ‘left’ into a factor
# Recode the 0/1 `left` indicator as a labelled factor (0 = "Stayed",
# 1 = "Left"), then inspect its structure and level order.
hr[["left"]] <- factor(
  hr[["left"]],
  levels = c(0, 1),
  labels = c("Stayed", "Left")
)
str(hr[["left"]])
## Factor w/ 2 levels "Stayed","Left": 2 2 2 2 2 2 2 2 2 2 ...
levels(hr[["left"]])
## [1] "Stayed" "Left"
1. Left vs Work Accident
# Recode the 0/1 `Work_accident` indicator as a labelled factor
# (0 = "No Accident", 1 = "Accident") and inspect it.
hr[["Work_accident"]] <- factor(
  hr[["Work_accident"]],
  levels = c(0, 1),
  labels = c("No Accident", "Accident")
)
str(hr[["Work_accident"]])
## Factor w/ 2 levels "No Accident",..: 1 1 1 1 1 1 1 1 1 1 ...
levels(hr[["Work_accident"]])
## [1] "No Accident" "Accident"
# Contingency table: rows are `left`, columns are `Work_accident`.
table_left_accident <- table(hr[["left"]], hr[["Work_accident"]])
table_left_accident
##
## No Accident Accident
## Stayed 9428 2000
## Left 3402 169
# Chi-squared test on the 2x2 table; the stored result is printed below.
chisq_test_result1 <- chisq.test(table_left_accident)
print(chisq_test_result1)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table_left_accident
## X-squared = 357.56, df = 1, p-value < 2.2e-16
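The p-value is very small, so a difference this large would be very unlikely if work accidents and leaving were independent.
There is a statistically significant association between having a work accident and leaving the company: employees who had an accident left less often (about 8% versus 27%).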
# Proportion of stayed/left within each accident group.
# position = "fill" stacks the bars and scales every bar to height 1.
ggplot(data = hr, mapping = aes(x = Work_accident, fill = left)) +
  geom_bar(position = "fill") +
  theme_minimal() +
  labs(
    x = "Accidents",
    y = "Proportion",
    title = "Employee Leave Based on Accidents"
  )
2. Left vs Department
# Contingency table: rows are `left`, columns are `Department`.
table_left_department <- table(hr[["left"]], hr[["Department"]])
table_left_department
##
## accounting hr IT management marketing product_mng RandD sales
## Stayed 563 524 954 539 655 704 666 3126
## Left 204 215 273 91 203 198 121 1014
##
## support technical
## Stayed 1674 2023
## Left 555 697
# Chi-squared test on the 2x10 table; the stored result is printed below.
chisq_test_result2 <- chisq.test(table_left_department)
print(chisq_test_result2)
##
## Pearson's Chi-squared test
##
## data: table_left_department
## X-squared = 86.825, df = 9, p-value = 7.042e-15
The p-value is very small, so differences this large would be very unlikely if department and leaving were independent.
There is a statistically significant association between department and leaving the company.
# Proportion of stayed/left within each department.
# position = "fill" stacks the bars and scales every bar to height 1.
ggplot(data = hr, mapping = aes(x = Department, fill = left)) +
  geom_bar(position = "fill") +
  theme_minimal() +
  labs(
    x = "Department",
    y = "Proportion",
    title = "Employee Leave Based on Department"
  )
3. Left vs Salary
# Contingency table: rows are `left`, columns are `salary`
# (columns print in alphabetical order: high, low, medium).
table_left_salary <- table(hr[["left"]], hr[["salary"]])
table_left_salary
##
## high low medium
## Stayed 1155 5144 5129
## Left 82 2172 1317
# Chi-squared test on the 2x3 table; the stored result is auto-printed.
chisq_test_result3 <- chisq.test(table_left_salary)
chisq_test_result3
##
## Pearson's Chi-squared test
##
## data: table_left_salary
## X-squared = 381.23, df = 2, p-value < 2.2e-16
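The p-value is very small, so differences this large would be very unlikely if salary and leaving were independent.
There is a statistically significant association between salary and leaving the company: about 30% of low-salary employees left, versus 20% for medium and 7% for high.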
# Proportion of stayed/left within each salary band.
# position = "fill" stacks the bars and scales every bar to height 1.
ggplot(data = hr, mapping = aes(x = salary, fill = left)) +
  geom_bar(position = "fill") +
  theme_minimal() +
  labs(
    x = "Salary",
    y = "Proportion",
    title = "Employee Leave Based on Salary"
  )
4. Left vs Number of Projects
# Contingency table: rows are `left`, columns are `number_project`.
table_left_numproject <- table(hr[["left"]], hr[["number_project"]])
table_left_numproject
##
## 2 3 4 5 6 7
## Stayed 821 3983 3956 2149 519 0
## Left 1567 72 409 612 655 256
# Chi-squared test on the 2x6 table; the stored result is auto-printed.
chisq_test_result4 <- chisq.test(table_left_numproject)
chisq_test_result4
##
## Pearson's Chi-squared test
##
## data: table_left_numproject
## X-squared = 5373.6, df = 5, p-value < 2.2e-16
The p-value is very small, so differences this large would be very unlikely if number of projects and leaving were independent.
There is a statistically significant association between number of projects and leaving the company.
# Proportion of stayed/left for each project count.
# position = "fill" stacks the bars and scales every bar to height 1.
ggplot(data = hr, mapping = aes(x = number_project, fill = left)) +
  geom_bar(position = "fill") +
  theme_minimal() +
  labs(
    x = "Number of Projects",
    y = "Proportion",
    title = "Employee Leave Based on Number of Projects"
  )