Finding the leading factors that contribute to employee attrition
Using R to analyse the data set
# Preview the first six rows to inspect the columns and how values are encoded.
head(df, n = 6L)
It looks like the dataset has categorical values that are encoded as numbers. Here are the labels: Education 1 ‘Below College’ 2 ‘College’ 3 ‘Bachelor’ 4 ‘Master’ 5 ‘Doctor’
EnvironmentSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
JobInvolvement 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
JobSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
PerformanceRating 1 ‘Low’ 2 ‘Good’ 3 ‘Excellent’ 4 ‘Outstanding’
RelationshipSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
WorkLifeBalance 1 ‘Bad’ 2 ‘Good’ 3 ‘Better’ 4 ‘Best’
library(ggplot2)
# Histogram of employee ages, grouped into bins that are 5 years wide.
ggplot(data = df, mapping = aes(x = Age)) +
  geom_histogram(
    binwidth = 5,
    fill = "skyblue",
    color = "black",
    alpha = 0.7
  ) +
  theme_minimal() +
  labs(
    title = "Distribution of Employees Across Age Groups",
    x = "Age",
    y = "Number of Employees"
  )
# Count the rows whose Attrition value is "Yes"; sum() treats each TRUE as 1.
attrition_count <- sum(df[["Attrition"]] == "Yes")
print(
  paste(
    "Number of employees who have experienced attrition: ",
    attrition_count
  )
)
[1] "Number of employees who have experienced attrition: 237"
# Side-by-side bars: head count per marital status, split by gender.
ggplot(data = df, mapping = aes(x = MaritalStatus, fill = Gender)) +
  geom_bar(position = "dodge", color = "white") +
  theme_minimal() +
  labs(
    title = "Distribution of Gender and Marital Status Among Employees",
    x = "Marital Status",
    y = "Number of Employees",
    fill = "Gender"
  )
# Head count per business-travel category; bars are coloured by the same
# category that is shown on the x axis.
ggplot(data = df, mapping = aes(x = BusinessTravel, fill = BusinessTravel)) +
  geom_bar() +
  theme_minimal() +
  labs(
    title = "Distribution of Business Travel Frequency",
    x = "Business Travel Frequency",
    y = "Number of Employees"
  )
# Head count per department; bars are coloured by department.
ggplot(data = df, mapping = aes(x = Department, fill = Department)) +
  geom_bar() +
  theme_minimal() +
  labs(
    title = "Departmental Distribution",
    x = "Department",
    y = "Number of Employees"
  )
# Mean of the DailyRate column, reported rounded to two decimal places.
average_daily_rate <- mean(df[["DailyRate"]])
print(
  paste(
    "Average Daily Rate for Employees: $",
    round(average_daily_rate, digits = 2)
  )
)
[1] "Average Daily Rate for Employees: $ 802.49"
# Mean satisfaction score per attrition group for three measures:
# EnvironmentSatisfaction, RelationshipSatisfaction and WorkLifeBalance.
#
# The three columns are stacked into long format (one row per employee per
# measure) so that each measure can be drawn in its own facet. Adding one
# geom_bar() layer per column on the same x positions would paint the bars
# on top of each other and make the measures indistinguishable.
satisfaction_measures <- c(
  "EnvironmentSatisfaction",
  "RelationshipSatisfaction",
  "WorkLifeBalance"
)
satisfaction_long <- data.frame(
  Attrition = rep(df$Attrition, times = length(satisfaction_measures)),
  # unlist() concatenates the selected columns one after another, so each
  # measure name is repeated once per row of df to line up with its scores.
  Measure = rep(satisfaction_measures, each = nrow(df)),
  Score = unlist(df[satisfaction_measures], use.names = FALSE)
)

ggplot(satisfaction_long, aes(x = factor(Attrition), y = Score, fill = factor(Attrition))) +
  # stat = "summary" with fun = "mean" plots the mean score per bar.
  geom_bar(position = "dodge", stat = "summary", fun = "mean") +
  facet_wrap(~ Measure) +
  labs(
    title = "Satisfaction with Work Environment, Relationships, and Work-Life Balance",
    x = "Attrition",
    y = "Mean Satisfaction Score",
    fill = "Attrition"
  ) +
  scale_fill_manual(values = c("No" = "lightblue", "Yes" = "lightcoral")) +
  theme_minimal()
# Box plot of EnvironmentSatisfaction scores for each Attrition group.
ggplot(
  data = df,
  mapping = aes(x = Attrition, y = EnvironmentSatisfaction, fill = Attrition)
) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Environment Satisfaction by Attrition",
    x = "Attrition",
    y = "Environment Satisfaction",
    fill = "Attrition"
  )
# Head count per performance rating. The fill uses factor() so each rating
# gets a discrete colour, while the x axis keeps the raw rating value.
ggplot(
  data = df,
  mapping = aes(x = PerformanceRating, fill = factor(PerformanceRating))
) +
  geom_bar() +
  theme_minimal() +
  labs(
    title = "Distribution of Performance Ratings",
    x = "Performance Rating",
    y = "Number of Employees",
    fill = "Performance Rating"
  )
# Two-sample t-test (Welch, the t.test() default) comparing the mean
# PerformanceRating of the two Attrition groups.
t_test_result <- t.test(
  PerformanceRating ~ Attrition,
  data = df
)

# Show the test statistic, p-value, confidence interval and group means.
print(t_test_result)
Welch Two Sample t-test
data: PerformanceRating by Attrition
t = -0.10999, df = 331.22, p-value = 0.9125
alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
95 percent confidence interval:
-0.05350780 0.04784086
sample estimates:
mean in group No mean in group Yes
3.153285 3.156118
# Add up TrainingTimesLastYear across all employees.
total_training_sessions <- sum(df[["TrainingTimesLastYear"]])

# Report the total.
print(
  paste(
    "Total Training Sessions Last Year: ",
    total_training_sessions
  )
)
[1] "Total Training Sessions Last Year: 4115"
# Two-sample t-test (Welch, the t.test() default) comparing the mean
# TrainingTimesLastYear of the two Attrition groups.
t_test_result_2 <- t.test(
  TrainingTimesLastYear ~ Attrition,
  data = df
)

# Show the test statistic, p-value, confidence interval and group means.
print(t_test_result_2)
Welch Two Sample t-test
data: TrainingTimesLastYear by Attrition
t = 2.3305, df = 339.56, p-value = 0.02036
alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
95 percent confidence interval:
0.03251776 0.38439273
sample estimates:
mean in group No mean in group Yes
2.832928 2.624473
# Stacked bars: head count per value of YearsAtCompany, with each bar split
# by Attrition.
ggplot(data = df, mapping = aes(x = YearsAtCompany, fill = Attrition)) +
  geom_bar(position = "stack") +
  theme_minimal() +
  labs(
    title = "Trends in Attrition Over the Years",
    x = "Years at Company",
    y = "Number of Employees",
    fill = "Attrition"
  )
# Scatter plot of MonthlyIncome against YearsInCurrentRole, coloured by
# Attrition.
#
# Each employee is drawn exactly once, as a jittered point. Combining
# geom_point() with geom_jitter() would plot every observation twice (once
# at its exact position and once displaced), doubling the apparent number
# of employees.
ggplot(df, aes(x = YearsInCurrentRole, y = MonthlyIncome, color = Attrition)) +
  # Jitter horizontally only, to spread points that share the same number of
  # years; height = 0 keeps the plotted income values exact.
  geom_jitter(width = 0.25, height = 0) +
  labs(
    title = "Relationship Between Monthly Income and Years in Current Role",
    x = "Years in Current Role",
    y = "Monthly Income",
    color = "Attrition"
  ) +
  theme_minimal()
# Count the rows whose OverTime value is "Yes"; sum() treats each TRUE as 1.
employees_overtime <- sum(df[["OverTime"]] == "Yes")

# Report the count.
print(
  paste(
    "Number of Employees Working Overtime: ",
    employees_overtime
  )
)
[1] "Number of Employees Working Overtime: 416"
# Share of each Attrition outcome within every OverTime group.
# position = "fill" rescales each bar to a total height of 1, so the y axis
# holds proportions between 0 and 1.
ggplot(df, aes(x = OverTime, fill = Attrition)) +
  geom_bar(position = "fill") +
  labs(
    title = "Relationship Between Overtime and Attrition",
    x = "Overtime",
    y = "Proportion",
    fill = "Attrition"
  ) +
  # percent_format() multiplies by 100 by default, turning the 0-1
  # proportions into 0%-100% labels. (scale = 1 would label the top of the
  # axis as "1%".)
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()
It looks like the leading factors of attrition are salary (income), job satisfaction, and lack of work-life balance. An increase in training hours could potentially decrease attrition.