Developing retention strategies is important to the health of a company. Employee turnover has many consequences, such as low morale, increased recruitment costs, and decreased productivity. The goal of this analysis is to discover why employees leave and to build a model that predicts attrition so the company can intervene before employees leave.
Variable types were changed to allow proper analysis and missing values were checked.
#Loading packages
# Package names
packages <- c("tidyverse", "ggcorrplot", "randomForest", "here", "knitr", "gbm",
              "caret", "car", "readr", "DescTools", "MASS", "rstatix", "viridis",
              "pdp", "rmarkdown")
# Install any packages that are not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(!installed_packages)) {
  install.packages(packages[!installed_packages])
}
# Load the packages
invisible(lapply(packages, library, character.only = TRUE))
here()
hr_data = read.csv(here("Input", "hr_ibm_data_2.csv"))
#Checking the structure of the data
str(hr_data)
#The Age column was imported with a garbled name (ï..Age, a byte-order-mark artifact), so it will be renamed.
hr_data =
hr_data %>%
dplyr::rename(Age = ï..Age)
#categorical variables are currently character and need to be converted to factor variables
#some variables are integer type but should be factor
# Education
# 1 'Below College'
# 2 'College'
# 3 'Bachelor'
# 4 'Master'
# 5 'Doctor'
#
# EnvironmentSatisfaction
# 1 'Low'
# 2 'Medium'
# 3 'High'
# 4 'Very High'
#
# JobInvolvement
# 1 'Low'
# 2 'Medium'
# 3 'High'
# 4 'Very High'
#
# JobSatisfaction
# 1 'Low'
# 2 'Medium'
# 3 'High'
# 4 'Very High'
#
# PerformanceRating
# 1 'Low'
# 2 'Good'
# 3 'Excellent'
# 4 'Outstanding'
#
# RelationshipSatisfaction
# 1 'Low'
# 2 'Medium'
# 3 'High'
# 4 'Very High'
#
# WorkLifeBalance
# 1 'Bad'
# 2 'Good'
# 3 'Better'
# 4 'Best'
hr_data$Attrition = as.factor(hr_data$Attrition)
hr_data$BusinessTravel = as.factor(hr_data$BusinessTravel)
hr_data$Department = as.factor(hr_data$Department)
hr_data$Gender = as.factor(hr_data$Gender)
hr_data$OverTime = as.factor(hr_data$OverTime)
hr_data$JobRole = as.factor(hr_data$JobRole)
hr_data$Over18 = as.factor(hr_data$Over18)
hr_data$MaritalStatus = as.factor(hr_data$MaritalStatus)
hr_data$EducationField = as.factor(hr_data$EducationField)
#Education
hr_data$Education[hr_data$Education == 1] = "Below College"
hr_data$Education[hr_data$Education == 2] = "College"
hr_data$Education[hr_data$Education == 3] = "Bachelor"
hr_data$Education[hr_data$Education == 4] = "Master"
hr_data$Education[hr_data$Education == 5] = "Doctor"
hr_data$Education = as.factor(hr_data$Education)
#EnvironmentStatisfaction
hr_data$EnvironmentSatisfaction[hr_data$EnvironmentSatisfaction == 1] = "Low"
hr_data$EnvironmentSatisfaction[hr_data$EnvironmentSatisfaction == 2] = "Medium"
hr_data$EnvironmentSatisfaction[hr_data$EnvironmentSatisfaction == 3] = "High"
hr_data$EnvironmentSatisfaction[hr_data$EnvironmentSatisfaction == 4] = "Very High"
hr_data$EnvironmentSatisfaction = as.factor(hr_data$EnvironmentSatisfaction)
#JobInvolvement
hr_data$JobInvolvement[hr_data$JobInvolvement == 1] = "Low"
hr_data$JobInvolvement[hr_data$JobInvolvement == 2] = "Medium"
hr_data$JobInvolvement[hr_data$JobInvolvement == 3] = "High"
hr_data$JobInvolvement[hr_data$JobInvolvement == 4] = "Very High"
hr_data$JobInvolvement = as.factor(hr_data$JobInvolvement)
#JobSatisfaction
hr_data$JobSatisfaction[hr_data$JobSatisfaction == 1] = "Low"
hr_data$JobSatisfaction[hr_data$JobSatisfaction == 2] = "Medium"
hr_data$JobSatisfaction[hr_data$JobSatisfaction == 3] = "High"
hr_data$JobSatisfaction[hr_data$JobSatisfaction == 4] = "Very High"
hr_data$JobSatisfaction = as.factor(hr_data$JobSatisfaction)
#PerformanceRating
hr_data$PerformanceRating[hr_data$PerformanceRating == 1] = "Low"
hr_data$PerformanceRating[hr_data$PerformanceRating == 2] = "Good"
hr_data$PerformanceRating[hr_data$PerformanceRating == 3] = "Excellent"
hr_data$PerformanceRating[hr_data$PerformanceRating == 4] = "Outstanding"
hr_data$PerformanceRating = as.factor(hr_data$PerformanceRating)
#RelationshipSatisfaction
hr_data$RelationshipSatisfaction[hr_data$RelationshipSatisfaction == 1] = "Low"
hr_data$RelationshipSatisfaction[hr_data$RelationshipSatisfaction == 2] = "Medium"
hr_data$RelationshipSatisfaction[hr_data$RelationshipSatisfaction == 3] = "High"
hr_data$RelationshipSatisfaction[hr_data$RelationshipSatisfaction == 4] = "Very High"
hr_data$RelationshipSatisfaction = as.factor(hr_data$RelationshipSatisfaction)
#WorkLifeBalance
hr_data$WorkLifeBalance[hr_data$WorkLifeBalance == 1] = "Bad"
hr_data$WorkLifeBalance[hr_data$WorkLifeBalance == 2] = "Good"
hr_data$WorkLifeBalance[hr_data$WorkLifeBalance == 3] = "Better"
hr_data$WorkLifeBalance[hr_data$WorkLifeBalance == 4] = "Best"
hr_data$WorkLifeBalance = as.factor(hr_data$WorkLifeBalance)
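As an aside, the repetitive per-value recodes above can be collapsed into single factor() calls that map the integer codes straight to labels; a minimal sketch for two of the columns (a drop-in replacement for the assignments above, not something to run in addition, since the columns have already been converted):
# Alternative to the per-value recodes above (run instead of them, not after)
sat_labels <- c("Low", "Medium", "High", "Very High")
hr_data$EnvironmentSatisfaction <- factor(hr_data$EnvironmentSatisfaction,
                                          levels = 1:4, labels = sat_labels)
hr_data$Education <- factor(hr_data$Education, levels = 1:5,
                            labels = c("Below College", "College", "Bachelor",
                                       "Master", "Doctor"))
Written this way, the level order is also fixed up front, so the factor(..., levels = ...) reordering done before the plots below would not be needed.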
#Checking to see if the changes were implemented correctly
str(hr_data)
#Checking for missing values
sum(is.na(hr_data))
#There are no missing data points
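The check above counts missing values across the whole data frame at once; a per-column tally confirms nothing is hiding in a single variable:
# Per-column missing-value counts (all should be zero)
colSums(is.na(hr_data))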
The company has a 16% attrition rate.
hr_data %>%
dplyr::group_by(Attrition) %>%
dplyr::summarise(cnt = n()) %>%
dplyr::mutate(freq = (cnt / sum(cnt))*100) %>%
ggplot(aes(x = Attrition, y = freq, fill = Attrition)) +
geom_bar(stat = "identity") +
geom_text(aes(label = paste0(round(freq,0), "%")),
position = position_stack(vjust = 0.5), size = 3) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
labs(title = "Attrition", x = "Attrition", y = "Percentage") +
scale_fill_manual(values = c("#fde725", "#21918c"))
Employees who travel frequently have the highest turnover rate.
hr_data$Attrition = factor(hr_data$Attrition, levels = c("Yes", "No"))
hr_data %>%
dplyr::group_by(BusinessTravel, Attrition) %>%
dplyr::summarise(cnt = n()) %>%
dplyr::mutate(freq = (cnt / sum(cnt))*100) %>%
ggplot(aes(x = BusinessTravel, y = freq, fill = Attrition)) +
geom_bar(position = position_stack(), stat = "identity", width = .7) +
geom_text(aes(label = paste0(round(freq,0), "%")),
position = position_stack(vjust = 0.5), size = 3) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
labs(title = "Business Travel and Attrition", x = "Business Travel", y = "Percentage") +
scale_x_discrete(breaks = c("Travel_Rarely", "Travel_Frequently", "Non-Travel"),
labels = c("Travel Rarely","Travel Frequently", "Non Travel")) +
scale_fill_manual(values = c("#fde725", "#21918c"))
Employees in the Sales Department have the highest turnover.
hr_data$Attrition = factor(hr_data$Attrition, levels = c("Yes", "No"))
hr_data %>%
dplyr::group_by(Department, Attrition) %>%
dplyr::summarise(cnt = n()) %>%
dplyr::mutate(freq = (cnt / sum(cnt))*100) %>%
ggplot(aes(x = Department, y = freq, fill = Attrition)) +
geom_bar(position = position_stack(), stat = "identity", width = .7) +
geom_text(aes(label = paste0(round(freq,0), "%")),
position = position_stack(vjust = 0.5), size = 3) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
labs(title = "Department and Attrition", x = "Department", y = "Percentage") +
scale_fill_manual(values = c("#fde725", "#21918c"))
The attrition rates for men and women are very similar; men have a slightly higher turnover rate.
hr_data$Attrition = factor(hr_data$Attrition, levels = c("Yes", "No"))
hr_data %>%
dplyr::group_by(Gender, Attrition) %>%
dplyr::summarise(cnt = n()) %>%
dplyr::mutate(freq = (cnt / sum(cnt))*100) %>%
ggplot(aes(x = Gender, y = freq, fill = Attrition)) +
geom_bar(position = position_stack(), stat = "identity", width = .7) +
geom_text(aes(label = paste0(round(freq,0), "%")),
position = position_stack(vjust = 0.5), size = 3) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
labs(title = "Gender and Attrition", x = "Gender", y = "Percentage") +
scale_fill_manual(values = c("#fde725", "#21918c"))
Employees who work overtime have the highest turnover rate.
hr_data$Attrition = factor(hr_data$Attrition, levels = c("Yes", "No"))
hr_data %>%
dplyr::group_by(OverTime, Attrition) %>%
dplyr::summarise(cnt = n()) %>%
dplyr::mutate(freq = (cnt / sum(cnt))*100) %>%
ggplot(aes(x = OverTime, y = freq, fill = Attrition)) +
geom_bar(position = position_stack(), stat = "identity", width = .7) +
geom_text(aes(label = paste0(round(freq,0), "%")),
position = position_stack(vjust = 0.5), size = 3) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
labs(title = "Over Time and Attrition", x = "Over Time", y = "Percentage") +
scale_fill_manual(values = c("#fde725", "#21918c"))
Sales representatives have the highest attrition rate.
hr_data$Attrition = factor(hr_data$Attrition, levels = c("Yes", "No"))
hr_data %>%
dplyr::group_by(JobRole, Attrition) %>%
dplyr::summarise(cnt = n()) %>%
dplyr::mutate(freq = (cnt / sum(cnt))*100) %>%
ggplot(aes(x = JobRole, y = freq, fill = Attrition)) +
geom_bar(position = position_stack(), stat = "identity", width = .7) +
geom_text(aes(label = paste0(round(freq,0), "%")),
position = position_stack(vjust = 0.5), size = 3) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
labs(title = "Job Role and Attrition", x = "Job Role", y = "Percentage") +
scale_fill_manual(values = c("#fde725", "#21918c")) +
theme(axis.text.x = element_text(angle = 20, hjust = 0.5))
Employees with low environment satisfaction leave more often than employees with medium, high, or very high environment satisfaction.
hr_data$Attrition = factor(hr_data$Attrition, levels = c("Yes", "No"))
hr_data$EnvironmentSatisfaction = factor(hr_data$EnvironmentSatisfaction, levels = c("Low",
"Medium",
"High",
"Very High"))
hr_data %>%
dplyr::group_by(EnvironmentSatisfaction, Attrition) %>%
dplyr::summarise(cnt = n()) %>%
dplyr::mutate(freq = (cnt / sum(cnt))*100) %>%
ggplot(aes(x = EnvironmentSatisfaction, y = freq, fill = Attrition)) +
geom_bar(position = position_stack(), stat = "identity", width = .7) +
geom_text(aes(label = paste0(round(freq,0), "%")),
position = position_stack(vjust = 0.5), size = 3) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
labs(title = "Environment Satisfaction and Attrition", x = "Environment Satisfaction", y = "Percentage") +
scale_fill_manual(values = c("#fde725", "#21918c"))
Employees with low job satisfaction leave more often than employees with medium, high, or very high job satisfaction.
hr_data$Attrition = factor(hr_data$Attrition, levels = c("Yes", "No"))
hr_data$JobSatisfaction = factor(hr_data$JobSatisfaction, levels = c("Low",
"Medium",
"High",
"Very High"))
hr_data %>%
dplyr::group_by(JobSatisfaction, Attrition) %>%
dplyr::summarise(cnt = n()) %>%
dplyr::mutate(freq = (cnt / sum(cnt))*100) %>%
ggplot(aes(x = JobSatisfaction, y = freq, fill = Attrition)) +
geom_bar(position = position_stack(), stat = "identity", width = .7) +
geom_text(aes(label = paste0(round(freq,0), "%")),
position = position_stack(vjust = 0.5), size = 3) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
labs(title = "Job Satisfaction and Attrition", x = "Job Satisfaction", y = "Percentage") +
scale_fill_manual(values = c("#fde725", "#21918c"))
Employees with a bad work-life balance have the highest turnover rate.
hr_data$Attrition = factor(hr_data$Attrition, levels = c("Yes", "No"))
hr_data$WorkLifeBalance = factor(hr_data$WorkLifeBalance, levels = c("Bad",
"Good",
"Better",
"Best"))
hr_data %>%
dplyr::group_by(WorkLifeBalance, Attrition) %>%
dplyr::summarise(cnt = n()) %>%
dplyr::mutate(freq = (cnt / sum(cnt))*100) %>%
ggplot(aes(x = WorkLifeBalance, y = freq, fill = Attrition)) +
geom_bar(position = position_stack(), stat = "identity", width = .7) +
geom_text(aes(label = paste0(round(freq,0), "%")),
position = position_stack(vjust = 0.5), size = 3) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
labs(title = "Work Life Balance and Attrition", x = "Work Life Balance", y = "Percentage") +
scale_fill_manual(values = c("#fde725", "#21918c"))
Employees who left tended to have lower monthly incomes.
ggplot(hr_data, aes(x=MonthlyIncome, fill=Attrition, color=Attrition)) +
geom_histogram(position="identity", alpha=0.5) +
labs(title = "Distribution of Monthly Income") +
scale_color_manual(values=c("#fde725", "#21918c")) +
scale_fill_manual(values=c("#fde725", "#21918c"))
Employees who left tended to have spent fewer years at the company.
ggplot(hr_data, aes(x=YearsAtCompany, fill=Attrition, color=Attrition)) +
geom_histogram(position="identity", alpha=0.5) +
labs(title = "Distribution of Years at Company") +
scale_color_manual(values=c("#fde725", "#21918c")) +
scale_fill_manual(values=c("#fde725", "#21918c"))
Several pairs of numeric variables are highly correlated:

- total working years and years at company (r = .63)
- total working years and age (r = .68)
- total working years and monthly income (r = .77)
- years in current role and years with current manager (r = .71)
- years with current manager and years at company (r = .77)
- years at company and years in current role (r = .76)
cordata <- hr_data %>%
dplyr::select(c("DistanceFromHome", "MonthlyIncome",
"NumCompaniesWorked", "TotalWorkingYears",
"YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager", "Age"))
cormatrix <- cor(cordata)
round(cormatrix, 2)
ggcorrplot(cormatrix, hc.order = TRUE, outline.color = "white", lab = TRUE,
           colors = c("#52c569", "white", "#fde725"), lab_size = 2.5) +
labs(title="Correlation of Numeric Variables")
To decide which categorical variables should be kept in the attrition model, chi-square tests will be used to test whether there is a relationship between each categorical variable and attrition. The null hypothesis is that the two variables are independent; the alternative hypothesis is that they are not. To reject the null hypothesis and keep a variable in the model, its p-value must be below .05.
The variables left out of the model are education, gender, performance rating, and relationship satisfaction. Their p-values are all above .05, so we cannot conclude they are associated with attrition.
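As a more compact alternative to the one-off calls below, the same tests can be run in a loop and the p-values collected in one place; a sketch (cat_vars simply lists the columns tested in this analysis):
cat_vars <- c("BusinessTravel", "Department", "Education", "EducationField",
              "EnvironmentSatisfaction", "Gender", "JobInvolvement", "JobRole",
              "JobSatisfaction", "MaritalStatus", "PerformanceRating", "OverTime",
              "RelationshipSatisfaction", "StockOptionLevel", "WorkLifeBalance")
# Chi-square test of each categorical variable against Attrition
chisq_pvals <- sapply(cat_vars, function(v) {
  chisq.test(hr_data[[v]], hr_data$Attrition)$p.value
})
sort(round(chisq_pvals, 3))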
chisq.test(hr_data$BusinessTravel, hr_data$Attrition)
chisq.test(hr_data$Department, hr_data$Attrition)
chisq.test(hr_data$Education, hr_data$Attrition)
chisq.test(hr_data$EducationField, hr_data$Attrition)
chisq.test(hr_data$EnvironmentSatisfaction, hr_data$Attrition)
chisq.test(hr_data$Gender, hr_data$Attrition)
chisq.test(hr_data$JobInvolvement, hr_data$Attrition)
chisq.test(hr_data$JobRole, hr_data$Attrition)
chisq.test(hr_data$JobSatisfaction, hr_data$Attrition)
chisq.test(hr_data$MaritalStatus, hr_data$Attrition)
chisq.test(hr_data$PerformanceRating, hr_data$Attrition)
chisq.test(hr_data$OverTime, hr_data$Attrition)
chisq.test(hr_data$RelationshipSatisfaction, hr_data$Attrition)
chisq.test(hr_data$StockOptionLevel, hr_data$Attrition)
chisq.test(hr_data$WorkLifeBalance, hr_data$Attrition)
chisq_results =
data.frame(Variable = c("Business Travel",
"Department",
"Education",
"Education Field",
"Environment Satisfaction",
"Gender",
"Job Involvement",
"Job Role",
"Job Satisfaction",
"Marital Status",
"Over Time",
"Performance Rating",
"Relationship Satisfaction",
"Stock Option Level",
"Work Life Balance"),
Chi_Sq_Stat = c(24.18,
10.80,
3.07,
16.03,
22.50,
1.13,
28.49,
86.19,
17.51,
46.16,
87.54,
0.00,
5.241,
60.51,
16.33),
P_value = c(0.00,
0.00,
0.55,
0.01,
0.00,
0.29,
0.00,
0.00,
0.00,
0.00,
0.00,
0.99,
0.16,
0.00,
0.00),
Stat_Sig = c("Yes",
"Yes",
"No",
"Yes",
"Yes",
"No",
"Yes",
"Yes",
"Yes",
"Yes",
"Yes",
"No",
"No",
"Yes",
"Yes"))
kable(chisq_results,
col.names = c("Variable","Chi-Square Statistic", "p-value", "Statistically Significant"))
| Variable | Chi-Square Statistic | p-value | Statistically Significant |
|---|---|---|---|
| Business Travel | 24.180 | 0.00 | Yes |
| Department | 10.800 | 0.00 | Yes |
| Education | 3.070 | 0.55 | No |
| Education Field | 16.030 | 0.01 | Yes |
| Environment Satisfaction | 22.500 | 0.00 | Yes |
| Gender | 1.130 | 0.29 | No |
| Job Involvement | 28.490 | 0.00 | Yes |
| Job Role | 86.190 | 0.00 | Yes |
| Job Satisfaction | 17.510 | 0.00 | Yes |
| Marital Status | 46.160 | 0.00 | Yes |
| Over Time | 87.540 | 0.00 | Yes |
| Performance Rating | 0.000 | 0.99 | No |
| Relationship Satisfaction | 5.241 | 0.16 | No |
| Stock Option Level | 60.510 | 0.00 | Yes |
| Work Life Balance | 16.330 | 0.00 | Yes |
To decide whether any numerical variables should be kept in the attrition model, I am using ANOVA to test whether the means of the numerical variables differ between attrition groups. The null hypothesis of ANOVA is that there is no difference between means; the alternative hypothesis states there is a difference. If the p-value of the ANOVA is below .05, we reject the null hypothesis and keep the variable in the model. Distance from home and monthly income will be tested. The other numeric variables are counts, so they cannot be evaluated with ANOVA.
One of the assumptions of ANOVA is equal variances, so I'm using Levene's test to determine whether that assumption has been violated. The null hypothesis of Levene's test is that the variances are equal across groups; the alternative hypothesis is that they are not. If the p-value of Levene's test is greater than .05, we fail to reject the null hypothesis and the equal-variance assumption is considered met; when it is below .05, Welch's ANOVA (oneway.test with var.equal = FALSE), which does not assume equal variances, is reported instead.
Distance from home and monthly income are both statistically significant, so they will be kept in the attrition model.
# ANOVA: Distance From Home and Attrition
aov.res = aov(DistanceFromHome~Attrition, data = hr_data)
summary(aov.res)
leveneTest(aov.res)
oneway.test(DistanceFromHome~Attrition, data = hr_data, var.equal = FALSE)
# ANOVA: Monthly Income and Attrition
aov.res = aov(MonthlyIncome~Attrition, data = hr_data)
summary(aov.res)
leveneTest(aov.res)
oneway.test(MonthlyIncome~Attrition, data = hr_data, var.equal = FALSE)
anova_results =
data.frame(Variable = c("Distance From Home",
"Monthly Income"),
F_Stat = c(8.34,
55.99),
P_value = c(0.00,
0.00),
Levene_Test = c(0.048,
0.00),
Stat_Sig = c("Yes",
"Yes"))
kable(anova_results ,
col.names = c("Variable","F Statistic", "p-value", "Levene's Test p-value","Statistically Significant"))
| Variable | F Statistic | p-value | Levene’s Test p-value | Statistically Significant |
|---|---|---|---|---|
| Distance From Home | 8.34 | 0 | 0.048 | Yes |
| Monthly Income | 55.99 | 0 | 0.000 | Yes |
Creating training (80%) and test (20%) datasets to train and evaluate the models.
set.seed(123)
index = createDataPartition(y=hr_data$Attrition,
p = .8,list = FALSE )
train = hr_data[index,]
test = hr_data[-index,]
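createDataPartition() samples within each level of Attrition, so both sets should preserve the overall 16% attrition rate; a quick sanity check:
# Confirm the stratified split kept the class balance
prop.table(table(train$Attrition))
prop.table(table(test$Attrition))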
Since there is class imbalance in the target variable Attrition, upsampling and downsampling will be used. Upsampling randomly replicates instances of the minority class, and downsampling randomly removes instances of the majority class. Models will be trained with cross-validation only, with upsampling, and with downsampling to see how these methods affect model performance. In caret, the sampling is applied inside each cross-validation resample, so the held-out folds stay untouched.
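To see what the samplers do to the class counts before any modeling, caret also ships standalone upSample() and downSample() helpers; a minimal illustration on the training set (for inspection only; the models below let train() apply the sampling internally):
# Illustrate each sampler's effect on the class counts (inspection only)
up_train <- caret::upSample(x = train[, setdiff(names(train), "Attrition")],
                            y = train$Attrition, yname = "Attrition")
down_train <- caret::downSample(x = train[, setdiff(names(train), "Attrition")],
                                y = train$Attrition, yname = "Attrition")
table(up_train$Attrition)
table(down_train$Attrition)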
up.model.control =
trainControl(method = "repeatedcv",
repeats = 3,
number = 10,
classProbs = TRUE,
sampling = "up",
summaryFunction = prSummary)
down.model.control =
trainControl(method = "repeatedcv",
repeats = 3,
number = 10,
classProbs = TRUE,
sampling = "down",
summaryFunction = prSummary)
control = trainControl(method = "repeatedcv",
repeats = 3,
number = 10,
classProbs = TRUE,
summaryFunction = prSummary)
set.seed(123)
rf.model =
train(Attrition ~
BusinessTravel +
EnvironmentSatisfaction +
JobInvolvement +
MonthlyIncome +
JobSatisfaction +
NumCompaniesWorked +
OverTime +
TotalWorkingYears +
WorkLifeBalance +
YearsAtCompany +
YearsInCurrentRole +
YearsSinceLastPromotion +
Department +
DistanceFromHome +
JobRole +
YearsWithCurrManager+
PercentSalaryHike +
TrainingTimesLastYear,
data = train,
method = "rf",
metric = "Recall",
trControl = control)
rf.model
p1 = predict(rf.model, newdata=test, type="raw")
caret::confusionMatrix(as.factor(p1), test$Attrition, positive = 'Yes', mode = 'everything')
Train results for random forest with only cross-validation.
Test results for random forest with only cross-validation.
set.seed(123)
rf.model.up =
train(Attrition ~
BusinessTravel +
EnvironmentSatisfaction +
JobInvolvement +
MonthlyIncome +
JobSatisfaction +
NumCompaniesWorked +
OverTime +
TotalWorkingYears +
WorkLifeBalance +
YearsAtCompany +
YearsInCurrentRole +
YearsSinceLastPromotion +
Department +
DistanceFromHome +
JobRole +
YearsWithCurrManager+
PercentSalaryHike +
TrainingTimesLastYear,
data = train,
method = "rf",
metric = "Recall",
trControl = up.model.control)
rf.model.up
p2 = predict(rf.model.up, newdata=test, type="raw")
caret::confusionMatrix(as.factor(p2), test$Attrition, positive = 'Yes', mode = 'everything')
Train results for random forest with upsampling.
Test results for random forest with upsampling.
set.seed(123)
rf.model.down =
train(Attrition ~
BusinessTravel +
EnvironmentSatisfaction +
JobInvolvement +
MonthlyIncome +
JobSatisfaction +
NumCompaniesWorked +
OverTime +
TotalWorkingYears +
WorkLifeBalance +
YearsAtCompany +
YearsInCurrentRole +
YearsSinceLastPromotion +
Department +
DistanceFromHome +
JobRole +
YearsWithCurrManager+
PercentSalaryHike +
TrainingTimesLastYear,
data = train,
method = "rf",
metric = "Recall",
trControl = down.model.control)
rf.model.down
p3 = predict(rf.model.down, newdata=test, type="raw")
caret::confusionMatrix(as.factor(p3), test$Attrition, positive = 'Yes', mode = 'everything')
Train results for random forest with downsampling.
Test results for random forest with downsampling.
set.seed(123)
gbm_fit <- train(Attrition ~
BusinessTravel +
EnvironmentSatisfaction +
JobInvolvement +
MonthlyIncome +
JobSatisfaction +
NumCompaniesWorked +
OverTime +
TotalWorkingYears +
WorkLifeBalance +
YearsAtCompany +
YearsInCurrentRole +
YearsSinceLastPromotion +
Department +
DistanceFromHome +
JobRole +
YearsWithCurrManager+
PercentSalaryHike +
TrainingTimesLastYear,
data = train,
method = "gbm",
verbose = FALSE,
metric = "Recall",
trControl = control)
print(gbm_fit)
p4 = predict(gbm_fit, newdata=test, type="raw")
caret::confusionMatrix(as.factor(p4), test$Attrition, positive = 'Yes', mode = 'everything')
Train results for GBM with just cross-validation.
Test results for GBM with just cross-validation.
set.seed(123)
up_fit <- train(Attrition ~
BusinessTravel +
EnvironmentSatisfaction +
JobInvolvement +
MonthlyIncome +
JobSatisfaction +
NumCompaniesWorked +
OverTime +
TotalWorkingYears +
WorkLifeBalance +
YearsAtCompany +
YearsInCurrentRole +
YearsSinceLastPromotion +
Department +
DistanceFromHome +
JobRole +
YearsWithCurrManager+
PercentSalaryHike +
TrainingTimesLastYear,
data = train,
method = "gbm",
verbose = FALSE,
metric = "Recall",
trControl = up.model.control)
print(up_fit)
p5 = predict(up_fit, newdata=test, type="raw")
caret::confusionMatrix(as.factor(p5), test$Attrition, positive = 'Yes', mode = 'everything')
Train results for GBM with upsampling.
Test results for GBM with upsampling.
set.seed(123)
down_fit <- train(Attrition ~
BusinessTravel +
EnvironmentSatisfaction +
JobInvolvement +
MonthlyIncome +
JobSatisfaction +
NumCompaniesWorked +
OverTime +
TotalWorkingYears +
WorkLifeBalance +
YearsAtCompany +
YearsInCurrentRole +
YearsSinceLastPromotion +
Department +
DistanceFromHome +
JobRole +
YearsWithCurrManager+
PercentSalaryHike +
TrainingTimesLastYear,
data = train,
method = "gbm",
verbose = FALSE,
metric = "Recall",
trControl = down.model.control)
print(down_fit)
p6 = predict(down_fit, newdata=test, type="raw")
caret::confusionMatrix(as.factor(p6), test$Attrition, positive = 'Yes', mode = 'everything')
Train results for GBM with downsampling.
Test results for GBM with downsampling.
Recall is used to assess model performance instead of accuracy because, in this setting, it is important to minimize false negatives. If we label too many employees as "no attrition" when they actually do leave, we miss the very employees the company wants to retain, which is costly. The GBM with downsampling is the best-performing model: it has a train recall of 0.7295 and a test recall of 0.7872, while the other models post higher training recall but generalize far worse, with much lower test recall.
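For reference, recall for the positive class is TP / (TP + FN); a quick sketch of the calculation straight from the GBM-downsampling predictions above (p6 against the test labels):
# Recall for the "Yes" class: true positives over all actual positives
cm <- table(Predicted = p6, Actual = test$Attrition)
cm["Yes", "Yes"] / sum(cm[, "Yes"])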
model_summary =
data.frame(Model = c("Random Forest",
"Random Forest Upsampling",
"Random Forest Downsampling",
"GBM",
"GBM Upsampling",
"GBM Downsampling"),
recall_training_score = c(.9986,
.9443,
.7423,
.9960,
.8267,
.7295),
recall_test_score = c(.06383,
.34043,
.7021,
.1064,
.6809,
.7872))
kable(model_summary,
col.names = c("Model","Train Recall", "Test Recall"))
| Model | Train Recall | Test Recall |
|---|---|---|
| Random Forest | 0.9986 | 0.06383 |
| Random Forest Upsampling | 0.9443 | 0.34043 |
| Random Forest Downsampling | 0.7423 | 0.70210 |
| GBM | 0.9960 | 0.10640 |
| GBM Upsampling | 0.8267 | 0.68090 |
| GBM Downsampling | 0.7295 | 0.78720 |
Monthly income and overtime are the most important variables in the model. Other important variables relate to work history and to distance from the office.
gbm.varimp = varImp(down_fit, scale = FALSE)
imp <- data.frame(importance = gbm.varimp$importance) %>%
tibble::rownames_to_column(var = "variable")
imp %>% ggplot(aes(x = reorder(variable,Overall), y = Overall)) +
geom_bar(stat = "identity", fill = "#21918c", color = "black")+
coord_flip() +
labs(x = "Variables", y = "Variable importance")
Sixteen percent of employees left the company.
In the stacked bar charts, we saw employees who left were more likely to:

- travel frequently for business
- work in the Sales Department, especially as sales representatives
- work overtime
- report low environment satisfaction, low job satisfaction, or a bad work-life balance
- earn a lower monthly income and have fewer years at the company
Chi-square results revealed that gender, education, performance rating, and relationship satisfaction did not play a significant role in employee attrition.
From the chi-square tests and ANOVA, statistically significant variables that affected an employee's decision to leave include: business travel, department, education field, environment satisfaction, job involvement, job role, job satisfaction, marital status, overtime, stock option level, work-life balance, distance from home, and monthly income.
GBM with downsampling performed best at minimizing false negatives, which keeps us from overlooking employees who will actually leave. According to the variable importance plot, monthly income and overtime are critical to attrition; other important variables relate to work history and distance from the office.
To prevent attrition, the company could consider raising wages, fostering a company culture that promotes work-life balance, and allowing remote work so employees don't have long commutes. Remote work would also permit flexible schedules that help with work-life balance.