library(readr)
library(tidyverse)
library(plotly)
df <- read_csv("~/Library/Mobile Documents/com~apple~CloudDocs/Fairfield University/Fall 2024/DATA 6520/Assignments/Group/HR_comma_sep.csv") %>%
mutate(employee_status = ifelse(left == 1 , 'Left' , 'Stayed') ,
work_accident = ifelse(Work_accident == 1 , 'Yes' , 'No') ,
promotion_last_5years = ifelse(promotion_last_5years == 1 , 'Yes' , 'No')) %>%
select(-left , -Work_accident) %>%
rename(average_monthly_hours = average_montly_hours)
# Load required libraries
library(tidyverse)
library(caret)
library(car)
library(ROCR)
library(pROC)
# Prepare data for logistic regression
df_model <- df %>%
mutate(target = ifelse(employee_status == "Left", 1, 0),
department = as.factor(department),
salary = as.factor(salary),
work_accident = as.factor(work_accident),
promotion_last_5years = as.factor(promotion_last_5years))
# Split data into training and testing sets
set.seed(123)
train_index <- createDataPartition(df_model$target, p = 0.7, list = FALSE)
train_data <- df_model[train_index, ]
test_data <- df_model[-train_index, ]
# Create logistic regression model
model <- glm(target ~ satisfaction_level + last_evaluation + number_project +
average_monthly_hours + time_spend_company + work_accident +
promotion_last_5years + department + salary,
data = train_data, family = "binomial")
# Display model summary
summary(model)
##
## Call:
## glm(formula = target ~ satisfaction_level + last_evaluation +
## number_project + average_monthly_hours + time_spend_company +
## work_accident + promotion_last_5years + department + salary,
## family = "binomial", data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.200751 0.360745 -6.101 1.06e-09 ***
## satisfaction_level -4.639888 0.192578 -24.094 < 2e-16 ***
## last_evaluation 0.651139 0.300690 2.165 0.0304 *
## number_project -0.455438 0.043370 -10.501 < 2e-16 ***
## average_monthly_hours 0.002181 0.001057 2.063 0.0391 *
## time_spend_company 1.010193 0.048974 20.627 < 2e-16 ***
## work_accidentYes -1.477157 0.161526 -9.145 < 2e-16 ***
## promotion_last_5yearsYes -1.022645 0.482370 -2.120 0.0340 *
## departmenthr 0.179725 0.255711 0.703 0.4822
## departmentIT 0.169669 0.233481 0.727 0.4674
## departmentmanagement 0.116421 0.320859 0.363 0.7167
## departmentmarketing 0.122366 0.255587 0.479 0.6321
## departmentproduct_mng 0.167479 0.252334 0.664 0.5069
## departmentRandD -0.412785 0.253574 -1.628 0.1036
## departmentsales 0.192218 0.198308 0.969 0.3324
## departmentsupport 0.235785 0.210242 1.121 0.2621
## departmenttechnical 0.131901 0.204717 0.644 0.5194
## salarylow 1.718768 0.245719 6.995 2.66e-12 ***
## salarymedium 1.348420 0.247863 5.440 5.32e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5153.6 on 3946 degrees of freedom
## Residual deviance: 3548.4 on 3928 degrees of freedom
## AIC: 3586.4
##
## Number of Fisher Scoring iterations: 5
# Calculate VIF to check multicollinearity
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## satisfaction_level 1.258732 1 1.121932
## last_evaluation 1.786043 1 1.336429
## number_project 2.283550 1 1.511142
## average_monthly_hours 1.866632 1 1.366247
## time_spend_company 1.425222 1 1.193827
## work_accident 1.010641 1 1.005306
## promotion_last_5years 1.008243 1 1.004113
## department 1.020682 9 1.001138
## salary 1.014655 2 1.003644
# Make predictions on test set
predictions_prob <- predict(model, test_data, type = "response")
predictions_class <- ifelse(predictions_prob > 0.5, 1, 0)
# Calculate model performance metrics
conf_matrix <- confusionMatrix(factor(predictions_class),
factor(test_data$target),
positive = "1")
# Print model performance metrics
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 933 156
## 1 175 427
##
## Accuracy : 0.8043
## 95% CI : (0.7845, 0.8229)
## No Information Rate : 0.6552
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5701
##
## Mcnemar's Test P-Value : 0.3225
##
## Sensitivity : 0.7324
## Specificity : 0.8421
## Pos Pred Value : 0.7093
## Neg Pred Value : 0.8567
## Prevalence : 0.3448
## Detection Rate : 0.2525
## Detection Prevalence : 0.3560
## Balanced Accuracy : 0.7872
##
## 'Positive' Class : 1
##
# Create ROC curve
roc_obj <- roc(test_data$target, predictions_prob)
auc_value <- auc(roc_obj)
# Plot ROC curve
plot(roc_obj, main = "ROC Curve for Employee Attrition Model")
abline(a = 0, b = 1, lty = 2, col = "gray")
text(0.8, 0.2, paste("AUC =", round(auc_value, 3)))

# Calculate odds ratios and confidence intervals
odds_ratios <- exp(coef(model))
conf_int <- exp(confint(model))
# Combine odds ratios and confidence intervals
odds_table <- data.frame(
OR = odds_ratios,
CI_lower = conf_int[,1],
CI_upper = conf_int[,2]
) %>%
round(3)
# Print odds ratios table
print(odds_table)
## OR CI_lower CI_upper
## (Intercept) 0.111 0.054 0.223
## satisfaction_level 0.010 0.007 0.014
## last_evaluation 1.918 1.065 3.463
## number_project 0.634 0.582 0.690
## average_monthly_hours 1.002 1.000 1.004
## time_spend_company 2.746 2.498 3.027
## work_accidentYes 0.228 0.165 0.311
## promotion_last_5yearsYes 0.360 0.129 0.872
## departmenthr 1.197 0.725 1.978
## departmentIT 1.185 0.750 1.875
## departmentmanagement 1.123 0.597 2.101
## departmentmarketing 1.130 0.685 1.866
## departmentproduct_mng 1.182 0.721 1.941
## departmentRandD 0.662 0.402 1.087
## departmentsales 1.212 0.823 1.793
## departmentsupport 1.266 0.840 1.916
## departmenttechnical 1.141 0.765 1.708
## salarylow 5.578 3.497 9.182
## salarymedium 3.851 2.404 6.364
# Feature importance based on absolute z-values
importance <- data.frame(
Variable = names(coef(model)[-1]), # Exclude intercept
Importance = abs(summary(model)$coefficients[-1, "z value"])
) %>%
arrange(desc(Importance))
# Plot feature importance
ggplot(importance, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_minimal() +
labs(title = "Feature Importance in Predicting Employee Attrition",
x = "Variables",
y = "Absolute Z-value")

# Investigate interaction effects
# Example interactions between satisfaction and other key variables
model_interactions <- glm(target ~ satisfaction_level * last_evaluation +
satisfaction_level * number_project +
satisfaction_level * average_monthly_hours +
time_spend_company + work_accident +
promotion_last_5years + department + salary,
data = train_data, family = "binomial")
# Compare models using AIC
aic_comparison <- AIC(model, model_interactions)
print(aic_comparison)
## df AIC
## model 19 3586.362
## model_interactions 22 3393.768
# Plot predicted probabilities vs satisfaction level for different number of projects
ggplot(train_data, aes(x = satisfaction_level, y = target)) +
geom_smooth(method = "glm", method.args = list(family = "binomial"), se = TRUE) +
facet_wrap(~number_project) +
theme_minimal() +
labs(title = "Predicted Attrition Probability vs Satisfaction by Number of Projects",
x = "Satisfaction Level",
y = "Predicted Probability of Leaving")
