A Comparative Analysis of Random Forest and XGBoost for Employee Attrition Prediction

1. Loading Libraries: load randomForest, caret, ggplot2, e1071, and xgboost.

2. Sample HR Data Creation: generate 1,000 synthetic employee records with demographic, tenure, compensation, and satisfaction fields plus an Attrition label.

3. Data Preprocessing: convert the categorical variables (Gender, Department, Attrition) to factors.

4. Splitting Data into Training and Testing Sets: a stratified 70/30 split with createDataPartition().

5. Feature Selection: pick the eight predictors used by both models (EmployeeID is excluded).

6. Random Forest Model with Hyperparameter Tuning: 10-fold cross-validation over mtry values 2 through 8, with 1,000 trees.

7. XGBoost Model with Hyperparameter Tuning: the same cross-validation over a grid of max_depth, eta, gamma, colsample_bytree, and subsample.

8. Comparing Models: pool the resampling results with resamples() and compare accuracy and Kappa.

9. Model Evaluation: predict on the held-out test set and compute confusion matrices.

10. Feature Importance Analysis: extract and plot varImp() rankings for both models.

11. Prediction on New Data: assemble two hypothetical employee records with factor levels aligned to the training data.

12. Predicting Attrition for New Employees: score the new records with both tuned models.

13. Visualizing Predictions: plot Age against MonthlyIncome, colored by each model's predicted attrition.

In summary, this R code demonstrates how to build, tune, compare, and evaluate two machine learning models (Random Forest and XGBoost) for predicting employee attrition, and how to explore feature importance and visualize predictions for unseen data. Note that because this demo samples the Attrition labels at random, both models should hover near chance accuracy (the confusion matrices below confirm this); on real HR data, the same pipeline would reveal which model performs better and which factors genuinely drive attrition.

R Programming

# Load necessary libraries
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
library(ggplot2)
library(e1071)
library(xgboost)

# Sample HR data for demonstration
set.seed(123)
hr_data <- data.frame(
  EmployeeID = 1:1000,
  Age = sample(20:60, 1000, replace = TRUE),
  Gender = sample(c("Male", "Female"), 1000, replace = TRUE),
  Department = sample(c("Sales", "R&D", "HR"), 1000, replace = TRUE),
  YearsAtCompany = sample(0:30, 1000, replace = TRUE),
  MonthlyIncome = sample(2000:15000, 1000, replace = TRUE),
  PerformanceRating = sample(1:5, 1000, replace = TRUE),
  JobSatisfaction = sample(1:4, 1000, replace = TRUE),
  DistanceFromHome = sample(1:50, 1000, replace = TRUE),
  Attrition = sample(c("Yes", "No"), 1000, replace = TRUE)
)

# Converting categorical variables to factors
hr_data$Gender <- as.factor(hr_data$Gender)
hr_data$Department <- as.factor(hr_data$Department)
hr_data$Attrition <- as.factor(hr_data$Attrition)
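
# (Optional check, not in the original walkthrough) confirm the factor
# conversion and column types before modelling
str(hr_data)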

# Split the data into training and testing sets
set.seed(123)
trainIndex <- createDataPartition(hr_data$Attrition, p = 0.7, list = FALSE)
trainData <- hr_data[trainIndex, ]
testData <- hr_data[-trainIndex, ]
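
# (Optional check) createDataPartition() stratifies on Attrition, so the
# class proportions in both partitions should closely match the full data
prop.table(table(trainData$Attrition))
prop.table(table(testData$Attrition))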

# Feature selection: choose the predictors used by both models
# (EmployeeID is excluded as a pure identifier)
features <- c("Age", "Gender", "Department", "YearsAtCompany", "MonthlyIncome", "PerformanceRating", "JobSatisfaction", "DistanceFromHome")

# Random Forest with advanced hyperparameter tuning
control <- trainControl(method = "cv", number = 10)
tunegrid_rf <- expand.grid(mtry = 2:8)
set.seed(123)
rf_model <- train(Attrition ~ ., data = trainData[, c(features, "Attrition")],
                  method = "rf", trControl = control, tuneGrid = tunegrid_rf, ntree = 1000)

# XGBoost with hyperparameter tuning
tunegrid_xgb <- expand.grid(
  nrounds = 100, 
  max_depth = c(2, 4, 6), 
  eta = c(0.01, 0.1), 
  gamma = c(0, 1, 5), 
  colsample_bytree = c(0.5, 0.8, 1),
  min_child_weight = 1,
  subsample = c(0.6, 0.8, 1)
)
set.seed(123)
xgb_model <- train(Attrition ~ ., data = trainData[, c(features, "Attrition")],
                   method = "xgbTree", trControl = control, tuneGrid = tunegrid_xgb)

# Compare models
results <- resamples(list(rf = rf_model, xgb = xgb_model))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: rf, xgb 
## Number of resamples: 10 
## 
## Accuracy 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## rf  0.3478261 0.4767598 0.5071429 0.5002149 0.5409457 0.5915493    0
## xgb 0.5211268 0.5378882 0.5674044 0.5605657 0.5759557 0.6142857    0
## 
## Kappa 
##            Min.     1st Qu.      Median         Mean    3rd Qu.      Max. NA's
## rf  -0.31847134 -0.05052315 0.008662965 -0.004642655 0.08339932 0.1747495    0
## xgb  0.01043025  0.05290358 0.109011169  0.095950477 0.12706018 0.2038753    0
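# (Optional, not in the original walkthrough) a paired comparison of the
# resampling results; summary(diff()) reports the estimated difference in
# accuracy and Kappa between the models along with a p-value
summary(diff(results))
bwplot(results)
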
# Model evaluation: assess both models on the held-out test set
# Predictions on the test data
test_pred_rf <- predict(rf_model, testData[, features])
test_pred_xgb <- predict(xgb_model, testData[, features])

# Confusion matrix
conf_matrix_rf <- confusionMatrix(test_pred_rf, testData$Attrition)
conf_matrix_xgb <- confusionMatrix(test_pred_xgb, testData$Attrition)

# Print model evaluation metrics
print(conf_matrix_rf)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  63  73
##        Yes 79  84
##                                           
##                Accuracy : 0.4916          
##                  95% CI : (0.4336, 0.5498)
##     No Information Rate : 0.5251          
##     P-Value [Acc > NIR] : 0.8879          
##                                           
##                   Kappa : -0.0213         
##                                           
##  Mcnemar's Test P-Value : 0.6851          
##                                           
##             Sensitivity : 0.4437          
##             Specificity : 0.5350          
##          Pos Pred Value : 0.4632          
##          Neg Pred Value : 0.5153          
##              Prevalence : 0.4749          
##          Detection Rate : 0.2107          
##    Detection Prevalence : 0.4548          
##       Balanced Accuracy : 0.4893          
##                                           
##        'Positive' Class : No              
## 
print(conf_matrix_xgb)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No   38  41
##        Yes 104 116
##                                          
##                Accuracy : 0.5151         
##                  95% CI : (0.4568, 0.573)
##     No Information Rate : 0.5251         
##     P-Value [Acc > NIR] : 0.6576         
##                                          
##                   Kappa : 0.0066         
##                                          
##  Mcnemar's Test P-Value : 2.621e-07      
##                                          
##             Sensitivity : 0.2676         
##             Specificity : 0.7389         
##          Pos Pred Value : 0.4810         
##          Neg Pred Value : 0.5273         
##              Prevalence : 0.4749         
##          Detection Rate : 0.1271         
##    Detection Prevalence : 0.2642         
##       Balanced Accuracy : 0.5032         
##                                          
##        'Positive' Class : No             
## 
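# (Optional AUC comparison; assumes the pROC package is installed, which
# the walkthrough above does not load) hard Yes/No labels hide how
# confident each model is, so compare the ranked class probabilities too
library(pROC)
prob_rf <- predict(rf_model, testData[, features], type = "prob")
prob_xgb <- predict(xgb_model, testData[, features], type = "prob")
auc(roc(testData$Attrition, prob_rf$Yes, levels = c("No", "Yes")))
auc(roc(testData$Attrition, prob_xgb$Yes, levels = c("No", "Yes")))
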
# Feature importance from Random Forest
importance_rf <- varImp(rf_model, scale = FALSE)
importance_df_rf <- as.data.frame(importance_rf$importance)

# Feature importance from XGBoost
importance_xgb <- varImp(xgb_model, scale = FALSE)
importance_df_xgb <- as.data.frame(importance_xgb$importance)

# Plot feature importance for Random Forest
ggplot(importance_df_rf, aes(x = reorder(rownames(importance_df_rf), Overall), y = Overall)) +
  geom_col() +
  coord_flip() +
  ggtitle("Feature Importance (Random Forest)") +
  xlab("Feature") +
  ylab("Importance (Mean Decrease in Gini)")

# Plot feature importance for XGBoost
ggplot(importance_df_xgb, aes(x = reorder(rownames(importance_df_xgb), Overall), y = Overall)) +
  geom_col() +
  coord_flip() +
  ggtitle("Feature Importance (XGBoost)") +
  xlab("Feature") +
  ylab("Importance (Gain)")

# Construct new/unseen employee records for prediction
new_data <- data.frame(
  Age = c(30, 45),
  Gender = c("Female", "Male"),
  Department = c("R&D", "Sales"),
  YearsAtCompany = c(3, 10),
  MonthlyIncome = c(5000, 12000),
  PerformanceRating = c(4, 2),
  JobSatisfaction = c(3, 2),
  DistanceFromHome = c(10, 20)
)

# Aligning factor levels in new data with training data
new_data$Gender <- factor(new_data$Gender, levels = levels(trainData$Gender))
new_data$Department <- factor(new_data$Department, levels = levels(trainData$Department))

# Predictions on the new data
new_pred_rf <- predict(rf_model, new_data)
new_pred_xgb <- predict(xgb_model, new_data)

# Print predictions
print(new_pred_rf)
## [1] No  Yes
## Levels: No Yes
print(new_pred_xgb)
## [1] No  Yes
## Levels: No Yes
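# (Optional) class probabilities give a more nuanced attrition risk score
# than the hard labels printed above
predict(rf_model, new_data, type = "prob")
predict(xgb_model, new_data, type = "prob")
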
# Visualize predictions for new data
new_data$AttritionPredictionRF <- new_pred_rf
new_data$AttritionPredictionXGB <- new_pred_xgb

ggplot(new_data, aes(x = Age, y = MonthlyIncome, color = AttritionPredictionRF)) +
  geom_point(size = 4) +
  ggtitle("Attrition Predictions (Random Forest) for New Employees") +
  xlab("Age") +
  ylab("Monthly Income") +
  theme_minimal()

ggplot(new_data, aes(x = Age, y = MonthlyIncome, color = AttritionPredictionXGB)) +
  geom_point(size = 4) +
  ggtitle("Attrition Predictions (XGBoost) for New Employees") +
  xlab("Age") +
  ylab("Monthly Income") +
  theme_minimal()
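
# (Optional, illustrative file name) persist the preferred model so it can
# be reloaded later with readRDS() and used for scoring without retraining
saveRDS(xgb_model, "attrition_xgb_model.rds")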