Loading Required Libraries

library(tidyverse)    # Data manipulation and visualization
library(caret)        # For model training and evaluation
library(recipes)      # Feature engineering
library(rsample)      # Data splitting
library(randomForest) # Random Forest
library(xgboost)      # XGBoost
library(glmnet)       # Logistic Regression with regularization

Data Loading and Initial Exploration

# Load the dataset
data <- read.csv("data-clean.csv")

# Convert attrition to factor
data$attrition <- factor(data$attrition, levels = c("no", "yes"))

# Initial exploration
glimpse(data)
## Rows: 1,470
## Columns: 35
## $ attrition                  <fct> yes, no, yes, no, no, no, no, no, no, no, n…
## $ age                        <int> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35,…
## $ business_travel            <chr> "travel_rarely", "travel_frequently", "trav…
## $ daily_rate                 <int> 1102, 279, 1373, 1392, 591, 1005, 1324, 135…
## $ department                 <chr> "sales", "research_development", "research_…
## $ distance_from_home         <int> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26…
## $ education                  <int> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3…
## $ education_field            <chr> "life_sciences", "life_sciences", "other", …
## $ employee_count             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ employee_number            <int> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 1…
## $ environment_satisfaction   <int> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3…
## $ gender                     <chr> "female", "male", "male", "female", "male",…
## $ hourly_rate                <int> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84,…
## $ job_involvement            <int> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2…
## $ job_level                  <int> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1…
## $ job_role                   <chr> "sales_executive", "research_scientist", "l…
## $ job_satisfaction           <int> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3…
## $ marital_status             <chr> "single", "married", "single", "married", "…
## $ monthly_income             <int> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 2…
## $ monthly_rate               <int> 19479, 24907, 2396, 23159, 16632, 11864, 99…
## $ num_companies_worked       <int> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5…
## $ over_18                    <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y"…
## $ over_time                  <chr> "yes", "no", "yes", "yes", "no", "no", "yes…
## $ percent_salary_hike        <int> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13,…
## $ performance_rating         <int> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3…
## $ relationship_satisfaction  <int> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2…
## $ standard_hours             <int> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,…
## $ stock_option_level         <int> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0…
## $ total_working_years        <int> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5,…
## $ training_times_last_year   <int> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4…
## $ work_life_balance          <int> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3…
## $ years_at_company           <int> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, …
## $ years_in_current_role      <int> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2…
## $ years_since_last_promotion <int> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0…
## $ years_with_curr_manager    <int> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3…
# Check class distribution
table(data$attrition) %>%
  prop.table() %>%
  round(3) * 100
## 
##   no  yes 
## 83.9 16.1
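With roughly 84% of employees staying, a classifier that always predicts "no" is already about 84% accurate (this baseline reappears later as AccuracyNull in the confusion matrices), so the accuracy figures below should be read against it. A one-line check, not part of the original workflow:

# No-information rate: accuracy of always predicting the majority class
max(prop.table(table(data$attrition)))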

Data Preprocessing

# Set seed for reproducibility
set.seed(123)

# Create data split
split <- initial_split(data, prop = 0.8, strata = "attrition")
train_data <- training(split)
test_data <- testing(split)

# Create recipe for preprocessing
preprocess_recipe <- recipe(attrition ~ ., data = train_data) %>%
  step_zv(all_predictors()) %>%                           # drop zero-variance columns (employee_count, over_18, standard_hours)
  step_string2factor(all_nominal(), -all_outcomes()) %>%  # character predictors -> factors
  step_dummy(all_nominal(), -all_outcomes()) %>%          # dummy-encode categorical predictors
  step_normalize(all_numeric(), -all_outcomes())          # center and scale numeric columns

# Prepare recipe
prep_recipe <- prep(preprocess_recipe)

# Apply recipe
train_processed <- bake(prep_recipe, new_data = train_data)
test_processed <- bake(prep_recipe, new_data = test_data)
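As a quick sanity check, not part of the original pipeline, it can be useful to confirm what the recipe actually did: which zero-variance columns were dropped and the shape of the baked data. A minimal sketch, assuming the tidy() helper re-exported by recipes:

# Columns removed by step_zv (expected: constant fields such as
# employee_count, over_18 and standard_hours seen in glimpse() above)
tidy(prep_recipe, number = 1)

# Dimensions of the processed training and test sets
dim(train_processed)
dim(test_processed)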

Model 1: Random Forest

# Train Random Forest model
rf_model <- randomForest(attrition ~ ., 
                        data = train_processed,
                        ntree = 500,
                        importance = TRUE)

# Make predictions
rf_pred <- predict(rf_model, test_processed)
rf_prob <- predict(rf_model, test_processed, type = "prob")

# Calculate metrics
rf_metrics <- confusionMatrix(rf_pred, test_processed$attrition)

# Plot variable importance
varImpPlot(rf_model, n.var = 10, 
           main = "Random Forest - Variable Importance")

# Print performance metrics
cat("\nRandom Forest Performance Metrics:\n")
## 
## Random Forest Performance Metrics:
print(rf_metrics$overall)
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.508475e-01   1.759777e-01   8.049881e-01   8.894873e-01   8.372881e-01 
## AccuracyPValue  McnemarPValue 
##   2.947528e-01   4.115792e-09
print(rf_metrics$byClass)
##          Sensitivity          Specificity       Pos Pred Value 
##            0.9919028            0.1250000            0.8536585 
##       Neg Pred Value            Precision               Recall 
##            0.7500000            0.8536585            0.9919028 
##                   F1           Prevalence       Detection Rate 
##            0.9176030            0.8372881            0.8305085 
## Detection Prevalence    Balanced Accuracy 
##            0.9728814            0.5584514
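The fitted forest also carries an out-of-bag (OOB) error estimate computed during training, which provides an internal check that never touches the test set. Printing the model object shows it; a quick inspection rather than part of the original write-up:

# OOB error rate and OOB confusion matrix estimated during training
print(rf_model)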

Model 2: XGBoost

# Prepare data for XGBoost
feature_names <- setdiff(names(train_processed), "attrition")

train_matrix <- xgb.DMatrix(
  data = as.matrix(select(train_processed, -attrition)),
  label = as.numeric(train_processed$attrition) - 1   # encode "no" as 0, "yes" as 1
)

test_matrix <- xgb.DMatrix(
  data = as.matrix(select(test_processed, -attrition)),
  label = as.numeric(test_processed$attrition) - 1
)


# Set parameters
xgb_params <- list(
  objective = "binary:logistic",   # binary classification with probability output
  eval_metric = "logloss",         # metric reported for the watchlist
  eta = 0.1,                       # learning rate
  max_depth = 6,                   # maximum tree depth
  min_child_weight = 1,            # minimum sum of instance weight in a leaf
  subsample = 0.8,                 # row subsampling per tree
  colsample_bytree = 0.8           # column subsampling per tree
)

# Train model
xgb_model <- xgb.train(
  params = xgb_params,
  data = train_matrix,
  nrounds = 100,
  watchlist = list(train = train_matrix, test = test_matrix),
  early_stopping_rounds = 10,
  print_every_n = 10
)
## [1]  train-logloss:0.632113  test-logloss:0.641114 
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 10 rounds.
## 
## [11] train-logloss:0.344998  test-logloss:0.410331 
## [21] train-logloss:0.238855  test-logloss:0.351184 
## [31] train-logloss:0.181010  test-logloss:0.332974 
## [41] train-logloss:0.143966  test-logloss:0.325438 
## [51] train-logloss:0.117521  test-logloss:0.318112 
## [61] train-logloss:0.096598  test-logloss:0.310993 
## [71] train-logloss:0.080359  test-logloss:0.309603 
## [81] train-logloss:0.068945  test-logloss:0.308440 
## [91] train-logloss:0.059899  test-logloss:0.311390 
## Stopping. Best iteration:
## [81] train-logloss:0.068945  test-logloss:0.308440
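Because early stopping was triggered, the booster records the winning round and its test log-loss. Assuming the fields populated by early stopping in this version of xgboost, they can be inspected directly:

# Best boosting round selected by early stopping and its test log-loss
xgb_model$best_iteration
xgb_model$best_score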
# Make predictions
xgb_prob <- predict(xgb_model, test_matrix)
xgb_pred <- factor(ifelse(xgb_prob > 0.5, "yes", "no"), 
                  levels = c("no", "yes"))

# Calculate metrics
xgb_metrics <- confusionMatrix(xgb_pred, test_processed$attrition)

# Calculate and plot feature importance manually
importance_matrix <- xgb.importance(feature_names = feature_names, model = xgb_model)
if(nrow(importance_matrix) > 0) {
  imp_plot <- xgb.plot.importance(importance_matrix, top_n = 10)
  print(imp_plot)
}

##                      Feature       Gain      Cover  Frequency Importance
##                       <char>      <num>      <num>      <num>      <num>
##  1:           monthly_income 0.10762907 0.10212401 0.08271299 0.10762907
##  2:                      age 0.06576634 0.05702770 0.06575682 0.06576634
##  3:            over_time_yes 0.06288124 0.07910842 0.02191894 0.06288124
##  4:          employee_number 0.05866062 0.05619162 0.07775021 0.05866062
##  5:               daily_rate 0.05695199 0.04785793 0.06947891 0.05695199
##  6:             monthly_rate 0.05404753 0.03841332 0.06658395 0.05404753
##  7:      total_working_years 0.04760077 0.04930366 0.03887510 0.04760077
##  8:       distance_from_home 0.03999844 0.03168319 0.04383788 0.03999844
##  9:              hourly_rate 0.03815221 0.02759193 0.05210918 0.03815221
## 10: environment_satisfaction 0.03251122 0.03784205 0.02977667 0.03251122
# Print performance metrics
cat("\nXGBoost Performance Metrics:\n")
## 
## XGBoost Performance Metrics:
print(xgb_metrics$overall)
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.610169e-01   3.105512e-01   8.162114e-01   8.983827e-01   8.372881e-01 
## AccuracyPValue  McnemarPValue 
##   1.522677e-01   2.796691e-06
print(xgb_metrics$byClass)
##          Sensitivity          Specificity       Pos Pred Value 
##            0.9797571            0.2500000            0.8705036 
##       Neg Pred Value            Precision               Recall 
##            0.7058824            0.8705036            0.9797571 
##                   F1           Prevalence       Detection Rate 
##            0.9219048            0.8372881            0.8203390 
## Detection Prevalence    Balanced Accuracy 
##            0.9423729            0.6148785
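Specificity at the default 0.5 cutoff is low, meaning most employees who actually leave are scored below 0.5. Since xgb_prob is already available, the cutoff can be lowered to trade some sensitivity for specificity; the sketch below re-evaluates the model at an illustrative 0.3 threshold (an arbitrary value, not tuned here):

# Re-threshold the predicted attrition probabilities at 0.3 instead of 0.5
xgb_pred_030 <- factor(ifelse(xgb_prob > 0.3, "yes", "no"),
                       levels = c("no", "yes"))
confusionMatrix(xgb_pred_030, test_processed$attrition)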

Model 3: Logistic Regression with Elastic Net

# Prepare matrix format
x_train <- as.matrix(select(train_processed, -attrition))
y_train <- as.numeric(train_processed$attrition) - 1
x_test <- as.matrix(select(test_processed, -attrition))

# Train model with cross-validation
cv_glmnet <- cv.glmnet(x_train, y_train, 
                       family = "binomial",
                       alpha = 0.5)

# Make predictions
glmnet_prob <- predict(cv_glmnet, x_test, type = "response")
glmnet_pred <- factor(ifelse(glmnet_prob > 0.5, "yes", "no"), 
                     levels = c("no", "yes"))

# Calculate metrics
glmnet_metrics <- confusionMatrix(glmnet_pred, test_processed$attrition)

# Get non-zero coefficients (coef() on a cv.glmnet object defaults to s = "lambda.1se")
coef_df <- data.frame(
  feature = rownames(coef(cv_glmnet)),
  coefficient = as.vector(coef(cv_glmnet))
) %>%
  filter(coefficient != 0) %>%
  mutate(abs_coef = abs(coefficient)) %>%
  arrange(desc(abs_coef)) %>%
  head(10)

# Plot coefficients
ggplot(coef_df, aes(x = reorder(feature, abs_coef), y = coefficient)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 10 Important Features (Logistic Regression)",
       x = "Feature",
       y = "Coefficient")

# Print performance metrics
cat("\nLogistic Regression Performance Metrics:\n")
## 
## Logistic Regression Performance Metrics:
print(glmnet_metrics$overall)
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.745763e-01   3.482415e-01   8.312847e-01   9.101312e-01   8.372881e-01 
## AccuracyPValue  McnemarPValue 
##   4.536459e-02   2.276384e-08
print(glmnet_metrics$byClass)
##          Sensitivity          Specificity       Pos Pred Value 
##            0.9959514            0.2500000            0.8723404 
##       Neg Pred Value            Precision               Recall 
##            0.9230769            0.8723404            0.9959514 
##                   F1           Prevalence       Detection Rate 
##            0.9300567            0.8372881            0.8338983 
## Detection Prevalence    Balanced Accuracy 
##            0.9559322            0.6229757
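Note that predict() and coef() on a cv.glmnet object default to the more heavily penalized lambda.1se solution. The selected penalties and the cross-validation curve can be inspected directly; a quick look, not part of the original analysis:

# Penalty minimizing CV deviance vs. the more regularized 1-SE choice
cv_glmnet$lambda.min
cv_glmnet$lambda.1se

# Cross-validation curve with both lambda values marked
plot(cv_glmnet)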

Model Comparison

# Create comparison dataframe
model_comparison <- data.frame(
  Model = c("Random Forest", "XGBoost", "Elastic Net"),
  Accuracy = c(rf_metrics$overall["Accuracy"],
               xgb_metrics$overall["Accuracy"],
               glmnet_metrics$overall["Accuracy"]),
  Sensitivity = c(rf_metrics$byClass["Sensitivity"],
                 xgb_metrics$byClass["Sensitivity"],
                 glmnet_metrics$byClass["Sensitivity"]),
  Specificity = c(rf_metrics$byClass["Specificity"],
                 xgb_metrics$byClass["Specificity"],
                 glmnet_metrics$byClass["Specificity"])
)

# Print comparison
print(knitr::kable(model_comparison, digits = 3))
## 
## 
## |Model         | Accuracy| Sensitivity| Specificity|
## |:-------------|--------:|-----------:|-----------:|
## |Random Forest |    0.851|       0.992|       0.125|
## |XGBoost       |    0.861|       0.980|       0.250|
## |Elastic Net   |    0.875|       0.996|       0.250|
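Given the class imbalance, a threshold-free metric is a useful complement to the table above. The sketch below adds ROC AUC for each model from the probabilities already computed; it assumes the pROC package, which is not loaded in the setup chunk:

library(pROC)   # assumed extra dependency for ROC/AUC

roc_rf     <- roc(test_processed$attrition, rf_prob[, "yes"])
roc_xgb    <- roc(test_processed$attrition, xgb_prob)
roc_glmnet <- roc(test_processed$attrition, as.vector(glmnet_prob))

# Append AUC to the comparison table
model_comparison$AUC <- c(auc(roc_rf), auc(roc_xgb), auc(roc_glmnet))
print(knitr::kable(model_comparison, digits = 3))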

Conclusions and Recommendations

  1. Model Performance:
    • All three models predict attrition with accuracy above the ~84% majority-class baseline, although the margin is modest
    • In this run the elastic-net logistic regression reached the highest accuracy (0.875), slightly ahead of XGBoost (0.861) and Random Forest (0.851)
    • Each model offers its own, complementary view of feature importance
  2. Key Findings:
    • Features that rank as important across all models deserve particular attention
    • Every model identifies employees who stay very reliably (sensitivity of 0.98 or higher), but specificity is low (0.125-0.250), so many employees who actually leave are missed
    • This sensitivity/specificity imbalance mirrors the skewed class distribution and suggests adjusting the decision threshold or rebalancing the training data
  3. Business Recommendations:
    • Focus on the factors identified as important by all three models
    • Implement an early-warning system based on the model predictions (see the sketch after this list)
    • Develop retention programs targeted at the main drivers of attrition
  4. Next Steps:
    • Monitor model performance regularly
    • Collect additional data to improve accuracy
    • Evaluate the impact of retention programs on the attrition rate
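As a rough illustration of the early-warning idea in the recommendations above, current employee records could be pushed through the same recipe and scored with one of the fitted models, flagging anyone above a chosen probability cutoff. The sketch assumes a hypothetical current_employees data frame with the same columns as the original dataset (any attrition values in it are ignored), reuses the elastic-net model, and treats the 0.5 cutoff as a placeholder to be tuned:

# current_employees is hypothetical: same columns as the original dataset
scored <- bake(prep_recipe, new_data = current_employees)
x_new  <- as.matrix(scored[, feature_names])

# Predicted attrition probability per employee (elastic-net model)
risk <- predict(cv_glmnet, newx = x_new, type = "response")

# Flag employees above a placeholder cutoff, highest risk first
flagged <- current_employees %>%
  mutate(attrition_risk = as.vector(risk)) %>%
  filter(attrition_risk > 0.5) %>%
  arrange(desc(attrition_risk))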