library(tidyverse) # Data manipulation and visualization
library(caret) # For model training and evaluation
library(recipes) # Feature engineering
library(rsample) # Data splitting
library(randomForest) # Random Forest
library(xgboost) # XGBoost
library(glmnet) # Logistic Regression with regularization

# Load the dataset
data <- read.csv("data-clean.csv")
# Convert attrition to factor
data$attrition <- factor(data$attrition, levels = c("no", "yes"))
# Initial exploration
glimpse(data)

## Rows: 1,470
## Columns: 35
## $ attrition <fct> yes, no, yes, no, no, no, no, no, no, no, n…
## $ age <int> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35,…
## $ business_travel <chr> "travel_rarely", "travel_frequently", "trav…
## $ daily_rate <int> 1102, 279, 1373, 1392, 591, 1005, 1324, 135…
## $ department <chr> "sales", "research_development", "research_…
## $ distance_from_home <int> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26…
## $ education <int> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3…
## $ education_field <chr> "life_sciences", "life_sciences", "other", …
## $ employee_count <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ employee_number <int> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 1…
## $ environment_satisfaction <int> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3…
## $ gender <chr> "female", "male", "male", "female", "male",…
## $ hourly_rate <int> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84,…
## $ job_involvement <int> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2…
## $ job_level <int> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1…
## $ job_role <chr> "sales_executive", "research_scientist", "l…
## $ job_satisfaction <int> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3…
## $ marital_status <chr> "single", "married", "single", "married", "…
## $ monthly_income <int> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 2…
## $ monthly_rate <int> 19479, 24907, 2396, 23159, 16632, 11864, 99…
## $ num_companies_worked <int> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5…
## $ over_18 <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y"…
## $ over_time <chr> "yes", "no", "yes", "yes", "no", "no", "yes…
## $ percent_salary_hike <int> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13,…
## $ performance_rating <int> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3…
## $ relationship_satisfaction <int> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2…
## $ standard_hours <int> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,…
## $ stock_option_level <int> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0…
## $ total_working_years <int> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5,…
## $ training_times_last_year <int> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4…
## $ work_life_balance <int> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3…
## $ years_at_company <int> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, …
## $ years_in_current_role <int> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2…
## $ years_since_last_promotion <int> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0…
## $ years_with_curr_manager <int> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3…
##
##   no  yes
## 83.9 16.1
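
The percentages above give the class balance of the outcome (roughly 84% of employees stayed, 16% left), which matters for interpreting the accuracy figures later on. The call that produced this output is not shown in the chunk; a one-liner along the following lines (an assumption, not necessarily the original code) reproduces it.

# Assumed reconstruction: class balance of attrition, in percent
round(prop.table(table(data$attrition)) * 100, 1)
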
# Set seed for reproducibility
set.seed(123)
# Create data split
split <- initial_split(data, prop = 0.8, strata = "attrition")
train_data <- training(split)
test_data <- testing(split)
# Create recipe for preprocessing
preprocess_recipe <- recipe(attrition ~ ., data = train_data) %>%
  step_zv(all_predictors()) %>%                           # drop zero-variance columns (e.g. employee_count, standard_hours)
  step_string2factor(all_nominal(), -all_outcomes()) %>%  # character predictors -> factors
  step_dummy(all_nominal(), -all_outcomes()) %>%          # dummy-encode categorical predictors
  step_normalize(all_numeric(), -all_outcomes())          # center and scale numeric predictors (including the new dummies)
# Prepare recipe
prep_recipe <- prep(preprocess_recipe)
# Apply recipe
train_processed <- bake(prep_recipe, new_data = train_data)
test_processed <- bake(prep_recipe, new_data = test_data)

# Train Random Forest model
rf_model <- randomForest(attrition ~ .,
                         data = train_processed,
                         ntree = 500,
                         importance = TRUE)
# Make predictions
rf_pred <- predict(rf_model, test_processed)
rf_prob <- predict(rf_model, test_processed, type = "prob")
# Calculate metrics (caret uses the first factor level, "no", as the positive class by default)
rf_metrics <- confusionMatrix(rf_pred, test_processed$attrition)
# Plot variable importance
varImpPlot(rf_model, n.var = 10,
           main = "Random Forest - Variable Importance")

##
## Random Forest Performance Metrics:
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.508475e-01 1.759777e-01 8.049881e-01 8.894873e-01 8.372881e-01
## AccuracyPValue McnemarPValue
## 2.947528e-01 4.115792e-09
## Sensitivity Specificity Pos Pred Value
## 0.9919028 0.1250000 0.8536585
## Neg Pred Value Precision Recall
## 0.7500000 0.8536585 0.9919028
## F1 Prevalence Detection Rate
## 0.9176030 0.8372881 0.8305085
## Detection Prevalence Balanced Accuracy
## 0.9728814 0.5584514
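
The block above prints the overall and byClass components of the caret confusion matrix; the same layout appears again for XGBoost and the elastic net below. The printing code itself is not shown; an assumed reconstruction is:

# Assumed reconstruction of the metrics printout above
cat("\nRandom Forest Performance Metrics:\n")
print(rf_metrics$overall)
print(rf_metrics$byClass)
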
# Prepare data for XGBoost
feature_names <- setdiff(names(train_processed), "attrition")
train_matrix <- xgb.DMatrix(
  data = as.matrix(select(train_processed, -attrition)),
  label = as.numeric(train_processed$attrition) - 1   # "no" -> 0, "yes" -> 1
)
test_matrix <- xgb.DMatrix(
  data = as.matrix(select(test_processed, -attrition)),
  label = as.numeric(test_processed$attrition) - 1
)
# Set parameters
xgb_params <- list(
  objective = "binary:logistic",   # binary classification with probability output
  eval_metric = "logloss",
  eta = 0.1,                       # learning rate
  max_depth = 6,
  min_child_weight = 1,
  subsample = 0.8,                 # row subsampling per tree
  colsample_bytree = 0.8           # column subsampling per tree
)
# Train model
xgb_model <- xgb.train(
  params = xgb_params,
  data = train_matrix,
  nrounds = 100,
  watchlist = list(train = train_matrix, test = test_matrix),
  early_stopping_rounds = 10,
  print_every_n = 10
)

## [1] train-logloss:0.632113 test-logloss:0.641114
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 10 rounds.
##
## [11] train-logloss:0.344998 test-logloss:0.410331
## [21] train-logloss:0.238855 test-logloss:0.351184
## [31] train-logloss:0.181010 test-logloss:0.332974
## [41] train-logloss:0.143966 test-logloss:0.325438
## [51] train-logloss:0.117521 test-logloss:0.318112
## [61] train-logloss:0.096598 test-logloss:0.310993
## [71] train-logloss:0.080359 test-logloss:0.309603
## [81] train-logloss:0.068945 test-logloss:0.308440
## [91] train-logloss:0.059899 test-logloss:0.311390
## Stopping. Best iteration:
## [81] train-logloss:0.068945 test-logloss:0.308440
# Make predictions
xgb_prob <- predict(xgb_model, test_matrix)
xgb_pred <- factor(ifelse(xgb_prob > 0.5, "yes", "no"),
                   levels = c("no", "yes"))
# Calculate metrics
xgb_metrics <- confusionMatrix(xgb_pred, test_processed$attrition)
# Calculate and plot feature importance manually
importance_matrix <- xgb.importance(feature_names = feature_names, model = xgb_model)
if (nrow(importance_matrix) > 0) {
  imp_plot <- xgb.plot.importance(importance_matrix, top_n = 10)
  print(imp_plot)
}

##                      Feature       Gain      Cover  Frequency Importance
##                       <char>      <num>      <num>      <num>      <num>
##  1:           monthly_income 0.10762907 0.10212401 0.08271299 0.10762907
##  2:                      age 0.06576634 0.05702770 0.06575682 0.06576634
##  3:            over_time_yes 0.06288124 0.07910842 0.02191894 0.06288124
##  4:          employee_number 0.05866062 0.05619162 0.07775021 0.05866062
##  5:               daily_rate 0.05695199 0.04785793 0.06947891 0.05695199
##  6:             monthly_rate 0.05404753 0.03841332 0.06658395 0.05404753
##  7:      total_working_years 0.04760077 0.04930366 0.03887510 0.04760077
##  8:       distance_from_home 0.03999844 0.03168319 0.04383788 0.03999844
##  9:              hourly_rate 0.03815221 0.02759193 0.05210918 0.03815221
## 10: environment_satisfaction 0.03251122 0.03784205 0.02977667 0.03251122
##
## XGBoost Performance Metrics:
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.610169e-01 3.105512e-01 8.162114e-01 8.983827e-01 8.372881e-01
## AccuracyPValue McnemarPValue
## 1.522677e-01 2.796691e-06
## Sensitivity Specificity Pos Pred Value
## 0.9797571 0.2500000 0.8705036
## Neg Pred Value Precision Recall
## 0.7058824 0.8705036 0.9797571
## F1 Prevalence Detection Rate
## 0.9219048 0.8372881 0.8203390
## Detection Prevalence Balanced Accuracy
## 0.9423729 0.6148785
# Prepare matrix format
x_train <- as.matrix(select(train_processed, -attrition))
y_train <- as.numeric(train_processed$attrition) - 1
x_test <- as.matrix(select(test_processed, -attrition))
# Train model with cross-validation
cv_glmnet <- cv.glmnet(x_train, y_train,
                       family = "binomial",
                       alpha = 0.5)   # alpha = 0.5 mixes lasso and ridge penalties (elastic net)
# Make predictions
glmnet_prob <- predict(cv_glmnet, x_test, type = "response")  # defaults to s = "lambda.1se"
glmnet_pred <- factor(ifelse(glmnet_prob > 0.5, "yes", "no"),
                      levels = c("no", "yes"))
# Calculate metrics
glmnet_metrics <- confusionMatrix(glmnet_pred, test_processed$attrition)
# Get non-zero coefficients
coef_df <- data.frame(
  feature = rownames(coef(cv_glmnet)),
  coefficient = as.vector(coef(cv_glmnet))
) %>%
  filter(coefficient != 0) %>%
  mutate(abs_coef = abs(coefficient)) %>%
  arrange(desc(abs_coef)) %>%
  head(10)
# Plot coefficients
ggplot(coef_df, aes(x = reorder(feature, abs_coef), y = coefficient)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 10 Important Features (Logistic Regression)",
       x = "Feature",
       y = "Coefficient")

##
## Logistic Regression Performance Metrics:
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.745763e-01 3.482415e-01 8.312847e-01 9.101312e-01 8.372881e-01
## AccuracyPValue McnemarPValue
## 4.536459e-02 2.276384e-08
## Sensitivity Specificity Pos Pred Value
## 0.9959514 0.2500000 0.8723404
## Neg Pred Value Precision Recall
## 0.9230769 0.8723404 0.9959514
## F1 Prevalence Detection Rate
## 0.9300567 0.8372881 0.8338983
## Detection Prevalence Balanced Accuracy
## 0.9559322 0.6229757
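
Note that predict() and coef() on a cv.glmnet object default to s = "lambda.1se", the most regularized model within one standard error of the minimum cross-validated deviance. If the less penalized fit is preferred, the lambda can be selected explicitly; the lines below are a sketch of that variant, not part of the original analysis.

# Hypothetical variant: score the test set and extract coefficients at lambda.min
glmnet_prob_min <- predict(cv_glmnet, x_test, s = "lambda.min", type = "response")
coef_min <- coef(cv_glmnet, s = "lambda.min")
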
# Create comparison dataframe
model_comparison <- data.frame(
  Model = c("Random Forest", "XGBoost", "Elastic Net"),
  Accuracy = c(rf_metrics$overall["Accuracy"],
               xgb_metrics$overall["Accuracy"],
               glmnet_metrics$overall["Accuracy"]),
  Sensitivity = c(rf_metrics$byClass["Sensitivity"],
                  xgb_metrics$byClass["Sensitivity"],
                  glmnet_metrics$byClass["Sensitivity"]),
  Specificity = c(rf_metrics$byClass["Specificity"],
                  xgb_metrics$byClass["Specificity"],
                  glmnet_metrics$byClass["Specificity"])
)
# Print comparison
print(knitr::kable(model_comparison, digits = 3))

##
##
## |Model | Accuracy| Sensitivity| Specificity|
## |:-------------|--------:|-----------:|-----------:|
## |Random Forest | 0.851| 0.992| 0.125|
## |XGBoost | 0.861| 0.980| 0.250|
## |Elastic Net | 0.875| 0.996| 0.250|
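
Because confusionMatrix() treated the first factor level ("no") as the positive class, Sensitivity in this table measures how well stayers are identified and Specificity how often actual leavers are caught, which is why all three models look strong on the former and weak on the latter. To report the same metrics from the attrition ("yes") point of view, the confusion matrices could be recomputed with an explicit positive class; the lines below are a sketch, not part of the original analysis.

# Hypothetical variant: treat "yes" (attrition) as the positive class
rf_metrics_yes     <- confusionMatrix(rf_pred, test_processed$attrition, positive = "yes")
xgb_metrics_yes    <- confusionMatrix(xgb_pred, test_processed$attrition, positive = "yes")
glmnet_metrics_yes <- confusionMatrix(glmnet_pred, test_processed$attrition, positive = "yes")
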