# Load the raw employee-attrition CSV (path is relative to this document)
attrition_raw_tbl <- read_csv(
  file = "../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect column names, types and leading values
# (apply slice(0) first if the values themselves are sensitive)
glimpse(attrition_raw_tbl)
## Rows: 1,470
## Columns: 35
## $ Age <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
## $ Attrition <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "…
## $ BusinessTravel <chr> "Travel_Rarely", "Travel_Frequently", "Travel…
## $ DailyRate <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
## $ Department <chr> "Sales", "Research & Development", "Research …
## $ DistanceFromHome <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
## $ Education <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, …
## $ EducationField <chr> "Life Sciences", "Life Sciences", "Other", "L…
## $ EmployeeCount <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EmployeeNumber <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16,…
## $ EnvironmentSatisfaction <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, …
## $ Gender <chr> "Female", "Male", "Male", "Female", "Male", "…
## $ HourlyRate <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
## $ JobInvolvement <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, …
## $ JobLevel <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
## $ JobRole <chr> "Sales Executive", "Research Scientist", "Lab…
## $ JobSatisfaction <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, …
## $ MaritalStatus <chr> "Single", "Married", "Single", "Married", "Ma…
## $ MonthlyIncome <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
## $ MonthlyRate <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
## $ NumCompaniesWorked <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
## $ Over18 <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ PercentSalaryHike <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
## $ PerformanceRating <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, …
## $ StandardHours <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ StockOptionLevel <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
## $ TotalWorkingYears <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
## $ TrainingTimesLastYear <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
## $ WorkLifeBalance <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, …
## $ YearsAtCompany <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
## $ YearsInCurrentRole <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
## $ YearsSinceLastPromotion <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
## $ YearsWithCurrManager <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …
# Connect to a local H2O cluster with default settings; per the log below,
# a new JVM is started when no cluster is already running.
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\rad1081\AppData\Local\Temp\RtmpQj6NQO\file54281a914d90/h2o_rad1081_started_from_r.out
## C:\Users\rad1081\AppData\Local\Temp\RtmpQj6NQO\file54282a2f6f2e/h2o_rad1081_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 815 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 14 days
## H2O cluster name: H2O_started_from_R_rad1081_rzo575
## H2O cluster total nodes: 1
## H2O cluster total memory: 7.91 GB
## H2O cluster total cores: 20
## H2O cluster allowed cores: 20
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.1 (2024-06-14 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Build the modelling table from the raw data.
# - Drop columns that carry no signal: EmployeeCount, StandardHours and Over18
#   appear constant in the glimpse output, and EmployeeNumber looks like a
#   row identifier (NOTE(review): confirm on the full data).
# - Convert every character column to a factor. `Attrition` is itself a
#   character column, so the single across() call also prepares the response;
#   no separate as.factor(Attrition) step is needed.
attrition_data <- attrition_raw_tbl %>%
  select(-EmployeeCount, -StandardHours, -Over18, -EmployeeNumber) %>%
  mutate(across(where(is.character), as.factor))
# Stratified 80/20 train/test split on the Attrition outcome.
# The seed is fixed first so the partition is reproducible.
set.seed(123)
attrition_split <- attrition_data %>%
  initial_split(prop = 0.8, strata = Attrition)
train_data <- attrition_split %>% training()
test_data <- attrition_split %>% testing()
# Convert the training split into an H2O frame for use by h2o.automl()
train_h2o <- as.h2o(train_data)
## | | | 0% | |======================================================================| 100%
# Convert the held-out test split into an H2O frame for scoring
test_h2o <- as.h2o(test_data)
## | | | 0% | |======================================================================| 100%
# Run AutoML on the training frame: every column other than the response is
# used as a predictor, at most 10 models are requested, and the seed is fixed.
response_col <- "Attrition"
predictor_cols <- setdiff(names(train_h2o), response_col)
h2o_model <- h2o.automl(
  x = predictor_cols,
  y = response_col,
  training_frame = train_h2o,
  max_models = 10,
  seed = 123
)
## | | | 0% | |== | 3%
## 18:19:59.208: AutoML: XGBoost is not available; skipping it. | |======== | 12% | |================================= | 47% | |======================================================================| 100%
# Pull the ranked model table out of the AutoML result and display it
h2o_leaderboard <- slot(h2o_model, "leaderboard")
print(x = h2o_leaderboard)
## model_id auc logloss
## 1 StackedEnsemble_BestOfFamily_1_AutoML_1_20250504_181959 0.8275245 0.3250953
## 2 GLM_1_AutoML_1_20250504_181959 0.8259549 0.3306738
## 3 StackedEnsemble_AllModels_1_AutoML_1_20250504_181959 0.8215386 0.3276695
## 4 GBM_1_AutoML_1_20250504_181959 0.8135055 0.3408358
## 5 GBM_grid_1_AutoML_1_20250504_181959_model_1 0.7948743 0.3513284
## 6 GBM_4_AutoML_1_20250504_181959 0.7910375 0.3558672
## aucpr mean_per_class_error rmse mse
## 1 0.6215654 0.2461900 0.3066131 0.09401159
## 2 0.6028195 0.2651486 0.3096123 0.09585977
## 3 0.6100406 0.2655457 0.3084137 0.09511902
## 4 0.5822298 0.2726451 0.3180395 0.10114912
## 5 0.5233784 0.2707079 0.3231920 0.10445309
## 6 0.5324434 0.2908765 0.3236651 0.10475911
##
## [12 rows x 7 columns]
# Take the AutoML leader model and score it on the held-out test frame
best_model <- slot(h2o_model, "leader")
perf <- h2o.performance(best_model, newdata = test_h2o)
# Show the full test-set metric summary
print(perf)
## H2OBinomialMetrics: stackedensemble
##
## MSE: 0.08215506
## RMSE: 0.286627
## LogLoss: 0.2740359
## Mean Per-Class Error: 0.1866144
## AUC: 0.905027
## AUCPR: 0.7086014
## Gini: 0.810054
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## No Yes Error Rate
## No 232 15 0.060729 =15/247
## Yes 15 33 0.312500 =15/48
## Totals 247 48 0.101695 =30/295
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.375222 0.687500 47
## 2 max f2 0.184726 0.731707 94
## 3 max f0point5 0.425336 0.703125 35
## 4 max accuracy 0.425336 0.898305 35
## 5 max precision 0.961614 1.000000 0
## 6 max recall 0.034062 1.000000 220
## 7 max specificity 0.961614 1.000000 0
## 8 max absolute_mcc 0.375222 0.626771 47
## 9 max min_per_class_accuracy 0.204262 0.813765 85
## 10 max mean_per_class_accuracy 0.184726 0.830213 94
## 11 max tns 0.961614 247.000000 0
## 12 max fns 0.961614 47.000000 0
## 13 max fps 0.001052 247.000000 294
## 14 max tps 0.034062 48.000000 220
## 15 max tnr 0.961614 1.000000 0
## 16 max fnr 0.961614 0.979167 0
## 17 max fpr 0.001052 1.000000 294
## 18 max tpr 0.034062 1.000000 220
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Extract headline test-set metrics from the performance object.
auc <- h2o.auc(perf)
conf_matrix <- h2o.confusionMatrix(perf)
# Called without `thresholds`, h2o.accuracy() returns a table with one row per
# candidate threshold (columns `threshold` and `accuracy`). Taking element 1
# of that table yields the thresholds themselves, not an accuracy, so report
# the best accuracy over all thresholds as a single number instead.
accuracy_tbl <- h2o.accuracy(perf)
accuracy <- max(accuracy_tbl[["accuracy"]])
# Display metrics
cat("AUC:", auc, "\n")
## AUC: 0.905027
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.8983051
print(conf_matrix)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.375222042567265:
## No Yes Error Rate
## No 232 15 0.060729 =15/247
## Yes 15 33 0.312500 =15/48
## Totals 247 48 0.101695 =30/295
# Score the test frame with the leader model
predictions <- h2o.predict(object = best_model, newdata = test_h2o)
## | | | 0% | |======================================================================| 100%
# Attach the predicted "Yes" probability and the predicted label to the
# local test set, pulling each H2O column back into R as a plain vector
test_data <- mutate(
  test_data,
  Predicted_Prob = as.vector(predictions[, "Yes"]),
  Predicted_Class = as.vector(predictions[, "predict"])
)
# Preview actual vs. predicted values for the first few rows
test_data %>%
  select(Attrition, Predicted_Prob, Predicted_Class) %>%
  head() %>%
  print()
## # A tibble: 6 × 3
## Attrition Predicted_Prob Predicted_Class
## <fct> <dbl> <chr>
## 1 No 0.0194 No
## 2 No 0.0883 No
## 3 No 0.0588 No
## 4 No 0.0398 No
## 5 No 0.256 No
## 6 No 0.00212 No
# The AutoML leader here is a stacked ensemble, which has no variable
# importances of its own (h2o.varimp_plot() only emits a warning for it).
# Plot importances for the best-ranked non-ensemble model from the same
# AutoML run instead.
varimp_model <- h2o.get_best_model(h2o_model, algorithm = "basemodel")
h2o.varimp_plot(varimp_model)
# Stop the H2O cluster without an interactive confirmation prompt
h2o.shutdown(prompt = FALSE)