Load dataset

# Read the raw HR employee-attrition CSV; readr guesses the column types
attrition_raw_tbl <- read_csv(
  file = "../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect column names, types and leading values (use slice(0) if sensitive)
glimpse(attrition_raw_tbl)
## Rows: 1,470
## Columns: 35
## $ Age                      <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
## $ Attrition                <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "…
## $ BusinessTravel           <chr> "Travel_Rarely", "Travel_Frequently", "Travel…
## $ DailyRate                <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
## $ Department               <chr> "Sales", "Research & Development", "Research …
## $ DistanceFromHome         <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
## $ Education                <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, …
## $ EducationField           <chr> "Life Sciences", "Life Sciences", "Other", "L…
## $ EmployeeCount            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EmployeeNumber           <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16,…
## $ EnvironmentSatisfaction  <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, …
## $ Gender                   <chr> "Female", "Male", "Male", "Female", "Male", "…
## $ HourlyRate               <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
## $ JobInvolvement           <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, …
## $ JobLevel                 <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
## $ JobRole                  <chr> "Sales Executive", "Research Scientist", "Lab…
## $ JobSatisfaction          <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, …
## $ MaritalStatus            <chr> "Single", "Married", "Single", "Married", "Ma…
## $ MonthlyIncome            <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
## $ MonthlyRate              <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
## $ NumCompaniesWorked       <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
## $ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime                 <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ PercentSalaryHike        <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
## $ PerformanceRating        <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, …
## $ StandardHours            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ StockOptionLevel         <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
## $ TotalWorkingYears        <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
## $ TrainingTimesLastYear    <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
## $ WorkLifeBalance          <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, …
## $ YearsAtCompany           <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
## $ YearsInCurrentRole       <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
## $ YearsSinceLastPromotion  <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
## $ YearsWithCurrManager     <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …

Initialize H2O

# Connect R to a local H2O cluster; per the startup log below, one is launched
# when none is running yet. All later h2o.* calls rely on this connection.
h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\rad1081\AppData\Local\Temp\RtmpQj6NQO\file54281a914d90/h2o_rad1081_started_from_r.out
##     C:\Users\rad1081\AppData\Local\Temp\RtmpQj6NQO\file54282a2f6f2e/h2o_rad1081_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 815 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year, 4 months and 14 days 
##     H2O cluster name:           H2O_started_from_R_rad1081_rzo575 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   7.91 GB 
##     H2O cluster total cores:    20 
##     H2O cluster allowed cores:  20 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.1 (2024-06-14 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year, 4 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

Clean and Preprocess Dataset

# Drop columns that carry no predictive signal (EmployeeCount, StandardHours
# and Over18 appear constant in the glimpse output; EmployeeNumber is a row
# identifier), then convert every character column to a factor. Attrition is
# itself a character column, so this also yields the factor target without a
# separate conversion step.
attrition_data <- attrition_raw_tbl %>%
  select(-c(EmployeeCount, StandardHours, Over18, EmployeeNumber)) %>%
  mutate(across(where(is.character), as.factor))

# Hold out 20% of the rows for testing, stratifying the split on Attrition
set.seed(123) # fixed seed so the split is reproducible
attrition_split <- attrition_data %>%
  initial_split(prop = 0.8, strata = Attrition)
train_data <- attrition_split %>% training()
test_data <- attrition_split %>% testing()

Convert data to H2O format

# Copy the training split into the H2O cluster so AutoML can train on it
train_h2o <- as.h2o(train_data)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Copy the held-out split into the H2O cluster for evaluation and prediction
test_h2o <- as.h2o(test_data)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

Train AutoML Model

# Run AutoML to predict Attrition from every other column of the training
# frame; the search is capped with max_models = 10 and seeded with 123.
response_col <- "Attrition"
predictor_cols <- setdiff(names(train_h2o), response_col)
h2o_model <- h2o.automl(
  x = predictor_cols,
  y = response_col,
  training_frame = train_h2o,
  max_models = 10,
  seed = 123
)
##   |                                                                              |                                                                      |   0%  |                                                                              |==                                                                    |   3%
## 18:19:59.208: AutoML: XGBoost is not available; skipping it.  |                                                                              |========                                                              |  12%  |                                                                              |=================================                                     |  47%  |                                                                              |======================================================================| 100%

View leaderboard of models

# Keep the AutoML leaderboard in its own object and print its leading rows
h2o_leaderboard <- h2o_model@leaderboard
h2o_leaderboard %>% print()
##                                                  model_id       auc   logloss
## 1 StackedEnsemble_BestOfFamily_1_AutoML_1_20250504_181959 0.8275245 0.3250953
## 2                          GLM_1_AutoML_1_20250504_181959 0.8259549 0.3306738
## 3    StackedEnsemble_AllModels_1_AutoML_1_20250504_181959 0.8215386 0.3276695
## 4                          GBM_1_AutoML_1_20250504_181959 0.8135055 0.3408358
## 5             GBM_grid_1_AutoML_1_20250504_181959_model_1 0.7948743 0.3513284
## 6                          GBM_4_AutoML_1_20250504_181959 0.7910375 0.3558672
##       aucpr mean_per_class_error      rmse        mse
## 1 0.6215654            0.2461900 0.3066131 0.09401159
## 2 0.6028195            0.2651486 0.3096123 0.09585977
## 3 0.6100406            0.2655457 0.3084137 0.09511902
## 4 0.5822298            0.2726451 0.3180395 0.10114912
## 5 0.5233784            0.2707079 0.3231920 0.10445309
## 6 0.5324434            0.2908765 0.3236651 0.10475911
## 
## [12 rows x 7 columns]

Evaluate Best Model

# Score the top-ranked AutoML model on the held-out test frame
best_model <- h2o_model@leader
perf <- best_model %>% h2o.performance(newdata = test_h2o)

# Show the full metrics report for the test set
perf %>% print()
## H2OBinomialMetrics: stackedensemble
## 
## MSE:  0.08215506
## RMSE:  0.286627
## LogLoss:  0.2740359
## Mean Per-Class Error:  0.1866144
## AUC:  0.905027
## AUCPR:  0.7086014
## Gini:  0.810054
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         No Yes    Error     Rate
## No     232  15 0.060729  =15/247
## Yes     15  33 0.312500   =15/48
## Totals 247  48 0.101695  =30/295
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.375222   0.687500  47
## 2                       max f2  0.184726   0.731707  94
## 3                 max f0point5  0.425336   0.703125  35
## 4                 max accuracy  0.425336   0.898305  35
## 5                max precision  0.961614   1.000000   0
## 6                   max recall  0.034062   1.000000 220
## 7              max specificity  0.961614   1.000000   0
## 8             max absolute_mcc  0.375222   0.626771  47
## 9   max min_per_class_accuracy  0.204262   0.813765  85
## 10 max mean_per_class_accuracy  0.184726   0.830213  94
## 11                     max tns  0.961614 247.000000   0
## 12                     max fns  0.961614  47.000000   0
## 13                     max fps  0.001052 247.000000 294
## 14                     max tps  0.034062  48.000000 220
## 15                     max tnr  0.961614   1.000000   0
## 16                     max fnr  0.961614   0.979167   0
## 17                     max fpr  0.001052   1.000000 294
## 18                     max tpr  0.034062   1.000000 220
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Pull headline metrics from the test-set performance object
auc <- h2o.auc(perf)
conf_matrix <- h2o.confusionMatrix(perf)

# h2o.accuracy() returns one row per candidate threshold (columns `threshold`
# and `accuracy`). Taking the first column therefore yields the thresholds,
# not the accuracy. Report the best accuracy across thresholds as one number.
accuracy <- max(h2o.accuracy(perf)[["accuracy"]])

# Display metrics
cat("AUC:", auc, "\n")
## AUC: 0.905027
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.8983051
# Show the confusion matrix that h2o.confusionMatrix(perf) produced for the
# test set (the header of the printed output states the threshold used)
print(conf_matrix)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.375222042567265:
##         No Yes    Error     Rate
## No     232  15 0.060729  =15/247
## Yes     15  33 0.312500   =15/48
## Totals 247  48 0.101695  =30/295

Predict on the Test Data

# Score every test row with the leader model
predictions <- h2o.predict(object = best_model, newdata = test_h2o)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Attach the predicted "Yes" probability and the predicted label to each test
# row, pulling both columns out of the H2O frame as plain R vectors
test_data <- mutate(
  test_data,
  Predicted_Prob = as.vector(predictions[, "Yes"]),
  Predicted_Class = as.vector(predictions[, "predict"])
)

# Preview actual vs. predicted values for the first rows
test_data %>%
  select(Attrition, Predicted_Prob, Predicted_Class) %>%
  head() %>%
  print()
## # A tibble: 6 × 3
##   Attrition Predicted_Prob Predicted_Class
##   <fct>              <dbl> <chr>          
## 1 No               0.0194  No             
## 2 No               0.0883  No             
## 3 No               0.0588  No             
## 4 No               0.0398  No             
## 5 No               0.256   No             
## 6 No               0.00212 No

Plot variable importance

# The AutoML leader here is a stacked ensemble, which exposes no variable
# importances (plotting it only raises a warning). Plot the best-ranked
# non-ensemble model from the same AutoML run instead.
best_base_model <- h2o.get_best_model(h2o_model, algorithm = "basemodel")
h2o.varimp_plot(best_base_model)

Shutdown H2O when done

# Stop the H2O cluster; prompt = FALSE skips the interactive confirmation so
# the script can finish unattended
h2o.shutdown(prompt = FALSE)