library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.7      ✔ recipes      1.0.10
## ✔ dials        1.2.1      ✔ rsample      1.2.1 
## ✔ dplyr        1.1.4      ✔ tibble       3.2.1 
## ✔ ggplot2      3.5.1      ✔ tidyr        1.3.1 
## ✔ infer        1.0.7      ✔ tune         1.2.1 
## ✔ modeldata    1.4.0      ✔ workflows    1.1.4 
## ✔ parsnip      1.2.1      ✔ workflowsets 1.1.0 
## ✔ purrr        1.0.2      ✔ yardstick    1.3.1
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:yardstick':
## 
##     spec
## The following object is masked from 'package:scales':
## 
##     col_factor
library(dplyr)
# Load the raw HR employee-attrition CSV; column types are guessed by readr.
attrition_raw_tbl <- read_csv(
  file = "~/Desktop/DAT3100_DataAnalytics/PSU_DAT3100/00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick structural overview: one line per column with its type and first values.
glimpse(attrition_raw_tbl)
## Rows: 1,470
## Columns: 35
## $ Age                      <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
## $ Attrition                <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "…
## $ BusinessTravel           <chr> "Travel_Rarely", "Travel_Frequently", "Travel…
## $ DailyRate                <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
## $ Department               <chr> "Sales", "Research & Development", "Research …
## $ DistanceFromHome         <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
## $ Education                <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, …
## $ EducationField           <chr> "Life Sciences", "Life Sciences", "Other", "L…
## $ EmployeeCount            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EmployeeNumber           <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16,…
## $ EnvironmentSatisfaction  <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, …
## $ Gender                   <chr> "Female", "Male", "Male", "Female", "Male", "…
## $ HourlyRate               <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
## $ JobInvolvement           <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, …
## $ JobLevel                 <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
## $ JobRole                  <chr> "Sales Executive", "Research Scientist", "Lab…
## $ JobSatisfaction          <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, …
## $ MaritalStatus            <chr> "Single", "Married", "Single", "Married", "Ma…
## $ MonthlyIncome            <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
## $ MonthlyRate              <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
## $ NumCompaniesWorked       <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
## $ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime                 <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ PercentSalaryHike        <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
## $ PerformanceRating        <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, …
## $ StandardHours            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ StockOptionLevel         <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
## $ TotalWorkingYears        <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
## $ TrainingTimesLastYear    <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
## $ WorkLifeBalance          <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, …
## $ YearsAtCompany           <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
## $ YearsInCurrentRole       <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
## $ YearsSinceLastPromotion  <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
## $ YearsWithCurrManager     <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …

Initialize H2O

# Connect to a local H2O cluster, starting one if none is running yet.
h2o::h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /var/folders/hx/57xnmk_52272gw1nc2jn7nbr0000gn/T//Rtmpwnoain/file41b19adbd09/h2o_max_started_from_r.out
##     /var/folders/hx/57xnmk_52272gw1nc2jn7nbr0000gn/T//Rtmpwnoain/file41b5d55a70a/h2o_max_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: ... Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 897 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 13 days 
##     H2O cluster name:           H2O_started_from_R_max_dua568 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   4.00 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 13 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

Data preprocessing

# Text columns to treat as categorical. `Attrition` is the model target; the
# remaining names are categorical predictors.
categorical_cols <- c(
  "Attrition", "OverTime", "BusinessTravel", "Department",
  "EducationField", "Gender", "JobRole", "MaritalStatus"
)

# Convert each listed column to a factor in place; all other columns are
# passed through unchanged.
attrition_data <- attrition_raw_tbl %>%
  mutate(across(all_of(categorical_cols), as.factor))

Split the data into training and testing sets

# Fix the RNG state so the random split below is reproducible.
set.seed(123)

# 70/30 train/test split, stratified on the target so both partitions keep a
# similar share of "Yes"/"No" attrition cases.
attrition_split <- attrition_data %>%
  initial_split(prop = 0.7, strata = Attrition)

train_data <- attrition_split %>% training()
test_data <- attrition_split %>% testing()

Convert data to H2O format

# Upload the training partition to the H2O cluster as an H2OFrame.
train_h2o <- as.h2o(x = train_data)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Upload the held-out test partition to the H2O cluster as an H2OFrame.
test_h2o <- as.h2o(x = test_data)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

Train an H2O model (example with AutoML)

# Target column, and every other column of the training frame as a predictor.
response_col <- "Attrition"
predictor_cols <- setdiff(names(train_h2o), response_col)

# Run AutoML on the training frame. `max_models` bounds how many models are
# trained and `seed` fixes the run's random seed.
h2o_model <- h2o.automl(
  x = predictor_cols,
  y = response_col,
  training_frame = train_h2o,
  max_models = 10,
  seed = 123
)
##   |                                                                              |                                                                      |   0%  |                                                                              |==                                                                    |   3%
## 20:54:18.153: AutoML: XGBoost is not available; skipping it.
## 20:54:18.171: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]
## 20:54:19.523: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]  |                                                                              |======                                                                |   9%
## 20:54:21.664: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]
## 20:54:23.77: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]
## 20:54:24.119: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]  |                                                                              |==========                                                            |  15%
## 20:54:25.4: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]
## 20:54:25.850: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]  |                                                                              |==============                                                        |  21%
## 20:54:27.344: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]  |                                                                              |================                                                      |  24%
## 20:54:29.99: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]  |                                                                              |===================                                                   |  26%
## 20:54:32.358: _train param, Dropping unused columns: [StandardHours, EmployeeCount, Over18]  |                                                                              |===================================                                   |  50%
## 20:54:33.593: _train param, Dropping unused columns: [StandardHours, EmployeeCount, Over18]  |                                                                              |======================================================================| 100%

View leaderboard of models

# Pull the ranked table of trained models out of the AutoML result and show it.
h2o_leaderboard <- slot(h2o_model, "leaderboard")
print(h2o_leaderboard)
##                                                  model_id       auc   logloss
## 1 StackedEnsemble_BestOfFamily_1_AutoML_1_20241203_205418 0.8386004 0.3146375
## 2    StackedEnsemble_AllModels_1_AutoML_1_20241203_205418 0.8365041 0.3131929
## 3                          GBM_1_AutoML_1_20241203_205418 0.8333720 0.3250817
## 4                          GLM_1_AutoML_1_20241203_205418 0.8295340 0.3270977
## 5                          GBM_2_AutoML_1_20241203_205418 0.8111345 0.3396223
## 6                          GBM_4_AutoML_1_20241203_205418 0.8001966 0.3460101
##       aucpr mean_per_class_error      rmse        mse
## 1 0.6705876            0.2414902 0.2961479 0.08770360
## 2 0.6574466            0.2425155 0.2976782 0.08861232
## 3 0.6368047            0.2271885 0.3092049 0.09560770
## 4 0.6272810            0.2248710 0.3047531 0.09287443
## 5 0.5639653            0.2687208 0.3171214 0.10056596
## 6 0.5510131            0.2869026 0.3204873 0.10271212
## 
## [12 rows x 7 columns]

Get the best model and evaluate

# Take the top-ranked AutoML model and score it on the held-out test frame.
best_model <- slot(h2o_model, "leader")
perf <- h2o.performance(best_model, newdata = test_h2o)

# Show the full set of test-set metrics
print(perf)
## H2OBinomialMetrics: stackedensemble
## 
## MSE:  0.09361442
## RMSE:  0.3059647
## LogLoss:  0.3184334
## Mean Per-Class Error:  0.2506006
## AUC:  0.8432995
## AUCPR:  0.6115262
## Gini:  0.6865991
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         No Yes    Error     Rate
## No     349  21 0.056757  =21/370
## Yes     32  40 0.444444   =32/72
## Totals 381  61 0.119910  =53/442
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.375500   0.601504  59
## 2                       max f2  0.165741   0.688073 143
## 3                 max f0point5  0.420864   0.642857  51
## 4                 max accuracy  0.420864   0.882353  51
## 5                max precision  0.975182   1.000000   0
## 6                   max recall  0.006324   1.000000 378
## 7              max specificity  0.975182   1.000000   0
## 8             max absolute_mcc  0.375500   0.534029  59
## 9   max min_per_class_accuracy  0.178213   0.772973 135
## 10 max mean_per_class_accuracy  0.165741   0.797748 143
## 11                     max tns  0.975182 370.000000   0
## 12                     max fns  0.975182  71.000000   0
## 13                     max fps  0.000706 370.000000 399
## 14                     max tps  0.006324  72.000000 378
## 15                     max tnr  0.975182   1.000000   0
## 16                     max fnr  0.975182   0.986111   0
## 17                     max fpr  0.000706   1.000000 399
## 18                     max tpr  0.006324   1.000000 378
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Area under the ROC curve on the held-out test set.
auc <- h2o.auc(perf)

# h2o.accuracy() called without `thresholds` returns a table with one row per
# candidate threshold (columns `threshold` and `accuracy`). Indexing that
# table with [1] or [[1]] yields the *threshold* column rather than an
# accuracy value, so select the row with the highest accuracy and keep both
# numbers as scalars.
accuracy_tbl <- h2o.accuracy(perf)
best_row <- which.max(accuracy_tbl[["accuracy"]])
accuracy <- accuracy_tbl[["accuracy"]][best_row]
accuracy_threshold <- accuracy_tbl[["threshold"]][best_row]

# Confusion matrix at the threshold H2O picks by default (max F1, as shown in
# its printed header). This is not necessarily the max-accuracy threshold.
conf_matrix <- h2o.confusionMatrix(perf)

# Display metrics
cat("AUC:", auc, "\n")
## AUC: 0.8432995
cat("Accuracy:", accuracy, "at threshold", accuracy_threshold, "\n")
## Accuracy: 0.8823529 at threshold 0.420864
print(conf_matrix)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.375499807925299:
##         No Yes    Error     Rate
## No     349  21 0.056757  =21/370
## Yes     32  40 0.444444   =32/72
## Totals 381  61 0.119910  =53/442

Predict on the test data

# Score the test frame with the leader model; the result holds the predicted
# class plus one probability column per class.
predictions <- h2o.predict(object = best_model, newdata = test_h2o)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Bring the two prediction columns back from the H2O cluster as plain R
# vectors: the probability of the "Yes" class and the predicted label.
predicted_prob <- as.vector(predictions[, "Yes"])
predicted_class <- as.vector(predictions[, "predict"])

# Attach them to the local test set, row-aligned with the scored frame.
test_data <- mutate(
  test_data,
  Predicted_Prob = predicted_prob,
  Predicted_Class = predicted_class
)

# Example output of predictions
test_data %>%
  select(Attrition, Predicted_Prob, Predicted_Class) %>%
  head() %>%
  print()
## # A tibble: 6 × 3
##   Attrition Predicted_Prob Predicted_Class
##   <fct>              <dbl> <chr>          
## 1 Yes               0.341  No             
## 2 No                0.0106 No             
## 3 No                0.237  No             
## 4 No                0.129  No             
## 5 No                0.0729 No             
## 6 No                0.0695 No

Plot variable importance

# Stacked ensembles do not expose variable importances, so plotting the AutoML
# leader directly only produces a "doesn't have variable importances" warning
# when the leader is an ensemble. In that case fall back to the best-ranked
# non-ensemble (base) model from the same AutoML run.
varimp_model <- best_model
if (identical(best_model@algorithm, "stackedensemble")) {
  varimp_model <- h2o.get_best_model(h2o_model, algorithm = "basemodel")
}

h2o.varimp_plot(varimp_model)

Shutdown H2O when done

# Stop the H2O cluster without asking for interactive confirmation.
h2o::h2o.shutdown(prompt = FALSE)