library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ recipes 1.0.10
## ✔ dials 1.2.1 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:yardstick':
##
## spec
## The following object is masked from 'package:scales':
##
## col_factor
library(dplyr)
# Read the HR employee-attrition CSV into a tibble, letting readr guess the
# column types (it reports the guessed specification as a message).
attrition_raw_tbl <- read_csv(
  file = "~/Desktop/DAT3100_DataAnalytics/PSU_DAT3100/00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Structural overview of the raw data: one line per column showing its type
# and the first few values.
glimpse(attrition_raw_tbl)
## Rows: 1,470
## Columns: 35
## $ Age <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
## $ Attrition <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "…
## $ BusinessTravel <chr> "Travel_Rarely", "Travel_Frequently", "Travel…
## $ DailyRate <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
## $ Department <chr> "Sales", "Research & Development", "Research …
## $ DistanceFromHome <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
## $ Education <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, …
## $ EducationField <chr> "Life Sciences", "Life Sciences", "Other", "L…
## $ EmployeeCount <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EmployeeNumber <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16,…
## $ EnvironmentSatisfaction <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, …
## $ Gender <chr> "Female", "Male", "Male", "Female", "Male", "…
## $ HourlyRate <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
## $ JobInvolvement <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, …
## $ JobLevel <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
## $ JobRole <chr> "Sales Executive", "Research Scientist", "Lab…
## $ JobSatisfaction <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, …
## $ MaritalStatus <chr> "Single", "Married", "Single", "Married", "Ma…
## $ MonthlyIncome <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
## $ MonthlyRate <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
## $ NumCompaniesWorked <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
## $ Over18 <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ PercentSalaryHike <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
## $ PerformanceRating <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, …
## $ StandardHours <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ StockOptionLevel <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
## $ TotalWorkingYears <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
## $ TrainingTimesLastYear <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
## $ WorkLifeBalance <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, …
## $ YearsAtCompany <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
## $ YearsInCurrentRole <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
## $ YearsSinceLastPromotion <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
## $ YearsWithCurrManager <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …
Initialize H2O
# Start (or connect to) a local H2O cluster using the default settings. The
# H2O calls later in this script (AutoML, performance, predict) run against it.
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /var/folders/hx/57xnmk_52272gw1nc2jn7nbr0000gn/T//Rtmpwnoain/file41b19adbd09/h2o_max_started_from_r.out
## /var/folders/hx/57xnmk_52272gw1nc2jn7nbr0000gn/T//Rtmpwnoain/file41b5d55a70a/h2o_max_started_from_r.err
##
##
## Starting H2O JVM and connecting: ... Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 897 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 11 months and 13 days
## H2O cluster name: H2O_started_from_R_max_dua568
## H2O cluster total nodes: 1
## H2O cluster total memory: 4.00 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (11 months and 13 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
Data preprocessing
# Turn the target (Attrition) and the categorical predictors into factors so
# that they are treated as categories rather than free text downstream.
# Every other column is left exactly as it was read.
attrition_data <- attrition_raw_tbl %>%
  mutate(
    across(
      c(
        Attrition, OverTime, BusinessTravel, Department,
        EducationField, Gender, JobRole, MaritalStatus
      ),
      as.factor
    )
  )
Split the data into training and testing sets
# Fix the RNG state so the random split below is reproducible.
set.seed(123)

# 70% training / 30% testing, stratified on the target column.
attrition_split <- attrition_data %>%
  initial_split(prop = 0.7, strata = Attrition)

train_data <- attrition_split %>% training()
test_data <- attrition_split %>% testing()
Train an H2O model (example with AutoML)
# H2O trains and scores on H2OFrames, not on R data frames, so upload both
# partitions to the running cluster first. `train_h2o` is used by AutoML
# below; `test_h2o` is used later for evaluation and prediction.
train_h2o <- as.h2o(train_data)
test_h2o <- as.h2o(test_data)

# Run AutoML with every column except the target as a candidate predictor.
# `max_models` caps the number of base models trained; `seed` is passed for
# reproducibility.
h2o_model <- h2o.automl(
  x = setdiff(names(train_h2o), "Attrition"),
  y = "Attrition",
  training_frame = train_h2o,
  max_models = 10,
  seed = 123
)
## | | | 0% | |== | 3%
## 20:54:18.153: AutoML: XGBoost is not available; skipping it.
## 20:54:18.171: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]
## 20:54:19.523: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18] | |====== | 9%
## 20:54:21.664: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]
## 20:54:23.77: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]
## 20:54:24.119: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18] | |========== | 15%
## 20:54:25.4: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18]
## 20:54:25.850: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18] | |============== | 21%
## 20:54:27.344: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18] | |================ | 24%
## 20:54:29.99: _train param, Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18] | |=================== | 26%
## 20:54:32.358: _train param, Dropping unused columns: [StandardHours, EmployeeCount, Over18] | |=================================== | 50%
## 20:54:33.593: _train param, Dropping unused columns: [StandardHours, EmployeeCount, Over18] | |======================================================================| 100%
View leaderboard of models
# Pull the leaderboard slot out of the AutoML result and print it. The printed
# table lists one row per trained model with its metrics (auc, logloss, aucpr,
# mean_per_class_error, rmse, mse); only the first rows are shown by default.
h2o_leaderboard <- h2o_model@leaderboard
print(h2o_leaderboard)
## model_id auc logloss
## 1 StackedEnsemble_BestOfFamily_1_AutoML_1_20241203_205418 0.8386004 0.3146375
## 2 StackedEnsemble_AllModels_1_AutoML_1_20241203_205418 0.8365041 0.3131929
## 3 GBM_1_AutoML_1_20241203_205418 0.8333720 0.3250817
## 4 GLM_1_AutoML_1_20241203_205418 0.8295340 0.3270977
## 5 GBM_2_AutoML_1_20241203_205418 0.8111345 0.3396223
## 6 GBM_4_AutoML_1_20241203_205418 0.8001966 0.3460101
## aucpr mean_per_class_error rmse mse
## 1 0.6705876 0.2414902 0.2961479 0.08770360
## 2 0.6574466 0.2425155 0.2976782 0.08861232
## 3 0.6368047 0.2271885 0.3092049 0.09560770
## 4 0.6272810 0.2248710 0.3047531 0.09287443
## 5 0.5639653 0.2687208 0.3171214 0.10056596
## 6 0.5510131 0.2869026 0.3204873 0.10271212
##
## [12 rows x 7 columns]
Get the best model and evaluate
# The `leader` slot holds the top-ranked model from the AutoML run.
best_model <- h2o_model@leader
# Score the leader on `test_h2o`, which is expected to be the held-out test
# partition as an H2OFrame, so the metrics reflect unseen data.
perf <- h2o.performance(model = best_model, newdata = test_h2o)
# Print the full metrics report (error measures, AUC, confusion matrix and
# the per-metric optimal thresholds).
print(perf)
## H2OBinomialMetrics: stackedensemble
##
## MSE: 0.09361442
## RMSE: 0.3059647
## LogLoss: 0.3184334
## Mean Per-Class Error: 0.2506006
## AUC: 0.8432995
## AUCPR: 0.6115262
## Gini: 0.6865991
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## No Yes Error Rate
## No 349 21 0.056757 =21/370
## Yes 32 40 0.444444 =32/72
## Totals 381 61 0.119910 =53/442
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.375500 0.601504 59
## 2 max f2 0.165741 0.688073 143
## 3 max f0point5 0.420864 0.642857 51
## 4 max accuracy 0.420864 0.882353 51
## 5 max precision 0.975182 1.000000 0
## 6 max recall 0.006324 1.000000 378
## 7 max specificity 0.975182 1.000000 0
## 8 max absolute_mcc 0.375500 0.534029 59
## 9 max min_per_class_accuracy 0.178213 0.772973 135
## 10 max mean_per_class_accuracy 0.165741 0.797748 143
## 11 max tns 0.975182 370.000000 0
## 12 max fns 0.975182 71.000000 0
## 13 max fps 0.000706 370.000000 399
## 14 max tps 0.006324 72.000000 378
## 15 max tnr 0.975182 1.000000 0
## 16 max fnr 0.975182 0.986111 0
## 17 max fpr 0.000706 1.000000 399
## 18 max tpr 0.006324 1.000000 378
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Area under the ROC curve for the test-set performance object.
auc <- h2o.auc(perf)

# Called without `thresholds`, h2o.accuracy() returns a table with one row per
# candidate threshold (columns `threshold` and `accuracy`). Taking its first
# column yields the thresholds, not an accuracy, so select the `accuracy`
# column by name and report the best value over all thresholds.
accuracy <- max(h2o.accuracy(perf)[["accuracy"]])

# Confusion matrix at H2O's default (max-F1) threshold.
conf_matrix <- h2o.confusionMatrix(perf)

# Display metrics
cat("AUC:", auc, "\n")
## AUC: 0.8432995
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.8823529
print(conf_matrix)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.375499807925299:
## No Yes Error Rate
## No 349 21 0.056757 =21/370
## Yes 32 40 0.444444 =32/72
## Totals 381 61 0.119910 =53/442
Predict on the test data
# Score the test frame with the leader model. The result is an H2OFrame with
# the predicted class in `predict` and one probability column per class.
predictions <- h2o.predict(best_model, test_h2o)
## | | | 0% | |======================================================================| 100%
# Bring the two prediction columns back into R as plain vectors and attach
# them to the test tibble.
test_data$Predicted_Prob <- as.vector(predictions[, "Yes"]) # Probability of "Yes"
test_data$Predicted_Class <- as.vector(predictions[, "predict"]) # Predicted class

# Example output of predictions: actual label next to the model's output.
test_data %>%
  select(Attrition, Predicted_Prob, Predicted_Class) %>%
  head() %>%
  print()
## # A tibble: 6 × 3
## Attrition Predicted_Prob Predicted_Class
## <fct> <dbl> <chr>
## 1 Yes 0.341 No
## 2 No 0.0106 No
## 3 No 0.237 No
## 4 No 0.129 No
## 5 No 0.0729 No
## 6 No 0.0695 No
Plot variable importance
# The AutoML leader in this run is a stacked ensemble, which exposes no
# variable importances, so plotting it only produces a warning. Plot the
# importances of the best-ranked base (non-ensemble) model instead.
best_base_model <- h2o.get_best_model(h2o_model, algorithm = "basemodel")
h2o.varimp_plot(best_base_model)
Shut down H2O when done
# Stop the H2O cluster; `prompt = FALSE` skips the interactive confirmation so
# the script can finish unattended.
h2o.shutdown(prompt = FALSE)