Load Data
# Read the raw HR attrition CSV; column types are guessed by read_csv().
attrition_csv_path <- "../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv"
attrition_raw_tbl <- read_csv(attrition_csv_path)
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Split the Data
# Fix the RNG seed so the split and the subsample below are reproducible.
set.seed(123)

# 80/20 train/test split, stratified on the Attrition column.
split <- attrition_raw_tbl %>%
  initial_split(prop = 0.8, strata = "Attrition")
test_data <- testing(split)

# Optional: keep a random half of the training rows for faster prototyping.
train_data <- sample_frac(training(split), 0.5)
Preprocess with Tidymodels
# Preprocessing recipe: Attrition is the outcome, every other column a predictor.
attrition_recipe <- recipe(Attrition ~ ., data = train_data)
attrition_recipe <- attrition_recipe %>%
  # Drop three columns before any encoding or scaling happens.
  step_rm(Over18, EmployeeCount, StandardHours) %>%
  # NOTE(review): all_nominal() also selects the outcome, so Attrition itself
  # is one-hot encoded into Attrition_No / Attrition_Yes. Downstream code relies
  # on the Attrition_Yes column, and must keep Attrition_No out of the predictors.
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  # Standardise numeric columns (centre, then scale), excluding columns that
  # still carry the outcome role.
  # NOTE(review): the Attrition_* dummies may no longer carry that role after
  # step_dummy() - confirm whether they get centred/scaled too.
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes())
# Estimate the recipe's parameters from the training data only.
attrition_recipe_prep <- attrition_recipe %>%
  prep(training = train_data)

# Apply the trained recipe to both partitions.
train_data_processed <- attrition_recipe_prep %>% bake(new_data = train_data)
test_data_processed <- attrition_recipe_prep %>% bake(new_data = test_data)
Train Classification Model with H2O AutoML
# Start a local H2O cluster with default settings; the startup log and cluster
# details are printed below.
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\adamc\AppData\Local\Temp\RtmpqChng8\file606c1563524/h2o_adamc_started_from_r.out
## C:\Users\adamc\AppData\Local\Temp\RtmpqChng8\file606c2fb739f5/h2o_adamc_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 4 seconds 20 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 9 days
## H2O cluster name: H2O_started_from_R_adamc_pcs264
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.47 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.1 (2024-06-14 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 9 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert the baked train and test data frames to H2OFrames so they can be
# passed to H2O for training and scoring.
train_h2o <- as.h2o(train_data_processed)
## | | | 0% | |======================================================================| 100%
test_h2o <- as.h2o(test_data_processed)
## | | | 0% | |======================================================================| 100%
# Define predictors and response.
# The recipe one-hot encodes every nominal column, including the outcome, so
# the processed data holds both `Attrition_Yes` and its exact complement
# `Attrition_No`. Every `Attrition_*` column is kept out of the predictors;
# otherwise the model just reads the answer off `Attrition_No` (target leakage).
response <- "Attrition_Yes"
outcome_columns <- grep(
  "^Attrition_",
  names(train_data_processed),
  value = TRUE
)
predictors <- setdiff(names(train_data_processed), c(response, outcome_columns))

# H2O chooses regression vs. classification from the response type, so the
# response must be a factor for a classification model. Thresholding at 0
# recovers the No/Yes indicator (levels "0"/"1") whether the column is still a
# raw 0/1 dummy or was centred and scaled by the recipe: "No" rows are <= 0 and
# "Yes" rows are > 0 in both cases.
train_h2o[[response]] <- h2o.asfactor(train_h2o[[response]] > 0)
test_h2o[[response]] <- h2o.asfactor(test_h2o[[response]] > 0)

# Train AutoML classification model (small time/model budget for prototyping).
automl <- h2o.automl(
  x = predictors,
  y = response,
  training_frame = train_h2o,
  max_runtime_secs = 30,
  max_models = 5,
  exclude_algos = c("StackedEnsemble"),
  seed = 123
)
## | | | 0% | |=== | 4%
## 16:05:56.962: AutoML: XGBoost is not available; skipping it. | |============= | 18% | |=================== | 27% | |======================================================================| 100%
Predict on the Test Set with the Best Model
# Score the held-out test frame with the AutoML leader, then keep only the
# `predict` column of the returned frame.
scored_test <- h2o.predict(automl@leader, newdata = test_h2o)
predictions <- scored_test$predict
## | | | 0% | |======================================================================| 100%
Optional: Inspect the Best Model
# Print the leader model's details: model summary, training and
# cross-validation metrics, scoring history, and variable importances.
summary(automl@leader)
## Model Details:
## ==============
##
## H2ORegressionModel: glm
## Model Key: GLM_1_AutoML_1_20250430_160556
## GLM Model: summary
## family link regularization
## 1 gaussian identity Ridge ( lambda = 0.009983 )
## lambda_search
## 1 nlambda = 30, lambda.max = 99.83, lambda.min = 0.009983, lambda.1se = 0.009983
## number_of_predictors_total number_of_active_predictors number_of_iterations
## 1 53 53 30
## training_frame
## 1 AutoML_1_20250430_160556_training_train_data_processed_sid_b358_1
##
## H2ORegressionMetrics: glm
## ** Reported on training data. **
##
## MSE: 0.0001286205
## RMSE: 0.0113411
## MAE: 0.008406775
## RMSLE: 0.01152495
## Mean Residual Deviance : 0.0001286205
## R^2 : 0.9998712
## Null Deviance :587
## Null D.o.F. :587
## Residual Deviance :0.07562884
## Residual D.o.F. :534
## AIC :-3489.011
##
##
##
## H2ORegressionMetrics: glm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.0002617287
## RMSE: 0.01617803
## MAE: 0.011901
## RMSLE: 0.01652345
## Mean Residual Deviance : 0.0002617287
## R^2 : 0.9997378
## Null Deviance :590.8731
## Null D.o.F. :587
## Residual Deviance :0.1538964
## Residual D.o.F. :534
## AIC :-3071.271
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## mae 0.011903 0.000737 0.011863 0.011444 0.011489
## mean_residual_deviance 0.000262 0.000032 0.000247 0.000267 0.000232
## mse 0.000262 0.000032 0.000247 0.000267 0.000232
## null_deviance 118.174614 26.120630 99.503430 124.715546 85.238010
## r2 0.999727 0.000045 0.999703 0.999747 0.999661
## residual_deviance 0.030779 0.003714 0.029177 0.031482 0.027371
## rmse 0.016157 0.000976 0.015725 0.016334 0.015230
## rmsle 0.016395 0.002266 0.018804 0.013793 0.018740
## cv_4_valid cv_5_valid
## mae 0.011529 0.013187
## mean_residual_deviance 0.000248 0.000315
## mse 0.000248 0.000315
## null_deviance 129.812580 151.603490
## r2 0.999776 0.999750
## residual_deviance 0.028983 0.036884
## rmse 0.015739 0.017755
## rmsle 0.015052 0.015584
##
## Scoring History:
## timestamp duration iteration lambda predictors deviance_train
## 1 2025-04-30 16:05:57 0.000 sec 1 .1E3 54 0.970
## 2 2025-04-30 16:05:57 0.000 sec 2 .73E2 54 0.960
## 3 2025-04-30 16:05:57 0.000 sec 3 .53E2 54 0.947
## 4 2025-04-30 16:05:57 0.000 sec 4 .39E2 54 0.929
## 5 2025-04-30 16:05:57 0.000 sec 5 .28E2 54 0.906
## deviance_xval deviance_se alpha iterations training_rmse training_deviance
## 1 0.985 0.099 0.000000 NA NA NA
## 2 0.977 0.098 0.000000 NA NA NA
## 3 0.967 0.097 0.000000 NA NA NA
## 4 0.953 0.096 0.000000 NA NA NA
## 5 0.935 0.094 0.000000 NA NA NA
## training_mae training_r2
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
##
## ---
## timestamp duration iteration lambda predictors deviance_train
## 26 2025-04-30 16:05:57 0.025 sec 26 .36E-1 54 0.002
## 27 2025-04-30 16:05:57 0.025 sec 27 .26E-1 54 0.001
## 28 2025-04-30 16:05:57 0.025 sec 28 .19E-1 54 0.000
## 29 2025-04-30 16:05:57 0.025 sec 29 .14E-1 54 0.000
## 30 2025-04-30 16:05:57 0.025 sec 30 .1E-1 54 0.000
## 31 2025-04-30 16:05:57 0.025 sec 31 .0E0 54 0.000
## deviance_xval deviance_se alpha iterations training_rmse
## 26 0.003 0.000 0.000000 NA NA
## 27 0.002 0.000 0.000000 NA NA
## 28 0.001 0.000 0.000000 NA NA
## 29 0.000 0.000 0.000000 NA NA
## 30 0.000 0.000 0.000000 NA NA
## 31 0.000 0.000 0.000000 31 0.01134
## training_deviance training_mae training_r2
## 26 NA NA NA
## 27 NA NA NA
## 28 NA NA NA
## 29 NA NA NA
## 30 NA NA NA
## 31 0.00013 0.00841 0.99987
##
## Variable Importances: (Extract with `h2o.varimp`)
## =================================================
##
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 Attrition_No 0.986911 1.000000 0.958840
## 2 OverTime_Yes 0.005037 0.005104 0.004894
## 3 EnvironmentSatisfaction 0.002244 0.002274 0.002180
## 4 MaritalStatus_Single 0.002067 0.002094 0.002008
## 5 YearsInCurrentRole 0.002063 0.002091 0.002004
##
## ---
## variable relative_importance scaled_importance
## 48 EducationField_Life.Sciences 0.000176 0.000178
## 49 EducationField_Medical 0.000075 0.000076
## 50 StockOptionLevel 0.000043 0.000044
## 51 Department_Sales 0.000039 0.000040
## 52 JobRole_Research.Scientist 0.000010 0.000010
## 53 MonthlyIncome 0.000001 0.000001
## percentage
## 48 0.000171
## 49 0.000073
## 50 0.000042
## 51 0.000038
## 52 0.000010
## 53 0.000001