# Load required libraries

library(tidyverse)
library(tidymodels)
library(h2o)

# Assuming you have your data loaded as `attrition_raw_tbl`

# Step 1: Split the data into training and testing sets

set.seed(123) # for reproducibility

# 80/20 split, stratified on the outcome column.
split <- initial_split(attrition_raw_tbl, prop = 0.8, strata = "Attrition")
train_data <- training(split)
test_data <- testing(split)

# Step 2: Preprocess the data using tidymodels

# For simplicity, let's use a recipe to preprocess the data.
# The outcome (Attrition) is deliberately excluded from step_dummy():
# one-hot encoding it would create Attrition_Yes and Attrition_No, and the
# Attrition_No column would then leak the answer into the predictors while
# the numeric Attrition_Yes response would make AutoML fit a regression.
attrition_recipe <- recipe(Attrition ~ ., data = train_data) %>%
  # Remove columns with zero variance
  step_rm(Over18, EmployeeCount, StandardHours) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes())

# Fit the recipe

attrition_recipe_prep <- prep(attrition_recipe, training = train_data)

# Apply the recipe to the data

train_data_processed <- bake(attrition_recipe_prep, new_data = train_data)
test_data_processed <- bake(attrition_recipe_prep, new_data = test_data)

# Step 3: Train a model using h2o.automl

h2o.init()

# Convert data to H2OFrame

train_h2o <- as.h2o(train_data_processed)
test_h2o <- as.h2o(test_data_processed)

# Define predictors and response variable

response <- "Attrition"
predictors <- setdiff(names(train_data_processed), response)

# Make sure the response is categorical in H2O so AutoML fits a classifier.
train_h2o[[response]] <- h2o.asfactor(train_h2o[[response]])
test_h2o[[response]] <- h2o.asfactor(test_h2o[[response]])

# Train AutoML model

automl <- h2o.automl(
  x = predictors,
  y = response,
  training_frame = train_h2o,
  max_runtime_secs = 30, # Maximum runtime in seconds
  seed = 123
)

# Step 4: Evaluate the best model from AutoML

# Score the test frame with the AutoML leader and keep only the `predict`
# column of the result.
predictions <- h2o.predict(automl@leader, newdata = test_h2o)$predict

# Step 5: Assess model performance using h2o.performance

performance <- h2o.performance(automl@leader, newdata = test_h2o)
print(performance)

# Optionally, you can also inspect the best model

summary(automl@leader)

# Shut down H2O

# prompt = FALSE skips the interactive confirmation so the cluster is
# actually stopped when this script is run non-interactively.
h2o.shutdown(prompt = FALSE)

# Code Along 12

# Johnny McKinnon

# 2024-12-03