The goal is to help predict attrition for employees. Please write R code to create a predictive model that predicts the probability of attrition.
What follows is a series of prompts and responses, errors included.
Prompt 1:
I have a dataset called attrition_raw_tbl that looks like this.
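For reference, the dataset can be loaded and previewed like this (a minimal sketch; the file path is the one used in the final code below):

library(readr)
library(dplyr)

# IBM HR attrition data: 1,470 rows x 35 columns per the read_csv() output in the final code
attrition_raw_tbl <- read_csv("../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")
glimpse(attrition_raw_tbl)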
Prompt 2:
# Load required libraries
library(dplyr)
library(tidyr)
library(caret)
library(glmnet)

# Assuming 'attrition_raw_tbl' is your dataset

# Preprocess the data
attrition_data <- attrition_raw_tbl %>%
    # Drop irrelevant columns
    select(-EmployeeNumber, -EmployeeCount, -Over18, -StandardHours) %>%
    # Convert categorical variables to factors
    mutate_if(is.character, as.factor) %>%
    # Handle missing values if any
    na.omit()

# Split data into training and testing sets
set.seed(123) # for reproducibility
train_index <- createDataPartition(attrition_data$Attrition, p = 0.7, list = FALSE)
train_data <- attrition_data[train_index, ]
test_data <- attrition_data[-train_index, ]

# Train a logistic regression model
model <- glm(Attrition ~ ., data = train_data, family = binomial)

# Make predictions on the test set
predictions <- predict(model, newdata = test_data, type = "response")

# Evaluate the model
confusion_matrix <- table(
    Actual = test_data$Attrition,
    Predicted = ifelse(predictions > 0.5, "Yes", "No")
)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

# Print the confusion matrix and accuracy
print(confusion_matrix)
print(paste("Accuracy:", accuracy))
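A caveat on this evaluation: attrition is an imbalanced outcome, so raw accuracy can look good even for a model that rarely predicts "Yes". A minimal sketch of computing ROC AUC with yardstick instead, assuming Attrition is a factor with levels "No"/"Yes" (glm with type = "response" then returns P(Yes)):

library(yardstick)

# AUC on the held-out set; event_level = "second" scores the "Yes" level
roc_auc_vec(
    truth       = test_data$Attrition,
    estimate    = predictions,
    event_level = "second"
)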
Prompt 3: “Please use tidymodels instead of caret and the h2o model instead of glmnet”
# Load required libraries
library(dplyr)
library(tidymodels)
library(h2o)

# Assuming 'attrition_raw_tbl' is your dataset

# Preprocess the data
attrition_data <- attrition_raw_tbl %>%
    # Drop irrelevant columns
    select(-EmployeeNumber, -EmployeeCount, -Over18, -StandardHours) %>%
    # Convert categorical variables to factors
    mutate_if(is.character, as.factor) %>%
    # Handle missing values if any
    na.omit()

# Split data into training and testing sets
set.seed(123) # for reproducibility
train_data <- attrition_data %>% initial_split(prop = 0.7, strata = Attrition)
train_data <- training(train_data)
test_data <- testing(train_data) # Bug: `train_data` now holds the training data, not the split (this is the error reported in Prompt 4)

# Train an h2o logistic regression model
h2o.init()
train_data_h2o <- as.h2o(train_data)
test_data_h2o <- as.h2o(test_data)

model <- h2o.glm(
    x = names(train_data_h2o)[-1],
    y = "Attrition",
    training_frame = train_data_h2o,
    family = "binomial"
)

# Make predictions on the test set
predictions <- h2o.predict(model, newdata = test_data_h2o)
predictions <- as.data.frame(predictions$predict)

# Evaluate the model
confusion_matrix <- table(
    Actual = test_data$Attrition,
    Predicted = ifelse(predictions$predict > 0.5, "Yes", "No")
)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

# Print the confusion matrix and accuracy
print(confusion_matrix)
print(paste("Accuracy:", accuracy))

# Shutdown h2o
h2o.shutdown()
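One more issue hiding here: for a binomial model, h2o.predict() returns a predict label column plus one probability column per class level, so predictions$predict > 0.5 compares a class label against 0.5. A minimal sketch of thresholding the probability column instead, assuming the levels are "No"/"Yes":

pred_tbl   <- as.data.frame(h2o.predict(model, newdata = test_data_h2o))
prob_yes   <- pred_tbl$Yes                        # P(Attrition == "Yes")
pred_class <- ifelse(prob_yes > 0.5, "Yes", "No")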
Prompt 4:
“Error”
# Load required libraries
library(dplyr)
library(tidymodels)
library(h2o)

# Assuming 'attrition_raw_tbl' is your dataset

# Preprocess the data
attrition_data <- attrition_raw_tbl %>%
    # Drop irrelevant columns
    select(-EmployeeNumber, -EmployeeCount, -Over18, -StandardHours) %>%
    # Convert categorical variables to factors
    mutate_if(is.character, as.factor) %>%
    # Handle missing values if any
    na.omit()

# Split data into training and testing sets
set.seed(123) # for reproducibility
split_data <- initial_split(attrition_data, prop = 0.7, strata = Attrition)
train_data <- training(split_data)
test_data <- testing(split_data)

# Train an h2o logistic regression model
h2o.init()
train_data_h2o <- as.h2o(train_data)
test_data_h2o <- as.h2o(test_data)

model <- h2o.glm(
    x = names(train_data_h2o)[-1],
    y = "Attrition",
    training_frame = train_data_h2o,
    family = "binomial"
)

# Make predictions on the test set
predictions <- h2o.predict(model, newdata = test_data_h2o)
predictions <- as.data.frame(predictions$predict)

# Evaluate the model
confusion_matrix <- table(
    Actual = test_data$Attrition,
    Predicted = ifelse(predictions$predict > 0.5, "Yes", "No")
)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

# Print the confusion matrix and accuracy
print(confusion_matrix)
print(paste("Accuracy:", accuracy))

# Shutdown h2o
h2o.shutdown()
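A small robustness note: x = names(train_data_h2o)[-1] assumes Attrition is the first column of the frame. A sketch of excluding the response by name instead (the final code below takes the same setdiff() approach):

predictors <- setdiff(names(train_data_h2o), "Attrition")
model <- h2o.glm(
    x = predictors,
    y = "Attrition",
    training_frame = train_data_h2o,
    family = "binomial"
)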
Prompt 5: “Please update from h2o.glm to h2o.automl”
Final Code
# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(h2o)
## Warning: package 'h2o' was built under R version 4.3.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
attrition_raw_tbl <- read_csv("../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Assuming you have your data loaded as 'attrition_raw_tbl'
# Step 1: Split the data into training and testing sets
set.seed(123) # for reproducibility
split <- initial_split(attrition_raw_tbl, prop = 0.8, strata = "Attrition")
train_data <- training(split)
test_data <- testing(split)
# Step 2: Preprocess the data using tidymodels
# For simplicity, let's use recipe to preprocess the data
attrition_recipe <- recipe(Attrition ~ ., data = train_data) %>%
step_rm(Over18, EmployeeCount, StandardHours) %>% # Remove columns with zero variance
step_dummy(all_nominal(), one_hot = TRUE) %>%
step_center(all_numeric(), -all_outcomes()) %>%
step_scale(all_numeric(), -all_outcomes())
# Fit the recipe
attrition_recipe_prep <- prep(attrition_recipe, training = train_data)
# Apply the recipe to the data
train_data_processed <- bake(attrition_recipe_prep, new_data = train_data)
test_data_processed <- bake(attrition_recipe_prep, new_data = test_data)
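# Worth flagging before training: step_dummy(all_nominal(), one_hot = TRUE) above
# also one-hot encodes the outcome, producing the Attrition_Yes/Attrition_No
# columns (and the leakage visible in the results below). A hedged variant that
# dummies predictors only, using the all_*_predictors() selectors from recipes:
#
# attrition_recipe <- recipe(Attrition ~ ., data = train_data) %>%
#     step_rm(Over18, EmployeeCount, StandardHours) %>%
#     step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% # keep Attrition a factor
#     step_center(all_numeric_predictors()) %>%
#     step_scale(all_numeric_predictors())
#
# With that recipe the response stays "Attrition" and H2O runs classification
# rather than the regression seen in the output below.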
# Step 3: Train a model using h2o.automl
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 4 hours 3 minutes
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 4 months and 18 days
## H2O cluster name: H2O_started_from_R_OPend_eji420
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.18 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.1 (2023-06-16 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (4 months and 18 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert data to H2OFrame
train_h2o <- as.h2o(train_data_processed)
test_h2o <- as.h2o(test_data_processed)
# Define predictors and response variable
predictors <- setdiff(names(train_data_processed), "Attrition_Yes")
response <- "Attrition_Yes"
# Train AutoML model
automl <- h2o.automl(
x = predictors,
y = response,
training_frame = train_h2o,
max_runtime_secs = 30, # Maximum runtime in seconds
seed = 123
)
## 16:01:19.425: AutoML: XGBoost is not available; skipping it.
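# Optional: inspect the full AutoML leaderboard to compare candidate models
# before committing to the leader (sketch):
#
# lb <- h2o.get_leaderboard(automl, extra_columns = "ALL")
# print(head(lb))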
# Step 4: Evaluate the best model from AutoML
predictions <- h2o.predict(automl@leader, newdata = test_h2o)$predict
# Step 5: Assess model performance using h2o.performance
performance <- h2o.performance(automl@leader, newdata = test_h2o)
print(performance)
## H2ORegressionMetrics: gbm
##
## MSE: 2.722912e-13
## RMSE: 5.218153e-07
## MAE: 2.651004e-07
## RMSLE: 1.917963e-07
## Mean Residual Deviance : 2.722912e-13
# Optionally, you can also inspect the best model
summary(automl@leader)
## Model Details:
## ==============
##
## H2ORegressionModel: gbm
## Model Key: GBM_grid_1_AutoML_8_20240509_160119_model_1
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 155 155 13057 1
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 1 1.00000 2 2 2.00000
##
## H2ORegressionMetrics: gbm
## ** Reported on training data. **
##
## MSE: 2.405081e-13
## RMSE: 4.904163e-07
## MAE: 2.917838e-07
## RMSLE: 2.427209e-07
## Mean Residual Deviance : 2.405081e-13
##
##
##
## H2ORegressionMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 3.842066e-13
## RMSE: 6.19844e-07
## MAE: 3.858446e-07
## RMSLE: 3.586859e-07
## Mean Residual Deviance : 3.842066e-13
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## mae 0.000000 0.000000 0.000000 0.000000 0.000000
## mean_residual_deviance 0.000000 0.000000 0.000000 0.000000 0.000000
## mse 0.000000 0.000000 0.000000 0.000000 0.000000
## r2 1.000000 0.000000 1.000000 1.000000 1.000000
## residual_deviance 0.000000 0.000000 0.000000 0.000000 0.000000
## rmse 0.000001 0.000000 0.000001 0.000001 0.000001
## rmsle 0.000000 0.000000 0.000000 0.000000 0.000000
## cv_4_valid cv_5_valid
## mae 0.000000 0.000000
## mean_residual_deviance 0.000000 0.000000
## mse 0.000000 0.000000
## r2 1.000000 1.000000
## residual_deviance 0.000000 0.000000
## rmse 0.000001 0.000001
## rmsle 0.000000 0.000000
##
## Scoring History:
## timestamp duration number_of_trees training_rmse training_mae
## 1 2024-05-09 16:01:35 1.085 sec 0 0.99957 0.73447
## 2 2024-05-09 16:01:35 1.086 sec 5 0.59024 0.43370
## 3 2024-05-09 16:01:35 1.104 sec 10 0.34853 0.25610
## 4 2024-05-09 16:01:35 1.113 sec 15 0.20580 0.15122
## 5 2024-05-09 16:01:35 1.125 sec 20 0.12153 0.08929
## training_deviance
## 1 0.99915
## 2 0.34838
## 3 0.12147
## 4 0.04236
## 5 0.01477
##
## ---
## timestamp duration number_of_trees training_rmse training_mae
## 27 2024-05-09 16:01:35 1.365 sec 130 0.00000 0.00000
## 28 2024-05-09 16:01:35 1.383 sec 135 0.00000 0.00000
## 29 2024-05-09 16:01:35 1.385 sec 140 0.00000 0.00000
## 30 2024-05-09 16:01:35 1.413 sec 145 0.00000 0.00000
## 31 2024-05-09 16:01:35 1.430 sec 150 0.00000 0.00000
## 32 2024-05-09 16:01:35 1.433 sec 155 0.00000 0.00000
## training_deviance
## 27 0.00000
## 28 0.00000
## 29 0.00000
## 30 0.00000
## 31 0.00000
## 32 0.00000
##
## Variable Importances: (Extract with `h2o.varimp`)
## =================================================
##
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 Attrition_No 6178.946289 1.000000 1.000000
## 2 Age 0.000000 0.000000 0.000000
## 3 DailyRate 0.000000 0.000000 0.000000
## 4 DistanceFromHome 0.000000 0.000000 0.000000
## 5 Education 0.000000 0.000000 0.000000
##
## ---
## variable relative_importance scaled_importance
## 48 JobRole_Sales.Representative 0.000000 0.000000
## 49 MaritalStatus_Divorced 0.000000 0.000000
## 50 MaritalStatus_Married 0.000000 0.000000
## 51 MaritalStatus_Single 0.000000 0.000000
## 52 OverTime_No 0.000000 0.000000
## 53 OverTime_Yes 0.000000 0.000000
## percentage
## 48 0.000000
## 49 0.000000
## 50 0.000000
## 51 0.000000
## 52 0.000000
## 53 0.000000
# Note: the near-perfect regression metrics and the variable importance above
# expose the leakage: one-hot encoding the outcome created Attrition_No, which
# the model uses to predict Attrition_Yes exactly. Drop it from the predictors
# (or dummy only the predictors, as sketched after the recipe) before trusting
# these results.
# Shut down H2O
h2o.shutdown()
## Are you sure you want to shutdown the H2O instance running at http://localhost:54321/ (Y/N)?
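In a non-interactive script, the confirmation prompt can be skipped:

h2o.shutdown(prompt = FALSE)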