Final Code:

# Load necessary libraries
library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.3.3

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.5     ✔ recipes      1.1.0
## ✔ dials        1.3.0     ✔ rsample      1.2.1
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1

## Warning: package 'dials' was built under R version 4.3.3

## Warning: package 'ggplot2' was built under R version 4.3.3

## Warning: package 'infer' was built under R version 4.3.3

## Warning: package 'modeldata' was built under R version 4.3.3

## Warning: package 'parsnip' was built under R version 4.3.3

## Warning: package 'recipes' was built under R version 4.3.3

## Warning: package 'rsample' was built under R version 4.3.3

## Warning: package 'tune' was built under R version 4.3.3

## Warning: package 'workflows' was built under R version 4.3.3

## Warning: package 'workflowsets' was built under R version 4.3.3

## Warning: package 'yardstick' was built under R version 4.3.3

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/

library(h2o)

## Warning: package 'h2o' was built under R version 4.3.3

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

library(dplyr)
library(ROCR)

## Warning: package 'ROCR' was built under R version 4.3.3

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ lubridate::day()    masks h2o::day()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ lubridate::hour()   masks h2o::hour()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ lubridate::month()  masks h2o::month()
## ✖ readr::spec()       masks yardstick::spec()
## ✖ lubridate::week()   masks h2o::week()
## ✖ lubridate::year()   masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

attrition_raw_tbl <- read_csv("../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# If data is not sensitive:
attrition_raw_tbl %>% glimpse()

## Rows: 1,470
## Columns: 35
## $ Age                      <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
## $ Attrition                <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "…
## $ BusinessTravel           <chr> "Travel_Rarely", "Travel_Frequently", "Travel…
## $ DailyRate                <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
## $ Department               <chr> "Sales", "Research & Development", "Research …
## $ DistanceFromHome         <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
## $ Education                <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, …
## $ EducationField           <chr> "Life Sciences", "Life Sciences", "Other", "L…
## $ EmployeeCount            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EmployeeNumber           <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16,…
## $ EnvironmentSatisfaction  <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, …
## $ Gender                   <chr> "Female", "Male", "Male", "Female", "Male", "…
## $ HourlyRate               <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
## $ JobInvolvement           <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, …
## $ JobLevel                 <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
## $ JobRole                  <chr> "Sales Executive", "Research Scientist", "Lab…
## $ JobSatisfaction          <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, …
## $ MaritalStatus            <chr> "Single", "Married", "Single", "Married", "Ma…
## $ MonthlyIncome            <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
## $ MonthlyRate              <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
## $ NumCompaniesWorked       <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
## $ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime                 <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ PercentSalaryHike        <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
## $ PerformanceRating        <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, …
## $ StandardHours            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ StockOptionLevel         <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
## $ TotalWorkingYears        <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
## $ TrainingTimesLastYear    <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
## $ WorkLifeBalance          <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, …
## $ YearsAtCompany           <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
## $ YearsInCurrentRole       <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
## $ YearsSinceLastPromotion  <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
## $ YearsWithCurrManager     <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …

# Initialize H2O
h2o.init()

## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\eliza\AppData\Local\Temp\RtmpGYftuM\file8db85f9b7f46/h2o_eliza_started_from_r.out
##     C:\Users\eliza\AppData\Local\Temp\RtmpGYftuM\file8db874ed4833/h2o_eliza_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 949 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 14 days 
##     H2O cluster name:           H2O_started_from_R_eliza_ofz270 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.80 GB 
##     H2O cluster total cores:    16 
##     H2O cluster allowed cores:  16 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31 ucrt)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Preprocess the data
attrition_raw_tbl <- attrition_raw_tbl %>%
    mutate(Attrition = as.factor(Attrition))  # Convert target to factor

# Split data
set.seed(123)
data_split <- initial_split(attrition_raw_tbl, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)

# Convert data to H2O objects
train_h2o <- as.h2o(train_data)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

test_h2o <- as.h2o(test_data)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Define response and predictors
y <- "Attrition"
x <- setdiff(names(train_data), y)

# Train model
model <- h2o.glm(
    x = x,
    y = y,
    training_frame = train_h2o,
    family = "binomial",
    alpha = 0.5
)

## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [JobRole, MaritalStatus, StandardHours, BusinessTravel, Department, OverTime, Over18, EmployeeCount, Gender, EducationField].

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Check model
print(model)

## Model Details:
## ==============
## 
## H2OBinomialModel: glm
## Model ID:  GLM_model_R_1733423852685_1 
## GLM Model: summary
##     family  link                                regularization
## 1 binomial logit Elastic Net (alpha = 0.5, lambda = 1.389E-4 )
##   number_of_predictors_total number_of_active_predictors number_of_iterations
## 1                         24                          24                    4
##          training_frame
## 1 train_data_sid_a416_1
## 
## Coefficients: glm coefficients
##              names coefficients standardized_coefficients
## 1        Intercept     4.502577                 -2.113352
## 2              Age    -0.045919                 -0.422755
## 3        DailyRate    -0.000129                 -0.052317
## 4 DistanceFromHome     0.023717                  0.191636
## 5        Education     0.003606                  0.003694
## 
## ---
##                      names coefficients standardized_coefficients
## 20   TrainingTimesLastYear    -0.172102                 -0.223016
## 21         WorkLifeBalance    -0.208963                 -0.149971
## 22          YearsAtCompany     0.053766                  0.342805
## 23      YearsInCurrentRole    -0.132084                 -0.486107
## 24 YearsSinceLastPromotion     0.127801                  0.423859
## 25    YearsWithCurrManager    -0.087332                 -0.316860
## 
## H2OBinomialMetrics: glm
## ** Reported on training data. **
## 
## MSE:  0.1096461
## RMSE:  0.3311286
## LogLoss:  0.3655567
## Mean Per-Class Error:  0.2803107
## AUC:  0.779059
## AUCPR:  0.4883525
## Gini:  0.558118
## R^2:  0.1871151
## Residual Deviance:  859.7893
## AIC:  909.7893
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         No Yes    Error       Rate
## No     841 146 0.147923   =146/987
## Yes     78 111 0.412698    =78/189
## Totals 919 257 0.190476  =224/1176
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.247567   0.497758 151
## 2                       max f2  0.171720   0.582691 208
## 3                 max f0point5  0.361564   0.500705  90
## 4                 max accuracy  0.497896   0.862245  41
## 5                max precision  0.769197   1.000000   0
## 6                   max recall  0.011172   1.000000 389
## 7              max specificity  0.769197   1.000000   0
## 8             max absolute_mcc  0.247567   0.390485 151
## 9   max min_per_class_accuracy  0.171720   0.719577 208
## 10 max mean_per_class_accuracy  0.240351   0.722053 155
## 11                     max tns  0.769197 987.000000   0
## 12                     max fns  0.769197 187.000000   0
## 13                     max fps  0.001234 987.000000 399
## 14                     max tps  0.011172 189.000000 389
## 15                     max tnr  0.769197   1.000000   0
## 16                     max fnr  0.769197   0.989418   0
## 17                     max fpr  0.001234   1.000000 399
## 18                     max tpr  0.011172   1.000000 389
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

# Predict on test set
predictions <- h2o.predict(model, test_h2o)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Debug: Check structure and dimensions
print(str(predictions))

## Class 'H2OFrame' <environment: 0x000002114b936c48> 
##  - attr(*, "op")= chr "transformation_b0ea_GLM_model_R_1733423852685_1_on_test_data_sid_a416_3"
##  - attr(*, "id")= chr "transformation_b0ea_GLM_model_R_1733423852685_1_on_test_data_sid_a416_3"
##  - attr(*, "eval")= logi FALSE
##  - attr(*, "nrow")= int 294
##  - attr(*, "ncol")= int 3
##  - attr(*, "types")=List of 3
##   ..$ : chr "enum"
##   ..$ : chr "real"
##   ..$ : chr "real"
##  - attr(*, "data")='data.frame': 10 obs. of  3 variables:
##   ..$ predict: Factor w/ 2 levels "No","Yes": 2 1 2 2 1 2 1 2 2 1
##   ..$ No     : num  0.738 0.85 0.508 0.632 0.914 ...
##   ..$ Yes    : num  0.2617 0.1497 0.4917 0.3676 0.0855 ...
## NULL

print(h2o.dim(predictions))  # Should match test_h2o rows

## [1] 294   3

# Ensure predictions are converted to a data frame
predicted_df <- as.data.frame(predictions)

# Debug: Check the structure of predicted_df
print(head(predicted_df))

##   predict        No        Yes
## 1     Yes 0.7382644 0.26173557
## 2      No 0.8503389 0.14966107
## 3     Yes 0.5083319 0.49166811
## 4     Yes 0.6324403 0.36755972
## 5      No 0.9144686 0.08553141
## 6     Yes 0.3877182 0.61228183

print(nrow(predicted_df))  # Should match the number of rows in test_h2o

## [1] 294

# Extract actual target values
actual_values <- as.vector(test_h2o[[y]])

# Error after this

Prompts:

Prompt 1:

I have a dataset called attrition_raw_tbl that looks like this.

attrition_raw_tbl %>% glimpse() Rows: 1,470 Columns: 35 $ Age 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29, 31, 34, 28, 29, 32, 22, 53, 38, 24, … $ Attrition “Yes”, “No”, “Yes”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”… $ BusinessTravel “Travel_Rarely”, “Travel_Frequently”, “Travel_Rarely”, “Travel_Frequently”, “Travel_… $ DailyRate 1102, 279, 1373, 1392, 591, 1005, 1324, 1358, 216, 1299, 809, 153, 670, 1346, 103, 1… $ Department ”Sales”, “Research & Development”, “Research & Development”, “Research & Development… $ DistanceFromHome 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, 19, 24, 21, 5, 16, 2, 2, 11, 9, 7, 15, … $ Education 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, 4, 2, 2, 4, 3, 2, 4, 4, 2, 1, 3, 1, 4, … $ EducationField ”Life Sciences”, “Life Sciences”, “Other”, “Life Sciences”, “Medical”, “Life Science… $ EmployeeCount 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, … $ EmployeeNumber 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28… $ EnvironmentSatisfaction 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, 2, 1, 4, 1, 4, 1, 3, 1, 3, 2, 3, 2, 3, … $ Gender ”Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Male”… $ HourlyRate 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 49, 31, 93, 50, 51, 80, 96, 78, 45, 96, … $ JobInvolvement 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, 4, 4, 4, 2, 3, 4, 2, 3, 3, 3, 3, 1, 3, … $ JobLevel 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, 3, 1, 1, 4, 1, 2, 1, 3, 1, 1, 5, 1, 2, … $ JobRole “Sales Executive”, “Research Scientist”, “Laboratory Technician”, “Research Scientis… $ JobSatisfaction 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, 1, 2, 4, 4, 4, 3, 1, 2, 4, 1, 3, 1, 2, … $ MaritalStatus ”Single”, “Married”, “Single”, “Married”, “Married”, “Single”, “Married”, “Divorced”… $ MonthlyIncome 5993, 5130, 2090, 2909, 3468, 3068, 2670, 2693, 9526, 5237, 2426, 4193, 2911, 2661, … $ MonthlyRate 19479, 24907, 2396, 23159, 16632, 11864, 9964, 13335, 8787, 16577, 16479, 12682, 151… $ NumCompaniesWorked 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, 1, 0, 1, 2, 5, 0, 7, 0, 1, 2, 4, 1, 0, … $ Over18 “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”,… $ OverTime “Yes”, “No”, “Yes”, “Yes”, “No”, “No”, “Yes”, “No”, “No”, “No”, “No”, “Yes”, “No”, “… $ PercentSalaryHike 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 12, 17, 11, 14, 11, 12, 13, 16, 11, 18, … $ PerformanceRating 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, … $ RelationshipSatisfaction 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, 3, 4, 2, 3, 3, 4, 2, 3, 4, 3, 4, 2, 4, … $ StandardHours 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, … $ StockOptionLevel 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, … $ TotalWorkingYears 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3, 6, 10, 7, 1, 31, 6, 5, 10, 13, 0, 8, … $ TrainingTimesLastYear 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, 1, 5, 2, 3, 3, 5, 4, 4, 6, 2, 3, 5, 2, … $ WorkLifeBalance 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, … $ YearsAtCompany 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4, 10, 6, 1, 25, 3, 4, 5, 12, 0, 4, 14, 1… $ YearsInCurrentRole 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, 9, 2, 0, 8, 2, 2, 3, 6, 0, 2, 13, 2, 7,… $ YearsSinceLastPromotion 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, 8, 0, 0, 3, 1, 1, 0, 2, 0, 1, 4, 6, 4, … $ YearsWithCurrManager 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, 8, 5, 0, 7, 2, 3, 3, 11, 0, 3, 8, 7, 2,…

The goal is to help predict attrition for employees.

Please write R code to create a predictive model that predicts the probability of attrition.

Prompt 2:

Please update the code to use tidymodels instead of caret and to use the h2o model instead of glmnet.

Prompt 3:

Error in data.frame(…, check.names = FALSE) : arguments imply differing number of rows: 294, 0

Prompt 4:

Error in predicted_df$Predicted_Prob : $ operator is invalid for atomic vectors

codealong12

Eli O’Neill

2024-12-05