library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ recipes 1.1.0
## ✔ dials 1.3.0 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'dials' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'infer' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'parsnip' was built under R version 4.3.3
## Warning: package 'recipes' was built under R version 4.3.3
## Warning: package 'rsample' was built under R version 4.3.3
## Warning: package 'tune' was built under R version 4.3.3
## Warning: package 'workflows' was built under R version 4.3.3
## Warning: package 'workflowsets' was built under R version 4.3.3
## Warning: package 'yardstick' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(h2o)
## Warning: package 'h2o' was built under R version 4.3.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidyverse)
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ lubridate::day() masks h2o::day()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ lubridate::hour() masks h2o::hour()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::month() masks h2o::month()
## ✖ readr::spec() masks yardstick::spec()
## ✖ lubridate::week() masks h2o::week()
## ✖ lubridate::year() masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
attrition_raw_tbl <- read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# If data is not sensitive:
attrition_raw_tbl %>% glimpse()
## Rows: 1,470
## Columns: 35
## $ Age <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
## $ Attrition <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "…
## $ BusinessTravel <chr> "Travel_Rarely", "Travel_Frequently", "Travel…
## $ DailyRate <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
## $ Department <chr> "Sales", "Research & Development", "Research …
## $ DistanceFromHome <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
## $ Education <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, …
## $ EducationField <chr> "Life Sciences", "Life Sciences", "Other", "L…
## $ EmployeeCount <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EmployeeNumber <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16,…
## $ EnvironmentSatisfaction <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, …
## $ Gender <chr> "Female", "Male", "Male", "Female", "Male", "…
## $ HourlyRate <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
## $ JobInvolvement <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, …
## $ JobLevel <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
## $ JobRole <chr> "Sales Executive", "Research Scientist", "Lab…
## $ JobSatisfaction <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, …
## $ MaritalStatus <chr> "Single", "Married", "Single", "Married", "Ma…
## $ MonthlyIncome <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
## $ MonthlyRate <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
## $ NumCompaniesWorked <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
## $ Over18 <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ PercentSalaryHike <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
## $ PerformanceRating <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, …
## $ StandardHours <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ StockOptionLevel <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
## $ TotalWorkingYears <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
## $ TrainingTimesLastYear <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
## $ WorkLifeBalance <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, …
## $ YearsAtCompany <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
## $ YearsInCurrentRole <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
## $ YearsSinceLastPromotion <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
## $ YearsWithCurrManager <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …
# If data is sensitive:
attrition_raw_tbl %>%
slice(0) %>%
glimpse()
## Rows: 0
## Columns: 35
## $ Age <dbl>
## $ Attrition <chr>
## $ BusinessTravel <chr>
## $ DailyRate <dbl>
## $ Department <chr>
## $ DistanceFromHome <dbl>
## $ Education <dbl>
## $ EducationField <chr>
## $ EmployeeCount <dbl>
## $ EmployeeNumber <dbl>
## $ EnvironmentSatisfaction <dbl>
## $ Gender <chr>
## $ HourlyRate <dbl>
## $ JobInvolvement <dbl>
## $ JobLevel <dbl>
## $ JobRole <chr>
## $ JobSatisfaction <dbl>
## $ MaritalStatus <chr>
## $ MonthlyIncome <dbl>
## $ MonthlyRate <dbl>
## $ NumCompaniesWorked <dbl>
## $ Over18 <chr>
## $ OverTime <chr>
## $ PercentSalaryHike <dbl>
## $ PerformanceRating <dbl>
## $ RelationshipSatisfaction <dbl>
## $ StandardHours <dbl>
## $ StockOptionLevel <dbl>
## $ TotalWorkingYears <dbl>
## $ TrainingTimesLastYear <dbl>
## $ WorkLifeBalance <dbl>
## $ YearsAtCompany <dbl>
## $ YearsInCurrentRole <dbl>
## $ YearsSinceLastPromotion <dbl>
## $ YearsWithCurrManager <dbl>
# Step 1: Initialize H2O
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\ktqua\AppData\Local\Temp\RtmpMVBsSj\file7c047940591/h2o_ktqua_started_from_r.out
## C:\Users\ktqua\AppData\Local\Temp\RtmpMVBsSj\file7c041c02a30/h2o_ktqua_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 seconds 694 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 11 months and 12 days
## H2O cluster name: H2O_started_from_R_ktqua_pnf574
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.91 GB
## H2O cluster total cores: 16
## H2O cluster allowed cores: 16
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (11 months and 12 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Step 2: Split data into training and testing sets
set.seed(123) # For reproducibility
data_split <- initial_split(attrition_raw_tbl, prop = 0.8, strata = Attrition)
train_data <- training(data_split)
test_data <- testing(data_split)
# Step 3: Create a preprocessing recipe
attrition_recipe <- recipe(Attrition ~ ., data = train_data) %>%
step_zv(all_predictors()) %>% # Remove zero-variance predictors (e.g., Over18)
step_dummy(all_nominal_predictors(), -all_outcomes()) %>% # One-hot encode categorical variables
step_normalize(all_numeric_predictors()) # Normalize numeric predictors
# Prepare the data using the recipe
prepped_data <- prep(attrition_recipe)
train_prepped <- bake(prepped_data, new_data = train_data)
test_prepped <- bake(prepped_data, new_data = test_data)
# Convert preprocessed data to H2O frames
train_h2o <- as.h2o(train_prepped)
##
|
| | 0%
|
|======================================================================| 100%
test_h2o <- as.h2o(test_prepped)
##
|
| | 0%
|
|======================================================================| 100%
# Step 4: Specify and train an H2O GBM model
h2o_model <- h2o.gbm(
x = setdiff(names(train_h2o), "Attrition"), # Predictor variables
y = "Attrition", # Response variable
training_frame = train_h2o,
ntrees = 1000,
learn_rate = 0.01,
max_depth = 6,
seed = 123
)
##
|
| | 0%
|
| | 1%
|
|= | 2%
|
|=========== | 16%
|
|============================== | 42%
|
|================================================ | 68%
|
|=================================================================== | 96%
|
|======================================================================| 100%
# Step 5: Evaluate the model using h2o.performance
perf <- h2o.performance(h2o_model, newdata = test_h2o)
# Print metrics
cat("AUC:", h2o.auc(perf), "\n")
## AUC: 0.86083
cat("Accuracy:", h2o.accuracy(perf, thresholds = "max")[[1]], "\n")
## Accuracy: 0.8779661
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(h2o.confusionMatrix(perf))
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.151257526138808:
## No Yes Error Rate
## No 222 25 0.101215 =25/247
## Yes 16 32 0.333333 =16/48
## Totals 238 57 0.138983 =41/295
# Step 6: Shut down H2O
h2o.shutdown(prompt = FALSE)
Prompts
I have a dataset called attrition_raw_tbl that looks like this.
attrition_raw_tbl %>% glimpse() Rows: 1,470 Columns: 35 $ Age
41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29, 31, 34, 28, 29, 32, 22, 53, 38, 24, … $ Attrition “Yes”, “No”, “Yes”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”… $ BusinessTravel “Travel_Rarely”, “Travel_Frequently”, “Travel_Rarely”, “Travel_Frequently”, “Travel_… $ DailyRate 1102, 279, 1373, 1392, 591, 1005, 1324, 1358, 216, 1299, 809, 153, 670, 1346, 103, 1… $ Department ”Sales”, “Research & Development”, “Research & Development”, “Research & Development… $ DistanceFromHome 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, 19, 24, 21, 5, 16, 2, 2, 11, 9, 7, 15, … $ Education 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, 4, 2, 2, 4, 3, 2, 4, 4, 2, 1, 3, 1, 4, … $ EducationField ”Life Sciences”, “Life Sciences”, “Other”, “Life Sciences”, “Medical”, “Life Science… $ EmployeeCount 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, … $ EmployeeNumber 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28… $ EnvironmentSatisfaction 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, 2, 1, 4, 1, 4, 1, 3, 1, 3, 2, 3, 2, 3, … $ Gender ”Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Male”… $ HourlyRate 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 49, 31, 93, 50, 51, 80, 96, 78, 45, 96, … $ JobInvolvement 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, 4, 4, 4, 2, 3, 4, 2, 3, 3, 3, 3, 1, 3, … $ JobLevel 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, 3, 1, 1, 4, 1, 2, 1, 3, 1, 1, 5, 1, 2, … $ JobRole “Sales Executive”, “Research Scientist”, “Laboratory Technician”, “Research Scientis… $ JobSatisfaction 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, 1, 2, 4, 4, 4, 3, 1, 2, 4, 1, 3, 1, 2, … $ MaritalStatus ”Single”, “Married”, “Single”, “Married”, “Married”, “Single”, “Married”, “Divorced”… $ MonthlyIncome 5993, 5130, 2090, 2909, 3468, 3068, 2670, 2693, 9526, 5237, 2426, 4193, 2911, 2661, … $ MonthlyRate 19479, 24907, 2396, 23159, 16632, 11864, 9964, 13335, 8787, 16577, 16479, 12682, 151… $ NumCompaniesWorked 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, 1, 0, 1, 2, 5, 0, 7, 0, 1, 2, 4, 1, 0, … $ Over18 “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”,… $ OverTime “Yes”, “No”, “Yes”, “Yes”, “No”, “No”, “Yes”, “No”, “No”, “No”, “No”, “Yes”, “No”, “… $ PercentSalaryHike 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 12, 17, 11, 14, 11, 12, 13, 16, 11, 18, … $ PerformanceRating 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, … $ RelationshipSatisfaction 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, 3, 4, 2, 3, 3, 4, 2, 3, 4, 3, 4, 2, 4, … $ StandardHours 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, … $ StockOptionLevel 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, … $ TotalWorkingYears 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3, 6, 10, 7, 1, 31, 6, 5, 10, 13, 0, 8, … $ TrainingTimesLastYear 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, 1, 5, 2, 3, 3, 5, 4, 4, 6, 2, 3, 5, 2, … $ WorkLifeBalance 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, … $ YearsAtCompany 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4, 10, 6, 1, 25, 3, 4, 5, 12, 0, 4, 14, 1… $ YearsInCurrentRole 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, 9, 2, 0, 8, 2, 2, 3, 6, 0, 2, 13, 2, 7,… $ YearsSinceLastPromotion 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, 8, 0, 0, 3, 1, 1, 0, 2, 0, 1, 4, 6, 4, … $ YearsWithCurrManager 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, 8, 5, 0, 7, 2, 3, 3, 11, 0, 3, 8, 7, 2,…
The goal is to help predict attrition for employees.
Please write R code to create a predictive model that predicts the probability of attrition.
Please update the code to use tidymodels instead of caret and to use the h2o model instead of glmnet.
Error in .h2o.doSafeREST(h2oRestApiVersion = h2oRestApiVersion, urlSuffix = page, :
Please update the code to use h2o.performance in Step 5, instead of mean.
Error in step_dummy(): Caused by error in bake(): ! Only one factor level in col_name: Y.
can you add that to the larger chunk of code