library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.7     ✔ recipes      1.1.0
## ✔ dials        1.3.0     ✔ rsample      1.2.1
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'dials' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'infer' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'parsnip' was built under R version 4.3.3
## Warning: package 'recipes' was built under R version 4.3.3
## Warning: package 'rsample' was built under R version 4.3.3
## Warning: package 'tune' was built under R version 4.3.3
## Warning: package 'workflows' was built under R version 4.3.3
## Warning: package 'workflowsets' was built under R version 4.3.3
## Warning: package 'yardstick' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(h2o)
## Warning: package 'h2o' was built under R version 4.3.3
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(tidyverse)
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ lubridate::day()    masks h2o::day()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ lubridate::hour()   masks h2o::hour()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ lubridate::month()  masks h2o::month()
## ✖ readr::spec()       masks yardstick::spec()
## ✖ lubridate::week()   masks h2o::week()
## ✖ lubridate::year()   masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
attrition_raw_tbl <- read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# If data is not sensitive:
attrition_raw_tbl %>% glimpse()
## Rows: 1,470
## Columns: 35
## $ Age                      <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
## $ Attrition                <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "…
## $ BusinessTravel           <chr> "Travel_Rarely", "Travel_Frequently", "Travel…
## $ DailyRate                <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
## $ Department               <chr> "Sales", "Research & Development", "Research …
## $ DistanceFromHome         <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
## $ Education                <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, …
## $ EducationField           <chr> "Life Sciences", "Life Sciences", "Other", "L…
## $ EmployeeCount            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EmployeeNumber           <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16,…
## $ EnvironmentSatisfaction  <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, …
## $ Gender                   <chr> "Female", "Male", "Male", "Female", "Male", "…
## $ HourlyRate               <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
## $ JobInvolvement           <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, …
## $ JobLevel                 <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
## $ JobRole                  <chr> "Sales Executive", "Research Scientist", "Lab…
## $ JobSatisfaction          <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, …
## $ MaritalStatus            <chr> "Single", "Married", "Single", "Married", "Ma…
## $ MonthlyIncome            <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
## $ MonthlyRate              <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
## $ NumCompaniesWorked       <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
## $ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime                 <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ PercentSalaryHike        <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
## $ PerformanceRating        <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, …
## $ StandardHours            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ StockOptionLevel         <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
## $ TotalWorkingYears        <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
## $ TrainingTimesLastYear    <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
## $ WorkLifeBalance          <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, …
## $ YearsAtCompany           <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
## $ YearsInCurrentRole       <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
## $ YearsSinceLastPromotion  <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
## $ YearsWithCurrManager     <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …
# If data is sensitive:
attrition_raw_tbl %>%
    slice(0) %>%
    glimpse()
## Rows: 0
## Columns: 35
## $ Age                      <dbl> 
## $ Attrition                <chr> 
## $ BusinessTravel           <chr> 
## $ DailyRate                <dbl> 
## $ Department               <chr> 
## $ DistanceFromHome         <dbl> 
## $ Education                <dbl> 
## $ EducationField           <chr> 
## $ EmployeeCount            <dbl> 
## $ EmployeeNumber           <dbl> 
## $ EnvironmentSatisfaction  <dbl> 
## $ Gender                   <chr> 
## $ HourlyRate               <dbl> 
## $ JobInvolvement           <dbl> 
## $ JobLevel                 <dbl> 
## $ JobRole                  <chr> 
## $ JobSatisfaction          <dbl> 
## $ MaritalStatus            <chr> 
## $ MonthlyIncome            <dbl> 
## $ MonthlyRate              <dbl> 
## $ NumCompaniesWorked       <dbl> 
## $ Over18                   <chr> 
## $ OverTime                 <chr> 
## $ PercentSalaryHike        <dbl> 
## $ PerformanceRating        <dbl> 
## $ RelationshipSatisfaction <dbl> 
## $ StandardHours            <dbl> 
## $ StockOptionLevel         <dbl> 
## $ TotalWorkingYears        <dbl> 
## $ TrainingTimesLastYear    <dbl> 
## $ WorkLifeBalance          <dbl> 
## $ YearsAtCompany           <dbl> 
## $ YearsInCurrentRole       <dbl> 
## $ YearsSinceLastPromotion  <dbl> 
## $ YearsWithCurrManager     <dbl>
# Load necessary libraries
library(h2o)
library(dplyr)
library(ggplot2)

# Initialize h2o
h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\ktqua\AppData\Local\Temp\RtmpcrVQY3\file8a8c1e2ae5b/h2o_ktqua_started_from_r.out
##     C:\Users\ktqua\AppData\Local\Temp\RtmpcrVQY3\file8a8c35dcf8f/h2o_ktqua_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 816 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 15 days 
##     H2O cluster name:           H2O_started_from_R_ktqua_jiv585 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.91 GB 
##     H2O cluster total cores:    16 
##     H2O cluster allowed cores:  16 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 15 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert necessary columns to factors and ensure the response column is categorical
attrition_h2o <- attrition_raw_tbl %>%
  mutate(Attrition = as.factor(Attrition)) %>% # Convert Attrition to a factor
  as.h2o()
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Split the dataset into training and testing sets (80/20)
splits <- h2o.splitFrame(attrition_h2o, ratios = 0.8, seed = 123)
train_h2o <- splits[[1]]
test_h2o <- splits[[2]]

# Define the response and predictors
response <- "Attrition"
predictors <- setdiff(colnames(attrition_raw_tbl), response)

# Train a random forest model in h2o
rf_model <- h2o.randomForest(
  x = predictors,
  y = response,
  training_frame = train_h2o,
  ntrees = 100,
  seed = 123
)
## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [JobRole, MaritalStatus, StandardHours, BusinessTravel, Department, OverTime, Over18, EmployeeCount, Gender, EducationField].
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Print model summary
print(rf_model)
## Model Details:
## ==============
## 
## H2OBinomialModel: drf
## Model ID:  DRF_model_R_1733448940009_1 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1             100                      100              189868        12
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1        20   15.37000        126        170   146.24000
## 
## 
## H2OBinomialMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
## 
## MSE:  0.1211658
## RMSE:  0.3480888
## LogLoss:  0.5028406
## Mean Per-Class Error:  0.3245402
## AUC:  0.7363441
## AUCPR:  0.4099164
## Gini:  0.4726881
## R^2:  0.1270096
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         No Yes    Error       Rate
## No     833 143 0.146516   =143/976
## Yes     98  97 0.502564    =98/195
## Totals 931 240 0.205807  =241/1171
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.275000   0.445977 132
## 2                       max f2  0.135135   0.570113 211
## 3                 max f0point5  0.348837   0.456323  96
## 4                 max accuracy  0.555556   0.844577  25
## 5                max precision  0.800000   1.000000   0
## 6                   max recall  0.000000   1.000000 294
## 7              max specificity  0.800000   1.000000   0
## 8             max absolute_mcc  0.318182   0.329840 105
## 9   max min_per_class_accuracy  0.176471   0.666667 184
## 10 max mean_per_class_accuracy  0.214286   0.678583 163
## 11                     max tns  0.800000 976.000000   0
## 12                     max fns  0.800000 194.000000   0
## 13                     max fps  0.000000 976.000000 294
## 14                     max tps  0.000000 195.000000 294
## 15                     max tnr  0.800000   1.000000   0
## 16                     max fnr  0.800000   0.994872   0
## 17                     max fpr  0.000000   1.000000 294
## 18                     max tpr  0.000000   1.000000 294
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Make predictions on the test set
predictions <- h2o.predict(rf_model, test_h2o)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Evaluate model performance
perf <- h2o.performance(rf_model, newdata = test_h2o)
print(perf)
## H2OBinomialMetrics: drf
## 
## MSE:  0.1060165
## RMSE:  0.3256017
## LogLoss:  0.3619989
## Mean Per-Class Error:  0.330693
## AUC:  0.7345284
## AUCPR:  0.4034395
## Gini:  0.4690569
## R^2:  0.1219216
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         No Yes    Error     Rate
## No     240  17 0.066148  =17/257
## Yes     25  17 0.595238   =25/42
## Totals 265  34 0.140468  =42/299
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.320000   0.447368  20
## 2                       max f2  0.170000   0.553691  38
## 3                 max f0point5  0.370000   0.500000  14
## 4                 max accuracy  0.370000   0.872910  14
## 5                max precision  0.740000   1.000000   0
## 6                   max recall  0.030000   1.000000  53
## 7              max specificity  0.740000   1.000000   0
## 8             max absolute_mcc  0.320000   0.370624  20
## 9   max min_per_class_accuracy  0.190000   0.684825  35
## 10 max mean_per_class_accuracy  0.170000   0.704141  38
## 11                     max tns  0.740000 257.000000   0
## 12                     max fns  0.740000  41.000000   0
## 13                     max fps  0.000000 257.000000  56
## 14                     max tps  0.030000  42.000000  53
## 15                     max tnr  0.740000   1.000000   0
## 16                     max fnr  0.740000   0.976190   0
## 17                     max fpr  0.000000   1.000000  56
## 18                     max tpr  0.030000   1.000000  53
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# 1. ROC Curve
roc_data <- h2o.performance(rf_model, newdata = test_h2o)
roc_df <- data.frame(
  FPR = as.numeric(roc_data@metrics$thresholds_and_metric_scores[, "fpr"]),
  TPR = as.numeric(roc_data@metrics$thresholds_and_metric_scores[, "tpr"])
)

roc_plot <- ggplot(roc_df, aes(x = FPR, y = TPR)) +
  geom_line(color = "blue") +
  geom_abline(linetype = "dashed", color = "red") +
  labs(
    title = "ROC Curve",
    x = "False Positive Rate (FPR)",
    y = "True Positive Rate (TPR)"
  ) +
  theme_minimal()
print(roc_plot)

# 2. Precision-Recall Curve
pr_data <- h2o.performance(rf_model, newdata = test_h2o)
pr_df <- data.frame(
  Recall = as.numeric(pr_data@metrics$thresholds_and_metric_scores[, "recall"]),
  Precision = as.numeric(pr_data@metrics$thresholds_and_metric_scores[, "precision"])
)

pr_plot <- ggplot(pr_df, aes(x = Recall, y = Precision)) +
  geom_line(color = "darkgreen") +
  labs(
    title = "Precision-Recall Curve",
    x = "Recall",
    y = "Precision"
  ) +
  theme_minimal()
print(pr_plot)

# 3. Variable Importance Plot
varimp <- as.data.frame(h2o.varimp(rf_model))
varimp_plot <- ggplot(varimp, aes(x = reorder(variable, relative_importance), y = relative_importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variables",
    y = "Relative Importance"
  ) +
  theme_minimal()
print(varimp_plot)

# Shutdown h2o
h2o.shutdown(prompt = FALSE)

Prompts

Prompt 1:

I have a dataset called attrition_raw_tbl that looks like this.

attrition_raw_tbl %>% glimpse() Rows: 1,470 Columns: 35 $ Age 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29, 31, 34, 28, 29, 32, 22, 53, 38, 24, … $ Attrition “Yes”, “No”, “Yes”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”… $ BusinessTravel “Travel_Rarely”, “Travel_Frequently”, “Travel_Rarely”, “Travel_Frequently”, “Travel_… $ DailyRate 1102, 279, 1373, 1392, 591, 1005, 1324, 1358, 216, 1299, 809, 153, 670, 1346, 103, 1… $ Department ”Sales”, “Research & Development”, “Research & Development”, “Research & Development… $ DistanceFromHome 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, 19, 24, 21, 5, 16, 2, 2, 11, 9, 7, 15, … $ Education 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, 4, 2, 2, 4, 3, 2, 4, 4, 2, 1, 3, 1, 4, … $ EducationField ”Life Sciences”, “Life Sciences”, “Other”, “Life Sciences”, “Medical”, “Life Science… $ EmployeeCount 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, … $ EmployeeNumber 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28… $ EnvironmentSatisfaction 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, 2, 1, 4, 1, 4, 1, 3, 1, 3, 2, 3, 2, 3, … $ Gender ”Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Male”… $ HourlyRate 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 49, 31, 93, 50, 51, 80, 96, 78, 45, 96, … $ JobInvolvement 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, 4, 4, 4, 2, 3, 4, 2, 3, 3, 3, 3, 1, 3, … $ JobLevel 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, 3, 1, 1, 4, 1, 2, 1, 3, 1, 1, 5, 1, 2, … $ JobRole “Sales Executive”, “Research Scientist”, “Laboratory Technician”, “Research Scientis… $ JobSatisfaction 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, 1, 2, 4, 4, 4, 3, 1, 2, 4, 1, 3, 1, 2, … $ MaritalStatus ”Single”, “Married”, “Single”, “Married”, “Married”, “Single”, “Married”, “Divorced”… $ MonthlyIncome 5993, 5130, 2090, 2909, 3468, 3068, 2670, 2693, 9526, 5237, 2426, 4193, 2911, 2661, … $ MonthlyRate 19479, 24907, 2396, 23159, 16632, 11864, 9964, 13335, 8787, 16577, 16479, 12682, 151… $ NumCompaniesWorked 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, 1, 0, 1, 2, 5, 0, 7, 0, 1, 2, 4, 1, 0, … $ Over18 “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”,… $ OverTime “Yes”, “No”, “Yes”, “Yes”, “No”, “No”, “Yes”, “No”, “No”, “No”, “No”, “Yes”, “No”, “… $ PercentSalaryHike 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 12, 17, 11, 14, 11, 12, 13, 16, 11, 18, … $ PerformanceRating 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, … $ RelationshipSatisfaction 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, 3, 4, 2, 3, 3, 4, 2, 3, 4, 3, 4, 2, 4, … $ StandardHours 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, … $ StockOptionLevel 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, … $ TotalWorkingYears 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3, 6, 10, 7, 1, 31, 6, 5, 10, 13, 0, 8, … $ TrainingTimesLastYear 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, 1, 5, 2, 3, 3, 5, 4, 4, 6, 2, 3, 5, 2, … $ WorkLifeBalance 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, … $ YearsAtCompany 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4, 10, 6, 1, 25, 3, 4, 5, 12, 0, 4, 14, 1… $ YearsInCurrentRole 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, 9, 2, 0, 8, 2, 2, 3, 6, 0, 2, 13, 2, 7,… $ YearsSinceLastPromotion 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, 8, 0, 0, 3, 1, 1, 0, 2, 0, 1, 4, 6, 4, … $ YearsWithCurrManager 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, 8, 5, 0, 7, 2, 3, 3, 11, 0, 3, 8, 7, 2,…

The goal is to help predict attrition for employees.

Please write R code to create a predictive model that predicts the probability of attrition.

Prompt 2:

Please write R code to create a classification model that predicts the probability of attrition.

Prompt 3:

could you update that with the h2o model as well as tidy models package

Prompt 4:

please correct this error within the h2o model : Error in .h2o.doSafeREST(h2oRestApiVersion = h2oRestApiVersion, urlSuffix = page, :

ERROR MESSAGE:

Illegal argument(s) for DRF model: DRF_model_R_1733406716253_1. Details: ERRR on field: _response_column: Use numerical, categorical or time variable. Currently used String

Prompt 5:

Error in step_dummy(): Caused by error in bake(): ! Only one factor level in col_name: Y.

Prompt 6:

would you be able to add graphs to the evalution of the model?