library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ recipes 1.1.0
## ✔ dials 1.3.0 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'dials' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'infer' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'parsnip' was built under R version 4.3.3
## Warning: package 'recipes' was built under R version 4.3.3
## Warning: package 'rsample' was built under R version 4.3.3
## Warning: package 'tune' was built under R version 4.3.3
## Warning: package 'workflows' was built under R version 4.3.3
## Warning: package 'workflowsets' was built under R version 4.3.3
## Warning: package 'yardstick' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(h2o)
## Warning: package 'h2o' was built under R version 4.3.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidyverse)
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ lubridate::day() masks h2o::day()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ lubridate::hour() masks h2o::hour()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::month() masks h2o::month()
## ✖ readr::spec() masks yardstick::spec()
## ✖ lubridate::week() masks h2o::week()
## ✖ lubridate::year() masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
attrition_raw_tbl <- read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# If data is not sensitive:
attrition_raw_tbl %>% glimpse()
## Rows: 1,470
## Columns: 35
## $ Age <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
## $ Attrition <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "…
## $ BusinessTravel <chr> "Travel_Rarely", "Travel_Frequently", "Travel…
## $ DailyRate <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
## $ Department <chr> "Sales", "Research & Development", "Research …
## $ DistanceFromHome <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
## $ Education <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, …
## $ EducationField <chr> "Life Sciences", "Life Sciences", "Other", "L…
## $ EmployeeCount <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EmployeeNumber <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16,…
## $ EnvironmentSatisfaction <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, …
## $ Gender <chr> "Female", "Male", "Male", "Female", "Male", "…
## $ HourlyRate <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
## $ JobInvolvement <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, …
## $ JobLevel <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
## $ JobRole <chr> "Sales Executive", "Research Scientist", "Lab…
## $ JobSatisfaction <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, …
## $ MaritalStatus <chr> "Single", "Married", "Single", "Married", "Ma…
## $ MonthlyIncome <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
## $ MonthlyRate <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
## $ NumCompaniesWorked <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
## $ Over18 <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ PercentSalaryHike <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
## $ PerformanceRating <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, …
## $ StandardHours <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ StockOptionLevel <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
## $ TotalWorkingYears <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
## $ TrainingTimesLastYear <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
## $ WorkLifeBalance <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, …
## $ YearsAtCompany <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
## $ YearsInCurrentRole <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
## $ YearsSinceLastPromotion <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
## $ YearsWithCurrManager <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …
# If data is sensitive:
attrition_raw_tbl %>%
slice(0) %>%
glimpse()
## Rows: 0
## Columns: 35
## $ Age <dbl>
## $ Attrition <chr>
## $ BusinessTravel <chr>
## $ DailyRate <dbl>
## $ Department <chr>
## $ DistanceFromHome <dbl>
## $ Education <dbl>
## $ EducationField <chr>
## $ EmployeeCount <dbl>
## $ EmployeeNumber <dbl>
## $ EnvironmentSatisfaction <dbl>
## $ Gender <chr>
## $ HourlyRate <dbl>
## $ JobInvolvement <dbl>
## $ JobLevel <dbl>
## $ JobRole <chr>
## $ JobSatisfaction <dbl>
## $ MaritalStatus <chr>
## $ MonthlyIncome <dbl>
## $ MonthlyRate <dbl>
## $ NumCompaniesWorked <dbl>
## $ Over18 <chr>
## $ OverTime <chr>
## $ PercentSalaryHike <dbl>
## $ PerformanceRating <dbl>
## $ RelationshipSatisfaction <dbl>
## $ StandardHours <dbl>
## $ StockOptionLevel <dbl>
## $ TotalWorkingYears <dbl>
## $ TrainingTimesLastYear <dbl>
## $ WorkLifeBalance <dbl>
## $ YearsAtCompany <dbl>
## $ YearsInCurrentRole <dbl>
## $ YearsSinceLastPromotion <dbl>
## $ YearsWithCurrManager <dbl>
# Load necessary libraries
library(h2o)
library(dplyr)
library(ggplot2)
# Initialize h2o
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\ktqua\AppData\Local\Temp\RtmpcrVQY3\file8a8c1e2ae5b/h2o_ktqua_started_from_r.out
## C:\Users\ktqua\AppData\Local\Temp\RtmpcrVQY3\file8a8c35dcf8f/h2o_ktqua_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 816 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 11 months and 15 days
## H2O cluster name: H2O_started_from_R_ktqua_jiv585
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.91 GB
## H2O cluster total cores: 16
## H2O cluster allowed cores: 16
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (11 months and 15 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert necessary columns to factors and ensure the response column is categorical
attrition_h2o <- attrition_raw_tbl %>%
mutate(Attrition = as.factor(Attrition)) %>% # Convert Attrition to a factor
as.h2o()
##
|
| | 0%
|
|======================================================================| 100%
# Split the dataset into training and testing sets (80/20)
splits <- h2o.splitFrame(attrition_h2o, ratios = 0.8, seed = 123)
train_h2o <- splits[[1]]
test_h2o <- splits[[2]]
# Define the response and predictors
response <- "Attrition"
predictors <- setdiff(colnames(attrition_raw_tbl), response)
# Train a random forest model in h2o
rf_model <- h2o.randomForest(
x = predictors,
y = response,
training_frame = train_h2o,
ntrees = 100,
seed = 123
)
## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [JobRole, MaritalStatus, StandardHours, BusinessTravel, Department, OverTime, Over18, EmployeeCount, Gender, EducationField].
##
|
| | 0%
|
|======================================================================| 100%
# Print model summary
print(rf_model)
## Model Details:
## ==============
##
## H2OBinomialModel: drf
## Model ID: DRF_model_R_1733448940009_1
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 100 100 189868 12
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 20 15.37000 126 170 146.24000
##
##
## H2OBinomialMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
##
## MSE: 0.1211658
## RMSE: 0.3480888
## LogLoss: 0.5028406
## Mean Per-Class Error: 0.3245402
## AUC: 0.7363441
## AUCPR: 0.4099164
## Gini: 0.4726881
## R^2: 0.1270096
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## No Yes Error Rate
## No 833 143 0.146516 =143/976
## Yes 98 97 0.502564 =98/195
## Totals 931 240 0.205807 =241/1171
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.275000 0.445977 132
## 2 max f2 0.135135 0.570113 211
## 3 max f0point5 0.348837 0.456323 96
## 4 max accuracy 0.555556 0.844577 25
## 5 max precision 0.800000 1.000000 0
## 6 max recall 0.000000 1.000000 294
## 7 max specificity 0.800000 1.000000 0
## 8 max absolute_mcc 0.318182 0.329840 105
## 9 max min_per_class_accuracy 0.176471 0.666667 184
## 10 max mean_per_class_accuracy 0.214286 0.678583 163
## 11 max tns 0.800000 976.000000 0
## 12 max fns 0.800000 194.000000 0
## 13 max fps 0.000000 976.000000 294
## 14 max tps 0.000000 195.000000 294
## 15 max tnr 0.800000 1.000000 0
## 16 max fnr 0.800000 0.994872 0
## 17 max fpr 0.000000 1.000000 294
## 18 max tpr 0.000000 1.000000 294
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Make predictions on the test set
predictions <- h2o.predict(rf_model, test_h2o)
##
|
| | 0%
|
|======================================================================| 100%
# Evaluate model performance
perf <- h2o.performance(rf_model, newdata = test_h2o)
print(perf)
## H2OBinomialMetrics: drf
##
## MSE: 0.1060165
## RMSE: 0.3256017
## LogLoss: 0.3619989
## Mean Per-Class Error: 0.330693
## AUC: 0.7345284
## AUCPR: 0.4034395
## Gini: 0.4690569
## R^2: 0.1219216
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## No Yes Error Rate
## No 240 17 0.066148 =17/257
## Yes 25 17 0.595238 =25/42
## Totals 265 34 0.140468 =42/299
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.320000 0.447368 20
## 2 max f2 0.170000 0.553691 38
## 3 max f0point5 0.370000 0.500000 14
## 4 max accuracy 0.370000 0.872910 14
## 5 max precision 0.740000 1.000000 0
## 6 max recall 0.030000 1.000000 53
## 7 max specificity 0.740000 1.000000 0
## 8 max absolute_mcc 0.320000 0.370624 20
## 9 max min_per_class_accuracy 0.190000 0.684825 35
## 10 max mean_per_class_accuracy 0.170000 0.704141 38
## 11 max tns 0.740000 257.000000 0
## 12 max fns 0.740000 41.000000 0
## 13 max fps 0.000000 257.000000 56
## 14 max tps 0.030000 42.000000 53
## 15 max tnr 0.740000 1.000000 0
## 16 max fnr 0.740000 0.976190 0
## 17 max fpr 0.000000 1.000000 56
## 18 max tpr 0.030000 1.000000 53
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# 1. ROC Curve
roc_data <- h2o.performance(rf_model, newdata = test_h2o)
roc_df <- data.frame(
FPR = as.numeric(roc_data@metrics$thresholds_and_metric_scores[, "fpr"]),
TPR = as.numeric(roc_data@metrics$thresholds_and_metric_scores[, "tpr"])
)
roc_plot <- ggplot(roc_df, aes(x = FPR, y = TPR)) +
geom_line(color = "blue") +
geom_abline(linetype = "dashed", color = "red") +
labs(
title = "ROC Curve",
x = "False Positive Rate (FPR)",
y = "True Positive Rate (TPR)"
) +
theme_minimal()
print(roc_plot)
# 2. Precision-Recall Curve
pr_data <- h2o.performance(rf_model, newdata = test_h2o)
pr_df <- data.frame(
Recall = as.numeric(pr_data@metrics$thresholds_and_metric_scores[, "recall"]),
Precision = as.numeric(pr_data@metrics$thresholds_and_metric_scores[, "precision"])
)
pr_plot <- ggplot(pr_df, aes(x = Recall, y = Precision)) +
geom_line(color = "darkgreen") +
labs(
title = "Precision-Recall Curve",
x = "Recall",
y = "Precision"
) +
theme_minimal()
print(pr_plot)
# 3. Variable Importance Plot
varimp <- as.data.frame(h2o.varimp(rf_model))
varimp_plot <- ggplot(varimp, aes(x = reorder(variable, relative_importance), y = relative_importance)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = "Variable Importance",
x = "Variables",
y = "Relative Importance"
) +
theme_minimal()
print(varimp_plot)
# Shutdown h2o
h2o.shutdown(prompt = FALSE)
Prompts
I have a dataset called attrition_raw_tbl that looks like this.
attrition_raw_tbl %>% glimpse() Rows: 1,470 Columns: 35 $ Age
41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29, 31, 34, 28, 29, 32, 22, 53, 38, 24, … $ Attrition “Yes”, “No”, “Yes”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”… $ BusinessTravel “Travel_Rarely”, “Travel_Frequently”, “Travel_Rarely”, “Travel_Frequently”, “Travel_… $ DailyRate 1102, 279, 1373, 1392, 591, 1005, 1324, 1358, 216, 1299, 809, 153, 670, 1346, 103, 1… $ Department ”Sales”, “Research & Development”, “Research & Development”, “Research & Development… $ DistanceFromHome 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, 19, 24, 21, 5, 16, 2, 2, 11, 9, 7, 15, … $ Education 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, 4, 2, 2, 4, 3, 2, 4, 4, 2, 1, 3, 1, 4, … $ EducationField ”Life Sciences”, “Life Sciences”, “Other”, “Life Sciences”, “Medical”, “Life Science… $ EmployeeCount 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, … $ EmployeeNumber 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28… $ EnvironmentSatisfaction 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, 2, 1, 4, 1, 4, 1, 3, 1, 3, 2, 3, 2, 3, … $ Gender ”Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Male”… $ HourlyRate 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 49, 31, 93, 50, 51, 80, 96, 78, 45, 96, … $ JobInvolvement 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, 4, 4, 4, 2, 3, 4, 2, 3, 3, 3, 3, 1, 3, … $ JobLevel 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, 3, 1, 1, 4, 1, 2, 1, 3, 1, 1, 5, 1, 2, … $ JobRole “Sales Executive”, “Research Scientist”, “Laboratory Technician”, “Research Scientis… $ JobSatisfaction 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, 1, 2, 4, 4, 4, 3, 1, 2, 4, 1, 3, 1, 2, … $ MaritalStatus ”Single”, “Married”, “Single”, “Married”, “Married”, “Single”, “Married”, “Divorced”… $ MonthlyIncome 5993, 5130, 2090, 2909, 3468, 3068, 2670, 2693, 9526, 5237, 2426, 4193, 2911, 2661, … $ MonthlyRate 19479, 24907, 2396, 23159, 16632, 11864, 9964, 13335, 8787, 16577, 16479, 12682, 151… $ NumCompaniesWorked 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, 1, 0, 1, 2, 5, 0, 7, 0, 1, 2, 4, 1, 0, … $ Over18 “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”,… $ OverTime “Yes”, “No”, “Yes”, “Yes”, “No”, “No”, “Yes”, “No”, “No”, “No”, “No”, “Yes”, “No”, “… $ PercentSalaryHike 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 12, 17, 11, 14, 11, 12, 13, 16, 11, 18, … $ PerformanceRating 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, … $ RelationshipSatisfaction 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, 3, 4, 2, 3, 3, 4, 2, 3, 4, 3, 4, 2, 4, … $ StandardHours 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, … $ StockOptionLevel 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, … $ TotalWorkingYears 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3, 6, 10, 7, 1, 31, 6, 5, 10, 13, 0, 8, … $ TrainingTimesLastYear 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, 1, 5, 2, 3, 3, 5, 4, 4, 6, 2, 3, 5, 2, … $ WorkLifeBalance 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, … $ YearsAtCompany 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4, 10, 6, 1, 25, 3, 4, 5, 12, 0, 4, 14, 1… $ YearsInCurrentRole 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, 9, 2, 0, 8, 2, 2, 3, 6, 0, 2, 13, 2, 7,… $ YearsSinceLastPromotion 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, 8, 0, 0, 3, 1, 1, 0, 2, 0, 1, 4, 6, 4, … $ YearsWithCurrManager 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, 8, 5, 0, 7, 2, 3, 3, 11, 0, 3, 8, 7, 2,…
The goal is to help predict attrition for employees.
Please write R code to create a predictive model that predicts the probability of attrition.
Please write R code to create a classification model that predicts the probability of attrition.
could you update that with the h2o model as well as tidy models package
please correct this error within the h2o model : Error in .h2o.doSafeREST(h2oRestApiVersion = h2oRestApiVersion, urlSuffix = page, :
ERROR MESSAGE:
Illegal argument(s) for DRF model: DRF_model_R_1733406716253_1. Details: ERRR on field: _response_column: Use numerical, categorical or time variable. Currently used String
Error in step_dummy(): Caused by error in bake(): ! Only one factor level in col_name: Y.
would you be able to add graphs to the evalution of the model?