CodeAlong12

R Markdown

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

attrition_raw_tbl <- read_csv("../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# If data is not sensitive:
attrition_raw_tbl %>% glimpse()

## Rows: 1,470
## Columns: 35
## $ Age                      <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
## $ Attrition                <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "…
## $ BusinessTravel           <chr> "Travel_Rarely", "Travel_Frequently", "Travel…
## $ DailyRate                <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
## $ Department               <chr> "Sales", "Research & Development", "Research …
## $ DistanceFromHome         <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
## $ Education                <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, …
## $ EducationField           <chr> "Life Sciences", "Life Sciences", "Other", "L…
## $ EmployeeCount            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EmployeeNumber           <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16,…
## $ EnvironmentSatisfaction  <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, …
## $ Gender                   <chr> "Female", "Male", "Male", "Female", "Male", "…
## $ HourlyRate               <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
## $ JobInvolvement           <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, …
## $ JobLevel                 <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
## $ JobRole                  <chr> "Sales Executive", "Research Scientist", "Lab…
## $ JobSatisfaction          <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, …
## $ MaritalStatus            <chr> "Single", "Married", "Single", "Married", "Ma…
## $ MonthlyIncome            <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
## $ MonthlyRate              <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
## $ NumCompaniesWorked       <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
## $ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime                 <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ PercentSalaryHike        <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
## $ PerformanceRating        <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, …
## $ StandardHours            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ StockOptionLevel         <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
## $ TotalWorkingYears        <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
## $ TrainingTimesLastYear    <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
## $ WorkLifeBalance          <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, …
## $ YearsAtCompany           <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
## $ YearsInCurrentRole       <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
## $ YearsSinceLastPromotion  <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
## $ YearsWithCurrManager     <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …

# If data is sensitive:
attrition_raw_tbl %>%
    slice(0) %>%
    glimpse()

## Rows: 0
## Columns: 35
## $ Age                      <dbl> 
## $ Attrition                <chr> 
## $ BusinessTravel           <chr> 
## $ DailyRate                <dbl> 
## $ Department               <chr> 
## $ DistanceFromHome         <dbl> 
## $ Education                <dbl> 
## $ EducationField           <chr> 
## $ EmployeeCount            <dbl> 
## $ EmployeeNumber           <dbl> 
## $ EnvironmentSatisfaction  <dbl> 
## $ Gender                   <chr> 
## $ HourlyRate               <dbl> 
## $ JobInvolvement           <dbl> 
## $ JobLevel                 <dbl> 
## $ JobRole                  <chr> 
## $ JobSatisfaction          <dbl> 
## $ MaritalStatus            <chr> 
## $ MonthlyIncome            <dbl> 
## $ MonthlyRate              <dbl> 
## $ NumCompaniesWorked       <dbl> 
## $ Over18                   <chr> 
## $ OverTime                 <chr> 
## $ PercentSalaryHike        <dbl> 
## $ PerformanceRating        <dbl> 
## $ RelationshipSatisfaction <dbl> 
## $ StandardHours            <dbl> 
## $ StockOptionLevel         <dbl> 
## $ TotalWorkingYears        <dbl> 
## $ TrainingTimesLastYear    <dbl> 
## $ WorkLifeBalance          <dbl> 
## $ YearsAtCompany           <dbl> 
## $ YearsInCurrentRole       <dbl> 
## $ YearsSinceLastPromotion  <dbl> 
## $ YearsWithCurrManager     <dbl>

Prompt 1:

I have a dataset called attrition_raw_tbl that looks like this.

attrition_raw_tbl %>% glimpse() Rows: 1,470 Columns: 35 $ Age 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29, 31, 34, 28, 29, 32, 22, 53, 38, 24, … $ Attrition “Yes”, “No”, “Yes”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”… $ BusinessTravel “Travel_Rarely”, “Travel_Frequently”, “Travel_Rarely”, “Travel_Frequently”, “Travel_… $ DailyRate 1102, 279, 1373, 1392, 591, 1005, 1324, 1358, 216, 1299, 809, 153, 670, 1346, 103, 1… $ Department ”Sales”, “Research & Development”, “Research & Development”, “Research & Development… $ DistanceFromHome 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, 19, 24, 21, 5, 16, 2, 2, 11, 9, 7, 15, … $ Education 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, 4, 2, 2, 4, 3, 2, 4, 4, 2, 1, 3, 1, 4, … $ EducationField ”Life Sciences”, “Life Sciences”, “Other”, “Life Sciences”, “Medical”, “Life Science… $ EmployeeCount 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, … $ EmployeeNumber 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28… $ EnvironmentSatisfaction 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, 2, 1, 4, 1, 4, 1, 3, 1, 3, 2, 3, 2, 3, … $ Gender ”Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Male”… $ HourlyRate 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 49, 31, 93, 50, 51, 80, 96, 78, 45, 96, … $ JobInvolvement 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, 4, 4, 4, 2, 3, 4, 2, 3, 3, 3, 3, 1, 3, … $ JobLevel 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, 3, 1, 1, 4, 1, 2, 1, 3, 1, 1, 5, 1, 2, … $ JobRole “Sales Executive”, “Research Scientist”, “Laboratory Technician”, “Research Scientis… $ JobSatisfaction 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, 1, 2, 4, 4, 4, 3, 1, 2, 4, 1, 3, 1, 2, … $ MaritalStatus ”Single”, “Married”, “Single”, “Married”, “Married”, “Single”, “Married”, “Divorced”… $ MonthlyIncome 5993, 5130, 2090, 2909, 3468, 3068, 2670, 2693, 9526, 5237, 2426, 4193, 2911, 2661, … $ MonthlyRate 19479, 24907, 2396, 23159, 16632, 11864, 9964, 13335, 8787, 16577, 16479, 12682, 151… $ NumCompaniesWorked 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, 1, 0, 1, 2, 5, 0, 7, 0, 1, 2, 4, 1, 0, … $ Over18 “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”,… $ OverTime “Yes”, “No”, “Yes”, “Yes”, “No”, “No”, “Yes”, “No”, “No”, “No”, “No”, “Yes”, “No”, “… $ PercentSalaryHike 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 12, 17, 11, 14, 11, 12, 13, 16, 11, 18, … $ PerformanceRating 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, … $ RelationshipSatisfaction 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, 3, 4, 2, 3, 3, 4, 2, 3, 4, 3, 4, 2, 4, … $ StandardHours 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, … $ StockOptionLevel 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, … $ TotalWorkingYears 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3, 6, 10, 7, 1, 31, 6, 5, 10, 13, 0, 8, … $ TrainingTimesLastYear 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, 1, 5, 2, 3, 3, 5, 4, 4, 6, 2, 3, 5, 2, … $ WorkLifeBalance 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, … $ YearsAtCompany 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4, 10, 6, 1, 25, 3, 4, 5, 12, 0, 4, 14, 1… $ YearsInCurrentRole 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, 9, 2, 0, 8, 2, 2, 3, 6, 0, 2, 13, 2, 7,… $ YearsSinceLastPromotion 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, 8, 0, 0, 3, 1, 1, 0, 2, 0, 1, 4, 6, 4, … $ YearsWithCurrManager 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, 8, 5, 0, 7, 2, 3, 3, 11, 0, 3, 8, 7, 2,…

The goal is to help predict attrition for employees.

Please write R code to create a predictive model that predicts the probability of attrition.

Prompt 2:

Please update the code to use tidymodels instead of caret and to use the h2o model instead of glmnet.

Prompt 3:

Error in .h2o.doSafeREST(h2oRestApiVersion = h2oRestApiVersion, urlSuffix = page, :

Prompt 4:

Please update the code to use h2o.performance in Step 5, instead of mean.

Prompt 5:

Error in createDataPartition(attrition_raw_tbl$Attrition, p = 0.8, list = FALSE) : could not find function “createDataPartition”

Prompt 6:

Error in $<-: ! Assigned data ifelse(testData$predicted_prob > 0.5, "Yes", "No") must be compatible with existing data. ✖ Existing data has 294 rows. ✖ Assigned data has 0 rows. ℹ Only vectors of size 1 are recycled. Caused by error in vectbl_recycle_rhs_rows(): ! Can’t recycle input of size 0 to size 294.

# Load necessary libraries
library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.8     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.2
## ✔ recipes      1.1.0

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following object is masked from 'package:h2o':
## 
##     var

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Initialize h2o
h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 days 16 hours 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year, 4 months and 4 days 
##     H2O cluster name:           H2O_started_from_R_alyssadalessio_fyb567 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   2.69 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.1 (2024-06-14)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year, 4 months and 4 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Data Preprocessing
# Convert categorical variables to factors using tidymodels
attrition_raw_tbl <- attrition_raw_tbl %>%
  mutate(
    Attrition = factor(Attrition, levels = c("No", "Yes")),
    BusinessTravel = factor(BusinessTravel),
    Department = factor(Department),
    EducationField = factor(EducationField),
    Gender = factor(Gender),
    JobRole = factor(JobRole),
    MaritalStatus = factor(MaritalStatus),
    Over18 = factor(Over18),
    OverTime = factor(OverTime)
  )

# Split data into training and testing sets using rsample
set.seed(123)  # For reproducibility
split <- initial_split(attrition_raw_tbl, prop = 0.8)
trainData <- training(split)
testData <- testing(split)

# Convert data to H2O frames
train_h2o <- as.h2o(trainData)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

test_h2o <- as.h2o(testData)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Check the number of rows in test_h2o
cat("Number of rows in test_h2o: ", nrow(test_h2o), "\n")

## Number of rows in test_h2o:  294

cat("Number of rows in testData: ", nrow(testData), "\n")

## Number of rows in testData:  294

# Define the target and features
target <- "Attrition"
features <- setdiff(names(trainData), target)

# Build the H2O logistic regression model
model_h2o <- h2o.glm(
  y = target,
  x = features,
  training_frame = train_h2o,
  family = "binomial",
  lambda = 0,  # No regularization (you can tune this parameter)
  alpha = 0,   # Elastic net mixing parameter (0 for ridge)
  nfolds = 5    # Cross-validation
)

## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [StandardHours, EmployeeCount, Over18].

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# View model summary
summary(model_h2o)

## Model Details:
## ==============
## 
## H2OBinomialModel: glm
## Model Key:  GLM_model_R_1745361192601_17641 
## GLM Model: summary
##     family  link regularization number_of_predictors_total
## 1 binomial logit           None                         45
##   number_of_active_predictors number_of_iterations       training_frame
## 1                          45                   10 trainData_sid_887b_1
## 
## H2OBinomialMetrics: glm
## ** Reported on training data. **
## 
## MSE:  0.08696564
## RMSE:  0.2948994
## LogLoss:  0.3006821
## Mean Per-Class Error:  0.2333671
## AUC:  0.8589789
## AUCPR:  0.6708205
## Gini:  0.7179578
## R^2:  0.3552618
## Residual Deviance:  707.2042
## AIC:  799.2042
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         No Yes    Error       Rate
## No     918  69 0.069909    =69/987
## Yes     75 114 0.396825    =75/189
## Totals 993 183 0.122449  =144/1176
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.344488   0.612903 138
## 2                       max f2  0.190643   0.670391 213
## 3                 max f0point5  0.451832   0.682148 102
## 4                 max accuracy  0.451832   0.892857 102
## 5                max precision  0.982678   1.000000   0
## 6                   max recall  0.004128   1.000000 390
## 7              max specificity  0.982678   1.000000   0
## 8             max absolute_mcc  0.435947   0.560171 107
## 9   max min_per_class_accuracy  0.167952   0.780142 231
## 10 max mean_per_class_accuracy  0.190643   0.792806 213
## 11                     max tns  0.982678 987.000000   0
## 12                     max fns  0.982678 188.000000   0
## 13                     max fps  0.000021 987.000000 399
## 14                     max tps  0.004128 189.000000 390
## 15                     max tnr  0.982678   1.000000   0
## 16                     max fnr  0.982678   0.994709   0
## 17                     max fpr  0.000021   1.000000 399
## 18                     max tpr  0.004128   1.000000 390
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## 
## H2OBinomialMetrics: glm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.1023938
## RMSE:  0.3199904
## LogLoss:  0.3514576
## Mean Per-Class Error:  0.2679838
## AUC:  0.8119951
## AUCPR:  0.5510294
## Gini:  0.6239902
## R^2:  0.2408817
## Residual Deviance:  826.6283
## AIC:  918.6283
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         No Yes    Error       Rate
## No     881 106 0.107396   =106/987
## Yes     81 108 0.428571    =81/189
## Totals 962 214 0.159014  =187/1176
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.323280   0.535980 151
## 2                       max f2  0.113592   0.614754 267
## 3                 max f0point5  0.585593   0.578842  65
## 4                 max accuracy  0.585593   0.871599  65
## 5                max precision  0.981588   1.000000   0
## 6                   max recall  0.001384   1.000000 396
## 7              max specificity  0.981588   1.000000   0
## 8             max absolute_mcc  0.338953   0.442178 145
## 9   max min_per_class_accuracy  0.144397   0.739615 242
## 10 max mean_per_class_accuracy  0.193076   0.744118 212
## 11                     max tns  0.981588 987.000000   0
## 12                     max fns  0.981588 188.000000   0
## 13                     max fps  0.000094 987.000000 399
## 14                     max tps  0.001384 189.000000 396
## 15                     max tnr  0.981588   1.000000   0
## 16                     max fnr  0.981588   0.994709   0
## 17                     max fpr  0.000094   1.000000 399
## 18                     max tpr  0.001384   1.000000 396
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                mean       sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy   0.849814 0.010163   0.842105   0.854626   0.837838   0.863436
## auc        0.809927 0.038092   0.781974   0.812690   0.820379   0.866271
## err        0.150186 0.010163   0.157895   0.145374   0.162162   0.136564
## err_count 35.400000 4.159327  36.000000  33.000000  42.000000  31.000000
## f0point5   0.536200 0.095529   0.492958   0.470588   0.454545   0.683962
##           cv_5_valid
## accuracy    0.851064
## auc         0.768320
## err         0.148936
## err_count  35.000000
## f0point5    0.578947
## 
## ---
##                         mean        sd cv_1_valid cv_2_valid cv_3_valid
## precision           0.532101  0.116165   0.466667   0.457143   0.434783
## r2                  0.228639  0.073650   0.223188   0.149120   0.217736
## recall              0.570646  0.048104   0.636364   0.533333   0.555556
## residual_deviance 165.325670 19.595740 157.863970 144.097230 162.885930
## rmse                0.319939  0.015158   0.310097   0.312394   0.305971
## specificity         0.903825  0.024136   0.876923   0.903553   0.883408
##                   cv_4_valid cv_5_valid
## precision           0.707317   0.594595
## r2                  0.349478   0.203673
## recall              0.604167   0.523810
## residual_deviance 164.471700 197.309520
## rmse                0.329346   0.341886
## specificity         0.932961   0.922280
## 
## Scoring History: 
##              timestamp   duration iterations negative_log_likelihood objective
## 1  2025-04-25 11:11:24  0.000 sec          0               518.44246   0.44085
## 2  2025-04-25 11:11:24  0.001 sec          1               384.28105   0.32677
## 3  2025-04-25 11:11:24  0.002 sec          2               357.07534   0.30364
## 4  2025-04-25 11:11:24  0.003 sec          3               353.83048   0.30088
## 5  2025-04-25 11:11:24  0.004 sec          4               353.64076   0.30071
## 6  2025-04-25 11:11:24  0.005 sec          5               353.61595   0.30069
## 7  2025-04-25 11:11:24  0.006 sec          6               353.60714   0.30069
## 8  2025-04-25 11:11:24  0.007 sec          7               353.60391   0.30068
## 9  2025-04-25 11:11:24  0.008 sec          8               353.60272   0.30068
## 10 2025-04-25 11:11:24  0.009 sec          9               353.60228   0.30068
## 11 2025-04-25 11:11:24  0.010 sec         10               353.60212   0.30068
##    training_rmse training_logloss training_r2 training_auc training_pr_auc
## 1             NA               NA          NA           NA              NA
## 2             NA               NA          NA           NA              NA
## 3             NA               NA          NA           NA              NA
## 4             NA               NA          NA           NA              NA
## 5             NA               NA          NA           NA              NA
## 6             NA               NA          NA           NA              NA
## 7             NA               NA          NA           NA              NA
## 8             NA               NA          NA           NA              NA
## 9             NA               NA          NA           NA              NA
## 10            NA               NA          NA           NA              NA
## 11       0.29490          0.30068     0.35526      0.85898         0.67082
##    training_lift training_classification_error
## 1             NA                            NA
## 2             NA                            NA
## 3             NA                            NA
## 4             NA                            NA
## 5             NA                            NA
## 6             NA                            NA
## 7             NA                            NA
## 8             NA                            NA
## 9             NA                            NA
## 10            NA                            NA
## 11       6.22222                       0.12245
## 
## Variable Importances: (Extract with `h2o.varimp`) 
## =================================================
## 
## Variable Importances: 
##                            variable relative_importance scaled_importance
## 1           JobRole.Human Resources            9.873778          1.000000
## 2                  Department.Sales            9.121917          0.923853
## 3 Department.Research & Development            8.987588          0.910248
## 4      JobRole.Sales Representative            1.798961          0.182196
## 5  BusinessTravel.Travel_Frequently            1.794498          0.181744
##   percentage
## 1   0.195935
## 2   0.181015
## 3   0.178349
## 4   0.035698
## 5   0.035610
## 
## ---
##           variable relative_importance scaled_importance percentage
## 40       DailyRate            0.056780          0.005751   0.001127
## 41 JobRole.Manager            0.049918          0.005056   0.000991
## 42   MonthlyIncome            0.037504          0.003798   0.000744
## 43      HourlyRate            0.021391          0.002166   0.000424
## 44       Education            0.015607          0.001581   0.000310
## 45        JobLevel            0.001831          0.000185   0.000036

# Make predictions on the test data
predictions <- h2o.predict(model_h2o, test_h2o)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Check the structure of predictions
cat("Predictions structure:\n")

## Predictions structure:

str(predictions)

## Class 'H2OFrame' <environment: 0x7fc2e5bed768> 
##  - attr(*, "op")= chr "transformation_811f_GLM_model_R_1745361192601_17641_on_testData_sid_887b_3"
##  - attr(*, "id")= chr "transformation_811f_GLM_model_R_1745361192601_17641_on_testData_sid_887b_3"
##  - attr(*, "eval")= logi FALSE
##  - attr(*, "nrow")= int 294
##  - attr(*, "ncol")= int 3
##  - attr(*, "types")=List of 3
##   ..$ : chr "enum"
##   ..$ : chr "real"
##   ..$ : chr "real"
##  - attr(*, "data")='data.frame': 10 obs. of  3 variables:
##   ..$ predict: Factor w/ 2 levels "No","Yes": 2 1 2 2 1 2 1 2 1 1
##   ..$ No     : num  0.324 0.95 0.135 0.451 0.99 ...
##   ..$ Yes    : num  0.6755 0.0503 0.8645 0.5492 0.0105 ...

# Make sure to extract the correct column (probabilities)
# For binary classification, the predicted probability is usually in the second column of the output
predicted_prob <- as.data.frame(predictions)[, 3]  # p1 corresponds to probability of "Yes" class in binary classification

# Check the length of predicted probabilities
cat("Length of predicted_prob: ", length(predicted_prob), "\n")

## Length of predicted_prob:  294

# Ensure that the number of rows in predicted_prob matches the number of rows in testData
if (length(predicted_prob) != nrow(testData)) {
  stop("Mismatch in the number of predictions and the number of test data rows.")
}

# Add predictions to the test data for evaluation
testData$predicted_prob <- predicted_prob
testData$predicted_class <- ifelse(testData$predicted_prob > 0.5, "Yes", "No")

# Evaluate the model performance using h2o.performance
perf <- h2o.performance(model_h2o, newdata = test_h2o)

# Print the model performance metrics
print(perf)

## H2OBinomialMetrics: glm
## 
## MSE:  0.08165735
## RMSE:  0.2857575
## LogLoss:  0.2748423
## Mean Per-Class Error:  0.2202744
## AUC:  0.8923611
## AUCPR:  0.7098507
## Gini:  0.7847222
## R^2:  0.4022582
## Residual Deviance:  161.6073
## AIC:  253.6073
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         No Yes    Error     Rate
## No     235  11 0.044715  =11/246
## Yes     19  29 0.395833   =19/48
## Totals 254  40 0.102041  =30/294
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.451053   0.659091  39
## 2                       max f2  0.128712   0.716724 100
## 3                 max f0point5  0.549232   0.731707  28
## 4                 max accuracy  0.549232   0.901361  28
## 5                max precision  0.914289   1.000000   0
## 6                   max recall  0.025455   1.000000 216
## 7              max specificity  0.914289   1.000000   0
## 8             max absolute_mcc  0.451053   0.603120  39
## 9   max min_per_class_accuracy  0.173280   0.812500  84
## 10 max mean_per_class_accuracy  0.128712   0.817581 100
## 11                     max tns  0.914289 246.000000   0
## 12                     max fns  0.914289  47.000000   0
## 13                     max fps  0.000004 246.000000 293
## 14                     max tps  0.025455  48.000000 216
## 15                     max tnr  0.914289   1.000000   0
## 16                     max fnr  0.914289   0.979167   0
## 17                     max fpr  0.000004   1.000000 293
## 18                     max tpr  0.025455   1.000000 216
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

# For example, print the AUC (Area Under the Curve)
print(paste("AUC:", h2o.auc(perf)))

## [1] "AUC: 0.892361111111111"

# You can also print other metrics like accuracy, confusion matrix, etc.
print(paste("Accuracy:", h2o.accuracy(perf)))

## [1] "Accuracy: c(0.914288536036572, 0.883223795994258, 0.864536152214014, 0.862203765665762, 0.857739838095972, 0.851075311260739, 0.843522786343821, 0.83800428856617, 0.821571383112286, 0.769590154069416, 0.745507816131226, 0.716943904036953, 0.713865332885823, 0.704502201243352, 0.693267083158515, 0.675531804734526, 0.660881456337208, 0.648777566704479, 0.643900296779577, 0.634442995163679, 0.631861690827628, 0.625099057868352, 0.592563985793918, 0.583343294747386, 0.577504398345462, 0.569217828967198, 0.565271951619162, \n0.560318493114365, 0.549232214506552, 0.506059787568781, 0.504586937547718, 0.48832567001815, 0.480399993196149, 0.476258488959353, 0.463391982545745, 0.462836412774303, 0.460158399653855, 0.458134292187738, 0.45327525340248, 0.451052915904577, 0.450414640581968, 0.428967711997486, 0.417365710559858, 0.404982713274323, 0.393283021992738, 0.380153573087424, 0.374978480004284, 0.374922677225067, 0.373365613282202, 0.364950020033924, 0.336744959532501, 0.325705243395976, 0.323290101394393, 0.320033429530577, \n0.304308651439448, 0.297295488542766, 0.296705007826442, 0.29217812216789, 0.275976820422601, 0.265047955516226, 0.262051655202771, 0.258056569655126, 0.252258188670061, 0.251447708745181, 0.250314582635167, 0.250136421251817, 0.247305841382748, 0.247092266014113, 0.23702607113019, 0.236088059902331, 0.231921985234761, 0.230585172447349, 0.230471959320463, 0.226727256359927, 0.223375117332883, 0.213912413165347, 0.207192871297261, 0.206114344489622, 0.200109269842898, 0.184041133644022, 0.183995157545439, \n0.180760881400063, 0.178268941452454, 0.17469062547235, 0.173280163067417, 0.171474039506526, 0.168190235541709, 0.167548544692935, 0.166832545502934, 0.157528979913547, 0.155585991056527, 0.155462902035789, 0.15304513247434, 0.152805479903591, 0.14702234537573, 0.146394544499329, 0.140203448925985, 0.137020505912884, 0.129906268556732, 0.128955278347721, 0.128712187568932, 0.12845622007978, 0.128055784355433, 0.127103881240378, 0.126804055135711, 0.12589869032088, 0.123142862429301, 0.122680955170636, \n0.119925737581329, 0.115223624515165, 0.115211400416318, 0.114392100679942, 0.113334456874962, 0.111910874254447, 0.10799309218386, 0.106627098531431, 0.105897864497612, 0.105290341236941, 0.10464187517429, 0.104233130685699, 0.104142933537769, 0.10346975754587, 0.102400070008813, 0.0987603500130446, 0.0978313236621077, 0.0959824323678085, 0.0951711741797019, 0.0943482485369636, 0.0928639630743138, 0.0924111485636564, 0.0912663323000747, 0.0876999917206199, 0.0873426970578067, 0.0846530330291824, \n0.0846335593120807, 0.0827140810777448, 0.0818685883450225, 0.0814867094951084, 0.080915208279687, 0.0806739669173837, 0.0778177256460144, 0.077509052040883, 0.0774075952534782, 0.0773271218164516, 0.0750615028545567, 0.0738171513348066, 0.0734881080253677, 0.0733865785498233, 0.0732410861708566, 0.0729188227064109, 0.0723690326383975, 0.0723355666646828, 0.0712251555936834, 0.0702831343125025, 0.0693825652649039, 0.0684407615711365, 0.06828504586364, 0.0674561678734681, 0.0672569230529594, 0.0671007196833343, \n0.0669243791073919, 0.0669120629297886, 0.0663505525360089, 0.0657980539570455, 0.0654843663078062, 0.0648734714611766, 0.0643856577395475, 0.0576463840218263, 0.0550324498575938, 0.0549914347354143, 0.0543333492422711, 0.0540756494764346, 0.0533564819010518, 0.0527483290996708, 0.0521362698881012, 0.0518209003388338, 0.0509327063707017, 0.0507690924371053, 0.0506112848500245, 0.0503998015155155, 0.0503703908520175, 0.0503109113674953, 0.0501298868282805, 0.0492012911650388, 0.048723216529805, 0.048638649991089, \n0.0481501225818301, 0.0432421497414365, 0.0426753005582658, 0.0426124188000308, 0.0422994335410412, 0.0422949824526717, 0.0415740259578156, 0.0406935265763267, 0.0397227457884934, 0.0396788223926201, 0.0388145254158464, 0.038795461528885, 0.0362835299681423, 0.036085003510975, 0.0359337447002166, 0.0355902133781728, 0.0348399480772731, 0.034401114666801, 0.0332629001780294, 0.0310373535742162, 0.0299612251071288, 0.0296117558519566, 0.0292084171270196, 0.0287193506694516, 0.0285501376172704, 0.0284941686757205, \n0.0281721374996325, 0.0274540402231292, 0.0274290124296611, 0.0258675493078901, 0.0254547418951564, 0.0249462941689506, 0.0246015248834178, 0.0244415011208164, 0.0242644301691754, 0.0240711402602732, 0.0238012029925802, 0.0237686104102718, 0.022763788429503, 0.0222754771169354, 0.0218434629623956, 0.021838377590114, 0.0206032032192088, 0.0202961174612139, 0.0199014639125486, 0.0197788635548282, 0.0194840866366478, 0.0194550780505829, 0.0176736630240316, 0.0176073261116137, 0.0174031834245132, 0.0166357243082531, \n0.016613832307606, 0.0163567530048891, 0.0152546680922294, 0.0146239791066989, 0.0137495831669389, 0.013235927170362, 0.0131919419858883, 0.0128739548996193, 0.0123992859403517, 0.0123988772547774, 0.0120579857586788, 0.0115161235075191, 0.0111780519530117, 0.010671490572957, 0.0104634899132959, 0.0103581359975992, 0.00949988341243829, 0.0091923470508557, 0.00914894538823571, 0.00913756477789191, 0.00902285019989565, 0.00885996961249652, 0.00884953335544982, 0.00876025286745356, 0.00858781251196021, \n0.00808184501758805, 0.00804031278921246, 0.00716966368028945, 0.00709282191528477, 0.00706727331048027, 0.00675610599832123, 0.0066745770168347, 0.00647366997644087, 0.0062255651482317, 0.00585541429138682, 0.00566491089236044, 0.00530235421525382, 0.0049279066966866, 0.00488360995254303, 0.00486771491928463, 0.00466052318971048, 0.00459217054399805, 0.00451828757089981, 0.00419549597125058, 0.00378645902016959, 0.00374551916868815, 0.00351526810700375, 0.00288225973079383, 0.0026667820177272, 0.00235249578491875, \n0.00229025031577674, 0.00197732209803591, 0.00155411221634621, 0.00132768598527276, 0.000876346615929246, 4.04695079870169e-06)"
## [2] "Accuracy: c(0.840136054421769, 0.843537414965986, 0.846938775510204, 0.850340136054422, 0.853741496598639, 0.857142857142857, 0.860544217687075, 0.863945578231292, 0.86734693877551, 0.870748299319728, 0.874149659863946, 0.870748299319728, 0.874149659863946, 0.877551020408163, 0.874149659863946, 0.877551020408163, 0.880952380952381, 0.884353741496599, 0.880952380952381, 0.877551020408163, 0.880952380952381, 0.884353741496599, 0.880952380952381, 0.884353741496599, 0.887755102040816, 0.891156462585034, 0.894557823129252, \n0.897959183673469, 0.901360544217687, 0.897959183673469, 0.894557823129252, 0.897959183673469, 0.894557823129252, 0.891156462585034, 0.894557823129252, 0.897959183673469, 0.894557823129252, 0.897959183673469, 0.894557823129252, 0.897959183673469, 0.894557823129252, 0.891156462585034, 0.887755102040816, 0.884353741496599, 0.880952380952381, 0.877551020408163, 0.880952380952381, 0.884353741496599, 0.880952380952381, 0.884353741496599, 0.880952380952381, 0.877551020408163, 0.874149659863946, 0.877551020408163, \n0.874149659863946, 0.870748299319728, 0.86734693877551, 0.863945578231292, 0.860544217687075, 0.857142857142857, 0.860544217687075, 0.857142857142857, 0.853741496598639, 0.850340136054422, 0.846938775510204, 0.850340136054422, 0.846938775510204, 0.843537414965986, 0.840136054421769, 0.836734693877551, 0.833333333333333, 0.829931972789116, 0.826530612244898, 0.82312925170068, 0.819727891156463, 0.816326530612245, 0.812925170068027, 0.80952380952381, 0.806122448979592, 0.80952380952381, 0.812925170068027, \n0.80952380952381, 0.806122448979592, 0.80952380952381, 0.812925170068027, 0.80952380952381, 0.806122448979592, 0.802721088435374, 0.799319727891156, 0.802721088435374, 0.799319727891156, 0.795918367346939, 0.792517006802721, 0.789115646258503, 0.785714285714286, 0.789115646258503, 0.785714285714286, 0.782312925170068, 0.77891156462585, 0.775510204081633, 0.77891156462585, 0.775510204081633, 0.772108843537415, 0.768707482993197, 0.76530612244898, 0.761904761904762, 0.758503401360544, 0.755102040816326, \n0.751700680272109, 0.748299319727891, 0.744897959183674, 0.741496598639456, 0.738095238095238, 0.73469387755102, 0.731292517006803, 0.727891156462585, 0.724489795918367, 0.72108843537415, 0.717687074829932, 0.714285714285714, 0.717687074829932, 0.714285714285714, 0.710884353741497, 0.707482993197279, 0.704081632653061, 0.700680272108844, 0.697278911564626, 0.693877551020408, 0.69047619047619, 0.687074829931973, 0.683673469387755, 0.680272108843537, 0.67687074829932, 0.673469387755102, 0.670068027210884, \n0.666666666666667, 0.663265306122449, 0.666666666666667, 0.663265306122449, 0.659863945578231, 0.656462585034014, 0.653061224489796, 0.649659863945578, 0.646258503401361, 0.649659863945578, 0.646258503401361, 0.649659863945578, 0.646258503401361, 0.642857142857143, 0.639455782312925, 0.636054421768708, 0.63265306122449, 0.629251700680272, 0.625850340136054, 0.622448979591837, 0.619047619047619, 0.615646258503401, 0.612244897959184, 0.608843537414966, 0.605442176870748, 0.602040816326531, 0.598639455782313, \n0.595238095238095, 0.591836734693878, 0.58843537414966, 0.585034013605442, 0.581632653061224, 0.578231292517007, 0.574829931972789, 0.571428571428571, 0.568027210884354, 0.564625850340136, 0.561224489795918, 0.557823129251701, 0.554421768707483, 0.551020408163265, 0.547619047619048, 0.54421768707483, 0.540816326530612, 0.537414965986395, 0.534013605442177, 0.530612244897959, 0.527210884353742, 0.523809523809524, 0.520408163265306, 0.517006802721088, 0.513605442176871, 0.510204081632653, 0.506802721088435, \n0.503401360544218, 0.5, 0.496598639455782, 0.493197278911565, 0.489795918367347, 0.486394557823129, 0.482993197278912, 0.479591836734694, 0.476190476190476, 0.472789115646259, 0.469387755102041, 0.465986394557823, 0.462585034013605, 0.459183673469388, 0.45578231292517, 0.452380952380952, 0.448979591836735, 0.445578231292517, 0.442176870748299, 0.445578231292517, 0.442176870748299, 0.438775510204082, 0.435374149659864, 0.431972789115646, 0.428571428571429, 0.425170068027211, 0.421768707482993, 0.425170068027211, \n0.421768707482993, 0.418367346938776, 0.414965986394558, 0.41156462585034, 0.408163265306122, 0.404761904761905, 0.401360544217687, 0.397959183673469, 0.394557823129252, 0.391156462585034, 0.387755102040816, 0.384353741496599, 0.380952380952381, 0.377551020408163, 0.374149659863946, 0.370748299319728, 0.36734693877551, 0.363945578231293, 0.360544217687075, 0.357142857142857, 0.353741496598639, 0.350340136054422, 0.346938775510204, 0.343537414965986, 0.340136054421769, 0.336734693877551, 0.333333333333333, \n0.329931972789116, 0.326530612244898, 0.32312925170068, 0.319727891156463, 0.316326530612245, 0.312925170068027, 0.30952380952381, 0.306122448979592, 0.302721088435374, 0.299319727891156, 0.295918367346939, 0.292517006802721, 0.289115646258503, 0.285714285714286, 0.282312925170068, 0.27891156462585, 0.275510204081633, 0.272108843537415, 0.268707482993197, 0.26530612244898, 0.261904761904762, 0.258503401360544, 0.255102040816327, 0.251700680272109, 0.248299319727891, 0.244897959183673, 0.241496598639456, \n0.238095238095238, 0.23469387755102, 0.231292517006803, 0.227891156462585, 0.224489795918367, 0.22108843537415, 0.217687074829932, 0.214285714285714, 0.210884353741497, 0.207482993197279, 0.204081632653061, 0.200680272108844, 0.197278911564626, 0.193877551020408, 0.19047619047619, 0.187074829931973, 0.183673469387755, 0.180272108843537, 0.17687074829932, 0.173469387755102, 0.170068027210884, 0.166666666666667, 0.163265306122449)"

print(paste("Confusion Matrix:"))

## [1] "Confusion Matrix:"

print(h2o.confusionMatrix(perf))

## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.451052915904577:
##         No Yes    Error     Rate
## No     235  11 0.044715  =11/246
## Yes     19  29 0.395833   =19/48
## Totals 254  40 0.102041  =30/294

# ROC Curve
roc_curve <- roc(testData$Attrition, testData$predicted_prob)

## Setting levels: control = No, case = Yes

## Setting direction: controls < cases

plot(roc_curve, col = "blue", main = "ROC Curve")

# Shutdown H2O
h2o.shutdown(prompt = FALSE)