Set up

Import data

Import the cleaned data from Module 7.

library(h2o)
## Warning: package 'h2o' was built under R version 4.3.1
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'stringr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::day()   masks h2o::day()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ lubridate::hour()  masks h2o::hour()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ lubridate::month() masks h2o::month()
## ✖ lubridate::week()  masks h2o::week()
## ✖ lubridate::year()  masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.1
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'dials' was built under R version 4.3.1
## Warning: package 'scales' was built under R version 4.3.1
## Warning: package 'infer' was built under R version 4.3.1
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'parsnip' was built under R version 4.3.1
## Warning: package 'recipes' was built under R version 4.3.1
## Warning: package 'rsample' was built under R version 4.3.1
## Warning: package 'tune' was built under R version 4.3.1
## Warning: package 'workflows' was built under R version 4.3.1
## Warning: package 'workflowsets' was built under R version 4.3.1
## Warning: package 'yardstick' was built under R version 4.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## 
## Attaching package: 'TTR'
## 
## The following object is masked from 'package:dials':
## 
##     momentum
## 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
data <- read_csv("../00_data/data_wrangled/data_clean2.csv") %>%
    
    # h2o requires all variables to be either numeric or factors
    mutate(across(where(is.character), factor))
## Rows: 501 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): still_there, notes
## dbl  (7): fyear, co_per_rol, departure_code, ceo_dismissal, tenure_no_ceodb,...
## dttm (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Split data

set.seed(1234)

data_split <- initial_split(data, strata = "ceo_dismissal")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)
train_tbl <- train_tbl %>%
    mutate(ceo_dismissal = as.factor(ceo_dismissal))

test_tbl <- test_tbl %>%
    mutate(ceo_dismissal = as.factor(ceo_dismissal))

Recipes

recipe_obj <- recipe(ceo_dismissal ~ ., data = train_tbl) %>%
    
    # Remove zero variance variables
    step_zv(all_predictors()) 

Model

# Initialize h2o
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         4 minutes 35 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 1 day 
##     H2O cluster name:           H2O_started_from_R_jobboonstoppel_isx672 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.85 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.0 (2023-04-21)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 1 day) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
train_h2o <- as.h2o(train_tbl)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
test_h2o <- as.h2o(test_tbl)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
train_h2o[["ceo_dismissal"]] <- as.factor(train_h2o[["ceo_dismissal"]])
test_h2o[["ceo_dismissal"]] <- as.factor(test_h2o[["ceo_dismissal"]])

print(h2o.describe(train_h2o))
##               Label Type Missing Zeros PosInf NegInf          Min          Max
## 1             fyear  int       0     0      0      0         1993 2.018000e+03
## 2        co_per_rol  int       0     0      0      0          905 6.329400e+04
## 3    departure_code  int       0     0      0      0            3 7.000000e+00
## 4     ceo_dismissal enum       0   373      0      0            0 1.000000e+00
## 5   tenure_no_ceodb  int       0     0      0      0            1 3.000000e+00
## 6  max_tenure_ceodb  int       0     0      0      0            1 4.000000e+00
## 7        fyear_gone  int       0     0      0      0         1990 2.021000e+03
## 8           leftofc time       0     0      0      0 657417600000 1.631491e+12
## 9       still_there enum       0    26      0      0            0 4.200000e+01
## 10            notes enum       0     1      0      0            0 3.730000e+02
##            Mean        Sigma Cardinality
## 1  2.001616e+03 7.008914e+00          NA
## 2  1.382001e+04 1.334271e+04          NA
## 3  6.973333e+00 3.091014e-01          NA
## 4  5.333333e-03 7.293198e-02           2
## 5  1.013333e+00 1.361555e-01          NA
## 6  1.024000e+00 2.357376e-01          NA
## 7  2.003016e+03 7.207747e+00          NA
## 8  1.058184e+12 2.268587e+11          NA
## 9            NA           NA          43
## 10           NA           NA         374
print(h2o.describe(test_h2o))
##               Label Type Missing Zeros PosInf NegInf         Min          Max
## 1             fyear  int       0     0      0      0 1.99300e+03 2.018000e+03
## 2        co_per_rol  int       0     0      0      0 9.00000e+02 6.295400e+04
## 3    departure_code  int       0     0      0      0 3.00000e+00 7.000000e+00
## 4     ceo_dismissal enum       0   124      0      0 0.00000e+00 1.000000e+00
## 5   tenure_no_ceodb  int       0     0      0      0 1.00000e+00 2.000000e+00
## 6  max_tenure_ceodb  int       0     0      0      0 1.00000e+00 2.000000e+00
## 7        fyear_gone  int       0     0      0      0 1.99300e+03 2.997000e+03
## 8           leftofc time       0     0      0      0 7.57296e+11 3.245063e+13
## 9       still_there enum       0     7      0      0 0.00000e+00 3.300000e+01
## 10            notes enum       0     1      0      0 0.00000e+00 1.250000e+02
##            Mean        Sigma Cardinality
## 1  2.001675e+03 6.709789e+00          NA
## 2  1.449253e+04 1.365142e+04          NA
## 3  6.936508e+00 5.019328e-01          NA
## 4  1.587302e-02 1.254832e-01           2
## 5  1.015873e+00 1.254832e-01          NA
## 6  1.015873e+00 1.254832e-01          NA
## 7  2.010651e+03 8.881408e+01          NA
## 8  1.298774e+12 2.805030e+12          NA
## 9            NA           NA          34
## 10           NA           NA         126
split.h2o <- h2o.splitFrame(data = train_h2o, ratio = c(0.85), seed = 2345)
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]

y <- "ceo_dismissal"
x <- setdiff(names(train_tbl), y)

models_h2o <-h2o.automl(
    x = x, 
    y = y, 
    training_frame    = train_h2o, 
    validation_frame  = valid_h2o, 
    leaderboard_frame = test_h2o, 
    # max_runtime_secs  = 30, 
    max_models        = 10,
    exclude_algos     = "DeepLearning",
    nfolds            = 5, 
    seed              = 3456 
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===                                                                   |   4%
## 20:57:27.549: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 20:57:27.550: AutoML: XGBoost is not available; skipping it.
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |======================================================================| 100%

Examine the output of h2o.automl

models_h2o %>% typeof() 
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
##                                                  model_id auc      logloss
## 1 StackedEnsemble_BestOfFamily_1_AutoML_3_20241121_205727   1 7.908083e-04
## 2                          GBM_5_AutoML_3_20241121_205727   1 3.723054e-06
## 3    StackedEnsemble_AllModels_1_AutoML_3_20241121_205727   1 7.908083e-04
## 4                          XRT_1_AutoML_3_20241121_205727   1 1.397135e-02
## 5             GBM_grid_1_AutoML_3_20241121_205727_model_2   1 2.247714e-01
## 6                          DRF_1_AutoML_3_20241121_205727   1 1.453925e-02
##   aucpr mean_per_class_error         rmse          mse
## 1     1                    0 6.899858e-03 4.760804e-05
## 2     1                    0 3.750128e-05 1.406346e-09
## 3     1                    0 6.899858e-03 4.760804e-05
## 4     1                    0 7.093887e-02 5.032324e-03
## 5     1                    0 1.259880e-01 1.587298e-02
## 6     1                    0 7.368939e-02 5.430127e-03
## 
## [12 rows x 7 columns]
models_h2o@leader
## Model Details:
## ==============
## 
## H2OBinomialModel: stackedensemble
## Model ID:  StackedEnsemble_BestOfFamily_1_AutoML_3_20241121_205727 
## Model Summary for Stacked Ensemble: 
##                                     key            value
## 1                     Stacking strategy cross_validation
## 2  Number of base models (used / total)              1/4
## 3      # GBM base models (used / total)              1/1
## 4      # DRF base models (used / total)              0/2
## 5      # GLM base models (used / total)              0/1
## 6                 Metalearner algorithm              GLM
## 7    Metalearner fold assignment scheme           Random
## 8                    Metalearner nfolds                5
## 9               Metalearner fold_column               NA
## 10   Custom metalearner hyperparameters             None
## 
## 
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
## 
## MSE:  8.040675e-12
## RMSE:  2.835608e-06
## LogLoss:  1.413818e-06
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  1
## Gini:  1
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##          0 1    Error    Rate
## 0      327 0 0.000000  =0/327
## 1        0 2 0.000000    =0/2
## Totals 327 2 0.000000  =0/329
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.999967   1.000000   0
## 2                       max f2  0.999967   1.000000   0
## 3                 max f0point5  0.999967   1.000000   0
## 4                 max accuracy  0.999967   1.000000   0
## 5                max precision  0.999967   1.000000   0
## 6                   max recall  0.999967   1.000000   0
## 7              max specificity  0.999967   1.000000   0
## 8             max absolute_mcc  0.999967   1.000000   0
## 9   max min_per_class_accuracy  0.999967   1.000000   0
## 10 max mean_per_class_accuracy  0.999967   1.000000   0
## 11                     max tns  0.999967 327.000000   0
## 12                     max fns  0.999967   0.000000   0
## 13                     max fps  0.000001 327.000000   1
## 14                     max tps  0.999967   2.000000   0
## 15                     max tnr  0.999967   1.000000   0
## 16                     max fnr  0.999967   0.000000   0
## 17                     max fpr  0.000001   1.000000   1
## 18                     max tpr  0.999967   1.000000   0
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on validation data. **
## 
## MSE:  1.492263e-12
## RMSE:  1.221582e-06
## LogLoss:  1.221583e-06
## Mean Per-Class Error:  NaN
## AUC:  0
## AUCPR:  0
## Gini:  -1
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         0  1 Error    Rate
## 0      NA NA    NA  =NA/NA
## 1      NA NA    NA  =NA/NA
## Totals NA NA    NA  =NA/NA
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold     value idx
## 1                       max f1        NA        NA  -1
## 2                       max f2        NA        NA  -1
## 3                 max f0point5        NA        NA  -1
## 4                 max accuracy  0.000001  0.000000   0
## 5                max precision  0.000001  0.000000   0
## 6                   max recall        NA        NA  -1
## 7              max specificity  0.000001  0.000000   0
## 8             max absolute_mcc  0.000001  0.000000   0
## 9   max min_per_class_accuracy        NA        NA  -1
## 10 max mean_per_class_accuracy        NA        NA  -1
## 11                     max tns  0.000001  0.000000   0
## 12                     max fns  0.000001  0.000000   0
## 13                     max fps  0.000001 46.000000   0
## 14                     max tps  0.000001  0.000000   0
## 15                     max tnr  0.000001  0.000000   0
## 16                     max fnr        NA        NA  -1
## 17                     max fpr  0.000001  1.000000   0
## 18                     max tpr        NA        NA  -1
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  9.723773e-10
## RMSE:  3.118296e-05
## LogLoss:  3.962217e-06
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  1
## Gini:  1
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##          0 1    Error    Rate
## 0      327 0 0.000000  =0/327
## 1        0 2 0.000000    =0/2
## Totals 327 2 0.000000  =0/329
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.999601   1.000000   0
## 2                       max f2  0.999601   1.000000   0
## 3                 max f0point5  0.999601   1.000000   0
## 4                 max accuracy  0.999601   1.000000   0
## 5                max precision  0.999601   1.000000   0
## 6                   max recall  0.999601   1.000000   0
## 7              max specificity  0.999601   1.000000   0
## 8             max absolute_mcc  0.999601   1.000000   0
## 9   max min_per_class_accuracy  0.999601   1.000000   0
## 10 max mean_per_class_accuracy  0.999601   1.000000   0
## 11                     max tns  0.999601 327.000000   0
## 12                     max fns  0.999601   0.000000   0
## 13                     max fps  0.000001 327.000000   5
## 14                     max tps  0.999601   2.000000   0
## 15                     max tnr  0.999601   1.000000   0
## 16                     max fnr  0.999601   0.000000   0
## 17                     max fpr  0.000001   1.000000   5
## 18                     max tpr  0.999601   1.000000   0
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##               mean       sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy  0.400000 0.547723   1.000000   0.000000   1.000000   0.000000
## auc       1.000000 0.000000   1.000000         NA   1.000000         NA
## err       0.000001 0.000001   0.000000   0.000002   0.000000   0.000002
## err_count 0.000000 0.000000   0.000000         NA   0.000000         NA
## f0point5  0.400000 0.547723   1.000000   0.000000   1.000000   0.000000
##           cv_5_valid
## accuracy    0.000000
## auc               NA
## err         0.000001
## err_count         NA
## f0point5    0.000000
## 
## ---
##                       mean       sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## precision         0.400000 0.547723   1.000000         NA   1.000000         NA
## r2                0.400000 0.547723   1.000000         NA   1.000000         NA
## recall            0.400000 0.547723   1.000000         NA   1.000000         NA
## residual_deviance 0.000317 0.000434   0.000772         NA   0.000811         NA
## rmse              0.000016 0.000022   0.000041         NA   0.000037         NA
## specificity       0.400000 0.547723   1.000000         NA   1.000000         NA
##                   cv_5_valid
## precision                 NA
## r2                        NA
## recall                    NA
## residual_deviance         NA
## rmse                      NA
## specificity               NA

Save and Load

?h2o.getModel
?h2o.saveModel
?h2o.loadModel

h2o.getModel("StackedEnsemble_BestOfFamily_1_AutoML_1_20241121_205358") %>%
    h2o.saveModel("h2o_models/")
## [1] "/Users/jobboonstoppel/Documents/USA/Plymouth State /Student/Classes/2024-2025/Fall/Intermediate Data Analytics (DAT 3100)/PSU_DAT3100/11_module13/h2o_models/StackedEnsemble_BestOfFamily_1_AutoML_1_20241121_205358"
best_model <- h2o.loadModel("h2o_models/StackedEnsemble_BestOfFamily_1_AutoML_1_20241121_205358")

Make predictions

predictions <- h2o.predict(best_model, newdata = test_h2o)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'still_there' has levels not trained on: ["12/8//2020"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation dataset column 'notes' has levels not trained on: ["(Crain's) -- Three months after Enesco Group Inc. CEO Cynthia Passmore told employees she was in a personal relationship with a consultant hired to turn around the fragile knick-knack company, she has stepped down. Itasca-based Enesco on Thursday announced that the board and Ms. Passmore "mutually agreed" for her no longer to serve as CEO, president and director of the company. Her departure from those roles will become effective on Monday, May 15, when the company will hold a conference call to discuss the company's first quarter earnings results and the management change.", "20 Sep 2001, Richard H. (Rick) Wills has been elected to the position of Chairman of the Board. Wills will retain his existing positions of President and Chief Executive Officer of Tektronix. On Oct. 15, 2007, it was reported that Danaher to acquire Tektronix for $2.8 billion.", "After failing, so far, in its attempt to woo Southwest Gas away from Oneok, Southern Union Co. headed east to announce a friendly $500 million acquisition of distributor Pennsylvania Enterprises Inc. (PEI). With the purchase Southern Union will make its first entry into power marketing.Pennsylvania Enterprises will become an autonomous division of Southern Union with the division headquarters remaining in Wilkes-Barre [PA], and there will be no material changes to the operations of PEI. The combined company will have a market capitalization of about $1 billion and serve 1.2 million gas and electric customers in Pennsylvania, Texas, Missouri, Florida and Piedras Negras, Mexico.", "After weeks of speculation, Hoechst AG and Rhone-Poulenc have announcedthat they are to merge their life sciences businesses into a new company called Aventis. Aventis will have its global headquarters in Strasbourg, France, and will have a combined turnover of approximately $20 billion. The companies say that with a combined R&D budget of nearly $3 billion.", "Agreement and Plan of Merger, dated as of
##  April 19, 1996, by and among Ideon Group, Inc., CUC
##  International Inc. and IG Acquisition Corp. (filed as
##  Exhibit 10.21 to the Company's Annual Report on Form
##  10-K for the fiscal year ended January 31, 1996). On February 6, 1996, the Company announced that Eugene Miller, one of its
##  outside directors, has been appointed as Chairman of the Board and Chief
##  Executive Officer replacing Paul G. Kahn. Mr. Miller serves as the head of the
##  Company's Strategic Direction Committee as described above.", "Alston D. Correll has been the Chairman and Chief Executive officer of GP since 1993. He served as the Company’s President from 1991-2002. Mr. Correll has been a GP director since 1992, and his current term ends in 2005. Mr. Correll will enter into mandatory retirement at the end of 2005. He was cited as retired Chairman of the Board of Georgia-Pacific Corporation on a SEC report and his career culminated when he negotiated the sale of Georgia-Pacific. Georgia-Pacific Corp. (NYSE: GP) and KoCell, LLC, a wholly owned subsidiary of Koch Industries, Inc., today announced that the companies have signed a letter of intent for Koch to acquire Georgia-Pacific's non-integrated fluff and market pulp operations for $610 million, which includes assumption of $73 million of indebtedness. December 23, 2005, Koch Industries Finalizes $21 Billion Purchase of Georgia-Pacific.", "Anthony M. Sanfilippo has been Chief Executive Officer and one of the directors since April 2016 and has been the Company’s Chairman of the Board since May 2017. In addition, he served as Former Pinnacle’s Chief Executive Officer and one of Former Pinnacle’s directors from March 2010 to April 2016. He was also Former Pinnacle’s President from March 2010 to May 2013. On December 18, 2017, Penn National Gaming, Inc. and Pinnacle Entertainment, Inc. announced today that they have entered into a definitive agreement under which Penn National will acquire Pinnacle in a cash and stock transaction valued at approximately $2.8 billion.", "April 1, 2004--St. Paul Travelers today completed the merger that combines The St. Paul Companies (NYSE:SPC) and Travelers Property Casualty Corp. (NYSE:TAP.A and TAP.B). Beginning tomorrow morning, April 2, 2004, St. Paul Travelers Companies stock will be listed on the New York Stock Exchange under the symbol "STA." The combined company is uniquely positioned as the commercial insurer of choice for independent agents and brokers across the United States. The company also offers homeowners, auto and other insurance products for individuals and families under the highly regarded Travelers brand.", "April 20, 2004, Bruce R. Lakefield replaced US Airways Group Inc. president and chief executive David N. Siegel who abruptly resigned yesterday in the midst of a painful restructuring of the airline, which had emerged from bankruptcy only 13 months ago. In 2003, US Airways began exploring the availability of financing and merger partners, and after no financing was available, it filed for Chapter 11 bankruptcy again in 2004 for the second time in two years. May 20, 2005, America West Holdings Corp. and US Airways Group announced the two have reached a merger agreement. America West Airlines acquired the bankrupt US Airways on September 27, 2005 to form the US Airways Group.", "Aug. 13, 2012 (GLOBE NEWSWIRE) -- Comverse Technology, Inc. ("CTI") (Nasdaq:CMVT) today announced that it has signed a definitive merger agreement with its majority-owned subsidiary Verint Systems Inc. (Nasdaq:VRNT). Under the terms of the agreement, following the completion of CTI's previously announced distribution to its shareholders of substantially all of its assets, including its wholly-owned subsidiary Comverse, Inc. ("CNS"), other than its holdings in Verint, Verint will acquire the CTI holding company, eliminating the current holding company structure. As of August 10, 2012, CTI currently holds approximately 41.0% of Verint's basic outstanding common shares and 100% of Verint's outstanding convertible preferred shares which, if converted, would result in CTI holding approximately 53.7% of Verint's basic outstanding common shares.", ...106 not listed..., "Steve Schlotterbeck's surprise resignation Thursday sent shockwaves through the region's natural gas industry at a time when he was expecting to lead the country's largest natural gas producer through a split into two companies later this year. Schlotterbeck's departure comes at a busy time for EQT (NYSE: EQT), which in November completed the $6.7 billion acquisition of Rice Energy and announced only last month that it would be separating its exploration/production and midstream divisions. Schlotterbeck had been tapped to remain as CEO of EQT. And EQT and EQT Midstream Partners this year began construction on its long-awaited Mountain Valley Pipeline, which will connect Marcellus and Utica shale gas with Southeast markets.", "The acquisition of RGS Energy Group Inc. by Energy East Corp. could close within 30 days-ending more than a half century of local ownership. Energy East announced plans to acquire the roughly $1 billion RGS last year, proposing to pay $1.4 billion and assume $1 billion in RGS debt. Energy East is headed by Chairman Wesley Von Schack, who before the formation of Energy East was NYSEG’s CEO.", "The board of Lac Minerals Ltd. agreed today to a friendly takeover by the American Barrick Resources Corporation after Barrick sweetened its offer to $1.6 billion, the two Toronto-based companies said. The deal means that Barrick is the likely victor in the takeover fight for control of Lac, which pitted it against Royal Oak Mines Inc.", "The operations of all acquisitions for the three-year period ending   December 31, 1993, have been included in the consolidated statements of earnings  since the dates of acquisition. Mr. Bartlett joined the Company in 1976, served as its chief executive 
##  officer from December 1984 to April 1993 and assumed the Board chairmanship in  1989. Mr. Grimes was elected chief executive officer in April 1993 and resigned  that position in December 1993. Mr. Bartlett reassumed the position of c
predictions_tbl <- predictions %>%
    as_tibble()

predictions_tbl %>%
    bind_cols(test_tbl)
## # A tibble: 126 × 13
##    predict    p0            p1 fyear co_per_rol departure_code ceo_dismissal
##    <fct>   <dbl>         <dbl> <dbl>      <dbl>          <dbl> <fct>        
##  1 0        1.00 0.0000000150   1994        900              7 0            
##  2 0        1.00 0.00000000516  2007        933              7 0            
##  3 0        1.00 0.00000000691  2004       1023              7 0            
##  4 0        1.00 0.000000989    1997       1030              7 0            
##  5 0        1.00 0.00000000639  2005       1040              7 0            
##  6 0        1.00 0.00000212     1994       1107              7 0            
##  7 0        1.00 0.00000000944  2001       1434              7 0            
##  8 0        1.00 0.0000000122   1998       1522              7 0            
##  9 0        1.00 0.0000000124   1998       1566              7 0            
## 10 0        1.00 0.00000102     1996       1739              7 0            
## # ℹ 116 more rows
## # ℹ 6 more variables: tenure_no_ceodb <dbl>, max_tenure_ceodb <dbl>,
## #   fyear_gone <dbl>, leftofc <dttm>, still_there <fct>, notes <fct>

Evaluate model

?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
confusion_matrix <- h2o.confusionMatrix(performance_h2o)
print(confusion_matrix)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.999539021097766:
##          0 1    Error    Rate
## 0      124 0 0.000000  =0/124
## 1        0 2 0.000000    =0/2
## Totals 124 2 0.000000  =0/126
#typeof(performance_h2o)
#slotNames(performance_h2o)
#performance_h2o@metrics

auc <- h2o.auc(performance_h2o)
print(paste("AUC:", auc))
## [1] "AUC: 1"
metrics <- performance_h2o@metrics
print(metrics)
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "StackedEnsemble_BestOfFamily_1_AutoML_1_20241121_205358"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/StackedEnsemble_BestOfFamily_1_AutoML_1_20241121_205358"
## 
## 
## $model_checksum
## [1] "7993262640963615984"
## 
## $frame
## $frame$name
## [1] "RTMP_sid_b7ed_6"
## 
## 
## $frame_checksum
## [1] "-796301443263526576"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.732241e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 2.129433e-09
## 
## $RMSE
## [1] 4.614578e-05
## 
## $nobs
## [1] 126
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] 0.9999999
## 
## $logloss
## [1] 5.888734e-06
## 
## $AUC
## [1] 1
## 
## $pr_auc
## [1] 1
## 
## $Gini
## [1] 1
## 
## $mean_per_class_error
## [1] 0
## 
## $domain
## [1] "0" "1"
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##          0 1  Error      Rate
## 0      124 0 0.0000 = 0 / 124
## 1        0 2 0.0000 =   0 / 2
## Totals 124 2 0.0000 = 0 / 126
## 
## 
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.999764 0.666667 0.555556 0.833333 0.992063  1.000000 0.500000    1.000000
## 2  0.999539 1.000000 1.000000 1.000000 1.000000  1.000000 1.000000    1.000000
## 3  0.000003 0.800000 0.909091 0.714286 0.992063  0.666667 1.000000    0.991935
## 4  0.000002 0.666667 0.833333 0.555556 0.984127  0.500000 1.000000    0.983871
## 5  0.000002 0.571429 0.769231 0.454545 0.976190  0.400000 1.000000    0.975806
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1     0.704273               0.500000                0.750000 124   1   0   1
## 2     1.000000               1.000000                1.000000 124   0   0   2
## 3     0.813198               0.991935                0.995968 123   0   1   2
## 4     0.701381               0.983871                0.991935 122   0   2   2
## 5     0.624758               0.975806                0.987903 121   0   3   2
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.500000 0.000000 0.500000   0
## 2 1.000000 0.000000 0.000000 1.000000   1
## 3 0.991935 0.000000 0.008065 1.000000   2
## 4 0.983871 0.000000 0.016129 1.000000   3
## 5 0.975806 0.000000 0.024194 1.000000   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 121  0.000000 0.032520 0.077519 0.020576 0.055556  0.016529 1.000000
## 122  0.000000 0.032258 0.076923 0.020408 0.047619  0.016393 1.000000
## 123  0.000000 0.032000 0.076336 0.020243 0.039683  0.016260 1.000000
## 124  0.000000 0.031746 0.075758 0.020080 0.031746  0.016129 1.000000
## 125  0.000000 0.031496 0.075188 0.019920 0.023810  0.016000 1.000000
## 126  0.000000 0.031250 0.074627 0.019763 0.015873  0.015873 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 121    0.040323     0.025816               0.040323                0.520161   5
## 122    0.032258     0.022996               0.032258                0.516129   4
## 123    0.024194     0.019834               0.024194                0.512097   3
## 124    0.016129     0.016129               0.016129                0.508065   2
## 125    0.008065     0.011359               0.008065                0.504032   1
## 126    0.000000     0.000000               0.000000                0.500000   0
##     fns fps tps      tnr      fnr      fpr      tpr idx
## 121   0 119   2 0.040323 0.000000 0.959677 1.000000 120
## 122   0 120   2 0.032258 0.000000 0.967742 1.000000 121
## 123   0 121   2 0.024194 0.000000 0.975806 1.000000 122
## 124   0 122   2 0.016129 0.000000 0.983871 1.000000 123
## 125   0 123   2 0.008065 0.000000 0.991935 1.000000 124
## 126   0 124   2 0.000000 0.000000 1.000000 1.000000 125
## 
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.999539   1.000000   1
## 2                       max f2  0.999539   1.000000   1
## 3                 max f0point5  0.999539   1.000000   1
## 4                 max accuracy  0.999539   1.000000   1
## 5                max precision  0.999764   1.000000   0
## 6                   max recall  0.999539   1.000000   1
## 7              max specificity  0.999764   1.000000   0
## 8             max absolute_mcc  0.999539   1.000000   1
## 9   max min_per_class_accuracy  0.999539   1.000000   1
## 10 max mean_per_class_accuracy  0.999539   1.000000   1
## 11                     max tns  0.999764 124.000000   0
## 12                     max fns  0.999764   1.000000   0
## 13                     max fps  0.000000 124.000000 125
## 14                     max tps  0.999539   2.000000   1
## 15                     max tnr  0.999764   1.000000   0
## 16                     max fnr  0.999764   0.500000   0
## 17                     max fpr  0.000000   1.000000 125
## 18                     max tpr  0.999539   1.000000   1
## 
## $gains_lift_table
## Gains/Lift Table: Avg response rate:  1.59 %, avg score:  1.59 %
##    group cumulative_data_fraction lower_threshold      lift cumulative_lift
## 1      1               0.01587302        0.749655 63.000000       63.000000
## 2      2               0.02380952        0.000003  0.000000       42.000000
## 3      3               0.03174603        0.000002  0.000000       31.500000
## 4      4               0.04761905        0.000002  0.000000       21.000000
## 5      5               0.05555556        0.000002  0.000000       18.000000
## 6      6               0.10317460        0.000002  0.000000        9.692308
## 7      7               0.15079365        0.000002  0.000000        6.631579
## 8      8               0.20634921        0.000001  0.000000        4.846154
## 9      9               0.30158730        0.000000  0.000000        3.315789
## 10    10               0.40476190        0.000000  0.000000        2.470588
## 11    11               0.50000000        0.000000  0.000000        2.000000
## 12    12               0.60317460        0.000000  0.000000        1.657895
## 13    13               0.69841270        0.000000  0.000000        1.431818
## 14    14               0.80158730        0.000000  0.000000        1.247525
## 15    15               0.89682540        0.000000  0.000000        1.115044
## 16    16               1.00000000        0.000000  0.000000        1.000000
##    response_rate    score cumulative_response_rate cumulative_score
## 1       1.000000 0.999651                 1.000000         0.999651
## 2       0.000000 0.000003                 0.666667         0.666435
## 3       0.000000 0.000002                 0.500000         0.499827
## 4       0.000000 0.000002                 0.333333         0.333219
## 5       0.000000 0.000002                 0.285714         0.285616
## 6       0.000000 0.000002                 0.153846         0.153794
## 7       0.000000 0.000002                 0.105263         0.105228
## 8       0.000000 0.000001                 0.076923         0.076898
## 9       0.000000 0.000000                 0.052632         0.052614
## 10      0.000000 0.000000                 0.039216         0.039203
## 11      0.000000 0.000000                 0.031746         0.031736
## 12      0.000000 0.000000                 0.026316         0.026307
## 13      0.000000 0.000000                 0.022727         0.022720
## 14      0.000000 0.000000                 0.019802         0.019796
## 15      0.000000 0.000000                 0.017699         0.017693
## 16      0.000000 0.000000                 0.015873         0.015868
##    capture_rate cumulative_capture_rate        gain cumulative_gain
## 1      1.000000                1.000000 6200.000000     6200.000000
## 2      0.000000                1.000000 -100.000000     4100.000000
## 3      0.000000                1.000000 -100.000000     3050.000000
## 4      0.000000                1.000000 -100.000000     2000.000000
## 5      0.000000                1.000000 -100.000000     1700.000000
## 6      0.000000                1.000000 -100.000000      869.230769
## 7      0.000000                1.000000 -100.000000      563.157895
## 8      0.000000                1.000000 -100.000000      384.615385
## 9      0.000000                1.000000 -100.000000      231.578947
## 10     0.000000                1.000000 -100.000000      147.058824
## 11     0.000000                1.000000 -100.000000      100.000000
## 12     0.000000                1.000000 -100.000000       65.789474
## 13     0.000000                1.000000 -100.000000       43.181818
## 14     0.000000                1.000000 -100.000000       24.752475
## 15     0.000000                1.000000 -100.000000       11.504425
## 16     0.000000                1.000000 -100.000000        0.000000
##    kolmogorov_smirnov
## 1            1.000000
## 2            0.991935
## 3            0.983871
## 4            0.967742
## 5            0.959677
## 6            0.911290
## 7            0.862903
## 8            0.806452
## 9            0.709677
## 10           0.604839
## 11           0.508065
## 12           0.403226
## 13           0.306452
## 14           0.201613
## 15           0.104839
## 16           0.000000
## 
## $residual_deviance
## [1] 0.001483961
## 
## $null_deviance
## [1] 21.92384
## 
## $AIC
## [1] 8.001484
## 
## $loglikelihood
## [1] 0
## 
## $null_degrees_of_freedom
## [1] 125
## 
## $residual_degrees_of_freedom
## [1] 122