Import the cleaned data from Module 7.
library(h2o)
## Warning: package 'h2o' was built under R version 4.3.1
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'stringr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::day() masks h2o::day()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::hour() masks h2o::hour()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::month() masks h2o::month()
## ✖ lubridate::week() masks h2o::week()
## ✖ lubridate::year() masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.1
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'dials' was built under R version 4.3.1
## Warning: package 'scales' was built under R version 4.3.1
## Warning: package 'infer' was built under R version 4.3.1
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'parsnip' was built under R version 4.3.1
## Warning: package 'recipes' was built under R version 4.3.1
## Warning: package 'rsample' was built under R version 4.3.1
## Warning: package 'tune' was built under R version 4.3.1
## Warning: package 'workflows' was built under R version 4.3.1
## Warning: package 'workflowsets' was built under R version 4.3.1
## Warning: package 'yardstick' was built under R version 4.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
##
## Loading required package: quantmod
## Loading required package: TTR
##
## Attaching package: 'TTR'
##
## The following object is masked from 'package:dials':
##
## momentum
##
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
data <- read_csv("../00_data/data_wrangled/data_clean2.csv") %>%
# h2o requires all variables to be either numeric or factors
mutate(across(where(is.character), factor))
## Rows: 501 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): still_there, notes
## dbl (7): fyear, co_per_rol, departure_code, ceo_dismissal, tenure_no_ceodb,...
## dttm (1): leftofc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(1234)
data_split <- initial_split(data, strata = "ceo_dismissal")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)
train_tbl <- train_tbl %>%
mutate(ceo_dismissal = as.factor(ceo_dismissal))
test_tbl <- test_tbl %>%
mutate(ceo_dismissal = as.factor(ceo_dismissal))
recipe_obj <- recipe(ceo_dismissal ~ ., data = train_tbl) %>%
# Remove zero variance variables
step_zv(all_predictors())
# Initialize h2o
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 4 minutes 35 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 11 months and 1 day
## H2O cluster name: H2O_started_from_R_jobboonstoppel_isx672
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.85 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.0 (2023-04-21)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (11 months and 1 day) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
train_h2o <- as.h2o(train_tbl)
##
|
| | 0%
|
|======================================================================| 100%
test_h2o <- as.h2o(test_tbl)
##
|
| | 0%
|
|======================================================================| 100%
train_h2o[["ceo_dismissal"]] <- as.factor(train_h2o[["ceo_dismissal"]])
test_h2o[["ceo_dismissal"]] <- as.factor(test_h2o[["ceo_dismissal"]])
print(h2o.describe(train_h2o))
## Label Type Missing Zeros PosInf NegInf Min Max
## 1 fyear int 0 0 0 0 1993 2.018000e+03
## 2 co_per_rol int 0 0 0 0 905 6.329400e+04
## 3 departure_code int 0 0 0 0 3 7.000000e+00
## 4 ceo_dismissal enum 0 373 0 0 0 1.000000e+00
## 5 tenure_no_ceodb int 0 0 0 0 1 3.000000e+00
## 6 max_tenure_ceodb int 0 0 0 0 1 4.000000e+00
## 7 fyear_gone int 0 0 0 0 1990 2.021000e+03
## 8 leftofc time 0 0 0 0 657417600000 1.631491e+12
## 9 still_there enum 0 26 0 0 0 4.200000e+01
## 10 notes enum 0 1 0 0 0 3.730000e+02
## Mean Sigma Cardinality
## 1 2.001616e+03 7.008914e+00 NA
## 2 1.382001e+04 1.334271e+04 NA
## 3 6.973333e+00 3.091014e-01 NA
## 4 5.333333e-03 7.293198e-02 2
## 5 1.013333e+00 1.361555e-01 NA
## 6 1.024000e+00 2.357376e-01 NA
## 7 2.003016e+03 7.207747e+00 NA
## 8 1.058184e+12 2.268587e+11 NA
## 9 NA NA 43
## 10 NA NA 374
print(h2o.describe(test_h2o))
## Label Type Missing Zeros PosInf NegInf Min Max
## 1 fyear int 0 0 0 0 1.99300e+03 2.018000e+03
## 2 co_per_rol int 0 0 0 0 9.00000e+02 6.295400e+04
## 3 departure_code int 0 0 0 0 3.00000e+00 7.000000e+00
## 4 ceo_dismissal enum 0 124 0 0 0.00000e+00 1.000000e+00
## 5 tenure_no_ceodb int 0 0 0 0 1.00000e+00 2.000000e+00
## 6 max_tenure_ceodb int 0 0 0 0 1.00000e+00 2.000000e+00
## 7 fyear_gone int 0 0 0 0 1.99300e+03 2.997000e+03
## 8 leftofc time 0 0 0 0 7.57296e+11 3.245063e+13
## 9 still_there enum 0 7 0 0 0.00000e+00 3.300000e+01
## 10 notes enum 0 1 0 0 0.00000e+00 1.250000e+02
## Mean Sigma Cardinality
## 1 2.001675e+03 6.709789e+00 NA
## 2 1.449253e+04 1.365142e+04 NA
## 3 6.936508e+00 5.019328e-01 NA
## 4 1.587302e-02 1.254832e-01 2
## 5 1.015873e+00 1.254832e-01 NA
## 6 1.015873e+00 1.254832e-01 NA
## 7 2.010651e+03 8.881408e+01 NA
## 8 1.298774e+12 2.805030e+12 NA
## 9 NA NA 34
## 10 NA NA 126
split.h2o <- h2o.splitFrame(data = train_h2o, ratio = c(0.85), seed = 2345)
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
y <- "ceo_dismissal"
x <- setdiff(names(train_tbl), y)
models_h2o <-h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
# max_runtime_secs = 30,
max_models = 10,
exclude_algos = "DeepLearning",
nfolds = 5,
seed = 3456
)
##
|
| | 0%
|
|=== | 4%
## 20:57:27.549: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 20:57:27.550: AutoML: XGBoost is not available; skipping it.
|
|=========== | 15%
|
|=============== | 21%
|
|==================== | 29%
|
|======================= | 33%
|
|======================================================================| 100%
Examine the output of h2o.automl
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc logloss
## 1 StackedEnsemble_BestOfFamily_1_AutoML_3_20241121_205727 1 7.908083e-04
## 2 GBM_5_AutoML_3_20241121_205727 1 3.723054e-06
## 3 StackedEnsemble_AllModels_1_AutoML_3_20241121_205727 1 7.908083e-04
## 4 XRT_1_AutoML_3_20241121_205727 1 1.397135e-02
## 5 GBM_grid_1_AutoML_3_20241121_205727_model_2 1 2.247714e-01
## 6 DRF_1_AutoML_3_20241121_205727 1 1.453925e-02
## aucpr mean_per_class_error rmse mse
## 1 1 0 6.899858e-03 4.760804e-05
## 2 1 0 3.750128e-05 1.406346e-09
## 3 1 0 6.899858e-03 4.760804e-05
## 4 1 0 7.093887e-02 5.032324e-03
## 5 1 0 1.259880e-01 1.587298e-02
## 6 1 0 7.368939e-02 5.430127e-03
##
## [12 rows x 7 columns]
models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: stackedensemble
## Model ID: StackedEnsemble_BestOfFamily_1_AutoML_3_20241121_205727
## Model Summary for Stacked Ensemble:
## key value
## 1 Stacking strategy cross_validation
## 2 Number of base models (used / total) 1/4
## 3 # GBM base models (used / total) 1/1
## 4 # DRF base models (used / total) 0/2
## 5 # GLM base models (used / total) 0/1
## 6 Metalearner algorithm GLM
## 7 Metalearner fold assignment scheme Random
## 8 Metalearner nfolds 5
## 9 Metalearner fold_column NA
## 10 Custom metalearner hyperparameters None
##
##
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 8.040675e-12
## RMSE: 2.835608e-06
## LogLoss: 1.413818e-06
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 1
## Gini: 1
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 327 0 0.000000 =0/327
## 1 0 2 0.000000 =0/2
## Totals 327 2 0.000000 =0/329
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.999967 1.000000 0
## 2 max f2 0.999967 1.000000 0
## 3 max f0point5 0.999967 1.000000 0
## 4 max accuracy 0.999967 1.000000 0
## 5 max precision 0.999967 1.000000 0
## 6 max recall 0.999967 1.000000 0
## 7 max specificity 0.999967 1.000000 0
## 8 max absolute_mcc 0.999967 1.000000 0
## 9 max min_per_class_accuracy 0.999967 1.000000 0
## 10 max mean_per_class_accuracy 0.999967 1.000000 0
## 11 max tns 0.999967 327.000000 0
## 12 max fns 0.999967 0.000000 0
## 13 max fps 0.000001 327.000000 1
## 14 max tps 0.999967 2.000000 0
## 15 max tnr 0.999967 1.000000 0
## 16 max fnr 0.999967 0.000000 0
## 17 max fpr 0.000001 1.000000 1
## 18 max tpr 0.999967 1.000000 0
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on validation data. **
##
## MSE: 1.492263e-12
## RMSE: 1.221582e-06
## LogLoss: 1.221583e-06
## Mean Per-Class Error: NaN
## AUC: 0
## AUCPR: 0
## Gini: -1
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 NA NA NA =NA/NA
## 1 NA NA NA =NA/NA
## Totals NA NA NA =NA/NA
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 NA NA -1
## 2 max f2 NA NA -1
## 3 max f0point5 NA NA -1
## 4 max accuracy 0.000001 0.000000 0
## 5 max precision 0.000001 0.000000 0
## 6 max recall NA NA -1
## 7 max specificity 0.000001 0.000000 0
## 8 max absolute_mcc 0.000001 0.000000 0
## 9 max min_per_class_accuracy NA NA -1
## 10 max mean_per_class_accuracy NA NA -1
## 11 max tns 0.000001 0.000000 0
## 12 max fns 0.000001 0.000000 0
## 13 max fps 0.000001 46.000000 0
## 14 max tps 0.000001 0.000000 0
## 15 max tnr 0.000001 0.000000 0
## 16 max fnr NA NA -1
## 17 max fpr 0.000001 1.000000 0
## 18 max tpr NA NA -1
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 9.723773e-10
## RMSE: 3.118296e-05
## LogLoss: 3.962217e-06
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 1
## Gini: 1
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 327 0 0.000000 =0/327
## 1 0 2 0.000000 =0/2
## Totals 327 2 0.000000 =0/329
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.999601 1.000000 0
## 2 max f2 0.999601 1.000000 0
## 3 max f0point5 0.999601 1.000000 0
## 4 max accuracy 0.999601 1.000000 0
## 5 max precision 0.999601 1.000000 0
## 6 max recall 0.999601 1.000000 0
## 7 max specificity 0.999601 1.000000 0
## 8 max absolute_mcc 0.999601 1.000000 0
## 9 max min_per_class_accuracy 0.999601 1.000000 0
## 10 max mean_per_class_accuracy 0.999601 1.000000 0
## 11 max tns 0.999601 327.000000 0
## 12 max fns 0.999601 0.000000 0
## 13 max fps 0.000001 327.000000 5
## 14 max tps 0.999601 2.000000 0
## 15 max tnr 0.999601 1.000000 0
## 16 max fnr 0.999601 0.000000 0
## 17 max fpr 0.000001 1.000000 5
## 18 max tpr 0.999601 1.000000 0
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy 0.400000 0.547723 1.000000 0.000000 1.000000 0.000000
## auc 1.000000 0.000000 1.000000 NA 1.000000 NA
## err 0.000001 0.000001 0.000000 0.000002 0.000000 0.000002
## err_count 0.000000 0.000000 0.000000 NA 0.000000 NA
## f0point5 0.400000 0.547723 1.000000 0.000000 1.000000 0.000000
## cv_5_valid
## accuracy 0.000000
## auc NA
## err 0.000001
## err_count NA
## f0point5 0.000000
##
## ---
## mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## precision 0.400000 0.547723 1.000000 NA 1.000000 NA
## r2 0.400000 0.547723 1.000000 NA 1.000000 NA
## recall 0.400000 0.547723 1.000000 NA 1.000000 NA
## residual_deviance 0.000317 0.000434 0.000772 NA 0.000811 NA
## rmse 0.000016 0.000022 0.000041 NA 0.000037 NA
## specificity 0.400000 0.547723 1.000000 NA 1.000000 NA
## cv_5_valid
## precision NA
## r2 NA
## recall NA
## residual_deviance NA
## rmse NA
## specificity NA
?h2o.getModel
?h2o.saveModel
?h2o.loadModel
h2o.getModel("StackedEnsemble_BestOfFamily_1_AutoML_1_20241121_205358") %>%
h2o.saveModel("h2o_models/")
## [1] "/Users/jobboonstoppel/Documents/USA/Plymouth State /Student/Classes/2024-2025/Fall/Intermediate Data Analytics (DAT 3100)/PSU_DAT3100/11_module13/h2o_models/StackedEnsemble_BestOfFamily_1_AutoML_1_20241121_205358"
best_model <- h2o.loadModel("h2o_models/StackedEnsemble_BestOfFamily_1_AutoML_1_20241121_205358")
predictions <- h2o.predict(best_model, newdata = test_h2o)
##
|
| | 0%
|
|======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'still_there' has levels not trained on: ["12/8//2020"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation dataset column 'notes' has levels not trained on: ["(Crain's) -- Three months after Enesco Group Inc. CEO Cynthia Passmore told employees she was in a personal relationship with a consultant hired to turn around the fragile knick-knack company, she has stepped down. Itasca-based Enesco on Thursday announced that the board and Ms. Passmore "mutually agreed" for her no longer to serve as CEO, president and director of the company. Her departure from those roles will become effective on Monday, May 15, when the company will hold a conference call to discuss the company's first quarter earnings results and the management change.", "20 Sep 2001, Richard H. (Rick) Wills has been elected to the position of Chairman of the Board. Wills will retain his existing positions of President and Chief Executive Officer of Tektronix. On Oct. 15, 2007, it was reported that Danaher to acquire Tektronix for $2.8 billion.", "After failing, so far, in its attempt to woo Southwest Gas away from Oneok, Southern Union Co. headed east to announce a friendly $500 million acquisition of distributor Pennsylvania Enterprises Inc. (PEI). With the purchase Southern Union will make its first entry into power marketing.Pennsylvania Enterprises will become an autonomous division of Southern Union with the division headquarters remaining in Wilkes-Barre [PA], and there will be no material changes to the operations of PEI. The combined company will have a market capitalization of about $1 billion and serve 1.2 million gas and electric customers in Pennsylvania, Texas, Missouri, Florida and Piedras Negras, Mexico.", "After weeks of speculation, Hoechst AG and Rhone-Poulenc have announcedthat they are to merge their life sciences businesses into a new company called Aventis. Aventis will have its global headquarters in Strasbourg, France, and will have a combined turnover of approximately $20 billion. The companies say that with a combined R&D budget of nearly $3 billion.", "Agreement and Plan of Merger, dated as of
## April 19, 1996, by and among Ideon Group, Inc., CUC
## International Inc. and IG Acquisition Corp. (filed as
## Exhibit 10.21 to the Company's Annual Report on Form
## 10-K for the fiscal year ended January 31, 1996). On February 6, 1996, the Company announced that Eugene Miller, one of its
## outside directors, has been appointed as Chairman of the Board and Chief
## Executive Officer replacing Paul G. Kahn. Mr. Miller serves as the head of the
## Company's Strategic Direction Committee as described above.", "Alston D. Correll has been the Chairman and Chief Executive officer of GP since 1993. He served as the Company’s President from 1991-2002. Mr. Correll has been a GP director since 1992, and his current term ends in 2005. Mr. Correll will enter into mandatory retirement at the end of 2005. He was cited as retired Chairman of the Board of Georgia-Pacific Corporation on a SEC report and his career culminated when he negotiated the sale of Georgia-Pacific. Georgia-Pacific Corp. (NYSE: GP) and KoCell, LLC, a wholly owned subsidiary of Koch Industries, Inc., today announced that the companies have signed a letter of intent for Koch to acquire Georgia-Pacific's non-integrated fluff and market pulp operations for $610 million, which includes assumption of $73 million of indebtedness. December 23, 2005, Koch Industries Finalizes $21 Billion Purchase of Georgia-Pacific.", "Anthony M. Sanfilippo has been Chief Executive Officer and one of the directors since April 2016 and has been the Company’s Chairman of the Board since May 2017. In addition, he served as Former Pinnacle’s Chief Executive Officer and one of Former Pinnacle’s directors from March 2010 to April 2016. He was also Former Pinnacle’s President from March 2010 to May 2013. On December 18, 2017, Penn National Gaming, Inc. and Pinnacle Entertainment, Inc. announced today that they have entered into a definitive agreement under which Penn National will acquire Pinnacle in a cash and stock transaction valued at approximately $2.8 billion.", "April 1, 2004--St. Paul Travelers today completed the merger that combines The St. Paul Companies (NYSE:SPC) and Travelers Property Casualty Corp. (NYSE:TAP.A and TAP.B). Beginning tomorrow morning, April 2, 2004, St. Paul Travelers Companies stock will be listed on the New York Stock Exchange under the symbol "STA." The combined company is uniquely positioned as the commercial insurer of choice for independent agents and brokers across the United States. The company also offers homeowners, auto and other insurance products for individuals and families under the highly regarded Travelers brand.", "April 20, 2004, Bruce R. Lakefield replaced US Airways Group Inc. president and chief executive David N. Siegel who abruptly resigned yesterday in the midst of a painful restructuring of the airline, which had emerged from bankruptcy only 13 months ago. In 2003, US Airways began exploring the availability of financing and merger partners, and after no financing was available, it filed for Chapter 11 bankruptcy again in 2004 for the second time in two years. May 20, 2005, America West Holdings Corp. and US Airways Group announced the two have reached a merger agreement. America West Airlines acquired the bankrupt US Airways on September 27, 2005 to form the US Airways Group.", "Aug. 13, 2012 (GLOBE NEWSWIRE) -- Comverse Technology, Inc. ("CTI") (Nasdaq:CMVT) today announced that it has signed a definitive merger agreement with its majority-owned subsidiary Verint Systems Inc. (Nasdaq:VRNT). Under the terms of the agreement, following the completion of CTI's previously announced distribution to its shareholders of substantially all of its assets, including its wholly-owned subsidiary Comverse, Inc. ("CNS"), other than its holdings in Verint, Verint will acquire the CTI holding company, eliminating the current holding company structure. As of August 10, 2012, CTI currently holds approximately 41.0% of Verint's basic outstanding common shares and 100% of Verint's outstanding convertible preferred shares which, if converted, would result in CTI holding approximately 53.7% of Verint's basic outstanding common shares.", ...106 not listed..., "Steve Schlotterbeck's surprise resignation Thursday sent shockwaves through the region's natural gas industry at a time when he was expecting to lead the country's largest natural gas producer through a split into two companies later this year. Schlotterbeck's departure comes at a busy time for EQT (NYSE: EQT), which in November completed the $6.7 billion acquisition of Rice Energy and announced only last month that it would be separating its exploration/production and midstream divisions. Schlotterbeck had been tapped to remain as CEO of EQT. And EQT and EQT Midstream Partners this year began construction on its long-awaited Mountain Valley Pipeline, which will connect Marcellus and Utica shale gas with Southeast markets.", "The acquisition of RGS Energy Group Inc. by Energy East Corp. could close within 30 days-ending more than a half century of local ownership. Energy East announced plans to acquire the roughly $1 billion RGS last year, proposing to pay $1.4 billion and assume $1 billion in RGS debt. Energy East is headed by Chairman Wesley Von Schack, who before the formation of Energy East was NYSEG’s CEO.", "The board of Lac Minerals Ltd. agreed today to a friendly takeover by the American Barrick Resources Corporation after Barrick sweetened its offer to $1.6 billion, the two Toronto-based companies said. The deal means that Barrick is the likely victor in the takeover fight for control of Lac, which pitted it against Royal Oak Mines Inc.", "The operations of all acquisitions for the three-year period ending December 31, 1993, have been included in the consolidated statements of earnings since the dates of acquisition. Mr. Bartlett joined the Company in 1976, served as its chief executive
## officer from December 1984 to April 1993 and assumed the Board chairmanship in 1989. Mr. Grimes was elected chief executive officer in April 1993 and resigned that position in December 1993. Mr. Bartlett reassumed the position of c
predictions_tbl <- predictions %>%
as_tibble()
predictions_tbl %>%
bind_cols(test_tbl)
## # A tibble: 126 × 13
## predict p0 p1 fyear co_per_rol departure_code ceo_dismissal
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 0 1.00 0.0000000150 1994 900 7 0
## 2 0 1.00 0.00000000516 2007 933 7 0
## 3 0 1.00 0.00000000691 2004 1023 7 0
## 4 0 1.00 0.000000989 1997 1030 7 0
## 5 0 1.00 0.00000000639 2005 1040 7 0
## 6 0 1.00 0.00000212 1994 1107 7 0
## 7 0 1.00 0.00000000944 2001 1434 7 0
## 8 0 1.00 0.0000000122 1998 1522 7 0
## 9 0 1.00 0.0000000124 1998 1566 7 0
## 10 0 1.00 0.00000102 1996 1739 7 0
## # ℹ 116 more rows
## # ℹ 6 more variables: tenure_no_ceodb <dbl>, max_tenure_ceodb <dbl>,
## # fyear_gone <dbl>, leftofc <dttm>, still_there <fct>, notes <fct>
?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
confusion_matrix <- h2o.confusionMatrix(performance_h2o)
print(confusion_matrix)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.999539021097766:
## 0 1 Error Rate
## 0 124 0 0.000000 =0/124
## 1 0 2 0.000000 =0/2
## Totals 124 2 0.000000 =0/126
#typeof(performance_h2o)
#slotNames(performance_h2o)
#performance_h2o@metrics
auc <- h2o.auc(performance_h2o)
print(paste("AUC:", auc))
## [1] "AUC: 1"
metrics <- performance_h2o@metrics
print(metrics)
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "StackedEnsemble_BestOfFamily_1_AutoML_1_20241121_205358"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/StackedEnsemble_BestOfFamily_1_AutoML_1_20241121_205358"
##
##
## $model_checksum
## [1] "7993262640963615984"
##
## $frame
## $frame$name
## [1] "RTMP_sid_b7ed_6"
##
##
## $frame_checksum
## [1] "-796301443263526576"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.732241e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 2.129433e-09
##
## $RMSE
## [1] 4.614578e-05
##
## $nobs
## [1] 126
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.9999999
##
## $logloss
## [1] 5.888734e-06
##
## $AUC
## [1] 1
##
## $pr_auc
## [1] 1
##
## $Gini
## [1] 1
##
## $mean_per_class_error
## [1] 0
##
## $domain
## [1] "0" "1"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 0 1 Error Rate
## 0 124 0 0.0000 = 0 / 124
## 1 0 2 0.0000 = 0 / 2
## Totals 124 2 0.0000 = 0 / 126
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.999764 0.666667 0.555556 0.833333 0.992063 1.000000 0.500000 1.000000
## 2 0.999539 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
## 3 0.000003 0.800000 0.909091 0.714286 0.992063 0.666667 1.000000 0.991935
## 4 0.000002 0.666667 0.833333 0.555556 0.984127 0.500000 1.000000 0.983871
## 5 0.000002 0.571429 0.769231 0.454545 0.976190 0.400000 1.000000 0.975806
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.704273 0.500000 0.750000 124 1 0 1
## 2 1.000000 1.000000 1.000000 124 0 0 2
## 3 0.813198 0.991935 0.995968 123 0 1 2
## 4 0.701381 0.983871 0.991935 122 0 2 2
## 5 0.624758 0.975806 0.987903 121 0 3 2
## tnr fnr fpr tpr idx
## 1 1.000000 0.500000 0.000000 0.500000 0
## 2 1.000000 0.000000 0.000000 1.000000 1
## 3 0.991935 0.000000 0.008065 1.000000 2
## 4 0.983871 0.000000 0.016129 1.000000 3
## 5 0.975806 0.000000 0.024194 1.000000 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 121 0.000000 0.032520 0.077519 0.020576 0.055556 0.016529 1.000000
## 122 0.000000 0.032258 0.076923 0.020408 0.047619 0.016393 1.000000
## 123 0.000000 0.032000 0.076336 0.020243 0.039683 0.016260 1.000000
## 124 0.000000 0.031746 0.075758 0.020080 0.031746 0.016129 1.000000
## 125 0.000000 0.031496 0.075188 0.019920 0.023810 0.016000 1.000000
## 126 0.000000 0.031250 0.074627 0.019763 0.015873 0.015873 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 121 0.040323 0.025816 0.040323 0.520161 5
## 122 0.032258 0.022996 0.032258 0.516129 4
## 123 0.024194 0.019834 0.024194 0.512097 3
## 124 0.016129 0.016129 0.016129 0.508065 2
## 125 0.008065 0.011359 0.008065 0.504032 1
## 126 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 121 0 119 2 0.040323 0.000000 0.959677 1.000000 120
## 122 0 120 2 0.032258 0.000000 0.967742 1.000000 121
## 123 0 121 2 0.024194 0.000000 0.975806 1.000000 122
## 124 0 122 2 0.016129 0.000000 0.983871 1.000000 123
## 125 0 123 2 0.008065 0.000000 0.991935 1.000000 124
## 126 0 124 2 0.000000 0.000000 1.000000 1.000000 125
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.999539 1.000000 1
## 2 max f2 0.999539 1.000000 1
## 3 max f0point5 0.999539 1.000000 1
## 4 max accuracy 0.999539 1.000000 1
## 5 max precision 0.999764 1.000000 0
## 6 max recall 0.999539 1.000000 1
## 7 max specificity 0.999764 1.000000 0
## 8 max absolute_mcc 0.999539 1.000000 1
## 9 max min_per_class_accuracy 0.999539 1.000000 1
## 10 max mean_per_class_accuracy 0.999539 1.000000 1
## 11 max tns 0.999764 124.000000 0
## 12 max fns 0.999764 1.000000 0
## 13 max fps 0.000000 124.000000 125
## 14 max tps 0.999539 2.000000 1
## 15 max tnr 0.999764 1.000000 0
## 16 max fnr 0.999764 0.500000 0
## 17 max fpr 0.000000 1.000000 125
## 18 max tpr 0.999539 1.000000 1
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 1.59 %, avg score: 1.59 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.01587302 0.749655 63.000000 63.000000
## 2 2 0.02380952 0.000003 0.000000 42.000000
## 3 3 0.03174603 0.000002 0.000000 31.500000
## 4 4 0.04761905 0.000002 0.000000 21.000000
## 5 5 0.05555556 0.000002 0.000000 18.000000
## 6 6 0.10317460 0.000002 0.000000 9.692308
## 7 7 0.15079365 0.000002 0.000000 6.631579
## 8 8 0.20634921 0.000001 0.000000 4.846154
## 9 9 0.30158730 0.000000 0.000000 3.315789
## 10 10 0.40476190 0.000000 0.000000 2.470588
## 11 11 0.50000000 0.000000 0.000000 2.000000
## 12 12 0.60317460 0.000000 0.000000 1.657895
## 13 13 0.69841270 0.000000 0.000000 1.431818
## 14 14 0.80158730 0.000000 0.000000 1.247525
## 15 15 0.89682540 0.000000 0.000000 1.115044
## 16 16 1.00000000 0.000000 0.000000 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 0.999651 1.000000 0.999651
## 2 0.000000 0.000003 0.666667 0.666435
## 3 0.000000 0.000002 0.500000 0.499827
## 4 0.000000 0.000002 0.333333 0.333219
## 5 0.000000 0.000002 0.285714 0.285616
## 6 0.000000 0.000002 0.153846 0.153794
## 7 0.000000 0.000002 0.105263 0.105228
## 8 0.000000 0.000001 0.076923 0.076898
## 9 0.000000 0.000000 0.052632 0.052614
## 10 0.000000 0.000000 0.039216 0.039203
## 11 0.000000 0.000000 0.031746 0.031736
## 12 0.000000 0.000000 0.026316 0.026307
## 13 0.000000 0.000000 0.022727 0.022720
## 14 0.000000 0.000000 0.019802 0.019796
## 15 0.000000 0.000000 0.017699 0.017693
## 16 0.000000 0.000000 0.015873 0.015868
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 1.000000 1.000000 6200.000000 6200.000000
## 2 0.000000 1.000000 -100.000000 4100.000000
## 3 0.000000 1.000000 -100.000000 3050.000000
## 4 0.000000 1.000000 -100.000000 2000.000000
## 5 0.000000 1.000000 -100.000000 1700.000000
## 6 0.000000 1.000000 -100.000000 869.230769
## 7 0.000000 1.000000 -100.000000 563.157895
## 8 0.000000 1.000000 -100.000000 384.615385
## 9 0.000000 1.000000 -100.000000 231.578947
## 10 0.000000 1.000000 -100.000000 147.058824
## 11 0.000000 1.000000 -100.000000 100.000000
## 12 0.000000 1.000000 -100.000000 65.789474
## 13 0.000000 1.000000 -100.000000 43.181818
## 14 0.000000 1.000000 -100.000000 24.752475
## 15 0.000000 1.000000 -100.000000 11.504425
## 16 0.000000 1.000000 -100.000000 0.000000
## kolmogorov_smirnov
## 1 1.000000
## 2 0.991935
## 3 0.983871
## 4 0.967742
## 5 0.959677
## 6 0.911290
## 7 0.862903
## 8 0.806452
## 9 0.709677
## 10 0.604839
## 11 0.508065
## 12 0.403226
## 13 0.306452
## 14 0.201613
## 15 0.104839
## 16 0.000000
##
## $residual_deviance
## [1] 0.001483961
##
## $null_deviance
## [1] 21.92384
##
## $AIC
## [1] 8.001484
##
## $loglikelihood
## [1] 0
##
## $null_degrees_of_freedom
## [1] 125
##
## $residual_degrees_of_freedom
## [1] 122