departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv')
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm (1): leftofc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(departures)
Name | departures |
Number of rows | 9423 |
Number of columns | 19 |
_______________________ | |
Column type frequency: | |
character | 8 |
numeric | 10 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
coname | 0 | 1.00 | 2 | 30 | 0 | 3860 | 0 |
exec_fullname | 0 | 1.00 | 5 | 790 | 0 | 8701 | 0 |
interim_coceo | 9105 | 0.03 | 6 | 7 | 0 | 6 | 0 |
still_there | 7311 | 0.22 | 3 | 10 | 0 | 77 | 0 |
notes | 1644 | 0.83 | 5 | 3117 | 0 | 7755 | 0 |
sources | 1475 | 0.84 | 18 | 1843 | 0 | 7915 | 0 |
eight_ks | 4499 | 0.52 | 69 | 3884 | 0 | 4914 | 0 |
_merge | 0 | 1.00 | 11 | 11 | 0 | 1 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
dismissal_dataset_id | 0 | 1.00 | 5684.10 | 25005.46 | 1 | 2305.5 | 4593 | 6812.5 | 559044 | ▇▁▁▁▁ |
gvkey | 0 | 1.00 | 40132.48 | 53921.34 | 1004 | 7337.0 | 14385 | 60900.5 | 328795 | ▇▁▁▁▁ |
fyear | 0 | 1.00 | 2007.74 | 8.19 | 1987 | 2000.0 | 2008 | 2016.0 | 2020 | ▁▆▅▅▇ |
co_per_rol | 0 | 1.00 | 25580.22 | 18202.38 | -1 | 8555.5 | 22980 | 39275.5 | 64602 | ▇▆▅▃▃ |
departure_code | 1667 | 0.82 | 5.20 | 1.53 | 1 | 5.0 | 5 | 7.0 | 9 | ▁▃▇▅▁ |
ceo_dismissal | 1813 | 0.81 | 0.20 | 0.40 | 0 | 0.0 | 0 | 0.0 | 1 | ▇▁▁▁▂ |
tenure_no_ceodb | 0 | 1.00 | 1.03 | 0.17 | 0 | 1.0 | 1 | 1.0 | 3 | ▁▇▁▁▁ |
max_tenure_ceodb | 0 | 1.00 | 1.05 | 0.24 | 1 | 1.0 | 1 | 1.0 | 4 | ▇▁▁▁▁ |
fyear_gone | 1802 | 0.81 | 2006.64 | 13.63 | 1980 | 2000.0 | 2007 | 2013.0 | 2997 | ▇▁▁▁▁ |
cik | 245 | 0.97 | 741469.17 | 486551.43 | 1750 | 106413.0 | 857323 | 1050375.8 | 1808065 | ▆▁▇▂▁ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
leftofc | 1802 | 0.81 | 1981-01-01 | 2998-04-27 | 2006-12-31 | 3627 |
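The skim output flags some implausible values: fyear_gone reaches 2997 and leftofc reaches the year 2998, which are clearly data-entry errors. A quick check like the sketch below (assuming dplyr and lubridate are available) isolates those rows before they get handled in the cleaning step.
# Sketch: inspect rows with impossible departure years
departures %>%
  filter(fyear_gone > 2025 | lubridate::year(leftofc) > 2025) %>%
  select(dismissal_dataset_id, coname, fyear_gone, leftofc)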
# Clean data
departures_clean <- departures %>%
# Clean the target variable
filter(!is.na(ceo_dismissal)) %>%
mutate(ceo_dismissal = if_else(ceo_dismissal == 1, "dismissed", "not_dis")) %>%
mutate(ceo_dismissal = as.factor(ceo_dismissal)) %>%
# Remove variables with too many missing values
select(-c(interim_coceo, still_there, eight_ks)) %>%
# Remove irrelevant variables
select(-`_merge`, -sources) %>%
# Remove variables whose information only becomes available after the departure (they leak the outcome)
select(-departure_code) %>%
# Remove redundant variables
select(-c(gvkey, cik, co_per_rol)) %>% #need leftofc as date variable later
# Remove duplicates in dismissal_dataset_id, our ID variable
distinct(dismissal_dataset_id, .keep_all = TRUE) %>%
# Remove rows with the implausible fyear_gone value of 2997
filter(fyear_gone < 2025) %>%
# Convert factors that are incorrectly imported as numeric variables
mutate(across(c(tenure_no_ceodb, max_tenure_ceodb, fyear_gone), as.factor)) %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(notes = as.character(notes))
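Before modeling it helps to confirm how imbalanced the cleaned target is; a quick count (sketch) shows roughly four non-dismissals for every dismissal, which is why SMOTE is applied in the recipe below.
# Sketch: class balance of the cleaned target variable
departures_clean %>%
  count(ceo_dismissal) %>%
  mutate(prop = n / sum(n))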
skimr::skim(departures_clean)
Name | departures_clean |
Number of rows | 7475 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
character | 1 |
factor | 6 |
numeric | 2 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
notes | 17 | 1 | 5 | 3117 | 0 | 7448 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
coname | 0 | 1 | FALSE | 3427 | BAR: 8, CLA: 8, FED: 8, GRE: 8 |
exec_fullname | 0 | 1 | FALSE | 6975 | Joh: 4, Mel: 4, Alb: 3, Ami: 3 |
ceo_dismissal | 0 | 1 | FALSE | 2 | not: 5992, dis: 1483 |
tenure_no_ceodb | 0 | 1 | FALSE | 3 | 1: 7289, 2: 179, 3: 7 |
max_tenure_ceodb | 0 | 1 | FALSE | 4 | 1: 7138, 2: 319, 3: 15, 4: 3 |
fyear_gone | 0 | 1 | FALSE | 34 | 200: 379, 199: 351, 200: 334, 200: 321 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
dismissal_dataset_id | 0 | 1 | 5570.32 | 25757.33 | 1 | 2175.5 | 4326 | 6579.5 | 559044 | ▇▁▁▁▁ |
fyear | 0 | 1 | 2005.61 | 7.45 | 1987 | 1999.0 | 2006 | 2012.0 | 2020 | ▁▇▆▇▆ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
leftofc | 0 | 1 | 1981-01-01 | 2021-12-01 | 2006-11-15 | 3576 |
library(tidymodels)
# Set seed for reproducibility
set.seed(1234)
# Down-sample to 100 rows per class so tuning runs quickly
data_clean <- departures_clean %>% group_by(ceo_dismissal) %>% sample_n(100) %>% ungroup()
# Split the data into training and testing sets
data_split <- initial_split(data_clean, strata = ceo_dismissal)
data_train <- training(data_split)
data_test <- testing(data_split)
# Create cross-validation sets for the training data
data_cv <- vfold_cv(data_train, strata = ceo_dismissal)
data_cv
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [134/16]> Fold01
## 2 <split [134/16]> Fold02
## 3 <split [134/16]> Fold03
## 4 <split [134/16]> Fold04
## 5 <split [134/16]> Fold05
## 6 <split [136/14]> Fold06
## 7 <split [136/14]> Fold07
## 8 <split [136/14]> Fold08
## 9 <split [136/14]> Fold09
## 10 <split [136/14]> Fold10
#departures_clean <- departures_clean %>%
#mutate(leftofc = as.Date(leftofc, format = "%Y-%m-%d"))
library(textrecipes) # step_tokenize(), step_tokenfilter(), step_tfidf()
library(themis)      # step_smote()
xgboost_rec <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
update_role(dismissal_dataset_id, new_role = "ID") %>%
step_other(coname, exec_fullname, threshold = 0.05) %>%
step_tokenize(notes) %>%
step_tokenfilter(notes, max_tokens = 100) %>%
step_tfidf(notes) %>%
step_date(leftofc, features = c("year", "month", "doy"), keep_original_cols = FALSE) %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(ceo_dismissal)
xgboost_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 150
## Columns: 156
## $ dismissal_dataset_id <dbl> 3386, 4200, 6287, 1277, 2064, 3066, 6716, 6613…
## $ fyear <dbl> 2012, 2004, 2001, 2013, 2012, 2005, 2016, 2000…
## $ ceo_dismissal <fct> dismissed, dismissed, dismissed, dismissed, di…
## $ tfidf_notes_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_1997 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_2003 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_3 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04145575…
## $ tfidf_notes_a <dbl> 0.06278155, 0.00000000, 0.03251187, 0.04477045…
## $ tfidf_notes_about <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_acquisition <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_after <dbl> 0.06483837, 0.00000000, 0.00000000, 0.03082480…
## $ tfidf_notes_agreement <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_all <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_also <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03835080…
## $ tfidf_notes_an <dbl> 0.06084517, 0.00000000, 0.00000000, 0.05785278…
## $ tfidf_notes_and <dbl> 0.02796311, 0.00000000, 0.05792359, 0.02658788…
## $ tfidf_notes_announced <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06164960…
## $ tfidf_notes_as <dbl> 0.10075215, 0.00000000, 0.06956696, 0.01596619…
## $ tfidf_notes_at <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_based <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_be <dbl> 0.00000000, 0.00000000, 0.08159930, 0.00000000…
## $ tfidf_notes_been <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_board <dbl> 0.00000000, 0.00000000, 0.04474153, 0.00000000…
## $ tfidf_notes_but <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_by <dbl> 0.00000000, 0.38467534, 0.00000000, 0.00000000…
## $ tfidf_notes_ceo <dbl> 0.09166455, 0.00000000, 0.09493828, 0.02178911…
## $ tfidf_notes_chairman <dbl> 0.00000000, 0.00000000, 0.00000000, 0.02224629…
## $ tfidf_notes_changed <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04266704…
## $ tfidf_notes_chief <dbl> 0.00000000, 0.00000000, 0.00000000, 0.01996979…
## $ tfidf_notes_company <dbl> 0.03506670, 0.00000000, 0.03631908, 0.05001316…
## $ `tfidf_notes_company's` <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_corp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_corporation <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_departure <dbl> 0.00000000, 0.00000000, 0.09031432, 0.00000000…
## $ tfidf_notes_did <dbl> 0.00000000, 0.00000000, 0.09031432, 0.00000000…
## $ tfidf_notes_director <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_directors <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_down <dbl> 0.16537209, 0.00000000, 0.00000000, 0.03930976…
## $ tfidf_notes_during <dbl> 0.17440006, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_effective <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_executive <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_financial <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_following <dbl> 0.00000000, 0.00000000, 0.09295320, 0.00000000…
## $ tfidf_notes_for <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04228418…
## $ tfidf_notes_from <dbl> 0.04536817, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_had <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06742466…
## $ tfidf_notes_has <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_have <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_he <dbl> 0.00000000, 0.00000000, 0.04651873, 0.04270572…
## $ tfidf_notes_him <dbl> 0.00000000, 0.59947382, 0.00000000, 0.00000000…
## $ tfidf_notes_his <dbl> 0.16326056, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_in <dbl> 0.09604772, 0.00000000, 0.00000000, 0.04566203…
## $ tfidf_notes_inc <dbl> 0.00000000, 0.00000000, 0.00000000, 0.02587189…
## $ tfidf_notes_into <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_is <dbl> 0.00000000, 0.00000000, 0.12235561, 0.00000000…
## $ tfidf_notes_it <dbl> 0.00000000, 0.00000000, 0.12416220, 0.00000000…
## $ tfidf_notes_its <dbl> 0.05442019, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_january <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_john <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_left <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_management <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04145575…
## $ tfidf_notes_march <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03930976…
## $ tfidf_notes_may <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03930976…
## $ tfidf_notes_merger <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_million <dbl> 0.07536068, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_mr <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_new <dbl> 0.00000000, 0.00000000, 0.07489790, 0.03437936…
## $ tfidf_notes_not <dbl> 0.06957922, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_of <dbl> 0.00000000, 0.00000000, 0.08738442, 0.02674059…
## $ tfidf_notes_officer <dbl> 0.00000000, 0.00000000, 0.00000000, 0.02248327…
## $ tfidf_notes_on <dbl> 0.00000000, 0.00000000, 0.04651873, 0.00000000…
## $ tfidf_notes_or <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_other <dbl> 0.07702042, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_out <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_over <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_performance <dbl> 0.00000000, 0.00000000, 0.00000000, 0.04145575…
## $ tfidf_notes_president <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_resignation <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_resigned <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_retire <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0829115, 0.…
## $ tfidf_notes_retirement <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_notes_said <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_served <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_since <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_stock <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_than <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_that <dbl> 0.00000000, 0.00000000, 0.00000000, 0.02015495…
## $ tfidf_notes_the <dbl> 0.02576601, 0.00000000, 0.05337246, 0.04899767…
## $ tfidf_notes_they <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_this <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_time <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_notes_to <dbl> 0.03021542, 0.00000000, 0.06258909, 0.04309413…
## $ tfidf_notes_today <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_notes_until <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03745542…
## $ tfidf_notes_was <dbl> 0.03915458, 0.28387071, 0.00000000, 0.03722895…
## $ tfidf_notes_were <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_when <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ tfidf_notes_which <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03835080…
## $ tfidf_notes_who <dbl> 0.00000000, 0.47809686, 0.00000000, 0.03135061…
## $ tfidf_notes_will <dbl> 0.00000000, 0.00000000, 0.00000000, 0.09405184…
## $ tfidf_notes_with <dbl> 0.00000000, 0.00000000, 0.00000000, 0.06103201…
## $ tfidf_notes_year <dbl> 0.00000000, 0.00000000, 0.06829955, 0.00000000…
## $ tfidf_notes_years <dbl> 0.00000000, 0.00000000, 0.00000000, 0.03190017…
## $ leftofc_year <int> 2012, 2005, 2001, 2014, 2012, 2005, 2017, 2003…
## $ leftofc_doy <int> 269, 32, 288, 60, 275, 31, 9, 90, 53, 333, 1, …
## $ coname_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ exec_fullname_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ tenure_no_ceodb_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tenure_no_ceodb_X3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ max_tenure_ceodb_X4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1988 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1990 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1991 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1992 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1993 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1994 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1995 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1996 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1997 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X1998 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ fyear_gone_X1999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ fyear_gone_X2001 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2002 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2003 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2004 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2005 <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ fyear_gone_X2006 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2008 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2009 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2010 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2011 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2012 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ fyear_gone_X2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2014 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2016 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2017 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2018 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1…
## $ fyear_gone_X2019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2020 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear_gone_X2021 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Feb <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Mar <dbl> 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Apr <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ leftofc_month_May <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ leftofc_month_Jun <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Jul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ leftofc_month_Aug <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Sep <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Oct <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0…
## $ leftofc_month_Nov <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ leftofc_month_Dec <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
xgboost_spec <-
boost_tree(trees = tune(), tree_depth = tune(), min_n = tune(), learn_rate = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_rec) %>%
add_model(xgboost_spec)
doParallel::registerDoParallel()
set.seed(65743)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5,
control = control_grid(save_pred = TRUE))
## Warning: ! tune detected a parallel backend registered with foreach but no backend
## registered with future.
## ℹ Support for parallel processing with foreach was soft-deprecated in tune
## 1.2.1.
## ℹ See ?parallelism (`?tune::parallelism()`) to learn more.
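The warning above appears because a foreach backend (doParallel) is registered while newer versions of tune prefer a future backend. A future-based setup would look roughly like this (a sketch, assuming the future package is installed).
# Sketch: register a future backend instead of doParallel
library(future)
plan(multisession, workers = parallel::detectCores() - 1)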
collect_metrics(xgboost_tune)
## # A tibble: 15 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 1000 2 1 0.0750 accuracy binary 0.624 10 0.0447
## 2 1000 2 1 0.0750 brier_class binary 0.265 10 0.0327
## 3 1000 2 1 0.0750 roc_auc binary 0.703 10 0.0501
## 4 1500 11 11 0.001 accuracy binary 0.588 10 0.0531
## 5 1500 11 11 0.001 brier_class binary 0.241 10 0.00621
## 6 1500 11 11 0.001 roc_auc binary 0.628 10 0.0454
## 7 500 21 15 0.316 accuracy binary 0.5 10 0
## 8 500 21 15 0.316 brier_class binary 0.25 10 0
## 9 500 21 15 0.316 roc_auc binary 0.5 10 0
## 10 1 30 4 0.00422 accuracy binary 0.5 10 0
## 11 1 30 4 0.00422 brier_class binary 0.25 10 0
## 12 1 30 4 0.00422 roc_auc binary 0.5 10 0
## 13 2000 40 8 0.0178 accuracy binary 0.5 10 0
## 14 2000 40 8 0.0178 brier_class binary 0.25 10 0
## 15 2000 40 8 0.0178 roc_auc binary 0.5 10 0
## # ℹ 1 more variable: .config <chr>
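To focus on the strongest candidates for a single metric, show_best() narrows the table down (sketch).
# Sketch: top parameter combinations ranked by ROC AUC
show_best(xgboost_tune, metric = "roc_auc", n = 3)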
collect_predictions(xgboost_tune) %>%
group_by(id) %>%
roc_curve(ceo_dismissal, .pred_dismissed) %>%
autoplot()
xgboost_last <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
last_fit(data_split)
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.64 Preprocessor1_Model1
## 2 roc_auc binary 0.653 Preprocessor1_Model1
## 3 brier_class binary 0.269 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
yardstick::conf_mat(ceo_dismissal, .pred_class) %>%
autoplot()
library(vip)
xgboost_last %>%
workflows::extract_fit_engine() %>%
vip()
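The same importance scores can be pulled as a table instead of a plot with vi() (sketch).
# Sketch: variable importance as a tibble
xgboost_last %>%
  workflows::extract_fit_engine() %>%
  vip::vi()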
The previous model had an accuracy of 0.838 and an AUC of 0.856.

- Feature transformation: adding the date features via step_date() gave almost the same result, slightly worse, with an accuracy of 0.837 and an AUC of 0.853.
- Feature transformation: removing the down-sampling step (%>% group_by(ceo_dismissal) %>% sample_n(100) %>% ungroup()) did not improve accuracy and gave worse results on the full data set.
- Feature selection: PCA did not make an improvement.
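For reference, the PCA variant mentioned above could be set up roughly like this (a sketch with an assumed number of components, not the exact recipe that was tried).
# Sketch: same preprocessing as xgboost_rec, plus normalization and PCA
xgboost_rec_pca <- recipes::recipe(ceo_dismissal ~ ., data = data_train) %>%
  update_role(dismissal_dataset_id, new_role = "ID") %>%
  step_other(coname, exec_fullname, threshold = 0.05) %>%
  step_tokenize(notes) %>%
  step_tokenfilter(notes, max_tokens = 100) %>%
  step_tfidf(notes) %>%
  step_date(leftofc, features = c("year", "month", "doy"), keep_original_cols = FALSE) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_pca(all_numeric_predictors(), num_comp = 20) %>%
  step_smote(ceo_dismissal)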
recipe_obj <- recipe(ceo_dismissal ~ ., data = data_train) %>%
# Remove zero variance variables
step_zv(all_predictors())
# Initialize h2o
library(h2o)
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 6 minutes 44 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 17 days
## H2O cluster name: H2O_started_from_R_rad1081_pja023
## H2O cluster total nodes: 1
## H2O cluster total memory: 7.08 GB
## H2O cluster total cores: 20
## H2O cluster allowed cores: 20
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.1 (2024-06-14 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 17 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
split.h2o <- h2o.splitFrame(as.h2o(data_train), ratios = c(0.85), seed = 2345)
## | | | 0% | |======================================================================| 100%
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(data_test)
## | | | 0% | |======================================================================| 100%
y <- "ceo_dismissal"
x <- setdiff(names(data_train), y)
models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
# max_runtime_secs = 30,
max_models = 10,
exclude_algos = "DeepLearning",
nfolds = 5,
seed = 3456
)
## | | | 0% | |================================================== | 71%
## 22:21:33.192: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 22:21:33.196: AutoML: XGBoost is not available; skipping it.
## 22:21:33.196: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.342: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.342: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 133.0.
## 22:21:33.342: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.509: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.622: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.743: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.848: _train param, Dropping bad and constant columns: [notes]
## 22:21:33.940: _train param, Dropping bad and constant columns: [notes]
## 22:21:34.276: _train param, Dropping bad and constant columns: [notes]
## 22:21:34.425: _train param, Dropping unused columns: [notes]
## 22:21:34.666: _train param, Dropping unused columns: [notes] | |======================================================================| 100%
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc
## 1 GBM_lr_annealing_selection_AutoML_3_20250507_222133_select_model 0.6384
## 2 StackedEnsemble_BestOfFamily_1_AutoML_3_20250507_222133 0.6384
## 3 GLM_1_AutoML_3_20250507_222133 0.5632
## 4 XRT_1_AutoML_3_20250507_222133 0.5368
## 5 GBM_2_AutoML_3_20250507_222133 0.5128
## 6 GBM_grid_1_AutoML_3_20250507_222133_model_3 0.4904
## logloss aucpr mean_per_class_error rmse mse
## 1 1.1890790 0.6733367 0.32 0.6336895 0.4015624
## 2 0.6663851 0.6733367 0.32 0.4879913 0.2381355
## 3 0.6920394 0.5765021 0.50 0.4994458 0.2494461
## 4 0.7143198 0.5302203 0.46 0.5091135 0.2591966
## 5 1.2738569 0.5653167 0.44 0.6418286 0.4119440
## 6 0.9656350 0.5164876 0.50 0.5908143 0.3490615
##
## [11 rows x 7 columns]
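The default print shows only part of the leaderboard; h2o.get_leaderboard() returns the full table, optionally with extra columns such as training time (sketch).
# Sketch: full leaderboard with additional columns
h2o.get_leaderboard(models_h2o, extra_columns = "ALL") %>%
  as_tibble()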
models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model ID: GBM_lr_annealing_selection_AutoML_3_20250507_222133_select_model
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 20 20 5828 3
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 5 4.10000 7 9 8.40000
##
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.007384357
## RMSE: 0.08593228
## LogLoss: 0.08953965
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 1
## Gini: 1
## R^2: 0.9704609
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not_dis Error Rate
## dismissed 67 0 0.000000 =0/67
## not_dis 0 66 0.000000 =0/66
## Totals 67 66 0.000000 =0/133
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.892790 1.000000 64
## 2 max f2 0.892790 1.000000 64
## 3 max f0point5 0.892790 1.000000 64
## 4 max accuracy 0.892790 1.000000 64
## 5 max precision 0.923658 1.000000 0
## 6 max recall 0.892790 1.000000 64
## 7 max specificity 0.923658 1.000000 0
## 8 max absolute_mcc 0.892790 1.000000 64
## 9 max min_per_class_accuracy 0.892790 1.000000 64
## 10 max mean_per_class_accuracy 0.892790 1.000000 64
## 11 max tns 0.923658 67.000000 0
## 12 max fns 0.923658 65.000000 0
## 13 max fps 0.077761 67.000000 127
## 14 max tps 0.892790 66.000000 64
## 15 max tnr 0.923658 1.000000 0
## 16 max fnr 0.923658 0.984848 0
## 17 max fpr 0.077761 1.000000 127
## 18 max tpr 0.892790 1.000000 64
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
##
## MSE: 0.4410905
## RMSE: 0.6641464
## LogLoss: 1.314171
## Mean Per-Class Error: 0.375
## AUC: 0.5277778
## AUCPR: 0.5289719
## Gini: 0.05555556
## R^2: -0.7704883
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not_dis Error Rate
## dismissed 2 6 0.750000 =6/8
## not_dis 0 9 0.000000 =0/9
## Totals 2 15 0.352941 =6/17
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.078710 0.750000 9
## 2 max f2 0.078710 0.882353 9
## 3 max f0point5 0.078710 0.652174 9
## 4 max accuracy 0.078710 0.647059 9
## 5 max precision 0.097067 0.600000 2
## 6 max recall 0.078710 1.000000 9
## 7 max specificity 0.107839 0.875000 0
## 8 max absolute_mcc 0.078710 0.387298 9
## 9 max min_per_class_accuracy 0.092963 0.500000 4
## 10 max mean_per_class_accuracy 0.078710 0.625000 9
## 11 max tns 0.107839 7.000000 0
## 12 max fns 0.107839 8.000000 0
## 13 max fps 0.076896 8.000000 11
## 14 max tps 0.078710 9.000000 9
## 15 max tnr 0.107839 0.875000 0
## 16 max fnr 0.107839 0.888889 0
## 17 max fpr 0.076896 1.000000 11
## 18 max tpr 0.078710 1.000000 9
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.3925783
## RMSE: 0.6265607
## LogLoss: 1.171669
## Mean Per-Class Error: 0.5
## AUC: 0.5574401
## AUCPR: 0.595799
## Gini: 0.1148801
## R^2: -0.5704022
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## dismissed not_dis Error Rate
## dismissed 0 67 1.000000 =67/67
## not_dis 0 66 0.000000 =0/66
## Totals 0 133 0.503759 =67/133
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.072764 0.663317 81
## 2 max f2 0.072764 0.831234 81
## 3 max f0point5 0.083791 0.570776 58
## 4 max accuracy 0.090045 0.571429 44
## 5 max precision 0.515654 1.000000 0
## 6 max recall 0.072764 1.000000 81
## 7 max specificity 0.515654 1.000000 0
## 8 max absolute_mcc 0.128158 0.237481 6
## 9 max min_per_class_accuracy 0.090294 0.552239 43
## 10 max mean_per_class_accuracy 0.090045 0.571574 44
## 11 max tns 0.515654 67.000000 0
## 12 max fns 0.515654 65.000000 0
## 13 max fps 0.074445 67.000000 79
## 14 max tps 0.072764 66.000000 81
## 15 max tnr 0.515654 1.000000 0
## 16 max fnr 0.515654 0.984848 0
## 17 max fpr 0.074445 1.000000 79
## 18 max tpr 0.072764 1.000000 81
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.577778 0.099208 0.555556 0.592593 0.740741
## auc 0.569611 0.083104 0.516484 0.620879 0.684066
## err 0.422222 0.099208 0.444444 0.407407 0.259259
## err_count 11.200000 2.489980 12.000000 11.000000 7.000000
## f0point5 0.608048 0.085271 0.575221 0.596330 0.757576
## f1 0.692197 0.030961 0.684211 0.702703 0.740741
## f2 0.818145 0.053055 0.844156 0.855263 0.724638
## lift_top_group 2.016484 0.062406 2.076923 2.076923 1.928572
## logloss 1.472991 0.066341 1.502453 1.362837 1.460304
## max_per_class_error 0.785714 0.294508 0.857143 0.785714 0.285714
## mcc 0.365588 0.107662 0.272554 0.340693 0.483516
## mean_per_class_accuracy 0.584066 0.099616 0.571429 0.607143 0.741758
## mean_per_class_error 0.415934 0.099616 0.428571 0.392857 0.258242
## mse 0.429751 0.018685 0.438094 0.396594 0.438892
## pr_auc 0.624372 0.083814 0.565712 0.655953 0.749358
## precision 0.566179 0.114807 0.520000 0.541667 0.769231
## r2 -0.720403 0.074348 -0.754784 -0.588554 -0.757979
## recall 0.942857 0.127775 1.000000 1.000000 0.714286
## rmse 0.655426 0.014460 0.661887 0.629757 0.662489
## specificity 0.225275 0.317923 0.142857 0.214286 0.769231
## cv_4_valid cv_5_valid
## accuracy 0.500000 0.500000
## auc 0.476331 0.550296
## err 0.500000 0.500000
## err_count 13.000000 13.000000
## f0point5 0.555556 0.555556
## f1 0.666667 0.666667
## f2 0.833333 0.833333
## lift_top_group 2.000000 2.000000
## logloss 1.513267 1.526091
## max_per_class_error 1.000000 1.000000
## mcc NA NA
## mean_per_class_accuracy 0.500000 0.500000
## mean_per_class_error 0.500000 0.500000
## mse 0.434334 0.440840
## pr_auc 0.535128 0.615709
## precision 0.500000 0.500000
## r2 -0.737335 -0.763362
## recall 1.000000 1.000000
## rmse 0.659040 0.663958
## specificity 0.000000 0.000000
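Since the leader is a GBM, its variable importance can also be plotted directly (sketch).
# Sketch: variable importance of the leading model
h2o.varimp_plot(models_h2o@leader)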
?h2o.getModel
## starting httpd help server ... done
?h2o.saveModel
?h2o.loadModel
best_model <- models_h2o@leader
# best_model <- h2o.loadModel("h2o_models/GBM_lr_annealing_selection_AutoML_1_20250504_210132_select_model")
# having trouble making this one work
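If reloading a saved leader is the goal, the save and load calls have to point at the same path; h2o.saveModel() returns that path, so a minimal sketch (the folder name is just an example) is:
# Sketch: persist the leader model, then reload it from the returned path
saved_path <- h2o.saveModel(best_model, path = "h2o_models", force = TRUE)
best_model <- h2o.loadModel(saved_path)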
predictions <- h2o.predict(best_model, newdata = test_h2o)
## | | | 0% | |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'coname' has levels not trained on: ["ADVANCE AUTO PARTS INC",
## "AK STEEL HOLDING CORP", "AMOCO CORP", "BISYS GROUP INC", "BOB EVANS FARMS",
## "BRUNSWICK CORP", "CACI INTL INC -CL A", "CALAMP CORP", "CATALINA MARKETING
## CORP", "CDW CORP", ...26 not listed..., "SYMANTEC CORP", "SYMBOL TECHNOLOGIES",
## "SYSCO CORP", "TECUMSEH PRODUCTS CO", "U S TRUST CORP", "U.S. STEEL", "US
## AIRWAYS GROUP INC-OLD", "VOLT INFO SCIENCES INC", "WET SEAL INC -CL A", "XPERI
## CORPORATION"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'exec_fullname' has levels not trained on: ["Bruce Karatz",
## "Bruce M. McWilliams", "Bruce R. Lakefield", "Charles R. Perrin", "David W.
## Fox", "Dennis C. Pence", "Eric Krasnoff", "Frank J. Hansen", "H. Laurance
## Fuller", "H. Marshall Schwarz", ...30 not listed..., "Steven D. Butler",
## "Thomas E. Richards", "Thomas J. Usher", "Thomas Kendall Hunt", "Timothy N.
## Jenson", "Tomo Razmilovic", "Volker Wypyszyk", "W. A. Griffin III", "William H.
## Swanson", "William L. Schrader"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'fyear_gone' has levels not trained on: ["2020"]
predictions_tbl <- predictions %>%
as_tibble()
predictions_tbl %>%
bind_cols(data_test)
## # A tibble: 50 × 13
## predict dismissed not_dis dismissal_dataset_id coname fyear exec_fullname
## <fct> <dbl> <dbl> <dbl> <fct> <dbl> <fct>
## 1 not_dis 0.911 0.0893 2853 POWELL IN… 2015 "Michael All…
## 2 not_dis 0.910 0.0897 8320 FREEPORT … 2014 "James C. Fl…
## 3 not_dis 0.911 0.0895 5771 BISYS GRO… 2005 "Russell P. …
## 4 not_dis 0.911 0.0892 3296 STRIDE RI… 1999 "James A. Es…
## 5 not_dis 0.911 0.0895 8250 ADVANCE A… 2006 "Michael N. …
## 6 not_dis 0.892 0.108 5286 WET SEAL … 2004 "Peter D. Wh…
## 7 not_dis 0.893 0.107 2208 MAXTOR CO… 1994 "Hyundai's c…
## 8 not_dis 0.903 0.0971 3344 SYMBOL TE… 2002 "Tomo Razmil…
## 9 not_dis 0.911 0.0895 7390 STARTEK I… 2006 "Steven D. B…
## 10 not_dis 0.910 0.0897 4811 SYMANTEC … 2015 "Michael A. …
## # ℹ 40 more rows
## # ℹ 6 more variables: ceo_dismissal <fct>, tenure_no_ceodb <fct>,
## # max_tenure_ceodb <fct>, fyear_gone <fct>, leftofc <dttm>, notes <chr>
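For a direct comparison with the tidymodels results, the same yardstick metrics can be computed on these predictions (a sketch; it assumes the predicted factor shares the levels of ceo_dismissal).
# Sketch: accuracy and ROC AUC of the h2o leader on the held-out test set
predictions_tbl %>%
  bind_cols(data_test) %>%
  yardstick::accuracy(truth = ceo_dismissal, estimate = predict)
predictions_tbl %>%
  bind_cols(data_test) %>%
  yardstick::roc_auc(truth = ceo_dismissal, dismissed)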
?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
confusion_matrix <- h2o.confusionMatrix(performance_h2o)
print(confusion_matrix)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.0926881801248008:
## dismissed not_dis Error Rate
## dismissed 16 9 0.360000 =9/25
## not_dis 7 18 0.280000 =7/25
## Totals 23 27 0.320000 =16/50
#typeof(performance_h2o)
#slotNames(performance_h2o)
#performance_h2o@metrics
metrics <- performance_h2o@metrics
print(metrics)
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "GBM_lr_annealing_selection_AutoML_3_20250507_222133_select_model"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/GBM_lr_annealing_selection_AutoML_3_20250507_222133_select_model"
##
##
## $model_checksum
## [1] "-8767878325203396011"
##
## $frame
## $frame$name
## [1] "data_test_sid_b620_3"
##
##
## $frame_checksum
## [1] "8908287964745272696"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.746671e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.4015624
##
## $RMSE
## [1] 0.6336895
##
## $nobs
## [1] 50
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] -0.6062494
##
## $logloss
## [1] 1.189079
##
## $AUC
## [1] 0.6384
##
## $pr_auc
## [1] 0.6733367
##
## $Gini
## [1] 0.2768
##
## $mean_per_class_error
## [1] 0.32
##
## $domain
## [1] "dismissed" "not_dis"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## dismissed not_dis Error Rate
## dismissed 16 9 0.3600 = 9 / 25
## not_dis 7 18 0.2800 = 7 / 25
## Totals 23 27 0.3200 = 16 / 50
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.482191 0.076923 0.049505 0.172414 0.520000 1.000000 0.040000 1.000000
## 2 0.125795 0.148148 0.098039 0.303030 0.540000 1.000000 0.080000 1.000000
## 3 0.116661 0.214286 0.145631 0.405405 0.560000 1.000000 0.120000 1.000000
## 4 0.107860 0.206897 0.144231 0.365854 0.540000 0.750000 0.120000 0.960000
## 5 0.107856 0.375000 0.280374 0.566038 0.600000 0.857143 0.240000 0.960000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.142857 0.040000 0.520000 25 24 0 1
## 2 0.204124 0.080000 0.540000 25 23 0 2
## 3 0.252646 0.120000 0.560000 25 22 0 3
## 4 0.147442 0.120000 0.540000 24 22 1 3
## 5 0.288195 0.240000 0.600000 24 19 1 6
## tnr fnr fpr tpr idx
## 1 1.000000 0.960000 0.000000 0.040000 0
## 2 1.000000 0.920000 0.000000 0.080000 1
## 3 1.000000 0.880000 0.000000 0.120000 2
## 4 0.960000 0.880000 0.040000 0.120000 3
## 5 0.960000 0.760000 0.040000 0.240000 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 28 0.087099 0.626866 0.739437 0.544041 0.500000 0.500000 0.840000 0.160000
## 29 0.079123 0.647059 0.769231 0.558376 0.520000 0.511628 0.880000 0.160000
## 30 0.079113 0.628571 0.758621 0.536585 0.480000 0.488889 0.880000 0.080000
## 31 0.078754 0.647887 0.787671 0.550239 0.500000 0.500000 0.920000 0.080000
## 32 0.077264 0.648649 0.805369 0.542986 0.480000 0.489796 0.960000 0.000000
## 33 0.077261 0.666667 0.833333 0.555556 0.500000 0.500000 1.000000 0.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 28 0.000000 0.160000 0.500000 4 4 21 21
## 29 0.057639 0.160000 0.520000 4 3 21 22
## 30 0.066667 0.080000 0.480000 2 3 23 22
## 31 0.000000 0.080000 0.500000 2 2 23 23
## 32 0.142857 0.000000 0.480000 0 1 25 24
## 33 0.000000 0.000000 0.500000 0 0 25 25
## tnr fnr fpr tpr idx
## 28 0.160000 0.160000 0.840000 0.840000 27
## 29 0.160000 0.120000 0.840000 0.880000 28
## 30 0.080000 0.120000 0.920000 0.880000 29
## 31 0.080000 0.080000 0.920000 0.920000 30
## 32 0.000000 0.040000 1.000000 0.960000 31
## 33 0.000000 0.000000 1.000000 1.000000 32
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.092688 0.692308 17
## 2 max f2 0.077261 0.833333 32
## 3 max f0point5 0.093099 0.680000 15
## 4 max accuracy 0.093099 0.680000 15
## 5 max precision 0.482191 1.000000 0
## 6 max recall 0.077261 1.000000 32
## 7 max specificity 0.482191 1.000000 0
## 8 max absolute_mcc 0.092688 0.361158 17
## 9 max min_per_class_accuracy 0.093099 0.680000 15
## 10 max mean_per_class_accuracy 0.093099 0.680000 15
## 11 max tns 0.482191 25.000000 0
## 12 max fns 0.482191 24.000000 0
## 13 max fps 0.077264 25.000000 31
## 14 max tps 0.077261 25.000000 32
## 15 max tnr 0.482191 1.000000 0
## 16 max fnr 0.482191 0.960000 0
## 17 max fpr 0.077264 1.000000 31
## 18 max tpr 0.077261 1.000000 32
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 50.00 %, avg score: 10.19 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.02000000 0.307557 2.000000 2.000000
## 2 2 0.02000000 0.132923 0.000000 2.000000
## 3 3 0.04000000 0.121502 2.000000 2.000000
## 4 4 0.04000000 0.117027 0.000000 2.000000
## 5 5 0.06000000 0.112701 2.000000 2.000000
## 6 6 0.14000000 0.107856 1.500000 1.714286
## 7 7 0.16000000 0.107385 0.000000 1.500000
## 8 8 0.20000000 0.107373 1.000000 1.400000
## 9 9 0.30000000 0.097080 0.800000 1.200000
## 10 10 0.42000000 0.096646 1.666667 1.333333
## 11 11 0.50000000 0.092932 1.500000 1.360000
## 12 12 0.60000000 0.089567 0.400000 1.200000
## 13 13 0.72000000 0.089205 0.333333 1.055556
## 14 14 0.82000000 0.087104 0.800000 1.024390
## 15 15 0.90000000 0.079077 0.500000 0.977778
## 16 16 1.00000000 0.077261 1.200000 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 0.482191 1.000000 0.482191
## 2 0.000000 0.000000 1.000000 0.482191
## 3 1.000000 0.125795 1.000000 0.303993
## 4 0.000000 0.000000 1.000000 0.303993
## 5 1.000000 0.116661 1.000000 0.241549
## 6 0.750000 0.107857 0.857143 0.165154
## 7 0.000000 0.107390 0.750000 0.157933
## 8 0.500000 0.107376 0.700000 0.147822
## 9 0.400000 0.102730 0.600000 0.132791
## 10 0.833333 0.096899 0.666667 0.122536
## 11 0.750000 0.093836 0.680000 0.117944
## 12 0.200000 0.090893 0.600000 0.113436
## 13 0.166667 0.089364 0.527778 0.109424
## 14 0.400000 0.088289 0.512195 0.106846
## 15 0.250000 0.081112 0.488889 0.104559
## 16 0.600000 0.077561 0.500000 0.101859
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.040000 0.040000 100.000000 100.000000
## 2 0.000000 0.040000 -100.000000 100.000000
## 3 0.040000 0.080000 100.000000 100.000000
## 4 0.000000 0.080000 -100.000000 100.000000
## 5 0.040000 0.120000 100.000000 100.000000
## 6 0.120000 0.240000 50.000000 71.428571
## 7 0.000000 0.240000 -100.000000 50.000000
## 8 0.040000 0.280000 0.000000 40.000000
## 9 0.080000 0.360000 -20.000000 20.000000
## 10 0.200000 0.560000 66.666667 33.333333
## 11 0.120000 0.680000 50.000000 36.000000
## 12 0.040000 0.720000 -60.000000 20.000000
## 13 0.040000 0.760000 -66.666667 5.555556
## 14 0.080000 0.840000 -20.000000 2.439024
## 15 0.040000 0.880000 -50.000000 -2.222222
## 16 0.120000 1.000000 20.000000 0.000000
## kolmogorov_smirnov
## 1 0.040000
## 2 0.040000
## 3 0.080000
## 4 0.080000
## 5 0.120000
## 6 0.200000
## 7 0.160000
## 8 0.160000
## 9 0.120000
## 10 0.280000
## 11 0.360000
## 12 0.240000
## 13 0.080000
## 14 0.040000
## 15 -0.040000
## 16 0.000000
#auc <- h2o.auc(performance_h2o)
#print(paste("AUC:", auc))
#typeof(performance_h2o)
#slotNames(performance_h2o)
#performance_h2o@metrics
#h2o.auc(performance_h2o)
#h2o.accuracy(performance_h2o)
#h2o.confusionMatrix(performance_h2o)
#h2o.metric(performance_h2o)
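The commented helpers above work directly on the performance object, for example (sketch):
# Sketch: pull individual metrics from the H2O performance object
h2o.auc(performance_h2o)
h2o.logloss(performance_h2o)
h2o.aucpr(performance_h2o)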