load and clean data

data <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-04-27/departures.csv')
## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_clean <- data %>%
    select(-c(`_merge`, still_there, sources, eight_ks)) %>%
    na.omit() %>%
    mutate(across(c(departure_code, ceo_dismissal), as.factor))
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         8 days 19 hours 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year, 4 months and 11 days 
##     H2O cluster name:           H2O_started_from_R_owner_dit581 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.29 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.1 (2023-06-16)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year, 4 months and 11 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

split data

set.seed(123)
data_split <- initial_split(data_clean, strata = ceo_dismissal)
train_tbl <- training(data_split)
test_tbl  <- testing(data_split)

pre with recipes

ceo_recipe <- recipe(ceo_dismissal ~ ., data = train_tbl) %>%
    update_role(dismissal_dataset_id, new_role = "ID") %>%
    step_date(leftofc, keep_original_cols = FALSE) %>%
    step_other(coname, exec_fullname) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_normalize(all_numeric_predictors()) %>%
    step_zv(all_predictors()) %>%
    prep()

train_prepped <- bake(ceo_recipe, new_data = train_tbl)
test_prepped  <- bake(ceo_recipe, new_data = test_tbl)
## Warning: ! There are new levels in `notes`: "Prior to his appointment as Executive Vice
##   President of Administration on February 26, 2007, Mr. Hann had served as
##   Interim President and Chief Executive Officer from March 27, 2006 through
##   February 26, 2007. He quit and retired from his Vp spot 3 months after the
##   new person came in", "George J. Harad relinquished the Chief Executive
##   position at Boise in October 2004, when Boise sold its forest product assets
##   and changed its name to OfficeMax .", "MELVYN J. ESTRIN, age 56, has served
##   as a director of the Corporation since 1990. Mr. Estrin has also served as
##   Co-Chairman of the Board of the Corporation since March 1991 and was
##   appointed Co-Chief Executive Officer of the Corporation in October 1991.
##   Dallas, Texas—December 11, 2002—Avatex Corporation (OTCBB: AVAT) today
##   announced that it and five subsidiaries each filed a voluntary petition for
##   relief under chapter 11 of the Bankruptcy Code in the United States
##   Bankruptcy Court for the Northern District of Texas, Dallas Division. . . .
##   The Board also terminated without cause the employment of all of company's
##   other officers, including its Co-Chief Executive Officers, Abbey J. Butler
##   and Melvyn J. Estrin.", "The company said Chairman James V. Napier will be
##   interim chief executive officer while a search is under way for a permanent
##   replacement.", "He was there to find a replacement after the company fired
##   Thoman. He was part of a settlement with the SEC for accounting fraud in
##   2003", "Cline will retire as chairman of Airborne's board on April 30, along
##   with Robert G. Brazier, the vice chairman. During his time, Airborne saw its
##   financial performance sag. Excluding federal assistance of $5.2 million in
##   the fourth quarter and a one-time sale, the company lost money in every
##   quarter last year.", "Mr. Smucker, 68, has been a Director since October
##   1973. He has been the Company’s Chairman of the Board since 1987 and served
##   as Co-Chief Executive Officer from February 2001 through August 2011.", "Nov
##   19, 2009-- In a move that signals Tyson Foods Inc. believes its chicken
##   segment is again profitable, the world's largest meat maker has named a new
##   CEO to replace interim head Leland Tollett, who had been tapped in January to
##   help weather an industry downturn. Tollett, who had also been interim
##   president since January, will assist the 50-year-old Smith during the
##   transition period.", "Cumenal, 57, who had run the company since April 2015,
##   is being succeeded on an interim basis by chairman and former CEO Michael
##   Kowalski, Tiffany said on Sunday afternoon. The shake-up follows the
##   departure of the jeweler’s top designer three weeks ago and weak holiday
##   sales that sent the stock tumbling. February 6, 2017, Tiffany & Co. abruptly
##   replaced Chief Executive Officer Frederic Cumenal after disappointing
##   financial results, just hours before the jewelry chain introduced a new
##   campaign with the first Super Bowl ad in its history.", "Metz succeeds
##   Christopher Twomey, the company’s chairman and longtime CEO who returned to
##   serve as an interim CEO after the company fired its previous CEO, Claude
##   Jordan, on June 2. Twomey will remain chairman of the board. “I am excited to
##   lead a terrific team at Arctic Cat.” Christopher Metz, Arctic Cat’s new chief
##   executive, will start Dec. 3.", "Mr. Leonard Joseph Feinstein is a Co-Founder
##   of Bed Bath & Beyond, Inc. and has been its Co-Chairman since 1999. Mr.
##   Feinstein served as Co-Chief Executive Officer of Bed Bath & Beyond Inc. from
##   1971 to April 2003 and President from 1992 to 1999. He has been a Director of
##   Bed Bath & Beyond, Inc. since 1971.", "At the age of 81, Mr. Shaw passed away
##   in his home that morning.", "Mr. Houdaille ceased to be an executive officer
##   of the Company as of November 1996.  Mr. Houdaille served as President and
##   Chief Executive Officer of the company from May 1996 to November 1996. SEC
##   filing said that he was filling the roll they could meet as a board to
##   discuss", "Gibson Greetings Fires CEO: Gibson Greetings Inc. said Benjamin J.
##   Sottile will no longer head the Cincinnati-based company. Albert R. Pezzillo,
##   chairman of Gibson’s executive committee, was named interim chairman and CEO
##   while the company searches for a successor. GIBSON GREETINGS INC.,
##   Cincinnati, named Frank O'Connell president and chief executive. Mr.
##   O'Connell succeeds Albert R. Pezzillo, who will continue as chairman.",
##   "Christian Wilhelm Erich Haub or CH is and has been a member of the Board of
##   Directors of the Company (the “Board”) since December 3, 1991, has served as
##   Chairman of the Board since May 1, 2001 and has served as Chair of the
##   Executive Committee of the Board since August 15, 2005. In addition, CH
##   served as Interim President and Chief Executive Officer of the Company from
##   October 20, 2009 through February 8, 2010, Chief Executive Officer of the
##   Company from May 1, 1998 through August 15, 2005 and President of the Company
##   from December 7, 1993 through February 24, 2002, and from November 4, 2002
##   through November 15, 2004.", "Restaurant operator Luby's Inc. has named
##   Christopher J. Pappas president and CEO and Harris J. Pappas as chief
##   operating officer. The company said David Daviss resigned as acting chief
##   executive and chairman. He'll remain a member of the board. Board member
##   Robert T. Herres has been elected to the chairman post.", "ROBERT v.d. LUFT
##   [|] Age 63 [|] Director Since 1992 [|] Chadds Ford, Pennsylvania [|] -Acting
##   Chief Executive Officer of Entergy, May-December, 1998", "Retired once the
##   issues with the company were solved", …, "On March 8, 2017, the Board
##   appointed Jeffrey L. Rutherford, age 56, as Interim President and Interim
##   Chief Executive Officer, effective immediately. The Board also elected Mr.
##   Rutherford as Chairman of the Board, effective immediately. Mr. Rutherford
##   joined the Board on February 16, 2017.", and "Irvine’s Opus Bank (Nasdaq:
##   OPB), the third largest bank based in Orange County, named Paul W. Taylor as
##   chief executive officer and president, effective today, May 1, 2019. Taylor
##   succeeds Paul G. Greig, chairman of Opus’ board, who served as interim CEO
##   after the departure last November of founder Stephen Gordon.".
## ℹ Consider using step_novel() (`?recipes::step_novel()`) before `step_dummy()`
##   to handle unseen values.

convert h20 to frames

train_h2o <- as.h2o(train_prepped)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
test_h2o  <- as.h2o(test_prepped)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

train H20 model

h2o_model <- h2o.gbm(
  x = setdiff(names(train_h2o), "ceo_dismissal"),
  y = "ceo_dismissal",
  training_frame = train_h2o,
  model_id = "ceo_dismissal_gbm_model",
  seed = 1234
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

evaluate model

perf <- h2o.performance(h2o_model, newdata = test_h2o)
print(perf)
## H2OBinomialMetrics: gbm
## 
## MSE:  0.001510004
## RMSE:  0.03885877
## LogLoss:  0.009715358
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  1
## Gini:  1
## R^2:  0.9471041
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         0 1    Error   Rate
## 0      66 0 0.000000  =0/66
## 1       0 2 0.000000   =0/2
## Totals 66 2 0.000000  =0/68
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold     value idx
## 1                       max f1  0.984863  1.000000   1
## 2                       max f2  0.984863  1.000000   1
## 3                 max f0point5  0.984863  1.000000   1
## 4                 max accuracy  0.984863  1.000000   1
## 5                max precision  0.995013  1.000000   0
## 6                   max recall  0.984863  1.000000   1
## 7              max specificity  0.995013  1.000000   0
## 8             max absolute_mcc  0.984863  1.000000   1
## 9   max min_per_class_accuracy  0.984863  1.000000   1
## 10 max mean_per_class_accuracy  0.984863  1.000000   1
## 11                     max tns  0.995013 66.000000   0
## 12                     max fns  0.995013  1.000000   0
## 13                     max fps  0.000461 66.000000  42
## 14                     max tps  0.984863  2.000000   1
## 15                     max tnr  0.995013  1.000000   0
## 16                     max fnr  0.995013  0.500000   0
## 17                     max fpr  0.000461  1.000000  42
## 18                     max tpr  0.984863  1.000000   1
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
h2o.auc(perf)
## [1] 1
h2o.confusionMatrix(perf)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.984862555298816:
##         0 1    Error   Rate
## 0      66 0 0.000000  =0/66
## 1       0 2 0.000000   =0/2
## Totals 66 2 0.000000  =0/68
plot(perf, type = "roc")

predict on test data

predictions <- h2o.predict(h2o_model, test_h2o)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
head(predictions)
##   predict          p0           p1
## 1       0 0.688898615 0.3111013851
## 2       0 0.979731526 0.0202684744
## 3       1 0.004986509 0.9950134909
## 4       0 0.999535375 0.0004646254
## 5       0 0.999535375 0.0004646253
## 6       1 0.015137445 0.9848625553