Load necessary libraries

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.7      ✔ recipes      1.0.10
## ✔ dials        1.2.1      ✔ rsample      1.2.1 
## ✔ dplyr        1.1.4      ✔ tibble       3.2.1 
## ✔ ggplot2      3.5.1      ✔ tidyr        1.3.1 
## ✔ infer        1.0.7      ✔ tune         1.2.1 
## ✔ modeldata    1.4.0      ✔ workflows    1.1.4 
## ✔ parsnip      1.2.1      ✔ workflowsets 1.1.0 
## ✔ purrr        1.0.2      ✔ yardstick    1.3.1
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:yardstick':
## 
##     spec
## The following object is masked from 'package:scales':
## 
##     col_factor
library(dplyr)
museum_raw_tbl <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
museum_raw_tbl %>% glimpse()
## Rows: 4,191
## Columns: 35
## $ museum_id                              <chr> "mm.New.1", "mm.aim.1230", "mm.…
## $ Name_of_museum                         <chr> "Titanic Belfast", "The Woodlan…
## $ Address_line_1                         <chr> "1 Olympic Way", NA, "Warwick C…
## $ Address_line_2                         <chr> NA, "Brokerswood", "Horticultur…
## $ `Village,_Town_or_City`                <chr> "Belfast", "nr Westbury", "More…
## $ Postcode                               <chr> "BT3 9EP", "BA13 4EH", "CV35 9B…
## $ Latitude                               <dbl> 54.60808, 51.27090, 52.19715, 4…
## $ Longitude                              <dbl> -5.909915, -2.231863, -1.555528…
## $ Admin_area                             <chr> "/Northern Ireland/Belfast (NI …
## $ Accreditation                          <chr> "Unaccredited", "Unaccredited",…
## $ Governance                             <chr> "Independent-Not_for_profit", "…
## $ Size                                   <chr> "large", "small", "medium", "sm…
## $ Size_provenance                        <chr> NA, "aim_size_designation", "mm…
## $ Subject_Matter                         <chr> "Sea_and_seafaring-Boats_and_sh…
## $ Year_opened                            <chr> "2012:2012", "1971:1971", "1984…
## $ Year_closed                            <chr> "9999:9999", "2007:2017", "9999…
## $ DOMUS_Subject_Matter                   <chr> NA, NA, NA, NA, NA, NA, NA, "sc…
## $ DOMUS_identifier                       <dbl> NA, NA, 1218, NA, NA, NA, 1528,…
## $ Primary_provenance_of_data             <chr> "New", "aim", "domus", "aim", "…
## $ Identifier_used_in_primary_data_source <chr> NA, NA, "WM000019", NA, NA, NA,…
## $ Area_Deprivation_index                 <dbl> 2, 9, 8, NA, 8, 2, 6, 6, 5, 6, …
## $ Area_Deprivation_index_crime           <dbl> 3, 8, 9, NA, 10, 1, 10, 3, 1, 1…
## $ Area_Deprivation_index_education       <dbl> 1, 9, 8, NA, 7, 6, 8, 7, 7, 6, …
## $ Area_Deprivation_index_employment      <dbl> 2, 8, 10, NA, 7, 3, 7, 6, 6, 7,…
## $ Area_Deprivation_index_health          <dbl> 1, 8, 8, NA, 8, 2, 7, 8, 5, 7, …
## $ Area_Deprivation_index_housing         <dbl> 4, 6, 5, NA, 7, 1, 8, 9, 1, 7, …
## $ Area_Deprivation_index_income          <dbl> 5, 8, 8, NA, 8, 3, 5, 5, 7, 5, …
## $ Area_Deprivation_index_services        <dbl> 5, 4, 1, NA, 4, 4, 2, 3, 9, 1, …
## $ Area_Geodemographic_group              <chr> "Larger Towns and Cities", "Cou…
## $ Area_Geodemographic_group_code         <chr> "2ar", "7ar", "3ar", NA, "7ar",…
## $ Area_Geodemographic_subgroup           <chr> "Larger Towns and Cities", "Cou…
## $ Area_Geodemographic_subgroup_code      <chr> "2a1r", "7a1r", "3a1r", NA, "7a…
## $ Area_Geodemographic_supergroup         <chr> "Business Education and Heritag…
## $ Area_Geodemographic_supergroup_code    <chr> "2r", "7r", "3r", NA, "7r", "5r…
## $ Notes                                  <chr> NA, "Previously known as Philli…

Data cleaning and preprocessing

library(stringr)
## 
## Attaching package: 'stringr'
## The following object is masked from 'package:recipes':
## 
##     fixed
museum_data <- museum_raw_tbl %>%
    # Remove unnecessary columns
    select(-c(Size_provenance, DOMUS_Subject_Matter, Year_closed, Primary_provenance_of_data, 
              Identifier_used_in_primary_data_source, Area_Geodemographic_subgroup, 
              Area_Geodemographic_subgroup_code, Area_Geodemographic_supergroup, 
              Area_Geodemographic_supergroup_code, Notes, Latitude, Longitude, DOMUS_identifier, 
              Address_line_2, Postcode, Admin_area, Area_Geodemographic_group, Name_of_museum)) %>%
    # Remove rows with missing values
    na.omit() %>%
    
    # Rename the town/city column and keep only the opening year (first 4 characters of the "YYYY:YYYY" range)
    rename("Village_Town_City" = "Village,_Town_or_City") %>%
    mutate(Year_opened = str_sub(Year_opened, 1, 4)) %>%
    
    # Recode Accreditation
    mutate(Accreditation = if_else(Accreditation == "Accredited", "Yes", "No")) %>%
    
    # Convert selected columns to factors
    mutate(across(c(Accreditation, Governance, Size, Subject_Matter, 
                    Area_Geodemographic_group_code, Area_Deprivation_index, 
                    Area_Deprivation_index_crime, Area_Deprivation_index_education, 
                    Area_Deprivation_index_employment, Area_Deprivation_index_health, 
                    Area_Deprivation_index_housing, Area_Deprivation_index_income, 
                    Area_Deprivation_index_services, Year_opened, Village_Town_City, 
                    Address_line_1, museum_id), as.factor))
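
A quick sanity check before modeling confirms that the cleaning step behaved as expected; the two lines below (output not shown) report the class balance of the target and verify that na.omit() left no missing values.

# Sanity checks on the cleaned data (illustrative; output not shown)
museum_data %>% count(Accreditation)                                  # class balance of the target
museum_data %>% summarise(across(everything(), ~ sum(is.na(.))))     # should be all zeros after na.omit()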

Initialize H2O

h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /var/folders/hx/57xnmk_52272gw1nc2jn7nbr0000gn/T//RtmpXwzGrx/file1e5e4ac30e2a/h2o_max_started_from_r.out
##     /var/folders/hx/57xnmk_52272gw1nc2jn7nbr0000gn/T//RtmpXwzGrx/file1e5e4bd70dbe/h2o_max_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: ... Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 883 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 14 days 
##     H2O cluster name:           H2O_started_from_R_max_kez235 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   4.00 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
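
The defaults above gave a 4 GB heap and all eight cores. h2o.init() also accepts explicit sizing arguments (they only take effect when a new cluster is started), and the text progress bars that as.h2o() and h2o.automl() print can be silenced; the values below are only an example, not the settings used for this run.

# Example: size the JVM explicitly (before the cluster is started) and silence progress bars
# h2o.init(max_mem_size = "8G", nthreads = -1)
h2o.no_progress()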

Split the data into training and testing sets

set.seed(123)  # Set seed for reproducibility
museum_split <- initial_split(museum_data, prop = 0.7, strata = Accreditation)
train_data <- training(museum_split)
test_data <- testing(museum_split)
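
Because the split is stratified on Accreditation, the training and test sets should carry roughly the same proportion of accredited museums; the check below (output not shown) makes that explicit.

# Verify that stratification preserved the class proportions (illustrative)
train_data %>% count(Accreditation) %>% mutate(prop = n / sum(n))
test_data %>% count(Accreditation) %>% mutate(prop = n / sum(n))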

Convert data to H2O format

train_h2o <- as.h2o(train_data)
test_h2o <- as.h2o(test_data)

Train an H2O model (example with AutoML)

h2o_model <- h2o.automl(
  x = setdiff(names(train_h2o), "Accreditation"),
  y = "Accreditation",
  training_frame = train_h2o,
  max_models = 10,
  seed = 123
)
## 11:04:30.153: AutoML: XGBoost is not available; skipping it.
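
h2o.automl() can also be driven by a time budget rather than a model count, and specific algorithm families can be excluded; the sketch below shows that pattern with illustrative values (it is not the configuration used for the run above).

# Alternative AutoML configuration (illustrative values only)
h2o_model_timed <- h2o.automl(
  x = setdiff(names(train_h2o), "Accreditation"),
  y = "Accreditation",
  training_frame = train_h2o,
  max_runtime_secs = 600,             # stop after 10 minutes instead of after 10 models
  exclude_algos = c("DeepLearning"),  # skip the slower deep learning runs
  sort_metric = "AUC",
  seed = 123
)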

View leaderboard of models

h2o_leaderboard <- h2o_model@leaderboard
print(h2o_leaderboard)
##                                                  model_id       auc   logloss
## 1    StackedEnsemble_AllModels_1_AutoML_1_20241205_110430 0.8724992 0.4230080
## 2 StackedEnsemble_BestOfFamily_1_AutoML_1_20241205_110430 0.8716045 0.4243102
## 3                          GBM_1_AutoML_1_20241205_110430 0.8683092 0.4381646
## 4                          GBM_4_AutoML_1_20241205_110430 0.8662477 0.4521128
## 5                          GBM_2_AutoML_1_20241205_110430 0.8645222 0.4539867
## 6                          GBM_3_AutoML_1_20241205_110430 0.8625660 0.4609696
##       aucpr mean_per_class_error      rmse       mse
## 1 0.7964384            0.1722401 0.3711937 0.1377847
## 2 0.7947533            0.1787731 0.3719884 0.1383753
## 3 0.7970700            0.1891296 0.3792700 0.1438458
## 4 0.7864528            0.1805652 0.3831627 0.1468137
## 5 0.7893594            0.1850801 0.3858587 0.1488869
## 6 0.7917009            0.1910325 0.3885095 0.1509396
## 
## [12 rows x 7 columns]
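
The print method shows only the first six of the twelve models. If the installed h2o version supports it, h2o.get_leaderboard() returns the full table, optionally with timing and algorithm columns.

# Full leaderboard with extra columns (requires a reasonably recent h2o release)
as.data.frame(h2o.get_leaderboard(h2o_model, extra_columns = "ALL"))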

Get the best model and evaluate it on the test set

best_model <- h2o_model@leader
perf <- h2o.performance(model = best_model, newdata = test_h2o)
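
The performance object wraps the test-set metrics; individual values can be pulled out with the usual accessors, for example:

# Headline test-set metrics from the performance object
h2o.auc(perf)
h2o.logloss(perf)
h2o.confusionMatrix(perf)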

Predict on the test data

predictions <- h2o.predict(best_model, test_h2o)
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'museum_id' has levels not trained on: ["domus.NE043",
## "mm.MDN.005", "mm.MDN.023", "mm.MDN.031", "mm.Mus70Cal.003", "mm.Mus70Cal.004",
## "mm.Mus70Cal.008", "mm.Mus70Cal.014", "mm.Mus70Cal.015", "mm.Mus70Cal.017",
## ...1093 not listed..., "mm.wiki.444", "mm.wiki.449", "mm.wiki.452",
## "mm.wiki.456", "mm.wiki.460", "mm.wiki.467", "mm.wiki.468", "mm.wiki.484",
## "mm.wiki.500", "mm.wiki.501"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Address_line_1' has levels not trained on: ["(Eling Tide
## Mill/Eling Heritage Centre)", "1 - 29 Bridge Street", "1 Albany Road", "1 Clink
## Street", "1 Elmsleigh Road", "1 Market Place", "1 Queen Street", "1 Rectory
## Street", "1 Scala St", "1 Taylor Street", ...873 not listed..., "York
## Racecourse", "York Road", "Yorkshire Squadron", "c/o 1 Royal Crescent", "c/o 32
## Churchgate", "c/o 7 Hospital Lane", "c/o Cornish Gold Site", "c/o Jewish
## Museum", "c/o Spennymoor Town Hall", "c/of Gullivers world"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Village_Town_City' has levels not trained on: ["Abergele",
## "Abernethy", "Aberporth", "Abington", "Accrington", "Alfreton", "Alloa",
## "Alston", "Amlwch Port", "Ardvasar", ...275 not listed..., "nr Leek", "nr
## Okehampton", "nr Romsey", "nr Yeovil", "nr. Cranbrook", "nr. Helston", "nr.
## Ponteland", "nr. Richmond", "nr. Southampton", "nr. Winkleigh"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Subject_Matter' has levels not trained on: ["Archaeology"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Year_opened' has levels not trained on: ["1660", "1676",
## "1771", "1811", "1812", "1818", "1848", "1854", "1864", "1865", "1871", "1882",
## "1909", "1916"]
test_data <- test_data %>%
  mutate(
    Predicted_Prob = as.vector(predictions[, "Yes"]),  # Probability of "Yes" (Accredited)
    Predicted_Class = as.vector(predictions[, "predict"])  # Predicted class
  )

Example output of predictions

print(head(test_data %>% select(Accreditation, Predicted_Prob, Predicted_Class)))
## # A tibble: 6 × 3
##   Accreditation Predicted_Prob Predicted_Class
##   <fct>                  <dbl> <chr>          
## 1 No                    0.483  Yes            
## 2 Yes                   0.908  Yes            
## 3 Yes                   0.823  Yes            
## 4 No                    0.0680 No             
## 5 No                    0.673  Yes            
## 6 No                    0.766  Yes
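
The "levels not trained on" warnings above come from the identifier-like factors (museum_id, Address_line_1, Village_Town_City) and a few rare Year_opened values: the test set contains levels the model never saw during training, so those columns carry no signal for the affected rows. To summarise performance over the whole test set with yardstick, the predicted class just needs to share the factor levels of the truth column; this sketch assumes those levels are c("No", "Yes"), with "Yes" as the second (event) level.

# Test-set metrics with yardstick (sketch; assumes factor levels c("No", "Yes"))
eval_tbl <- test_data %>%
  mutate(Predicted_Class = factor(Predicted_Class, levels = levels(Accreditation)))

accuracy(eval_tbl, truth = Accreditation, estimate = Predicted_Class)
roc_auc(eval_tbl, truth = Accreditation, Predicted_Prob, event_level = "second")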

Plot variable importance

h2o.varimp_plot(best_model)
## Warning: This model doesn't have variable importances
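
The warning appears because the leader is a stacked ensemble, which has no single set of variable importances. One option is to plot importances for the best non-ensemble model on the leaderboard instead; a minimal sketch:

# Plot variable importance for the best non-ensemble model (sketch)
model_ids <- as.data.frame(h2o_leaderboard)$model_id
best_single_id <- model_ids[!grepl("StackedEnsemble", model_ids)][1]
h2o.varimp_plot(h2o.getModel(best_single_id))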

Shut down H2O when done

h2o.shutdown(prompt = FALSE)