Load necessary libraries

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.7      ✔ recipes      1.0.10
## ✔ dials        1.2.1      ✔ rsample      1.2.1 
## ✔ dplyr        1.1.4      ✔ tibble       3.2.1 
## ✔ ggplot2      3.5.1      ✔ tidyr        1.3.1 
## ✔ infer        1.0.7      ✔ tune         1.2.1 
## ✔ modeldata    1.4.0      ✔ workflows    1.1.4 
## ✔ parsnip      1.2.1      ✔ workflowsets 1.1.0 
## ✔ purrr        1.0.2      ✔ yardstick    1.3.1
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:yardstick':
## 
##     spec
## The following object is masked from 'package:scales':
## 
##     col_factor
library(dplyr)
museum_raw_tbl <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
museum_raw_tbl %>% glimpse()
## Rows: 4,191
## Columns: 35
## $ museum_id                              <chr> "mm.New.1", "mm.aim.1230", "mm.…
## $ Name_of_museum                         <chr> "Titanic Belfast", "The Woodlan…
## $ Address_line_1                         <chr> "1 Olympic Way", NA, "Warwick C…
## $ Address_line_2                         <chr> NA, "Brokerswood", "Horticultur…
## $ `Village,_Town_or_City`                <chr> "Belfast", "nr Westbury", "More…
## $ Postcode                               <chr> "BT3 9EP", "BA13 4EH", "CV35 9B…
## $ Latitude                               <dbl> 54.60808, 51.27090, 52.19715, 4…
## $ Longitude                              <dbl> -5.909915, -2.231863, -1.555528…
## $ Admin_area                             <chr> "/Northern Ireland/Belfast (NI …
## $ Accreditation                          <chr> "Unaccredited", "Unaccredited",…
## $ Governance                             <chr> "Independent-Not_for_profit", "…
## $ Size                                   <chr> "large", "small", "medium", "sm…
## $ Size_provenance                        <chr> NA, "aim_size_designation", "mm…
## $ Subject_Matter                         <chr> "Sea_and_seafaring-Boats_and_sh…
## $ Year_opened                            <chr> "2012:2012", "1971:1971", "1984…
## $ Year_closed                            <chr> "9999:9999", "2007:2017", "9999…
## $ DOMUS_Subject_Matter                   <chr> NA, NA, NA, NA, NA, NA, NA, "sc…
## $ DOMUS_identifier                       <dbl> NA, NA, 1218, NA, NA, NA, 1528,…
## $ Primary_provenance_of_data             <chr> "New", "aim", "domus", "aim", "…
## $ Identifier_used_in_primary_data_source <chr> NA, NA, "WM000019", NA, NA, NA,…
## $ Area_Deprivation_index                 <dbl> 2, 9, 8, NA, 8, 2, 6, 6, 5, 6, …
## $ Area_Deprivation_index_crime           <dbl> 3, 8, 9, NA, 10, 1, 10, 3, 1, 1…
## $ Area_Deprivation_index_education       <dbl> 1, 9, 8, NA, 7, 6, 8, 7, 7, 6, …
## $ Area_Deprivation_index_employment      <dbl> 2, 8, 10, NA, 7, 3, 7, 6, 6, 7,…
## $ Area_Deprivation_index_health          <dbl> 1, 8, 8, NA, 8, 2, 7, 8, 5, 7, …
## $ Area_Deprivation_index_housing         <dbl> 4, 6, 5, NA, 7, 1, 8, 9, 1, 7, …
## $ Area_Deprivation_index_income          <dbl> 5, 8, 8, NA, 8, 3, 5, 5, 7, 5, …
## $ Area_Deprivation_index_services        <dbl> 5, 4, 1, NA, 4, 4, 2, 3, 9, 1, …
## $ Area_Geodemographic_group              <chr> "Larger Towns and Cities", "Cou…
## $ Area_Geodemographic_group_code         <chr> "2ar", "7ar", "3ar", NA, "7ar",…
## $ Area_Geodemographic_subgroup           <chr> "Larger Towns and Cities", "Cou…
## $ Area_Geodemographic_subgroup_code      <chr> "2a1r", "7a1r", "3a1r", NA, "7a…
## $ Area_Geodemographic_supergroup         <chr> "Business Education and Heritag…
## $ Area_Geodemographic_supergroup_code    <chr> "2r", "7r", "3r", NA, "7r", "5r…
## $ Notes                                  <chr> NA, "Previously known as Philli…

Data cleaning and preprocessing

library(stringr)
## 
## Attaching package: 'stringr'
## The following object is masked from 'package:recipes':
## 
##     fixed
museum_data <- museum_raw_tbl %>%
    # Remove unnecessary columns
    select(-c(Size_provenance, DOMUS_Subject_Matter, Year_closed, Primary_provenance_of_data, 
              Identifier_used_in_primary_data_source, Area_Geodemographic_subgroup, 
              Area_Geodemographic_subgroup_code, Area_Geodemographic_supergroup, 
              Area_Geodemographic_supergroup_code, Notes, Latitude, Longitude, DOMUS_identifier, 
              Address_line_2, Postcode, Admin_area, Area_Geodemographic_group, Name_of_museum)) %>%
    # Remove rows with missing values
    na.omit() %>%
    
    # Rename the town/city column and keep only the opening year (first 4 characters of the "YYYY:YYYY" range)
    rename("Village_Town_City" = "Village,_Town_or_City") %>%
    mutate(Year_opened = str_sub(Year_opened, 1, 4)) %>%
    
    # Recode Accreditation
    mutate(Accreditation = if_else(Accreditation == "Accredited", "Yes", "No")) %>%
    
    # Convert selected columns to factors
    mutate(across(c(Accreditation, Governance, Size, Subject_Matter, 
                    Area_Geodemographic_group_code, Area_Deprivation_index, 
                    Area_Deprivation_index_crime, Area_Deprivation_index_education, 
                    Area_Deprivation_index_employment, Area_Deprivation_index_health, 
                    Area_Deprivation_index_housing, Area_Deprivation_index_income, 
                    Area_Deprivation_index_services, Year_opened, Village_Town_City, 
                    Address_line_1, museum_id), as.factor))
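
A quick sanity check before modeling confirms that the cleaning step behaved as expected; the two lines below (output not shown) report the class balance of the target and verify that na.omit() left no missing values.

# Sanity checks on the cleaned data (illustrative; output not shown)
museum_data %>% count(Accreditation)                                  # class balance of the target
museum_data %>% summarise(across(everything(), ~ sum(is.na(.))))     # should be all zeros after na.omit()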

Initialize H2O

h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /var/folders/hx/57xnmk_52272gw1nc2jn7nbr0000gn/T//RtmpXwzGrx/file1e5e4ac30e2a/h2o_max_started_from_r.out
##     /var/folders/hx/57xnmk_52272gw1nc2jn7nbr0000gn/T//RtmpXwzGrx/file1e5e4bd70dbe/h2o_max_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: ... Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 883 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 14 days 
##     H2O cluster name:           H2O_started_from_R_max_kez235 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   4.00 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
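
The defaults above gave a 4 GB heap and all eight cores. h2o.init() also accepts explicit sizing arguments (they only take effect when a new cluster is started), and the text progress bars that as.h2o() and h2o.automl() print can be silenced; the values below are only an example, not the settings used for this run.

# Example: size the JVM explicitly (before the cluster is started) and silence progress bars
# h2o.init(max_mem_size = "8G", nthreads = -1)
h2o.no_progress()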

Split the data into training and testing sets

set.seed(123)  # Set seed for reproducibility
museum_split <- initial_split(museum_data, prop = 0.7, strata = Accreditation)
train_data <- training(museum_split)
test_data <- testing(museum_split)
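
Because the split is stratified on Accreditation, the training and test sets should carry roughly the same proportion of accredited museums; the check below (output not shown) makes that explicit.

# Verify that stratification preserved the class proportions (illustrative)
train_data %>% count(Accreditation) %>% mutate(prop = n / sum(n))
test_data %>% count(Accreditation) %>% mutate(prop = n / sum(n))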

Convert data to H2O format

train_h2o <- as.h2o(train_data)
test_h2o <- as.h2o(test_data)

Train an H2O model (example with AutoML)

h2o_model <- h2o.automl(
  x = setdiff(names(train_h2o), "Accreditation"),
  y = "Accreditation",
  training_frame = train_h2o,
  max_models = 10,
  seed = 123
)
## 11:04:30.153: AutoML: XGBoost is not available; skipping it.
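
h2o.automl() can also be driven by a time budget rather than a model count, and specific algorithm families can be excluded; the sketch below shows that pattern with illustrative values (it is not the configuration used for the run above).

# Alternative AutoML configuration (illustrative values only)
h2o_model_timed <- h2o.automl(
  x = setdiff(names(train_h2o), "Accreditation"),
  y = "Accreditation",
  training_frame = train_h2o,
  max_runtime_secs = 600,             # stop after 10 minutes instead of after 10 models
  exclude_algos = c("DeepLearning"),  # skip the slower deep learning runs
  sort_metric = "AUC",
  seed = 123
)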

View leaderboard of models

h2o_leaderboard <- h2o_model@leaderboard
print(h2o_leaderboard)
##                                                  model_id       auc   logloss
## 1    StackedEnsemble_AllModels_1_AutoML_1_20241205_110430 0.8724992 0.4230080
## 2 StackedEnsemble_BestOfFamily_1_AutoML_1_20241205_110430 0.8716045 0.4243102
## 3                          GBM_1_AutoML_1_20241205_110430 0.8683092 0.4381646
## 4                          GBM_4_AutoML_1_20241205_110430 0.8662477 0.4521128
## 5                          GBM_2_AutoML_1_20241205_110430 0.8645222 0.4539867
## 6                          GBM_3_AutoML_1_20241205_110430 0.8625660 0.4609696
##       aucpr mean_per_class_error      rmse       mse
## 1 0.7964384            0.1722401 0.3711937 0.1377847
## 2 0.7947533            0.1787731 0.3719884 0.1383753
## 3 0.7970700            0.1891296 0.3792700 0.1438458
## 4 0.7864528            0.1805652 0.3831627 0.1468137
## 5 0.7893594            0.1850801 0.3858587 0.1488869
## 6 0.7917009            0.1910325 0.3885095 0.1509396
## 
## [12 rows x 7 columns]
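
The print method shows only the first six of the twelve models. If the installed h2o version supports it, h2o.get_leaderboard() returns the full table, optionally with timing and algorithm columns.

# Full leaderboard with extra columns (requires a reasonably recent h2o release)
as.data.frame(h2o.get_leaderboard(h2o_model, extra_columns = "ALL"))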

Get the best model and evaluate it on the test set

best_model <- h2o_model@leader
perf <- h2o.performance(model = best_model, newdata = test_h2o)
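
The performance object wraps the test-set metrics; individual values can be pulled out with the usual accessors, for example:

# Headline test-set metrics from the performance object
h2o.auc(perf)
h2o.logloss(perf)
h2o.confusionMatrix(perf)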

Predict on the test data

predictions <- h2o.predict(best_model, test_h2o)
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'museum_id' has levels not trained on: ["domus.NE043",
## "mm.MDN.005", "mm.MDN.023", "mm.MDN.031", "mm.Mus70Cal.003", "mm.Mus70Cal.004",
## "mm.Mus70Cal.008", "mm.Mus70Cal.014", "mm.Mus70Cal.015", "mm.Mus70Cal.017",
## ...1093 not listed..., "mm.wiki.444", "mm.wiki.449", "mm.wiki.452",
## "mm.wiki.456", "mm.wiki.460", "mm.wiki.467", "mm.wiki.468", "mm.wiki.484",
## "mm.wiki.500", "mm.wiki.501"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Address_line_1' has levels not trained on: ["(Eling Tide
## Mill/Eling Heritage Centre)", "1 - 29 Bridge Street", "1 Albany Road", "1 Clink
## Street", "1 Elmsleigh Road", "1 Market Place", "1 Queen Street", "1 Rectory
## Street", "1 Scala St", "1 Taylor Street", ...873 not listed..., "York
## Racecourse", "York Road", "Yorkshire Squadron", "c/o 1 Royal Crescent", "c/o 32
## Churchgate", "c/o 7 Hospital Lane", "c/o Cornish Gold Site", "c/o Jewish
## Museum", "c/o Spennymoor Town Hall", "c/of Gullivers world"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Village_Town_City' has levels not trained on: ["Abergele",
## "Abernethy", "Aberporth", "Abington", "Accrington", "Alfreton", "Alloa",
## "Alston", "Amlwch Port", "Ardvasar", ...275 not listed..., "nr Leek", "nr
## Okehampton", "nr Romsey", "nr Yeovil", "nr. Cranbrook", "nr. Helston", "nr.
## Ponteland", "nr. Richmond", "nr. Southampton", "nr. Winkleigh"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Subject_Matter' has levels not trained on: ["Archaeology"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Year_opened' has levels not trained on: ["1660", "1676",
## "1771", "1811", "1812", "1818", "1848", "1854", "1864", "1865", "1871", "1882",
## "1909", "1916"]
test_data <- test_data %>%
  mutate(
    Predicted_Prob = as.vector(predictions[, "Yes"]),  # Probability of "Yes" (Accredited)
    Predicted_Class = as.vector(predictions[, "predict"])  # Predicted class
  )

Example output of predictions

print(head(test_data %>% select(Accreditation, Predicted_Prob, Predicted_Class)))
## # A tibble: 6 × 3
##   Accreditation Predicted_Prob Predicted_Class
##   <fct>                  <dbl> <chr>          
## 1 No                    0.483  Yes            
## 2 Yes                   0.908  Yes            
## 3 Yes                   0.823  Yes            
## 4 No                    0.0680 No             
## 5 No                    0.673  Yes            
## 6 No                    0.766  Yes
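
The "levels not trained on" warnings above come from the identifier-like factors (museum_id, Address_line_1, Village_Town_City) and a few rare Year_opened values: the test set contains levels the model never saw during training, so those columns carry no signal for the affected rows. To summarise performance over the whole test set with yardstick, the predicted class just needs to share the factor levels of the truth column; this sketch assumes those levels are c("No", "Yes"), with "Yes" as the second (event) level.

# Test-set metrics with yardstick (sketch; assumes factor levels c("No", "Yes"))
eval_tbl <- test_data %>%
  mutate(Predicted_Class = factor(Predicted_Class, levels = levels(Accreditation)))

accuracy(eval_tbl, truth = Accreditation, estimate = Predicted_Class)
roc_auc(eval_tbl, truth = Accreditation, Predicted_Prob, event_level = "second")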

Plot variable importance

h2o.varimp_plot(best_model)
## Warning: This model doesn't have variable importances
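
The warning appears because the leader is a stacked ensemble, which has no single set of variable importances. One option is to plot importances for the best non-ensemble model on the leaderboard instead; a minimal sketch:

# Plot variable importance for the best non-ensemble model (sketch)
model_ids <- as.data.frame(h2o_leaderboard)$model_id
best_single_id <- model_ids[!grepl("StackedEnsemble", model_ids)][1]
h2o.varimp_plot(h2o.getModel(best_single_id))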

Shut down H2O when done

h2o.shutdown(prompt = FALSE)