Load necessary libraries
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ recipes 1.0.10
## ✔ dials 1.2.1 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:yardstick':
##
## spec
## The following object is masked from 'package:scales':
##
## col_factor
library(dplyr)
museum_raw_tbl <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
museum_raw_tbl %>% glimpse()
## Rows: 4,191
## Columns: 35
## $ museum_id <chr> "mm.New.1", "mm.aim.1230", "mm.…
## $ Name_of_museum <chr> "Titanic Belfast", "The Woodlan…
## $ Address_line_1 <chr> "1 Olympic Way", NA, "Warwick C…
## $ Address_line_2 <chr> NA, "Brokerswood", "Horticultur…
## $ `Village,_Town_or_City` <chr> "Belfast", "nr Westbury", "More…
## $ Postcode <chr> "BT3 9EP", "BA13 4EH", "CV35 9B…
## $ Latitude <dbl> 54.60808, 51.27090, 52.19715, 4…
## $ Longitude <dbl> -5.909915, -2.231863, -1.555528…
## $ Admin_area <chr> "/Northern Ireland/Belfast (NI …
## $ Accreditation <chr> "Unaccredited", "Unaccredited",…
## $ Governance <chr> "Independent-Not_for_profit", "…
## $ Size <chr> "large", "small", "medium", "sm…
## $ Size_provenance <chr> NA, "aim_size_designation", "mm…
## $ Subject_Matter <chr> "Sea_and_seafaring-Boats_and_sh…
## $ Year_opened <chr> "2012:2012", "1971:1971", "1984…
## $ Year_closed <chr> "9999:9999", "2007:2017", "9999…
## $ DOMUS_Subject_Matter <chr> NA, NA, NA, NA, NA, NA, NA, "sc…
## $ DOMUS_identifier <dbl> NA, NA, 1218, NA, NA, NA, 1528,…
## $ Primary_provenance_of_data <chr> "New", "aim", "domus", "aim", "…
## $ Identifier_used_in_primary_data_source <chr> NA, NA, "WM000019", NA, NA, NA,…
## $ Area_Deprivation_index <dbl> 2, 9, 8, NA, 8, 2, 6, 6, 5, 6, …
## $ Area_Deprivation_index_crime <dbl> 3, 8, 9, NA, 10, 1, 10, 3, 1, 1…
## $ Area_Deprivation_index_education <dbl> 1, 9, 8, NA, 7, 6, 8, 7, 7, 6, …
## $ Area_Deprivation_index_employment <dbl> 2, 8, 10, NA, 7, 3, 7, 6, 6, 7,…
## $ Area_Deprivation_index_health <dbl> 1, 8, 8, NA, 8, 2, 7, 8, 5, 7, …
## $ Area_Deprivation_index_housing <dbl> 4, 6, 5, NA, 7, 1, 8, 9, 1, 7, …
## $ Area_Deprivation_index_income <dbl> 5, 8, 8, NA, 8, 3, 5, 5, 7, 5, …
## $ Area_Deprivation_index_services <dbl> 5, 4, 1, NA, 4, 4, 2, 3, 9, 1, …
## $ Area_Geodemographic_group <chr> "Larger Towns and Cities", "Cou…
## $ Area_Geodemographic_group_code <chr> "2ar", "7ar", "3ar", NA, "7ar",…
## $ Area_Geodemographic_subgroup <chr> "Larger Towns and Cities", "Cou…
## $ Area_Geodemographic_subgroup_code <chr> "2a1r", "7a1r", "3a1r", NA, "7a…
## $ Area_Geodemographic_supergroup <chr> "Business Education and Heritag…
## $ Area_Geodemographic_supergroup_code <chr> "2r", "7r", "3r", NA, "7r", "5r…
## $ Notes <chr> NA, "Previously known as Philli…
Data cleaning and preprocessing
library(stringr)
##
## Attaching package: 'stringr'
## The following object is masked from 'package:recipes':
##
## fixed
museum_data <- museum_raw_tbl %>%
# Remove unnecessary columns
select(-c(Size_provenance, DOMUS_Subject_Matter, Year_closed, Primary_provenance_of_data,
Identifier_used_in_primary_data_source, Area_Geodemographic_subgroup,
Area_Geodemographic_subgroup_code, Area_Geodemographic_supergroup,
Area_Geodemographic_supergroup_code, Notes, Latitude, Longitude, DOMUS_identifier,
Address_line_2, Postcode, Admin_area, Area_Geodemographic_group, Name_of_museum)) %>%
# Remove rows with missing values
na.omit() %>%
# Change column name and adjust Years opened
rename("Village_Town_City" = "Village,_Town_or_City") %>%
mutate(Year_opened = str_sub(Year_opened, 1, 4)) %>%
# Recode Accreditation
mutate(Accreditation = if_else(Accreditation == "Accredited", "Yes", "No")) %>%
# Convert selected columns to factors
mutate(across(c(Accreditation, Governance, Size, Subject_Matter,
Area_Geodemographic_group_code, Area_Deprivation_index,
Area_Deprivation_index_crime, Area_Deprivation_index_education,
Area_Deprivation_index_employment, Area_Deprivation_index_health,
Area_Deprivation_index_housing, Area_Deprivation_index_income,
Area_Deprivation_index_services, Year_opened, Village_Town_City,
Address_line_1, museum_id), as.factor))
Initialize H2O
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /var/folders/hx/57xnmk_52272gw1nc2jn7nbr0000gn/T//RtmpXwzGrx/file1e5e4ac30e2a/h2o_max_started_from_r.out
## /var/folders/hx/57xnmk_52272gw1nc2jn7nbr0000gn/T//RtmpXwzGrx/file1e5e4bd70dbe/h2o_max_started_from_r.err
##
##
## Starting H2O JVM and connecting: ... Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 883 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 11 months and 14 days
## H2O cluster name: H2O_started_from_R_max_kez235
## H2O cluster total nodes: 1
## H2O cluster total memory: 4.00 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (11 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
Split the data into training and testing sets
set.seed(123) # Set seed for reproducibility
museum_split <- initial_split(museum_data, prop = 0.7, strata = Accreditation)
train_data <- training(museum_split)
test_data <- testing(museum_split)
Train an H2O model (example with AutoML)
h2o_model <- h2o.automl(
x = setdiff(names(train_h2o), "Accreditation"),
y = "Accreditation",
training_frame = train_h2o,
max_models = 10,
seed = 123
)
## | | | 0% | |= | 2%
## 11:04:30.153: AutoML: XGBoost is not available; skipping it. | |== | 3% | |==== | 6% | |====== | 8% | |====== | 9% | |======== | 12% | |========== | 15% | |============= | 18% | |============== | 20% | |================ | 24% | |================= | 24% | |================== | 25% | |================== | 26% | |================================= | 47% | |=================================== | 50% | |======================================================================| 100%
View leaderboard of models
h2o_leaderboard <- h2o_model@leaderboard
print(h2o_leaderboard)
## model_id auc logloss
## 1 StackedEnsemble_AllModels_1_AutoML_1_20241205_110430 0.8724992 0.4230080
## 2 StackedEnsemble_BestOfFamily_1_AutoML_1_20241205_110430 0.8716045 0.4243102
## 3 GBM_1_AutoML_1_20241205_110430 0.8683092 0.4381646
## 4 GBM_4_AutoML_1_20241205_110430 0.8662477 0.4521128
## 5 GBM_2_AutoML_1_20241205_110430 0.8645222 0.4539867
## 6 GBM_3_AutoML_1_20241205_110430 0.8625660 0.4609696
## aucpr mean_per_class_error rmse mse
## 1 0.7964384 0.1722401 0.3711937 0.1377847
## 2 0.7947533 0.1787731 0.3719884 0.1383753
## 3 0.7970700 0.1891296 0.3792700 0.1438458
## 4 0.7864528 0.1805652 0.3831627 0.1468137
## 5 0.7893594 0.1850801 0.3858587 0.1488869
## 6 0.7917009 0.1910325 0.3885095 0.1509396
##
## [12 rows x 7 columns]
Get the best model and evaluate
best_model <- h2o_model@leader
perf <- h2o.performance(model = best_model, newdata = test_h2o)
Print evaluation metrics
auc <- h2o.auc(perf)
accuracy <- h2o.accuracy(perf)[1]
conf_matrix <- h2o.confusionMatrix(perf)
# Extract the first accuracy value as a numeric value
accuracy_value <- as.numeric(accuracy[[1]])
# Display metrics
cat("AUC:", auc, "\n")
## AUC: 0.8060383
cat("Accuracy:", accuracy_value, "\n")
## Accuracy: 0.9882912 0.9767092 0.9680851 0.9662391 0.9618365 0.9601964 0.9579237 0.9541261 0.9528713 0.9488491 0.9464409 0.9407771 0.9372398 0.9353971 0.9332279 0.930636 0.9294836 0.9276712 0.9243781 0.9214854 0.9205395 0.9172129 0.9155324 0.9137486 0.9118078 0.9105374 0.9083459 0.9069854 0.9037838 0.9015871 0.8994127 0.8970059 0.895885 0.8945946 0.892772 0.8902122 0.8884796 0.8870361 0.8851532 0.8841667 0.8829434 0.8815852 0.8802644 0.8771255 0.8750463 0.87284 0.8715822 0.8697823 0.8676952 0.8663742 0.8653541 0.8633532 0.8611799 0.8592667 0.8579888 0.8569902 0.8558366 0.8545242 0.8530648 0.850268 0.8488162 0.8467503 0.8453921 0.8436523 0.8420911 0.8408593 0.8395567 0.8372408 0.8358936 0.8327387 0.8311139 0.8294144 0.8274662 0.826013 0.8245863 0.8229585 0.8210645 0.8198978 0.8181426 0.8155408 0.8143596 0.8134103 0.8120017 0.8104114 0.8088116 0.8066661 0.8042583 0.8026536 0.8018425 0.8003751 0.7986555 0.7965679 0.7946307 0.7937175 0.7912766 0.7905506 0.7899129 0.7884354 0.7872018 0.7842907 0.7829582 0.7819766 0.7804262 0.7789809 0.7777819 0.7758327 0.7741312 0.7730104 0.7712402 0.7691108 0.7675268 0.7660223 0.7650629 0.7636817 0.7627724 0.7619158 0.7610003 0.7590634 0.7577994 0.7558833 0.7545017 0.7534763 0.7513651 0.7499097 0.7488065 0.7462044 0.7432433 0.7414041 0.7396842 0.7381143 0.7360052 0.7346701 0.7322858 0.7309713 0.7293676 0.727943 0.7257688 0.7247666 0.72278 0.7212439 0.7202138 0.7183384 0.7162213 0.7150371 0.7136173 0.7113809 0.7098169 0.7081243 0.7071341 0.7062231 0.705021 0.7023743 0.6999312 0.6982045 0.6972926 0.6958109 0.6931257 0.6915758 0.689608 0.6873364 0.6844132 0.6829053 0.6797098 0.6780604 0.6770119 0.672744 0.670956 0.6694857 0.6667957 0.6617441 0.6595621 0.6578229 0.6566148 0.6549328 0.6526737 0.6503149 0.647401 0.6464442 0.6453163 0.6433741 0.6418333 0.6403665 0.637899 0.636246 0.6353435 0.6334793 0.6306595 0.6281637 0.6269226 0.6258297 0.6248923 0.6227289 0.6191572 0.6157383 0.6134489 0.6102314 0.6066698 0.605276 0.6040104 0.6023172 0.5981817 0.5955358 0.5936199 0.5915606 0.5895553 0.5880443 0.5860421 0.5843268 0.5815153 0.5803652 0.5778527 0.5759818 0.5738498 0.5730902 0.571458 0.5672821 0.5645268 0.563088 0.5605558 0.5596553 0.5576632 0.5529486 0.551079 0.5502888 0.5465071 0.5451652 0.5428109 0.5401397 0.5382254 0.5360853 0.5350125 0.5329109 0.5296221 0.5278875 0.5247262 0.5222117 0.5204582 0.5175558 0.5157852 0.5131076 0.5111735 0.5088863 0.5071185 0.5053611 0.5035186 0.4959915 0.4948479 0.4921075 0.4907912 0.4893682 0.4876761 0.4853393 0.4837128 0.4828395 0.4809965 0.4763351 0.4726066 0.4712921 0.4684185 0.4667841 0.4640184 0.457109 0.4554314 0.4530283 0.4513428 0.4491977 0.446832 0.4447901 0.4425641 0.4388787 0.4343077 0.4313239 0.4293434 0.4258098 0.4225229 0.4198664 0.4113812 0.4097684 0.404999 0.4014692 0.3979922 0.3954397 0.3924408 0.3885009 0.3865473 0.3756522 0.3691058 0.3604233 0.3570657 0.3538014 0.3523886 0.349642 0.3479325 0.3448623 0.3393144 0.3351807 0.3225929 0.3190061 0.3176251 0.3158573 0.3134751 0.3059468 0.3028284 0.3016941 0.2916909 0.2897718 0.285274 0.2779097 0.2735507 0.2711085 0.2683717 0.2645332 0.2614672 0.2540277 0.2446618 0.2367012 0.2262744 0.2222287 0.2186977 0.2148883 0.2106847 0.2072131 0.2056932 0.2033045 0.1977821 0.1927259 0.1870971 0.1774534 0.1715691 0.1698603 0.1664135 0.1633673 0.1575466 0.1555448 0.152887 0.1503855 0.1454095 0.1433499 0.1403752 0.1378406 0.1358449 0.1332337 0.1318882 0.1291509 0.1276557 0.1257605 0.1232252 0.1217235 0.1203075 0.1187722 0.1158537 0.1130286 0.1089636 0.1073796 0.1061785 0.104643 0.1023955 0.101206 0.09928449 0.09769946 0.09669809 0.09579493 0.09410177 0.09288888 0.09121125 0.09027474 0.08881108 0.08732413 0.08611178 0.0848231 0.08399826 0.08281374 0.08117734 0.08026678 0.07828253 0.07627396 0.07495205 0.07424034 0.07293635 0.07163518 0.07044665 0.06929159 0.06828793 0.06665654 0.06575476 0.06474131 0.0632122 0.06227951 0.06130372 0.06038419 0.05830176 0.05687422 0.05514898 0.05275748 0.05125445 0.04869501 0.04670815 0.04350422 0.04138165 0.03740089
Predict on the test data
predictions <- h2o.predict(best_model, test_h2o)
## | | | 0% | |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'museum_id' has levels not trained on: ["domus.NE043",
## "mm.MDN.005", "mm.MDN.023", "mm.MDN.031", "mm.Mus70Cal.003", "mm.Mus70Cal.004",
## "mm.Mus70Cal.008", "mm.Mus70Cal.014", "mm.Mus70Cal.015", "mm.Mus70Cal.017",
## ...1093 not listed..., "mm.wiki.444", "mm.wiki.449", "mm.wiki.452",
## "mm.wiki.456", "mm.wiki.460", "mm.wiki.467", "mm.wiki.468", "mm.wiki.484",
## "mm.wiki.500", "mm.wiki.501"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Address_line_1' has levels not trained on: ["(Eling Tide
## Mill/Eling Heritage Centre)", "1 - 29 Bridge Street", "1 Albany Road", "1 Clink
## Street", "1 Elmsleigh Road", "1 Market Place", "1 Queen Street", "1 Rectory
## Street", "1 Scala St", "1 Taylor Street", ...873 not listed..., "York
## Racecourse", "York Road", "Yorkshire Squadron", "c/o 1 Royal Crescent", "c/o 32
## Churchgate", "c/o 7 Hospital Lane", "c/o Cornish Gold Site", "c/o Jewish
## Museum", "c/o Spennymoor Town Hall", "c/of Gullivers world"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Village_Town_City' has levels not trained on: ["Abergele",
## "Abernethy", "Aberporth", "Abington", "Accrington", "Alfreton", "Alloa",
## "Alston", "Amlwch Port", "Ardvasar", ...275 not listed..., "nr Leek", "nr
## Okehampton", "nr Romsey", "nr Yeovil", "nr. Cranbrook", "nr. Helston", "nr.
## Ponteland", "nr. Richmond", "nr. Southampton", "nr. Winkleigh"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Subject_Matter' has levels not trained on: ["Archaeology"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Year_opened' has levels not trained on: ["1660", "1676",
## "1771", "1811", "1812", "1818", "1848", "1854", "1864", "1865", "1871", "1882",
## "1909", "1916"]
test_data <- test_data %>%
mutate(
Predicted_Prob = as.vector(predictions[, "Yes"]), # Probability of "Yes" (Accredited)
Predicted_Class = as.vector(predictions[, "predict"]) # Predicted class
)
Example output of predictions
print(head(test_data %>% select(Accreditation, Predicted_Prob, Predicted_Class)))
## # A tibble: 6 × 3
## Accreditation Predicted_Prob Predicted_Class
## <fct> <dbl> <chr>
## 1 No 0.483 Yes
## 2 Yes 0.908 Yes
## 3 Yes 0.823 Yes
## 4 No 0.0680 No
## 5 No 0.673 Yes
## 6 No 0.766 Yes
Plot variable importance
h2o.varimp_plot(best_model)
## Warning: This model doesn't have variable importances
Shutdown H2O when done
h2o.shutdown(prompt = FALSE)