library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::day() masks h2o::day()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::hour() masks h2o::hour()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::month() masks h2o::month()
## ✖ lubridate::week() masks h2o::week()
## ✖ lubridate::year() masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching core tidyquant packages ──────────────────────── tidyquant 1.0.9 ──
## ✔ PerformanceAnalytics 2.0.4 ✔ TTR 0.24.4
## ✔ quantmod 0.4.25 ✔ xts 0.13.2── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ scales::col_factor() masks readr::col_factor()
## ✖ lubridate::day() masks h2o::day()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ lubridate::hour() masks h2o::hour()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ TTR::momentum() masks dials::momentum()
## ✖ lubridate::month() masks h2o::month()
## ✖ yardstick::spec() masks readr::spec()
## ✖ quantmod::summary() masks h2o::summary(), base::summary()
## ✖ lubridate::week() masks h2o::week()
## ✖ lubridate::year() masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(stringr)
data_clean <- data %>%
# Remove unnecessary columns
select(-c(Size_provenance, DOMUS_Subject_Matter, Year_closed, Primary_provenance_of_data,
Identifier_used_in_primary_data_source, Area_Geodemographic_subgroup,
Area_Geodemographic_subgroup_code, Area_Geodemographic_supergroup,
Area_Geodemographic_supergroup_code, Notes, Latitude, Longitude, DOMUS_identifier,
Address_line_2, Postcode, Admin_area, Area_Geodemographic_group, Name_of_museum)) %>%
# Remove rows with missing values
na.omit() %>%
# Change column name and adjust Years opened
rename("Village_Town_City" = "Village,_Town_or_City") %>%
mutate(Year_opened = str_sub(Year_opened, 1, 4)) %>%
# Recode Accreditation
mutate(Accreditation = if_else(Accreditation == "Accredited", "Yes", "No")) %>%
# Convert selected columns to factors
mutate(across(c(Accreditation, Governance, Size, Subject_Matter,
Area_Geodemographic_group_code, Area_Deprivation_index,
Area_Deprivation_index_crime, Area_Deprivation_index_education,
Area_Deprivation_index_employment, Area_Deprivation_index_health,
Area_Deprivation_index_housing, Area_Deprivation_index_income,
Area_Deprivation_index_services, Year_opened, Village_Town_City,
Address_line_1, museum_id), as.factor))
# Preview the cleaned data
glimpse(data_clean)
## Rows: 3,708
## Columns: 17
## $ museum_id <fct> mm.New.1, mm.domus.WM019, mm.aim.048…
## $ Address_line_1 <fct> "1 Olympic Way", "Warwick College of…
## $ Village_Town_City <fct> "Belfast", "Moreton Morrell", "Chelt…
## $ Accreditation <fct> No, No, Yes, No, No, Yes, Yes, Yes, …
## $ Governance <fct> Independent-Not_for_profit, Governme…
## $ Size <fct> large, medium, medium, small, small,…
## $ Subject_Matter <fct> Sea_and_seafaring-Boats_and_ships, R…
## $ Year_opened <fct> 2012, 1984, 2013, 1996, 1980, 1993, …
## $ Area_Deprivation_index <fct> 2, 8, 8, 2, 6, 6, 5, 6, 3, 7, 5, 8, …
## $ Area_Deprivation_index_crime <fct> 3, 9, 10, 1, 10, 3, 1, 10, 1, 7, 10,…
## $ Area_Deprivation_index_education <fct> 1, 8, 7, 6, 8, 7, 7, 6, 4, 7, 6, 6, …
## $ Area_Deprivation_index_employment <fct> 2, 10, 7, 3, 7, 6, 6, 7, 2, 7, 6, 8,…
## $ Area_Deprivation_index_health <fct> 1, 8, 8, 2, 7, 8, 5, 7, 2, 9, 5, 8, …
## $ Area_Deprivation_index_housing <fct> 4, 5, 7, 1, 8, 9, 1, 7, 6, 7, 7, 5, …
## $ Area_Deprivation_index_income <fct> 5, 8, 8, 3, 5, 5, 7, 5, 3, 8, 6, 8, …
## $ Area_Deprivation_index_services <fct> 5, 1, 4, 4, 2, 3, 9, 1, 9, 1, 1, 2, …
## $ Area_Geodemographic_group_code <fct> 2ar, 3ar, 7ar, 5ar, 3cr, 6br, 2ar, 3…
set.seed(1234)
data_clean <- data_clean %>% sample_n(100)
data_split <- initial_split(data_clean, strata = "Accreditation")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)
recipe_obj <- recipe(Accreditation ~ ., data = train_tbl) %>%
# Remove zero variance variables
step_zv(all_predictors())
# Initialize h2o
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 days 6 hours
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 11 months and 1 day
## H2O cluster name: H2O_started_from_R_max_dlj571
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.20 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (11 months and 1 day) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
split.h2o <- h2o.splitFrame(as.h2o(train_tbl), seed = 2345)
## | | | 0% | |======================================================================| 100%
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(test_tbl)
## | | | 0% | |======================================================================| 100%
y <- "Accreditation"
x <- setdiff(names(train_tbl), y)
models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 60,
exclude_algos = c("DeepLearning", "XGBoost"),
nfolds = 2, # Reduce number of folds for faster execution
seed = 3456
)
## | | | 0% | |================== | 26%
## 18:27:11.814: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 18:27:12.66: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 54.0. | |====================== | 31% | |====================== | 32% | |======================= | 32% | |======================= | 33% | |======================== | 34% | |========================= | 35% | |========================= | 36% | |========================== | 37% | |========================== | 38% | |=========================== | 39% | |============================== | 42% | |================================ | 46% | |================================== | 49% | |===================================== | 53% | |======================================= | 56% | |========================================== | 60% | |============================================ | 63% | |============================================== | 66% | |================================================= | 70% | |=================================================== | 73% | |====================================================== | 77% | |======================================================== | 80% | |========================================================== | 83% | |============================================================= | 87% | |=============================================================== | 90% | |======================================================================| 100%
Examine the output of h2o.automl
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc logloss aucpr
## 1 GBM_grid_1_AutoML_29_20241121_182711_model_26 0.8848485 0.6506123 0.8687271
## 2 GBM_grid_1_AutoML_29_20241121_182711_model_264 0.8303030 0.7631890 0.8228281
## 3 GBM_grid_1_AutoML_29_20241121_182711_model_143 0.8242424 0.7207109 0.7579524
## 4 GBM_grid_1_AutoML_29_20241121_182711_model_436 0.8000000 0.7672182 0.7619648
## 5 GBM_grid_1_AutoML_29_20241121_182711_model_446 0.7878788 0.7494822 0.7592999
## 6 GBM_grid_1_AutoML_29_20241121_182711_model_406 0.7757576 0.9181558 0.7742626
## mean_per_class_error rmse mse
## 1 0.1696970 0.4828174 0.2331126
## 2 0.1909091 0.5246858 0.2752952
## 3 0.1787879 0.5093841 0.2594722
## 4 0.2666667 0.5259563 0.2766301
## 5 0.2454545 0.5154917 0.2657317
## 6 0.2151515 0.5586833 0.3121270
##
## [292 rows x 7 columns]
models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model ID: GBM_grid_1_AutoML_29_20241121_182711_model_26
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 20 20 1810 1
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 2 1.15000 2 3 2.15000
##
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.03584618
## RMSE: 0.1893309
## LogLoss: 0.2014354
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 1
## Gini: 1
## R^2: 0.8491667
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## No Yes Error Rate
## No 33 0 0.000000 =0/33
## Yes 0 21 0.000000 =0/21
## Totals 33 21 0.000000 =0/54
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.673684 1.000000 15
## 2 max f2 0.673684 1.000000 15
## 3 max f0point5 0.673684 1.000000 15
## 4 max accuracy 0.673684 1.000000 15
## 5 max precision 0.850243 1.000000 0
## 6 max recall 0.673684 1.000000 15
## 7 max specificity 0.850243 1.000000 0
## 8 max absolute_mcc 0.673684 1.000000 15
## 9 max min_per_class_accuracy 0.673684 1.000000 15
## 10 max mean_per_class_accuracy 0.673684 1.000000 15
## 11 max tns 0.850243 33.000000 0
## 12 max fns 0.850243 18.000000 0
## 13 max fps 0.089241 33.000000 46
## 14 max tps 0.673684 21.000000 15
## 15 max tnr 0.850243 1.000000 0
## 16 max fnr 0.850243 0.857143 0
## 17 max fpr 0.089241 1.000000 46
## 18 max tpr 0.673684 1.000000 15
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
##
## MSE: 0.2845475
## RMSE: 0.5334299
## LogLoss: 0.7642133
## Mean Per-Class Error: 0.1
## AUC: 0.92
## AUCPR: 0.9136954
## Gini: 0.84
## R^2: -0.13819
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## No Yes Error Rate
## No 8 2 0.200000 =2/10
## Yes 0 10 0.000000 =0/10
## Totals 8 12 0.100000 =2/20
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.180057 0.909091 11
## 2 max f2 0.180057 0.961538 11
## 3 max f0point5 0.180057 0.862069 11
## 4 max accuracy 0.180057 0.900000 11
## 5 max precision 0.385679 1.000000 0
## 6 max recall 0.180057 1.000000 11
## 7 max specificity 0.385679 1.000000 0
## 8 max absolute_mcc 0.180057 0.816497 11
## 9 max min_per_class_accuracy 0.210815 0.800000 9
## 10 max mean_per_class_accuracy 0.180057 0.900000 11
## 11 max tns 0.385679 10.000000 0
## 12 max fns 0.385679 9.000000 0
## 13 max fps 0.095409 10.000000 18
## 14 max tps 0.180057 10.000000 11
## 15 max tnr 0.385679 1.000000 0
## 16 max fnr 0.385679 0.900000 0
## 17 max fpr 0.095409 1.000000 18
## 18 max tpr 0.180057 1.000000 11
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 2-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.2700837
## RMSE: 0.5196958
## LogLoss: 0.7732328
## Mean Per-Class Error: 0.5
## AUC: 0.47114
## AUCPR: 0.4909416
## Gini: -0.05772006
## R^2: -0.1364561
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## No Yes Error Rate
## No 0 33 1.000000 =33/33
## Yes 0 21 0.000000 =0/21
## Totals 0 54 0.611111 =33/54
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.101725 0.560000 49
## 2 max f2 0.101725 0.760870 49
## 3 max f0point5 0.374013 0.555556 4
## 4 max accuracy 0.374013 0.685185 4
## 5 max precision 0.457127 1.000000 0
## 6 max recall 0.101725 1.000000 49
## 7 max specificity 0.457127 1.000000 0
## 8 max absolute_mcc 0.374013 0.322329 4
## 9 max min_per_class_accuracy 0.194146 0.393939 26
## 10 max mean_per_class_accuracy 0.374013 0.603896 4
## 11 max tns 0.457127 33.000000 0
## 12 max fns 0.457127 20.000000 0
## 13 max fps 0.107157 33.000000 48
## 14 max tps 0.101725 21.000000 49
## 15 max tnr 0.457127 1.000000 0
## 16 max fnr 0.457127 0.952381 0
## 17 max fpr 0.107157 1.000000 48
## 18 max tpr 0.101725 1.000000 49
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid
## accuracy 0.407407 0.000000 0.407407 0.407407
## auc 0.443416 0.136269 0.347059 0.539773
## err 0.592593 0.000000 0.592593 0.592593
## err_count 16.000000 0.000000 16.000000 16.000000
## f0point5 0.450391 0.016680 0.438597 0.462185
## f1 0.567251 0.016541 0.555556 0.578947
## f2 0.766112 0.012072 0.757576 0.774648
## lift_top_group 2.577273 0.173563 2.700000 2.454546
## logloss 0.907311 0.031907 0.884750 0.929873
## max_per_class_error 0.970588 0.041595 0.941176 1.000000
## mcc 0.150414 0.000000 0.150414 NA
## mean_per_class_accuracy 0.514706 0.020797 0.529412 0.500000
## mean_per_class_error 0.485294 0.020797 0.470588 0.500000
## mse 0.299219 0.010493 0.291799 0.306639
## pr_auc 0.503355 0.100334 0.432409 0.574302
## precision 0.396011 0.016116 0.384615 0.407407
## r2 -0.260709 0.013300 -0.251305 -0.270114
## recall 1.000000 0.000000 1.000000 1.000000
## rmse 0.546967 0.009592 0.540185 0.553750
## specificity 0.029412 0.041595 0.058824 0.000000
?h2o.getModel
?h2o.saveModel
?h2o.loadModel
h2o.getModel("GBM_grid_1_AutoML_27_20241121_181806_model_26") %>%
h2o.saveModel("/Users/max/Desktop/DAT3100_DataAnalytics/PSU_DAT3100/11_module13/h2o_models/")
## [1] "/Users/max/Desktop/DAT3100_DataAnalytics/PSU_DAT3100/11_module13/h2o_models/GBM_grid_1_AutoML_27_20241121_181806_model_26"
best_model <- h2o.loadModel("/Users/max/Desktop/DAT3100_DataAnalytics/PSU_DAT3100/11_module13/h2o_models/GBM_grid_1_AutoML_27_20241121_181806_model_26")
predictions <- h2o.predict(best_model, newdata = test_h2o)
## | | | 0% | |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'museum_id' has levels not trained on: ["mm.MDN.020",
## "mm.New.173", "mm.ace.1172", "mm.ace.1174", "mm.aim.0200", "mm.aim.0262",
## "mm.aim.0300", "mm.aim.0818", "mm.aim.1156", "mm.aim82M.052", ...6 not
## listed..., "mm.domus.SE154", "mm.domus.SE336", "mm.domus.WA005",
## "mm.domus.WA022", "mm.domus.WM147", "mm.domus.YH113", "mm.misc.101",
## "mm.misc.226", "mm.wiki.440", "mm.wiki.479"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Address_line_1' has levels not trained on: ["104 Grove Road",
## "11 St Andrews Place", "22 St Georges Road", "3 Commercial Street", "65 Peckham
## Road", "94-B Halftown Road", "Abergwili", "Barr St", "Broad Street", "Castle
## Street", ...6 not listed..., "Frogmore Paper Mill", "Harbour Road", "Livingston
## Mill", "Prince Consort Road", "School Road", "The Coliseum", "The Quays", "The
## Shambles", "White Street", "Yore Mill"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Village_Town_City' has levels not trained on: ["Aberystwyth",
## "Apsley", "Carmarthen", "Castletown", "Deal", "Diss", "Galston", "Gateshead",
## "Great Yarmouth", "Horsham", ...1 not listed..., "Leyburn", "Lisburn",
## "Livingston Village", "Newport", "North Berwick", "Norton", "Portland",
## "Powys", "Ruthin", "Shepton Mallet"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Subject_Matter' has levels not trained on:
## ["Medicine_and_health-Professional_association", "Natural_world-Dinosaurs",
## "Rural_Industry-Other", "Services-Other", "Transport-Aviation",
## "Transport-Other", "War_and_conflict-Military"]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'Year_opened' has levels not trained on: ["1891", "1901",
## "1908", "1917", "1984", "2001", "2002", "2007", "2008", "2013", "2014", "2020"]
predictions_tbl <- predictions %>%
as.tibble()
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
predictions_tbl %>%
bind_cols(test_tbl)
## # A tibble: 26 × 20
## predict No Yes museum_id Address_line_1 Village_Town_City Accreditation
## <fct> <dbl> <dbl> <fct> <fct> <fct> <fct>
## 1 Yes 0.802 0.198 mm.MDN.020 94-B Halftown… Lisburn No
## 2 Yes 0.802 0.198 mm.aim.02… Dinosaur Farm Newport No
## 3 Yes 0.732 0.268 mm.domus.… The Shambles Diss Yes
## 4 Yes 0.724 0.276 mm.wiki.4… Castle Street Ruthin No
## 5 Yes 0.681 0.319 mm.wiki.4… Harbour Road Castletown No
## 6 No 0.827 0.173 mm.misc.1… 104 Grove Road Portland No
## 7 Yes 0.747 0.253 mm.domus.… 65 Peckham Ro… London No
## 8 Yes 0.780 0.220 mm.domus.… Prince Consor… Gateshead Yes
## 9 Yes 0.792 0.208 mm.aim82M… White Street Great Yarmouth No
## 10 Yes 0.756 0.244 mm.domus.… 22 St Georges… Deal No
## # ℹ 16 more rows
## # ℹ 13 more variables: Governance <fct>, Size <fct>, Subject_Matter <fct>,
## # Year_opened <fct>, Area_Deprivation_index <fct>,
## # Area_Deprivation_index_crime <fct>, Area_Deprivation_index_education <fct>,
## # Area_Deprivation_index_employment <fct>,
## # Area_Deprivation_index_health <fct>, Area_Deprivation_index_housing <fct>,
## # Area_Deprivation_index_income <fct>, …
?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "GBM_grid_1_AutoML_27_20241121_181806_model_26"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/GBM_grid_1_AutoML_27_20241121_181806_model_26"
##
##
## $model_checksum
## [1] "-3234214099757550656"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_9e4e_3"
##
##
## $frame_checksum
## [1] -1.409538e+12
##
## $description
## NULL
##
## $scoring_time
## [1] 1.732232e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.2331126
##
## $RMSE
## [1] 0.4828174
##
## $nobs
## [1] 26
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.04494471
##
## $logloss
## [1] 0.6506123
##
## $AUC
## [1] 0.8848485
##
## $pr_auc
## [1] 0.8687271
##
## $Gini
## [1] 0.769697
##
## $mean_per_class_error
## [1] 0.169697
##
## $domain
## [1] "No" "Yes"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## No Yes Error Rate
## No 14 1 0.0667 = 1 / 15
## Yes 3 8 0.2727 = 3 / 11
## Totals 17 9 0.1538 = 4 / 26
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.389455 0.166667 0.111111 0.333333 0.615385 1.000000 0.090909 1.000000
## 2 0.387902 0.307692 0.217391 0.526316 0.653846 1.000000 0.181818 1.000000
## 3 0.349147 0.428571 0.319149 0.652174 0.692308 1.000000 0.272727 1.000000
## 4 0.335706 0.625000 0.510204 0.806452 0.769231 1.000000 0.454545 1.000000
## 5 0.319013 0.588235 0.500000 0.714286 0.730769 0.833333 0.454545 0.933333
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.233550 0.090909 0.545455 15 10 0 1
## 2 0.337100 0.181818 0.590909 15 9 0 2
## 3 0.421741 0.272727 0.636364 15 8 0 3
## 4 0.569803 0.454545 0.727273 15 6 0 5
## 5 0.454828 0.454545 0.693939 14 6 1 5
## tnr fnr fpr tpr idx
## 1 1.000000 0.909091 0.000000 0.090909 0
## 2 1.000000 0.818182 0.000000 0.181818 1
## 3 1.000000 0.727273 0.000000 0.272727 2
## 4 1.000000 0.545455 0.000000 0.454545 3
## 5 0.933333 0.545455 0.066667 0.454545 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 19 0.207595 0.709677 0.859375 0.604396 0.653846 0.550000 1.000000 0.400000
## 20 0.198324 0.666667 0.833333 0.555556 0.576923 0.500000 1.000000 0.266667
## 21 0.198155 0.647059 0.820896 0.533981 0.538462 0.478261 1.000000 0.200000
## 22 0.178375 0.628571 0.808824 0.514019 0.500000 0.458333 1.000000 0.133333
## 23 0.172737 0.611111 0.797101 0.495495 0.461538 0.440000 1.000000 0.066667
## 24 0.159221 0.594595 0.785714 0.478261 0.423077 0.423077 1.000000 0.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 19 0.469042 0.400000 0.700000 6 0 9 11
## 20 0.365148 0.266667 0.633333 4 0 11 11
## 21 0.309277 0.200000 0.600000 3 0 12 11
## 22 0.247207 0.133333 0.566667 2 0 13 11
## 23 0.171270 0.066667 0.533333 1 0 14 11
## 24 0.000000 0.000000 0.500000 0 0 15 11
## tnr fnr fpr tpr idx
## 19 0.400000 0.000000 0.600000 1.000000 18
## 20 0.266667 0.000000 0.733333 1.000000 19
## 21 0.200000 0.000000 0.800000 1.000000 20
## 22 0.133333 0.000000 0.866667 1.000000 21
## 23 0.066667 0.000000 0.933333 1.000000 22
## 24 0.000000 0.000000 1.000000 1.000000 23
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.281246 0.800000 7
## 2 max f2 0.219858 0.873016 17
## 3 max f0point5 0.281246 0.851064 7
## 4 max accuracy 0.281246 0.846154 7
## 5 max precision 0.389455 1.000000 0
## 6 max recall 0.219858 1.000000 17
## 7 max specificity 0.389455 1.000000 0
## 8 max absolute_mcc 0.281246 0.686023 7
## 9 max min_per_class_accuracy 0.268016 0.800000 10
## 10 max mean_per_class_accuracy 0.281246 0.830303 7
## 11 max tns 0.389455 15.000000 0
## 12 max fns 0.389455 10.000000 0
## 13 max fps 0.159221 15.000000 23
## 14 max tps 0.219858 11.000000 17
## 15 max tnr 0.389455 1.000000 0
## 16 max fnr 0.389455 0.909091 0
## 17 max fpr 0.159221 1.000000 23
## 18 max tpr 0.219858 1.000000 17
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 42,31 %, avg score: 26,24 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.03846154 0.389067 2.363636 2.363636
## 2 2 0.03846154 0.388679 0.000000 2.363636
## 3 3 0.03846154 0.388291 0.000000 2.363636
## 4 4 0.07692308 0.387902 2.363636 2.363636
## 5 5 0.07692308 0.378214 0.000000 2.363636
## 6 6 0.11538462 0.342426 2.363636 2.363636
## 7 7 0.19230769 0.335706 2.363636 2.363636
## 8 8 0.23076923 0.319013 0.000000 1.969697
## 9 9 0.30769231 0.282972 2.363636 2.068182
## 10 10 0.42307692 0.275796 0.787879 1.719008
## 11 11 0.50000000 0.250392 1.181818 1.636364
## 12 12 0.61538462 0.244200 0.787879 1.477273
## 13 13 0.69230769 0.225509 0.000000 1.313131
## 14 14 0.84615385 0.198324 0.590909 1.181818
## 15 15 0.88461538 0.188265 0.000000 1.130435
## 16 16 1.00000000 0.159221 0.000000 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 0.389455 1.000000 0.389455
## 2 0.000000 0.000000 1.000000 0.389455
## 3 0.000000 0.000000 1.000000 0.389455
## 4 1.000000 0.387902 1.000000 0.388679
## 5 0.000000 0.000000 1.000000 0.388679
## 6 1.000000 0.349147 1.000000 0.375502
## 7 1.000000 0.335706 1.000000 0.359583
## 8 0.000000 0.319013 0.833333 0.352822
## 9 1.000000 0.301562 0.875000 0.340007
## 10 0.333333 0.279364 0.727273 0.323468
## 11 0.500000 0.260646 0.692308 0.313803
## 12 0.333333 0.246008 0.625000 0.301091
## 13 0.000000 0.236736 0.555556 0.293941
## 14 0.250000 0.206025 0.500000 0.277956
## 15 0.000000 0.198155 0.478261 0.274487
## 16 0.000000 0.170111 0.423077 0.262443
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.090909 0.090909 136.363636 136.363636
## 2 0.000000 0.090909 -100.000000 136.363636
## 3 0.000000 0.090909 -100.000000 136.363636
## 4 0.090909 0.181818 136.363636 136.363636
## 5 0.000000 0.181818 -100.000000 136.363636
## 6 0.090909 0.272727 136.363636 136.363636
## 7 0.181818 0.454545 136.363636 136.363636
## 8 0.000000 0.454545 -100.000000 96.969697
## 9 0.181818 0.636364 136.363636 106.818182
## 10 0.090909 0.727273 -21.212121 71.900826
## 11 0.090909 0.818182 18.181818 63.636364
## 12 0.090909 0.909091 -21.212121 47.727273
## 13 0.000000 0.909091 -100.000000 31.313131
## 14 0.090909 1.000000 -40.909091 18.181818
## 15 0.000000 1.000000 -100.000000 13.043478
## 16 0.000000 1.000000 -100.000000 0.000000
## kolmogorov_smirnov
## 1 0.090909
## 2 0.090909
## 3 0.090909
## 4 0.181818
## 5 0.181818
## 6 0.272727
## 7 0.454545
## 8 0.387879
## 9 0.569697
## 10 0.527273
## 11 0.551515
## 12 0.509091
## 13 0.375758
## 14 0.266667
## 15 0.200000
## 16 0.000000
h2o.auc(performance_h2o)
## [1] 0.8848485
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.281246044647006:
## No Yes Error Rate
## No 14 1 0.066667 =1/15
## Yes 3 8 0.272727 =3/11
## Totals 17 9 0.153846 =4/26
h2o.metric(performance_h2o)
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.389455 0.166667 0.111111 0.333333 0.615385 1.000000 0.090909 1.000000
## 2 0.387902 0.307692 0.217391 0.526316 0.653846 1.000000 0.181818 1.000000
## 3 0.349147 0.428571 0.319149 0.652174 0.692308 1.000000 0.272727 1.000000
## 4 0.335706 0.625000 0.510204 0.806452 0.769231 1.000000 0.454545 1.000000
## 5 0.319013 0.588235 0.500000 0.714286 0.730769 0.833333 0.454545 0.933333
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.233550 0.090909 0.545455 15 10 0 1
## 2 0.337100 0.181818 0.590909 15 9 0 2
## 3 0.421741 0.272727 0.636364 15 8 0 3
## 4 0.569803 0.454545 0.727273 15 6 0 5
## 5 0.454828 0.454545 0.693939 14 6 1 5
## tnr fnr fpr tpr idx
## 1 1.000000 0.909091 0.000000 0.090909 0
## 2 1.000000 0.818182 0.000000 0.181818 1
## 3 1.000000 0.727273 0.000000 0.272727 2
## 4 1.000000 0.545455 0.000000 0.454545 3
## 5 0.933333 0.545455 0.066667 0.454545 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 19 0.207595 0.709677 0.859375 0.604396 0.653846 0.550000 1.000000 0.400000
## 20 0.198324 0.666667 0.833333 0.555556 0.576923 0.500000 1.000000 0.266667
## 21 0.198155 0.647059 0.820896 0.533981 0.538462 0.478261 1.000000 0.200000
## 22 0.178375 0.628571 0.808824 0.514019 0.500000 0.458333 1.000000 0.133333
## 23 0.172737 0.611111 0.797101 0.495495 0.461538 0.440000 1.000000 0.066667
## 24 0.159221 0.594595 0.785714 0.478261 0.423077 0.423077 1.000000 0.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 19 0.469042 0.400000 0.700000 6 0 9 11
## 20 0.365148 0.266667 0.633333 4 0 11 11
## 21 0.309277 0.200000 0.600000 3 0 12 11
## 22 0.247207 0.133333 0.566667 2 0 13 11
## 23 0.171270 0.066667 0.533333 1 0 14 11
## 24 0.000000 0.000000 0.500000 0 0 15 11
## tnr fnr fpr tpr idx
## 19 0.400000 0.000000 0.600000 1.000000 18
## 20 0.266667 0.000000 0.733333 1.000000 19
## 21 0.200000 0.000000 0.800000 1.000000 20
## 22 0.133333 0.000000 0.866667 1.000000 21
## 23 0.066667 0.000000 0.933333 1.000000 22
## 24 0.000000 0.000000 1.000000 1.000000 23