# for Core packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for financial analysis
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.5.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Warning: package 'xts' was built under R version 4.5.3
## Warning: package 'zoo' was built under R version 4.5.3
## Warning: package 'quantmod' was built under R version 4.5.3
## Warning: package 'TTR' was built under R version 4.5.3
## Warning: package 'PerformanceAnalytics' was built under R version 4.5.3
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.12 ──
## ✔ PerformanceAnalytics 2.1.0 ✔ TTR 0.24.4
## ✔ quantmod 0.4.28 ✔ xts 0.14.2
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary() masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for times series
library(timetk)
## Warning: package 'timetk' was built under R version 4.5.3
##
## Attaching package: 'timetk'
##
## The following object is masked from 'package:tidyquant':
##
## FANG
Goal: Apply Matt Dancho’s tutorial to state unemployment initial claims of New England states.
The following is the replication of Matt Dancho’s tutorial on this page
start_date <- "1989-01-01"
symbols_txt <- c("CTICLAIMS", # Connecticut
"MEICLAIMS", # Maine
"MAICLAIMS", # Massachusetts
"NHICLAIMS", # New Hampshire
"RIICLAIMS", # Rhode Island
"VTICLAIMS") # Vermont
claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
mutate(symbol = fct_recode(symbol,
"Connecticut" = "CTICLAIMS",
"Maine" = "MEICLAIMS",
"Massachusetts" = "MAICLAIMS",
"New Hampshire" = "NHICLAIMS",
"Rhode Island" = "RIICLAIMS",
"Vermont" = "VTICLAIMS")) %>%
rename(claims = price)
claims_tbl
## # A tibble: 11,676 × 3
## symbol date claims
## <fct> <date> <int>
## 1 Connecticut 1989-01-07 8345
## 2 Connecticut 1989-01-14 6503
## 3 Connecticut 1989-01-21 3821
## 4 Connecticut 1989-01-28 4663
## 5 Connecticut 1989-02-04 4162
## 6 Connecticut 1989-02-11 4337
## 7 Connecticut 1989-02-18 4079
## 8 Connecticut 1989-02-25 3556
## 9 Connecticut 1989-03-04 3826
## 10 Connecticut 1989-03-11 3515
## # ℹ 11,666 more rows
claims_tbl %>%
plot_time_series(.date_var = date, .value = claims)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the timetk package.
## Please report the issue at
## <https://github.com/business-science/timetk/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Ignoring unknown labels:
## • colour : "Legend"
claims_tbl %>%
group_by(symbol) %>%
plot_time_series(
.date_var = date,
.value = log(claims),
.facet_ncol = 2,
.facet_scales = "free_y",
.color_var = year(date)
)
claims_tbl %>%
plot_time_series(
date, claims,
.color_var = month(date, label = TRUE),
# Returns static ggplot
.interactive = FALSE,
# Customize
.title = "New England Initial Unemployment Claims",
.x_lab = "Date",
.y_lab = "Initial Claims",
.color_lab = "Month"
)
claims_tbl %>% count(symbol)
## # A tibble: 6 × 2
## symbol n
## <fct> <int>
## 1 Connecticut 1946
## 2 Massachusetts 1946
## 3 Maine 1946
## 4 New Hampshire 1946
## 5 Rhode Island 1946
## 6 Vermont 1946
claims_tbl %>%
filter_by_time(.date_var = date, .end_date = "1995") %>%
group_by(symbol) %>%
plot_time_series_boxplot(
.date_var = date,
.value = claims,
.period = "1 year",
.facet_ncol = 2
)
## Ignoring unknown labels:
## • colour : "Legend"
claims_tbl %>%
group_by(symbol) %>%
plot_time_series_regression(
.date_var = date,
.facet_ncol = 2,
.formula = log(claims) ~ as.numeric(date) + month(date, label = TRUE),
.show_summary = FALSE
)
claims_tbl %>%
group_by(symbol) %>%
plot_acf_diagnostics(
date, claims,
.lags = "2 years")
claims_tbl %>%
plot_seasonal_diagnostics(date, claims)
claims_tbl %>% count(symbol)
## # A tibble: 6 × 2
## symbol n
## <fct> <int>
## 1 Connecticut 1946
## 2 Massachusetts 1946
## 3 Maine 1946
## 4 New Hampshire 1946
## 5 Rhode Island 1946
## 6 Vermont 1946
claims_tbl %>%
group_by(symbol) %>%
plot_seasonal_diagnostics(date, claims)
claims_tbl %>%
group_by(symbol) %>%
plot_stl_diagnostics(
date, claims,
.feature_set = c("observed", "season", "trend", "remainder")
)
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
claims_tbl %>%
group_by(symbol) %>%
summarise_by_time(.date_var = date, claims = sum(claims), .by = "quarter") %>%
plot_time_series(
date, claims,
.facet_ncol = 2,
.interactive = FALSE
)
## Ignoring unknown labels:
## • colour : "Legend"
claims_tbl %>%
group_by(symbol) %>%
summarise_by_time(.date_var = date, claims = mean(claims), .by = "month") %>%
plot_time_series(date, claims, .facet_ncol = 2, .interactive = FALSE)
## Ignoring unknown labels:
## • colour : "Legend"
claims_tbl %>%
group_by(symbol) %>%
filter_by_time(.date_var = date,
.start_date = "2008-01",
.end_date = "2010") %>%
plot_time_series(date, claims, .facet_ncol = 2)
## Ignoring unknown labels:
## • colour : "Legend"
claims_tbl %>%
group_by(symbol) %>%
pad_by_time(date, .by = "week", .pad_value = 0)
## # A tibble: 11,676 × 3
## # Groups: symbol [6]
## symbol date claims
## <fct> <date> <int>
## 1 Connecticut 1989-01-07 8345
## 2 Connecticut 1989-01-14 6503
## 3 Connecticut 1989-01-21 3821
## 4 Connecticut 1989-01-28 4663
## 5 Connecticut 1989-02-04 4162
## 6 Connecticut 1989-02-11 4337
## 7 Connecticut 1989-02-18 4079
## 8 Connecticut 1989-02-25 3556
## 9 Connecticut 1989-03-04 3826
## 10 Connecticut 1989-03-11 3515
## # ℹ 11,666 more rows
claims_tbl %>%
head(10) %>%
mutate(rolling_avg_4 = slidify_vec(claims, mean,
.period = 4,
.align = "right",
.partial = TRUE))
## # A tibble: 10 × 4
## symbol date claims rolling_avg_4
## <fct> <date> <int> <dbl>
## 1 Connecticut 1989-01-07 8345 8345
## 2 Connecticut 1989-01-14 6503 7424
## 3 Connecticut 1989-01-21 3821 6223
## 4 Connecticut 1989-01-28 4663 5833
## 5 Connecticut 1989-02-04 4162 4787.
## 6 Connecticut 1989-02-11 4337 4246.
## 7 Connecticut 1989-02-18 4079 4310.
## 8 Connecticut 1989-02-25 3556 4034.
## 9 Connecticut 1989-03-04 3826 3950.
## 10 Connecticut 1989-03-11 3515 3744
# Rolling regressions are easy to implement using `.unlist = FALSE`
lm_roll <- slidify(~ lm(..1 ~ ..2 + ..3), .period = 26,
.unlist = FALSE, .align = "right")
claims_tbl %>%
select(symbol, date, claims) %>%
group_by(symbol) %>%
mutate(lag_claims = lag(claims),
numeric_date = as.numeric(date)) %>%
filter(!is.na(lag_claims)) %>%
# Apply rolling regression
mutate(rolling_lm = lm_roll(claims, lag_claims, numeric_date)) %>%
filter(!is.na(rolling_lm))
## # A tibble: 11,520 × 6
## # Groups: symbol [6]
## symbol date claims lag_claims numeric_date rolling_lm
## <fct> <date> <int> <int> <dbl> <list>
## 1 Connecticut 1989-07-08 7010 5232 7128 <lm>
## 2 Connecticut 1989-07-15 5630 7010 7135 <lm>
## 3 Connecticut 1989-07-22 4590 5630 7142 <lm>
## 4 Connecticut 1989-07-29 4929 4590 7149 <lm>
## 5 Connecticut 1989-08-05 7029 4929 7156 <lm>
## 6 Connecticut 1989-08-12 3704 7029 7163 <lm>
## 7 Connecticut 1989-08-19 4082 3704 7170 <lm>
## 8 Connecticut 1989-08-26 3373 4082 7177 <lm>
## 9 Connecticut 1989-09-02 2902 3373 7184 <lm>
## 10 Connecticut 1989-09-09 2856 2902 7191 <lm>
## # ℹ 11,510 more rows
## Model
library(h2o)
## Warning: package 'h2o' was built under R version 4.5.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.12 ✔ rsample 1.3.2
## ✔ dials 1.4.2 ✔ tailor 0.1.0
## ✔ infer 1.1.0 ✔ tune 2.0.1
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.4.1 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 minutes 29 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 2 years, 4 months and 7 days
## H2O cluster name: H2O_started_from_R_javony_gwl568
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.40 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.5.2 (2025-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (2 years, 4 months and 7 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
claims_model_tbl <- claims_tbl %>%
mutate(
claims_class = if_else(claims > median(claims, na.rm = TRUE), "High", "Low"),
claims_class = as.factor(claims_class),
symbol = as.factor(symbol),
year = lubridate::year(date),
month = lubridate::month(date)
) %>%
select(-date)
set.seed(2345)
split <- initial_split(claims_model_tbl, prop = 0.80, strata = claims_class)
train_tbl <- training(split)
test_tbl <- testing(split)
split.h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 2345)
## | | | 0% | |======================================================================| 100%
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(test_tbl)
## | | | 0% | |======================================================================| 100%
y <- "claims_class"
x <- setdiff(names(train_tbl), y)
models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_models = 5,
exclude_algos = "DeepLearning",
nfolds = 3,
seed = 3456
)
## | | | 0% | |=== | 4%
## 08:57:44.883: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 08:57:44.887: AutoML: XGBoost is not available; skipping it. | |===== | 8% | |========= | 12% | |========== | 15% | |=========== | 16% | |============ | 17% | |============= | 18% | |============== | 20% | |==================== | 29% | |======================================================================| 100%
## Examine the output of h2o.automl
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc logloss aucpr
## 1 StackedEnsemble_BestOfFamily_1_AutoML_2_20260428_85744 1 1.666746e-04 1
## 2 DRF_1_AutoML_2_20260428_85744 1 3.225557e-03 1
## 3 GBM_1_AutoML_2_20260428_85744 1 2.416923e-05 1
## 4 GBM_3_AutoML_2_20260428_85744 1 3.563682e-12 1
## 5 StackedEnsemble_AllModels_1_AutoML_2_20260428_85744 1 1.033902e-06 1
## 6 GBM_2_AutoML_2_20260428_85744 1 1.392626e-10 1
## mean_per_class_error rmse mse
## 1 0 1.813247e-03 3.287864e-06
## 2 0 2.087159e-02 4.356234e-04
## 3 0 2.065652e-04 4.266918e-08
## 4 0 6.019183e-11 3.623057e-21
## 5 0 1.829544e-06 3.347232e-12
## 6 0 4.540186e-09 2.061329e-17
##
## [7 rows x 7 columns]
models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: stackedensemble
## Model ID: StackedEnsemble_BestOfFamily_1_AutoML_2_20260428_85744
## Model Summary for Stacked Ensemble:
## key value
## 1 Stacking strategy cross_validation
## 2 Number of base models (used / total) 2/3
## 3 # GBM base models (used / total) 1/1
## 4 # DRF base models (used / total) 1/1
## 5 # GLM base models (used / total) 0/1
## 6 Metalearner algorithm GLM
## 7 Metalearner fold assignment scheme Random
## 8 Metalearner nfolds 3
## 9 Metalearner fold_column NA
## 10 Custom metalearner hyperparameters None
##
##
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 2.916358e-07
## RMSE: 0.0005400331
## LogLoss: 5.164982e-05
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 1
## Gini: 1
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## High Low Error Rate
## High 3961 0 0.000000 =0/3961
## Low 0 3997 0.000000 =0/3997
## Totals 3961 3997 0.000000 =0/7958
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.991413 1.000000 95
## 2 max f2 0.991413 1.000000 95
## 3 max f0point5 0.991413 1.000000 95
## 4 max accuracy 0.991413 1.000000 95
## 5 max precision 1.000000 1.000000 0
## 6 max recall 0.991413 1.000000 95
## 7 max specificity 1.000000 1.000000 0
## 8 max absolute_mcc 0.991413 1.000000 95
## 9 max min_per_class_accuracy 0.991413 1.000000 95
## 10 max mean_per_class_accuracy 0.991413 1.000000 95
## 11 max tns 1.000000 3961.000000 0
## 12 max fns 1.000000 102.000000 0
## 13 max fps 0.000000 3961.000000 399
## 14 max tps 0.991413 3997.000000 95
## 15 max tnr 1.000000 1.000000 0
## 16 max fnr 1.000000 0.025519 0
## 17 max fpr 0.000000 1.000000 399
## 18 max tpr 0.991413 1.000000 95
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on validation data. **
##
## MSE: 1.401988e-05
## RMSE: 0.003744313
## LogLoss: 0.000268815
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 1
## Gini: 1
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## High Low Error Rate
## High 709 0 0.000000 =0/709
## Low 0 673 0.000000 =0/673
## Totals 709 673 0.000000 =0/1382
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.990220 1.000000 25
## 2 max f2 0.990220 1.000000 25
## 3 max f0point5 0.990220 1.000000 25
## 4 max accuracy 0.990220 1.000000 25
## 5 max precision 1.000000 1.000000 0
## 6 max recall 0.990220 1.000000 25
## 7 max specificity 1.000000 1.000000 0
## 8 max absolute_mcc 0.990220 1.000000 25
## 9 max min_per_class_accuracy 0.990220 1.000000 25
## 10 max mean_per_class_accuracy 0.990220 1.000000 25
## 11 max tns 1.000000 709.000000 0
## 12 max fns 1.000000 26.000000 0
## 13 max fps 0.000000 709.000000 399
## 14 max tps 0.990220 673.000000 25
## 15 max tnr 1.000000 1.000000 0
## 16 max fnr 1.000000 0.038633 0
## 17 max fpr 0.000000 1.000000 399
## 18 max tpr 0.990220 1.000000 25
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 3-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.0003900759
## RMSE: 0.01975034
## LogLoss: 0.001823021
## Mean Per-Class Error: 0.0002501876
## AUC: 0.9999996
## AUCPR: 0.9999996
## Gini: 0.9999992
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## High Low Error Rate
## High 3961 0 0.000000 =0/3961
## Low 2 3995 0.000500 =2/3997
## Totals 3963 3995 0.000251 =2/7958
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.396117 0.999750 144
## 2 max f2 0.244490 0.999800 150
## 3 max f0point5 0.396117 0.999900 144
## 4 max accuracy 0.396117 0.999749 144
## 5 max precision 1.000000 1.000000 0
## 6 max recall 0.244490 1.000000 150
## 7 max specificity 1.000000 1.000000 0
## 8 max absolute_mcc 0.396117 0.999497 144
## 9 max min_per_class_accuracy 0.396117 0.999500 144
## 10 max mean_per_class_accuracy 0.396117 0.999750 144
## 11 max tns 1.000000 3961.000000 0
## 12 max fns 1.000000 197.000000 0
## 13 max fps 0.000000 3961.000000 399
## 14 max tps 0.244490 3997.000000 150
## 15 max tnr 1.000000 1.000000 0
## 16 max fnr 1.000000 0.049287 0
## 17 max fpr 0.000000 1.000000 399
## 18 max tpr 0.244490 1.000000 150
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.999873 0.000221 1.000000 1.000000 0.999618
## auc 1.000000 0.000000 1.000000 1.000000 0.999999
## err 0.000127 0.000221 0.000000 0.000000 0.000382
## err_count 0.333333 0.577350 0.000000 0.000000 1.000000
## f0point5 0.999800 0.000347 1.000000 1.000000 0.999400
##
## ---
## mean sd cv_1_valid cv_2_valid cv_3_valid
## precision 0.999750 0.000433 1.000000 1.000000 0.999250
## r2 0.998436 0.000456 0.998800 0.998583 0.997925
## recall 1.000000 0.000000 1.000000 1.000000 1.000000
## residual_deviance 9.671735 1.603306 9.527632 8.145347 11.342228
## rmse 0.019638 0.002818 0.017317 0.018824 0.022774
## specificity 0.999741 0.000449 1.000000 1.000000 0.999222
## Save and Load
?h2o.getModel
## starting httpd help server ... done
?h2o.saveModel
?h2o.loadModel
dir.create("h2o_models", showWarnings = FALSE)
h2o.saveModel(
object = models_h2o@leader,
path = "h2o_models/",
force = TRUE
)
## [1] "C:\\Users\\javony\\Desktop\\PSU_DATA3100\\h2o_models\\StackedEnsemble_BestOfFamily_1_AutoML_2_20260428_85744"
best_model <- models_h2o@leader
## Make predictions
predictions <- h2o.predict(best_model, newdata = test_h2o)
## | | | 0% | |======================================================================| 100%
predictions_tbl <- predictions %>%
as_tibble()
predictions_tbl %>%
bind_cols(test_tbl)
## # A tibble: 2,336 × 8
## predict High Low symbol claims claims_class year month
## <fct> <dbl> <dbl> <fct> <int> <fct> <dbl> <dbl>
## 1 High 1.000 1.19e- 9 Connecticut 4162 High 1989 2
## 2 High 1.000 1.58e- 9 Connecticut 2886 High 1989 3
## 3 High 1.000 1.47e- 9 Connecticut 2694 High 1989 4
## 4 High 1.000 1.43e- 9 Connecticut 3224 High 1989 5
## 5 High 1.000 1.38e- 9 Connecticut 2663 High 1989 6
## 6 High 1.000 1.26e- 9 Connecticut 5630 High 1989 7
## 7 High 1.000 1.21e- 9 Connecticut 7029 High 1989 8
## 8 High 1.000 1.20e- 9 Connecticut 3025 High 1989 9
## 9 High 1.000 1.29e- 9 Connecticut 3454 High 1989 10
## 10 High 1.000 9.69e-10 Connecticut 4392 High 1990 5
## # ℹ 2,326 more rows
## Evaluate model
?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "StackedEnsemble_BestOfFamily_1_AutoML_2_20260428_85744"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/StackedEnsemble_BestOfFamily_1_AutoML_2_20260428_85744"
##
##
## $model_checksum
## [1] "-1123968970341071515"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_8b64_3"
##
##
## $frame_checksum
## [1] 5.300821e+15
##
## $description
## NULL
##
## $scoring_time
## [1] 1.777381e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 3.287864e-06
##
## $RMSE
## [1] 0.001813247
##
## $nobs
## [1] 2336
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.9999868
##
## $logloss
## [1] 0.0001666746
##
## $AUC
## [1] 1
##
## $pr_auc
## [1] 1
##
## $Gini
## [1] 1
##
## $mean_per_class_error
## [1] 0
##
## $domain
## [1] "High" "Low"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## High Low Error Rate
## High 1168 0 0.0000 = 0 / 1,168
## Low 0 1168 0.0000 = 0 / 1,168
## Totals 1168 1168 0.0000 = 0 / 2,336
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 1.000000 0.979467 0.967547 0.991684 0.979880 1.000000 0.959760 1.000000
## 2 0.999991 0.979913 0.968243 0.991867 0.980308 1.000000 0.960616 1.000000
## 3 0.999991 0.980358 0.968939 0.992049 0.980736 1.000000 0.961473 1.000000
## 4 0.999969 0.980803 0.969634 0.992232 0.981164 1.000000 0.962329 1.000000
## 5 0.999966 0.981247 0.970329 0.992414 0.981592 1.000000 0.963185 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.960538 0.959760 0.979880 1168 47 0 1121
## 2 0.961362 0.960616 0.980308 1168 46 0 1122
## 3 0.962187 0.961473 0.980736 1168 45 0 1123
## 4 0.963012 0.962329 0.981164 1168 44 0 1124
## 5 0.963838 0.963185 0.981592 1168 43 0 1125
## tnr fnr fpr tpr idx
## 1 1.000000 0.040240 0.000000 0.959760 0
## 2 1.000000 0.039384 0.000000 0.960616 1
## 3 1.000000 0.038527 0.000000 0.961473 2
## 4 1.000000 0.037671 0.000000 0.962329 3
## 5 1.000000 0.036815 0.000000 0.963185 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.000000 0.672811 0.837156 0.562404 0.513699 0.506944 1.000000
## 396 0.000000 0.671072 0.836077 0.560461 0.509846 0.504972 1.000000
## 397 0.000000 0.669149 0.834882 0.558317 0.505565 0.502798 1.000000
## 398 0.000000 0.668575 0.834524 0.557678 0.504281 0.502150 1.000000
## 399 0.000000 0.668001 0.834167 0.557039 0.502997 0.501503 1.000000
## 400 0.000000 0.666667 0.833333 0.555556 0.500000 0.500000 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.027397 0.117851 0.027397 0.513699 32
## 396 0.019692 0.099719 0.019692 0.509846 23
## 397 0.011130 0.074808 0.011130 0.505565 13
## 398 0.008562 0.065568 0.008562 0.504281 10
## 399 0.005993 0.054823 0.005993 0.502997 7
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 0 1136 1168 0.027397 0.000000 0.972603 1.000000 394
## 396 0 1145 1168 0.019692 0.000000 0.980308 1.000000 395
## 397 0 1155 1168 0.011130 0.000000 0.988870 1.000000 396
## 398 0 1158 1168 0.008562 0.000000 0.991438 1.000000 397
## 399 0 1161 1168 0.005993 0.000000 0.994007 1.000000 398
## 400 0 1168 1168 0.000000 0.000000 1.000000 1.000000 399
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.980389 1.000000 47
## 2 max f2 0.980389 1.000000 47
## 3 max f0point5 0.980389 1.000000 47
## 4 max accuracy 0.980389 1.000000 47
## 5 max precision 1.000000 1.000000 0
## 6 max recall 0.980389 1.000000 47
## 7 max specificity 1.000000 1.000000 0
## 8 max absolute_mcc 0.980389 1.000000 47
## 9 max min_per_class_accuracy 0.980389 1.000000 47
## 10 max mean_per_class_accuracy 0.980389 1.000000 47
## 11 max tns 1.000000 1168.000000 0
## 12 max fns 1.000000 47.000000 0
## 13 max fps 0.000000 1168.000000 399
## 14 max tps 0.980389 1168.000000 47
## 15 max tnr 1.000000 1.000000 0
## 16 max fnr 1.000000 0.040240 0
## 17 max fpr 0.000000 1.000000 399
## 18 max tpr 0.980389 1.000000 47
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 50.00 %, avg score: 50.01 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.01027397 1.000000 2.000000 2.000000
## 2 2 0.02011986 1.000000 2.000000 2.000000
## 3 3 0.03039384 1.000000 2.000000 2.000000
## 4 4 0.04023973 1.000000 2.000000 2.000000
## 5 5 0.05008562 1.000000 2.000000 2.000000
## 6 6 0.10017123 1.000000 2.000000 2.000000
## 7 7 0.15025685 1.000000 2.000000 2.000000
## 8 8 0.20034247 1.000000 2.000000 2.000000
## 9 9 0.30008562 1.000000 2.000000 2.000000
## 10 10 0.40025685 1.000000 2.000000 2.000000
## 11 11 0.50000000 0.521434 2.000000 2.000000
## 12 12 0.60017123 0.000000 0.000000 1.666191
## 13 13 0.69991438 0.000000 0.000000 1.428746
## 14 14 0.80008562 0.000000 0.000000 1.249866
## 15 15 0.89982877 0.000000 0.000000 1.111323
## 16 16 1.00000000 0.000000 0.000000 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 1.000000 1.000000 1.000000
## 2 1.000000 1.000000 1.000000 1.000000
## 3 1.000000 1.000000 1.000000 1.000000
## 4 1.000000 1.000000 1.000000 1.000000
## 5 1.000000 1.000000 1.000000 1.000000
## 6 1.000000 1.000000 1.000000 1.000000
## 7 1.000000 1.000000 1.000000 1.000000
## 8 1.000000 1.000000 1.000000 1.000000
## 9 1.000000 1.000000 1.000000 1.000000
## 10 1.000000 1.000000 1.000000 1.000000
## 11 1.000000 0.999430 1.000000 0.999886
## 12 0.000000 0.001079 0.833096 0.833181
## 13 0.000000 0.000000 0.714373 0.714446
## 14 0.000000 0.000000 0.624933 0.624997
## 15 0.000000 0.000000 0.555661 0.555718
## 16 0.000000 0.000000 0.500000 0.500051
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.020548 0.020548 100.000000 100.000000
## 2 0.019692 0.040240 100.000000 100.000000
## 3 0.020548 0.060788 100.000000 100.000000
## 4 0.019692 0.080479 100.000000 100.000000
## 5 0.019692 0.100171 100.000000 100.000000
## 6 0.100171 0.200342 100.000000 100.000000
## 7 0.100171 0.300514 100.000000 100.000000
## 8 0.100171 0.400685 100.000000 100.000000
## 9 0.199486 0.600171 100.000000 100.000000
## 10 0.200342 0.800514 100.000000 100.000000
## 11 0.199486 1.000000 100.000000 100.000000
## 12 0.000000 1.000000 -100.000000 66.619116
## 13 0.000000 1.000000 -100.000000 42.874618
## 14 0.000000 1.000000 -100.000000 24.986624
## 15 0.000000 1.000000 -100.000000 11.132255
## 16 0.000000 1.000000 -100.000000 0.000000
## kolmogorov_smirnov
## 1 0.020548
## 2 0.040240
## 3 0.060788
## 4 0.080479
## 5 0.100171
## 6 0.200342
## 7 0.300514
## 8 0.400685
## 9 0.600171
## 10 0.800514
## 11 1.000000
## 12 0.799658
## 13 0.600171
## 14 0.399829
## 15 0.200342
## 16 0.000000
##
## $residual_deviance
## [1] 0.7787036
##
## $null_deviance
## [1] 3238.431
##
## $AIC
## [1] 6.778704
##
## $loglikelihood
## [1] 0
##
## $null_degrees_of_freedom
## [1] 2335
##
## $residual_degrees_of_freedom
## [1] 2333
h2o.auc(performance_h2o)
## [1] 1
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.980388689091854:
## High Low Error Rate
## High 1168 0 0.000000 =0/1168
## Low 0 1168 0.000000 =0/1168
## Totals 1168 1168 0.000000 =0/2336
h2o.metric(performance_h2o)
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 1.000000 0.979467 0.967547 0.991684 0.979880 1.000000 0.959760 1.000000
## 2 0.999991 0.979913 0.968243 0.991867 0.980308 1.000000 0.960616 1.000000
## 3 0.999991 0.980358 0.968939 0.992049 0.980736 1.000000 0.961473 1.000000
## 4 0.999969 0.980803 0.969634 0.992232 0.981164 1.000000 0.962329 1.000000
## 5 0.999966 0.981247 0.970329 0.992414 0.981592 1.000000 0.963185 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.960538 0.959760 0.979880 1168 47 0 1121
## 2 0.961362 0.960616 0.980308 1168 46 0 1122
## 3 0.962187 0.961473 0.980736 1168 45 0 1123
## 4 0.963012 0.962329 0.981164 1168 44 0 1124
## 5 0.963838 0.963185 0.981592 1168 43 0 1125
## tnr fnr fpr tpr idx
## 1 1.000000 0.040240 0.000000 0.959760 0
## 2 1.000000 0.039384 0.000000 0.960616 1
## 3 1.000000 0.038527 0.000000 0.961473 2
## 4 1.000000 0.037671 0.000000 0.962329 3
## 5 1.000000 0.036815 0.000000 0.963185 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.000000 0.672811 0.837156 0.562404 0.513699 0.506944 1.000000
## 396 0.000000 0.671072 0.836077 0.560461 0.509846 0.504972 1.000000
## 397 0.000000 0.669149 0.834882 0.558317 0.505565 0.502798 1.000000
## 398 0.000000 0.668575 0.834524 0.557678 0.504281 0.502150 1.000000
## 399 0.000000 0.668001 0.834167 0.557039 0.502997 0.501503 1.000000
## 400 0.000000 0.666667 0.833333 0.555556 0.500000 0.500000 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.027397 0.117851 0.027397 0.513699 32
## 396 0.019692 0.099719 0.019692 0.509846 23
## 397 0.011130 0.074808 0.011130 0.505565 13
## 398 0.008562 0.065568 0.008562 0.504281 10
## 399 0.005993 0.054823 0.005993 0.502997 7
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 0 1136 1168 0.027397 0.000000 0.972603 1.000000 394
## 396 0 1145 1168 0.019692 0.000000 0.980308 1.000000 395
## 397 0 1155 1168 0.011130 0.000000 0.988870 1.000000 396
## 398 0 1158 1168 0.008562 0.000000 0.991438 1.000000 397
## 399 0 1161 1168 0.005993 0.000000 0.994007 1.000000 398
## 400 0 1168 1168 0.000000 0.000000 1.000000 1.000000 399