library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(timetk)
## Warning: package 'timetk' was built under R version 4.5.3
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.5.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Warning: package 'xts' was built under R version 4.5.3
## Warning: package 'zoo' was built under R version 4.5.3
## Warning: package 'quantmod' was built under R version 4.5.3
## Warning: package 'TTR' was built under R version 4.5.3
## Warning: package 'PerformanceAnalytics' was built under R version 4.5.3
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.12 ──
## ✔ PerformanceAnalytics 2.1.0 ✔ TTR 0.24.4
## ✔ quantmod 0.4.28 ✔ xts 0.14.2
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary() masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'tidyquant'
##
##
## The following object is masked from 'package:timetk':
##
## FANG
Plotting time series
taylor_30_min
## # A tibble: 4,032 × 2
## date value
## <dttm> <dbl>
## 1 2000-06-05 00:00:00 22262
## 2 2000-06-05 00:30:00 21756
## 3 2000-06-05 01:00:00 22247
## 4 2000-06-05 01:30:00 22759
## 5 2000-06-05 02:00:00 22549
## 6 2000-06-05 02:30:00 22313
## 7 2000-06-05 03:00:00 22128
## 8 2000-06-05 03:30:00 21860
## 9 2000-06-05 04:00:00 21751
## 10 2000-06-05 04:30:00 21336
## # ℹ 4,022 more rows
taylor_30_min %>%
plot_time_series(.date_var = date, .value = value)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the timetk package.
## Please report the issue at
## <https://github.com/business-science/timetk/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Ignoring unknown labels:
## • colour : "Legend"
m4_hourly %>%
group_by(id) %>%
plot_time_series(
.date_var = date,
.value = log(value),
.facet_ncol = 2,
.facet_scales = "free",
.color_var = week(date)
)
Static ggpolt2 Visualization & customizations
taylor_30_min %>%
plot_time_series(
date, value,
.color_var = month(date, label = TRUE),
# Returns static ggplot
.interactive = FALSE,
# Customize
.title = "Taylor's MegaWatt Data",
.x_lab = "Date (30-min intervals)",
.y_lab = "Energy Demand (MW)",
.color_lab = "Month"
)

Box plots
m4_monthly %>% count(id)
## # A tibble: 4 × 2
## id n
## <fct> <int>
## 1 M1 469
## 2 M2 469
## 3 M750 306
## 4 M1000 330
m4_monthly %>%
filter_by_time(.date_var = date, .end_date = "1976") %>%
group_by(id) %>%
plot_time_series_boxplot(
.date_var = date,
.value = value,
.period = "1 year",
.facet_ncol = 2
)
## Ignoring unknown labels:
## • colour : "Legend"
Regression plots
m4_monthly %>%
group_by(id) %>%
plot_time_series_regression(
.date_var = date,
.facet_ncol = 2,
.formula = log(value) ~ as.numeric(date) + month(date, label = TRUE),
.show_summary = FALSE
)
Plotting Seasonality and Correlation
Correlation Plots
m4_hourly %>%
group_by(id) %>%
plot_acf_diagnostics(
date, value,
.lags = "7 days")
walmart_sales_weekly %>%
group_by(id) %>%
plot_acf_diagnostics(
Date, Weekly_Sales,
.ccf_vars = c(Temperature, Fuel_Price),
.lags = "3 months"
)
Seasonality
taylor_30_min %>%
plot_seasonal_diagnostics(date, value)
m4_hourly %>% count(id)
## # A tibble: 4 × 2
## id n
## <fct> <int>
## 1 H10 700
## 2 H50 700
## 3 H150 700
## 4 H410 960
m4_hourly %>%
group_by(id) %>%
plot_seasonal_diagnostics(date, value)
STL Diagnostics
m4_hourly %>%
group_by(id) %>%
plot_stl_diagnostics(
date, value,
.feature_set = c("observed", "season", "trend", "remainder")
)
## frequency = 24 observations per 1 day
## trend = 336 observations per 14 days
## frequency = 24 observations per 1 day
## trend = 336 observations per 14 days
## frequency = 24 observations per 1 day
## trend = 336 observations per 14 days
## frequency = 24 observations per 1 day
## trend = 336 observations per 14 days
Time Series Data Wrangling
Summarize by Time
FANG %>%
group_by(symbol) %>%
summarise_by_time(.date_var = date, volume = sum(volume), .by = "quarter") %>%
plot_time_series(
date, volume,
.facet_ncol = 2,
.interactive = FALSE
)
## Ignoring unknown labels:
## • colour : "Legend"

FANG %>%
group_by(symbol) %>%
summarise_by_time(.date_var = date, adjusted = mean(adjusted), .by = "month") %>%
plot_time_series(date, adjusted, .facet_ncol = 2, .interactive = FALSE)
## Ignoring unknown labels:
## • colour : "Legend"

Filter By Time
FANG %>%
group_by(symbol) %>%
filter_by_time(.date_var = date,
.start_date = "2013-09",
.end_date = "2013") %>%
plot_time_series(date, adjusted, .facet_ncol = 2)
## Ignoring unknown labels:
## • colour : "Legend"
Padding Data
FANG %>%
group_by(symbol) %>%
pad_by_time(date, .by = "day", .pad_value = 0)
## # A tibble: 5,836 × 8
## # Groups: symbol [4]
## symbol date open high low close volume adjusted
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AMZN 2013-01-02 256. 258. 253. 257. 3271000 257.
## 2 AMZN 2013-01-03 257. 261. 256. 258. 2750900 258.
## 3 AMZN 2013-01-04 258. 260. 257. 259. 1874200 259.
## 4 AMZN 2013-01-05 0 0 0 0 0 0
## 5 AMZN 2013-01-06 0 0 0 0 0 0
## 6 AMZN 2013-01-07 263. 270. 263. 268. 4910000 268.
## 7 AMZN 2013-01-08 267. 269. 264. 266. 3010700 266.
## 8 AMZN 2013-01-09 268. 270. 265. 266. 2265600 266.
## 9 AMZN 2013-01-10 269. 269. 262. 265. 2863400 265.
## 10 AMZN 2013-01-11 265. 268. 264. 268. 2413300 268.
## # ℹ 5,826 more rows
Sliding (Rolling) Calculation
FANG %>%
head(10) %>%
mutate(rolling_avg_2 = slidify_vec(adjusted, mean,
.period = 2,
.align = "right",
.partial = TRUE))
## # A tibble: 10 × 9
## symbol date open high low close volume adjusted rolling_avg_2
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 META 2013-01-02 27.4 28.2 27.4 28 69846400 28 28
## 2 META 2013-01-03 27.9 28.5 27.6 27.8 63140600 27.8 27.9
## 3 META 2013-01-04 28.0 28.9 27.8 28.8 72715400 28.8 28.3
## 4 META 2013-01-07 28.7 29.8 28.6 29.4 83781800 29.4 29.1
## 5 META 2013-01-08 29.5 29.6 28.9 29.1 45871300 29.1 29.2
## 6 META 2013-01-09 29.7 30.6 29.5 30.6 104787700 30.6 29.8
## 7 META 2013-01-10 30.6 31.5 30.3 31.3 95316400 31.3 30.9
## 8 META 2013-01-11 31.3 32.0 31.1 31.7 89598000 31.7 31.5
## 9 META 2013-01-14 32.1 32.2 30.6 31.0 98892800 31.0 31.3
## 10 META 2013-01-15 30.6 31.7 29.9 30.1 173242600 30.1 30.5
# Rolling regressions are easy to implement using `.unlist = FALSE`
lm_roll <- slidify(~ lm(..1 ~ ..2 + ..3), .period = 90,
.unlist = FALSE, .align = "right")
FANG %>%
select(symbol, date, adjusted, volume) %>%
group_by(symbol) %>%
mutate(numeric_date = as.numeric(date)) %>%
# Apply rolling regression
mutate(rolling_lm = lm_roll(adjusted, volume, numeric_date)) %>%
filter(!is.na(rolling_lm))
## # A tibble: 3,676 × 6
## # Groups: symbol [4]
## symbol date adjusted volume numeric_date rolling_lm
## <chr> <date> <dbl> <dbl> <dbl> <list>
## 1 META 2013-05-10 26.7 30847100 15835 <lm>
## 2 META 2013-05-13 26.8 29068800 15838 <lm>
## 3 META 2013-05-14 27.1 24930300 15839 <lm>
## 4 META 2013-05-15 26.6 30299800 15840 <lm>
## 5 META 2013-05-16 26.1 35499100 15841 <lm>
## 6 META 2013-05-17 26.2 29462700 15842 <lm>
## 7 META 2013-05-20 25.8 42402900 15845 <lm>
## 8 META 2013-05-21 25.7 26261300 15846 <lm>
## 9 META 2013-05-22 25.2 45314500 15847 <lm>
## 10 META 2013-05-23 25.1 37663100 15848 <lm>
## # ℹ 3,666 more rows
Model
library(h2o)
## Warning: package 'h2o' was built under R version 4.5.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.12 ✔ rsample 1.3.2
## ✔ dials 1.4.2 ✔ tailor 0.1.0
## ✔ infer 1.1.0 ✔ tidyr 1.3.2
## ✔ modeldata 1.5.1 ✔ tune 2.0.1
## ✔ parsnip 1.4.1 ✔ workflows 1.3.0
## ✔ purrr 1.2.1 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ recipes::step() masks stats::step()
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 11 minutes 42 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 2 years, 4 months and 4 days
## H2O cluster name: H2O_started_from_R_javony_bdq039
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.09 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.5.2 (2025-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (2 years, 4 months and 4 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
fang_tbl <- FANG %>%
select(symbol, date, open, high, low, close, volume, adjusted) %>%
mutate(
volume_class = if_else(volume > median(volume, na.rm = TRUE), "High", "Low"),
volume_class = as.factor(volume_class),
symbol = as.factor(symbol),
year = lubridate::year(date),
month = lubridate::month(date)
) %>%
select(-date)
set.seed(2345)
split <- initial_split(fang_tbl, prop = 0.80, strata = volume_class)
train_tbl <- training(split)
test_tbl <- testing(split)
split.h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 2345)
## | | | 0% | |======================================================================| 100%
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(test_tbl)
## | | | 0% | |======================================================================| 100%
y <- "volume_class"
x <- setdiff(names(train_tbl), y)
models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
# max_runtime_secs = 30,
max_models = 5,
exclude_algos = "DeepLearning",
nfolds = 3,
seed = 3456
)
## | | | 0% | |=== | 4%
## 21:58:27.490: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 21:58:27.499: AutoML: XGBoost is not available; skipping it. | |========= | 12% | |============ | 17% | |============== | 20% | |======================================================================| 100%
## Examine the output of h2o.automl
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
## model_id auc logloss
## 1 GBM_2_AutoML_3_20260424_215827 1 0.0007287438
## 2 GBM_1_AutoML_3_20260424_215827 1 0.0007818663
## 3 DRF_1_AutoML_3_20260424_215827 1 0.0062555239
## 4 StackedEnsemble_AllModels_1_AutoML_3_20260424_215827 1 0.0009422223
## 5 StackedEnsemble_BestOfFamily_1_AutoML_3_20260424_215827 1 0.0010203401
## 6 GBM_3_AutoML_3_20260424_215827 1 0.0006904773
## aucpr mean_per_class_error rmse mse
## 1 1 0 0.010023800 1.004766e-04
## 2 1 0 0.005919087 3.503559e-05
## 3 1 0 0.029342937 8.610080e-04
## 4 1 0 0.010577578 1.118852e-04
## 5 1 0 0.011385963 1.296402e-04
## 6 1 0 0.009775758 9.556544e-05
##
## [7 rows x 7 columns]
models_h2o@leader
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model ID: GBM_2_AutoML_3_20260424_215827
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 111 111 56788 4
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 7 6.96396 7 68 35.98198
##
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 7.112633e-06
## RMSE: 0.002666952
## LogLoss: 0.0001904919
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 1
## Gini: 1
## R^2: 0.9999715
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## High Low Error Rate
## High 1371 0 0.000000 =0/1371
## Low 0 1368 0.000000 =0/1368
## Totals 1371 1368 0.000000 =0/2739
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.913077 1.000000 216
## 2 max f2 0.913077 1.000000 216
## 3 max f0point5 0.913077 1.000000 216
## 4 max accuracy 0.913077 1.000000 216
## 5 max precision 0.999993 1.000000 0
## 6 max recall 0.913077 1.000000 216
## 7 max specificity 0.999993 1.000000 0
## 8 max absolute_mcc 0.913077 1.000000 216
## 9 max min_per_class_accuracy 0.913077 1.000000 216
## 10 max mean_per_class_accuracy 0.913077 1.000000 216
## 11 max tns 0.999993 1371.000000 0
## 12 max fns 0.999993 1366.000000 0
## 13 max fps 0.000009 1371.000000 399
## 14 max tps 0.913077 1368.000000 216
## 15 max tnr 0.999993 1.000000 0
## 16 max fnr 0.999993 0.998538 0
## 17 max fpr 0.000009 1.000000 399
## 18 max tpr 0.913077 1.000000 216
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
##
## MSE: 5.983943e-08
## RMSE: 0.000244621
## LogLoss: 6.188569e-05
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 1
## Gini: 1
## R^2: 0.9999998
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## High Low Error Rate
## High 241 0 0.000000 =0/241
## Low 0 244 0.000000 =0/244
## Totals 241 244 0.000000 =0/485
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.997992 1.000000 184
## 2 max f2 0.997992 1.000000 184
## 3 max f0point5 0.997992 1.000000 184
## 4 max accuracy 0.997992 1.000000 184
## 5 max precision 0.999993 1.000000 0
## 6 max recall 0.997992 1.000000 184
## 7 max specificity 0.999993 1.000000 0
## 8 max absolute_mcc 0.997992 1.000000 184
## 9 max min_per_class_accuracy 0.997992 1.000000 184
## 10 max mean_per_class_accuracy 0.997992 1.000000 184
## 11 max tns 0.999993 241.000000 0
## 12 max fns 0.999993 243.000000 0
## 13 max fps 0.000009 241.000000 399
## 14 max tps 0.997992 244.000000 184
## 15 max tnr 0.999993 1.000000 0
## 16 max fnr 0.999993 0.995902 0
## 17 max fpr 0.000009 1.000000 399
## 18 max tpr 0.997992 1.000000 184
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 3-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.001342915
## RMSE: 0.03664581
## LogLoss: 0.004200538
## Mean Per-Class Error: 0.001094092
## AUC: 0.999992
## AUCPR: 0.999992
## Gini: 0.999984
## R^2: 0.9946283
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## High Low Error Rate
## High 1368 3 0.002188 =3/1371
## Low 0 1368 0.000000 =0/1368
## Totals 1368 1371 0.001095 =3/2739
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.081349 0.998905 218
## 2 max f2 0.081349 0.999562 218
## 3 max f0point5 0.973794 0.998972 208
## 4 max accuracy 0.081349 0.998905 218
## 5 max precision 0.999999 1.000000 0
## 6 max recall 0.081349 1.000000 218
## 7 max specificity 0.999999 1.000000 0
## 8 max absolute_mcc 0.081349 0.997812 218
## 9 max min_per_class_accuracy 0.486473 0.998538 215
## 10 max mean_per_class_accuracy 0.081349 0.998906 218
## 11 max tns 0.999999 1371.000000 0
## 12 max fns 0.999999 1306.000000 0
## 13 max fps 0.000001 1371.000000 399
## 14 max tps 0.081349 1368.000000 218
## 15 max tnr 0.999999 1.000000 0
## 16 max fnr 0.999999 0.954678 0
## 17 max fpr 0.000001 1.000000 399
## 18 max tpr 0.081349 1.000000 218
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.999270 0.000632 1.000000 0.998905 0.998905
## auc 0.999995 0.000005 1.000000 0.999995 0.999990
## err 0.000730 0.000632 0.000000 0.001095 0.001095
## err_count 0.666667 0.577350 0.000000 1.000000 1.000000
## f0point5 0.998832 0.001011 1.000000 0.998249 0.998249
## f1 0.999270 0.000632 1.000000 0.998905 0.998905
## f2 0.999708 0.000253 1.000000 0.999562 0.999562
## lift_top_group 2.002193 NA 2.002193 2.002193 2.002193
## logloss 0.004357 0.001944 0.002848 0.003672 0.006550
## max_per_class_error 0.001459 0.001263 0.000000 0.002188 0.002188
## mcc 0.998541 0.001263 1.000000 0.997812 0.997812
## mean_per_class_accuracy 0.999271 0.000632 1.000000 0.998906 0.998906
## mean_per_class_error 0.000729 0.000632 0.000000 0.001094 0.001094
## mse 0.001369 0.000516 0.000935 0.001233 0.001940
## pr_auc 0.999995 0.000005 1.000000 0.999995 0.999990
## precision 0.998541 0.001263 1.000000 0.997812 0.997812
## r2 0.994522 0.002065 0.996260 0.995067 0.992240
## recall 1.000000 0.000000 1.000000 1.000000 1.000000
## rmse 0.036581 0.006852 0.030579 0.035118 0.044046
## specificity 0.998541 0.001263 1.000000 0.997812 0.997812
Save and Load
?h2o.getModel
## starting httpd help server ... done
?h2o.saveModel
?h2o.loadModel
dir.create("h2o_models", showWarnings = FALSE)
h2o.saveModel(
object = models_h2o@leader,
path = "h2o_models/",
force = TRUE
)
## [1] "C:\\Users\\javony\\Desktop\\PSU_DATA3100\\11_module13\\h2o_models\\GBM_2_AutoML_3_20260424_215827"
best_model <- models_h2o@leader
Make predictions
predictions <- h2o.predict(best_model, newdata = test_h2o)
## | | | 0% | |======================================================================| 100%
predictions_tbl <- predictions %>%
as_tibble()
predictions_tbl %>%
bind_cols(test_tbl)
## # A tibble: 808 × 13
## predict High Low symbol open high low close volume adjusted
## <fct> <dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 High 1.000 0.00000988 META 27.4 28.2 27.4 28 69846400 28
## 2 High 1.000 0.00000988 META 32.1 32.2 30.6 31.0 98892800 31.0
## 3 High 1.000 0.00000988 META 31.1 31.5 30.8 30.8 48899800 30.8
## 4 High 1.000 0.00000988 META 31.3 31.5 30.8 31.1 43845100 31.1
## 5 High 1.000 0.00000988 META 28.9 29.2 28.5 28.5 37708800 28.5
## 6 High 1.000 0.00000988 META 28.3 28.5 27.2 27.3 49642300 27.3
## 7 High 1.000 0.00000924 META 28.0 28.6 27.8 28.1 35642100 28.1
## 8 High 1.000 0.00000924 META 27.6 27.6 26.9 27.1 39619500 27.1
## 9 High 1.000 0.00000924 META 26.7 26.7 25.8 25.9 44006500 25.9
## 10 High 1.000 0.00000924 META 26.1 26.2 25.5 25.6 28585700 25.6
## # ℹ 798 more rows
## # ℹ 3 more variables: volume_class <fct>, year <dbl>, month <dbl>
Evaluate model
?h2o.performance
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "GBM_2_AutoML_3_20260424_215827"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/GBM_2_AutoML_3_20260424_215827"
##
##
## $model_checksum
## [1] "-8836266441872637560"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_8899_3"
##
##
## $frame_checksum
## [1] "-9010569464111329883"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.777082e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.0001004766
##
## $RMSE
## [1] 0.0100238
##
## $nobs
## [1] 808
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.9995981
##
## $logloss
## [1] 0.0007287438
##
## $AUC
## [1] 1
##
## $pr_auc
## [1] 1
##
## $Gini
## [1] 1
##
## $mean_per_class_error
## [1] 0
##
## $domain
## [1] "High" "Low"
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## High Low Error Rate
## High 404 0 0.0000 = 0 / 404
## Low 0 404 0.0000 = 0 / 404
## Totals 404 404 0.0000 = 0 / 808
##
##
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
## threshold f1 f2 f0point5 accuracy precision recall specificity
## 1 0.999993 0.004938 0.003092 0.012255 0.501238 1.000000 0.002475 1.000000
## 2 0.999992 0.019608 0.012346 0.047619 0.504950 1.000000 0.009901 1.000000
## 3 0.999992 0.024450 0.015423 0.058962 0.506188 1.000000 0.012376 1.000000
## 4 0.999989 0.029268 0.018496 0.070093 0.507426 1.000000 0.014851 1.000000
## 5 0.999988 0.034063 0.021565 0.081019 0.508663 1.000000 0.017327 1.000000
## absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1 0.035202 0.002475 0.501238 404 403 0 1
## 2 0.070535 0.009901 0.504950 404 400 0 4
## 3 0.078909 0.012376 0.506188 404 399 0 5
## 4 0.086494 0.014851 0.507426 404 398 0 6
## 5 0.093483 0.017327 0.508663 404 397 0 7
## tnr fnr fpr tpr idx
## 1 1.000000 0.997525 0.000000 0.002475 0
## 2 1.000000 0.990099 0.000000 0.009901 1
## 3 1.000000 0.987624 0.000000 0.012376 2
## 4 1.000000 0.985149 0.000000 0.014851 3
## 5 1.000000 0.982673 0.000000 0.017327 4
##
## ---
## threshold f1 f2 f0point5 accuracy precision recall
## 395 0.000010 0.718861 0.864726 0.615104 0.608911 0.561111 1.000000
## 396 0.000010 0.714412 0.862143 0.609903 0.600248 0.555708 1.000000
## 397 0.000010 0.690598 0.848027 0.582468 0.551980 0.527415 1.000000
## 398 0.000010 0.679563 0.841316 0.569977 0.528465 0.514650 1.000000
## 399 0.000009 0.671096 0.836093 0.560488 0.509901 0.505000 1.000000
## 400 0.000009 0.666667 0.833333 0.555556 0.500000 0.500000 1.000000
## specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395 0.217822 0.349603 0.217822 0.608911 88
## 396 0.200495 0.333792 0.200495 0.600248 81
## 397 0.103960 0.234159 0.103960 0.551980 42
## 398 0.056931 0.171171 0.056931 0.528465 23
## 399 0.019802 0.100000 0.019802 0.509901 8
## 400 0.000000 0.000000 0.000000 0.500000 0
## fns fps tps tnr fnr fpr tpr idx
## 395 0 316 404 0.217822 0.000000 0.782178 1.000000 394
## 396 0 323 404 0.200495 0.000000 0.799505 1.000000 395
## 397 0 362 404 0.103960 0.000000 0.896040 1.000000 396
## 398 0 381 404 0.056931 0.000000 0.943069 1.000000 397
## 399 0 396 404 0.019802 0.000000 0.980198 1.000000 398
## 400 0 404 404 0.000000 0.000000 1.000000 1.000000 399
##
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.751460 1.000000 233
## 2 max f2 0.751460 1.000000 233
## 3 max f0point5 0.751460 1.000000 233
## 4 max accuracy 0.751460 1.000000 233
## 5 max precision 0.999993 1.000000 0
## 6 max recall 0.751460 1.000000 233
## 7 max specificity 0.999993 1.000000 0
## 8 max absolute_mcc 0.751460 1.000000 233
## 9 max min_per_class_accuracy 0.751460 1.000000 233
## 10 max mean_per_class_accuracy 0.751460 1.000000 233
## 11 max tns 0.999993 404.000000 0
## 12 max fns 0.999993 403.000000 0
## 13 max fps 0.000009 404.000000 399
## 14 max tps 0.751460 404.000000 233
## 15 max tnr 0.999993 1.000000 0
## 16 max fnr 0.999993 0.997525 0
## 17 max fpr 0.000009 1.000000 399
## 18 max tpr 0.751460 1.000000 233
##
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 50.00 %, avg score: 49.94 %
## group cumulative_data_fraction lower_threshold lift cumulative_lift
## 1 1 0.01113861 0.999988 2.000000 2.000000
## 2 2 0.02103960 0.999988 2.000000 2.000000
## 3 3 0.03094059 0.999988 2.000000 2.000000
## 4 4 0.04084158 0.999988 2.000000 2.000000
## 5 5 0.05321782 0.999988 2.000000 2.000000
## 6 6 0.10396040 0.999986 2.000000 2.000000
## 7 7 0.15717822 0.999984 2.000000 2.000000
## 8 8 0.20049505 0.999982 2.000000 2.000000
## 9 9 0.30074257 0.999978 2.000000 2.000000
## 10 10 0.39975248 0.999967 2.000000 2.000000
## 11 11 0.50000000 0.378354 2.000000 2.000000
## 12 12 0.60024752 0.000020 0.000000 1.665979
## 13 13 0.69925743 0.000014 0.000000 1.430088
## 14 14 0.79950495 0.000012 0.000000 1.250774
## 15 15 0.89975248 0.000010 0.000000 1.111417
## 16 16 1.00000000 0.000009 0.000000 1.000000
## response_rate score cumulative_response_rate cumulative_score
## 1 1.000000 0.999991 1.000000 0.999991
## 2 1.000000 0.999988 1.000000 0.999989
## 3 1.000000 0.999988 1.000000 0.999989
## 4 1.000000 0.999988 1.000000 0.999989
## 5 1.000000 0.999988 1.000000 0.999988
## 6 1.000000 0.999987 1.000000 0.999988
## 7 1.000000 0.999985 1.000000 0.999987
## 8 1.000000 0.999983 1.000000 0.999986
## 9 1.000000 0.999980 1.000000 0.999984
## 10 1.000000 0.999975 1.000000 0.999982
## 11 1.000000 0.993795 1.000000 0.998741
## 12 0.000000 0.000355 0.832990 0.832000
## 13 0.000000 0.000017 0.715044 0.714197
## 14 0.000000 0.000013 0.625387 0.624648
## 15 0.000000 0.000011 0.555708 0.555053
## 16 0.000000 0.000010 0.500000 0.499411
## capture_rate cumulative_capture_rate gain cumulative_gain
## 1 0.022277 0.022277 100.000000 100.000000
## 2 0.019802 0.042079 100.000000 100.000000
## 3 0.019802 0.061881 100.000000 100.000000
## 4 0.019802 0.081683 100.000000 100.000000
## 5 0.024752 0.106436 100.000000 100.000000
## 6 0.101485 0.207921 100.000000 100.000000
## 7 0.106436 0.314356 100.000000 100.000000
## 8 0.086634 0.400990 100.000000 100.000000
## 9 0.200495 0.601485 100.000000 100.000000
## 10 0.198020 0.799505 100.000000 100.000000
## 11 0.200495 1.000000 100.000000 100.000000
## 12 0.000000 1.000000 -100.000000 66.597938
## 13 0.000000 1.000000 -100.000000 43.008850
## 14 0.000000 1.000000 -100.000000 25.077399
## 15 0.000000 1.000000 -100.000000 11.141678
## 16 0.000000 1.000000 -100.000000 0.000000
## kolmogorov_smirnov
## 1 0.022277
## 2 0.042079
## 3 0.061881
## 4 0.081683
## 5 0.106436
## 6 0.207921
## 7 0.314356
## 8 0.400990
## 9 0.601485
## 10 0.799505
## 11 1.000000
## 12 0.799505
## 13 0.601485
## 14 0.400990
## 15 0.200495
## 16 0.000000
h2o.auc(performance_h2o)
## [1] 1
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.751460448805453:
## High Low Error Rate
## High 404 0 0.000000 =0/404
## Low 0 404 0.000000 =0/404
## Totals 404 404 0.000000 =0/808