library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(timetk)
## Warning: package 'timetk' was built under R version 4.5.3
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.5.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Warning: package 'xts' was built under R version 4.5.3
## Warning: package 'zoo' was built under R version 4.5.3
## Warning: package 'quantmod' was built under R version 4.5.3
## Warning: package 'TTR' was built under R version 4.5.3
## Warning: package 'PerformanceAnalytics' was built under R version 4.5.3
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.12 ──
## ✔ PerformanceAnalytics 2.1.0      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.28     ✔ xts                  0.14.2
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'tidyquant'
## 
## 
## The following object is masked from 'package:timetk':
## 
##     FANG

Plotting time series

taylor_30_min
## # A tibble: 4,032 × 2
##    date                value
##    <dttm>              <dbl>
##  1 2000-06-05 00:00:00 22262
##  2 2000-06-05 00:30:00 21756
##  3 2000-06-05 01:00:00 22247
##  4 2000-06-05 01:30:00 22759
##  5 2000-06-05 02:00:00 22549
##  6 2000-06-05 02:30:00 22313
##  7 2000-06-05 03:00:00 22128
##  8 2000-06-05 03:30:00 21860
##  9 2000-06-05 04:00:00 21751
## 10 2000-06-05 04:30:00 21336
## # ℹ 4,022 more rows
taylor_30_min %>%
  plot_time_series(.date_var = date, .value = value)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the timetk package.
##   Please report the issue at
##   <https://github.com/business-science/timetk/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Ignoring unknown labels:
## • colour : "Legend"
m4_hourly %>%
  group_by(id) %>%
  plot_time_series(
    .date_var = date,
    .value    = log(value),
    .facet_ncol   = 2,
    .facet_scales = "free",
    .color_var    = week(date)
  )

Static ggpolt2 Visualization & customizations

taylor_30_min %>%
  plot_time_series(
    date, value,
    .color_var = month(date, label = TRUE),

    # Returns static ggplot
    .interactive = FALSE,

    # Customize
    .title     = "Taylor's MegaWatt Data",
    .x_lab     = "Date (30-min intervals)",
    .y_lab     = "Energy Demand (MW)",
    .color_lab = "Month"
  )

Box plots

m4_monthly %>% count(id)
## # A tibble: 4 × 2
##   id        n
##   <fct> <int>
## 1 M1      469
## 2 M2      469
## 3 M750    306
## 4 M1000   330
m4_monthly %>%
  filter_by_time(.date_var = date, .end_date = "1976") %>%
  group_by(id) %>%
  plot_time_series_boxplot(
    .date_var   = date,
    .value      = value,
    .period     = "1 year",
    .facet_ncol = 2
  )
## Ignoring unknown labels:
## • colour : "Legend"

Regression plots

m4_monthly %>%
  group_by(id) %>%
  plot_time_series_regression(
    .date_var   = date,
    .facet_ncol = 2,
    .formula    = log(value) ~ as.numeric(date) + month(date, label = TRUE),
    .show_summary = FALSE
  )

Plotting Seasonality and Correlation

Correlation Plots

m4_hourly %>%
  group_by(id) %>%
  plot_acf_diagnostics(
    date, value,
    .lags = "7 days")
walmart_sales_weekly %>%
  group_by(id) %>%
  plot_acf_diagnostics(
    Date, Weekly_Sales,
    .ccf_vars = c(Temperature, Fuel_Price),
    .lags = "3 months"
  )

Seasonality

taylor_30_min %>%
  plot_seasonal_diagnostics(date, value)
m4_hourly %>% count(id)
## # A tibble: 4 × 2
##   id        n
##   <fct> <int>
## 1 H10     700
## 2 H50     700
## 3 H150    700
## 4 H410    960
m4_hourly %>%
  group_by(id) %>%
  plot_seasonal_diagnostics(date, value)

STL Diagnostics

m4_hourly %>%
  group_by(id) %>%
  plot_stl_diagnostics(
    date, value,
    .feature_set = c("observed", "season", "trend", "remainder")
  )
## frequency = 24 observations per 1 day
## trend = 336 observations per 14 days
## frequency = 24 observations per 1 day
## trend = 336 observations per 14 days
## frequency = 24 observations per 1 day
## trend = 336 observations per 14 days
## frequency = 24 observations per 1 day
## trend = 336 observations per 14 days

Time Series Data Wrangling

Summarize by Time

FANG %>%
  group_by(symbol) %>%
  summarise_by_time(.date_var = date, volume = sum(volume), .by = "quarter") %>%
  plot_time_series(
    date, volume,
    .facet_ncol = 2,
    .interactive = FALSE
  )
## Ignoring unknown labels:
## • colour : "Legend"

FANG %>%
  group_by(symbol) %>%
  summarise_by_time(.date_var = date, adjusted = mean(adjusted), .by = "month") %>%
  plot_time_series(date, adjusted, .facet_ncol = 2, .interactive = FALSE)
## Ignoring unknown labels:
## • colour : "Legend"

Filter By Time

FANG %>%
  group_by(symbol) %>%
  filter_by_time(.date_var = date,
                 .start_date = "2013-09",
                 .end_date = "2013") %>%
  plot_time_series(date, adjusted, .facet_ncol = 2)
## Ignoring unknown labels:
## • colour : "Legend"

Padding Data

FANG %>%
  group_by(symbol) %>%
  pad_by_time(date, .by = "day", .pad_value = 0)
## # A tibble: 5,836 × 8
## # Groups:   symbol [4]
##    symbol date        open  high   low close  volume adjusted
##    <chr>  <date>     <dbl> <dbl> <dbl> <dbl>   <dbl>    <dbl>
##  1 AMZN   2013-01-02  256.  258.  253.  257. 3271000     257.
##  2 AMZN   2013-01-03  257.  261.  256.  258. 2750900     258.
##  3 AMZN   2013-01-04  258.  260.  257.  259. 1874200     259.
##  4 AMZN   2013-01-05    0     0     0     0        0       0 
##  5 AMZN   2013-01-06    0     0     0     0        0       0 
##  6 AMZN   2013-01-07  263.  270.  263.  268. 4910000     268.
##  7 AMZN   2013-01-08  267.  269.  264.  266. 3010700     266.
##  8 AMZN   2013-01-09  268.  270.  265.  266. 2265600     266.
##  9 AMZN   2013-01-10  269.  269.  262.  265. 2863400     265.
## 10 AMZN   2013-01-11  265.  268.  264.  268. 2413300     268.
## # ℹ 5,826 more rows

Sliding (Rolling) Calculation

FANG %>%
  head(10) %>%
  mutate(rolling_avg_2 = slidify_vec(adjusted, mean,
                                     .period = 2,
                                     .align = "right",
                                     .partial = TRUE))
## # A tibble: 10 × 9
##    symbol date        open  high   low close    volume adjusted rolling_avg_2
##    <chr>  <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>         <dbl>
##  1 META   2013-01-02  27.4  28.2  27.4  28    69846400     28            28  
##  2 META   2013-01-03  27.9  28.5  27.6  27.8  63140600     27.8          27.9
##  3 META   2013-01-04  28.0  28.9  27.8  28.8  72715400     28.8          28.3
##  4 META   2013-01-07  28.7  29.8  28.6  29.4  83781800     29.4          29.1
##  5 META   2013-01-08  29.5  29.6  28.9  29.1  45871300     29.1          29.2
##  6 META   2013-01-09  29.7  30.6  29.5  30.6 104787700     30.6          29.8
##  7 META   2013-01-10  30.6  31.5  30.3  31.3  95316400     31.3          30.9
##  8 META   2013-01-11  31.3  32.0  31.1  31.7  89598000     31.7          31.5
##  9 META   2013-01-14  32.1  32.2  30.6  31.0  98892800     31.0          31.3
## 10 META   2013-01-15  30.6  31.7  29.9  30.1 173242600     30.1          30.5
# Rolling regressions are easy to implement using `.unlist = FALSE`
lm_roll <- slidify(~ lm(..1 ~ ..2 + ..3), .period = 90,
                   .unlist = FALSE, .align = "right")

FANG %>%
  select(symbol, date, adjusted, volume) %>%
  group_by(symbol) %>%
  mutate(numeric_date = as.numeric(date)) %>%
  # Apply rolling regression
  mutate(rolling_lm = lm_roll(adjusted, volume, numeric_date)) %>%
  filter(!is.na(rolling_lm))
## # A tibble: 3,676 × 6
## # Groups:   symbol [4]
##    symbol date       adjusted   volume numeric_date rolling_lm
##    <chr>  <date>        <dbl>    <dbl>        <dbl> <list>    
##  1 META   2013-05-10     26.7 30847100        15835 <lm>      
##  2 META   2013-05-13     26.8 29068800        15838 <lm>      
##  3 META   2013-05-14     27.1 24930300        15839 <lm>      
##  4 META   2013-05-15     26.6 30299800        15840 <lm>      
##  5 META   2013-05-16     26.1 35499100        15841 <lm>      
##  6 META   2013-05-17     26.2 29462700        15842 <lm>      
##  7 META   2013-05-20     25.8 42402900        15845 <lm>      
##  8 META   2013-05-21     25.7 26261300        15846 <lm>      
##  9 META   2013-05-22     25.2 45314500        15847 <lm>      
## 10 META   2013-05-23     25.1 37663100        15848 <lm>      
## # ℹ 3,666 more rows

Model

library(h2o)
## Warning: package 'h2o' was built under R version 4.5.3
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.12     ✔ rsample      1.3.2 
## ✔ dials        1.4.2      ✔ tailor       0.1.0 
## ✔ infer        1.1.0      ✔ tidyr        1.3.2 
## ✔ modeldata    1.5.1      ✔ tune         2.0.1 
## ✔ parsnip      1.4.1      ✔ workflows    1.3.0 
## ✔ purrr        1.2.1      ✔ workflowsets 1.1.1 
## ✔ recipes      1.3.1      ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard()  masks scales::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ xts::first()      masks dplyr::first()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ xts::last()       masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ recipes::step()   masks stats::step()
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         11 minutes 42 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    2 years, 4 months and 4 days 
##     H2O cluster name:           H2O_started_from_R_javony_bdq039 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.09 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.5.2 (2025-10-31 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (2 years, 4 months and 4 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
fang_tbl <- FANG %>%
  select(symbol, date, open, high, low, close, volume, adjusted) %>%
  mutate(
    volume_class = if_else(volume > median(volume, na.rm = TRUE), "High", "Low"),
    volume_class = as.factor(volume_class),
    symbol = as.factor(symbol),
    year = lubridate::year(date),
    month = lubridate::month(date)
  ) %>%
  select(-date)

set.seed(2345)

split <- initial_split(fang_tbl, prop = 0.80, strata = volume_class)

train_tbl <- training(split)
test_tbl  <- testing(split)

split.h2o <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 2345)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o  <- as.h2o(test_tbl)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
y <- "volume_class"
x <- setdiff(names(train_tbl), y)

models_h2o <- h2o.automl(
  x = x,
  y = y,
  training_frame    = train_h2o,
  validation_frame  = valid_h2o,
  leaderboard_frame = test_h2o,
  # max_runtime_secs = 30,
  max_models        = 5,
  exclude_algos     = "DeepLearning",
  nfolds            = 3,
  seed              = 3456
)
##   |                                                                              |                                                                      |   0%  |                                                                              |===                                                                   |   4%
## 21:58:27.490: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 21:58:27.499: AutoML: XGBoost is not available; skipping it.  |                                                                              |=========                                                             |  12%  |                                                                              |============                                                          |  17%  |                                                                              |==============                                                        |  20%  |                                                                              |======================================================================| 100%
## Examine the output of h2o.automl

models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
##                                                  model_id auc      logloss
## 1                          GBM_2_AutoML_3_20260424_215827   1 0.0007287438
## 2                          GBM_1_AutoML_3_20260424_215827   1 0.0007818663
## 3                          DRF_1_AutoML_3_20260424_215827   1 0.0062555239
## 4    StackedEnsemble_AllModels_1_AutoML_3_20260424_215827   1 0.0009422223
## 5 StackedEnsemble_BestOfFamily_1_AutoML_3_20260424_215827   1 0.0010203401
## 6                          GBM_3_AutoML_3_20260424_215827   1 0.0006904773
##   aucpr mean_per_class_error        rmse          mse
## 1     1                    0 0.010023800 1.004766e-04
## 2     1                    0 0.005919087 3.503559e-05
## 3     1                    0 0.029342937 8.610080e-04
## 4     1                    0 0.010577578 1.118852e-04
## 5     1                    0 0.011385963 1.296402e-04
## 6     1                    0 0.009775758 9.556544e-05
## 
## [7 rows x 7 columns]
models_h2o@leader
## Model Details:
## ==============
## 
## H2OBinomialModel: gbm
## Model ID:  GBM_2_AutoML_3_20260424_215827 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1             111                      111               56788         4
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         7    6.96396          7         68    35.98198
## 
## 
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  7.112633e-06
## RMSE:  0.002666952
## LogLoss:  0.0001904919
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  1
## Gini:  1
## R^2:  0.9999715
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        High  Low    Error     Rate
## High   1371    0 0.000000  =0/1371
## Low       0 1368 0.000000  =0/1368
## Totals 1371 1368 0.000000  =0/2739
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.913077    1.000000 216
## 2                       max f2  0.913077    1.000000 216
## 3                 max f0point5  0.913077    1.000000 216
## 4                 max accuracy  0.913077    1.000000 216
## 5                max precision  0.999993    1.000000   0
## 6                   max recall  0.913077    1.000000 216
## 7              max specificity  0.999993    1.000000   0
## 8             max absolute_mcc  0.913077    1.000000 216
## 9   max min_per_class_accuracy  0.913077    1.000000 216
## 10 max mean_per_class_accuracy  0.913077    1.000000 216
## 11                     max tns  0.999993 1371.000000   0
## 12                     max fns  0.999993 1366.000000   0
## 13                     max fps  0.000009 1371.000000 399
## 14                     max tps  0.913077 1368.000000 216
## 15                     max tnr  0.999993    1.000000   0
## 16                     max fnr  0.999993    0.998538   0
## 17                     max fpr  0.000009    1.000000 399
## 18                     max tpr  0.913077    1.000000 216
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
## 
## MSE:  5.983943e-08
## RMSE:  0.000244621
## LogLoss:  6.188569e-05
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  1
## Gini:  1
## R^2:  0.9999998
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        High Low    Error    Rate
## High    241   0 0.000000  =0/241
## Low       0 244 0.000000  =0/244
## Totals  241 244 0.000000  =0/485
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.997992   1.000000 184
## 2                       max f2  0.997992   1.000000 184
## 3                 max f0point5  0.997992   1.000000 184
## 4                 max accuracy  0.997992   1.000000 184
## 5                max precision  0.999993   1.000000   0
## 6                   max recall  0.997992   1.000000 184
## 7              max specificity  0.999993   1.000000   0
## 8             max absolute_mcc  0.997992   1.000000 184
## 9   max min_per_class_accuracy  0.997992   1.000000 184
## 10 max mean_per_class_accuracy  0.997992   1.000000 184
## 11                     max tns  0.999993 241.000000   0
## 12                     max fns  0.999993 243.000000   0
## 13                     max fps  0.000009 241.000000 399
## 14                     max tps  0.997992 244.000000 184
## 15                     max tnr  0.999993   1.000000   0
## 16                     max fnr  0.999993   0.995902   0
## 17                     max fpr  0.000009   1.000000 399
## 18                     max tpr  0.997992   1.000000 184
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 3-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.001342915
## RMSE:  0.03664581
## LogLoss:  0.004200538
## Mean Per-Class Error:  0.001094092
## AUC:  0.999992
## AUCPR:  0.999992
## Gini:  0.999984
## R^2:  0.9946283
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        High  Low    Error     Rate
## High   1368    3 0.002188  =3/1371
## Low       0 1368 0.000000  =0/1368
## Totals 1368 1371 0.001095  =3/2739
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.081349    0.998905 218
## 2                       max f2  0.081349    0.999562 218
## 3                 max f0point5  0.973794    0.998972 208
## 4                 max accuracy  0.081349    0.998905 218
## 5                max precision  0.999999    1.000000   0
## 6                   max recall  0.081349    1.000000 218
## 7              max specificity  0.999999    1.000000   0
## 8             max absolute_mcc  0.081349    0.997812 218
## 9   max min_per_class_accuracy  0.486473    0.998538 215
## 10 max mean_per_class_accuracy  0.081349    0.998906 218
## 11                     max tns  0.999999 1371.000000   0
## 12                     max fns  0.999999 1306.000000   0
## 13                     max fps  0.000001 1371.000000 399
## 14                     max tps  0.081349 1368.000000 218
## 15                     max tnr  0.999999    1.000000   0
## 16                     max fnr  0.999999    0.954678   0
## 17                     max fpr  0.000001    1.000000 399
## 18                     max tpr  0.081349    1.000000 218
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                             mean       sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                0.999270 0.000632   1.000000   0.998905   0.998905
## auc                     0.999995 0.000005   1.000000   0.999995   0.999990
## err                     0.000730 0.000632   0.000000   0.001095   0.001095
## err_count               0.666667 0.577350   0.000000   1.000000   1.000000
## f0point5                0.998832 0.001011   1.000000   0.998249   0.998249
## f1                      0.999270 0.000632   1.000000   0.998905   0.998905
## f2                      0.999708 0.000253   1.000000   0.999562   0.999562
## lift_top_group          2.002193       NA   2.002193   2.002193   2.002193
## logloss                 0.004357 0.001944   0.002848   0.003672   0.006550
## max_per_class_error     0.001459 0.001263   0.000000   0.002188   0.002188
## mcc                     0.998541 0.001263   1.000000   0.997812   0.997812
## mean_per_class_accuracy 0.999271 0.000632   1.000000   0.998906   0.998906
## mean_per_class_error    0.000729 0.000632   0.000000   0.001094   0.001094
## mse                     0.001369 0.000516   0.000935   0.001233   0.001940
## pr_auc                  0.999995 0.000005   1.000000   0.999995   0.999990
## precision               0.998541 0.001263   1.000000   0.997812   0.997812
## r2                      0.994522 0.002065   0.996260   0.995067   0.992240
## recall                  1.000000 0.000000   1.000000   1.000000   1.000000
## rmse                    0.036581 0.006852   0.030579   0.035118   0.044046
## specificity             0.998541 0.001263   1.000000   0.997812   0.997812

Save and Load

?h2o.getModel
## starting httpd help server ... done
?h2o.saveModel
?h2o.loadModel

dir.create("h2o_models", showWarnings = FALSE)

h2o.saveModel(
  object = models_h2o@leader,
  path = "h2o_models/",
  force = TRUE
)
## [1] "C:\\Users\\javony\\Desktop\\PSU_DATA3100\\11_module13\\h2o_models\\GBM_2_AutoML_3_20260424_215827"
best_model <- models_h2o@leader

Make predictions

predictions <- h2o.predict(best_model, newdata = test_h2o)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
predictions_tbl <- predictions %>%
  as_tibble()

predictions_tbl %>%
  bind_cols(test_tbl)
## # A tibble: 808 × 13
##    predict  High        Low symbol  open  high   low close   volume adjusted
##    <fct>   <dbl>      <dbl> <fct>  <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
##  1 High    1.000 0.00000988 META    27.4  28.2  27.4  28   69846400     28  
##  2 High    1.000 0.00000988 META    32.1  32.2  30.6  31.0 98892800     31.0
##  3 High    1.000 0.00000988 META    31.1  31.5  30.8  30.8 48899800     30.8
##  4 High    1.000 0.00000988 META    31.3  31.5  30.8  31.1 43845100     31.1
##  5 High    1.000 0.00000988 META    28.9  29.2  28.5  28.5 37708800     28.5
##  6 High    1.000 0.00000988 META    28.3  28.5  27.2  27.3 49642300     27.3
##  7 High    1.000 0.00000924 META    28.0  28.6  27.8  28.1 35642100     28.1
##  8 High    1.000 0.00000924 META    27.6  27.6  26.9  27.1 39619500     27.1
##  9 High    1.000 0.00000924 META    26.7  26.7  25.8  25.9 44006500     25.9
## 10 High    1.000 0.00000924 META    26.1  26.2  25.5  25.6 28585700     25.6
## # ℹ 798 more rows
## # ℹ 3 more variables: volume_class <fct>, year <dbl>, month <dbl>

Evaluate model

?h2o.performance

performance_h2o <- h2o.performance(best_model, newdata = test_h2o)

typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train"  "on_valid"  "on_xval"   "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "GBM_2_AutoML_3_20260424_215827"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/GBM_2_AutoML_3_20260424_215827"
## 
## 
## $model_checksum
## [1] "-8836266441872637560"
## 
## $frame
## $frame$name
## [1] "test_tbl_sid_8899_3"
## 
## 
## $frame_checksum
## [1] "-9010569464111329883"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.777082e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 0.0001004766
## 
## $RMSE
## [1] 0.0100238
## 
## $nobs
## [1] 808
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] 0.9995981
## 
## $logloss
## [1] 0.0007287438
## 
## $AUC
## [1] 1
## 
## $pr_auc
## [1] 1
## 
## $Gini
## [1] 1
## 
## $mean_per_class_error
## [1] 0
## 
## $domain
## [1] "High" "Low" 
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        High Low  Error      Rate
## High    404   0 0.0000 = 0 / 404
## Low       0 404 0.0000 = 0 / 404
## Totals  404 404 0.0000 = 0 / 808
## 
## 
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.999993 0.004938 0.003092 0.012255 0.501238  1.000000 0.002475    1.000000
## 2  0.999992 0.019608 0.012346 0.047619 0.504950  1.000000 0.009901    1.000000
## 3  0.999992 0.024450 0.015423 0.058962 0.506188  1.000000 0.012376    1.000000
## 4  0.999989 0.029268 0.018496 0.070093 0.507426  1.000000 0.014851    1.000000
## 5  0.999988 0.034063 0.021565 0.081019 0.508663  1.000000 0.017327    1.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1     0.035202               0.002475                0.501238 404 403   0   1
## 2     0.070535               0.009901                0.504950 404 400   0   4
## 3     0.078909               0.012376                0.506188 404 399   0   5
## 4     0.086494               0.014851                0.507426 404 398   0   6
## 5     0.093483               0.017327                0.508663 404 397   0   7
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.997525 0.000000 0.002475   0
## 2 1.000000 0.990099 0.000000 0.009901   1
## 3 1.000000 0.987624 0.000000 0.012376   2
## 4 1.000000 0.985149 0.000000 0.014851   3
## 5 1.000000 0.982673 0.000000 0.017327   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 395  0.000010 0.718861 0.864726 0.615104 0.608911  0.561111 1.000000
## 396  0.000010 0.714412 0.862143 0.609903 0.600248  0.555708 1.000000
## 397  0.000010 0.690598 0.848027 0.582468 0.551980  0.527415 1.000000
## 398  0.000010 0.679563 0.841316 0.569977 0.528465  0.514650 1.000000
## 399  0.000009 0.671096 0.836093 0.560488 0.509901  0.505000 1.000000
## 400  0.000009 0.666667 0.833333 0.555556 0.500000  0.500000 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395    0.217822     0.349603               0.217822                0.608911  88
## 396    0.200495     0.333792               0.200495                0.600248  81
## 397    0.103960     0.234159               0.103960                0.551980  42
## 398    0.056931     0.171171               0.056931                0.528465  23
## 399    0.019802     0.100000               0.019802                0.509901   8
## 400    0.000000     0.000000               0.000000                0.500000   0
##     fns fps tps      tnr      fnr      fpr      tpr idx
## 395   0 316 404 0.217822 0.000000 0.782178 1.000000 394
## 396   0 323 404 0.200495 0.000000 0.799505 1.000000 395
## 397   0 362 404 0.103960 0.000000 0.896040 1.000000 396
## 398   0 381 404 0.056931 0.000000 0.943069 1.000000 397
## 399   0 396 404 0.019802 0.000000 0.980198 1.000000 398
## 400   0 404 404 0.000000 0.000000 1.000000 1.000000 399
## 
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.751460   1.000000 233
## 2                       max f2  0.751460   1.000000 233
## 3                 max f0point5  0.751460   1.000000 233
## 4                 max accuracy  0.751460   1.000000 233
## 5                max precision  0.999993   1.000000   0
## 6                   max recall  0.751460   1.000000 233
## 7              max specificity  0.999993   1.000000   0
## 8             max absolute_mcc  0.751460   1.000000 233
## 9   max min_per_class_accuracy  0.751460   1.000000 233
## 10 max mean_per_class_accuracy  0.751460   1.000000 233
## 11                     max tns  0.999993 404.000000   0
## 12                     max fns  0.999993 403.000000   0
## 13                     max fps  0.000009 404.000000 399
## 14                     max tps  0.751460 404.000000 233
## 15                     max tnr  0.999993   1.000000   0
## 16                     max fnr  0.999993   0.997525   0
## 17                     max fpr  0.000009   1.000000 399
## 18                     max tpr  0.751460   1.000000 233
## 
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 50.00 %, avg score: 49.94 %
##    group cumulative_data_fraction lower_threshold     lift cumulative_lift
## 1      1               0.01113861        0.999988 2.000000        2.000000
## 2      2               0.02103960        0.999988 2.000000        2.000000
## 3      3               0.03094059        0.999988 2.000000        2.000000
## 4      4               0.04084158        0.999988 2.000000        2.000000
## 5      5               0.05321782        0.999988 2.000000        2.000000
## 6      6               0.10396040        0.999986 2.000000        2.000000
## 7      7               0.15717822        0.999984 2.000000        2.000000
## 8      8               0.20049505        0.999982 2.000000        2.000000
## 9      9               0.30074257        0.999978 2.000000        2.000000
## 10    10               0.39975248        0.999967 2.000000        2.000000
## 11    11               0.50000000        0.378354 2.000000        2.000000
## 12    12               0.60024752        0.000020 0.000000        1.665979
## 13    13               0.69925743        0.000014 0.000000        1.430088
## 14    14               0.79950495        0.000012 0.000000        1.250774
## 15    15               0.89975248        0.000010 0.000000        1.111417
## 16    16               1.00000000        0.000009 0.000000        1.000000
##    response_rate    score cumulative_response_rate cumulative_score
## 1       1.000000 0.999991                 1.000000         0.999991
## 2       1.000000 0.999988                 1.000000         0.999989
## 3       1.000000 0.999988                 1.000000         0.999989
## 4       1.000000 0.999988                 1.000000         0.999989
## 5       1.000000 0.999988                 1.000000         0.999988
## 6       1.000000 0.999987                 1.000000         0.999988
## 7       1.000000 0.999985                 1.000000         0.999987
## 8       1.000000 0.999983                 1.000000         0.999986
## 9       1.000000 0.999980                 1.000000         0.999984
## 10      1.000000 0.999975                 1.000000         0.999982
## 11      1.000000 0.993795                 1.000000         0.998741
## 12      0.000000 0.000355                 0.832990         0.832000
## 13      0.000000 0.000017                 0.715044         0.714197
## 14      0.000000 0.000013                 0.625387         0.624648
## 15      0.000000 0.000011                 0.555708         0.555053
## 16      0.000000 0.000010                 0.500000         0.499411
##    capture_rate cumulative_capture_rate        gain cumulative_gain
## 1      0.022277                0.022277  100.000000      100.000000
## 2      0.019802                0.042079  100.000000      100.000000
## 3      0.019802                0.061881  100.000000      100.000000
## 4      0.019802                0.081683  100.000000      100.000000
## 5      0.024752                0.106436  100.000000      100.000000
## 6      0.101485                0.207921  100.000000      100.000000
## 7      0.106436                0.314356  100.000000      100.000000
## 8      0.086634                0.400990  100.000000      100.000000
## 9      0.200495                0.601485  100.000000      100.000000
## 10     0.198020                0.799505  100.000000      100.000000
## 11     0.200495                1.000000  100.000000      100.000000
## 12     0.000000                1.000000 -100.000000       66.597938
## 13     0.000000                1.000000 -100.000000       43.008850
## 14     0.000000                1.000000 -100.000000       25.077399
## 15     0.000000                1.000000 -100.000000       11.141678
## 16     0.000000                1.000000 -100.000000        0.000000
##    kolmogorov_smirnov
## 1            0.022277
## 2            0.042079
## 3            0.061881
## 4            0.081683
## 5            0.106436
## 6            0.207921
## 7            0.314356
## 8            0.400990
## 9            0.601485
## 10           0.799505
## 11           1.000000
## 12           0.799505
## 13           0.601485
## 14           0.400990
## 15           0.200495
## 16           0.000000
h2o.auc(performance_h2o)
## [1] 1
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.751460448805453:
##        High Low    Error    Rate
## High    404   0 0.000000  =0/404
## Low       0 404 0.000000  =0/404
## Totals  404 404 0.000000  =0/808