Code Along 11

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(timetk)

## Warning: package 'timetk' was built under R version 4.5.3

library(tidyquant)

## Warning: package 'tidyquant' was built under R version 4.5.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Warning: package 'xts' was built under R version 4.5.3

## Warning: package 'zoo' was built under R version 4.5.3

## Warning: package 'quantmod' was built under R version 4.5.3

## Warning: package 'TTR' was built under R version 4.5.3

## Warning: package 'PerformanceAnalytics' was built under R version 4.5.3

## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.12 ──
## ✔ PerformanceAnalytics 2.1.0      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.28     ✔ xts                  0.14.2

## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'tidyquant'
## 
## 
## The following object is masked from 'package:timetk':
## 
##     FANG

Plotting time series

# Print the tibble (a `date` timestamp and a numeric `value` per row, spaced
# 30 minutes apart) to check its shape before plotting.
taylor_30_min

## # A tibble: 4,032 × 2
##    date                value
##    <dttm>              <dbl>
##  1 2000-06-05 00:00:00 22262
##  2 2000-06-05 00:30:00 21756
##  3 2000-06-05 01:00:00 22247
##  4 2000-06-05 01:30:00 22759
##  5 2000-06-05 02:00:00 22549
##  6 2000-06-05 02:30:00 22313
##  7 2000-06-05 03:00:00 22128
##  8 2000-06-05 03:30:00 21860
##  9 2000-06-05 04:00:00 21751
## 10 2000-06-05 04:30:00 21336
## # ℹ 4,022 more rows

# Line chart of the whole series using timetk's default settings.
plot_time_series(taylor_30_min, .date_var = date, .value = value)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the timetk package.
##   Please report the issue at
##   <https://github.com/business-science/timetk/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Ignoring unknown labels:
## • colour : "Legend"

# One panel per hourly M4 series (`id`), plotted on a log scale with free
# axes and coloured by week number.
m4_hourly %>%
  group_by(id) %>%
  plot_time_series(
    .date_var     = date,
    .value        = log(value),
    .facet_ncol   = 2,
    .facet_scales = "free",
    # Namespace-qualified: h2o, attached later in this document, masks
    # `week()`, so the bare name resolves to h2o's version if this chunk is
    # re-run after `library(h2o)`.
    .color_var    = lubridate::week(date)
  )

Static ggplot2 Visualization & Customizations

# Static ggplot version of the series with custom title and axis labels,
# coloured by labelled month.
taylor_30_min %>%
  plot_time_series(
    date, value,
    # Namespace-qualified: h2o, attached later in this document, masks
    # `month()`, so the bare name resolves to h2o's version if this chunk is
    # re-run after `library(h2o)`.
    .color_var = lubridate::month(date, label = TRUE),

    # Returns static ggplot
    .interactive = FALSE,

    # Customize
    .title     = "Taylor's MegaWatt Data",
    .x_lab     = "Date (30-min intervals)",
    .y_lab     = "Energy Demand (MW)",
    .color_lab = "Month"
  )

Box plots

# Number of observations available for each monthly series.
count(m4_monthly, id)

## # A tibble: 4 × 2
##   id        n
##   <fct> <int>
## 1 M1      469
## 2 M2      469
## 3 M750    306
## 4 M1000   330

# Yearly box plots per series, keeping only observations up to the end of 1976.
m4_monthly |>
  filter_by_time(.date_var = date, .end_date = "1976") |>
  group_by(id) |>
  plot_time_series_boxplot(
    .date_var   = date,
    .value      = value,
    .facet_ncol = 2,
    .period     = "1 year"
  )

## Ignoring unknown labels:
## • colour : "Legend"

Regression plots

# Per-series regression of log(value) on a linear time trend plus a month
# factor; the model summary printout is suppressed.
m4_monthly %>%
  group_by(id) %>%
  plot_time_series_regression(
    .date_var   = date,
    .facet_ncol = 2,
    # `lubridate::month()` is spelled out because h2o, attached later in this
    # document, masks `month()`; the bare name would then resolve to h2o's
    # version if this chunk is re-run after `library(h2o)`.
    .formula    = log(value) ~ as.numeric(date) +
      lubridate::month(date, label = TRUE),
    .show_summary = FALSE
  )

Plotting Seasonality and Correlation

Correlation Plots

# Autocorrelation diagnostics per hourly series, out to a 7-day lag window.
m4_hourly |>
  group_by(id) |>
  plot_acf_diagnostics(.date_var = date, .value = value, .lags = "7 days")

# Per-store diagnostics for weekly sales over a 3-month lag window, with
# temperature and fuel price supplied as cross-correlation variables.
walmart_sales_weekly |>
  group_by(id) |>
  plot_acf_diagnostics(
    .date_var = Date,
    .value    = Weekly_Sales,
    .lags     = "3 months",
    .ccf_vars = c(Temperature, Fuel_Price)
  )

Seasonality

# Seasonal diagnostics for the single half-hourly series.
plot_seasonal_diagnostics(taylor_30_min, .date_var = date, .value = value)

# Number of observations available for each hourly series.
count(m4_hourly, id)

## # A tibble: 4 × 2
##   id        n
##   <fct> <int>
## 1 H10     700
## 2 H50     700
## 3 H150    700
## 4 H410    960

# Seasonal diagnostics computed separately for each hourly series.
m4_hourly |>
  group_by(id) |>
  plot_seasonal_diagnostics(.date_var = date, .value = value)

STL Diagnostics

# STL decomposition per hourly series, showing the observed, seasonal, trend
# and remainder components.
stl_components <- c("observed", "season", "trend", "remainder")

m4_hourly |>
  group_by(id) |>
  plot_stl_diagnostics(
    .date_var    = date,
    .value       = value,
    .feature_set = c("observed", "season", "trend", "remainder")
  )

## frequency = 24 observations per 1 day

## trend = 336 observations per 14 days

## frequency = 24 observations per 1 day

## trend = 336 observations per 14 days

## frequency = 24 observations per 1 day

## trend = 336 observations per 14 days

## frequency = 24 observations per 1 day

## trend = 336 observations per 14 days

Time Series Data Wrangling

Summarize by Time

# Total traded volume per symbol, aggregated to quarters, as a static plot.
FANG |>
  group_by(symbol) |>
  summarise_by_time(
    .date_var = date,
    .by       = "quarter",
    volume    = sum(volume)
  ) |>
  plot_time_series(
    .date_var    = date,
    .value       = volume,
    .facet_ncol  = 2,
    .interactive = FALSE
  )

## Ignoring unknown labels:
## • colour : "Legend"

# Mean adjusted price per symbol, aggregated to months, as a static plot.
FANG |>
  group_by(symbol) |>
  summarise_by_time(
    .date_var = date,
    .by       = "month",
    adjusted  = mean(adjusted)
  ) |>
  plot_time_series(
    .date_var    = date,
    .value       = adjusted,
    .facet_ncol  = 2,
    .interactive = FALSE
  )

## Ignoring unknown labels:
## • colour : "Legend"

Filter By Time

# Plot adjusted prices per symbol, restricted to the window given by the
# "2013-09" start and "2013" end date strings.
FANG |>
  group_by(symbol) |>
  filter_by_time(
    .date_var   = date,
    .start_date = "2013-09",
    .end_date   = "2013"
  ) |>
  plot_time_series(.date_var = date, .value = adjusted, .facet_ncol = 2)

## Ignoring unknown labels:
## • colour : "Legend"

Padding Data

# Expand each symbol to a daily calendar; days missing from the data
# (e.g. 2013-01-05 and 2013-01-06 in the printed result) are filled with 0.
FANG |>
  group_by(symbol) |>
  pad_by_time(.date_var = date, .by = "day", .pad_value = 0)

## # A tibble: 5,836 × 8
## # Groups:   symbol [4]
##    symbol date        open  high   low close  volume adjusted
##    <chr>  <date>     <dbl> <dbl> <dbl> <dbl>   <dbl>    <dbl>
##  1 AMZN   2013-01-02  256.  258.  253.  257. 3271000     257.
##  2 AMZN   2013-01-03  257.  261.  256.  258. 2750900     258.
##  3 AMZN   2013-01-04  258.  260.  257.  259. 1874200     259.
##  4 AMZN   2013-01-05    0     0     0     0        0       0 
##  5 AMZN   2013-01-06    0     0     0     0        0       0 
##  6 AMZN   2013-01-07  263.  270.  263.  268. 4910000     268.
##  7 AMZN   2013-01-08  267.  269.  264.  266. 3010700     266.
##  8 AMZN   2013-01-09  268.  270.  265.  266. 2265600     266.
##  9 AMZN   2013-01-10  269.  269.  262.  265. 2863400     265.
## 10 AMZN   2013-01-11  265.  268.  264.  268. 2413300     268.
## # ℹ 5,826 more rows

Sliding (Rolling) Calculation

# Two-observation trailing mean of `adjusted` on the first 10 rows.
# `.partial = TRUE` lets the first row, which has no predecessor, still
# receive a value instead of NA (see the first row of the printed result).
FANG |>
  head(10) |>
  mutate(
    rolling_avg_2 = slidify_vec(
      adjusted,
      mean,
      .period  = 2,
      .align   = "right",
      .partial = TRUE
    )
  )

## # A tibble: 10 × 9
##    symbol date        open  high   low close    volume adjusted rolling_avg_2
##    <chr>  <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>         <dbl>
##  1 META   2013-01-02  27.4  28.2  27.4  28    69846400     28            28  
##  2 META   2013-01-03  27.9  28.5  27.6  27.8  63140600     27.8          27.9
##  3 META   2013-01-04  28.0  28.9  27.8  28.8  72715400     28.8          28.3
##  4 META   2013-01-07  28.7  29.8  28.6  29.4  83781800     29.4          29.1
##  5 META   2013-01-08  29.5  29.6  28.9  29.1  45871300     29.1          29.2
##  6 META   2013-01-09  29.7  30.6  29.5  30.6 104787700     30.6          29.8
##  7 META   2013-01-10  30.6  31.5  30.3  31.3  95316400     31.3          30.9
##  8 META   2013-01-11  31.3  32.0  31.1  31.7  89598000     31.7          31.5
##  9 META   2013-01-14  32.1  32.2  30.6  31.0  98892800     31.0          31.3
## 10 META   2013-01-15  30.6  31.7  29.9  30.1 173242600     30.1          30.5

# Rolling-regression helper: fits `lm()` of the first argument on the second
# and third over a right-aligned 90-observation window. `.unlist = FALSE`
# keeps each fitted model as a list element rather than flattening it.
lm_roll <- slidify(
  .f      = ~ lm(..1 ~ ..2 + ..3),
  .period = 90,
  .align  = "right",
  .unlist = FALSE
)

# Per symbol, regress `adjusted` on `volume` and a numeric date over rolling
# windows, then drop the leading rows where no model could be fitted yet.
FANG |>
  select(symbol, date, adjusted, volume) |>
  group_by(symbol) |>
  mutate(
    numeric_date = as.numeric(date),
    # Apply rolling regression
    rolling_lm   = lm_roll(adjusted, volume, numeric_date)
  ) |>
  filter(!is.na(rolling_lm))

## # A tibble: 3,676 × 6
## # Groups:   symbol [4]
##    symbol date       adjusted   volume numeric_date rolling_lm
##    <chr>  <date>        <dbl>    <dbl>        <dbl> <list>    
##  1 META   2013-05-10     26.7 30847100        15835 <lm>      
##  2 META   2013-05-13     26.8 29068800        15838 <lm>      
##  3 META   2013-05-14     27.1 24930300        15839 <lm>      
##  4 META   2013-05-15     26.6 30299800        15840 <lm>      
##  5 META   2013-05-16     26.1 35499100        15841 <lm>      
##  6 META   2013-05-17     26.2 29462700        15842 <lm>      
##  7 META   2013-05-20     25.8 42402900        15845 <lm>      
##  8 META   2013-05-21     25.7 26261300        15846 <lm>      
##  9 META   2013-05-22     25.2 45314500        15847 <lm>      
## 10 META   2013-05-23     25.1 37663100        15848 <lm>      
## # ℹ 3,666 more rows

Model

library(h2o)

## Warning: package 'h2o' was built under R version 4.5.3

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──

## ✔ broom        1.0.12     ✔ rsample      1.3.2 
## ✔ dials        1.4.2      ✔ tailor       0.1.0 
## ✔ infer        1.1.0      ✔ tidyr        1.3.2 
## ✔ modeldata    1.5.1      ✔ tune         2.0.1 
## ✔ parsnip      1.4.1      ✔ workflows    1.3.0 
## ✔ purrr        1.2.1      ✔ workflowsets 1.1.1 
## ✔ recipes      1.3.1      ✔ yardstick    1.3.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard()  masks scales::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ xts::first()      masks dplyr::first()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ xts::last()       masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ recipes::step()   masks stats::step()

# Start (or connect to an already running) local H2O cluster; the package
# start-up message names this as the step to take before using H2O.
h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         11 minutes 42 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    2 years, 4 months and 4 days 
##     H2O cluster name:           H2O_started_from_R_javony_bdq039 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.09 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.5.2 (2025-10-31 ucrt)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (2 years, 4 months and 4 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Modelling table: price/volume columns plus a two-level target
# (`volume_class`: "High" when volume exceeds the overall median, else "Low")
# and calendar features. `date` is dropped once year and month are extracted.
# lubridate is namespace-qualified because h2o masks `year()` and `month()`.
fang_tbl <- FANG |>
  select(symbol, date, open, high, low, close, volume, adjusted) |>
  mutate(
    volume_class = as.factor(
      if_else(volume > median(volume, na.rm = TRUE), "High", "Low")
    ),
    symbol = as.factor(symbol),
    year   = lubridate::year(date),
    month  = lubridate::month(date)
  ) |>
  select(-date)

# Reproducible 80/20 train/test split, stratified on the target so both
# partitions keep the same High/Low balance.
set.seed(2345)

split <- initial_split(fang_tbl, prop = 0.80, strata = volume_class)

train_tbl <- split |> training()
test_tbl  <- split |> testing()

# Upload the training table to H2O and carve it 85/15 into training and
# validation frames (seeded for reproducibility).
split.h2o <- train_tbl |>
  as.h2o() |>
  h2o.splitFrame(ratios = c(0.85), seed = 2345)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# First element of the split is the training frame, second the validation
# frame; the held-out test table is uploaded to H2O separately.
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o  <- as.h2o(test_tbl)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Response column for AutoML.
y <- "volume_class"

# Predictor columns. `volume_class` is nothing more than a threshold on
# `volume` (see the construction of `fang_tbl`), so leaving `volume` among the
# predictors leaks the target: the model only has to rediscover the median
# cut-off, which is why every metric printed in this document is (near)
# perfect. Exclude it so the model has to learn from the remaining features.
# NOTE: the outputs captured in this document were produced with `volume`
# still included; re-run the chunks that follow to refresh them.
x <- setdiff(names(train_tbl), c(y, "volume"))

# Run AutoML with a budget of 5 base models (a 30-second runtime cap,
# `max_runtime_secs = 30`, is the disabled alternative), skipping deep
# learning and using 3-fold cross-validation.
# NOTE(review): with `nfolds` > 0 the H2O log reports that models are
# validated by cross-validation only and `validation_frame` is purely
# informative. The test frame is also used to rank the leaderboard, so test
# metrics for the selected leader are optimistic — consider ranking on
# `valid_h2o` instead.
models_h2o <- h2o.automl(
  x                 = x,
  y                 = y,
  training_frame    = train_h2o,
  validation_frame  = valid_h2o,
  leaderboard_frame = test_h2o,
  nfolds            = 3,
  max_models        = 5,
  exclude_algos     = "DeepLearning",
  seed              = 3456
)

##   |                                                                              |                                                                      |   0%  |                                                                              |===                                                                   |   4%
## 21:58:27.490: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 21:58:27.499: AutoML: XGBoost is not available; skipping it.  |                                                                              |=========                                                             |  12%  |                                                                              |============                                                          |  17%  |                                                                              |==============                                                        |  20%  |                                                                              |======================================================================| 100%

## Examine the output of h2o.automl

# The AutoML result is an S4 object, so its parts are accessed with `@`.
typeof(models_h2o)

## [1] "S4"

# List the slots available on the AutoML result.
slotNames(models_h2o)

## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"

# Print the leaderboard: one row per trained model with its metrics.
models_h2o@leaderboard

##                                                  model_id auc      logloss
## 1                          GBM_2_AutoML_3_20260424_215827   1 0.0007287438
## 2                          GBM_1_AutoML_3_20260424_215827   1 0.0007818663
## 3                          DRF_1_AutoML_3_20260424_215827   1 0.0062555239
## 4    StackedEnsemble_AllModels_1_AutoML_3_20260424_215827   1 0.0009422223
## 5 StackedEnsemble_BestOfFamily_1_AutoML_3_20260424_215827   1 0.0010203401
## 6                          GBM_3_AutoML_3_20260424_215827   1 0.0006904773
##   aucpr mean_per_class_error        rmse          mse
## 1     1                    0 0.010023800 1.004766e-04
## 2     1                    0 0.005919087 3.503559e-05
## 3     1                    0 0.029342937 8.610080e-04
## 4     1                    0 0.010577578 1.118852e-04
## 5     1                    0 0.011385963 1.296402e-04
## 6     1                    0 0.009775758 9.556544e-05
## 
## [7 rows x 7 columns]

# Print the leading model with its training, validation and cross-validation
# metrics.
models_h2o@leader

## Model Details:
## ==============
## 
## H2OBinomialModel: gbm
## Model ID:  GBM_2_AutoML_3_20260424_215827 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1             111                      111               56788         4
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         7    6.96396          7         68    35.98198
## 
## 
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  7.112633e-06
## RMSE:  0.002666952
## LogLoss:  0.0001904919
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  1
## Gini:  1
## R^2:  0.9999715
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        High  Low    Error     Rate
## High   1371    0 0.000000  =0/1371
## Low       0 1368 0.000000  =0/1368
## Totals 1371 1368 0.000000  =0/2739
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.913077    1.000000 216
## 2                       max f2  0.913077    1.000000 216
## 3                 max f0point5  0.913077    1.000000 216
## 4                 max accuracy  0.913077    1.000000 216
## 5                max precision  0.999993    1.000000   0
## 6                   max recall  0.913077    1.000000 216
## 7              max specificity  0.999993    1.000000   0
## 8             max absolute_mcc  0.913077    1.000000 216
## 9   max min_per_class_accuracy  0.913077    1.000000 216
## 10 max mean_per_class_accuracy  0.913077    1.000000 216
## 11                     max tns  0.999993 1371.000000   0
## 12                     max fns  0.999993 1366.000000   0
## 13                     max fps  0.000009 1371.000000 399
## 14                     max tps  0.913077 1368.000000 216
## 15                     max tnr  0.999993    1.000000   0
## 16                     max fnr  0.999993    0.998538   0
## 17                     max fpr  0.000009    1.000000 399
## 18                     max tpr  0.913077    1.000000 216
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on validation data. **
## ** Validation metrics **
## 
## MSE:  5.983943e-08
## RMSE:  0.000244621
## LogLoss:  6.188569e-05
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  1
## Gini:  1
## R^2:  0.9999998
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        High Low    Error    Rate
## High    241   0 0.000000  =0/241
## Low       0 244 0.000000  =0/244
## Totals  241 244 0.000000  =0/485
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.997992   1.000000 184
## 2                       max f2  0.997992   1.000000 184
## 3                 max f0point5  0.997992   1.000000 184
## 4                 max accuracy  0.997992   1.000000 184
## 5                max precision  0.999993   1.000000   0
## 6                   max recall  0.997992   1.000000 184
## 7              max specificity  0.999993   1.000000   0
## 8             max absolute_mcc  0.997992   1.000000 184
## 9   max min_per_class_accuracy  0.997992   1.000000 184
## 10 max mean_per_class_accuracy  0.997992   1.000000 184
## 11                     max tns  0.999993 241.000000   0
## 12                     max fns  0.999993 243.000000   0
## 13                     max fps  0.000009 241.000000 399
## 14                     max tps  0.997992 244.000000 184
## 15                     max tnr  0.999993   1.000000   0
## 16                     max fnr  0.999993   0.995902   0
## 17                     max fpr  0.000009   1.000000 399
## 18                     max tpr  0.997992   1.000000 184
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 3-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.001342915
## RMSE:  0.03664581
## LogLoss:  0.004200538
## Mean Per-Class Error:  0.001094092
## AUC:  0.999992
## AUCPR:  0.999992
## Gini:  0.999984
## R^2:  0.9946283
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        High  Low    Error     Rate
## High   1368    3 0.002188  =3/1371
## Low       0 1368 0.000000  =0/1368
## Totals 1368 1371 0.001095  =3/2739
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.081349    0.998905 218
## 2                       max f2  0.081349    0.999562 218
## 3                 max f0point5  0.973794    0.998972 208
## 4                 max accuracy  0.081349    0.998905 218
## 5                max precision  0.999999    1.000000   0
## 6                   max recall  0.081349    1.000000 218
## 7              max specificity  0.999999    1.000000   0
## 8             max absolute_mcc  0.081349    0.997812 218
## 9   max min_per_class_accuracy  0.486473    0.998538 215
## 10 max mean_per_class_accuracy  0.081349    0.998906 218
## 11                     max tns  0.999999 1371.000000   0
## 12                     max fns  0.999999 1306.000000   0
## 13                     max fps  0.000001 1371.000000 399
## 14                     max tps  0.081349 1368.000000 218
## 15                     max tnr  0.999999    1.000000   0
## 16                     max fnr  0.999999    0.954678   0
## 17                     max fpr  0.000001    1.000000 399
## 18                     max tpr  0.081349    1.000000 218
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                             mean       sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                0.999270 0.000632   1.000000   0.998905   0.998905
## auc                     0.999995 0.000005   1.000000   0.999995   0.999990
## err                     0.000730 0.000632   0.000000   0.001095   0.001095
## err_count               0.666667 0.577350   0.000000   1.000000   1.000000
## f0point5                0.998832 0.001011   1.000000   0.998249   0.998249
## f1                      0.999270 0.000632   1.000000   0.998905   0.998905
## f2                      0.999708 0.000253   1.000000   0.999562   0.999562
## lift_top_group          2.002193       NA   2.002193   2.002193   2.002193
## logloss                 0.004357 0.001944   0.002848   0.003672   0.006550
## max_per_class_error     0.001459 0.001263   0.000000   0.002188   0.002188
## mcc                     0.998541 0.001263   1.000000   0.997812   0.997812
## mean_per_class_accuracy 0.999271 0.000632   1.000000   0.998906   0.998906
## mean_per_class_error    0.000729 0.000632   0.000000   0.001094   0.001094
## mse                     0.001369 0.000516   0.000935   0.001233   0.001940
## pr_auc                  0.999995 0.000005   1.000000   0.999995   0.999990
## precision               0.998541 0.001263   1.000000   0.997812   0.997812
## r2                      0.994522 0.002065   0.996260   0.995067   0.992240
## recall                  1.000000 0.000000   1.000000   1.000000   1.000000
## rmse                    0.036581 0.006852   0.030579   0.035118   0.044046
## specificity             0.998541 0.001263   1.000000   0.997812   0.997812

Save and Load

# Interactive help lookup (opens the help page; produces no result to use).
?h2o.getModel

## starting httpd help server ... done

# Interactive help lookups for saving and loading models.
?h2o.saveModel
?h2o.loadModel

# Make sure the target folder exists (no warning if it already does), then
# write the AutoML leader into it, overwriting any earlier copy. The call
# prints the full path of the saved model.
dir.create("h2o_models", showWarnings = FALSE)

h2o.saveModel(models_h2o@leader, path = "h2o_models/", force = TRUE)

## [1] "C:\\Users\\javony\\Desktop\\PSU_DATA3100\\11_module13\\h2o_models\\GBM_2_AutoML_3_20260424_215827"

# Keep a handle on the in-memory leader for prediction and evaluation (the
# copy saved to disk is not reloaded here).
best_model <- models_h2o@leader

Make predictions

# Score the held-out test frame with the leading model.
predictions <- best_model |>
  h2o.predict(newdata = test_h2o)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Bring the H2O predictions back into R as a tibble.
predictions_tbl <- as_tibble(predictions)

# Put the predicted class and class probabilities next to the test rows.
bind_cols(predictions_tbl, test_tbl)

## # A tibble: 808 × 13
##    predict  High        Low symbol  open  high   low close   volume adjusted
##    <fct>   <dbl>      <dbl> <fct>  <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
##  1 High    1.000 0.00000988 META    27.4  28.2  27.4  28   69846400     28  
##  2 High    1.000 0.00000988 META    32.1  32.2  30.6  31.0 98892800     31.0
##  3 High    1.000 0.00000988 META    31.1  31.5  30.8  30.8 48899800     30.8
##  4 High    1.000 0.00000988 META    31.3  31.5  30.8  31.1 43845100     31.1
##  5 High    1.000 0.00000988 META    28.9  29.2  28.5  28.5 37708800     28.5
##  6 High    1.000 0.00000988 META    28.3  28.5  27.2  27.3 49642300     27.3
##  7 High    1.000 0.00000924 META    28.0  28.6  27.8  28.1 35642100     28.1
##  8 High    1.000 0.00000924 META    27.6  27.6  26.9  27.1 39619500     27.1
##  9 High    1.000 0.00000924 META    26.7  26.7  25.8  25.9 44006500     25.9
## 10 High    1.000 0.00000924 META    26.1  26.2  25.5  25.6 28585700     25.6
## # ℹ 798 more rows
## # ℹ 3 more variables: volume_class <fct>, year <dbl>, month <dbl>

Evaluate model

# Interactive help lookup for the evaluation function used next.
?h2o.performance

# Compute the leading model's metrics on the held-out test frame.
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)

# The performance result is also an S4 object.
typeof(performance_h2o)

## [1] "S4"

# List the slots available on the performance object.
slotNames(performance_h2o)

## [1] "algorithm" "on_train"  "on_valid"  "on_xval"   "metrics"

# Print the full metrics list (MSE, AUC, confusion matrix, threshold
# tables, gains/lift table, ...).
performance_h2o@metrics

## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "GBM_2_AutoML_3_20260424_215827"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/GBM_2_AutoML_3_20260424_215827"
## 
## 
## $model_checksum
## [1] "-8836266441872637560"
## 
## $frame
## $frame$name
## [1] "test_tbl_sid_8899_3"
## 
## 
## $frame_checksum
## [1] "-9010569464111329883"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.777082e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 0.0001004766
## 
## $RMSE
## [1] 0.0100238
## 
## $nobs
## [1] 808
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] 0.9995981
## 
## $logloss
## [1] 0.0007287438
## 
## $AUC
## [1] 1
## 
## $pr_auc
## [1] 1
## 
## $Gini
## [1] 1
## 
## $mean_per_class_error
## [1] 0
## 
## $domain
## [1] "High" "Low" 
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        High Low  Error      Rate
## High    404   0 0.0000 = 0 / 404
## Low       0 404 0.0000 = 0 / 404
## Totals  404 404 0.0000 = 0 / 808
## 
## 
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  0.999993 0.004938 0.003092 0.012255 0.501238  1.000000 0.002475    1.000000
## 2  0.999992 0.019608 0.012346 0.047619 0.504950  1.000000 0.009901    1.000000
## 3  0.999992 0.024450 0.015423 0.058962 0.506188  1.000000 0.012376    1.000000
## 4  0.999989 0.029268 0.018496 0.070093 0.507426  1.000000 0.014851    1.000000
## 5  0.999988 0.034063 0.021565 0.081019 0.508663  1.000000 0.017327    1.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns fns fps tps
## 1     0.035202               0.002475                0.501238 404 403   0   1
## 2     0.070535               0.009901                0.504950 404 400   0   4
## 3     0.078909               0.012376                0.506188 404 399   0   5
## 4     0.086494               0.014851                0.507426 404 398   0   6
## 5     0.093483               0.017327                0.508663 404 397   0   7
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.997525 0.000000 0.002475   0
## 2 1.000000 0.990099 0.000000 0.009901   1
## 3 1.000000 0.987624 0.000000 0.012376   2
## 4 1.000000 0.985149 0.000000 0.014851   3
## 5 1.000000 0.982673 0.000000 0.017327   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 395  0.000010 0.718861 0.864726 0.615104 0.608911  0.561111 1.000000
## 396  0.000010 0.714412 0.862143 0.609903 0.600248  0.555708 1.000000
## 397  0.000010 0.690598 0.848027 0.582468 0.551980  0.527415 1.000000
## 398  0.000010 0.679563 0.841316 0.569977 0.528465  0.514650 1.000000
## 399  0.000009 0.671096 0.836093 0.560488 0.509901  0.505000 1.000000
## 400  0.000009 0.666667 0.833333 0.555556 0.500000  0.500000 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395    0.217822     0.349603               0.217822                0.608911  88
## 396    0.200495     0.333792               0.200495                0.600248  81
## 397    0.103960     0.234159               0.103960                0.551980  42
## 398    0.056931     0.171171               0.056931                0.528465  23
## 399    0.019802     0.100000               0.019802                0.509901   8
## 400    0.000000     0.000000               0.000000                0.500000   0
##     fns fps tps      tnr      fnr      fpr      tpr idx
## 395   0 316 404 0.217822 0.000000 0.782178 1.000000 394
## 396   0 323 404 0.200495 0.000000 0.799505 1.000000 395
## 397   0 362 404 0.103960 0.000000 0.896040 1.000000 396
## 398   0 381 404 0.056931 0.000000 0.943069 1.000000 397
## 399   0 396 404 0.019802 0.000000 0.980198 1.000000 398
## 400   0 404 404 0.000000 0.000000 1.000000 1.000000 399
## 
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.751460   1.000000 233
## 2                       max f2  0.751460   1.000000 233
## 3                 max f0point5  0.751460   1.000000 233
## 4                 max accuracy  0.751460   1.000000 233
## 5                max precision  0.999993   1.000000   0
## 6                   max recall  0.751460   1.000000 233
## 7              max specificity  0.999993   1.000000   0
## 8             max absolute_mcc  0.751460   1.000000 233
## 9   max min_per_class_accuracy  0.751460   1.000000 233
## 10 max mean_per_class_accuracy  0.751460   1.000000 233
## 11                     max tns  0.999993 404.000000   0
## 12                     max fns  0.999993 403.000000   0
## 13                     max fps  0.000009 404.000000 399
## 14                     max tps  0.751460 404.000000 233
## 15                     max tnr  0.999993   1.000000   0
## 16                     max fnr  0.999993   0.997525   0
## 17                     max fpr  0.000009   1.000000 399
## 18                     max tpr  0.751460   1.000000 233
## 
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 50.00 %, avg score: 49.94 %
##    group cumulative_data_fraction lower_threshold     lift cumulative_lift
## 1      1               0.01113861        0.999988 2.000000        2.000000
## 2      2               0.02103960        0.999988 2.000000        2.000000
## 3      3               0.03094059        0.999988 2.000000        2.000000
## 4      4               0.04084158        0.999988 2.000000        2.000000
## 5      5               0.05321782        0.999988 2.000000        2.000000
## 6      6               0.10396040        0.999986 2.000000        2.000000
## 7      7               0.15717822        0.999984 2.000000        2.000000
## 8      8               0.20049505        0.999982 2.000000        2.000000
## 9      9               0.30074257        0.999978 2.000000        2.000000
## 10    10               0.39975248        0.999967 2.000000        2.000000
## 11    11               0.50000000        0.378354 2.000000        2.000000
## 12    12               0.60024752        0.000020 0.000000        1.665979
## 13    13               0.69925743        0.000014 0.000000        1.430088
## 14    14               0.79950495        0.000012 0.000000        1.250774
## 15    15               0.89975248        0.000010 0.000000        1.111417
## 16    16               1.00000000        0.000009 0.000000        1.000000
##    response_rate    score cumulative_response_rate cumulative_score
## 1       1.000000 0.999991                 1.000000         0.999991
## 2       1.000000 0.999988                 1.000000         0.999989
## 3       1.000000 0.999988                 1.000000         0.999989
## 4       1.000000 0.999988                 1.000000         0.999989
## 5       1.000000 0.999988                 1.000000         0.999988
## 6       1.000000 0.999987                 1.000000         0.999988
## 7       1.000000 0.999985                 1.000000         0.999987
## 8       1.000000 0.999983                 1.000000         0.999986
## 9       1.000000 0.999980                 1.000000         0.999984
## 10      1.000000 0.999975                 1.000000         0.999982
## 11      1.000000 0.993795                 1.000000         0.998741
## 12      0.000000 0.000355                 0.832990         0.832000
## 13      0.000000 0.000017                 0.715044         0.714197
## 14      0.000000 0.000013                 0.625387         0.624648
## 15      0.000000 0.000011                 0.555708         0.555053
## 16      0.000000 0.000010                 0.500000         0.499411
##    capture_rate cumulative_capture_rate        gain cumulative_gain
## 1      0.022277                0.022277  100.000000      100.000000
## 2      0.019802                0.042079  100.000000      100.000000
## 3      0.019802                0.061881  100.000000      100.000000
## 4      0.019802                0.081683  100.000000      100.000000
## 5      0.024752                0.106436  100.000000      100.000000
## 6      0.101485                0.207921  100.000000      100.000000
## 7      0.106436                0.314356  100.000000      100.000000
## 8      0.086634                0.400990  100.000000      100.000000
## 9      0.200495                0.601485  100.000000      100.000000
## 10     0.198020                0.799505  100.000000      100.000000
## 11     0.200495                1.000000  100.000000      100.000000
## 12     0.000000                1.000000 -100.000000       66.597938
## 13     0.000000                1.000000 -100.000000       43.008850
## 14     0.000000                1.000000 -100.000000       25.077399
## 15     0.000000                1.000000 -100.000000       11.141678
## 16     0.000000                1.000000 -100.000000        0.000000
##    kolmogorov_smirnov
## 1            0.022277
## 2            0.042079
## 3            0.061881
## 4            0.081683
## 5            0.106436
## 6            0.207921
## 7            0.314356
## 8            0.400990
## 9            0.601485
## 10           0.799505
## 11           1.000000
## 12           0.799505
## 13           0.601485
## 14           0.400990
## 15           0.200495
## 16           0.000000

h2o.auc(performance_h2o)

## [1] 1

h2o.confusionMatrix(performance_h2o)

## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.751460448805453:
##        High Low    Error    Rate
## High    404   0 0.000000  =0/404
## Low       0 404 0.000000  =0/404
## Totals  404 404 0.000000  =0/808

Code Along 11

Javony Deleon

2026-04-16

Plotting time series

Static ggplot2 Visualization & customizations

Box plots

Regression plots

Plotting Seasonality and Correlation

Correlation Plots

Seasonality

STL Diagnostics

Time Series Data Wrangling

Summarize by Time

Filter By Time

Padding Data

Sliding (Rolling) Calculation

Model

Save and Load

Make predictions

Evaluate model