H2O Time Series Forecast

Packages

library(h2o)
library(dplyr)
library(tidymodels)
library(modeltime.h2o)
library(tidyverse)
library(timetk)

Data

# Load the weekly Walmart sales sample: relabel every series id as
# "store <id with its first "1_" removed>", then keep only the id, date and
# sales columns.
# NOTE(review): confirm that "store" is the right label for what remains of
# the id once "1_" is stripped.
data_tbl <- walmart_sales_weekly %>%
  mutate(id = paste("store", str_remove(id, "1_"))) %>%
  select(id, Date, Weekly_Sales)

# Preview the first rows
data_tbl %>% head()

## # A tibble: 6 x 3
##   id      Date       Weekly_Sales
##   <chr>   <date>            <dbl>
## 1 store 1 2010-02-05       24924.
## 2 store 1 2010-02-12       46039.
## 3 store 1 2010-02-19       41596.
## 4 store 1 2010-02-26       19404.
## 5 store 1 2010-03-05       21828.
## 6 store 1 2010-03-12       21043.

# Plot weekly sales per series id in a two-column facet grid, with the
# smoother and interactivity switched off.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
data_tbl %>%
  group_by(id) %>%
  plot_time_series(
    .date_var    = Date,
    .value       = Weekly_Sales,
    .facet_ncol  = 2,
    .smooth      = FALSE,
    .interactive = FALSE
  )

Create Test Data and Recipe

# Split the data, holding out an assessment window of 3 months
splits <- time_series_split(data_tbl, assess = "3 month", cumulative = TRUE)

# Recipe: predict Weekly_Sales from all other columns, with the time series
# signature step applied to Date
recipe_spec <- recipe(Weekly_Sales ~ ., data = training(splits)) %>%
  step_timeseries_signature(Date)

# Prep the recipe once and reuse it for both partitions rather than
# re-prepping it inside every bake() call.
recipe_prepped <- prep(recipe_spec)

train_tbl <- bake(recipe_prepped, new_data = training(splits))
test_tbl  <- bake(recipe_prepped, new_data = testing(splits))

h2o model specifications

# AutoML regression specification, built in two steps: the model type first,
# then the h2o engine with its engine-specific arguments. Argument order is
# kept so the printed specification is unchanged.
automl_spec <- automl_reg(mode = "regression")

model_spec <- set_engine(
  automl_spec,
  engine                     = "h2o",
  max_runtime_secs           = 10,
  max_runtime_secs_per_model = 10,
  max_models                 = 10,
  nfolds                     = 5,
  exclude_algos              = c("DeepLearning"),
  verbosity                  = NULL,
  seed                       = 786
)

# Print the specification
model_spec

## H2O AutoML Model Specification (regression)
## 
## Engine-Specific Arguments:
##   max_runtime_secs = 10
##   max_runtime_secs_per_model = 10
##   max_models = 10
##   nfolds = 5
##   exclude_algos = c("DeepLearning")
##   verbosity = NULL
##   seed = 786
## 
## Computational engine: h2o

### Fit model

# Start (or connect to) an H2O cluster before fitting. Nothing earlier in this
# document does so, and the h2o engine needs an active connection.
h2o.init()

# Fit the AutoML specification on the prepared training data
model_fitted <- model_spec %>%
  fit(Weekly_Sales ~ ., data = train_tbl)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
##                                              model_id mean_residual_deviance
## 1    StackedEnsemble_AllModels_AutoML_20210630_211140               46162977
## 2 StackedEnsemble_BestOfFamily_AutoML_20210630_211140               54623136
## 3                        DRF_1_AutoML_20210630_211140              139444042
## 4          GBM_grid__1_AutoML_20210630_211140_model_1              357134113
## 5                        GBM_3_AutoML_20210630_211140              593750048
## 6                        GBM_1_AutoML_20210630_211140              636007808
##        rmse       mse       mae     rmsle
## 1  6794.334  46162977  4414.035 0.1795215
## 2  7390.747  54623136  4968.496 0.2063314
## 3 11808.643 139444042  8286.015 0.3025745
## 4 18897.992 357134113 15727.312 0.5339985
## 5 24366.987 593750048 20577.132 0.6397330
## 6 25219.195 636007808 21368.810 0.6536901
## 
## [12 rows x 6 columns]

# Print the fitted parsnip model object
model_fitted

## parsnip model object
## 
## Fit time:  52.8s 
## 
## H2O AutoML - Stackedensemble
## --------
## Model: Model Details:
## ==============
## 
## H2ORegressionModel: stackedensemble
## Model ID:  StackedEnsemble_AllModels_AutoML_20210630_211140 
## Number of Base Models: 10
## 
## Base Models (count by algorithm type):
## 
## drf gbm glm 
##   2   7   1 
## 
## Metalearner:
## 
## Metalearner algorithm: glm
## Metalearner cross-validation fold assignment:
##   Fold assignment scheme: AUTO
##   Number of folds: 5
##   Fold column: NULL
## Metalearner hyperparameters: 
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on training data. **
## 
## MSE:  24624542
## RMSE:  4962.312
## MAE:  3179.488
## RMSLE:  0.1414845
## Mean Residual Deviance :  24624542
## 
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  46162977
## RMSE:  6794.334
## MAE:  4414.035
## RMSLE:  0.1795215
## Mean Residual Deviance :  46162977

Model Leaderboard

The top model selected by AutoML is a stacked ensemble.

# Show the AutoML leaderboard for the fitted model
model_fitted %>%
  automl_leaderboard()

## # A tibble: 12 x 6
##    model_id                     mean_residual_devi~   rmse      mse    mae rmsle
##    <chr>                                      <dbl>  <dbl>    <dbl>  <dbl> <dbl>
##  1 StackedEnsemble_AllModels_A~           46162977.  6794.   4.62e7  4414. 0.180
##  2 StackedEnsemble_BestOfFamil~           54623136.  7391.   5.46e7  4968. 0.206
##  3 DRF_1_AutoML_20210630_211140          139444042. 11809.   1.39e8  8286. 0.303
##  4 GBM_grid__1_AutoML_20210630~          357134113. 18898.   3.57e8 15727. 0.534
##  5 GBM_3_AutoML_20210630_211140          593750048. 24367.   5.94e8 20577. 0.640
##  6 GBM_1_AutoML_20210630_211140          636007808. 25219.   6.36e8 21369. 0.654
##  7 GBM_4_AutoML_20210630_211140          850921054. 29171.   8.51e8 24792. 0.718
##  8 GBM_grid__1_AutoML_20210630~          885064668. 29750.   8.85e8 24974. 0.732
##  9 GBM_2_AutoML_20210630_211140          889643548. 29827.   8.90e8 25367. 0.728
## 10 GBM_5_AutoML_20210630_211140         1117993584. 33436.   1.12e9 27973. 0.787
## 11 XRT_1_AutoML_20210630_211140         1121907878. 33495.   1.12e9 23121. 0.791
## 12 GLM_1_AutoML_20210630_211140         1314471111. 36256.   1.31e9 31032. 0.835

Predict on test data

# Generate predictions for the prepared test set
predict(model_fitted, new_data = test_tbl)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

## # A tibble: 84 x 1
##      .pred
##      <dbl>
##  1  22536.
##  2  22174.
##  3  37690.
##  4  39388.
##  5  80247.
##  6  82565.
##  7 133096.
##  8  23155.
##  9  22386.
## 10  37495.
## # ... with 74 more rows

After the model is fitted, the standard modeltime workflow applies:

- Add fitted models to a Model Table.
- Calibrate the models to a testing set.
- Perform Testing Set Forecast Evaluation & Accuracy Evaluation.
- Refit the models to the Full Dataset & Forecast Forward.

### Create model table

# Register the fitted model in a modeltime table
modeltime_tbl <- modeltime_table(model_fitted)

# Print the table
modeltime_tbl

## # Modeltime Table
## # A tibble: 1 x 3
##   .model_id .model   .model_desc                 
##       <int> <list>   <chr>                       
## 1         1 <fit[+]> H2O AUTOML - STACKEDENSEMBLE

Evaluate performance on test data and calibrate model

# Step 1: calibrate the model table on the test set
calibration_tbl <- modeltime_calibrate(modeltime_tbl, new_data = test_tbl)

# Step 2: forecast over the test set, keeping the original data columns so
# the result can be grouped by id
test_forecast_tbl <- modeltime_forecast(
  calibration_tbl,
  new_data    = test_tbl,
  actual_data = data_tbl,
  keep_data   = TRUE
)

# Step 3: plot the forecast per id in a two-column, non-interactive facet grid
test_forecast_tbl %>%
  group_by(id) %>%
  plot_modeltime_forecast(.facet_ncol = 2, .interactive = FALSE)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

Refit to full data and forecast next 12 months

# Stack the prepared training and test sets back into one table
data_prepared_tbl <- bind_rows(train_tbl, test_tbl)

# Build a future frame of length "1 year" for each id
future_tbl <- data_prepared_tbl %>%
  group_by(id) %>%
  future_frame(.length_out = "1 year") %>%
  ungroup()

# Run the future frame through the same recipe used for the training data
future_prepared_tbl <- bake(prep(recipe_spec), new_data = future_tbl)

# Refit the model table on the full prepared data
refit_tbl <- modeltime_refit(modeltime_tbl, data = data_prepared_tbl)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
##                                              model_id mean_residual_deviance
## 1    StackedEnsemble_AllModels_AutoML_20210630_211230               45534452
## 2 StackedEnsemble_BestOfFamily_AutoML_20210630_211230               45716831
## 3                        DRF_1_AutoML_20210630_211230              127384172
## 4                        GBM_1_AutoML_20210630_211230              712885559
## 5                        GBM_2_AutoML_20210630_211230             1072638362
## 6                        GLM_1_AutoML_20210630_211230             1314752621
##        rmse        mse       mae     rmsle
## 1  6747.922   45534452  4254.959 0.1731254
## 2  6761.422   45716831  4260.555 0.1732843
## 3 11286.460  127384172  6573.892 0.2623809
## 4 26699.917  712885559 22694.761 0.6738699
## 5 32751.158 1072638362 27939.187 0.7724833
## 6 36259.518 1314752621 31017.107 0.8289850
## 
## [6 rows x 6 columns]

Plot Forecast

# Forecast over the future frame with the refitted model and plot the result
# per id as an interactive two-column facet grid, with the legend and the
# plotly slider switched off.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
refit_tbl %>%
  modeltime_forecast(
    new_data    = future_prepared_tbl,
    actual_data = data_prepared_tbl,
    keep_data   = TRUE
  ) %>%
  group_by(id) %>%
  plot_modeltime_forecast(
    .facet_ncol    = 2,
    .interactive   = TRUE,
    .legend_show   = FALSE,
    .title         = "12 Month Forecast - Weekly Sales",
    .plotly_slider = FALSE
  )

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%