library(h2o)
library(dplyr)
library(tidymodels)
library(modeltime.h2o)
library(tidyverse)
library(timetk)
# Read in data: relabel each series id as "store <n>" (dropping the "1_"
# prefix), then keep only the identifier, date and target columns.
data_tbl <- walmart_sales_weekly %>%
  mutate(id = paste("store", str_remove(id, "1_"))) %>%
  select(id, Date, Weekly_Sales)

# Preview the first rows.
head(data_tbl)
## # A tibble: 6 x 3
## id Date Weekly_Sales
## <chr> <date> <dbl>
## 1 store 1 2010-02-05 24924.
## 2 store 1 2010-02-12 46039.
## 3 store 1 2010-02-19 41596.
## 4 store 1 2010-02-26 19404.
## 5 store 1 2010-03-05 21828.
## 6 store 1 2010-03-12 21043.
# Plot data: one static facet per store, two facets per row, without the
# smoother overlay.
# TRUE/FALSE are spelled out because T/F are ordinary variables that can be
# reassigned.
data_tbl %>%
  group_by(id) %>%
  plot_time_series(
    .date_var = Date,
    .value = Weekly_Sales,
    .facet_ncol = 2,
    .smooth = FALSE,
    .interactive = FALSE
  )
# Split the data, assessing on the final 3 months (cumulative = TRUE is passed
# through to time_series_split() as in the original call).
splits <- time_series_split(data_tbl, assess = "3 month", cumulative = TRUE)

# Create recipe: model Weekly_Sales on every other column and apply
# step_timeseries_signature() to the Date column.
recipe_spec <- recipe(Weekly_Sales ~ ., data = training(splits)) %>%
  step_timeseries_signature(Date)

# Prep the recipe once and reuse the prepped object for both bakes, rather
# than re-estimating it for each data set.
recipe_prepped <- prep(recipe_spec)
train_tbl <- bake(recipe_prepped, new_data = training(splits))
test_tbl <- bake(recipe_prepped, new_data = testing(splits))
# Model specs: an H2O AutoML regression specification. The engine arguments
# below are forwarded as-is to the h2o engine.
model_spec <- set_engine(
  automl_reg(mode = "regression"),
  engine = "h2o",
  max_runtime_secs = 10,
  max_runtime_secs_per_model = 10,
  max_models = 10,
  nfolds = 5,
  exclude_algos = c("DeepLearning"),
  verbosity = NULL,
  seed = 786
)

# Print the specification.
model_spec
## H2O AutoML Model Specification (regression)
##
## Engine-Specific Arguments:
## max_runtime_secs = 10
## max_runtime_secs_per_model = 10
## max_models = 10
## nfolds = 5
## exclude_algos = c("DeepLearning")
## verbosity = NULL
## seed = 786
##
## Computational engine: h2o
### Fit model
# Start (or connect to) a local H2O cluster first: the h2o engine needs an
# active connection before fit() can hand the data to AutoML.
h2o.init()

# Train AutoML on the baked training set.
model_fitted <- model_spec %>%
  fit(Weekly_Sales ~ ., data = train_tbl)
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
| | 1%
|
|======== | 11%
|
|=============== | 22%
|
|======================= | 33%
|
|============================== | 43%
|
|====================================== | 54%
|
|============================================= | 64%
|
|==================================================== | 75%
|
|============================================================ | 85%
|
|==================================================================== | 97%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
## model_id mean_residual_deviance
## 1 StackedEnsemble_AllModels_AutoML_20210630_211140 46162977
## 2 StackedEnsemble_BestOfFamily_AutoML_20210630_211140 54623136
## 3 DRF_1_AutoML_20210630_211140 139444042
## 4 GBM_grid__1_AutoML_20210630_211140_model_1 357134113
## 5 GBM_3_AutoML_20210630_211140 593750048
## 6 GBM_1_AutoML_20210630_211140 636007808
## rmse mse mae rmsle
## 1 6794.334 46162977 4414.035 0.1795215
## 2 7390.747 54623136 4968.496 0.2063314
## 3 11808.643 139444042 8286.015 0.3025745
## 4 18897.992 357134113 15727.312 0.5339985
## 5 24366.987 593750048 20577.132 0.6397330
## 6 25219.195 636007808 21368.810 0.6536901
##
## [12 rows x 6 columns]
# Print the fitted parsnip model object to inspect the model AutoML selected.
model_fitted
## parsnip model object
##
## Fit time: 52.8s
##
## H2O AutoML - Stackedensemble
## --------
## Model: Model Details:
## ==============
##
## H2ORegressionModel: stackedensemble
## Model ID: StackedEnsemble_AllModels_AutoML_20210630_211140
## Number of Base Models: 10
##
## Base Models (count by algorithm type):
##
## drf gbm glm
## 2 7 1
##
## Metalearner:
##
## Metalearner algorithm: glm
## Metalearner cross-validation fold assignment:
## Fold assignment scheme: AUTO
## Number of folds: 5
## Fold column: NULL
## Metalearner hyperparameters:
##
##
## H2ORegressionMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 24624542
## RMSE: 4962.312
## MAE: 3179.488
## RMSLE: 0.1414845
## Mean Residual Deviance : 24624542
##
##
##
## H2ORegressionMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 46162977
## RMSE: 6794.334
## MAE: 4414.035
## RMSLE: 0.1795215
## Mean Residual Deviance : 46162977
The top model selected is a stacked ensemble.
# Show the AutoML leaderboard for the fitted model.
model_fitted %>%
  automl_leaderboard()
## # A tibble: 12 x 6
## model_id mean_residual_devi~ rmse mse mae rmsle
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 StackedEnsemble_AllModels_A~ 46162977. 6794. 4.62e7 4414. 0.180
## 2 StackedEnsemble_BestOfFamil~ 54623136. 7391. 5.46e7 4968. 0.206
## 3 DRF_1_AutoML_20210630_211140 139444042. 11809. 1.39e8 8286. 0.303
## 4 GBM_grid__1_AutoML_20210630~ 357134113. 18898. 3.57e8 15727. 0.534
## 5 GBM_3_AutoML_20210630_211140 593750048. 24367. 5.94e8 20577. 0.640
## 6 GBM_1_AutoML_20210630_211140 636007808. 25219. 6.36e8 21369. 0.654
## 7 GBM_4_AutoML_20210630_211140 850921054. 29171. 8.51e8 24792. 0.718
## 8 GBM_grid__1_AutoML_20210630~ 885064668. 29750. 8.85e8 24974. 0.732
## 9 GBM_2_AutoML_20210630_211140 889643548. 29827. 8.90e8 25367. 0.728
## 10 GBM_5_AutoML_20210630_211140 1117993584. 33436. 1.12e9 27973. 0.787
## 11 XRT_1_AutoML_20210630_211140 1121907878. 33495. 1.12e9 23121. 0.791
## 12 GLM_1_AutoML_20210630_211140 1314471111. 36256. 1.31e9 31032. 0.835
# Generate predictions for the held-out test set.
predict(model_fitted, new_data = test_tbl)
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
## # A tibble: 84 x 1
## .pred
## <dbl>
## 1 22536.
## 2 22174.
## 3 37690.
## 4 39388.
## 5 80247.
## 6 82565.
## 7 133096.
## 8 23155.
## 9 22386.
## 10 37495.
## # ... with 74 more rows
After the model is fitted, you can follow the standard modeltime workflow:

- Add fitted models to a Model Table.
- Calibrate the models to a testing set.
- Perform Testing Set Forecast Evaluation & Accuracy Evaluation.
- Refit the models to the Full Dataset & Forecast Forward.

### Create model table
# Register the fitted AutoML model in a modeltime table and print it.
modeltime_tbl <- modeltime_table(model_fitted)

modeltime_tbl
## # Modeltime Table
## # A tibble: 1 x 3
## .model_id .model .model_desc
## <int> <list> <chr>
## 1 1 <fit[+]> H2O AUTOML - STACKEDENSEMBLE
# Calibrate on the test set, forecast over it, and plot the forecast against
# the full actual series, one static facet per store.
modeltime_tbl %>%
  modeltime_calibrate(new_data = test_tbl) %>%
  modeltime_forecast(
    new_data = test_tbl,
    actual_data = data_tbl,
    keep_data = TRUE
  ) %>%
  group_by(id) %>%
  plot_modeltime_forecast(.facet_ncol = 2, .interactive = FALSE)
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
# Recombine the baked training and test sets into the full prepared data set.
data_prepared_tbl <- bind_rows(train_tbl, test_tbl)

# Extend each store's series one year into the future.
future_tbl <- data_prepared_tbl %>%
  group_by(id) %>%
  future_frame(.length_out = "1 year") %>%
  ungroup()

# Apply the recipe to the future frame.
future_prepared_tbl <- bake(prep(recipe_spec), new_data = future_tbl)

# Refitted model table: retrain on the full prepared data set.
refit_tbl <- modeltime_tbl %>%
  modeltime_refit(data = data_prepared_tbl)
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
| | 1%
|
|======== | 11%
|
|=================================================== | 72%
|
|========================================================== | 83%
|
|================================================================= | 93%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
## model_id mean_residual_deviance
## 1 StackedEnsemble_AllModels_AutoML_20210630_211230 45534452
## 2 StackedEnsemble_BestOfFamily_AutoML_20210630_211230 45716831
## 3 DRF_1_AutoML_20210630_211230 127384172
## 4 GBM_1_AutoML_20210630_211230 712885559
## 5 GBM_2_AutoML_20210630_211230 1072638362
## 6 GLM_1_AutoML_20210630_211230 1314752621
## rmse mse mae rmsle
## 1 6747.922 45534452 4254.959 0.1731254
## 2 6761.422 45716831 4260.555 0.1732843
## 3 11286.460 127384172 6573.892 0.2623809
## 4 26699.917 712885559 22694.761 0.6738699
## 5 32751.158 1072638362 27939.187 0.7724833
## 6 36259.518 1314752621 31017.107 0.8289850
##
## [6 rows x 6 columns]
# Forecast the future frame with the refitted model and plot it per store as
# an interactive chart without legend or range slider.
# TRUE/FALSE are spelled out because T/F are ordinary variables that can be
# reassigned.
refit_tbl %>%
  modeltime_forecast(
    new_data = future_prepared_tbl,
    actual_data = data_prepared_tbl,
    keep_data = TRUE
  ) %>%
  group_by(id) %>%
  plot_modeltime_forecast(
    .facet_ncol = 2,
    .interactive = TRUE,
    .legend_show = FALSE,
    .title = "12 Month Forecast - Weekly Sales",
    .plotly_slider = FALSE
  )
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%