# Libraries
#------------
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(timetk)
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:h2o':
##
## day, hour, month, week, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggplot2)
#Data
#-----------------
beer_sales_tbl <- tq_get("S4248SM144NCEN",
get = "economic.data",
from = "2010-01-01",
to = "2018-01-01")
head(beer_sales_tbl)
## # A tibble: 6 × 3
## symbol date price
## <chr> <date> <int>
## 1 S4248SM144NCEN 2010-01-01 6558
## 2 S4248SM144NCEN 2010-02-01 7481
## 3 S4248SM144NCEN 2010-03-01 9475
## 4 S4248SM144NCEN 2010-04-01 9424
## 5 S4248SM144NCEN 2010-05-01 9351
## 6 S4248SM144NCEN 2010-06-01 10552
str(beer_sales_tbl)
## tibble [97 × 3] (S3: tbl_df/tbl/data.frame)
## $ symbol: chr [1:97] "S4248SM144NCEN" "S4248SM144NCEN" "S4248SM144NCEN" "S4248SM144NCEN" ...
## $ date : Date[1:97], format: "2010-01-01" "2010-02-01" ...
## $ price : int [1:97] 6558 7481 9475 9424 9351 10552 9077 9273 9420 9413 ...
summary(beer_sales_tbl$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6558 9475 10884 10718 11700 14428
# Plot the data
#------------------------------------
beer_sales_tbl %>% ggplot(aes(x=date, y=price)) +
geom_line() +
geom_point() +
geom_ma(ma_fun = SMA, n = 12, size = 4) +
annotate('text',
x = ymd('2016-07-01'),
y = 7000,
color = 'black',
label = 'Validation \n Region') +
geom_rect(xmin = as.numeric(ymd('2016-01-01')),
xmax = as.numeric(ymd('2016-12-31')),
ymin = 0,
ymax = Inf,
fill = 'red',
alpha = 0.002) +
annotate('text',
x = ymd('2017-05-01'),
y = 7000,
color = 'black',
label = 'Test \n Region') +
geom_rect(xmin = as.numeric(ymd('2017-01-01')),
xmax = as.numeric(ymd('2018-01-01')),
ymin = 0,
ymax = Inf,
fill = 'blue',
alpha = 0.002) +
annotate('text',
x = ymd('2012-01-01'),
y = 7000,
color = 'black',
label = 'Train \n Region') +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
labs(x = "Date", y = "Sales",
title = "Beer Sales: 2007 through 2018",
subtitle = "Approach for Testing Model's Accuracy: Train, Validation, and Tests Sets Shown") +
theme_bw()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Augment the data with the timeseries signature (calendar-based features)
(beer_sales_tbl <- beer_sales_tbl %>% tk_augment_timeseries_signature())
## tk_augment_timeseries_signature(): Using the following .date_var variable: date
## # A tibble: 97 × 31
## symbol date price index.num diff year year.iso half quarter month
## <chr> <date> <int> <dbl> <dbl> <int> <int> <int> <int> <int>
## 1 S4248S… 2010-01-01 6558 1.26e9 NA 2010 2009 1 1 1
## 2 S4248S… 2010-02-01 7481 1.26e9 2678400 2010 2010 1 1 2
## 3 S4248S… 2010-03-01 9475 1.27e9 2419200 2010 2010 1 1 3
## 4 S4248S… 2010-04-01 9424 1.27e9 2678400 2010 2010 1 2 4
## 5 S4248S… 2010-05-01 9351 1.27e9 2592000 2010 2010 1 2 5
## 6 S4248S… 2010-06-01 10552 1.28e9 2678400 2010 2010 1 2 6
## 7 S4248S… 2010-07-01 9077 1.28e9 2592000 2010 2010 2 3 7
## 8 S4248S… 2010-08-01 9273 1.28e9 2678400 2010 2010 2 3 8
## 9 S4248S… 2010-09-01 9420 1.28e9 2678400 2010 2010 2 3 9
## 10 S4248S… 2010-10-01 9413 1.29e9 2592000 2010 2010 2 4 10
## # ℹ 87 more rows
## # ℹ 21 more variables: month.xts <int>, month.lbl <ord>, day <int>, hour <int>,
## # minute <int>, second <int>, hour12 <int>, am.pm <int>, wday <int>,
## # wday.xts <int>, wday.lbl <ord>, mday <int>, qday <int>, yday <int>,
## # mweek <int>, week <int>, week.iso <int>, week2 <int>, week3 <int>,
## # week4 <int>, mday7 <int>
str(beer_sales_tbl)
## tibble [97 × 31] (S3: tbl_df/tbl/data.frame)
## $ symbol : chr [1:97] "S4248SM144NCEN" "S4248SM144NCEN" "S4248SM144NCEN" "S4248SM144NCEN" ...
## $ date : Date[1:97], format: "2010-01-01" "2010-02-01" ...
## $ price : int [1:97] 6558 7481 9475 9424 9351 10552 9077 9273 9420 9413 ...
## $ index.num: num [1:97] 1.26e+09 1.26e+09 1.27e+09 1.27e+09 1.27e+09 ...
## $ diff : num [1:97] NA 2678400 2419200 2678400 2592000 ...
## $ year : int [1:97] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ year.iso : int [1:97] 2009 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ half : int [1:97] 1 1 1 1 1 1 2 2 2 2 ...
## $ quarter : int [1:97] 1 1 1 2 2 2 3 3 3 4 ...
## $ month : int [1:97] 1 2 3 4 5 6 7 8 9 10 ...
## $ month.xts: int [1:97] 0 1 2 3 4 5 6 7 8 9 ...
## $ month.lbl: Ord.factor w/ 12 levels "enero"<"febrero"<..: 1 2 3 4 5 6 7 8 9 10 ...
## $ day : int [1:97] 1 1 1 1 1 1 1 1 1 1 ...
## $ hour : int [1:97] 0 0 0 0 0 0 0 0 0 0 ...
## $ minute : int [1:97] 0 0 0 0 0 0 0 0 0 0 ...
## $ second : int [1:97] 0 0 0 0 0 0 0 0 0 0 ...
## $ hour12 : int [1:97] 0 0 0 0 0 0 0 0 0 0 ...
## $ am.pm : int [1:97] 1 1 1 1 1 1 1 1 1 1 ...
## $ wday : int [1:97] 6 2 2 5 7 3 5 1 4 6 ...
## $ wday.xts : int [1:97] 5 1 1 4 6 2 4 0 3 5 ...
## $ wday.lbl : Ord.factor w/ 7 levels "domingo"<"lunes"<..: 6 2 2 5 7 3 5 1 4 6 ...
## $ mday : int [1:97] 1 1 1 1 1 1 1 1 1 1 ...
## $ qday : int [1:97] 1 32 60 1 31 62 1 32 63 1 ...
## $ yday : int [1:97] 1 32 60 91 121 152 182 213 244 274 ...
## $ mweek : int [1:97] 0 1 1 1 0 1 1 0 1 0 ...
## $ week : int [1:97] 1 5 9 13 18 22 26 31 35 40 ...
## $ week.iso : int [1:97] 53 5 9 13 17 22 26 30 35 39 ...
## $ week2 : int [1:97] 1 1 1 1 0 0 0 1 1 0 ...
## $ week3 : int [1:97] 1 2 0 1 0 1 2 1 2 1 ...
## $ week4 : int [1:97] 1 1 1 1 2 2 2 3 3 0 ...
## $ mday7 : int [1:97] 1 1 1 1 1 1 1 1 1 1 ...
# Ordered factors need to be converted to character and then back to plain (unordered) factors
library(dplyr)
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
beer_sales_clean <- beer_sales_tbl %>%
select(-date) %>%
select_if(~!any(is.na(.))) %>%
mutate_if(is.ordered, ~ as.character(.) %>% as.factor)
beer_sales_tbl
## # A tibble: 97 × 31
## symbol date price index.num diff year year.iso half quarter month
## <chr> <date> <int> <dbl> <dbl> <int> <int> <int> <int> <int>
## 1 S4248S… 2010-01-01 6558 1.26e9 NA 2010 2009 1 1 1
## 2 S4248S… 2010-02-01 7481 1.26e9 2678400 2010 2010 1 1 2
## 3 S4248S… 2010-03-01 9475 1.27e9 2419200 2010 2010 1 1 3
## 4 S4248S… 2010-04-01 9424 1.27e9 2678400 2010 2010 1 2 4
## 5 S4248S… 2010-05-01 9351 1.27e9 2592000 2010 2010 1 2 5
## 6 S4248S… 2010-06-01 10552 1.28e9 2678400 2010 2010 1 2 6
## 7 S4248S… 2010-07-01 9077 1.28e9 2592000 2010 2010 2 3 7
## 8 S4248S… 2010-08-01 9273 1.28e9 2678400 2010 2010 2 3 8
## 9 S4248S… 2010-09-01 9420 1.28e9 2678400 2010 2010 2 3 9
## 10 S4248S… 2010-10-01 9413 1.29e9 2592000 2010 2010 2 4 10
## # ℹ 87 more rows
## # ℹ 21 more variables: month.xts <int>, month.lbl <ord>, day <int>, hour <int>,
## # minute <int>, second <int>, hour12 <int>, am.pm <int>, wday <int>,
## # wday.xts <int>, wday.lbl <ord>, mday <int>, qday <int>, yday <int>,
## # mweek <int>, week <int>, week.iso <int>, week2 <int>, week3 <int>,
## # week4 <int>, mday7 <int>
train_tbl <- beer_sales_clean %>% filter(year < 2016)
valid_tbl <- beer_sales_clean %>% filter(year == 2016)
test_tbl <- beer_sales_clean %>% filter(year == 2017)
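# Quick sanity check on the split (a small sketch, not in the original text):
# with the monthly series above this should give 72 training rows (2010-2015),
# 12 validation rows (2016) and 12 test rows (2017); the single January 2018
# observation falls outside all three sets.
sapply(list(train = train_tbl, valid = valid_tbl, test = test_tbl), nrow)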
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 35 minutes 4 seconds
## H2O cluster timezone: Europe/Paris
## H2O data parsing timezone: UTC
## H2O cluster version: 3.42.0.2
## H2O cluster version age: 5 months and 8 days
## H2O cluster name: H2O_started_from_R_Usuario_sph802
## H2O cluster total nodes: 1
## H2O cluster total memory: 6.73 GB
## H2O cluster total cores: 20
## H2O cluster allowed cores: 20
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (5 months and 8 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
train_h2o <- as.h2o(train_tbl)
valid_h2o <- as.h2o(valid_tbl)
test_h2o <- as.h2o(test_tbl)
y <- 'price'
x <- names(train_tbl) %>% setdiff(y)
MODEL
# h2o.automl() trains and cross-validates several model families (GLMs, tree
# ensembles, deep learning and stacked ensembles) and ranks them on the
# leaderboard frame; any of the resulting models could be used here.
set.seed(123)
automl_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 60,
stopping_metric = "deviance")
## 18:26:21.567: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 18:26:21.567: AutoML: XGBoost is not available; skipping it.
## 18:26:21.570: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:21.592: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:21.592: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 72.0.
## 18:26:21.593: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:21.738: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:21.889: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.38: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.175: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.295: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.424: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.581: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.735: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.780: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.909: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:13.266: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:13.395: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:20.105: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:20.602: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:20.932: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:21.169: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
# Extract leader model
automl_leader <- automl_models_h2o@leader
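# Before trusting the leader blindly it can help to look at the full AutoML
# leaderboard (a hedged sketch; the leaderboard slot is part of the returned
# H2OAutoML object, but the exact columns shown depend on the h2o version):
head(automl_models_h2o@leaderboard, 10)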
# Generate prediction using h2o.predict()
pred_h2o <- h2o.predict(automl_leader, newdata = test_h2o)
#Investigate error
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ✔ readr 2.1.4 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::day() masks h2o::day()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks xts::first()
## ✖ lubridate::hour() masks h2o::hour()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks xts::last()
## ✖ lubridate::month() masks h2o::month()
## ✖ lubridate::week() masks h2o::week()
## ✖ lubridate::year() masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
error_tbl <- beer_sales_tbl %>%
filter(lubridate::year(date) == 2017) %>%
add_column(pred = pred_h2o %>% as.vector()) %>%
rename(actual = price) %>%
mutate(
error = actual - pred,
error_pct = error / actual
) %>%
select(date, actual, pred, error, error_pct)
error_tbl
## # A tibble: 12 × 5
## date actual pred error error_pct
## <date> <int> <dbl> <dbl> <dbl>
## 1 2017-01-01 8870 9335. -465. -0.0524
## 2 2017-02-01 10251 10326. -75.0 -0.00732
## 3 2017-03-01 12241 11204. 1037. 0.0847
## 4 2017-04-01 11266 11300. -34.4 -0.00306
## 5 2017-05-01 13275 13439. -164. -0.0123
## 6 2017-06-01 14428 13743. 685. 0.0475
## 7 2017-07-01 11165 11541. -376. -0.0337
## 8 2017-08-01 13098 12865. 233. 0.0178
## 9 2017-09-01 11619 11872. -253. -0.0217
## 10 2017-10-01 12386 12864. -478. -0.0386
## 11 2017-11-01 12904 12796. 108. 0.00840
## 12 2017-12-01 13859 14368. -509. -0.0368
#Error function
f_error <- function(data){
data %>% summarise(
n=length(error),
mean = mean(error),
var = sum((error-mean)^2)/(n-1),
std = sqrt(var),
mae = mean(abs(error)),
rmse = mean(error^2)^0.5,
mape = mean(abs(error_pct)),
mpe = mean(error_pct),
skew = sum(((error - mean)/std)^3)/n,
kurtosis = sum(((error - mean)/std)^4)/n - 3
)
}
error_tbl %>% f_error()
## # A tibble: 1 × 10
## n mean var std mae rmse mape mpe skew kurtosis
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 -24.3 231481. 481. 368. 461. 0.0304 -0.00396 0.912 -0.390
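# As a cross-check on the hand-rolled f_error() summary, H2O can compute its
# own hold-out metrics for the leader (a sketch; h2o.performance() with a
# newdata frame is standard usage, and the metrics printed depend on the
# leader's model type):
h2o.performance(automl_leader, newdata = test_h2o)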
# Create spooky dark theme:
theme_spooky = function(base_size = 10, base_family = "Chiller") {
theme_grey(base_size = base_size, base_family = base_family) %+replace%
theme(
# Specify axis options
axis.line = element_blank(),
axis.text.x = element_text(size = base_size*0.8, color = "white", lineheight = 0.9),
axis.text.y = element_text(size = base_size*0.8, color = "white", lineheight = 0.9),
axis.ticks = element_line(color = "white", size = 0.2),
axis.title.x = element_text(size = base_size, color = "white", margin = margin(0, 10, 0, 0)),
axis.title.y = element_text(size = base_size, color = "white", angle = 90, margin = margin(0, 10, 0, 0)),
axis.ticks.length = unit(0.3, "lines"),
# Specify legend options
legend.background = element_rect(color = NA, fill = "gray10"),
legend.key = element_rect(color = "white", fill = "gray10"),
legend.key.size = unit(1.2, "lines"),
legend.key.height = NULL,
legend.key.width = NULL,
legend.text = element_text(size = base_size*0.8, color = "white"),
legend.title = element_text(size = base_size*0.8, face = "bold", hjust = 0, color = "white"),
legend.position = "none",
legend.text.align = NULL,
legend.title.align = NULL,
legend.direction = "vertical",
legend.box = NULL,
# Specify panel options
panel.background = element_rect(fill = "gray10", color = NA),
#panel.border = element_rect(fill = NA, color = "white"),
panel.border = element_blank(),
panel.grid.major = element_line(color = "grey35"),
panel.grid.minor = element_line(color = "grey20"),
panel.spacing = unit(0.5, "lines"),
# Specify facetting options
strip.background = element_rect(fill = "grey30", color = "grey10"),
strip.text.x = element_text(size = base_size*0.8, color = "white"),
strip.text.y = element_text(size = base_size*0.8, color = "white",angle = -90),
# Specify plot options
plot.background = element_rect(color = "gray10", fill = "gray10"),
plot.title = element_text(size = base_size*1.2, color = "white",hjust=0,lineheight=1.25,
margin=margin(2,2,2,2)),
plot.subtitle = element_text(size = base_size*1, color = "white",hjust=0, margin=margin(2,2,2,2)),
plot.caption = element_text(size = base_size*0.8, color = "white",hjust=0),
plot.margin = unit(rep(1, 4), "lines")
)
}
#Load chiller fontset to use in theme_spooky()
library(extrafont)
## Registering fonts with R
loadfonts(device="win")
beer_sales_tbl %>%
ggplot(aes(x = date, y = price)) +
# Data - Spooky Orange
geom_point(size = 2, color = "gray", alpha = 0.5, shape = 21, fill = "orange") +
geom_line(color = "orange", size = 0.5) +
geom_ma(n = 12, color = "white") +
# Predictions - Spooky Purple
geom_point(aes(y = pred), size = 2, color = "gray", alpha = 1, shape = 21, fill = "purple", data = error_tbl) +
geom_line(aes(y = pred), color = "purple", size = 0.5, data = error_tbl) +
# Aesthetics
theme_spooky(base_size = 20) +
labs(
title = "Beer Sales Forecast: h2o + timetk",
subtitle = "H2O had highest accuracy, MAPE = 4.5%",
caption="Practice with H2o timeseries forecast"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
ARIMA FORECAST
#Load packages
library(sweep) #Broom-style tidiers for the forecast package
library(forecast) #Forecasting models such as ARIMA, ETS, etc. and prediction methods
library(tidyquant) #Loads the tidyverse and financial data tools
library(timetk) #Functions for working with time series data
#Get data again
beer_sales_tbl <- tq_get("S4248SM144NCEN",
get = "economic.data",
from = "2010-01-01",
to = "2018-01-01")
#Convert data from tbl into ts objects (filter the rows first so the ts dates line up)
train_ts <- timetk::tk_ts(beer_sales_tbl %>% filter(date <= "2017-01-01"), start = 2010, freq = 12)
## Warning: Non-numeric columns being dropped: symbol, date
test_ts <- timetk::tk_ts(beer_sales_tbl %>% filter(date > "2017-01-01"), start = c(2017, 2), freq = 12)
## Warning: Non-numeric columns being dropped: symbol, date
#Build model ARIMA
fit_arima <- auto.arima(train_ts)
summary(fit_arima)
## Series: train_ts
## ARIMA(3,0,0)(0,1,1)[12] with drift
##
## Coefficients:
## ar1 ar2 ar3 sma1 drift
## -0.2663 0.0933 0.6077 -0.6257 33.5604
## s.e. 0.0948 0.1008 0.0966 0.4080 3.3790
##
## sigma^2 = 153862: log likelihood = -540.64
## AIC=1093.28 AICc=1094.56 BIC=1107.03
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 8.227108 350.8415 248.643 -0.04664823 2.330124 0.4558132
## ACF1
## Training set -0.003628103
# Check for actual, fitted and resid value
sw_augment(fit_arima, timetk_idx = TRUE)
## Warning in .check_tzones(e1, e2): 'tzone' attributes are inconsistent
## # A tibble: 85 × 4
## index .actual .fitted .resid
## <date> <dbl> <dbl> <dbl>
## 1 2010-01-01 6558 6551. 6.52
## 2 2010-02-01 7481 7474. 7.41
## 3 2010-03-01 9475 9466. 9.37
## 4 2010-04-01 9424 9415. 9.29
## 5 2010-05-01 9351 9342. 9.18
## 6 2010-06-01 10552 10542. 10.4
## 7 2010-07-01 9077 9068. 8.84
## 8 2010-08-01 9273 9264. 9.00
## 9 2010-09-01 9420 9411. 9.12
## 10 2010-10-01 9413 9404. 9.08
## # ℹ 75 more rows
#Visualize residual
sw_augment(fit_arima, timetk_idx = TRUE) %>%
ggplot(aes(x = index, y = .resid)) +
geom_point(colour = 'blue', size = 2, stroke = 1, shape = 21) +
geom_line() +
geom_hline(yintercept = 0, color = "red") +
labs(title = "Residual diagnostic") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
theme_classic()
## Warning in .check_tzones(e1, e2): 'tzone' attributes are inconsistent
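# A formal complement to the visual residual check (a sketch, not from the
# original text): a Ljung-Box test on the ARIMA residuals, with fitdf = 5 to
# account for the five estimated coefficients (ar1-ar3, sma1, drift).
Box.test(residuals(fit_arima), lag = 24, type = "Ljung-Box", fitdf = 5)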
# Forecast in next 12 months
fcast_arima <- forecast(fit_arima, h=12)
#Get forecast value by using sw_sweep()
fcast_tbl <- sw_sweep(fcast_arima, timetk_idx = TRUE)
## Warning in .check_tzones(e1, e2): 'tzone' attributes are inconsistent
fcast_tbl
## # A tibble: 97 × 7
## index key price lo.80 lo.95 hi.80 hi.95
## <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010-01-01 actual 6558 NA NA NA NA
## 2 2010-02-01 actual 7481 NA NA NA NA
## 3 2010-03-01 actual 9475 NA NA NA NA
## 4 2010-04-01 actual 9424 NA NA NA NA
## 5 2010-05-01 actual 9351 NA NA NA NA
## 6 2010-06-01 actual 10552 NA NA NA NA
## 7 2010-07-01 actual 9077 NA NA NA NA
## 8 2010-08-01 actual 9273 NA NA NA NA
## 9 2010-09-01 actual 9420 NA NA NA NA
## 10 2010-10-01 actual 9413 NA NA NA NA
## # ℹ 87 more rows
#Get actual data in test time
actual_tbl <- beer_sales_tbl %>% filter(date >= '2017-01-01')
#Visualize with actual value
fcast_tbl %>%
ggplot(aes(x = index, y = price, color = key)) +
geom_line() +
geom_point() +
#Ribbon 95%
geom_ribbon(aes(ymin = lo.95, ymax = hi.95),
fill = '#596DD5', alpha = 0.8, size = 0) +
#Actual data
geom_line(data = actual_tbl, aes(x = date, y = price), color = 'black') +
geom_point(data = actual_tbl, aes(x = date, y = price), color = 'red') +
labs(title = "Beer Sales Forecast: ARIMA", x = "", y = "Thousands of Tons",
subtitle = "sw_sweep tidies the auto.arima() forecast output") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
scale_color_tq() +
scale_fill_tq()
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
#Calculate Error
error_arima <- fcast_tbl %>% filter(key == 'forecast') %>%
left_join(actual_tbl, by = c("index" = "date")) %>%
mutate(date = index, actual = price.y, pred = price.x) %>%
select(date, actual, pred) %>%
mutate(error = actual - pred,
error_pct = error/actual)
error_arima
## # A tibble: 12 × 5
## date actual pred error error_pct
## <date> <int> <dbl> <dbl> <dbl>
## 1 2017-02-01 10251 10964. -713. -0.0696
## 2 2017-03-01 12241 11665. 576. 0.0471
## 3 2017-04-01 11266 11660. -394. -0.0350
## 4 2017-05-01 13275 13031. 244. 0.0184
## 5 2017-06-01 14428 13279. 1149. 0.0796
## 6 2017-07-01 11165 11873. -708. -0.0634
## 7 2017-08-01 13098 12751. 347. 0.0265
## 8 2017-09-01 11619 12130. -511. -0.0440
## 9 2017-10-01 12386 12570. -184. -0.0148
## 10 2017-11-01 12904 12702. 202. 0.0156
## 11 2017-12-01 13859 14363. -504. -0.0364
## 12 2018-01-01 9248 9618. -370. -0.0400
f_error(error_arima)
## # A tibble: 1 × 10
## n mean var std mae rmse mape mpe skew kurtosis
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 -72.2 332822. 577. 492. 557. 0.0409 -0.00966 0.639 -0.843
LINEAR REGRESSION
library(timetk)
library(tidyquant)
#Add the timeseries signature features to the dataframe
#Get data again
beer_sales_tbl <- tq_get("S4248SM144NCEN",
get = "economic.data",
from = "2010-01-01",
to = "2018-01-01")
beer_sales_tbl <- beer_sales_tbl %>%
tk_augment_timeseries_signature()
## tk_augment_timeseries_signature(): Using the following .date_var variable: date
beer_sales_tbl
## # A tibble: 97 × 31
## symbol date price index.num diff year year.iso half quarter month
## <chr> <date> <int> <dbl> <dbl> <int> <int> <int> <int> <int>
## 1 S4248S… 2010-01-01 6558 1.26e9 NA 2010 2009 1 1 1
## 2 S4248S… 2010-02-01 7481 1.26e9 2678400 2010 2010 1 1 2
## 3 S4248S… 2010-03-01 9475 1.27e9 2419200 2010 2010 1 1 3
## 4 S4248S… 2010-04-01 9424 1.27e9 2678400 2010 2010 1 2 4
## 5 S4248S… 2010-05-01 9351 1.27e9 2592000 2010 2010 1 2 5
## 6 S4248S… 2010-06-01 10552 1.28e9 2678400 2010 2010 1 2 6
## 7 S4248S… 2010-07-01 9077 1.28e9 2592000 2010 2010 2 3 7
## 8 S4248S… 2010-08-01 9273 1.28e9 2678400 2010 2010 2 3 8
## 9 S4248S… 2010-09-01 9420 1.28e9 2678400 2010 2010 2 3 9
## 10 S4248S… 2010-10-01 9413 1.29e9 2592000 2010 2010 2 4 10
## # ℹ 87 more rows
## # ℹ 21 more variables: month.xts <int>, month.lbl <ord>, day <int>, hour <int>,
## # minute <int>, second <int>, hour12 <int>, am.pm <int>, wday <int>,
## # wday.xts <int>, wday.lbl <ord>, mday <int>, qday <int>, yday <int>,
## # mweek <int>, week <int>, week.iso <int>, week2 <int>, week3 <int>,
## # week4 <int>, mday7 <int>
train_lm <- beer_sales_tbl %>% filter(date <= '2017-01-01')
test_lm <- beer_sales_tbl %>% filter(date <= '2018-01-01' & date > '2017-01-01')
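# A minimal sketch of the lm() step this split prepares (assumed, not part of
# the original text): a simple specification with a linear trend (index.num)
# and monthly seasonality (month.lbl), scored on the 2017 hold-out with the
# f_error() helper defined earlier. Richer feature sets are of course possible.
fit_lm <- lm(price ~ index.num + month.lbl, data = train_lm)
error_lm <- test_lm %>%
  transmute(date,
            actual = price,
            pred = predict(fit_lm, newdata = test_lm),
            error = actual - pred,
            error_pct = error / actual)
f_error(error_lm)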
CONCLUSION
It is not true that the accuracy of time series machine learning is always superior to ARIMA, GARCH or other forecasting techniques; it depends on the structure and characteristics of the data. When the series genuinely follows a form that a classical model captures well, that model can match or beat the machine learning approach. In this particular run the H2O AutoML leader reached a test MAPE of about 3.0% against about 4.1% for auto.arima, and a linear regression on the same timeseries-signature features (see the sketch above) can be scored the same way; that ranking is specific to this series and split and could easily reverse on other data. The only reliable way to find the best model for a given dataset is to train and compare several of them. Accuracy also depends not only on the algorithm but on how well the predictors we choose actually relate to the target. Finally, never place absolute confidence in any single algorithm: back-test your models regularly, because time series are constrained by changing data structure, and an algorithm that is strong in one period may be weak in another. Be ready to switch algorithms when the current one stops being accurate.