# Libraries
#------------
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(timetk)
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:h2o':
##
## day, hour, month, week, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggplot2)
#Data
#-----------------
beer_sales_tbl <- tq_get("S4248SM144NCEN",
get = "economic.data",
from = "2010-01-01",
to = "2018-01-01")
head(beer_sales_tbl)
## # A tibble: 6 × 3
## symbol date price
## <chr> <date> <int>
## 1 S4248SM144NCEN 2010-01-01 6558
## 2 S4248SM144NCEN 2010-02-01 7481
## 3 S4248SM144NCEN 2010-03-01 9475
## 4 S4248SM144NCEN 2010-04-01 9424
## 5 S4248SM144NCEN 2010-05-01 9351
## 6 S4248SM144NCEN 2010-06-01 10552
str(beer_sales_tbl)
## tibble [97 × 3] (S3: tbl_df/tbl/data.frame)
## $ symbol: chr [1:97] "S4248SM144NCEN" "S4248SM144NCEN" "S4248SM144NCEN" "S4248SM144NCEN" ...
## $ date : Date[1:97], format: "2010-01-01" "2010-02-01" ...
## $ price : int [1:97] 6558 7481 9475 9424 9351 10552 9077 9273 9420 9413 ...
summary(beer_sales_tbl$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6558 9475 10884 10718 11700 14428
# Plot the data
#------------------------------------
beer_sales_tbl %>% ggplot(aes(x=date, y=price)) +
geom_line() +
geom_point() +
geom_ma(ma_fun = SMA, n = 12, size = 4) +
annotate('text',
x = ymd('2016-07-01'),
y = 7000,
color = 'black',
label = 'Validation \n Region') +
geom_rect(xmin = as.numeric(ymd('2016-01-01')),
xmax = as.numeric(ymd('2016-12-31')),
ymin = 0,
ymax = Inf,
fill = 'red',
alpha = 0.002) +
annotate('text',
x = ymd('2017-05-01'),
y = 7000,
color = 'black',
label = 'Test \n Region') +
geom_rect(xmin = as.numeric(ymd('2017-01-01')),
xmax = as.numeric(ymd('2018-01-01')),
ymin = 0,
ymax = Inf,
fill = 'blue',
alpha = 0.002) +
annotate('text',
x = ymd('2012-01-01'),
y = 7000,
color = 'black',
label = 'Train \n Region') +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
labs(x = "Date", y = "Sales",
title = "Beer Sales: 2007 through 2018",
subtitle = "Approach for Testing Model's Accuracy: Train, Validation, and Tests Sets Shown") +
theme_bw()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Augment the data with the timeseries signature (calendar-based features)
(beer_sales_tbl <- beer_sales_tbl %>% tk_augment_timeseries_signature())
## tk_augment_timeseries_signature(): Using the following .date_var variable: date
## # A tibble: 97 × 31
## symbol date price index.num diff year year.iso half quarter month
## <chr> <date> <int> <dbl> <dbl> <int> <int> <int> <int> <int>
## 1 S4248S… 2010-01-01 6558 1.26e9 NA 2010 2009 1 1 1
## 2 S4248S… 2010-02-01 7481 1.26e9 2678400 2010 2010 1 1 2
## 3 S4248S… 2010-03-01 9475 1.27e9 2419200 2010 2010 1 1 3
## 4 S4248S… 2010-04-01 9424 1.27e9 2678400 2010 2010 1 2 4
## 5 S4248S… 2010-05-01 9351 1.27e9 2592000 2010 2010 1 2 5
## 6 S4248S… 2010-06-01 10552 1.28e9 2678400 2010 2010 1 2 6
## 7 S4248S… 2010-07-01 9077 1.28e9 2592000 2010 2010 2 3 7
## 8 S4248S… 2010-08-01 9273 1.28e9 2678400 2010 2010 2 3 8
## 9 S4248S… 2010-09-01 9420 1.28e9 2678400 2010 2010 2 3 9
## 10 S4248S… 2010-10-01 9413 1.29e9 2592000 2010 2010 2 4 10
## # ℹ 87 more rows
## # ℹ 21 more variables: month.xts <int>, month.lbl <ord>, day <int>, hour <int>,
## # minute <int>, second <int>, hour12 <int>, am.pm <int>, wday <int>,
## # wday.xts <int>, wday.lbl <ord>, mday <int>, qday <int>, yday <int>,
## # mweek <int>, week <int>, week.iso <int>, week2 <int>, week3 <int>,
## # week4 <int>, mday7 <int>
str(beer_sales_tbl)
## tibble [97 × 31] (S3: tbl_df/tbl/data.frame)
## $ symbol : chr [1:97] "S4248SM144NCEN" "S4248SM144NCEN" "S4248SM144NCEN" "S4248SM144NCEN" ...
## $ date : Date[1:97], format: "2010-01-01" "2010-02-01" ...
## $ price : int [1:97] 6558 7481 9475 9424 9351 10552 9077 9273 9420 9413 ...
## $ index.num: num [1:97] 1.26e+09 1.26e+09 1.27e+09 1.27e+09 1.27e+09 ...
## $ diff : num [1:97] NA 2678400 2419200 2678400 2592000 ...
## $ year : int [1:97] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ year.iso : int [1:97] 2009 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ half : int [1:97] 1 1 1 1 1 1 2 2 2 2 ...
## $ quarter : int [1:97] 1 1 1 2 2 2 3 3 3 4 ...
## $ month : int [1:97] 1 2 3 4 5 6 7 8 9 10 ...
## $ month.xts: int [1:97] 0 1 2 3 4 5 6 7 8 9 ...
## $ month.lbl: Ord.factor w/ 12 levels "enero"<"febrero"<..: 1 2 3 4 5 6 7 8 9 10 ...
## $ day : int [1:97] 1 1 1 1 1 1 1 1 1 1 ...
## $ hour : int [1:97] 0 0 0 0 0 0 0 0 0 0 ...
## $ minute : int [1:97] 0 0 0 0 0 0 0 0 0 0 ...
## $ second : int [1:97] 0 0 0 0 0 0 0 0 0 0 ...
## $ hour12 : int [1:97] 0 0 0 0 0 0 0 0 0 0 ...
## $ am.pm : int [1:97] 1 1 1 1 1 1 1 1 1 1 ...
## $ wday : int [1:97] 6 2 2 5 7 3 5 1 4 6 ...
## $ wday.xts : int [1:97] 5 1 1 4 6 2 4 0 3 5 ...
## $ wday.lbl : Ord.factor w/ 7 levels "domingo"<"lunes"<..: 6 2 2 5 7 3 5 1 4 6 ...
## $ mday : int [1:97] 1 1 1 1 1 1 1 1 1 1 ...
## $ qday : int [1:97] 1 32 60 1 31 62 1 32 63 1 ...
## $ yday : int [1:97] 1 32 60 91 121 152 182 213 244 274 ...
## $ mweek : int [1:97] 0 1 1 1 0 1 1 0 1 0 ...
## $ week : int [1:97] 1 5 9 13 18 22 26 31 35 40 ...
## $ week.iso : int [1:97] 53 5 9 13 17 22 26 30 35 39 ...
## $ week2 : int [1:97] 1 1 1 1 0 0 0 1 1 0 ...
## $ week3 : int [1:97] 1 2 0 1 0 1 2 1 2 1 ...
## $ week4 : int [1:97] 1 1 1 1 2 2 2 3 3 0 ...
## $ mday7 : int [1:97] 1 1 1 1 1 1 1 1 1 1 ...
# Ordered factors need to be converted to character and then back to plain (unordered) factors
library(dplyr)
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
beer_sales_clean <- beer_sales_tbl %>%
select(-date) %>%
select_if(~!any(is.na(.))) %>%
mutate_if(is.ordered, ~ as.character(.) %>% as.factor)
beer_sales_tbl
## # A tibble: 97 × 31
## symbol date price index.num diff year year.iso half quarter month
## <chr> <date> <int> <dbl> <dbl> <int> <int> <int> <int> <int>
## 1 S4248S… 2010-01-01 6558 1.26e9 NA 2010 2009 1 1 1
## 2 S4248S… 2010-02-01 7481 1.26e9 2678400 2010 2010 1 1 2
## 3 S4248S… 2010-03-01 9475 1.27e9 2419200 2010 2010 1 1 3
## 4 S4248S… 2010-04-01 9424 1.27e9 2678400 2010 2010 1 2 4
## 5 S4248S… 2010-05-01 9351 1.27e9 2592000 2010 2010 1 2 5
## 6 S4248S… 2010-06-01 10552 1.28e9 2678400 2010 2010 1 2 6
## 7 S4248S… 2010-07-01 9077 1.28e9 2592000 2010 2010 2 3 7
## 8 S4248S… 2010-08-01 9273 1.28e9 2678400 2010 2010 2 3 8
## 9 S4248S… 2010-09-01 9420 1.28e9 2678400 2010 2010 2 3 9
## 10 S4248S… 2010-10-01 9413 1.29e9 2592000 2010 2010 2 4 10
## # ℹ 87 more rows
## # ℹ 21 more variables: month.xts <int>, month.lbl <ord>, day <int>, hour <int>,
## # minute <int>, second <int>, hour12 <int>, am.pm <int>, wday <int>,
## # wday.xts <int>, wday.lbl <ord>, mday <int>, qday <int>, yday <int>,
## # mweek <int>, week <int>, week.iso <int>, week2 <int>, week3 <int>,
## # week4 <int>, mday7 <int>
train_tbl <- beer_sales_clean %>% filter(year < 2016)
valid_tbl <- beer_sales_clean %>% filter(year == 2016)
test_tbl <- beer_sales_clean %>% filter(year == 2017)
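# Quick sanity check on the split (a small sketch, not in the original text):
# with the monthly series above this should give 72 training rows (2010-2015),
# 12 validation rows (2016) and 12 test rows (2017); the single January 2018
# observation falls outside all three sets.
sapply(list(train = train_tbl, valid = valid_tbl, test = test_tbl), nrow)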
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 35 minutes 4 seconds
## H2O cluster timezone: Europe/Paris
## H2O data parsing timezone: UTC
## H2O cluster version: 3.42.0.2
## H2O cluster version age: 5 months and 8 days
## H2O cluster name: H2O_started_from_R_Usuario_sph802
## H2O cluster total nodes: 1
## H2O cluster total memory: 6.73 GB
## H2O cluster total cores: 20
## H2O cluster allowed cores: 20
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (5 months and 8 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
train_h2o <- as.h2o(train_tbl)
valid_h2o <- as.h2o(valid_tbl)
test_h2o <- as.h2o(test_tbl)
y <- 'price'
x <- names(train_tbl) %>% setdiff(y)
MODEL
# h2o.automl() trains and cross-validates several model families (GLMs, tree
# ensembles, deep learning and stacked ensembles) and ranks them on the
# leaderboard frame; any of the resulting models could be used here.
set.seed(123)
automl_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 60,
stopping_metric = "deviance")
## 18:26:21.567: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 18:26:21.567: AutoML: XGBoost is not available; skipping it.
## 18:26:21.570: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:21.592: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:21.592: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 72.0.
## 18:26:21.593: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:21.738: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:21.889: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.38: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.175: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.295: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.424: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.581: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.735: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.780: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:26:22.909: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:13.266: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:13.395: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:20.105: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:20.602: _train param, Dropping bad and constant columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:20.932: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
## 18:27:21.169: _train param, Dropping unused columns: [symbol, hour, am.pm, mday7, mday, day, hour12, minute, second]
# Extract leader model
automl_leader <- automl_models_h2o@leader
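# Before trusting the leader blindly it can help to look at the full AutoML
# leaderboard (a hedged sketch; the leaderboard slot is part of the returned
# H2OAutoML object, but the exact columns shown depend on the h2o version):
head(automl_models_h2o@leaderboard, 10)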
# Generate prediction using h2o.predict()
pred_h2o <- h2o.predict(automl_leader, newdata = test_h2o)
#Investigate error
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ✔ readr 2.1.4 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::day() masks h2o::day()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks xts::first()
## ✖ lubridate::hour() masks h2o::hour()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks xts::last()
## ✖ lubridate::month() masks h2o::month()
## ✖ lubridate::week() masks h2o::week()
## ✖ lubridate::year() masks h2o::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
error_tbl <- beer_sales_tbl %>%
filter(lubridate::year(date) == 2017) %>%
add_column(pred = pred_h2o %>% as.vector()) %>%
rename(actual = price) %>%
mutate(
error = actual - pred,
error_pct = error / actual
) %>%
select(date, actual, pred, error, error_pct)
error_tbl
## # A tibble: 12 × 5
## date actual pred error error_pct
## <date> <int> <dbl> <dbl> <dbl>
## 1 2017-01-01 8870 9335. -465. -0.0524
## 2 2017-02-01 10251 10326. -75.0 -0.00732
## 3 2017-03-01 12241 11204. 1037. 0.0847
## 4 2017-04-01 11266 11300. -34.4 -0.00306
## 5 2017-05-01 13275 13439. -164. -0.0123
## 6 2017-06-01 14428 13743. 685. 0.0475
## 7 2017-07-01 11165 11541. -376. -0.0337
## 8 2017-08-01 13098 12865. 233. 0.0178
## 9 2017-09-01 11619 11872. -253. -0.0217
## 10 2017-10-01 12386 12864. -478. -0.0386
## 11 2017-11-01 12904 12796. 108. 0.00840
## 12 2017-12-01 13859 14368. -509. -0.0368
#Error function
f_error <- function(data){
data %>% summarise(
n=length(error),
mean = mean(error),
var = sum((error-mean)^2)/(n-1),
std = sqrt(var),
mae = mean(abs(error)),
rmse = mean(error^2)^0.5,
mape = mean(abs(error_pct)),
mpe = mean(error_pct),
skew = sum(((error - mean)/std)^3)/n,
kurtosis = sum(((error - mean)/std)^4)/n - 3
)
}
error_tbl %>% f_error()
## # A tibble: 1 × 10
## n mean var std mae rmse mape mpe skew kurtosis
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 -24.3 231481. 481. 368. 461. 0.0304 -0.00396 0.912 -0.390
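# As a cross-check on the hand-rolled f_error() summary, H2O can compute its
# own hold-out metrics for the leader (a sketch; h2o.performance() with a
# newdata frame is standard usage, and the metrics printed depend on the
# leader's model type):
h2o.performance(automl_leader, newdata = test_h2o)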
# Create spooky dark theme:
theme_spooky = function(base_size = 10, base_family = "Chiller") {
theme_grey(base_size = base_size, base_family = base_family) %+replace%
theme(
# Specify axis options
axis.line = element_blank(),
axis.text.x = element_text(size = base_size*0.8, color = "white", lineheight = 0.9),
axis.text.y = element_text(size = base_size*0.8, color = "white", lineheight = 0.9),
axis.ticks = element_line(color = "white", size = 0.2),
axis.title.x = element_text(size = base_size, color = "white", margin = margin(0, 10, 0, 0)),
axis.title.y = element_text(size = base_size, color = "white", angle = 90, margin = margin(0, 10, 0, 0)),
axis.ticks.length = unit(0.3, "lines"),
# Specify legend options
legend.background = element_rect(color = NA, fill = "gray10"),
legend.key = element_rect(color = "white", fill = "gray10"),
legend.key.size = unit(1.2, "lines"),
legend.key.height = NULL,
legend.key.width = NULL,
legend.text = element_text(size = base_size*0.8, color = "white"),
legend.title = element_text(size = base_size*0.8, face = "bold", hjust = 0, color = "white"),
legend.position = "none",
legend.text.align = NULL,
legend.title.align = NULL,
legend.direction = "vertical",
legend.box = NULL,
# Specify panel options
panel.background = element_rect(fill = "gray10", color = NA),
#panel.border = element_rect(fill = NA, color = "white"),
panel.border = element_blank(),
panel.grid.major = element_line(color = "grey35"),
panel.grid.minor = element_line(color = "grey20"),
panel.spacing = unit(0.5, "lines"),
# Specify facetting options
strip.background = element_rect(fill = "grey30", color = "grey10"),
strip.text.x = element_text(size = base_size*0.8, color = "white"),
strip.text.y = element_text(size = base_size*0.8, color = "white",angle = -90),
# Specify plot options
plot.background = element_rect(color = "gray10", fill = "gray10"),
plot.title = element_text(size = base_size*1.2, color = "white",hjust=0,lineheight=1.25,
margin=margin(2,2,2,2)),
plot.subtitle = element_text(size = base_size*1, color = "white",hjust=0, margin=margin(2,2,2,2)),
plot.caption = element_text(size = base_size*0.8, color = "white",hjust=0),
plot.margin = unit(rep(1, 4), "lines")
)
}
#Load chiller fontset to use in theme_spooky()
library(extrafont)
## Registering fonts with R
loadfonts(device="win")
beer_sales_tbl %>%
ggplot(aes(x = date, y = price)) +
# Data - Spooky Orange
geom_point(size = 2, color = "gray", alpha = 0.5, shape = 21, fill = "orange") +
geom_line(color = "orange", size = 0.5) +
geom_ma(n = 12, color = "white") +
# Predictions - Spooky Purple
geom_point(aes(y = pred), size = 2, color = "gray", alpha = 1, shape = 21, fill = "purple", data = error_tbl) +
geom_line(aes(y = pred), color = "purple", size = 0.5, data = error_tbl) +
# Aesthetics
theme_spooky(base_size = 20) +
labs(
title = "Beer Sales Forecast: h2o + timetk",
subtitle = "H2O had highest accuracy, MAPE = 4.5%",
caption="Practice with H2o timeseries forecast"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
ARIMA FORECAST
#Load packages
library(sweep) #Broom-style tidiers for the forecast package
library(forecast) #Forecasting models such as ARIMA, ETS, etc. and prediction methods
library(tidyquant) #Loads the tidyverse and financial data tools
library(timetk) #Functions for working with time series data
#Get data again
beer_sales_tbl <- tq_get("S4248SM144NCEN",
get = "economic.data",
from = "2010-01-01",
to = "2018-01-01")
#Convert data from tbl into ts objects (filter the rows first so the ts dates line up)
train_ts <- timetk::tk_ts(beer_sales_tbl %>% filter(date <= "2017-01-01"), start = 2010, freq = 12)
## Warning: Non-numeric columns being dropped: symbol, date
test_ts <- timetk::tk_ts(beer_sales_tbl %>% filter(date > "2017-01-01"), start = c(2017, 2), freq = 12)
## Warning: Non-numeric columns being dropped: symbol, date
#Build model ARIMA
fit_arima <- auto.arima(train_ts)
summary(fit_arima)
## Series: train_ts
## ARIMA(3,0,0)(0,1,1)[12] with drift
##
## Coefficients:
## ar1 ar2 ar3 sma1 drift
## -0.2663 0.0933 0.6077 -0.6257 33.5604
## s.e. 0.0948 0.1008 0.0966 0.4080 3.3790
##
## sigma^2 = 153862: log likelihood = -540.64
## AIC=1093.28 AICc=1094.56 BIC=1107.03
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 8.227108 350.8415 248.643 -0.04664823 2.330124 0.4558132
## ACF1
## Training set -0.003628103
# Check for actual, fitted and resid value
sw_augment(fit_arima, timetk_idx = TRUE)
## Warning in .check_tzones(e1, e2): 'tzone' attributes are inconsistent
## # A tibble: 85 × 4
## index .actual .fitted .resid
## <date> <dbl> <dbl> <dbl>
## 1 2010-01-01 6558 6551. 6.52
## 2 2010-02-01 7481 7474. 7.41
## 3 2010-03-01 9475 9466. 9.37
## 4 2010-04-01 9424 9415. 9.29
## 5 2010-05-01 9351 9342. 9.18
## 6 2010-06-01 10552 10542. 10.4
## 7 2010-07-01 9077 9068. 8.84
## 8 2010-08-01 9273 9264. 9.00
## 9 2010-09-01 9420 9411. 9.12
## 10 2010-10-01 9413 9404. 9.08
## # ℹ 75 more rows
#Visualize residual
sw_augment(fit_arima, timetk_idx = TRUE) %>%
ggplot(aes(x = index, y = .resid)) +
geom_point(colour = 'blue', size = 2, stroke = 1, shape = 21) +
geom_line() +
geom_hline(yintercept = 0, color = "red") +
labs(title = "Residual diagnostic") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
theme_classic()
## Warning in .check_tzones(e1, e2): 'tzone' attributes are inconsistent
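# A formal complement to the visual residual check (a sketch, not from the
# original text): a Ljung-Box test on the ARIMA residuals, with fitdf = 5 to
# account for the five estimated coefficients (ar1-ar3, sma1, drift).
Box.test(residuals(fit_arima), lag = 24, type = "Ljung-Box", fitdf = 5)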
# Forecast in next 12 months
fcast_arima <- forecast(fit_arima, h=12)
#Get forecast value by using sw_sweep()
fcast_tbl <- sw_sweep(fcast_arima, timetk_idx = TRUE)
## Warning in .check_tzones(e1, e2): 'tzone' attributes are inconsistent
fcast_tbl
## # A tibble: 97 × 7
## index key price lo.80 lo.95 hi.80 hi.95
## <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010-01-01 actual 6558 NA NA NA NA
## 2 2010-02-01 actual 7481 NA NA NA NA
## 3 2010-03-01 actual 9475 NA NA NA NA
## 4 2010-04-01 actual 9424 NA NA NA NA
## 5 2010-05-01 actual 9351 NA NA NA NA
## 6 2010-06-01 actual 10552 NA NA NA NA
## 7 2010-07-01 actual 9077 NA NA NA NA
## 8 2010-08-01 actual 9273 NA NA NA NA
## 9 2010-09-01 actual 9420 NA NA NA NA
## 10 2010-10-01 actual 9413 NA NA NA NA
## # ℹ 87 more rows
#Get actual data in test time
actual_tbl <- beer_sales_tbl %>% filter(date >= '2017-01-01')
#Visualize with actual value
fcast_tbl %>%
ggplot(aes(x = index, y = price, color = key)) +
geom_line() +
geom_point() +
#Ribbon 95%
geom_ribbon(aes(ymin = lo.95, ymax = hi.95),
fill = '#596DD5', alpha = 0.8, size = 0) +
#Actual data
geom_line(data = actual_tbl, aes(x = date, y = price), color = 'black') +
geom_point(data = actual_tbl, aes(x = date, y = price), color = 'red') +
labs(title = "Beer Sales Forecast: ARIMA", x = "", y = "Thousands of Tons",
subtitle = "sw_sweep tidies the auto.arima() forecast output") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
scale_color_tq() +
scale_fill_tq()
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
#Calculate Error
error_arima <- fcast_tbl %>% filter(key == 'forecast') %>%
left_join(actual_tbl, by = c("index" = "date")) %>%
mutate(date = index, actual = price.y, pred = price.x) %>%
select(date, actual, pred) %>%
mutate(error = actual - pred,
error_pct = error/actual)
error_arima
## # A tibble: 12 × 5
## date actual pred error error_pct
## <date> <int> <dbl> <dbl> <dbl>
## 1 2017-02-01 10251 10964. -713. -0.0696
## 2 2017-03-01 12241 11665. 576. 0.0471
## 3 2017-04-01 11266 11660. -394. -0.0350
## 4 2017-05-01 13275 13031. 244. 0.0184
## 5 2017-06-01 14428 13279. 1149. 0.0796
## 6 2017-07-01 11165 11873. -708. -0.0634
## 7 2017-08-01 13098 12751. 347. 0.0265
## 8 2017-09-01 11619 12130. -511. -0.0440
## 9 2017-10-01 12386 12570. -184. -0.0148
## 10 2017-11-01 12904 12702. 202. 0.0156
## 11 2017-12-01 13859 14363. -504. -0.0364
## 12 2018-01-01 9248 9618. -370. -0.0400
f_error(error_arima)
## # A tibble: 1 × 10
## n mean var std mae rmse mape mpe skew kurtosis
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 -72.2 332822. 577. 492. 557. 0.0409 -0.00966 0.639 -0.843
LINEAR REGRESSION
library(timetk)
library(tidyquant)
#Add the timeseries signature features to the dataframe
#Get data again
beer_sales_tbl <- tq_get("S4248SM144NCEN",
get = "economic.data",
from = "2010-01-01",
to = "2018-01-01")
beer_sales_tbl <- beer_sales_tbl %>%
tk_augment_timeseries_signature()
## tk_augment_timeseries_signature(): Using the following .date_var variable: date
beer_sales_tbl
## # A tibble: 97 × 31
## symbol date price index.num diff year year.iso half quarter month
## <chr> <date> <int> <dbl> <dbl> <int> <int> <int> <int> <int>
## 1 S4248S… 2010-01-01 6558 1.26e9 NA 2010 2009 1 1 1
## 2 S4248S… 2010-02-01 7481 1.26e9 2678400 2010 2010 1 1 2
## 3 S4248S… 2010-03-01 9475 1.27e9 2419200 2010 2010 1 1 3
## 4 S4248S… 2010-04-01 9424 1.27e9 2678400 2010 2010 1 2 4
## 5 S4248S… 2010-05-01 9351 1.27e9 2592000 2010 2010 1 2 5
## 6 S4248S… 2010-06-01 10552 1.28e9 2678400 2010 2010 1 2 6
## 7 S4248S… 2010-07-01 9077 1.28e9 2592000 2010 2010 2 3 7
## 8 S4248S… 2010-08-01 9273 1.28e9 2678400 2010 2010 2 3 8
## 9 S4248S… 2010-09-01 9420 1.28e9 2678400 2010 2010 2 3 9
## 10 S4248S… 2010-10-01 9413 1.29e9 2592000 2010 2010 2 4 10
## # ℹ 87 more rows
## # ℹ 21 more variables: month.xts <int>, month.lbl <ord>, day <int>, hour <int>,
## # minute <int>, second <int>, hour12 <int>, am.pm <int>, wday <int>,
## # wday.xts <int>, wday.lbl <ord>, mday <int>, qday <int>, yday <int>,
## # mweek <int>, week <int>, week.iso <int>, week2 <int>, week3 <int>,
## # week4 <int>, mday7 <int>
train_lm <- beer_sales_tbl %>% filter(date <= '2017-01-01')
test_lm <- beer_sales_tbl %>% filter(date <= '2018-01-01' & date > '2017-01-01')
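# A minimal sketch of the lm() step this split prepares (assumed, not part of
# the original text): a simple specification with a linear trend (index.num)
# and monthly seasonality (month.lbl), scored on the 2017 hold-out with the
# f_error() helper defined earlier. Richer feature sets are of course possible.
fit_lm <- lm(price ~ index.num + month.lbl, data = train_lm)
error_lm <- test_lm %>%
  transmute(date,
            actual = price,
            pred = predict(fit_lm, newdata = test_lm),
            error = actual - pred,
            error_pct = error / actual)
f_error(error_lm)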
CONCLUSION
It is not true that the accuracy of time series machine learning is always superior to ARIMA, GARCH or other forecasting techniques; it depends on the structure and characteristics of the data. When the series genuinely follows a form that a classical model captures well, that model can match or beat the machine learning approach. In this particular run the H2O AutoML leader reached a test MAPE of about 3.0% against about 4.1% for auto.arima, and a linear regression on the same timeseries-signature features (see the sketch above) can be scored the same way; that ranking is specific to this series and split and could easily reverse on other data. The only reliable way to find the best model for a given dataset is to train and compare several of them. Accuracy also depends not only on the algorithm but on how well the predictors we choose actually relate to the target. Finally, never place absolute confidence in any single algorithm: back-test your models regularly, because time series are constrained by changing data structure, and an algorithm that is strong in one period may be weak in another. Be ready to switch algorithms when the current one stops being accurate.