## 1. Import packages

library(M4comp2018)
library(astsa)
library(tidyverse)
library(forecast)
library(ggfortify)
library(fpp2)
library(rnn)
library(glue)
library(forcats)
library(timetk)
library(tidyquant)
library(tibbletime)
library(lubridate)   # for as_date(); not attached by older tidyverse versions
library(cowplot)
library(recipes)
library(rsample)
library(yardstick)
library(keras)

## 2. Extract the data: yearly, quarterly, monthly, weekly, daily, hourly

M4 is a list of series objects; filter it by period:

data(M4)
yearly_M4 = Filter(function(l) l$period == "Yearly", M4)
quarterly_M4 = Filter(function(l) l$period == "Quarterly", M4)
monthly_M4 = Filter(function(l) l$period == "Monthly", M4)
weekly_M4 = Filter(function(l) l$period == "Weekly", M4)
hourly_M4 = Filter(function(l) l$period == "Hourly", M4)
daily_M4 = Filter(function(l) l$period == "Daily", M4)
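Each element of M4 is a list carrying the series and its metadata (per the M4comp2018 documentation: st, n, h, period, type, x, xx). A quick look at the first monthly series:

str(monthly_M4[[1]], max.level = 1)   # fields of one series object
length(monthly_M4)                    # number of monthly series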

## 3. Extract one sample to fit statistical and deep learning models

Extract the first monthly series: full, training, and test sets

monthly_1_full = ts(c(monthly_M4[[1]]$x, monthly_M4[[1]]$xx),
               start=start(monthly_M4[[1]]$x), 
               frequency = frequency(monthly_M4[[1]]$x))
monthly_1_train = ts(monthly_M4[[1]]$x, 
                  start=start(monthly_M4[[1]]$x), 
                  frequency = frequency(monthly_M4[[1]]$x))
monthly_1_test = ts(monthly_M4[[1]]$xx, 
                  start=start(monthly_M4[[1]]$xx), 
                  frequency = frequency(monthly_M4[[1]]$xx))

Plot the monthly sample: the full series with the test set overlaid

plot(monthly_1_full,
     type = 'l', col = 'black', ylab = '', xlab = '')
lines(monthly_1_test, col = 'red', type = 'o')

Produce a polar coordinate season plot

ggseasonplot(monthly_1_train, polar = TRUE)

Create a subseries plot comprising a mini time plot for each season

ggsubseriesplot(monthly_1_train)

### Plot the monthly sample with the trend removed (first difference)

autoplot(diff(monthly_1_train))

## 4. Explore time series patterns: Trend, seasonal, or cyclic

acf2(diff(monthly_1_train))

##         ACF  PACF
##  [1,]  0.07  0.07
##  [2,] -0.12 -0.12
##  [3,] -0.35 -0.34
##  [4,] -0.25 -0.26
##  [5,]  0.19  0.14
##  [6,]  0.04 -0.16
##  [7,]  0.13  0.00
##  [8,] -0.25 -0.28
##  [9,] -0.32 -0.35
## [10,] -0.09 -0.29
## [11,]  0.20 -0.06
## [12,]  0.62  0.37
## [13,]  0.19  0.26
## [14,] -0.13  0.11
## [15,] -0.31  0.08
## [16,] -0.20  0.03
## [17,]  0.14  0.01
## [18,]  0.07 -0.07
## [19,]  0.11  0.08
## [20,] -0.25 -0.02
## [21,] -0.31 -0.07
## [22,] -0.07 -0.06
## [23,]  0.19 -0.09
## [24,]  0.59  0.16
## [25,]  0.16  0.06
## [26,] -0.12  0.01
## [27,] -0.28  0.06
## [28,] -0.20  0.00
## [29,]  0.13 -0.02
## [30,]  0.04 -0.13
## [31,]  0.11  0.01
## [32,] -0.24 -0.04
## [33,] -0.26  0.04
## [34,] -0.11 -0.10
## [35,]  0.19 -0.07
## [36,]  0.57  0.13
## [37,]  0.15  0.04
## [38,] -0.12 -0.05
## [39,] -0.30 -0.02
## [40,] -0.20 -0.10
## [41,]  0.14 -0.02
## [42,]  0.04 -0.09
## [43,]  0.12  0.03
## [44,] -0.25 -0.07
## [45,] -0.25  0.03
## [46,] -0.09 -0.06
## [47,]  0.16 -0.08
## [48,]  0.54  0.01
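The spikes at lags 12, 24, 36, and 48 suggest remaining seasonality; forecast's ndiffs() and nsdiffs() give a quick numerical cross-check of how many regular and seasonal differences are needed:

ndiffs(monthly_1_train)    # suggested number of first differences
nsdiffs(monthly_1_train)   # suggested number of seasonal differences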

The first-differenced data therefore still show a seasonal pattern.

## 5. Ljung-Box test: Testing for autocorrelation

Box.test(monthly_1_train, fitdf = 0, lag = 24, type = 'Lj')
## 
##  Box-Ljung test
## 
## data:  monthly_1_train
## X-squared = 4388.7, df = 24, p-value < 2.2e-16
Box.test(diff(monthly_1_train), fitdf = 0, lag = 24, type = 'Lj')
## 
##  Box-Ljung test
## 
## data:  diff(monthly_1_train)
## X-squared = 786.08, df = 24, p-value < 2.2e-16

The Ljung-Box test (p-value far below 0.05) indicates significant autocorrelation in both the original and the differenced series.

## 6. Implement statistical models to predict

### 6.1. Seasonal naive model

Model summary and plot

snaive_f = snaive(diff(monthly_1_train), h = 18)
autoplot(snaive_f) 

summary(snaive_f)
## 
## Forecast method: Seasonal naive method
## 
## Model Information:
## Call: snaive(y = diff(monthly_1_train), h = 18) 
## 
## Residual sd: 727.3591 
## 
## Error measures:
##                      ME     RMSE      MAE MPE MAPE MASE       ACF1
## Training set -0.9429825 726.5617 565.8114 NaN  Inf    1 -0.3595435
## 
## Forecasts:
##          Point Forecast      Lo 80      Hi 80       Lo 95      Hi 95
## Jul 2015           1920   988.8737 2851.12627   495.96526 3344.03474
## Aug 2015          -1390 -2321.1263 -458.87373 -2814.03474   34.03474
## Sep 2015          -1930 -2861.1263 -998.87373 -3354.03474 -505.96526
## Oct 2015           -410 -1341.1263  521.12627 -1834.03474 1014.03474
## Nov 2015            580  -351.1263 1511.12627  -844.03474 2004.03474
## Dec 2015           -480 -1411.1263  451.12627 -1904.03474  944.03474
## Jan 2016            300  -631.1263 1231.12627 -1124.03474 1724.03474
## Feb 2016           -910 -1841.1263   21.12627 -2334.03474  514.03474
## Mar 2016            -40  -971.1263  891.12627 -1464.03474 1384.03474
## Apr 2016            120  -811.1263 1051.12627 -1304.03474 1544.03474
## May 2016           -300 -1231.1263  631.12627 -1724.03474 1124.03474
## Jun 2016           1980  1048.8737 2911.12627   555.96526 3404.03474
## Jul 2016           1920   603.1886 3236.81140   -93.88924 3933.88924
## Aug 2016          -1390 -2706.8114  -73.18860 -3403.88924  623.88924
## Sep 2016          -1930 -3246.8114 -613.18860 -3943.88924   83.88924
## Oct 2016           -410 -1726.8114  906.81140 -2423.88924 1603.88924
## Nov 2016            580  -736.8114 1896.81140 -1433.88924 2593.88924
## Dec 2016           -480 -1796.8114  836.81140 -2493.88924 1533.88924

Fitted values and residuals

autoplot(snaive_f, series = "Train")

Check whether the residuals are white noise

checkresiduals(snaive_f)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 225.16, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24

Accuracy of the seasonal naive model

forecast::accuracy(snaive_f, diff(monthly_1_test))
##                       ME     RMSE      MAE      MPE     MAPE     MASE
## Training set  -0.9429825 726.5617 565.8114      NaN      Inf 1.000000
## Test set     -30.5882353 815.6845 632.9412 80.09394 97.85567 1.118643
##                    ACF1 Theil's U
## Training set -0.3595435        NA
## Test set     -0.1157613 0.5601452

### 6.2. Seasonal ARIMA model

Model summary and plot

sarima_f = auto.arima(monthly_1_train)
summary(sarima_f)
## Series: monthly_1_train 
## ARIMA(1,0,5)(0,1,1)[12] 
## 
## Coefficients:
##          ar1      ma1      ma2      ma3     ma4     ma5     sma1
##       0.9791  -0.3524  -0.0601  -0.1436  0.0074  0.1266  -0.7895
## s.e.  0.0110   0.0489   0.0506   0.0520  0.0520  0.0436   0.0338
## 
## sigma^2 estimated as 268035:  log likelihood=-3507.04
## AIC=7030.07   AICc=7030.39   BIC=7063.07
## 
## Training set error measures:
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -3.835782 507.1257 389.5809 -0.5104896 6.365607 0.5128426
##                       ACF1
## Training set -0.0007186712
sarima_f %>% 
  forecast(h = 18) %>%
  autoplot()

Check whether the residuals are white noise

checkresiduals(sarima_f)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,5)(0,1,1)[12]
## Q* = 30.259, df = 17, p-value = 0.02454
## 
## Model df: 7.   Total lags used: 24

Accuracy of the seasonal ARIMA model

forecast::accuracy(forecast(sarima_f, h = 18), monthly_1_test, d = 0, D = 1)
##                       ME     RMSE      MAE        MPE      MAPE      MASE
## Training set   -3.835782 507.1257 389.5809 -0.5104896  6.365607 0.5128426
## Test set     -111.744545 774.8113 687.0752 -4.4136328 12.765260 0.9044630
##                       ACF1 Theil's U
## Training set -0.0007186712        NA
## Test set      0.4626465094 0.6106427

## 7. Implement RNN

### 7.1. Convert the time series to matrices

Y_train = matrix(monthly_1_train)
Y_test = matrix(monthly_1_test)

### 7.2. Scale to the interval [0, 1], then transpose

Y_train_scaled = (Y_train - min(Y_train)) / (max(Y_train) - min(Y_train))
Y_train = t(Y_train_scaled)
Y_test_scaled = (Y_test - min(Y_test)) / (max(Y_test) - min(Y_test))
Y_test = t(Y_test_scaled)

### 7.3. Train the RNN

model = trainr(Y = Y_train,
               X = Y_train,
               learningrate = 0.05,
               hidden_dim = 16,
               numepochs = 1000)

Summarize the RNN model

summary(model)
##                          Length Class  Mode     
## input_dim                   1   -none- numeric  
## hidden_dim                  1   -none- numeric  
## output_dim                  1   -none- numeric  
## synapse_dim                 3   -none- numeric  
## time_dim                    1   -none- numeric  
## sigmoid                     1   -none- character
## network_type                1   -none- character
## numepochs                   1   -none- numeric  
## batch_size                  1   -none- numeric  
## learningrate                1   -none- numeric  
## learningrate_decay          1   -none- numeric  
## momentum                    1   -none- numeric  
## update_rule                 1   -none- character
## use_bias                    1   -none- logical  
## seq_to_seq_unsync           1   -none- logical  
## epoch_function              2   -none- list     
## loss_function               1   -none- function 
## last_layer_error          469   -none- numeric  
## last_layer_delta          469   -none- numeric  
## store                       2   -none- list     
## time_synapse                2   -none- list     
## recurrent_synapse           1   -none- list     
## bias_synapse                2   -none- list     
## time_synapse_update         2   -none- list     
## bias_synapse_update         2   -none- list     
## recurrent_synapse_update    1   -none- list     
## error                    1000   -none- numeric  
## current_epoch               1   -none- numeric  
## store_best                  2   -none- list

### Plot errors

plot(colMeans(model$error),type='l',xlab='epoch',ylab='errors')

### Predict test set

y_pred = predictr(model, Y_test)

Plot the predicted vs. actual values for the test set

plot(as.vector(t(Y_test)), col = 'red', type = 'l',
     main = "Actual vs predicted: test set",
     ylab = "Y, Yp")
lines(as.vector(t(y_pred)), type = 'l', col = 'black')
legend("bottomright", c("Actual", "Predicted"),
       col = c("red", "black"),
       lty = c(1, 1), lwd = c(1, 1))

### Accuracy analysis

forecast::accuracy(ts(Y_test), ts(y_pred))
##                  ME      RMSE        MAE      MPE     MAPE      ACF1
## Test set -0.0027717 0.0419865 0.02592881 7.815092 13.16181 0.0761262
##          Theil's U
## Test set 0.1582634
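The accuracy above is computed on the min-max scaled series. A minimal sketch for reporting the errors in the original units, reusing the test-set minimum and maximum from step 7.2:

rng <- range(monthly_1_test)                                      # min and max used for scaling
y_pred_orig <- as.vector(t(y_pred)) * (rng[2] - rng[1]) + rng[1]  # back to original units
forecast::accuracy(ts(y_pred_orig), ts(as.vector(monthly_1_test)))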

## 8. Implement LSTM

### 8.1. Convert the ts data to a tibble

monthly = tk_tbl(monthly_1_full) %>%
  mutate(index = as_date(index)) %>%
  as_tbl_time(index = index)

head(monthly)
## # A time tibble: 6 x 2
## # Index: index
##   index      value
##   <date>     <dbl>
## 1 1976-06-01  8000
## 2 1976-07-01  8350
## 3 1976-08-01  8570
## 4 1976-09-01  7700
## 5 1976-10-01  7080
## 6 1976-11-01  6520

### 8.2. Visualize the monthly data

monthly %>%
    ggplot(aes(index, value)) +
    geom_point(color = palette_light()[[1]], alpha = 0.5) +
    geom_smooth(method = "loess", span = 0.2, se = FALSE) +
    theme_tq() +
    labs(title = "Monthly Data Set")

### 8.3. Evaluating the ACF: lag h = 12

tidy_acf <- function(data, value, lags = 0:20) {
    
    # Capture the column so the helper works for any column name, not only `value`
    value_expr <- enquo(value)
    
    acf_values <- data %>%
        pull(!!value_expr) %>%
        acf(lag.max = tail(lags, 1), plot = FALSE) %>%
        .$acf %>%
        .[,,1]
    
    ret <- tibble(acf = acf_values) %>%
        rowid_to_column(var = "lag") %>%
        mutate(lag = lag - 1) %>%
        filter(lag %in% lags)
    
    return(ret)
}

monthly %>%
    tidy_acf(value, lags = 0:24)
## # A tibble: 25 x 2
##      lag   acf
##    <dbl> <dbl>
##  1     0 1.000
##  2     1 0.874
##  3     2 0.725
##  4     3 0.616
##  5     4 0.594
##  6     5 0.630
##  7     6 0.626
##  8     7 0.613
##  9     8 0.562
## 10     9 0.566
## # ... with 15 more rows

Plotting

monthly %>%
    tidy_acf(value, lags = 0:24) %>%
    ggplot(aes(lag, acf)) +
    geom_segment(aes(xend = lag, yend = 0), color = palette_light()[[1]]) +
    geom_vline(xintercept = 12, size = 3, color = palette_light()[[2]]) +
    annotate("text", label = "12 Month Mark", x = 12, y = 0.827, 
             color = palette_light()[[2]], size = 6, hjust = 0) +
    theme_tq() +
    labs(title = "ACF: Monthly")

The autocorrelation remains strong at the 12-month lag, so a 12-month lag is a sensible input for the LSTM model.
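The lag-12 value behind that choice can be pulled directly from the tidy_acf() helper defined above:

monthly %>%
    tidy_acf(value, lags = 0:24) %>%
    filter(lag == 12)          # ACF at the 12-month (seasonal) lag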

### 8.4. Backtesting Strategy

periods_train <- 12 * 10
periods_test  <- 12 * 5
skip_span     <- 12 * 6

rolling_origin_resamples <- rolling_origin(
    monthly,
    initial    = periods_train,
    assess     = periods_test,
    cumulative = FALSE,
    skip       = skip_span
)

rolling_origin_resamples
## # Rolling origin forecast resampling 
## # A tibble: 5 x 2
##   splits           id    
##   <list>           <chr> 
## 1 <split [120/60]> Slice1
## 2 <split [120/60]> Slice2
## 3 <split [120/60]> Slice3
## 4 <split [120/60]> Slice4
## 5 <split [120/60]> Slice5
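Before fitting, it is worth eyeballing one slice to confirm the training and testing windows line up as intended; a minimal sketch for the first slice (plot_grid() from cowplot could combine all five, but one is enough here):

split_1 <- rolling_origin_resamples$splits[[1]]

bind_rows(
    training(split_1) %>% add_column(key = "training"),
    testing(split_1)  %>% add_column(key = "testing")
) %>%
    ggplot(aes(index, value, color = key)) +
    geom_line() +
    theme_tq() +
    scale_color_tq() +
    labs(title = "Backtesting slice 1: training vs testing windows")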

### 8.5. LSTM Model: Keras

Data Setup

split    <- rolling_origin_resamples$splits[[1]]
split_id <- rolling_origin_resamples$id[[1]]
df_trn <- training(split)
df_tst <- testing(split)

df <- bind_rows(
    df_trn %>% add_column(key = "training"),
    df_tst %>% add_column(key = "testing")
) %>% 
    as_tbl_time(index = index)

head(df)
## # A time tibble: 6 x 3
## # Index: index
##   index      value key     
##   <date>     <dbl> <chr>   
## 1 1976-06-01  8000 training
## 2 1976-07-01  8350 training
## 3 1976-08-01  8570 training
## 4 1976-09-01  7700 training
## 5 1976-10-01  7080 training
## 6 1976-11-01  6520 training

Preprocessing with recipes: square-root transform, center, and scale the data

rec_obj <- recipe(value ~ ., df) %>%
    step_sqrt(value) %>%
    step_center(value) %>%
    step_scale(value) %>%
    prep()

df_processed_tbl <- bake(rec_obj, df)

df_processed_tbl
## # A tibble: 180 x 3
##    index        value key     
##    <date>       <dbl> <fct>   
##  1 1976-06-01  1.10   training
##  2 1976-07-01  1.33   training
##  3 1976-08-01  1.48   training
##  4 1976-09-01  0.888  training
##  5 1976-10-01  0.446  training
##  6 1976-11-01  0.0292 training
##  7 1976-12-01 -0.319  training
##  8 1977-01-01  0.127  training
##  9 1977-02-01  0.262  training
## 10 1977-03-01 -0.606  training
## # ... with 170 more rows
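To report LSTM forecasts on the original scale later, keep the centering and scaling values learned by the recipe. This sketch assumes the prepped step objects expose $means and $sds (the recipes internals used here) and that the steps are in the order defined above:

center_history <- rec_obj$steps[[2]]$means["value"]   # from step_center
scale_history  <- rec_obj$steps[[3]]$sds["value"]     # from step_scale
c(center = center_history, scale = scale_history)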

LSTM Model input

lag_setting  <- 12
batch_size   <- 10
train_length <- 120
tsteps       <- 1
epochs       <- 300
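These values are not arbitrary: a stateful Keras LSTM requires the number of training and testing samples to be exact multiples of the batch size, which 120 and 60 both satisfy with a batch size of 10:

train_length %% batch_size == 0   # 120 training samples
periods_test %% batch_size == 0   # 60 testing samples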

2D And 3D Train/Test Arrays

# Training Set
lag_train_tbl <- df_processed_tbl %>%
    mutate(value_lag = lag(value, n = lag_setting)) %>%
    filter(!is.na(value_lag)) %>%
    filter(key == "training") %>%
    tail(train_length)

x_train_vec <- lag_train_tbl$value_lag
x_train_arr <- array(data = x_train_vec, dim = c(length(x_train_vec), 1, 1))

y_train_vec <- lag_train_tbl$value
y_train_arr <- array(data = y_train_vec, dim = c(length(y_train_vec), 1))

# Testing Set
lag_test_tbl <- df_processed_tbl %>%
    mutate(
        value_lag = lag(value, n = lag_setting)
    ) %>%
    filter(!is.na(value_lag)) %>%
    filter(key == "testing")

x_test_vec <- lag_test_tbl$value_lag
x_test_arr <- array(data = x_test_vec, dim = c(length(x_test_vec), 1, 1))

y_test_vec <- lag_test_tbl$value
y_test_arr <- array(data = y_test_vec, dim = c(length(y_test_vec), 1))

Define and compile the LSTM model

# NOTE: the chunk below needs a working keras/TensorFlow installation and was
# not evaluated when this document was rendered.

model_lstm <- keras_model_sequential()

model_lstm %>%
    layer_lstm(units            = 50,
               input_shape      = c(tsteps, 1),
               batch_size       = batch_size,
               return_sequences = TRUE,
               stateful         = TRUE) %>%
    layer_lstm(units            = 50,
               return_sequences = FALSE,
               stateful         = TRUE) %>%
    layer_dense(units = 1)

model_lstm %>%
    compile(loss = 'mae', optimizer = 'adam')

model_lstm
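A minimal sketch of how fitting and prediction would proceed once the model compiles, following the usual stateful-LSTM pattern (not evaluated here); it reuses the center_history and scale_history values captured after the recipe step to undo the square-root, centering, and scaling transforms:

for (i in 1:epochs) {
    model_lstm %>% fit(x          = x_train_arr,
                       y          = y_train_arr,
                       batch_size = batch_size,
                       epochs     = 1,
                       verbose    = 1,
                       shuffle    = FALSE)   # keep sample order for stateful training
    model_lstm %>% reset_states()            # reset the LSTM state between epochs
    cat("Epoch:", i, "\n")
}

# Predict on the test arrays and map back to the original units
pred_out <- model_lstm %>%
    predict(x_test_arr, batch_size = batch_size) %>%
    .[, 1]

pred_tbl <- tibble(
    index = lag_test_tbl$index,
    value = (pred_out * scale_history + center_history)^2
)

head(pred_tbl)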