library(M4comp2018)
library(astsa)
library(tidyverse)
library(forecast)
library(ggfortify)
library(fpp2)
library(rnn)
library(glue)
library(forcats)
library(timetk)
library(tidyquant)
library(tibbletime)
library(cowplot)
library(recipes)
library(rsample)
library(yardstick)
library(keras)
List data structure
data(M4)
yearly_M4 = Filter(function(l) l$period == "Yearly", M4)
quarterly_M4 = Filter(function(l) l$period == "Quarterly", M4)
monthly_M4 = Filter(function(l) l$period == "Monthly", M4)
weekly_M4 = Filter(function(l) l$period == "Weekly", M4)
hourly_M4 = Filter(function(l) l$period == "Hourly", M4)
daily_M4 = Filter(function(l) l$period == "Daily", M4)
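Each element of M4 is itself a list; the code below relies on its x (training series), xx (test series), and period fields. A quick way to see the full structure of one element:
# Inspect the first monthly series; x, xx, and period are the fields used below
str(monthly_M4[[1]], max.level = 1)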
monthly_1_full = ts(c(monthly_M4[[1]]$x, monthly_M4[[1]]$xx),
start=start(monthly_M4[[1]]$x),
frequency = frequency(monthly_M4[[1]]$x))
monthly_1_train = ts(monthly_M4[[1]]$x,
start=start(monthly_M4[[1]]$x),
frequency = frequency(monthly_M4[[1]]$x))
monthly_1_test = ts(monthly_M4[[1]]$xx,
start=start(monthly_M4[[1]]$xx),
frequency = frequency(monthly_M4[[1]]$xx))
plot(monthly_1_full, type = 'l', col = 'black', ylab = '', xlab = '')
lines(monthly_1_test, col = 'red', type = 'o')
Produce a polar coordinate season plot
ggseasonplot(monthly_1_train, polar = TRUE)
Create a subseries plot comprising a mini time plot for each season
ggsubseriesplot(monthly_1_train)
### Plot the monthly sample: removing the trend
autoplot(diff(monthly_1_train))
## 4. Explore time series patterns: Trend, seasonal, or cyclic
acf2(diff(monthly_1_train))
## ACF PACF
## [1,] 0.07 0.07
## [2,] -0.12 -0.12
## [3,] -0.35 -0.34
## [4,] -0.25 -0.26
## [5,] 0.19 0.14
## [6,] 0.04 -0.16
## [7,] 0.13 0.00
## [8,] -0.25 -0.28
## [9,] -0.32 -0.35
## [10,] -0.09 -0.29
## [11,] 0.20 -0.06
## [12,] 0.62 0.37
## [13,] 0.19 0.26
## [14,] -0.13 0.11
## [15,] -0.31 0.08
## [16,] -0.20 0.03
## [17,] 0.14 0.01
## [18,] 0.07 -0.07
## [19,] 0.11 0.08
## [20,] -0.25 -0.02
## [21,] -0.31 -0.07
## [22,] -0.07 -0.06
## [23,] 0.19 -0.09
## [24,] 0.59 0.16
## [25,] 0.16 0.06
## [26,] -0.12 0.01
## [27,] -0.28 0.06
## [28,] -0.20 0.00
## [29,] 0.13 -0.02
## [30,] 0.04 -0.13
## [31,] 0.11 0.01
## [32,] -0.24 -0.04
## [33,] -0.26 0.04
## [34,] -0.11 -0.10
## [35,] 0.19 -0.07
## [36,] 0.57 0.13
## [37,] 0.15 0.04
## [38,] -0.12 -0.05
## [39,] -0.30 -0.02
## [40,] -0.20 -0.10
## [41,] 0.14 -0.02
## [42,] 0.04 -0.09
## [43,] 0.12 0.03
## [44,] -0.25 -0.07
## [45,] -0.25 0.03
## [46,] -0.09 -0.06
## [47,] 0.16 -0.08
## [48,] 0.54 0.01
The first-differenced series still shows a clear seasonal pattern, with ACF spikes recurring at lags 12, 24, 36, and 48.
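Since the spikes recur every 12 lags, one additional seasonal difference should remove them. A minimal sketch, assuming a 12-month seasonal period:
# Apply a lag-12 seasonal difference on top of the first difference
# and re-inspect the ACF/PACF
acf2(diff(diff(monthly_1_train), lag = 12))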
## 5. Ljung-Box test: testing for autocorrelation up to lag h
Box.test(monthly_1_train, fitdf = 0, lag = 24, type = 'Lj')
##
## Box-Ljung test
##
## data: monthly_1_train
## X-squared = 4388.7, df = 24, p-value < 2.2e-16
Box.test(diff(monthly_1_train), fitdf = 0, lag = 24, type = 'Lj')
##
## Box-Ljung test
##
## data: diff(monthly_1_train)
## X-squared = 786.08, df = 24, p-value < 2.2e-16
The Ljung-Box test results (p-value << 0.05) show significant autocorrelation in both the original and the first-differenced data.
## 6. Implement statistical models to predict
### 6.1. Seasonal Naive model
Model summary and plot
snaive_f = snaive(diff(monthly_1_train), h = 18)
autoplot(snaive_f)
summary(snaive_f)
##
## Forecast method: Seasonal naive method
##
## Model Information:
## Call: snaive(y = diff(monthly_1_train), h = 18)
##
## Residual sd: 727.3591
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.9429825 726.5617 565.8114 NaN Inf 1 -0.3595435
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jul 2015 1920 988.8737 2851.12627 495.96526 3344.03474
## Aug 2015 -1390 -2321.1263 -458.87373 -2814.03474 34.03474
## Sep 2015 -1930 -2861.1263 -998.87373 -3354.03474 -505.96526
## Oct 2015 -410 -1341.1263 521.12627 -1834.03474 1014.03474
## Nov 2015 580 -351.1263 1511.12627 -844.03474 2004.03474
## Dec 2015 -480 -1411.1263 451.12627 -1904.03474 944.03474
## Jan 2016 300 -631.1263 1231.12627 -1124.03474 1724.03474
## Feb 2016 -910 -1841.1263 21.12627 -2334.03474 514.03474
## Mar 2016 -40 -971.1263 891.12627 -1464.03474 1384.03474
## Apr 2016 120 -811.1263 1051.12627 -1304.03474 1544.03474
## May 2016 -300 -1231.1263 631.12627 -1724.03474 1124.03474
## Jun 2016 1980 1048.8737 2911.12627 555.96526 3404.03474
## Jul 2016 1920 603.1886 3236.81140 -93.88924 3933.88924
## Aug 2016 -1390 -2706.8114 -73.18860 -3403.88924 623.88924
## Sep 2016 -1930 -3246.8114 -613.18860 -3943.88924 83.88924
## Oct 2016 -410 -1726.8114 906.81140 -2423.88924 1603.88924
## Nov 2016 580 -736.8114 1896.81140 -1433.88924 2593.88924
## Dec 2016 -480 -1796.8114 836.81140 -2493.88924 1533.88924
Fitted values and residuals
autoplot(snaive_f, series = "Train")
Check whether the residuals are white noise
checkresiduals(snaive_f)
##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 225.16, df = 24, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 24
Accuracy of Seasonal Naive model
forecast::accuracy(snaive_f, diff(monthly_1_test))
## ME RMSE MAE MPE MAPE MASE
## Training set -0.9429825 726.5617 565.8114 NaN Inf 1.000000
## Test set -30.5882353 815.6845 632.9412 80.09394 97.85567 1.118643
## ACF1 Theil's U
## Training set -0.3595435 NA
## Test set -0.1157613 0.5601452
### 6.2. Seasonal ARIMA model
Model summary and plot
sarima_f = auto.arima(monthly_1_train)
summary(sarima_f)
## Series: monthly_1_train
## ARIMA(1,0,5)(0,1,1)[12]
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4 ma5 sma1
## 0.9791 -0.3524 -0.0601 -0.1436 0.0074 0.1266 -0.7895
## s.e. 0.0110 0.0489 0.0506 0.0520 0.0520 0.0436 0.0338
##
## sigma^2 estimated as 268035: log likelihood=-3507.04
## AIC=7030.07 AICc=7030.39 BIC=7063.07
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -3.835782 507.1257 389.5809 -0.5104896 6.365607 0.5128426
## ACF1
## Training set -0.0007186712
sarima_f %>%
forecast(h = 18) %>%
autoplot()
Check whether the residuals are white noise
checkresiduals(sarima_f)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,5)(0,1,1)[12]
## Q* = 30.259, df = 17, p-value = 0.02454
##
## Model df: 7. Total lags used: 24
Accuracy of Seasonal ARIMA model
forecast::accuracy(forecast(sarima_f, h = 18), monthly_1_test, d = 0, D = 1)
## ME RMSE MAE MPE MAPE MASE
## Training set -3.835782 507.1257 389.5809 -0.5104896 6.365607 0.5128426
## Test set -111.744545 774.8113 687.0752 -4.4136328 12.765260 0.9044630
## ACF1 Theil's U
## Training set -0.0007186712 NA
## Test set 0.4626465094 0.6106427
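To compare the two baselines at a glance, the test-set rows of the accuracy() outputs can be stacked. A sketch (note the seasonal naive model was fit on the differenced series, so its scale-dependent errors are not directly comparable with the ARIMA's):
# Stack the test-set accuracy rows of both models into one table
rbind(
  snaive = forecast::accuracy(snaive_f, diff(monthly_1_test))["Test set", c("RMSE", "MAE", "MASE")],
  sarima = forecast::accuracy(forecast(sarima_f, h = 18), monthly_1_test)["Test set", c("RMSE", "MAE", "MASE")]
)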
## 7. Implement a simple RNN model
Scale both sets to [0, 1] (the rnn package expects values in this range) and transpose them into the one-row-per-sequence matrix format that trainr() uses.
Y_train = matrix(monthly_1_train)
Y_test = matrix(monthly_1_test)
# Min-max scale to [0, 1]; note each set is scaled by its own min/max
Y_train_scaled = (Y_train - min(Y_train)) / (max(Y_train) - min(Y_train))
Y_train = t(Y_train_scaled)
Y_test_scaled = (Y_test - min(Y_test)) / (max(Y_test) - min(Y_test))
Y_test = t(Y_test_scaled)
# Train a basic RNN on the scaled series; X = Y here, so the network
# learns to reproduce the sequence from itself
model = trainr(Y = Y_train,
               X = Y_train,
               learningrate = 0.05,
               hidden_dim = 16,
               numepochs = 1000)
summary(model)
## Length Class Mode
## input_dim 1 -none- numeric
## hidden_dim 1 -none- numeric
## output_dim 1 -none- numeric
## synapse_dim 3 -none- numeric
## time_dim 1 -none- numeric
## sigmoid 1 -none- character
## network_type 1 -none- character
## numepochs 1 -none- numeric
## batch_size 1 -none- numeric
## learningrate 1 -none- numeric
## learningrate_decay 1 -none- numeric
## momentum 1 -none- numeric
## update_rule 1 -none- character
## use_bias 1 -none- logical
## seq_to_seq_unsync 1 -none- logical
## epoch_function 2 -none- list
## loss_function 1 -none- function
## last_layer_error 469 -none- numeric
## last_layer_delta 469 -none- numeric
## store 2 -none- list
## time_synapse 2 -none- list
## recurrent_synapse 1 -none- list
## bias_synapse 2 -none- list
## time_synapse_update 2 -none- list
## bias_synapse_update 2 -none- list
## recurrent_synapse_update 1 -none- list
## error 1000 -none- numeric
## current_epoch 1 -none- numeric
## store_best 2 -none- list
### Plot errors
plot(colMeans(model$error),type='l',xlab='epoch',ylab='errors')
### Predict test set
y_pred = predictr(model, Y_test)
plot(as.vector(t(Y_test)), col = 'red', type = 'l',
     main = "Actual vs Predicted data: testing set",
     ylab = "Y, Yp")
lines(as.vector(t(y_pred)), type = 'l', col = 'black')
legend("bottomright", c("Actual", "Predicted"),
       col = c("red", "black"),
       lty = c(1, 1), lwd = c(1, 1))
### Accuracy analysis
forecast::accuracy(ts(Y_test), ts(y_pred))
## ME RMSE MAE MPE MAPE ACF1
## Test set -0.0027717 0.0419865 0.02592881 7.815092 13.16181 0.0761262
## Theil's U
## Test set 0.1582634
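These accuracy figures are on the [0, 1] scale. A minimal sketch to map the predictions back to the original units, assuming the same min/max that scaled the test set:
# Invert the min-max scaling using the raw test series' min and max
y_min <- min(monthly_1_test)
y_max <- max(monthly_1_test)
y_pred_original <- as.vector(t(y_pred)) * (y_max - y_min) + y_min
head(y_pred_original)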
## 8. Implement an LSTM model with keras
Convert the full series into a tidy tibble indexed by date.
monthly = tk_tbl(monthly_1_full) %>%
  mutate(index = as_date(index)) %>%
  as_tbl_time(index = index)
head(monthly)
## # A time tibble: 6 x 2
## # Index: index
## index value
## <date> <dbl>
## 1 1976-06-01 8000
## 2 1976-07-01 8350
## 3 1976-08-01 8570
## 4 1976-09-01 7700
## 5 1976-10-01 7080
## 6 1976-11-01 6520
monthly %>%
ggplot(aes(index, value)) +
geom_point(color = palette_light()[[1]], alpha = 0.5) +
geom_smooth(method = "loess", span = 0.2, se = FALSE) +
theme_tq() +
labs(title = "Monthly Data Set")
### 8.3. Evaluating the ACF and autocorrelation: h = 12
tidy_acf <- function(data, value, lags = 0:20) {
  value_expr <- enquo(value)
  # Pull the requested column via tidy evaluation, then compute the ACF
  acf_values <- data %>%
    pull(!!value_expr) %>%
    acf(lag.max = tail(lags, 1), plot = FALSE) %>%
    .$acf %>%
    .[, , 1]
  ret <- tibble(acf = acf_values) %>%
    rowid_to_column(var = "lag") %>%
    mutate(lag = lag - 1) %>%
    filter(lag %in% lags)
  return(ret)
}
monthly %>%
tidy_acf(value, lags = 0:24)
## # A tibble: 25 x 2
## lag acf
## <dbl> <dbl>
## 1 0 1.000
## 2 1 0.874
## 3 2 0.725
## 4 3 0.616
## 5 4 0.594
## 6 5 0.630
## 7 6 0.626
## 8 7 0.613
## 9 8 0.562
## 10 9 0.566
## # ... with 15 more rows
Plotting
monthly %>%
tidy_acf(value, lags = 0:24) %>%
ggplot(aes(lag, acf)) +
geom_segment(aes(xend = lag, yend = 0), color = palette_light()[[1]]) +
geom_vline(xintercept = 12, size = 3, color = palette_light()[[2]]) +
annotate("text", label = "12 Month Mark", x = 12, y = 0.827,
color = palette_light()[[2]], size = 6, hjust = 0) +
theme_tq() +
labs(title = "ACF: Monthly")
#### The optimal lag occurs at lag 12
#### With this lag we can implement the LSTM model
periods_train <- 12 * 10   # 10 years of monthly data per training window
periods_test <- 12 * 5     # 5 years per testing window
skip_span <- 12 * 6        # shift successive windows by 6 years
rolling_origin_resamples <- rolling_origin(
monthly,
initial = periods_train,
assess = periods_test,
cumulative = FALSE,
skip = skip_span
)
rolling_origin_resamples
## # Rolling origin forecast resampling
## # A tibble: 5 x 2
## splits id
## <list> <chr>
## 1 <split [120/60]> Slice1
## 2 <split [120/60]> Slice2
## 3 <split [120/60]> Slice3
## 4 <split [120/60]> Slice4
## 5 <split [120/60]> Slice5
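Every slice can be processed the same way; for example, a sketch that maps over the splits column to confirm each train/test size (using rsample's training() and testing() accessors):
# Confirm each slice's dimensions by mapping over the splits
rolling_origin_resamples %>%
  mutate(n_train = map_int(splits, ~ nrow(training(.x))),
         n_test = map_int(splits, ~ nrow(testing(.x))))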
Data Setup
split <- rolling_origin_resamples$splits[[1]]
split_id <- rolling_origin_resamples$id[[1]]
df_trn <- training(split)
df_tst <- testing(split)
df <- bind_rows(
df_trn %>% add_column(key = "training"),
df_tst %>% add_column(key = "testing")
) %>%
as_tbl_time(index = index)
head(df)
## # A time tibble: 6 x 3
## # Index: index
## index value key
## <date> <dbl> <chr>
## 1 1976-06-01 8000 training
## 2 1976-07-01 8350 training
## 3 1976-08-01 8570 training
## 4 1976-09-01 7700 training
## 5 1976-10-01 7080 training
## 6 1976-11-01 6520 training
Preprocessing with recipes: center and scale the data
rec_obj <- recipe(value ~ ., df) %>%
  step_sqrt(value) %>%    # variance-stabilizing square-root transform
  step_center(value) %>%  # subtract the mean
  step_scale(value) %>%   # divide by the standard deviation
  prep()
df_processed_tbl <- bake(rec_obj, df)
df_processed_tbl
## # A tibble: 180 x 3
## index value key
## <date> <dbl> <fct>
## 1 1976-06-01 1.10 training
## 2 1976-07-01 1.33 training
## 3 1976-08-01 1.48 training
## 4 1976-09-01 0.888 training
## 5 1976-10-01 0.446 training
## 6 1976-11-01 0.0292 training
## 7 1976-12-01 -0.319 training
## 8 1977-01-01 0.127 training
## 9 1977-02-01 0.262 training
## 10 1977-03-01 -0.606 training
## # ... with 170 more rows
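To invert the LSTM's predictions later, the centering and scaling values can be pulled out of the prepped recipe. A sketch that reads the recipe's internal step slots (the $means and $sds slot names are an assumption and may change across recipes versions):
# Capture the mean and sd applied by step_center (step 2) and step_scale (step 3)
# NOTE: these are internal recipes slots, so treat them as version-dependent
center_history <- rec_obj$steps[[2]]$means["value"]
scale_history <- rec_obj$steps[[3]]$sds["value"]
c(center = center_history, scale = scale_history)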
LSTM Model input
lag_setting <- 12   # predict from the value 12 months earlier (the ACF peak)
batch_size <- 10    # stateful LSTMs need this to divide the sample counts
train_length <- 120
tsteps <- 1         # one timestep per sample
epochs <- 300
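Stateful LSTMs require the batch size to evenly divide the number of samples in both sets. A quick sanity check, assuming the lag removes lag_setting rows from the training slice:
# Both remainders must be 0 for stateful fitting to work
(train_length - lag_setting) %% batch_size   # training samples after lagging
periods_test %% batch_size                   # testing samples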
2D and 3D train/test arrays
# Training Set
lag_train_tbl <- df_processed_tbl %>%
mutate(value_lag = lag(value, n = lag_setting)) %>%
filter(!is.na(value_lag)) %>%
filter(key == "training") %>%
tail(train_length)
# Keras wants 3D predictors [samples, timesteps, features] and 2D targets
x_train_vec <- lag_train_tbl$value_lag
x_train_arr <- array(data = x_train_vec, dim = c(length(x_train_vec), 1, 1))
y_train_vec <- lag_train_tbl$value
y_train_arr <- array(data = y_train_vec, dim = c(length(y_train_vec), 1))
# Testing Set
lag_test_tbl <- df_processed_tbl %>%
mutate(
value_lag = lag(value, n = lag_setting)
) %>%
filter(!is.na(value_lag)) %>%
filter(key == "testing")
x_test_vec <- lag_test_tbl$value_lag
x_test_arr <- array(data = x_test_vec, dim = c(length(x_test_vec), 1, 1))
y_test_vec <- lag_test_tbl$value
y_test_arr <- array(data = y_test_vec, dim = c(length(y_test_vec), 1))
Implement LSTM model
model_lstm <- keras_model_sequential()
model_lstm %>%
  layer_lstm(units = 50,
             input_shape = c(tsteps, 1),
             batch_size = batch_size,
             return_sequences = TRUE,
             stateful = TRUE) %>%
  layer_lstm(units = 50,
             return_sequences = FALSE,
             stateful = TRUE) %>%
  layer_dense(units = 1)
model_lstm %>%
  compile(loss = 'mae', optimizer = 'adam')
model_lstm
# Note: stateful layers require the sample count to be divisible by
# batch_size. After the lag-12 shift this training slice has 108 samples,
# which 10 does not divide, so fitting may error; a batch size dividing
# both 108 and 60 (e.g. 12) avoids this.
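A minimal fit-and-predict sketch for the model above, assuming a batch size that divides both sample counts and the center_history/scale_history values captured from the recipe earlier:
# Fit the stateful LSTM one epoch at a time, resetting state between epochs
for (i in 1:epochs) {
  model_lstm %>% fit(x = x_train_arr,
                     y = y_train_arr,
                     batch_size = batch_size,
                     epochs = 1,
                     verbose = 0,
                     shuffle = FALSE)
  model_lstm %>% reset_states()
}
# Predict the test window and invert the sqrt/center/scale preprocessing
pred_scaled <- model_lstm %>%
  predict(x_test_arr, batch_size = batch_size)
pred_tbl <- tibble(
  index = lag_test_tbl$index,
  value = (pred_scaled[, 1] * scale_history + center_history)^2
)
head(pred_tbl)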