Building a Step by Step Time Series Forecasting Workflow

Business Case

0.1 Importing Libraries

#Modeling
library(tidymodels)
library(modeltime)

#Core
library(tidyverse)
library(timetk)
library(lubridate)
library(janitor)
library(tidyquant)

#Viz
library(ggthemes)
library(ggsci)

#Tables
library(gt)

1 Preparing Dataframe

Here it is a glimpse of our main table. We will use two stocks for demonstration purposes: Google and Amazon. The time frame is only for one year which is a bit of a disadvantage because we cannot use other years to have more evidence of the patterns we are seeing now. Nevertheless, as mentioned before the goal of this notebook is to implement a simple forecasting workflow.

# Importing Dataframe

raw_tbl <- read_csv("/Users/nguyenbuiminh/Desktop/coding_python/ml/time_series/prices.csv")

start_date <- "2016-01-04"
end_date <- "2016-01-08"



stock_filtered_tbl <- raw_tbl %>%
  set_names(names(.) %>% str_to_lower()) %>%
  filter_by_time(
    .start_date = "2016"
  ) %>%
  filter(symbol == "GOOGL")


stock_filtered_tbl %>%
  mutate(date = as.Date(date)) %>%
  head(10) %>%gt() %>%
  tab_header(
    title = "NYSE (GOOGL Stock)",
    subtitle = glue::glue("{start_date} to {end_date}")
  ) %>%
  fmt_date(
    columns = vars(date),
    date_style = 3
  ) %>%
  fmt_currency(
    columns = vars(open, high, low, close),
    currency = "USD"
  ) %>%
  fmt_number(
    columns = vars(volume),
    suffixing = TRUE
  )
NYSE (GOOGL Stock)
2016-01-04 to 2016-01-08
date symbol open close low high volume
Mon, Jan 4, 2016 GOOGL $762.20 $759.44 $747.54 $762.20 3.37M
Tue, Jan 5, 2016 GOOGL $764.10 $761.53 $755.65 $769.20 2.26M
Wed, Jan 6, 2016 GOOGL $750.37 $759.33 $748.00 $765.73 2.41M
Thu, Jan 7, 2016 GOOGL $746.49 $741.00 $735.28 $755.31 3.16M
Fri, Jan 8, 2016 GOOGL $747.80 $730.91 $728.92 $750.12 2.38M
Mon, Jan 11, 2016 GOOGL $731.95 $733.07 $719.56 $735.08 2.54M
Tue, Jan 12, 2016 GOOGL $740.75 $745.34 $736.43 $748.34 2.34M
Wed, Jan 13, 2016 GOOGL $749.34 $719.57 $716.78 $753.00 2.59M
Thu, Jan 14, 2016 GOOGL $724.44 $731.39 $705.00 $739.89 2.78M
Fri, Jan 15, 2016 GOOGL $709.99 $710.49 $701.51 $724.16 3.78M

2 Time Series Exploratory Analysis

2.1 Visualizations

In this section we will focus on the visualizations of both the time series for Google stock

Summary - Google Stock: We see much more volatility around the Google stock *just need to observe the upward and downward swings around the time series)

# Create a new theme for the future

stock_filtered_tbl %>%
  plot_time_series(date, close, .interactive = FALSE, .smooth = TRUE, .color_var = symbol, .line_size = 1) + 
  theme_minimal() +
  scale_y_continuous(labels = scales::dollar_format()) +
  scale_color_tron()

## Candlestick Chart

We use the candlestick plot to display much better how does our dataframe looks like.

googl_plot <- stock_filtered_tbl %>%
  ggplot(aes(x=date, y=close)) +
  geom_candlestick(aes(open=open, high=high, low=low, close=close)) + 
  geom_ma(ma_fun = SMA, n = 15, color = "#7496D2", size=1) +
  labs(
    title = "GOOGL Candlestick Chart", y = "Closing Price", x=""
  ) +
  theme_minimal() +
  scale_y_continuous(labels=scales::dollar_format())

googl_plot

2.2 Transforming and Scaling

We will transform the closing price (target variable) by using logs and standardizing the target variable. We usually apply transformations to remove noise from our time series.

trans_tbl <- stock_filtered_tbl %>%
  mutate(close=log_interval_vec(close, limit_lower=0, offset=1)) %>%
  mutate(close = standardize_vec(close))


inversion_tbl <- trans_tbl %>%
  summarise(
    log_mean = mean(close),
    log_mean = as.numeric(format(log_mean, digits = 15)),
    sd_rev = sd(close),
    sd_rev = as.numeric(format(sd_rev, digits = 15))
  )

inv_mean <- inversion_tbl$log_mean
inv_sd <- inversion_tbl$sd_rev

Building the complete table

2.3 Preparation

2.3.1 Creating the complete table

We have added some lags solely with the main purpose to see if we are able to capture the trend. Trend is an important concept that our model should capture in order to make more precise predictions in the upcoming phases of this notebook.

# Our forecast horizon will be 60 days
forecast_h <- 60
lag_period <- 60
rolling_avg <- c(30,60,90,180)

complete_tbl <- trans_tbl %>% 
  bind_rows(
    future_frame(.data = ., .date_var = date, .length_out = forecast_h)
  ) %>%
  tk_augment_lags(close, .lags = lag_period) %>% 
  tk_augment_slidify(
    .value = close_lag60,
    .f = mean,
    .period = rolling_avg,
    .align = "center",
    .partial = TRUE
  ) %>% 
  rename_with(.cols = contains("lag"), .fn = ~str_c("lag_", .))


# Without the future frame (We will use this table for most of our tasks)
complete_prepared_tbl <- complete_tbl %>% 
  filter(!is.na(close))

forecast_tbl <- complete_tbl %>% 
  filter(is.na(close))
# Lag roll 60 has a better upward trend 

complete_prepared_tbl %>% 
  select(date, symbol, close, contains("lag")) %>% 
  pivot_longer(cols = close:lag_close_lag60_roll_180) %>%
  plot_time_series(date, value, name, .smooth=F, .interactive=F, .line_size=1) + 
  scale_color_locuszoom() + theme_dark() + theme(
    panel.background = element_rect(fill = "#2D2D2D"),
    legend.key = element_rect(fill = "#2D2D2D"),
    legend.position = "bottom") + 
  labs(
    title = "Lags Plot (GOOGL Stock)",
    subtitle = "A glimpse of the full table",
    x = "Date",
    y = "Close Price", 
    caption = "Note: Lag 60 days seems to better catch the trend"
  )

2.4 SPlitting the Dataset

In this analysis we will split the dataset into 80% training and 20% testing set.

split_frame <- complete_prepared_tbl  %>% 
  time_series_split(
    .date_var = date,
    assess = "8 weeks",
    cumulative = TRUE
  )


  
split_frame %>% 
  tk_time_series_cv_plan() %>% 
  plot_time_series_cv_plan(date, close, .interactive=F, .line_size=1) + 
    scale_color_locuszoom() + theme_dark() + theme(
    panel.background = element_rect(fill = "#2D2D2D"),
    legend.key = element_rect(fill = "#2D2D2D"),
    legend.position = "bottom") + 
  labs(
    title = "Dataframe Split",
    caption = "We use this before refitting"

  )

2.4.1 Feature Engineering

Using the recipes package, we will implement a simple preprocessing pipeline. As explained in previous notebooks, we will extract feature from our date column in order to come with valuable features that our model could detect to come up with more accurate predictions. We will also implement fourier series, normalization and one hot encoding to categorical variables.

#Prepare the base of the recipe

recipe_basic_specs <- recipe(close ~ date, data=training(split_frame)) %>%
  step_timeseries_signature(date) %>%
  step_fourier(date, period = c(30,60,90,180), K=1) %>%
  step_rm(matches("(iso)|(xts)|(hour)|(minute)|(second)|(am.pm)|(date_quarter)")) %>% 
  step_normalize(matches("(index.num)|(yday)")) %>% 
  step_dummy(all_nominal_predictors(), one_hot=TRUE)

2.5 Setting up the Models

We need to set up our models with the parsnip library before adding them to our workflow. The steps goes as follows, we prepare the preprocessing steps, we set our model and then we train our models in our workflow. Repeat after me, preprocess, set models and integrate to a workflow for training!

# Models to pick: Random Forest, Arima (Univariate), GLMNET

# Random Forest Model Multivariate


model_rf_fitted <- rand_forest() %>% 
  set_engine("randomForest")%>%
  set_mode("regression")

#Workflow we apply to multivariates

workflow_rf_fitted <- workflow() %>%
  add_recipe(recipe_basic_specs) %>%
  add_model(model_rf_fitted) %>%
  fit(training(split_frame))

#ARIMA Model (univariate)

#ARIMA Multivariate
model_arima_multi_fitted <- arima_reg() %>%
  set_engine("auto_arima")

workflow_arima_multi_fitted <- workflow() %>%
  add_recipe(recipe_basic_specs) %>%
  add_model(model_rf_fitted) %>%
  fit(training(split_frame))

# Bundle the models into a table
models_tbl <- modeltime_table(
  workflow_rf_fitted,
  workflow_arima_multi_fitted
) %>%
  update_model_description(1, "RandomForest Model") %>%
  update_model_description(3, "ARIMA Multivariate")
calibration_tbl <- models_tbl %>% 
  modeltime_calibrate(
    new_data = testing(split_frame)
  ) 


calibration_tbl %>% 
  modeltime_accuracy(
    metric_set = default_forecast_accuracy_metric_set()
  ) %>% 
  table_modeltime_accuracy()

2.5.1 Test set trial

calibration_tbl %>% 
  modeltime_forecast(
    actual_data = complete_prepared_tbl,
    conf_interval = 0.95
  ) %>% 
  plot_modeltime_forecast(.interactive = F, .line_size=1, .conf_interval_fill = "white") +
    theme(
    panel.background = element_rect(fill = "#2D2D2D"),
    legend.key = element_rect(fill = "#2D2D2D"),
    legend.position = "bottom") + scale_colour_ucscgb() +
  labs(
    title = "GOOGL Stock",
    subtitle = "Forecasting on the test set"
  ) + theme(legend.position = "bottom")

2.6 FORECASTING

2.6.1 REFIT

calibration_tbl %>% 
  # Trains on the whole training set
  # It says Updated on the training set because parameters have changed when refitting
  modeltime_refit(data = complete_prepared_tbl) %>% 
  # H depends on time column not good for models depending on external regressors
  modeltime_forecast(
    new_data = forecast_tbl,
    actual_data = complete_prepared_tbl
  ) %>% 
  plot_modeltime_forecast(.interactive = F, .line_size=1, 
                          .conf_interval_fill = "white",
                          .legend_max_width = 25) + 
  theme(
    panel.background = element_rect(fill = "#2D2D2D"),
    legend.key = element_rect(fill = "#2D2D2D"),
    legend.position = "bottom") + scale_color_tron() +
  labs(
    title = "Forecasting Forward",
    subtitle = "GOOGL Stock"
  )

refit_tbl <- calibration_tbl %>% 
  modeltime_refit(data = complete_prepared_tbl) 

refit_tbl %>% 
    modeltime_forecast(
    new_data = forecast_tbl,
    actual_data = complete_prepared_tbl
  ) %>% 
  mutate(across(.value:.conf_hi, .fns = ~ standardize_inv_vec(
    x = ., 
    mean = 2.26601292196241, 
    sd = 0.531019665873533
  ))) %>% 
  mutate(across(.value:.conf_hi, .fns = ~log_interval_inv_vec(
    x = ., 
    limit_lower = 0,
    limit_upper = 852.1999875,
    offset = 1
  ))) %>% 
  plot_modeltime_forecast(.interactive = F, .line_size=1, .conf_interval_fill = "white") + 
    theme(
    panel.background = element_rect(fill = "#2D2D2D"),
    legend.key = element_rect(fill = "#2D2D2D"),
    legend.position = "bottom") + scale_color_tron() +
  labs(
    title = "Inverse Transformation Forecast",
    subtitle = "GOOGL Stock"
  ) + scale_y_continuous(labels = scales::dollar_format())