NYSE (GOOGL Stock)
Mon, Jan 4, 2016	GOOGL	$762.20	$759.44	$747.54	$762.20	3.37M
Tue, Jan 5, 2016	GOOGL	$764.10	$761.53	$755.65	$769.20	2.26M
Wed, Jan 6, 2016	GOOGL	$750.37	$759.33	$748.00	$765.73	2.41M
Thu, Jan 7, 2016	GOOGL	$746.49	$741.00	$735.28	$755.31	3.16M
Fri, Jan 8, 2016	GOOGL	$747.80	$730.91	$728.92	$750.12	2.38M
Mon, Jan 11, 2016	GOOGL	$731.95	$733.07	$719.56	$735.08	2.54M
Tue, Jan 12, 2016	GOOGL	$740.75	$745.34	$736.43	$748.34	2.34M
Wed, Jan 13, 2016	GOOGL	$749.34	$719.57	$716.78	$753.00	2.59M
Thu, Jan 14, 2016	GOOGL	$724.44	$731.39	$705.00	$739.89	2.78M
Fri, Jan 15, 2016	GOOGL	$709.99	$710.49	$701.51	$724.16	3.78M

2 Time Series Exploratory Analysis

2.1 Visualizations

In this section we will focus on the visualizations of both the time series for Google stock

Summary - Google Stock: We see much more volatility around the Google stock *just need to observe the upward and downward swings around the time series)

# Create a new theme for the future

stock_filtered_tbl %>%
  plot_time_series(date, close, .interactive = FALSE, .smooth = TRUE, .color_var = symbol, .line_size = 1) + 
  theme_minimal() +
  scale_y_continuous(labels = scales::dollar_format()) +
  scale_color_tron()

## Candlestick Chart

We use the candlestick plot to display much better how does our dataframe looks like.

googl_plot <- stock_filtered_tbl %>%
  ggplot(aes(x=date, y=close)) +
  geom_candlestick(aes(open=open, high=high, low=low, close=close)) + 
  geom_ma(ma_fun = SMA, n = 15, color = "#7496D2", size=1) +
  labs(
    title = "GOOGL Candlestick Chart", y = "Closing Price", x=""
  ) +
  theme_minimal() +
  scale_y_continuous(labels=scales::dollar_format())

googl_plot

2.2 Transforming and Scaling

We will transform the closing price (target variable) by using logs and standardizing the target variable. We usually apply transformations to remove noise from our time series.

trans_tbl <- stock_filtered_tbl %>%
  mutate(close=log_interval_vec(close, limit_lower=0, offset=1)) %>%
  mutate(close = standardize_vec(close))


inversion_tbl <- trans_tbl %>%
  summarise(
    log_mean = mean(close),
    log_mean = as.numeric(format(log_mean, digits = 15)),
    sd_rev = sd(close),
    sd_rev = as.numeric(format(sd_rev, digits = 15))
  )

inv_mean <- inversion_tbl$log_mean
inv_sd <- inversion_tbl$sd_rev

Building the complete table

2.3 Preparation

2.3.1 Creating the complete table

We have added some lags solely with the main purpose to see if we are able to capture the trend. Trend is an important concept that our model should capture in order to make more precise predictions in the upcoming phases of this notebook.

# Our forecast horizon will be 60 days
forecast_h <- 60
lag_period <- 60
rolling_avg <- c(30,60,90,180)

complete_tbl <- trans_tbl %>% 
  bind_rows(
    future_frame(.data = ., .date_var = date, .length_out = forecast_h)
  ) %>%
  tk_augment_lags(close, .lags = lag_period) %>% 
  tk_augment_slidify(
    .value = close_lag60,
    .f = mean,
    .period = rolling_avg,
    .align = "center",
    .partial = TRUE
  ) %>% 
  rename_with(.cols = contains("lag"), .fn = ~str_c("lag_", .))


# Without the future frame (We will use this table for most of our tasks)
complete_prepared_tbl <- complete_tbl %>% 
  filter(!is.na(close))

forecast_tbl <- complete_tbl %>% 
  filter(is.na(close))

# Lag roll 60 has a better upward trend 

complete_prepared_tbl %>% 
  select(date, symbol, close, contains("lag")) %>% 
  pivot_longer(cols = close:lag_close_lag60_roll_180) %>%
  plot_time_series(date, value, name, .smooth=F, .interactive=F, .line_size=1) + 
  scale_color_locuszoom() + theme_dark() + theme(
    panel.background = element_rect(fill = "#2D2D2D"),
    legend.key = element_rect(fill = "#2D2D2D"),
    legend.position = "bottom") + 
  labs(
    title = "Lags Plot (GOOGL Stock)",
    subtitle = "A glimpse of the full table",
    x = "Date",
    y = "Close Price", 
    caption = "Note: Lag 60 days seems to better catch the trend"
  )

2.4 SPlitting the Dataset

In this analysis we will split the dataset into 80% training and 20% testing set.

split_frame <- complete_prepared_tbl  %>% 
  time_series_split(
    .date_var = date,
    assess = "8 weeks",
    cumulative = TRUE
  )


  
split_frame %>% 
  tk_time_series_cv_plan() %>% 
  plot_time_series_cv_plan(date, close, .interactive=F, .line_size=1) + 
    scale_color_locuszoom() + theme_dark() + theme(
    panel.background = element_rect(fill = "#2D2D2D"),
    legend.key = element_rect(fill = "#2D2D2D"),
    legend.position = "bottom") + 
  labs(
    title = "Dataframe Split",
    caption = "We use this before refitting"

  )

2.4.1 Feature Engineering

Using the recipes package, we will implement a simple preprocessing pipeline. As explained in previous notebooks, we will extract feature from our date column in order to come with valuable features that our model could detect to come up with more accurate predictions. We will also implement fourier series, normalization and one hot encoding to categorical variables.

#Prepare the base of the recipe

recipe_basic_specs <- recipe(close ~ date, data=training(split_frame)) %>%
  step_timeseries_signature(date) %>%
  step_fourier(date, period = c(30,60,90,180), K=1) %>%
  step_rm(matches("(iso)|(xts)|(hour)|(minute)|(second)|(am.pm)|(date_quarter)")) %>% 
  step_normalize(matches("(index.num)|(yday)")) %>% 
  step_dummy(all_nominal_predictors(), one_hot=TRUE)

2.5 Setting up the Models

We need to set up our models with the parsnip library before adding them to our workflow. The steps goes as follows, we prepare the preprocessing steps, we set our model and then we train our models in our workflow. Repeat after me, preprocess, set models and integrate to a workflow for training!

# Models to pick: Random Forest, Arima (Univariate), GLMNET

# Random Forest Model Multivariate


model_rf_fitted <- rand_forest() %>% 
  set_engine("randomForest")%>%
  set_mode("regression")

#Workflow we apply to multivariates

workflow_rf_fitted <- workflow() %>%
  add_recipe(recipe_basic_specs) %>%
  add_model(model_rf_fitted) %>%
  fit(training(split_frame))

#ARIMA Model (univariate)

#ARIMA Multivariate
model_arima_multi_fitted <- arima_reg() %>%
  set_engine("auto_arima")

workflow_arima_multi_fitted <- workflow() %>%
  add_recipe(recipe_basic_specs) %>%
  add_model(model_rf_fitted) %>%
  fit(training(split_frame))

# Bundle the models into a table
models_tbl <- modeltime_table(
  workflow_rf_fitted,
  workflow_arima_multi_fitted
) %>%
  update_model_description(1, "RandomForest Model") %>%
  update_model_description(3, "ARIMA Multivariate")

calibration_tbl <- models_tbl %>% 
  modeltime_calibrate(
    new_data = testing(split_frame)
  ) 


calibration_tbl %>% 
  modeltime_accuracy(
    metric_set = default_forecast_accuracy_metric_set()
  ) %>% 
  table_modeltime_accuracy()

2.5.1 Test set trial

calibration_tbl %>% 
  modeltime_forecast(
    actual_data = complete_prepared_tbl,
    conf_interval = 0.95
  ) %>% 
  plot_modeltime_forecast(.interactive = F, .line_size=1, .conf_interval_fill = "white") +
    theme(
    panel.background = element_rect(fill = "#2D2D2D"),
    legend.key = element_rect(fill = "#2D2D2D"),
    legend.position = "bottom") + scale_colour_ucscgb() +
  labs(
    title = "GOOGL Stock",
    subtitle = "Forecasting on the test set"
  ) + theme(legend.position = "bottom")

2.6 FORECASTING

2.6.1 REFIT

calibration_tbl %>% 
  # Trains on the whole training set
  # It says Updated on the training set because parameters have changed when refitting
  modeltime_refit(data = complete_prepared_tbl) %>% 
  # H depends on time column not good for models depending on external regressors
  modeltime_forecast(
    new_data = forecast_tbl,
    actual_data = complete_prepared_tbl
  ) %>% 
  plot_modeltime_forecast(.interactive = F, .line_size=1, 
                          .conf_interval_fill = "white",
                          .legend_max_width = 25) + 
  theme(
    panel.background = element_rect(fill = "#2D2D2D"),
    legend.key = element_rect(fill = "#2D2D2D"),
    legend.position = "bottom") + scale_color_tron() +
  labs(
    title = "Forecasting Forward",
    subtitle = "GOOGL Stock"
  )

refit_tbl <- calibration_tbl %>% 
  modeltime_refit(data = complete_prepared_tbl) 

refit_tbl %>% 
    modeltime_forecast(
    new_data = forecast_tbl,
    actual_data = complete_prepared_tbl
  ) %>% 
  mutate(across(.value:.conf_hi, .fns = ~ standardize_inv_vec(
    x = ., 
    mean = 2.26601292196241, 
    sd = 0.531019665873533
  ))) %>% 
  mutate(across(.value:.conf_hi, .fns = ~log_interval_inv_vec(
    x = ., 
    limit_lower = 0,
    limit_upper = 852.1999875,
    offset = 1
  ))) %>% 
  plot_modeltime_forecast(.interactive = F, .line_size=1, .conf_interval_fill = "white") + 
    theme(
    panel.background = element_rect(fill = "#2D2D2D"),
    legend.key = element_rect(fill = "#2D2D2D"),
    legend.position = "bottom") + scale_color_tron() +
  labs(
    title = "Inverse Transformation Forecast",
    subtitle = "GOOGL Stock"
  ) + scale_y_continuous(labels = scales::dollar_format())

time_series_p3

nguyen_lscm

2023-12-06

Building a Step by Step Time Series Forecasting Workflow

0.1 Importing Libraries

1 Preparing Dataframe