# Business Case
Here is a glimpse of our main table. We will use two stocks for demonstration purposes: Google and Amazon. The time frame covers only one year, which is a limitation, because we cannot check other years for more evidence of the patterns we observe here. Nevertheless, as mentioned before, the goal of this notebook is to implement a simple forecasting workflow.
# Import the raw NYSE prices table
# Packages assumed loaded earlier: tidyverse, timetk, tidyquant, modeltime, gt, ggsci
raw_tbl <- read_csv("/Users/nguyenbuiminh/Desktop/coding_python/ml/time_series/prices.csv")

start_date <- "2016-01-04"
end_date   <- "2016-01-15"

# Lowercase the column names, keep 2016 only, and keep the GOOGL symbol
stock_filtered_tbl <- raw_tbl %>%
    set_names(names(.) %>% str_to_lower()) %>%
    filter_by_time(.start_date = "2016") %>%
    filter(symbol == "GOOGL")
stock_filtered_tbl %>%
    mutate(date = as.Date(date)) %>%
    head(10) %>%
    gt() %>%
    tab_header(
        title = "NYSE (GOOGL Stock)",
        subtitle = glue::glue("{start_date} to {end_date}")
    ) %>%
    fmt_date(
        columns = date,
        date_style = 3
    ) %>%
    fmt_currency(
        columns = c(open, high, low, close),
        currency = "USD"
    ) %>%
    fmt_number(
        columns = volume,
        suffixing = TRUE
    )
**NYSE (GOOGL Stock)**
*2016-01-04 to 2016-01-15*

| date | symbol | open | close | low | high | volume |
|---|---|---|---|---|---|---|
| Mon, Jan 4, 2016 | GOOGL | $762.20 | $759.44 | $747.54 | $762.20 | 3.37M |
| Tue, Jan 5, 2016 | GOOGL | $764.10 | $761.53 | $755.65 | $769.20 | 2.26M |
| Wed, Jan 6, 2016 | GOOGL | $750.37 | $759.33 | $748.00 | $765.73 | 2.41M |
| Thu, Jan 7, 2016 | GOOGL | $746.49 | $741.00 | $735.28 | $755.31 | 3.16M |
| Fri, Jan 8, 2016 | GOOGL | $747.80 | $730.91 | $728.92 | $750.12 | 2.38M |
| Mon, Jan 11, 2016 | GOOGL | $731.95 | $733.07 | $719.56 | $735.08 | 2.54M |
| Tue, Jan 12, 2016 | GOOGL | $740.75 | $745.34 | $736.43 | $748.34 | 2.34M |
| Wed, Jan 13, 2016 | GOOGL | $749.34 | $719.57 | $716.78 | $753.00 | 2.59M |
| Thu, Jan 14, 2016 | GOOGL | $724.44 | $731.39 | $705.00 | $739.89 | 2.78M |
| Fri, Jan 15, 2016 | GOOGL | $709.99 | $710.49 | $701.51 | $724.16 | 3.78M |
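The business case mentions Amazon as the second demonstration stock. The same pipeline reproduces the table for it; a short sketch (the `amzn_filtered_tbl` name is ours, not the notebook's):

# Same filter for the second stock (AMZN is Amazon's ticker)
amzn_filtered_tbl <- raw_tbl %>%
    set_names(names(.) %>% str_to_lower()) %>%
    filter_by_time(.start_date = "2016") %>%
    filter(symbol == "AMZN")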
In this section we focus on visualizing the time series for the Google stock.

Summary (Google stock): the series shows considerable volatility; just observe the upward and downward swings across the year.
# Plot the closing price; this styling is reused throughout the notebook
stock_filtered_tbl %>%
    plot_time_series(
        date, close,
        .interactive = FALSE, .smooth = TRUE,
        .color_var = symbol, .line_size = 1
    ) +
    theme_minimal() +
    scale_y_continuous(labels = scales::dollar_format()) +
    scale_color_tron()
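To put a rough number on that volatility, the sketch below computes the standard deviation of daily log returns (the `ret` and `daily_vol` names are ours, not part of the original workflow):

# Rough volatility check: standard deviation of daily log returns
stock_filtered_tbl %>%
    arrange(date) %>%
    mutate(ret = log(close / lag(close))) %>%
    summarise(daily_vol = sd(ret, na.rm = TRUE))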
## Candlestick Chart
We use a candlestick chart to get a much better view of what the data looks like, since each candle encodes the open, high, low, and close at once.
googl_plot <- stock_filtered_tbl %>%
    ggplot(aes(x = date, y = close)) +
    geom_candlestick(aes(open = open, high = high, low = low, close = close)) +
    geom_ma(ma_fun = SMA, n = 15, color = "#7496D2", size = 1) +
    labs(
        title = "GOOGL Candlestick Chart", y = "Closing Price", x = ""
    ) +
    theme_minimal() +
    scale_y_continuous(labels = scales::dollar_format())

googl_plot

We will transform the closing price (the target variable) by applying a log-interval transformation and then standardizing it. Transformations like these reduce noise and stabilize the variance of the series.
trans_tbl <- stock_filtered_tbl %>%
    mutate(close = log_interval_vec(close, limit_lower = 0, offset = 1))

# Capture the mean and sd of the log-transformed series *before*
# standardizing; we need these to invert the transformation later.
# (Computing them after standardize_vec() would just return 0 and 1.)
inversion_tbl <- trans_tbl %>%
    summarise(
        log_mean = as.numeric(format(mean(close), digits = 15)),
        sd_rev   = as.numeric(format(sd(close), digits = 15))
    )

trans_tbl <- trans_tbl %>%
    mutate(close = standardize_vec(close))

inv_mean <- inversion_tbl$log_mean
inv_sd   <- inversion_tbl$sd_rev
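As a quick sanity check (ours, not part of the original flow), we can invert both transformations and confirm that we recover the original closing prices. The `limit_upper` value is the auto value that `log_interval_vec()` reports, reused later in this notebook:

# Round-trip check: undo the standardization, then the log-interval transform
recovered <- trans_tbl$close %>%
    standardize_inv_vec(mean = inv_mean, sd = inv_sd) %>%
    log_interval_inv_vec(limit_lower = 0, limit_upper = 852.1999875, offset = 1)

max(abs(recovered - stock_filtered_tbl$close))  # should be ~0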
## Building the Complete Table

We add lagged versions of the closing price with one main purpose: to check whether they capture the trend. Trend is an important concept that our model should capture in order to make more precise predictions in the upcoming phases of this notebook.
# Our forecast horizon will be 60 days
forecast_h  <- 60
lag_period  <- 60
rolling_avg <- c(30, 60, 90, 180)

complete_tbl <- trans_tbl %>%
    # Extend the series 60 days into the future (close is NA there)
    bind_rows(
        future_frame(.data = ., .date_var = date, .length_out = forecast_h)
    ) %>%
    # Add a 60-day lag of close, then rolling means of that lag
    tk_augment_lags(close, .lags = lag_period) %>%
    tk_augment_slidify(
        .value   = close_lag60,
        .f       = mean,
        .period  = rolling_avg,
        .align   = "center",
        .partial = TRUE
    ) %>%
    rename_with(.cols = contains("lag"), .fn = ~ str_c("lag_", .))

# Rows with observed prices (we use this table for most of our tasks)
complete_prepared_tbl <- complete_tbl %>%
    filter(!is.na(close))

# Future rows only; this is what we forecast over
forecast_tbl <- complete_tbl %>%
    filter(is.na(close))
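A quick look at the future rows confirms why the 60-day lag matters: `close` is `NA` there, while the lag features still carry values the models can use:

# Inspect the future frame: close is NA, the lag features are populated
forecast_tbl %>%
    select(date, close, contains("lag")) %>%
    head()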
# The rolling averages of the 60-day lag capture the upward trend well
complete_prepared_tbl %>%
    select(date, symbol, close, contains("lag")) %>%
    pivot_longer(cols = close:lag_close_lag60_roll_180) %>%
    plot_time_series(date, value, name, .smooth = FALSE, .interactive = FALSE, .line_size = 1) +
    scale_color_locuszoom() +
    theme_dark() +
    theme(
        panel.background = element_rect(fill = "#2D2D2D"),
        legend.key       = element_rect(fill = "#2D2D2D"),
        legend.position  = "bottom"
    ) +
    labs(
        title    = "Lags Plot (GOOGL Stock)",
        subtitle = "A glimpse of the full table",
        x        = "Date",
        y        = "Close Price",
        caption  = "Note: the 60-day lag features seem to catch the trend best"
    )

For modeling we split the dataset by time, holding out the last 8 weeks of the series as the test set.
split_frame <- complete_prepared_tbl %>%
    time_series_split(
        date_var   = date,
        assess     = "8 weeks",
        cumulative = TRUE
    )
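Before plotting the plan, a quick check (ours, not in the original) on how many rows land on each side of the split:

# Row counts for each side of the split
nrow(training(split_frame))
nrow(testing(split_frame))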
split_frame %>%
    tk_time_series_cv_plan() %>%
    plot_time_series_cv_plan(date, close, .interactive = FALSE, .line_size = 1) +
    scale_color_locuszoom() +
    theme_dark() +
    theme(
        panel.background = element_rect(fill = "#2D2D2D"),
        legend.key       = element_rect(fill = "#2D2D2D"),
        legend.position  = "bottom"
    ) +
    labs(
        title   = "Dataframe Split",
        caption = "We use this split before refitting"
    )

Using the recipes package, we implement a simple preprocessing pipeline. As explained in previous notebooks, we extract features from the date column so the model has informative predictors to work with. We also add Fourier terms, normalize the trend-related numeric features, and one-hot encode the categorical variables.
# Prepare the base recipe
recipe_basic_specs <- recipe(close ~ date, data = training(split_frame)) %>%
    step_timeseries_signature(date) %>%
    step_fourier(date, period = c(30, 60, 90, 180), K = 1) %>%
    step_rm(matches("(iso)|(xts)|(hour)|(minute)|(second)|(am.pm)|(date_quarter)")) %>%
    step_normalize(matches("(index.num)|(yday)")) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE)
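To see exactly which features the recipe produces, you can prep it and bake the training data (an inspection step we add here; the workflow does not require it):

# Inspect the engineered features the recipe generates
recipe_basic_specs %>%
    prep() %>%
    bake(new_data = NULL) %>%
    glimpse()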
We need to specify our models with the parsnip library before adding them to a workflow. The steps go as follows: prepare the preprocessing recipe, specify the model, then train the model inside a workflow. Repeat after me: preprocess, specify models, and combine them in a workflow for training!

# Models to pick: Random Forest and ARIMA (a GLMNET sketch follows below)
# Random Forest model (multivariate)
model_rf_fitted <- rand_forest() %>%
    set_engine("randomForest") %>%
    set_mode("regression")

# Workflow we apply to the multivariate models
workflow_rf_fitted <- workflow() %>%
    add_recipe(recipe_basic_specs) %>%
    add_model(model_rf_fitted) %>%
    fit(training(split_frame))
# ARIMA model (multivariate: the recipe supplies external regressors)
model_arima_multi_fitted <- arima_reg() %>%
    set_engine("auto_arima")

workflow_arima_multi_fitted <- workflow() %>%
    add_recipe(recipe_basic_specs) %>%
    add_model(model_arima_multi_fitted) %>%
    fit(training(split_frame))
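The model list above also names GLMNET. Here is a hedged sketch of how it would slot into the same pattern; the penalty and mixture values are illustrative, and the `recipe_glmnet_specs`, `model_glmnet_fitted`, and `workflow_glmnet_fitted` names are ours:

# GLMNET needs all-numeric predictors, so drop the raw date column first
recipe_glmnet_specs <- recipe_basic_specs %>%
    step_rm(date)

# Elastic-net regression via glmnet (illustrative hyperparameters)
model_glmnet_fitted <- linear_reg(penalty = 0.01, mixture = 0.5) %>%
    set_engine("glmnet")

workflow_glmnet_fitted <- workflow() %>%
    add_recipe(recipe_glmnet_specs) %>%
    add_model(model_glmnet_fitted) %>%
    fit(training(split_frame))

Once fitted, it could join the other two models as a third entry in modeltime_table() below.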
# Bundle the models into a modeltime table
models_tbl <- modeltime_table(
    workflow_rf_fitted,
    workflow_arima_multi_fitted
) %>%
    update_model_description(1, "RandomForest Model") %>%
    update_model_description(2, "ARIMA Multivariate")

# Calibrate on the hold-out test set
calibration_tbl <- models_tbl %>%
    modeltime_calibrate(
        new_data = testing(split_frame)
    )
calibration_tbl %>%
    modeltime_accuracy(
        metric_set = default_forecast_accuracy_metric_set()
    ) %>%
    table_modeltime_accuracy()
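If you prefer a narrower report, modeltime_accuracy() accepts any yardstick metric set; a minimal sketch, assuming the tidymodels packages are attached:

# A smaller metric set built with yardstick
calibration_tbl %>%
    modeltime_accuracy(metric_set = metric_set(mae, rmse, rsq)) %>%
    table_modeltime_accuracy()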
calibration_tbl %>%
    modeltime_forecast(
        actual_data   = complete_prepared_tbl,
        conf_interval = 0.95
    ) %>%
    plot_modeltime_forecast(.interactive = FALSE, .line_size = 1, .conf_interval_fill = "white") +
    theme(
        panel.background = element_rect(fill = "#2D2D2D"),
        legend.key       = element_rect(fill = "#2D2D2D"),
        legend.position  = "bottom"
    ) +
    scale_colour_ucscgb() +
    labs(
        title    = "GOOGL Stock",
        subtitle = "Forecasting on the test set"
    )
calibration_tbl %>%
    # Refit each model on the full prepared dataset; descriptions show
    # "UPDATE" when parameters changed during refitting
    modeltime_refit(data = complete_prepared_tbl) %>%
    # Pass new_data instead of h: a plain horizon only extends the date
    # column, which is not enough for models that rely on external regressors
    modeltime_forecast(
        new_data    = forecast_tbl,
        actual_data = complete_prepared_tbl
    ) %>%
    plot_modeltime_forecast(
        .interactive        = FALSE,
        .line_size          = 1,
        .conf_interval_fill = "white",
        .legend_max_width   = 25
    ) +
    theme(
        panel.background = element_rect(fill = "#2D2D2D"),
        legend.key       = element_rect(fill = "#2D2D2D"),
        legend.position  = "bottom"
    ) +
    scale_color_tron() +
    labs(
        title    = "Forecasting Forward",
        subtitle = "GOOGL Stock"
    )

# Keep the refitted models for the final, back-transformed forecast
refit_tbl <- calibration_tbl %>%
    modeltime_refit(data = complete_prepared_tbl)
refit_tbl %>%
    modeltime_forecast(
        new_data    = forecast_tbl,
        actual_data = complete_prepared_tbl
    ) %>%
    # Invert the standardization with the mean/sd captured earlier
    mutate(across(.value:.conf_hi, .fns = ~ standardize_inv_vec(
        x    = .,
        mean = inv_mean,  # 2.26601292196241
        sd   = inv_sd     # 0.531019665873533
    ))) %>%
    # Invert the log-interval transform (limit_upper is the auto value
    # reported by log_interval_vec)
    mutate(across(.value:.conf_hi, .fns = ~ log_interval_inv_vec(
        x           = .,
        limit_lower = 0,
        limit_upper = 852.1999875,
        offset      = 1
    ))) %>%
    plot_modeltime_forecast(.interactive = FALSE, .line_size = 1, .conf_interval_fill = "white") +
    theme(
        panel.background = element_rect(fill = "#2D2D2D"),
        legend.key       = element_rect(fill = "#2D2D2D"),
        legend.position  = "bottom"
    ) +
    scale_color_tron() +
    scale_y_continuous(labels = scales::dollar_format()) +
    labs(
        title    = "Inverse Transformation Forecast",
        subtitle = "GOOGL Stock"
    )