R Markdown

goog_price = getSymbols('GOOG', env = NULL)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
## 
## This message is shown once per session and may be disabled by setting 
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
head(goog_price)
##            GOOG.Open GOOG.High GOOG.Low GOOG.Close GOOG.Volume GOOG.Adjusted
## 2007-01-03  232.1299  237.4400 229.6940   232.9220    15470772      232.9220
## 2007-01-04  233.6243  241.0714 233.3005   240.7277    15834329      240.7277
## 2007-01-05  240.3491  242.8398 238.1623   242.6853    13795717      242.6853
## 2007-01-08  242.9344  244.0204 240.1997   240.8871     9544441      240.8871
## 2007-01-09  241.8186  243.2134 239.7015   241.8435    10803142      241.8435
## 2007-01-10  241.3105  245.8535 240.1200   243.8161    11981743      243.8161
#Finding min and max of date
date = index(goog_price)

min(date)
## [1] "2007-01-03"
max(date)
## [1] "2022-02-16"
# Changing column names
names(goog_price) = c('open','high','low','close','volume','adjusted')
# Filtering data till Jan 19,2022
goog_price = data.frame(goog_price) %>% 
  mutate(date = date) %>%
  select(adjusted,date) %>% 
  filter(date >= "2007-01-03",
         date <= "2022-01-19")
# split data into train and test

train = goog_price %>%
  filter(date < "2022-01-01")

test = goog_price %>% 
filter(date >= "2022-01-01")

Data Visualization

The ADF is performed to check if the process is stationary or non-stationary. The NULL hypothesis for the following ADF test is that “it is non-stationary”. Since we get a p-value > 0.05 there is no sufficient evidence to reject the null hypothesis.Hence , we conclude from the ADF test that the process is stationary.

# ADF test (since p-value > 0.05, it is non-stationary)
adf.test(train$adjusted,alternative = 'stationary')
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train$adjusted
## Dickey-Fuller = 1.4876, Lag order = 15, p-value = 0.99
## alternative hypothesis: stationary

The PP-test also concludes that the process is stationary since we get a p-value > 0.05 suggesting that we do not have enough evidence to reject the null hypothesis.

kpss.test(train$adjusted)
## 
##  KPSS Test for Level Stationarity
## 
## data:  train$adjusted
## KPSS Level = 28.342, Truncation lag parameter = 9, p-value = 0.01

Log transfomration

goog_log = train %>%
  mutate(close_log = log1p(adjusted))
goog_log %>%
  ggplot()+
      geom_line(aes(date, close_log))+
      theme_bw()+
      ggtitle("Google over Time (Log)")+
      ylab("Adjusted Closing Price (Log)")+
      xlab("Date")

goog_boxcox = train %>%
  mutate(close_boxcox = forecast::BoxCox(adjusted,lambda='auto'))
goog_boxcox %>%
  ggplot()+
      geom_line(aes(date,close_boxcox))+
      theme_bw()+
      ggtitle("Google over Time (Box-Cox Transformation)")+
      ylab("Closing Price (Box-Cox Tranformation)")+
      xlab("Date")

Handling Mean Stationarity

goog_diff = goog_log %>%
  mutate(close_diff = adjusted - lag(adjusted),
         close_diff2 = adjusted - lag(adjusted,n = 2),
         close_log_diff = close_log - lag(close_log),
         close_log_diff2 = close_log - lag(close_log,n=2)) %>% 
  drop_na()

goog_diff %>% 
  select(date,adjusted,close_diff,close_diff2,close_log_diff,
         close_log_diff2) %>% 
  drop_na() %>% 
  head()
##                  date adjusted close_diff close_diff2 close_log_diff
## 2007-01-05 2007-01-05 242.6853   1.957657    9.763398    0.008065987
## 2007-01-08 2007-01-08 240.8871  -1.798249    0.159408   -0.007406751
## 2007-01-09 2007-01-09 241.8435   0.956406   -0.841843    0.003946139
## 2007-01-10 2007-01-10 243.8161   1.972610    2.929016    0.008090154
## 2007-01-11 2007-01-11 248.9270   5.110840    7.083450    0.020661317
## 2007-01-12 2007-01-12 251.5571   2.630142    7.740982    0.010468655
##            close_log_diff2
## 2007-01-05    0.0408903228
## 2007-01-08    0.0006592354
## 2007-01-09   -0.0034606122
## 2007-01-10    0.0120362934
## 2007-01-11    0.0287514715
## 2007-01-12    0.0311299720
goog_diff %>%
  ggplot()+
      geom_line(aes(date,close_diff))+
      theme_bw()+
      ggtitle("goog (First Difference)")+
      ylab("Adjusted Closing Price (Difference))")+
      xlab("Date")

goog_diff %>%
  ggplot()+
      geom_line(aes(date,close_log_diff2))+
      theme_bw()+
      ggtitle("goog (First Difference)")+
      ylab("Adjusted Closing Price (Difference))")+
      xlab("Date")

acf(goog_diff$close_log_diff,lag.max = 10)

pacf(goog_diff$close_log_diff,lag.max = 10)

From the PACF plot above, we can conclude that the order of AR process is of the order 4. Hence we can represent it as AR(1,1,0)

adf.test(goog_diff$close_log_diff)
## Warning in adf.test(goog_diff$close_log_diff): p-value smaller than printed p-
## value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  goog_diff$close_log_diff
## Dickey-Fuller = -15.197, Lag order = 15, p-value = 0.01
## alternative hypothesis: stationary

Conducting ADF test again to re-verify

kpss.test(goog_diff$close_log_diff)
## Warning in kpss.test(goog_diff$close_log_diff): p-value greater than printed p-
## value
## 
##  KPSS Test for Level Stationarity
## 
## data:  goog_diff$close_log_diff
## KPSS Level = 0.18832, Truncation lag parameter = 9, p-value = 0.1

From the above ADF and KPSS tests we confirm that the process is stationary given the p-value is less than 0.05.

Fitting ARIMA Models

par(mfrow = c(1,2))
acf(goog_diff$close_log_diff,lag.max=20)
pacf(goog_diff$close_log_diff,lag.max=20)

AIC(
  arima(goog_diff$close_log,order=c(0,1,1)),
  arima(goog_diff$close_log,order=c(0,1,2)),
  arima(goog_diff$close_log,order=c(0,1,3)),
  arima(goog_diff$close_log,order=c(1,1,0)),
  arima(goog_diff$close_log,order=c(1,1,1)),
  arima(goog_diff$close_log,order=c(2,1,0)),
  arima(goog_diff$close_log,order=c(3,1,0)),
  arima(goog_diff$close_log,order=c(4,1,0)),
  arima(goog_diff$close_log,order=c(4,2,0)),
  arima(goog_diff$close_log,order=c(4,2,1)),
  arima(goog_diff$close_log,order=c(5,1,0)),
  arima(goog_diff$close_log,order=c(5,2,0))
)
##                                                df       AIC
## arima(goog_diff$close_log, order = c(0, 1, 1))  2 -19569.36
## arima(goog_diff$close_log, order = c(0, 1, 2))  3 -19567.37
## arima(goog_diff$close_log, order = c(0, 1, 3))  4 -19565.68
## arima(goog_diff$close_log, order = c(1, 1, 0))  2 -19569.37
## arima(goog_diff$close_log, order = c(1, 1, 1))  3 -19567.37
## arima(goog_diff$close_log, order = c(2, 1, 0))  3 -19567.37
## arima(goog_diff$close_log, order = c(3, 1, 0))  4 -19565.60
## arima(goog_diff$close_log, order = c(4, 1, 0))  5 -19563.94
## arima(goog_diff$close_log, order = c(4, 2, 0))  5 -18849.26
## arima(goog_diff$close_log, order = c(4, 2, 1))  6 -19553.06
## arima(goog_diff$close_log, order = c(5, 1, 0))  6 -19562.05
## arima(goog_diff$close_log, order = c(5, 2, 0))  6 -18910.72

From the above AIC values we conclude that the process with least AIC score is AR(1,1,0) with a score of -19569.37

BIC(
  arima(goog_diff$close_log,order=c(0,1,1)),
  arima(goog_diff$close_log,order=c(0,1,2)),
  arima(goog_diff$close_log,order=c(0,1,3)),
  arima(goog_diff$close_log,order=c(1,1,0)),
  arima(goog_diff$close_log,order=c(1,1,1)),
  arima(goog_diff$close_log,order=c(2,1,0)),
  arima(goog_diff$close_log,order=c(3,1,0)),
  arima(goog_diff$close_log,order=c(4,1,0)),
  arima(goog_diff$close_log,order=c(4,2,0)),
  arima(goog_diff$close_log,order=c(4,2,1)),
  arima(goog_diff$close_log,order=c(5,1,0)),
  arima(goog_diff$close_log,order=c(5,2,0))
)
## Warning in BIC.default(arima(goog_diff$close_log, order = c(0, 1, 1)),
## arima(goog_diff$close_log, : models are not all fitted to the same number of
## observations
##                                                df       BIC
## arima(goog_diff$close_log, order = c(0, 1, 1))  2 -19556.89
## arima(goog_diff$close_log, order = c(0, 1, 2))  3 -19548.66
## arima(goog_diff$close_log, order = c(0, 1, 3))  4 -19540.74
## arima(goog_diff$close_log, order = c(1, 1, 0))  2 -19556.90
## arima(goog_diff$close_log, order = c(1, 1, 1))  3 -19548.66
## arima(goog_diff$close_log, order = c(2, 1, 0))  3 -19548.67
## arima(goog_diff$close_log, order = c(3, 1, 0))  4 -19540.66
## arima(goog_diff$close_log, order = c(4, 1, 0))  5 -19532.76
## arima(goog_diff$close_log, order = c(4, 2, 0))  5 -18818.08
## arima(goog_diff$close_log, order = c(4, 2, 1))  6 -19515.65
## arima(goog_diff$close_log, order = c(5, 1, 0))  6 -19524.63
## arima(goog_diff$close_log, order = c(5, 2, 0))  6 -18873.31

From the above BIC values we conclude that the process with least AIC score is AR(1,1,0) with a score of -19556.90

Auto Arima

forecast::auto.arima(goog_diff$close_log,stationary=FALSE,allowdrift=FALSE,
seasonal=FALSE,stepwise=FALSE,approximation=FALSE)
## Series: goog_diff$close_log 
## ARIMA(1,1,0) 
## 
## Coefficients:
##           ar1
##       -0.0474
## s.e.   0.0163
## 
## sigma^2 = 0.0003275:  log likelihood = 9786.69
## AIC=-19569.37   AICc=-19569.37   BIC=-19556.9

Ljung-BOX Test

best_mod = arima(goog_diff$close_log,order=c(1,1,0))
forecast::checkresiduals(best_mod)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 21.155, df = 9, p-value = 0.01198
## 
## Model df: 1.   Total lags used: 10

From the ACF plots of residuals, we can infer there is no particular pattern.

par(mfrow=c(1,2))
resid = best_mod$resid
acf(resid,lag.max=20)
pacf(resid,lag.max=20)

Box.test(resid,type='Ljung-Box',lag=1)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 0.0084401, df = 1, p-value = 0.9268
Box.test(resid,type='Ljung-Box',lag=5)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 0.77871, df = 5, p-value = 0.9784
Box.test(resid,type='Ljung-Box',lag=10)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 21.155, df = 10, p-value = 0.02004
Box.test(resid,type='Ljung-Box',lag=15)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 31.984, df = 15, p-value = 0.006471
Box.test(resid,type='Ljung-Box',lag=20)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 37.554, df = 20, p-value = 0.01003
best_mod = arima(goog_diff$close_log,order=c(1,1,0))
resid = best_mod$residuals
pred = resid+goog_diff$close_log
ggplot()+
  geom_line(aes(goog_diff$date,goog_diff$close_log))+
  geom_line(aes(goog_diff$date,pred),color='blue',alpha=0.4)+
  theme_bw()+
  xlab("Date")+
  ylab("Log GooG")

RMSE = sqrt(mean((expm1(pred) - expm1(goog_diff$close_log))^2,na.rm=T))
RMSE
## [1] 15.51314
best_mod %>%
  forecast::forecast(h=5) %>% 
  autoplot()

prediction = predict(best_mod,n.ahead=12)

pred_data = data.frame(
  pred = prediction,
  date = test$date
)
pred_data = data.frame(pred = prediction,date = test$date)

pred_merge = test %>% 
  full_join(pred_data) %>%
  mutate(
    pred_high = expm1(pred.pred+2*pred.se),
    pred_low = expm1(pred.pred - 2*pred.se),
    pred.pred = expm1(pred.pred),
    error = pred.pred - adjusted
    )
## Joining, by = "date"

Forecast plot for the nextr 5 time periods

plot(forecast::forecast(best_mod,h=5),main="ARIMA(1,1,0) using Multi-Steps Forecast",ylab="Adj Price",xlab="Date")

Forecast plot for the next 6 months

plot(forecast::forecast(best_mod,h=180),main="ARIMA(1,1,0) using Multi-Steps Forecast",ylab="Adj Price",xlab="Date")

Fitting a Prophet Model

library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice
prophet_data = goog_price %>% 
    rename(ds = date, # Have to name our date variable "ds"
    y = adjusted)  # Have to name our time series "y"

# Splitting into Train and Test Data
train = prophet_data %>% 
  filter(ds<ymd("2022-01-01"))
test = prophet_data %>%
  filter(ds>=ymd("2022-01-01"))


model = prophet(train, yearly.seasonality = TRUE, daily.seasonality = TRUE)
future = make_future_dataframe(model,periods = 365)
forecast = predict(model,future)

Plotting Forecast

plot(model,forecast)+
ylab("Adjusted Closing Price of Google Stocks")+xlab("Date")+theme_bw()

Interactive Charts

dyplot.prophet(model,forecast)

Time Series Decomposition

Visualization of Time Series components

prophet_plot_components(model,forecast)

Identifying Saturation Points

Let us make a two-year forecast to identify the need for saturation points.

future_2yr = make_future_dataframe(model,periods = 730)
pred_2yr = predict(model,future_2yr)

dyplot.prophet(model,pred_2yr)

We observe that there are no unnatural values in the graph. So, there is no need to set Saturation Points for this model.

Estimating the trend and changepoints

Plotting Changepoints

  • The algorithm examines 25 equally-spaced potential change points by default as potential candidates for a change in the trend.
  • Examines the actual rate of change at each potential change point, and removes those with low rates of change.
plot(model,forecast)+add_changepoints_to_plot(model)+theme_bw()+xlab("Date")+ylab("Adjusted Closing Price")

Changepoint Hyperparameters

We can manually specify more number of change points to improve performance.

# Number of Changepoints
model = prophet(train,n.changepoints=50)
forecast = predict(model,future)
plot(model,forecast)+add_changepoints_to_plot(model)+theme_bw()+xlab("Date")+ylab("Adjusted Stock Price")

prophet_plot_components(model,forecast)

forecast_plot_data = forecast %>% 
  as_tibble() %>% 
  mutate(ds = as.Date(ds)) %>% 
  filter(ds>=ymd("2015-01-01"))

forecast_not_scaled = ggplot()+
geom_line(aes(test$ds,test$y))+ xlab("Date") + ylab("Adjusted Closing Price from test data") +
geom_line(aes(forecast_plot_data$ds,forecast_plot_data$yhat), color='steelblue')
forecast_scaled = forecast_not_scaled + 
ylim(0,2500) 
forecast_scaled

Assessment of Seasonality

prophet_plot_components(model,forecast)

Additive Seasonality

additive = prophet(train)
add_fcst = predict(additive,future)
plot(additive,add_fcst)+
ylim(0,2500)

prophet_plot_components(additive,add_fcst)

Multiplicative Seasonality

multi = prophet(train,seasonality.mode = 'multiplicative')
multi_forecast = predict(multi,future)
plot(multi,multi_forecast)+ylim(0,2500)

prophet_plot_components(multi, multi_forecast)

Assessment of holidays

model = prophet(train,fit=FALSE)
model = add_country_holidays(model,country_name = 'US')
model = fit.prophet(model,train)
forecast = predict(model,future)
prophet_plot_components(model,forecast)

Examining Impact of Holidays

forecast %>% 
  filter(
    holidays!=0,
    ds == ymd("2020-11-11") | 
    ds == ymd("2020-10-12")) %>%
    select(ds,`Veterans Day`,`Columbus Day`)
##           ds Veterans Day Columbus Day
## 1 2020-10-12      0.00000     5.522341
## 2 2020-11-11    -11.63453     0.000000
forecast %>%
  filter(holidays != 0) %>%
  dplyr::select(-ds:-additive_terms_upper, -holidays:-holidays_upper, -weekly:-yhat, -contains("upper"), -contains("lower")) %>%
  mutate_all(~ if_else(. == 0, as.numeric(NA), .)) %>%
  summarize_if(is.numeric, ~ max(., na.rm = T)) %>%
  pivot_longer(
    cols = `Christmas Day`:`Washington's Birthday`,
    names_to = 'holiday', 
    values_to = 'effect'
  ) %>%
ggplot() +
  geom_col(aes(effect,holiday))+
  theme_bw()

  • From the above plot, New Year’s Day seem to be the most important holidays affecting stock prices

Prophet Model Assessment

forecast_metric_data = forecast %>% 
  as_tibble() %>% 
  mutate(ds = as.Date(ds)) %>% 
  filter(ds>=ymd("2022-01-01"))
RMSE = sqrt(mean((test$y - forecast_metric_data$yhat)^2))
MAE = mean(abs(test$y - forecast_metric_data$yhat))
MAPE = mean(abs((test$y - forecast_metric_data$yhat)/test$y))
print(paste("RMSE:",round(RMSE,2)))
## [1] "RMSE: 264.46"
print(paste("MAE:",round(MAE,2)))
## [1] "MAE: 212.24"
print(paste("MAPE:",round(MAPE,2)))
## [1] "MAPE: 0.08"

The observed values of in-sample performance metrics are RMSE = 264.46, MAE = 212.64, MAPE = 0.08.

Rolling Window Cross-Validation with Prophet Model

library(tidyverse)
library(caret)
library(prophet)
df.cv <- cross_validation(model, initial = 2000, period = 60, horizon = 120, units = 'days')
DT::datatable(df.cv)
unique(df.cv$cutoff)
##  [1] "2012-08-20 GMT" "2012-10-19 GMT" "2012-12-18 GMT" "2013-02-16 GMT"
##  [5] "2013-04-17 GMT" "2013-06-16 GMT" "2013-08-15 GMT" "2013-10-14 GMT"
##  [9] "2013-12-13 GMT" "2014-02-11 GMT" "2014-04-12 GMT" "2014-06-11 GMT"
## [13] "2014-08-10 GMT" "2014-10-09 GMT" "2014-12-08 GMT" "2015-02-06 GMT"
## [17] "2015-04-07 GMT" "2015-06-06 GMT" "2015-08-05 GMT" "2015-10-04 GMT"
## [21] "2015-12-03 GMT" "2016-02-01 GMT" "2016-04-01 GMT" "2016-05-31 GMT"
## [25] "2016-07-30 GMT" "2016-09-28 GMT" "2016-11-27 GMT" "2017-01-26 GMT"
## [29] "2017-03-27 GMT" "2017-05-26 GMT" "2017-07-25 GMT" "2017-09-23 GMT"
## [33] "2017-11-22 GMT" "2018-01-21 GMT" "2018-03-22 GMT" "2018-05-21 GMT"
## [37] "2018-07-20 GMT" "2018-09-18 GMT" "2018-11-17 GMT" "2019-01-16 GMT"
## [41] "2019-03-17 GMT" "2019-05-16 GMT" "2019-07-15 GMT" "2019-09-13 GMT"
## [45] "2019-11-12 GMT" "2020-01-11 GMT" "2020-03-11 GMT" "2020-05-10 GMT"
## [49] "2020-07-09 GMT" "2020-09-07 GMT" "2020-11-06 GMT" "2021-01-05 GMT"
## [53] "2021-03-06 GMT" "2021-05-05 GMT" "2021-07-04 GMT" "2021-09-02 GMT"

Cross-Validation Actual vs Predicted

df.cv %>% 
  ggplot()+
  geom_point(aes(ds,y)) +
  geom_point(aes(ds,yhat,color = factor(cutoff)))+
  theme_bw()+
  xlab("Date")+
  ylab("Adjusted Closing Price")+
  scale_color_discrete(name = 'Cutoff')

Table for Performance metrics

Creating a table for various performance metrics for different time horizons.

DT::datatable(performance_metrics(df.cv))

Plots for various metrics

Now, we can plot visualizations for metrics like RMSE, MAE, MAPE, MDAPE, SMAPE.

plot_cross_validation_metric(df.cv, metric = 'rmse')

plot_cross_validation_metric(df.cv, metric = 'mae')

plot_cross_validation_metric(df.cv, metric = 'mape')

plot_cross_validation_metric(df.cv, metric = 'mdape')

plot_cross_validation_metric(df.cv, metric = 'smape')