R Markdown

goog_price = getSymbols('GOOG', env = NULL)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
## 
## This message is shown once per session and may be disabled by setting 
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
head(goog_price)
##            GOOG.Open GOOG.High GOOG.Low GOOG.Close GOOG.Volume GOOG.Adjusted
## 2007-01-03  232.1299  237.4400 229.6940   232.9220    15470772      232.9220
## 2007-01-04  233.6243  241.0714 233.3005   240.7277    15834329      240.7277
## 2007-01-05  240.3491  242.8398 238.1623   242.6853    13795717      242.6853
## 2007-01-08  242.9344  244.0204 240.1997   240.8871     9544441      240.8871
## 2007-01-09  241.8186  243.2134 239.7015   241.8435    10803142      241.8435
## 2007-01-10  241.3105  245.8535 240.1200   243.8161    11981743      243.8161
#Finding min and max of date
date = index(goog_price)

min(date)
## [1] "2007-01-03"
max(date)
## [1] "2022-02-25"
# Changing column names
names(goog_price) = c('open','high','low','close','volume','adjusted')
# Filtering data till Jan 19,2022
goog_price = data.frame(goog_price) %>% 
  mutate(date = date) %>%
  select(adjusted,date) %>% 
  filter(date >= "2007-01-03",
         date <= "2022-01-19")
# split data into train and test

train = goog_price %>%
  filter(date < "2022-01-01")

test = goog_price %>% 
filter(date >= "2022-01-01")

Data Visualization

The ADF is performed to check if the process is stationary or non-stationary. The NULL hypothesis for the following ADF test is that “it is non-stationary”. Since we get a p-value > 0.05 there is no sufficient evidence to reject the null hypothesis.Hence , we conclude from the ADF test that the process is stationary.

# ADF test (since p-value > 0.05, it is non-stationary)
adf.test(train$adjusted,alternative = 'stationary')
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train$adjusted
## Dickey-Fuller = 1.4876, Lag order = 15, p-value = 0.99
## alternative hypothesis: stationary

The PP-test also concludes that the process is stationary since we get a p-value > 0.05 suggesting that we do not have enough evidence to reject the null hypothesis.

kpss.test(train$adjusted)
## 
##  KPSS Test for Level Stationarity
## 
## data:  train$adjusted
## KPSS Level = 28.342, Truncation lag parameter = 9, p-value = 0.01

Log transfomration

goog_log = train %>%
  mutate(close_log = log1p(adjusted))
goog_log %>%
  ggplot()+
      geom_line(aes(date, close_log))+
      theme_bw()+
      ggtitle("Google over Time (Log)")+
      ylab("Adjusted Closing Price (Log)")+
      xlab("Date")

goog_boxcox = train %>%
  mutate(close_boxcox = forecast::BoxCox(adjusted,lambda='auto'))
goog_boxcox %>%
  ggplot()+
      geom_line(aes(date,close_boxcox))+
      theme_bw()+
      ggtitle("Google over Time (Box-Cox Transformation)")+
      ylab("Closing Price (Box-Cox Tranformation)")+
      xlab("Date")

Handling Mean Stationarity

goog_diff = goog_log %>%
  mutate(close_diff = adjusted - lag(adjusted),
         close_diff2 = adjusted - lag(adjusted,n = 2),
         close_log_diff = close_log - lag(close_log),
         close_log_diff2 = close_log - lag(close_log,n=2)) %>% 
  drop_na()

goog_diff %>% 
  select(date,adjusted,close_diff,close_diff2,close_log_diff,
         close_log_diff2) %>% 
  drop_na() %>% 
  head()
##                  date adjusted close_diff close_diff2 close_log_diff
## 2007-01-05 2007-01-05 242.6853   1.957657    9.763398    0.008065987
## 2007-01-08 2007-01-08 240.8871  -1.798249    0.159408   -0.007406751
## 2007-01-09 2007-01-09 241.8435   0.956406   -0.841843    0.003946139
## 2007-01-10 2007-01-10 243.8161   1.972610    2.929016    0.008090154
## 2007-01-11 2007-01-11 248.9270   5.110840    7.083450    0.020661317
## 2007-01-12 2007-01-12 251.5571   2.630142    7.740982    0.010468655
##            close_log_diff2
## 2007-01-05    0.0408903228
## 2007-01-08    0.0006592354
## 2007-01-09   -0.0034606122
## 2007-01-10    0.0120362934
## 2007-01-11    0.0287514715
## 2007-01-12    0.0311299720
goog_diff %>%
  ggplot()+
      geom_line(aes(date,close_diff))+
      theme_bw()+
      ggtitle("goog (First Difference)")+
      ylab("Adjusted Closing Price (Difference))")+
      xlab("Date")

goog_diff %>%
  ggplot()+
      geom_line(aes(date,close_log_diff2))+
      theme_bw()+
      ggtitle("goog (First Difference)")+
      ylab("Adjusted Closing Price (Difference))")+
      xlab("Date")

acf(goog_diff$close_log_diff,lag.max = 10)

pacf(goog_diff$close_log_diff,lag.max = 10)

From the PACF plot above, we can conclude that the order of AR process is of the order 4. Hence we can represent it as AR(1,1,0)

adf.test(goog_diff$close_log_diff)
## Warning in adf.test(goog_diff$close_log_diff): p-value smaller than printed p-
## value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  goog_diff$close_log_diff
## Dickey-Fuller = -15.197, Lag order = 15, p-value = 0.01
## alternative hypothesis: stationary

Conducting ADF test again to re-verify

kpss.test(goog_diff$close_log_diff)
## Warning in kpss.test(goog_diff$close_log_diff): p-value greater than printed p-
## value
## 
##  KPSS Test for Level Stationarity
## 
## data:  goog_diff$close_log_diff
## KPSS Level = 0.18832, Truncation lag parameter = 9, p-value = 0.1

From the above ADF and KPSS tests we confirm that the process is stationary given the p-value is less than 0.05.

Fitting ARIMA Models

par(mfrow = c(1,2))
acf(goog_diff$close_log_diff,lag.max=20)
pacf(goog_diff$close_log_diff,lag.max=20)

AIC(
  arima(goog_diff$close_log,order=c(0,1,1)),
  arima(goog_diff$close_log,order=c(0,1,2)),
  arima(goog_diff$close_log,order=c(0,1,3)),
  arima(goog_diff$close_log,order=c(1,1,0)),
  arima(goog_diff$close_log,order=c(1,1,1)),
  arima(goog_diff$close_log,order=c(2,1,0)),
  arima(goog_diff$close_log,order=c(3,1,0)),
  arima(goog_diff$close_log,order=c(4,1,0)),
  arima(goog_diff$close_log,order=c(4,2,0)),
  arima(goog_diff$close_log,order=c(4,2,1)),
  arima(goog_diff$close_log,order=c(5,1,0)),
  arima(goog_diff$close_log,order=c(5,2,0))
)
##                                                df       AIC
## arima(goog_diff$close_log, order = c(0, 1, 1))  2 -19569.36
## arima(goog_diff$close_log, order = c(0, 1, 2))  3 -19567.37
## arima(goog_diff$close_log, order = c(0, 1, 3))  4 -19565.68
## arima(goog_diff$close_log, order = c(1, 1, 0))  2 -19569.37
## arima(goog_diff$close_log, order = c(1, 1, 1))  3 -19567.37
## arima(goog_diff$close_log, order = c(2, 1, 0))  3 -19567.37
## arima(goog_diff$close_log, order = c(3, 1, 0))  4 -19565.60
## arima(goog_diff$close_log, order = c(4, 1, 0))  5 -19563.94
## arima(goog_diff$close_log, order = c(4, 2, 0))  5 -18849.26
## arima(goog_diff$close_log, order = c(4, 2, 1))  6 -19553.06
## arima(goog_diff$close_log, order = c(5, 1, 0))  6 -19562.05
## arima(goog_diff$close_log, order = c(5, 2, 0))  6 -18910.72

From the above AIC values we conclude that the process with least AIC score is AR(1,1,0) with a score of -19569.37

BIC(
  arima(goog_diff$close_log,order=c(0,1,1)),
  arima(goog_diff$close_log,order=c(0,1,2)),
  arima(goog_diff$close_log,order=c(0,1,3)),
  arima(goog_diff$close_log,order=c(1,1,0)),
  arima(goog_diff$close_log,order=c(1,1,1)),
  arima(goog_diff$close_log,order=c(2,1,0)),
  arima(goog_diff$close_log,order=c(3,1,0)),
  arima(goog_diff$close_log,order=c(4,1,0)),
  arima(goog_diff$close_log,order=c(4,2,0)),
  arima(goog_diff$close_log,order=c(4,2,1)),
  arima(goog_diff$close_log,order=c(5,1,0)),
  arima(goog_diff$close_log,order=c(5,2,0))
)
## Warning in BIC.default(arima(goog_diff$close_log, order = c(0, 1, 1)),
## arima(goog_diff$close_log, : models are not all fitted to the same number of
## observations
##                                                df       BIC
## arima(goog_diff$close_log, order = c(0, 1, 1))  2 -19556.89
## arima(goog_diff$close_log, order = c(0, 1, 2))  3 -19548.66
## arima(goog_diff$close_log, order = c(0, 1, 3))  4 -19540.74
## arima(goog_diff$close_log, order = c(1, 1, 0))  2 -19556.90
## arima(goog_diff$close_log, order = c(1, 1, 1))  3 -19548.66
## arima(goog_diff$close_log, order = c(2, 1, 0))  3 -19548.67
## arima(goog_diff$close_log, order = c(3, 1, 0))  4 -19540.66
## arima(goog_diff$close_log, order = c(4, 1, 0))  5 -19532.76
## arima(goog_diff$close_log, order = c(4, 2, 0))  5 -18818.08
## arima(goog_diff$close_log, order = c(4, 2, 1))  6 -19515.65
## arima(goog_diff$close_log, order = c(5, 1, 0))  6 -19524.63
## arima(goog_diff$close_log, order = c(5, 2, 0))  6 -18873.31

From the above BIC values we conclude that the process with least AIC score is AR(1,1,0) with a score of -19556.90

Auto Arima

forecast::auto.arima(goog_diff$close_log,stationary=FALSE,allowdrift=FALSE,
seasonal=FALSE,stepwise=FALSE,approximation=FALSE)
## Series: goog_diff$close_log 
## ARIMA(1,1,0) 
## 
## Coefficients:
##           ar1
##       -0.0474
## s.e.   0.0163
## 
## sigma^2 = 0.0003275:  log likelihood = 9786.69
## AIC=-19569.37   AICc=-19569.37   BIC=-19556.9

Ljung-BOX Test

best_mod = arima(goog_diff$close_log,order=c(1,1,0))
forecast::checkresiduals(best_mod)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 21.155, df = 9, p-value = 0.01198
## 
## Model df: 1.   Total lags used: 10

From the ACF plots of residuals, we can infer there is no particular pattern.

par(mfrow=c(1,2))
resid = best_mod$resid
acf(resid,lag.max=20)
pacf(resid,lag.max=20)

Box.test(resid,type='Ljung-Box',lag=1)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 0.0084401, df = 1, p-value = 0.9268
Box.test(resid,type='Ljung-Box',lag=5)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 0.77871, df = 5, p-value = 0.9784
Box.test(resid,type='Ljung-Box',lag=10)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 21.155, df = 10, p-value = 0.02004
Box.test(resid,type='Ljung-Box',lag=15)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 31.984, df = 15, p-value = 0.006471
Box.test(resid,type='Ljung-Box',lag=20)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 37.554, df = 20, p-value = 0.01003
best_mod = arima(goog_diff$close_log,order=c(1,1,0))
resid = best_mod$residuals
pred = resid+goog_diff$close_log
ggplot()+
  geom_line(aes(goog_diff$date,goog_diff$close_log))+
  geom_line(aes(goog_diff$date,pred),color='blue',alpha=0.4)+
  theme_bw()+
  xlab("Date")+
  ylab("Log GooG")

RMSE = sqrt(mean((expm1(pred) - expm1(goog_diff$close_log))^2,na.rm=T))
RMSE
## [1] 15.51314
best_mod %>%
  forecast::forecast(h=5) %>% 
  autoplot()

prediction = predict(best_mod,n.ahead=12)

pred_data = data.frame(
  pred = prediction,
  date = test$date
)
pred_data = data.frame(pred = prediction,date = test$date)

pred_merge = test %>% 
  full_join(pred_data) %>%
  mutate(
    pred_high = expm1(pred.pred+2*pred.se),
    pred_low = expm1(pred.pred - 2*pred.se),
    pred.pred = expm1(pred.pred),
    error = pred.pred - adjusted
    )
## Joining, by = "date"

Forecast plot for the nextr 5 time periods

plot(forecast::forecast(best_mod,h=5),main="ARIMA(1,1,0) using Multi-Steps Forecast",ylab="Adj Price",xlab="Date")

Forecast plot for the next 6 months

plot(forecast::forecast(best_mod,h=180),main="ARIMA(1,1,0) using Multi-Steps Forecast",ylab="Adj Price",xlab="Date")

Fitting a Prophet Model

library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice
prophet_data = goog_price %>% 
    rename(ds = date, # Have to name our date variable "ds"
    y = adjusted)  # Have to name our time series "y"

# Splitting into Train and Test Data
train = prophet_data %>% 
  filter(ds<ymd("2022-01-01"))
test = prophet_data %>%
  filter(ds>=ymd("2022-01-01"))


model = prophet(train, yearly.seasonality = TRUE, daily.seasonality = TRUE)
future = make_future_dataframe(model,periods = 365)
forecast = predict(model,future)

Plotting Forecast

plot(model,forecast)+
ylab("Adjusted Closing Price of Google Stocks")+xlab("Date")+theme_bw()

Interactive Charts

dyplot.prophet(model,forecast)

Time Series Decomposition

Visualization of Time Series components

prophet_plot_components(model,forecast)

Identifying Saturation Points

Let us make a two-year forecast to identify the need for saturation points.

future_2yr = make_future_dataframe(model,periods = 730)
pred_2yr = predict(model,future_2yr)

dyplot.prophet(model,pred_2yr)

We observe that there are no unnatural values in the graph. So, there is no need to set Saturation Points for this model.

Estimating the trend and changepoints

Plotting Changepoints

  • The algorithm examines 25 equally-spaced potential change points by default as potential candidates for a change in the trend.
  • Examines the actual rate of change at each potential change point, and removes those with low rates of change.
plot(model,forecast)+add_changepoints_to_plot(model)+theme_bw()+xlab("Date")+ylab("Adjusted Closing Price")

Changepoint Hyperparameters

We can manually specify more number of change points to improve performance.

# Number of Changepoints
model = prophet(train,n.changepoints=50)
forecast = predict(model,future)
plot(model,forecast)+add_changepoints_to_plot(model)+theme_bw()+xlab("Date")+ylab("Adjusted Stock Price")

prophet_plot_components(model,forecast)

forecast_plot_data = forecast %>% 
  as_tibble() %>% 
  mutate(ds = as.Date(ds)) %>% 
  filter(ds>=ymd("2015-01-01"))

forecast_not_scaled = ggplot()+
geom_line(aes(test$ds,test$y))+ xlab("Date") + ylab("Adjusted Closing Price from test data") +
geom_line(aes(forecast_plot_data$ds,forecast_plot_data$yhat), color='steelblue')
forecast_scaled = forecast_not_scaled + 
ylim(0,2500) 
forecast_scaled

Assessment of Seasonality

prophet_plot_components(model,forecast)

Additive Seasonality

additive = prophet(train)
add_fcst = predict(additive,future)
plot(additive,add_fcst)+
ylim(0,2500)

prophet_plot_components(additive,add_fcst)

Multiplicative Seasonality

multi = prophet(train,seasonality.mode = 'multiplicative')
multi_forecast = predict(multi,future)
plot(multi,multi_forecast)+ylim(0,2500)

prophet_plot_components(multi, multi_forecast)

Assessment of holidays

model = prophet(train,fit=FALSE)
model = add_country_holidays(model,country_name = 'US')
model = fit.prophet(model,train)
forecast = predict(model,future)
prophet_plot_components(model,forecast)

Examining Impact of Holidays

forecast %>% 
  filter(
    holidays!=0,
    ds == ymd("2020-11-11") | 
    ds == ymd("2020-10-12")) %>%
    select(ds,`Veterans Day`,`Columbus Day`)
##           ds Veterans Day Columbus Day
## 1 2020-10-12      0.00000     5.522341
## 2 2020-11-11    -11.63453     0.000000
forecast %>%
  filter(holidays != 0) %>%
  dplyr::select(-ds:-additive_terms_upper, -holidays:-holidays_upper, -weekly:-yhat, -contains("upper"), -contains("lower")) %>%
  mutate_all(~ if_else(. == 0, as.numeric(NA), .)) %>%
  summarize_if(is.numeric, ~ max(., na.rm = T)) %>%
  pivot_longer(
    cols = `Christmas Day`:`Washington's Birthday`,
    names_to = 'holiday', 
    values_to = 'effect'
  ) %>%
ggplot() +
  geom_col(aes(effect,holiday))+
  theme_bw()

  • From the above plot, New Year’s Day seem to be the most important holidays affecting stock prices

Prophet Model Assessment

forecast_metric_data = forecast %>% 
  as_tibble() %>% 
  mutate(ds = as.Date(ds)) %>% 
  filter(ds>=ymd("2022-01-01"))
RMSE = sqrt(mean((test$y - forecast_metric_data$yhat)^2))
MAE = mean(abs(test$y - forecast_metric_data$yhat))
MAPE = mean(abs((test$y - forecast_metric_data$yhat)/test$y))
print(paste("RMSE:",round(RMSE,2)))
## [1] "RMSE: 264.46"
print(paste("MAE:",round(MAE,2)))
## [1] "MAE: 212.24"
print(paste("MAPE:",round(MAPE,2)))
## [1] "MAPE: 0.08"

The observed values of in-sample performance metrics are RMSE = 264.46, MAE = 212.64, MAPE = 0.08.

Rolling Window Cross-Validation with Prophet Model

library(tidyverse)
library(caret)
library(prophet)
df.cv <- cross_validation(model, initial = 2000, period = 60, horizon = 120, units = 'days')
DT::datatable(df.cv)
unique(df.cv$cutoff)
##  [1] "2012-08-20 GMT" "2012-10-19 GMT" "2012-12-18 GMT" "2013-02-16 GMT"
##  [5] "2013-04-17 GMT" "2013-06-16 GMT" "2013-08-15 GMT" "2013-10-14 GMT"
##  [9] "2013-12-13 GMT" "2014-02-11 GMT" "2014-04-12 GMT" "2014-06-11 GMT"
## [13] "2014-08-10 GMT" "2014-10-09 GMT" "2014-12-08 GMT" "2015-02-06 GMT"
## [17] "2015-04-07 GMT" "2015-06-06 GMT" "2015-08-05 GMT" "2015-10-04 GMT"
## [21] "2015-12-03 GMT" "2016-02-01 GMT" "2016-04-01 GMT" "2016-05-31 GMT"
## [25] "2016-07-30 GMT" "2016-09-28 GMT" "2016-11-27 GMT" "2017-01-26 GMT"
## [29] "2017-03-27 GMT" "2017-05-26 GMT" "2017-07-25 GMT" "2017-09-23 GMT"
## [33] "2017-11-22 GMT" "2018-01-21 GMT" "2018-03-22 GMT" "2018-05-21 GMT"
## [37] "2018-07-20 GMT" "2018-09-18 GMT" "2018-11-17 GMT" "2019-01-16 GMT"
## [41] "2019-03-17 GMT" "2019-05-16 GMT" "2019-07-15 GMT" "2019-09-13 GMT"
## [45] "2019-11-12 GMT" "2020-01-11 GMT" "2020-03-11 GMT" "2020-05-10 GMT"
## [49] "2020-07-09 GMT" "2020-09-07 GMT" "2020-11-06 GMT" "2021-01-05 GMT"
## [53] "2021-03-06 GMT" "2021-05-05 GMT" "2021-07-04 GMT" "2021-09-02 GMT"

Cross-Validation Actual vs Predicted

df.cv %>% 
  ggplot()+
  geom_point(aes(ds,y)) +
  geom_point(aes(ds,yhat,color = factor(cutoff)))+
  theme_bw()+
  xlab("Date")+
  ylab("Adjusted Closing Price")+
  scale_color_discrete(name = 'Cutoff')

Table for Performance metrics

Creating a table for various performance metrics for different time horizons.

DT::datatable(performance_metrics(df.cv))

Plots for various metrics

Now, we can plot visualizations for metrics like RMSE, MAE, MAPE, MDAPE, SMAPE.

plot_cross_validation_metric(df.cv, metric = 'rmse')

plot_cross_validation_metric(df.cv, metric = 'mae')

plot_cross_validation_metric(df.cv, metric = 'mape')

plot_cross_validation_metric(df.cv, metric = 'mdape')

plot_cross_validation_metric(df.cv, metric = 'smape')

From the above obtained RMSE, we conclude that ARIMA model ie AR (1,1,0) is the best model with an RMSE of 15.513 Hence, we use Arima model to forecast the selected timeseries data.