goog_price = getSymbols('GOOG', env = NULL)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
head(goog_price)
## GOOG.Open GOOG.High GOOG.Low GOOG.Close GOOG.Volume GOOG.Adjusted
## 2007-01-03 232.1299 237.4400 229.6940 232.9220 15470772 232.9220
## 2007-01-04 233.6243 241.0714 233.3005 240.7277 15834329 240.7277
## 2007-01-05 240.3491 242.8398 238.1623 242.6853 13795717 242.6853
## 2007-01-08 242.9344 244.0204 240.1997 240.8871 9544441 240.8871
## 2007-01-09 241.8186 243.2134 239.7015 241.8435 10803142 241.8435
## 2007-01-10 241.3105 245.8535 240.1200 243.8161 11981743 243.8161
# Finding the min and max dates
date = index(goog_price)
min(date)
## [1] "2007-01-03"
max(date)
## [1] "2022-02-16"
# Changing column names
names(goog_price) = c('open','high','low','close','volume','adjusted')
# Filtering data through Jan 19, 2022
goog_price = data.frame(goog_price) %>%
mutate(date = date) %>%
select(adjusted,date) %>%
filter(date >= "2007-01-03",
date <= "2022-01-19")
# split data into train and test
train = goog_price %>%
filter(date < "2022-01-01")
test = goog_price %>%
filter(date >= "2022-01-01")
The ADF test is performed to check whether the process is stationary or non-stationary. The null hypothesis of the ADF test is that the process is non-stationary. Since we get a p-value > 0.05, there is insufficient evidence to reject the null hypothesis; hence, we conclude from the ADF test that the process is non-stationary.
# ADF test (since p-value > 0.05, it is non-stationary)
adf.test(train$adjusted,alternative = 'stationary')
##
## Augmented Dickey-Fuller Test
##
## data: train$adjusted
## Dickey-Fuller = 1.4876, Lag order = 15, p-value = 0.99
## alternative hypothesis: stationary
The output below is from the KPSS test, whose null hypothesis is the reverse of the ADF's: that the process is level-stationary. Since the p-value of 0.01 is below 0.05, we reject the null and again conclude that the process is non-stationary, in agreement with the ADF result. (A Phillips-Perron test can serve as a further check; a sketch follows the KPSS output.)
kpss.test(train$adjusted)
##
## KPSS Test for Level Stationarity
##
## data: train$adjusted
## KPSS Level = 28.342, Truncation lag parameter = 9, p-value = 0.01
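For completeness, the Phillips-Perron test (like the ADF, its null hypothesis is non-stationarity) offers a third check. A minimal sketch, assuming the tseries package (which also supplies adf.test and kpss.test) is loaded:
# Phillips-Perron test: null = non-stationary, so a large
# p-value would again indicate a non-stationary series
pp.test(train$adjusted, alternative = 'stationary')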
goog_log = train %>%
mutate(close_log = log1p(adjusted))
goog_log %>%
ggplot()+
geom_line(aes(date, close_log))+
theme_bw()+
ggtitle("Google over Time (Log)")+
ylab("Adjusted Closing Price (Log)")+
xlab("Date")
goog_boxcox = train %>%
mutate(close_boxcox = forecast::BoxCox(adjusted,lambda='auto'))
goog_boxcox %>%
ggplot()+
geom_line(aes(date,close_boxcox))+
theme_bw()+
ggtitle("Google over Time (Box-Cox Transformation)")+
ylab("Closing Price (Box-Cox Tranformation)")+
xlab("Date")
goog_diff = goog_log %>%
mutate(close_diff = adjusted - lag(adjusted),
close_diff2 = adjusted - lag(adjusted,n = 2),
close_log_diff = close_log - lag(close_log),
close_log_diff2 = close_log - lag(close_log,n=2)) %>%
drop_na()
goog_diff %>%
select(date,adjusted,close_diff,close_diff2,close_log_diff,
close_log_diff2) %>%
drop_na() %>%
head()
## date adjusted close_diff close_diff2 close_log_diff
## 2007-01-05 2007-01-05 242.6853 1.957657 9.763398 0.008065987
## 2007-01-08 2007-01-08 240.8871 -1.798249 0.159408 -0.007406751
## 2007-01-09 2007-01-09 241.8435 0.956406 -0.841843 0.003946139
## 2007-01-10 2007-01-10 243.8161 1.972610 2.929016 0.008090154
## 2007-01-11 2007-01-11 248.9270 5.110840 7.083450 0.020661317
## 2007-01-12 2007-01-12 251.5571 2.630142 7.740982 0.010468655
## close_log_diff2
## 2007-01-05 0.0408903228
## 2007-01-08 0.0006592354
## 2007-01-09 -0.0034606122
## 2007-01-10 0.0120362934
## 2007-01-11 0.0287514715
## 2007-01-12 0.0311299720
goog_diff %>%
ggplot()+
geom_line(aes(date,close_diff))+
theme_bw()+
ggtitle("goog (First Difference)")+
ylab("Adjusted Closing Price (Difference))")+
xlab("Date")
goog_diff %>%
ggplot()+
geom_line(aes(date,close_log_diff2))+
theme_bw()+
ggtitle("goog (First Difference)")+
ylab("Adjusted Closing Price (Difference))")+
xlab("Date")
acf(goog_diff$close_log_diff,lag.max = 10)
pacf(goog_diff$close_log_diff,lag.max = 10)
adf.test(goog_diff$close_log_diff)
## Warning in adf.test(goog_diff$close_log_diff): p-value smaller than printed p-
## value
##
## Augmented Dickey-Fuller Test
##
## data: goog_diff$close_log_diff
## Dickey-Fuller = -15.197, Lag order = 15, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(goog_diff$close_log_diff)
## Warning in kpss.test(goog_diff$close_log_diff): p-value greater than printed p-
## value
##
## KPSS Test for Level Stationarity
##
## data: goog_diff$close_log_diff
## KPSS Level = 0.18832, Truncation lag parameter = 9, p-value = 0.1
From the above ADF and KPSS tests on the differenced log series, we conclude that the process is now stationary: the ADF p-value (0.01) is below 0.05, so we reject the null of non-stationarity, while the KPSS p-value (0.1) is above 0.05, so we cannot reject the null of level stationarity.
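As a quick cross-check on the order of differencing, forecast::ndiffs() estimates how many differences a series needs to become stationary (by default via a KPSS-based decision). A minimal sketch; we would expect it to return 1 for the log series, consistent with the single difference used above:
# Estimated number of differences required for stationarity
forecast::ndiffs(goog_log$close_log)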
par(mfrow = c(1,2))
acf(goog_diff$close_log_diff,lag.max=20)
pacf(goog_diff$close_log_diff,lag.max=20)
AIC(
arima(goog_diff$close_log,order=c(0,1,1)),
arima(goog_diff$close_log,order=c(0,1,2)),
arima(goog_diff$close_log,order=c(0,1,3)),
arima(goog_diff$close_log,order=c(1,1,0)),
arima(goog_diff$close_log,order=c(1,1,1)),
arima(goog_diff$close_log,order=c(2,1,0)),
arima(goog_diff$close_log,order=c(3,1,0)),
arima(goog_diff$close_log,order=c(4,1,0)),
arima(goog_diff$close_log,order=c(4,2,0)),
arima(goog_diff$close_log,order=c(4,2,1)),
arima(goog_diff$close_log,order=c(5,1,0)),
arima(goog_diff$close_log,order=c(5,2,0))
)
## df AIC
## arima(goog_diff$close_log, order = c(0, 1, 1)) 2 -19569.36
## arima(goog_diff$close_log, order = c(0, 1, 2)) 3 -19567.37
## arima(goog_diff$close_log, order = c(0, 1, 3)) 4 -19565.68
## arima(goog_diff$close_log, order = c(1, 1, 0)) 2 -19569.37
## arima(goog_diff$close_log, order = c(1, 1, 1)) 3 -19567.37
## arima(goog_diff$close_log, order = c(2, 1, 0)) 3 -19567.37
## arima(goog_diff$close_log, order = c(3, 1, 0)) 4 -19565.60
## arima(goog_diff$close_log, order = c(4, 1, 0)) 5 -19563.94
## arima(goog_diff$close_log, order = c(4, 2, 0)) 5 -18849.26
## arima(goog_diff$close_log, order = c(4, 2, 1)) 6 -19553.06
## arima(goog_diff$close_log, order = c(5, 1, 0)) 6 -19562.05
## arima(goog_diff$close_log, order = c(5, 2, 0)) 6 -18910.72
BIC(
arima(goog_diff$close_log,order=c(0,1,1)),
arima(goog_diff$close_log,order=c(0,1,2)),
arima(goog_diff$close_log,order=c(0,1,3)),
arima(goog_diff$close_log,order=c(1,1,0)),
arima(goog_diff$close_log,order=c(1,1,1)),
arima(goog_diff$close_log,order=c(2,1,0)),
arima(goog_diff$close_log,order=c(3,1,0)),
arima(goog_diff$close_log,order=c(4,1,0)),
arima(goog_diff$close_log,order=c(4,2,0)),
arima(goog_diff$close_log,order=c(4,2,1)),
arima(goog_diff$close_log,order=c(5,1,0)),
arima(goog_diff$close_log,order=c(5,2,0))
)
## Warning in BIC.default(arima(goog_diff$close_log, order = c(0, 1, 1)),
## arima(goog_diff$close_log, : models are not all fitted to the same number of
## observations
## df BIC
## arima(goog_diff$close_log, order = c(0, 1, 1)) 2 -19556.89
## arima(goog_diff$close_log, order = c(0, 1, 2)) 3 -19548.66
## arima(goog_diff$close_log, order = c(0, 1, 3)) 4 -19540.74
## arima(goog_diff$close_log, order = c(1, 1, 0)) 2 -19556.90
## arima(goog_diff$close_log, order = c(1, 1, 1)) 3 -19548.66
## arima(goog_diff$close_log, order = c(2, 1, 0)) 3 -19548.67
## arima(goog_diff$close_log, order = c(3, 1, 0)) 4 -19540.66
## arima(goog_diff$close_log, order = c(4, 1, 0)) 5 -19532.76
## arima(goog_diff$close_log, order = c(4, 2, 0)) 5 -18818.08
## arima(goog_diff$close_log, order = c(4, 2, 1)) 6 -19515.65
## arima(goog_diff$close_log, order = c(5, 1, 0)) 6 -19524.63
## arima(goog_diff$close_log, order = c(5, 2, 0)) 6 -18873.31
Note the warning above: the d = 2 fits lose an extra observation, so their likelihoods (and hence AIC and BIC) are not strictly comparable with the d = 1 fits; the ranking is most meaningful among models sharing the same order of differencing. Both criteria favor ARIMA(1,1,0), with ARIMA(0,1,1) essentially tied.
forecast::auto.arima(goog_diff$close_log,stationary=FALSE,allowdrift=FALSE,
seasonal=FALSE,stepwise=FALSE,approximation=FALSE)
## Series: goog_diff$close_log
## ARIMA(1,1,0)
##
## Coefficients:
## ar1
## -0.0474
## s.e. 0.0163
##
## sigma^2 = 0.0003275: log likelihood = 9786.69
## AIC=-19569.37 AICc=-19569.37 BIC=-19556.9
best_mod = arima(goog_diff$close_log,order=c(1,1,0))
forecast::checkresiduals(best_mod)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0)
## Q* = 21.155, df = 9, p-value = 0.01198
##
## Model df: 1. Total lags used: 10
par(mfrow=c(1,2))
resid = best_mod$residuals
acf(resid,lag.max=20)
pacf(resid,lag.max=20)
Box.test(resid,type='Ljung-Box',lag=1)
##
## Box-Ljung test
##
## data: resid
## X-squared = 0.0084401, df = 1, p-value = 0.9268
Box.test(resid,type='Ljung-Box',lag=5)
##
## Box-Ljung test
##
## data: resid
## X-squared = 0.77871, df = 5, p-value = 0.9784
Box.test(resid,type='Ljung-Box',lag=10)
##
## Box-Ljung test
##
## data: resid
## X-squared = 21.155, df = 10, p-value = 0.02004
Box.test(resid,type='Ljung-Box',lag=15)
##
## Box-Ljung test
##
## data: resid
## X-squared = 31.984, df = 15, p-value = 0.006471
Box.test(resid,type='Ljung-Box',lag=20)
##
## Box-Ljung test
##
## data: resid
## X-squared = 37.554, df = 20, p-value = 0.01003
At short lags (1 and 5) the residuals are consistent with white noise (p > 0.05), but the Ljung-Box test rejects independence at lags 10, 15, and 20 (p < 0.05), so ARIMA(1,1,0) leaves some autocorrelation at longer lags unmodeled.
best_mod = arima(goog_diff$close_log,order=c(1,1,0))
resid = best_mod$residuals
# One-step in-sample fitted values: observed minus residuals
pred = goog_diff$close_log - resid
ggplot()+
geom_line(aes(goog_diff$date,goog_diff$close_log))+
geom_line(aes(goog_diff$date,pred),color='blue',alpha=0.4)+
theme_bw()+
xlab("Date")+
ylab("Log GooG")
RMSE = sqrt(mean((expm1(pred) - expm1(goog_diff$close_log))^2,na.rm=T))
RMSE
## [1] 15.51314
best_mod %>%
forecast::forecast(h=5) %>%
autoplot()
prediction = predict(best_mod,n.ahead=12)
pred_data = data.frame(
pred = prediction,
date = test$date
)
pred_merge = test %>%
full_join(pred_data) %>%
mutate(
pred_high = expm1(pred.pred+2*pred.se),
pred_low = expm1(pred.pred - 2*pred.se),
pred.pred = expm1(pred.pred),
error = pred.pred - adjusted
)
## Joining, by = "date"
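Before plotting, we can summarize the test-set errors stored in pred_merge, putting the ARIMA forecasts on the same footing as the Prophet metrics computed later. A minimal sketch using the columns built above:
# Test-set accuracy of the ARIMA(1,1,0) forecasts (back-transformed scale)
arima_rmse = sqrt(mean(pred_merge$error^2, na.rm = TRUE))
arima_mae = mean(abs(pred_merge$error), na.rm = TRUE)
arima_mape = mean(abs(pred_merge$error / pred_merge$adjusted), na.rm = TRUE)
print(paste("RMSE:", round(arima_rmse, 2)))
print(paste("MAE:", round(arima_mae, 2)))
print(paste("MAPE:", round(arima_mape, 4)))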
plot(forecast::forecast(best_mod,h=5),main="ARIMA(1,1,0) Multi-Step Forecast (h = 5)",ylab="Adj Price",xlab="Date")
plot(forecast::forecast(best_mod,h=180),main="ARIMA(1,1,0) Multi-Step Forecast (h = 180)",ylab="Adj Price",xlab="Date")
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
## splice
prophet_data = goog_price %>%
rename(ds = date, # Have to name our date variable "ds"
y = adjusted) # Have to name our time series "y"
# Splitting into Train and Test Data
train = prophet_data %>%
filter(ds<ymd("2022-01-01"))
test = prophet_data %>%
filter(ds>=ymd("2022-01-01"))
model = prophet(train, yearly.seasonality = TRUE, daily.seasonality = TRUE)
future = make_future_dataframe(model,periods = 365)
forecast = predict(model,future)
plot(model,forecast)+
ylab("Adjusted Closing Price of Google Stocks")+xlab("Date")+theme_bw()
dyplot.prophet(model,forecast)
prophet_plot_components(model,forecast)
Let us make a two-year forecast to check whether saturation points are needed.
future_2yr = make_future_dataframe(model,periods = 730)
pred_2yr = predict(model,future_2yr)
dyplot.prophet(model,pred_2yr)
We observe no unnatural values in the graph, so there is no need to set saturation points for this model. (Had there been, we could cap the trend with logistic growth; a sketch follows.)
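Had the long-horizon forecast drifted to implausible values, we could bound the trend by switching to logistic growth with a carrying capacity. A minimal sketch, where the cap of 4000 is an illustrative choice rather than a value derived from the data:
# Logistic growth requires a "cap" column on both the training
# data and the future dataframe
train_cap = train %>% mutate(cap = 4000)
sat_model = prophet(train_cap, growth = 'logistic')
future_cap = make_future_dataframe(sat_model, periods = 730) %>%
mutate(cap = 4000)
sat_forecast = predict(sat_model, future_cap)
dyplot.prophet(sat_model, sat_forecast)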
plot(model,forecast)+add_changepoints_to_plot(model)+theme_bw()+xlab("Date")+ylab("Adjusted Closing Price")
We can manually specify a larger number of changepoints to give the trend more flexibility. (An alternative knob, changepoint.prior.scale, is sketched after the component plot below.)
# Number of Changepoints
model = prophet(train,n.changepoints=50)
forecast = predict(model,future)
plot(model,forecast)+add_changepoints_to_plot(model)+theme_bw()+xlab("Date")+ylab("Adjusted Stock Price")
prophet_plot_components(model,forecast)
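Besides raising the number of changepoints, the trend's flexibility can be tuned with changepoint.prior.scale (default 0.05): larger values let the trend bend more freely at the risk of overfitting, while smaller values make it stiffer. A minimal sketch, where 0.5 is an illustrative value:
# A more flexible trend via a larger changepoint prior
flexible_model = prophet(train, changepoint.prior.scale = 0.5)
flexible_forecast = predict(flexible_model, future)
plot(flexible_model, flexible_forecast)+add_changepoints_to_plot(flexible_model)+theme_bw()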
forecast_plot_data = forecast %>%
as_tibble() %>%
mutate(ds = as.Date(ds)) %>%
filter(ds>=ymd("2015-01-01"))
forecast_not_scaled = ggplot()+
geom_line(aes(test$ds,test$y))+ xlab("Date") + ylab("Adjusted Closing Price from test data") +
geom_line(aes(forecast_plot_data$ds,forecast_plot_data$yhat), color='steelblue')
forecast_scaled = forecast_not_scaled +
ylim(0,2500)
forecast_scaled
additive = prophet(train)
add_fcst = predict(additive,future)
plot(additive,add_fcst)+
ylim(0,2500)
prophet_plot_components(additive,add_fcst)
multi = prophet(train,seasonality.mode = 'multiplicative')
multi_forecast = predict(multi,future)
plot(multi,multi_forecast)+ylim(0,2500)
prophet_plot_components(multi, multi_forecast)
model = prophet(train,fit=FALSE)
model = add_country_holidays(model,country_name = 'US')
model = fit.prophet(model,train)
forecast = predict(model,future)
prophet_plot_components(model,forecast)
forecast %>%
filter(
holidays!=0,
ds == ymd("2020-11-11") |
ds == ymd("2020-10-12")) %>%
select(ds,`Veterans Day`,`Columbus Day`)
## ds Veterans Day Columbus Day
## 1 2020-10-12 0.00000 5.522341
## 2 2020-11-11 -11.63453 0.000000
forecast %>%
filter(holidays != 0) %>%
dplyr::select(-ds:-additive_terms_upper, -holidays:-holidays_upper, -weekly:-yhat, -contains("upper"), -contains("lower")) %>%
mutate_all(~ if_else(. == 0, as.numeric(NA), .)) %>%
summarize_if(is.numeric, ~ max(., na.rm = T)) %>%
pivot_longer(
cols = `Christmas Day`:`Washington's Birthday`,
names_to = 'holiday',
values_to = 'effect'
) %>%
ggplot() +
geom_col(aes(effect,holiday))+
theme_bw()
forecast_metric_data = forecast %>%
as_tibble() %>%
mutate(ds = as.Date(ds)) %>%
filter(ds>=ymd("2022-01-01"))
RMSE = sqrt(mean((test$y - forecast_metric_data$yhat)^2))
MAE = mean(abs(test$y - forecast_metric_data$yhat))
MAPE = mean(abs((test$y - forecast_metric_data$yhat)/test$y))
print(paste("RMSE:",round(RMSE,2)))
## [1] "RMSE: 264.46"
print(paste("MAE:",round(MAE,2)))
## [1] "MAE: 212.24"
print(paste("MAPE:",round(MAPE,2)))
## [1] "MAPE: 0.08"
The observed out-of-sample (test-set) performance metrics are RMSE = 264.46, MAE = 212.24, and MAPE = 0.08 (about 8%).
library(tidyverse)
library(caret)
library(prophet)
df.cv <- cross_validation(model, initial = 2000, period = 60, horizon = 120, units = 'days')
DT::datatable(df.cv)
unique(df.cv$cutoff)
## [1] "2012-08-20 GMT" "2012-10-19 GMT" "2012-12-18 GMT" "2013-02-16 GMT"
## [5] "2013-04-17 GMT" "2013-06-16 GMT" "2013-08-15 GMT" "2013-10-14 GMT"
## [9] "2013-12-13 GMT" "2014-02-11 GMT" "2014-04-12 GMT" "2014-06-11 GMT"
## [13] "2014-08-10 GMT" "2014-10-09 GMT" "2014-12-08 GMT" "2015-02-06 GMT"
## [17] "2015-04-07 GMT" "2015-06-06 GMT" "2015-08-05 GMT" "2015-10-04 GMT"
## [21] "2015-12-03 GMT" "2016-02-01 GMT" "2016-04-01 GMT" "2016-05-31 GMT"
## [25] "2016-07-30 GMT" "2016-09-28 GMT" "2016-11-27 GMT" "2017-01-26 GMT"
## [29] "2017-03-27 GMT" "2017-05-26 GMT" "2017-07-25 GMT" "2017-09-23 GMT"
## [33] "2017-11-22 GMT" "2018-01-21 GMT" "2018-03-22 GMT" "2018-05-21 GMT"
## [37] "2018-07-20 GMT" "2018-09-18 GMT" "2018-11-17 GMT" "2019-01-16 GMT"
## [41] "2019-03-17 GMT" "2019-05-16 GMT" "2019-07-15 GMT" "2019-09-13 GMT"
## [45] "2019-11-12 GMT" "2020-01-11 GMT" "2020-03-11 GMT" "2020-05-10 GMT"
## [49] "2020-07-09 GMT" "2020-09-07 GMT" "2020-11-06 GMT" "2021-01-05 GMT"
## [53] "2021-03-06 GMT" "2021-05-05 GMT" "2021-07-04 GMT" "2021-09-02 GMT"
df.cv %>%
ggplot()+
geom_point(aes(ds,y)) +
geom_point(aes(ds,yhat,color = factor(cutoff)))+
theme_bw()+
xlab("Date")+
ylab("Adjusted Closing Price")+
scale_color_discrete(name = 'Cutoff')
Creating a table of performance metrics across the different forecast horizons.
DT::datatable(performance_metrics(df.cv))
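By default, performance_metrics() averages each metric over a rolling window covering 10% of the horizon; widening the window smooths the curves. A minimal sketch:
# Average each metric over 25% of the horizon instead of the default 10%
DT::datatable(performance_metrics(df.cv, rolling_window = 0.25))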
Now we can plot each metric (RMSE, MAE, MAPE, MDAPE, SMAPE) against the forecast horizon.
plot_cross_validation_metric(df.cv, metric = 'rmse')
plot_cross_validation_metric(df.cv, metric = 'mae')
plot_cross_validation_metric(df.cv, metric = 'mape')
plot_cross_validation_metric(df.cv, metric = 'mdape')
plot_cross_validation_metric(df.cv, metric = 'smape')