library(quantmod); library(tidyverse); library(tseries); library(lubridate)  # packages used throughout
goog_price = getSymbols('GOOG', env = NULL)
head(goog_price)
## GOOG.Open GOOG.High GOOG.Low GOOG.Close GOOG.Volume GOOG.Adjusted
## 2007-01-03 232.1299 237.4400 229.6940 232.9220 15470772 232.9220
## 2007-01-04 233.6243 241.0714 233.3005 240.7277 15834329 240.7277
## 2007-01-05 240.3491 242.8398 238.1623 242.6853 13795717 242.6853
## 2007-01-08 242.9344 244.0204 240.1997 240.8871 9544441 240.8871
## 2007-01-09 241.8186 243.2134 239.7015 241.8435 10803142 241.8435
## 2007-01-10 241.3105 245.8535 240.1200 243.8161 11981743 243.8161
# Finding the min and max of the date index
date = index(goog_price)
min(date)
## [1] "2007-01-03"
max(date)
## [1] "2022-02-25"
# Changing column names
names(goog_price) = c('open','high','low','close','volume','adjusted')
# Filtering data through Jan 19, 2022
goog_price = data.frame(goog_price) %>%
mutate(date = date) %>%
select(adjusted,date) %>%
filter(date >= "2007-01-03",
date <= "2022-01-19")
# split data into train and test
train = goog_price %>%
filter(date < "2022-01-01")
test = goog_price %>%
filter(date >= "2022-01-01")
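As a quick sanity check on the split (a small sketch; the exact row counts depend on the trading days present in the data), we can confirm the sizes and date ranges of the two sets:
# Sanity check: sizes and date ranges of the train/test split
nrow(train); nrow(test)
range(train$date); range(test$date)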
The ADF test is performed to check whether the process is stationary or non-stationary. The null hypothesis of the ADF test is that the process is non-stationary. Since the p-value (0.99) is greater than 0.05, there is not sufficient evidence to reject the null hypothesis; hence, the ADF test indicates that the process is non-stationary.
# ADF test (since p-value > 0.05, it is non-stationary)
adf.test(train$adjusted,alternative = 'stationary')
##
## Augmented Dickey-Fuller Test
##
## data: train$adjusted
## Dickey-Fuller = 1.4876, Lag order = 15, p-value = 0.99
## alternative hypothesis: stationary
The KPSS test points to the same conclusion, although its null hypothesis is the opposite of the ADF's: here the null is that the process is level-stationary. Since the p-value (0.01) is below 0.05, we reject the null and again conclude that the process is non-stationary, so the series needs to be transformed and/or differenced before modeling.
kpss.test(train$adjusted)
##
## KPSS Test for Level Stationarity
##
## data: train$adjusted
## KPSS Level = 28.342, Truncation lag parameter = 9, p-value = 0.01
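Note that the two tests have opposite null hypotheses: the ADF null is that the series is non-stationary, while the KPSS null is that it is level-stationary. A small helper (a sketch, assuming the tseries package is loaded) runs both and reports the p-values side by side:
# Run ADF and KPSS together; remember their nulls are opposite
check_stationarity = function(x) {
  data.frame(
    test    = c("ADF", "KPSS"),
    null    = c("non-stationary", "level-stationary"),
    p_value = c(suppressWarnings(adf.test(x)$p.value),
                suppressWarnings(kpss.test(x)$p.value))
  )
}
check_stationarity(train$adjusted)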
goog_log = train %>%
mutate(close_log = log1p(adjusted))
goog_log %>%
ggplot()+
geom_line(aes(date, close_log))+
theme_bw()+
ggtitle("Google over Time (Log)")+
ylab("Adjusted Closing Price (Log)")+
xlab("Date")
goog_boxcox = train %>%
mutate(close_boxcox = forecast::BoxCox(adjusted,lambda='auto'))
goog_boxcox %>%
ggplot()+
geom_line(aes(date,close_boxcox))+
theme_bw()+
ggtitle("Google over Time (Box-Cox Transformation)")+
ylab("Closing Price (Box-Cox Tranformation)")+
xlab("Date")
goog_diff = goog_log %>%
mutate(close_diff = adjusted - lag(adjusted),
close_diff2 = adjusted - lag(adjusted,n = 2),
close_log_diff = close_log - lag(close_log),
close_log_diff2 = close_log - lag(close_log,n=2)) %>%
drop_na()
goog_diff %>%
select(date,adjusted,close_diff,close_diff2,close_log_diff,
close_log_diff2) %>%
drop_na() %>%
head()
## date adjusted close_diff close_diff2 close_log_diff
## 2007-01-05 2007-01-05 242.6853 1.957657 9.763398 0.008065987
## 2007-01-08 2007-01-08 240.8871 -1.798249 0.159408 -0.007406751
## 2007-01-09 2007-01-09 241.8435 0.956406 -0.841843 0.003946139
## 2007-01-10 2007-01-10 243.8161 1.972610 2.929016 0.008090154
## 2007-01-11 2007-01-11 248.9270 5.110840 7.083450 0.020661317
## 2007-01-12 2007-01-12 251.5571 2.630142 7.740982 0.010468655
## close_log_diff2
## 2007-01-05 0.0408903228
## 2007-01-08 0.0006592354
## 2007-01-09 -0.0034606122
## 2007-01-10 0.0120362934
## 2007-01-11 0.0287514715
## 2007-01-12 0.0311299720
goog_diff %>%
ggplot()+
geom_line(aes(date,close_diff))+
theme_bw()+
ggtitle("goog (First Difference)")+
ylab("Adjusted Closing Price (Difference))")+
xlab("Date")
goog_diff %>%
ggplot()+
geom_line(aes(date,close_log_diff2))+
theme_bw()+
ggtitle("goog (First Difference)")+
ylab("Adjusted Closing Price (Difference))")+
xlab("Date")
acf(goog_diff$close_log_diff,lag.max = 10)
pacf(goog_diff$close_log_diff,lag.max = 10)
adf.test(goog_diff$close_log_diff)
## Warning in adf.test(goog_diff$close_log_diff): p-value smaller than printed p-
## value
##
## Augmented Dickey-Fuller Test
##
## data: goog_diff$close_log_diff
## Dickey-Fuller = -15.197, Lag order = 15, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(goog_diff$close_log_diff)
## Warning in kpss.test(goog_diff$close_log_diff): p-value greater than printed p-
## value
##
## KPSS Test for Level Stationarity
##
## data: goog_diff$close_log_diff
## KPSS Level = 0.18832, Truncation lag parameter = 9, p-value = 0.1
From the ADF and KPSS tests above we confirm that the differenced log series is stationary: the ADF test rejects its null of non-stationarity (p-value = 0.01 < 0.05), and the KPSS test fails to reject its null of level stationarity (p-value = 0.1 > 0.05).
par(mfrow = c(1,2))
acf(goog_diff$close_log_diff,lag.max=20)
pacf(goog_diff$close_log_diff,lag.max=20)
AIC(
arima(goog_diff$close_log,order=c(0,1,1)),
arima(goog_diff$close_log,order=c(0,1,2)),
arima(goog_diff$close_log,order=c(0,1,3)),
arima(goog_diff$close_log,order=c(1,1,0)),
arima(goog_diff$close_log,order=c(1,1,1)),
arima(goog_diff$close_log,order=c(2,1,0)),
arima(goog_diff$close_log,order=c(3,1,0)),
arima(goog_diff$close_log,order=c(4,1,0)),
arima(goog_diff$close_log,order=c(4,2,0)),
arima(goog_diff$close_log,order=c(4,2,1)),
arima(goog_diff$close_log,order=c(5,1,0)),
arima(goog_diff$close_log,order=c(5,2,0))
)
## df AIC
## arima(goog_diff$close_log, order = c(0, 1, 1)) 2 -19569.36
## arima(goog_diff$close_log, order = c(0, 1, 2)) 3 -19567.37
## arima(goog_diff$close_log, order = c(0, 1, 3)) 4 -19565.68
## arima(goog_diff$close_log, order = c(1, 1, 0)) 2 -19569.37
## arima(goog_diff$close_log, order = c(1, 1, 1)) 3 -19567.37
## arima(goog_diff$close_log, order = c(2, 1, 0)) 3 -19567.37
## arima(goog_diff$close_log, order = c(3, 1, 0)) 4 -19565.60
## arima(goog_diff$close_log, order = c(4, 1, 0)) 5 -19563.94
## arima(goog_diff$close_log, order = c(4, 2, 0)) 5 -18849.26
## arima(goog_diff$close_log, order = c(4, 2, 1)) 6 -19553.06
## arima(goog_diff$close_log, order = c(5, 1, 0)) 6 -19562.05
## arima(goog_diff$close_log, order = c(5, 2, 0)) 6 -18910.72
BIC(
arima(goog_diff$close_log,order=c(0,1,1)),
arima(goog_diff$close_log,order=c(0,1,2)),
arima(goog_diff$close_log,order=c(0,1,3)),
arima(goog_diff$close_log,order=c(1,1,0)),
arima(goog_diff$close_log,order=c(1,1,1)),
arima(goog_diff$close_log,order=c(2,1,0)),
arima(goog_diff$close_log,order=c(3,1,0)),
arima(goog_diff$close_log,order=c(4,1,0)),
arima(goog_diff$close_log,order=c(4,2,0)),
arima(goog_diff$close_log,order=c(4,2,1)),
arima(goog_diff$close_log,order=c(5,1,0)),
arima(goog_diff$close_log,order=c(5,2,0))
)
## Warning in BIC.default(arima(goog_diff$close_log, order = c(0, 1, 1)),
## arima(goog_diff$close_log, : models are not all fitted to the same number of
## observations
## df BIC
## arima(goog_diff$close_log, order = c(0, 1, 1)) 2 -19556.89
## arima(goog_diff$close_log, order = c(0, 1, 2)) 3 -19548.66
## arima(goog_diff$close_log, order = c(0, 1, 3)) 4 -19540.74
## arima(goog_diff$close_log, order = c(1, 1, 0)) 2 -19556.90
## arima(goog_diff$close_log, order = c(1, 1, 1)) 3 -19548.66
## arima(goog_diff$close_log, order = c(2, 1, 0)) 3 -19548.67
## arima(goog_diff$close_log, order = c(3, 1, 0)) 4 -19540.66
## arima(goog_diff$close_log, order = c(4, 1, 0)) 5 -19532.76
## arima(goog_diff$close_log, order = c(4, 2, 0)) 5 -18818.08
## arima(goog_diff$close_log, order = c(4, 2, 1)) 6 -19515.65
## arima(goog_diff$close_log, order = c(5, 1, 0)) 6 -19524.63
## arima(goog_diff$close_log, order = c(5, 2, 0)) 6 -18873.31
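Listing every arima() call by hand is tedious; an equivalent approach (a sketch, using an assumed candidate grid of small p and q values with d = 1) loops over the orders and collects AIC and BIC in one table:
# Fit a small grid of ARIMA orders and gather the information criteria
orders = expand.grid(p = 0:3, d = 1, q = 0:3)
ic_table = purrr::pmap_dfr(orders, function(p, d, q) {
  fit = arima(goog_diff$close_log, order = c(p, d, q))
  tibble::tibble(order = paste0("(", p, ",", d, ",", q, ")"),
                 AIC = AIC(fit), BIC = BIC(fit))
})
ic_table %>% arrange(AIC)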
forecast::auto.arima(goog_diff$close_log,stationary=FALSE,allowdrift=FALSE,
seasonal=FALSE,stepwise=FALSE,approximation=FALSE)
## Series: goog_diff$close_log
## ARIMA(1,1,0)
##
## Coefficients:
## ar1
## -0.0474
## s.e. 0.0163
##
## sigma^2 = 0.0003275: log likelihood = 9786.69
## AIC=-19569.37 AICc=-19569.37 BIC=-19556.9
best_mod = arima(goog_diff$close_log,order=c(1,1,0))
forecast::checkresiduals(best_mod)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0)
## Q* = 21.155, df = 9, p-value = 0.01198
##
## Model df: 1. Total lags used: 10
par(mfrow=c(1,2))
resid = best_mod$residuals
acf(resid,lag.max=20)
pacf(resid,lag.max=20)
Box.test(resid,type='Ljung-Box',lag=1)
##
## Box-Ljung test
##
## data: resid
## X-squared = 0.0084401, df = 1, p-value = 0.9268
Box.test(resid,type='Ljung-Box',lag=5)
##
## Box-Ljung test
##
## data: resid
## X-squared = 0.77871, df = 5, p-value = 0.9784
Box.test(resid,type='Ljung-Box',lag=10)
##
## Box-Ljung test
##
## data: resid
## X-squared = 21.155, df = 10, p-value = 0.02004
Box.test(resid,type='Ljung-Box',lag=15)
##
## Box-Ljung test
##
## data: resid
## X-squared = 31.984, df = 15, p-value = 0.006471
Box.test(resid,type='Ljung-Box',lag=20)
##
## Box-Ljung test
##
## data: resid
## X-squared = 37.554, df = 20, p-value = 0.01003
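Rather than calling Box.test() once per lag, the same checks can be collected in a single loop (a short sketch):
# Ljung-Box p-values across several lags in one pass
lags = c(1, 5, 10, 15, 20)
ljung_pvals = sapply(lags, function(k) Box.test(resid, type = 'Ljung-Box', lag = k)$p.value)
data.frame(lag = lags, p_value = round(ljung_pvals, 4))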
best_mod = arima(goog_diff$close_log,order=c(1,1,0))
resid = best_mod$residuals
pred = goog_diff$close_log - resid # fitted values = observed - residuals
ggplot()+
geom_line(aes(goog_diff$date,goog_diff$close_log))+
geom_line(aes(goog_diff$date,pred),color='blue',alpha=0.4)+
theme_bw()+
xlab("Date")+
ylab("Log GooG")
RMSE = sqrt(mean((expm1(pred) - expm1(goog_diff$close_log))^2,na.rm=T))
RMSE
## [1] 15.51314
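For later comparison with the Prophet metrics, the same back-transformed fitted values can also be scored with MAE and MAPE (a sketch based on the in-sample fit above):
# MAE and MAPE on the price scale, using the back-transformed fitted values
fitted_price = expm1(pred)
actual_price = expm1(goog_diff$close_log)
MAE  = mean(abs(fitted_price - actual_price), na.rm = TRUE)
MAPE = mean(abs((fitted_price - actual_price) / actual_price), na.rm = TRUE)
c(MAE = MAE, MAPE = MAPE)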
best_mod %>%
forecast::forecast(h=5) %>%
autoplot()
prediction = predict(best_mod,n.ahead=12)
pred_data = data.frame(
pred = prediction,
date = test$date
)
pred_merge = test %>%
full_join(pred_data) %>%
mutate(
pred_high = expm1(pred.pred+2*pred.se),
pred_low = expm1(pred.pred - 2*pred.se),
pred.pred = expm1(pred.pred),
error = pred.pred - adjusted
)
## Joining, by = "date"
plot(forecast::forecast(best_mod,h=5),main="ARIMA(1,1,0) using Multi-Steps Forecast",ylab="Adj Price",xlab="Date")
plot(forecast::forecast(best_mod,h=180),main="ARIMA(1,1,0) using Multi-Steps Forecast",ylab="Adj Price",xlab="Date")
library(prophet)
prophet_data = goog_price %>%
rename(ds = date, # Have to name our date variable "ds"
y = adjusted) # Have to name our time series "y"
# Splitting into Train and Test Data
train = prophet_data %>%
filter(ds<ymd("2022-01-01"))
test = prophet_data %>%
filter(ds>=ymd("2022-01-01"))
model = prophet(train, yearly.seasonality = TRUE, daily.seasonality = TRUE)
future = make_future_dataframe(model,periods = 365)
forecast = predict(model,future)
plot(model,forecast)+
ylab("Adjusted Closing Price of Google Stocks")+xlab("Date")+theme_bw()
dyplot.prophet(model,forecast)
prophet_plot_components(model,forecast)
Let us make a two-year forecast to check whether saturation points are needed.
future_2yr = make_future_dataframe(model,periods = 730)
pred_2yr = predict(model,future_2yr)
dyplot.prophet(model,pred_2yr)
The two-year forecast does not produce any unrealistic values, so there is no need to set saturation points for this model.
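Had the long-horizon forecast drifted to implausible levels, Prophet's logistic-growth mode could cap it. A minimal sketch, where the cap of 3500 is an illustrative assumption rather than a value derived from the data:
train_cap    = train %>% mutate(cap = 3500)   # assumed carrying capacity
model_cap    = prophet(train_cap, growth = 'logistic')
future_cap   = make_future_dataframe(model_cap, periods = 730) %>% mutate(cap = 3500)
forecast_cap = predict(model_cap, future_cap)
dyplot.prophet(model_cap, forecast_cap)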
plot(model,forecast)+add_changepoints_to_plot(model)+theme_bw()+xlab("Date")+ylab("Adjusted Closing Price")
We can manually specify a larger number of changepoints to try to improve the fit.
# Number of Changepoints
model = prophet(train,n.changepoints=50)
forecast = predict(model,future)
plot(model,forecast)+add_changepoints_to_plot(model)+theme_bw()+xlab("Date")+ylab("Adjusted Stock Price")
prophet_plot_components(model,forecast)
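Instead of (or in addition to) raising the number of changepoints, the trend's flexibility can be tuned through changepoint.prior.scale (Prophet's default is 0.05); the value 0.5 below is only an illustrative assumption:
# Looser prior on trend changes = more flexible trend
model_flex    = prophet(train, changepoint.prior.scale = 0.5)
forecast_flex = predict(model_flex, future)
plot(model_flex, forecast_flex) + add_changepoints_to_plot(model_flex) + theme_bw()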
forecast_plot_data = forecast %>%
as_tibble() %>%
mutate(ds = as.Date(ds)) %>%
filter(ds>=ymd("2015-01-01"))
forecast_not_scaled = ggplot()+
geom_line(aes(test$ds,test$y))+ xlab("Date") + ylab("Adjusted Closing Price from test data") +
geom_line(aes(forecast_plot_data$ds,forecast_plot_data$yhat), color='steelblue')
forecast_scaled = forecast_not_scaled +
ylim(0,2500)
forecast_scaled
prophet_plot_components(model,forecast)
additive = prophet(train)
add_fcst = predict(additive,future)
plot(additive,add_fcst)+
ylim(0,2500)
prophet_plot_components(additive,add_fcst)
multi = prophet(train,seasonality.mode = 'multiplicative')
multi_forecast = predict(multi,future)
plot(multi,multi_forecast)+ylim(0,2500)
prophet_plot_components(multi, multi_forecast)
model = prophet(train,fit=FALSE)
model = add_country_holidays(model,country_name = 'US')
model = fit.prophet(model,train)
forecast = predict(model,future)
prophet_plot_components(model,forecast)
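Beyond the built-in US calendar, one-off events can also be supplied through the holidays argument as a data frame of event names and dates. A sketch with hypothetical earnings dates (the dates below are illustrative assumptions, not verified announcement dates):
earnings = data.frame(
  holiday = 'earnings',
  ds = as.Date(c('2021-02-02', '2021-04-27', '2021-07-27', '2021-10-26')),  # assumed dates
  lower_window = 0,
  upper_window = 1
)
model_events    = prophet(train, holidays = earnings)
forecast_events = predict(model_events, future)
prophet_plot_components(model_events, forecast_events)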
forecast %>%
filter(
holidays!=0,
ds == ymd("2020-11-11") |
ds == ymd("2020-10-12")) %>%
select(ds,`Veterans Day`,`Columbus Day`)
## ds Veterans Day Columbus Day
## 1 2020-10-12 0.00000 5.522341
## 2 2020-11-11 -11.63453 0.000000
forecast %>%
filter(holidays != 0) %>%
dplyr::select(-ds:-additive_terms_upper, -holidays:-holidays_upper, -weekly:-yhat, -contains("upper"), -contains("lower")) %>%
mutate_all(~ if_else(. == 0, as.numeric(NA), .)) %>%
summarize_if(is.numeric, ~ max(., na.rm = T)) %>%
pivot_longer(
cols = `Christmas Day`:`Washington's Birthday`,
names_to = 'holiday',
values_to = 'effect'
) %>%
ggplot() +
geom_col(aes(effect,holiday))+
theme_bw()
forecast_metric_data = forecast %>%
as_tibble() %>%
mutate(ds = as.Date(ds)) %>%
filter(ds>=ymd("2022-01-01"))
RMSE = sqrt(mean((test$y - forecast_metric_data$yhat)^2))
MAE = mean(abs(test$y - forecast_metric_data$yhat))
MAPE = mean(abs((test$y - forecast_metric_data$yhat)/test$y))
print(paste("RMSE:",round(RMSE,2)))
## [1] "RMSE: 264.46"
print(paste("MAE:",round(MAE,2)))
## [1] "MAE: 212.24"
print(paste("MAPE:",round(MAPE,2)))
## [1] "MAPE: 0.08"
On the test set, the Prophet model's out-of-sample performance metrics are RMSE = 264.46, MAE = 212.24, and MAPE = 0.08 (about 8%).
library(tidyverse)
library(caret)
library(prophet)
df.cv <- cross_validation(model, initial = 2000, period = 60, horizon = 120, units = 'days')
DT::datatable(df.cv)
unique(df.cv$cutoff)
## [1] "2012-08-20 GMT" "2012-10-19 GMT" "2012-12-18 GMT" "2013-02-16 GMT"
## [5] "2013-04-17 GMT" "2013-06-16 GMT" "2013-08-15 GMT" "2013-10-14 GMT"
## [9] "2013-12-13 GMT" "2014-02-11 GMT" "2014-04-12 GMT" "2014-06-11 GMT"
## [13] "2014-08-10 GMT" "2014-10-09 GMT" "2014-12-08 GMT" "2015-02-06 GMT"
## [17] "2015-04-07 GMT" "2015-06-06 GMT" "2015-08-05 GMT" "2015-10-04 GMT"
## [21] "2015-12-03 GMT" "2016-02-01 GMT" "2016-04-01 GMT" "2016-05-31 GMT"
## [25] "2016-07-30 GMT" "2016-09-28 GMT" "2016-11-27 GMT" "2017-01-26 GMT"
## [29] "2017-03-27 GMT" "2017-05-26 GMT" "2017-07-25 GMT" "2017-09-23 GMT"
## [33] "2017-11-22 GMT" "2018-01-21 GMT" "2018-03-22 GMT" "2018-05-21 GMT"
## [37] "2018-07-20 GMT" "2018-09-18 GMT" "2018-11-17 GMT" "2019-01-16 GMT"
## [41] "2019-03-17 GMT" "2019-05-16 GMT" "2019-07-15 GMT" "2019-09-13 GMT"
## [45] "2019-11-12 GMT" "2020-01-11 GMT" "2020-03-11 GMT" "2020-05-10 GMT"
## [49] "2020-07-09 GMT" "2020-09-07 GMT" "2020-11-06 GMT" "2021-01-05 GMT"
## [53] "2021-03-06 GMT" "2021-05-05 GMT" "2021-07-04 GMT" "2021-09-02 GMT"
df.cv %>%
ggplot()+
geom_point(aes(ds,y)) +
geom_point(aes(ds,yhat,color = factor(cutoff)))+
theme_bw()+
xlab("Date")+
ylab("Adjusted Closing Price")+
scale_color_discrete(name = 'Cutoff')
Next, we create a table of performance metrics at various forecast horizons.
DT::datatable(performance_metrics(df.cv))
Now we can plot how metrics such as RMSE, MAE, MAPE, MDAPE, and SMAPE evolve with the forecast horizon.
plot_cross_validation_metric(df.cv, metric = 'rmse')
plot_cross_validation_metric(df.cv, metric = 'mae')
plot_cross_validation_metric(df.cv, metric = 'mape')
plot_cross_validation_metric(df.cv, metric = 'mdape')
plot_cross_validation_metric(df.cv, metric = 'smape')
Comparing the RMSE values (15.513 for the ARIMA(1,1,0) fit versus 264.46 for the Prophet model on the test set), the ARIMA(1,1,0) model performs best. Hence, we use the ARIMA model to forecast the selected time series.