# Packages Required
if (!require(quantmod)) install.packages("quantmod")
if (!require(tidyverse)) install.packages("tidyverse")
if (!require(ggplot2)) install.packages("ggplot2")
if (!require(mltools)) install.packages("mltools")
if (!require(tseries)) install.packages("tseries")
if (!require(forecast)) install.packages("forecast")
if (!require(lubridate)) install.packages("lubridate")
if (!require(sjPlot)) install.packages("sjPlot")
library(ggplot2)
library(quantmod)
library(tidyverse)
library(mltools)
library(tseries)
library(forecast)
library(lubridate)
library(zoo)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(dplyr)
library(tidyr)
# Importing the dataset
appl_data <- read.csv('C:/Sai Anne/MSBA/Second Semester/BANA7050_002_Forecasting and Time Series Analysis/Final Project/APPLStockPrice.csv')
# Data Preprocessing & Exploratory Data Analysis
head(appl_data)
## date open high low close volume
## 1 12-Dec-80 0.1007 0.1011 0.1007 0.1007 469033600
## 2 15-Dec-80 0.0959 0.0959 0.0954 0.0954 175884800
## 3 16-Dec-80 0.0889 0.0889 0.0884 0.0884 105728000
## 4 17-Dec-80 0.0906 0.0911 0.0906 0.0906 86441600
## 5 18-Dec-80 0.0933 0.0937 0.0933 0.0933 73449600
## 6 19-Dec-80 0.0990 0.0994 0.0990 0.0990 48630400
tail(appl_data)
## date open high low close volume
## 10356 07-Jan-22 172.89 174.14 171.03 172.17 85034450
## 10357 10-Jan-22 169.08 172.50 168.17 172.19 103822290
## 10358 11-Jan-22 172.32 175.18 170.82 175.08 75937685
## 10359 12-Jan-22 176.12 177.18 174.82 175.53 72793796
## 10360 13-Jan-22 175.78 176.62 171.79 172.19 82273231
## 10361 14-Jan-22 171.34 173.78 171.09 173.07 80324582
summary(appl_data)
## date open high low
## Length:10361 Min. : 0.0390 Min. : 0.0390 Min. : 0.0385
## Class :character 1st Qu.: 0.2343 1st Qu.: 0.2395 1st Qu.: 0.2286
## Mode :character Median : 0.3832 Median : 0.3896 Median : 0.3765
## Mean : 12.6693 Mean : 12.8055 Mean : 12.5350
## 3rd Qu.: 11.8718 3rd Qu.: 11.9808 3rd Qu.: 11.7050
## Max. :182.6300 Max. :182.9400 Max. :179.1200
## close volume
## Min. : 0.0385 Min. :1.002e+06
## 1st Qu.: 0.2340 1st Qu.:1.257e+08
## Median : 0.3829 Median :2.216e+08
## Mean : 12.6756 Mean :3.309e+08
## 3rd Qu.: 11.8771 3rd Qu.:4.144e+08
## Max. :182.0100 Max. :2.147e+09
dim(appl_data)
## [1] 10361 6
str(appl_data)
## 'data.frame': 10361 obs. of 6 variables:
## $ date : chr "12-Dec-80" "15-Dec-80" "16-Dec-80" "17-Dec-80" ...
## $ open : num 0.1007 0.0959 0.0889 0.0906 0.0933 ...
## $ high : num 0.1011 0.0959 0.0889 0.0911 0.0937 ...
## $ low : num 0.1007 0.0954 0.0884 0.0906 0.0933 ...
## $ close : num 0.1007 0.0954 0.0884 0.0906 0.0933 ...
## $ volume: int 469033600 175884800 105728000 86441600 73449600 48630400 37363200 46950400 48003200 55574400 ...
# Converting the date data types for further analysis
appl_data['date'] <- as.Date(appl_data$date, "%d-%b-%y")
str(appl_data)
## 'data.frame': 10361 obs. of 6 variables:
## $ date : Date, format: "1980-12-12" "1980-12-15" ...
## $ open : num 0.1007 0.0959 0.0889 0.0906 0.0933 ...
## $ high : num 0.1011 0.0959 0.0889 0.0911 0.0937 ...
## $ low : num 0.1007 0.0954 0.0884 0.0906 0.0933 ...
## $ close : num 0.1007 0.0954 0.0884 0.0906 0.0933 ...
## $ volume: int 469033600 175884800 105728000 86441600 73449600 48630400 37363200 46950400 48003200 55574400 ...
View(appl_data)
colSums(is.na(appl_data))
## date open high low close volume
## 0 0 0 0 0 0
# Now let us consider the tech era of the dates and select the required rows
appl_tech_era <- appl_data[appl_data$date >= '2012-01-17',]
View(appl_tech_era)
# Finding the standard deviation of the closing price of the apple stock price
sd(appl_tech_era$close)
## [1] 40.11958
range(appl_tech_era$close)
## [1] 12.133 182.010
# Checking for the presence of outliers
par(mfrow = c(2,3), oma = c(1,1,0,0) + 0.1, mar = c(1,1,1,1) + 0.1)
boxplot(appl_tech_era$open,main = "Open")
boxplot(appl_tech_era$high,main = "High")
boxplot(appl_tech_era$low,main = "Low")
boxplot(appl_tech_era$close,main = "Close")
boxplot(appl_tech_era$volume,main = "Volume")
# Checking for the normal distribution of data
hist(appl_tech_era$close, main = "Apple Stock Closing Price in Tech Era(2012-2022)", xlab = "Closing Price", xlim = c(min(appl_tech_era$close) - 1 ,max(appl_tech_era$close) + 1), col = "#90A4AE", border = "#263238", breaks = 50)
ggplot(data = appl_tech_era, aes(sample = close)) + stat_qq() + stat_qq_line(col = '#90A4AE') + ggtitle('QQ plot of Apple Stock Closing Price')
# Apple Stock Closing Price Visualization Across Years
ggplot(data = appl_tech_era, aes(x = date, y = close)) + geom_line(col = "#455A64") + labs(title = "Apple Stock Price Visualization Across Years", subtitle = "2012-2022", x = "Time Frame", y = "Closing Price ($)")
# Using lm and glm Models
ggplot(data = appl_tech_era, aes(x = date, y = close)) + geom_line(col = "#455A64") + geom_smooth(method = "lm", color = "#90A4AE", lwd = 1, linetype = 'dashed') + geom_smooth(method = "glm", method.args = list(family = gaussian(link = "log")), color = "7986CB")
# Testing and Training the data
pm <- 1:(nrow(appl_tech_era)*0.75)
train_data <- appl_tech_era[pm,]
test_data <- appl_tech_era[-pm,]
asp.lm <- lm(close ~ date, data = train_data)
summary(asp.lm)
##
## Call:
## lm(formula = close ~ date, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.848 -2.899 -0.546 3.253 14.183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.874e+02 2.227e+00 -84.12 <2e-16 ***
## date 1.288e-02 1.330e-04 96.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.572 on 1886 degrees of freedom
## Multiple R-squared: 0.8324, Adjusted R-squared: 0.8324
## F-statistic: 9370 on 1 and 1886 DF, p-value: < 2.2e-16
# Finding the RMSE Value
predicted_closing_price <- predict(asp.lm, newdata = test_data[c('date')])
pred_test_data <- test_data
pred_test_data['predicted_closing_price'] <- predicted_closing_price
rmse_asp <- rmse(actuals = pred_test_data$close, pred = pred_test_data$predicted_closing_price)
# Plotting the Final Predicted Linear Regression Model
ggplot(data = train_data, aes(x = date, y = close)) + geom_line(colour = "#90A4AE", show.legend = TRUE) + geom_line(data = test_data, aes(x = date, y = close),colour = "#66BB6A") + geom_line(data = pred_test_data, aes(x = date, y = predicted_closing_price),colour = "#BF360C") + labs(title = "Predicted Apple Stock Price Visualization Across Years", subtitle = "2012-2022", x = "Time Frame", y = "Closing Price ($)")
appl_rollstat <- appl_tech_era %>% mutate(mean_close = zoo::rollmean(close, k = 180, fill = '#78909C'), sd_close = zoo::rollapply(close, FUN = sd, width = 180, fill = '#37474F'))
appl_rollmean <- appl_rollstat %>% ggplot() + geom_line(aes(date, mean_close)) + theme_bw() + ggtitle("Apple Stock Mean Over the Past Ten-Year Period (Six Month Rolling Window)") + ylab("Closing Price") + xlab("Date")
# Finding the mean of the Apple Stock
appl_rollmean
appl_rollsd <- appl_rollstat %>% ggplot() + geom_line(aes(date, sd_close)) + theme_bw() + ggtitle("Apple Stock Standard Deviation Over the Past Ten-Year Period (Six Month Rolling Window)") + ylab("Closing Price") + xlab("Date")
# Finding the Standard Deviation of the Apple Stock
appl_rollsd
print(adf.test(appl_tech_era$close))
##
## Augmented Dickey-Fuller Test
##
## data: appl_tech_era$close
## Dickey-Fuller = 0.35617, Lag order = 13, p-value = 0.99
## alternative hypothesis: stationary
kpss.test(appl_tech_era$close, null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: appl_tech_era$close
## KPSS Trend = 5.2235, Truncation lag parameter = 8, p-value = 0.01
appl_techlog = appl_tech_era %>% mutate(closing_log = log1p(close))
appl_techlog %>% ggplot() + geom_line(aes(date, closing_log)) + theme_bw() + ggtitle("Apple Stock over Time (Log)") + ylab("Closing Price (Log)") + xlab("Date")
appl_boxcox = appl_tech_era %>% mutate(close_boxcox = forecast::BoxCox(close,lambda = 'auto'))
appl_boxcox %>% ggplot() + geom_line(aes(date, close_boxcox)) + theme_bw() + ggtitle("Box-Cox Transformation for Apple Stock") + ylab("Box-Cox Transformation of Closing Price") + xlab("Date")
appl_fst_diff = appl_techlog %>% mutate(diff_close = close - lag(close))
appl_fst_diff %>% select(date, close, diff_close) %>% head()
## date close diff_close
## 7844 2012-01-17 13.0028 NA
## 7845 2012-01-18 13.1378 0.1350
## 7846 2012-01-19 13.0962 -0.0416
## 7847 2012-01-20 12.8681 -0.2281
## 7848 2012-01-23 13.0858 0.2177
## 7849 2012-01-24 12.8715 -0.2143
appl_fst_diff %>% ggplot() + geom_line(aes(date, diff_close)) + theme_bw() + ggtitle("First Difference of the Apple Stock") + ylab("Difference in Closing Price)") + xlab("Date")
appl_fst_diff = appl_tech_era %>% mutate(close_log = log1p(close)) %>% mutate(close_diff = close_log - lag(close_log))
appl_fst_diff %>% ggplot() + geom_line(aes(date,close_diff)) + theme_bw() + ggtitle("Apple Stock(Log; First Difference)") + ylab("Log Closing Price (Difference))") + xlab("Date")
acf(appl_fst_diff$close)
pacf(appl_fst_diff$close)
# For making the variance stationary, let us use the log transformation
appl_techlog_diff <- appl_tech_era %>% arrange(date) %>% mutate(log_close = log1p(close), close_diff = close - lag(close), close_log_diff = log_close - lag(log_close)) %>% drop_na()
# For making the mean stationary, let us use the difference
appl_techlog_diff %>% ggplot() + geom_line(aes(date, close_log_diff)) + theme_bw() + ggtitle("Difference in Log Values for Apple Stock Over the Past 10 Years") + ylab("Closing Price (Difference))") + xlab("Date")
adf.test(appl_techlog_diff$close)
##
## Augmented Dickey-Fuller Test
##
## data: appl_techlog_diff$close
## Dickey-Fuller = 0.35153, Lag order = 13, p-value = 0.99
## alternative hypothesis: stationary
adf.test(appl_techlog_diff$close_diff)
##
## Augmented Dickey-Fuller Test
##
## data: appl_techlog_diff$close_diff
## Dickey-Fuller = -12.801, Lag order = 13, p-value = 0.01
## alternative hypothesis: stationary
adf.test(appl_techlog_diff$log_close)
##
## Augmented Dickey-Fuller Test
##
## data: appl_techlog_diff$log_close
## Dickey-Fuller = -1.6027, Lag order = 13, p-value = 0.7465
## alternative hypothesis: stationary
adf.test(appl_techlog_diff$close_log_diff)
##
## Augmented Dickey-Fuller Test
##
## data: appl_techlog_diff$close_log_diff
## Dickey-Fuller = -12.555, Lag order = 13, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(appl_techlog_diff$close)
##
## KPSS Test for Level Stationarity
##
## data: appl_techlog_diff$close
## KPSS Level = 20.401, Truncation lag parameter = 8, p-value = 0.01
kpss.test(appl_techlog_diff$close_diff)
##
## KPSS Test for Level Stationarity
##
## data: appl_techlog_diff$close_diff
## KPSS Level = 0.9068, Truncation lag parameter = 8, p-value = 0.01
kpss.test(appl_techlog_diff$log_close)
##
## KPSS Test for Level Stationarity
##
## data: appl_techlog_diff$log_close
## KPSS Level = 25.289, Truncation lag parameter = 8, p-value = 0.01
kpss.test(appl_techlog_diff$close_log_diff)
##
## KPSS Test for Level Stationarity
##
## data: appl_techlog_diff$close_log_diff
## KPSS Level = 0.19031, Truncation lag parameter = 8, p-value = 0.1
par(mfrow = c(1,2))
acf(appl_techlog_diff$close_log_diff,lag.max = 20)
pacf(appl_techlog_diff$close_log_diff,lag.max = 20)
AIC(
arima(appl_techlog_diff$close_log,order = c(0,1,1)),
arima(appl_techlog_diff$close_log,order = c(0,1,2)),
arima(appl_techlog_diff$close_log,order = c(0,1,3)),
arima(appl_techlog_diff$close_log,order = c(1,1,0)),
arima(appl_techlog_diff$close_log,order = c(1,1,1)),
arima(appl_techlog_diff$close_log,order = c(2,1,0)),
arima(appl_techlog_diff$close_log,order = c(3,1,0))
)
## df AIC
## arima(appl_techlog_diff$close_log, order = c(0, 1, 1)) 2 -13244.24
## arima(appl_techlog_diff$close_log, order = c(0, 1, 2)) 3 -13252.96
## arima(appl_techlog_diff$close_log, order = c(0, 1, 3)) 4 -13250.99
## arima(appl_techlog_diff$close_log, order = c(1, 1, 0)) 2 -12191.81
## arima(appl_techlog_diff$close_log, order = c(1, 1, 1)) 3 -13253.03
## arima(appl_techlog_diff$close_log, order = c(2, 1, 0)) 3 -12487.45
## arima(appl_techlog_diff$close_log, order = c(3, 1, 0)) 4 -12673.58
BIC(
arima(appl_techlog_diff$close_log,order = c(0,1,1)),
arima(appl_techlog_diff$close_log,order = c(0,1,2)),
arima(appl_techlog_diff$close_log,order = c(0,1,3)),
arima(appl_techlog_diff$close_log,order = c(1,1,0)),
arima(appl_techlog_diff$close_log,order = c(1,1,1)),
arima(appl_techlog_diff$close_log,order = c(2,1,0)),
arima(appl_techlog_diff$close_log,order = c(3,1,0))
)
## df BIC
## arima(appl_techlog_diff$close_log, order = c(0, 1, 1)) 2 -13232.58
## arima(appl_techlog_diff$close_log, order = c(0, 1, 2)) 3 -13235.47
## arima(appl_techlog_diff$close_log, order = c(0, 1, 3)) 4 -13227.67
## arima(appl_techlog_diff$close_log, order = c(1, 1, 0)) 2 -12180.15
## arima(appl_techlog_diff$close_log, order = c(1, 1, 1)) 3 -13235.54
## arima(appl_techlog_diff$close_log, order = c(2, 1, 0)) 3 -12469.96
## arima(appl_techlog_diff$close_log, order = c(3, 1, 0)) 4 -12650.26
auto.arima(appl_techlog_diff$close_log,stationary = FALSE, allowdrift = TRUE, seasonal = FALSE, stepwise = FALSE, approximation = FALSE)
## Series: appl_techlog_diff$close_log
## ARIMA(1,0,0) with non-zero mean
##
## Coefficients:
## ar1 mean
## -0.0657 1e-03
## s.e. 0.0199 3e-04
##
## sigma^2 = 0.0003004: log likelihood = 6636.62
## AIC=-13267.24 AICc=-13267.23 BIC=-13249.75
best_apple_techmodel = arima(appl_techlog_diff$close_log, order = c(1,0,0))
forecast::checkresiduals(best_apple_techmodel)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 49.238, df = 8, p-value = 5.723e-08
##
## Model df: 2. Total lags used: 10
residual = best_apple_techmodel$residual
Box.test(residual, type = 'Ljung-Box', lag = 1)
##
## Box-Ljung test
##
## data: residual
## X-squared = 3.4678e-05, df = 1, p-value = 0.9953
Box.test(residual, type = 'Ljung-Box', lag = 2)
##
## Box-Ljung test
##
## data: residual
## X-squared = 0.001695, df = 2, p-value = 0.9992
Box.test(residual, type = 'Ljung-Box', lag = 3)
##
## Box-Ljung test
##
## data: residual
## X-squared = 0.90498, df = 3, p-value = 0.8242
Box.test(residual, type = 'Ljung-Box', lag = 4)
##
## Box-Ljung test
##
## data: residual
## X-squared = 1.0389, df = 4, p-value = 0.9038
Box.test(residual, type = 'Ljung-Box', lag = 5)
##
## Box-Ljung test
##
## data: residual
## X-squared = 1.3882, df = 5, p-value = 0.9256
best_apple_techmodel = arima(appl_techlog_diff$log_close, order = c(1,1,0))
residual = best_apple_techmodel$residuals
pred_apple_techmodel = residual + appl_techlog_diff$log_close
ggplot() + geom_line(aes(appl_techlog_diff$date, appl_techlog_diff$log_close)) + geom_line(aes(appl_techlog_diff$date,pred_apple_techmodel),color = '#2196F3',alpha = 0.4) + theme_bw() + xlab("Date") + ylab("Log Apple Stock")
RMSE = sqrt(mean((expm1(pred_apple_techmodel) - expm1(appl_techlog_diff$log_close))^2, na.rm = T))
RMSE
## [1] 1.180576
best_apple_techmodel %>% forecast(h = 150) %>% autoplot()
prediction = predict(best_apple_techmodel, n.ahead = 2518)
pred_data = data.frame(pred = prediction, date = appl_tech_era$date)
pred_data = data.frame(pred = prediction, date = appl_tech_era$date)
pred_merge = appl_tech_era %>% full_join(pred_data) %>% mutate(
pred_high = expm1(pred.pred + 2*pred.se),
pred_low = expm1(pred.pred - 2*pred.se),
pred.pred = expm1(pred.pred),
error = pred.pred - close
)
plot(forecast::forecast(best_apple_techmodel, h = 150), main = "ARIMA(1,1,0) using Multi-Steps Forecast", ylab = "Closing Price", xlab = "Date")
forecast = best_apple_techmodel %>% forecast(h = 150)
forecast$mean = exp(forecast$mean) - 1
forecast$fitted = exp(forecast$fitted) - 1
forecast$x = exp(forecast$x) - 1
autoplot(forecast, main = 'Forecast of Apple Stock Price Over Time', ylab = 'Actual Apple Stock Price')
#install.packages("prophet")
library(prophet)
prophet_data = appl_fst_diff %>% rename(ds = date, # Have to name our date variable "ds"
y = close) # Have to name our time series "y"
train = prophet_data %>% filter(ds < ymd("2021-01-01"))
test = prophet_data %>% filter(ds >= ymd("2021-01-01"))
model = prophet(train)
future = make_future_dataframe(model, periods = 365)
forecast = predict(model, future)
plot(model, forecast) + ylab("Closing Price of Apple Stock Price") + xlab("Date") + theme_bw()
prophet_plot_components(model, forecast)
dyplot.prophet(model, forecast)
prophet_plot_components(model, forecast)
two_yr_future = make_future_dataframe(model, periods = 730)
two_yr_forecast = predict(model, two_yr_future)
plot(model, two_yr_forecast) + theme_bw() + xlab("Date") + ylab("Closing Price")
plot(model, forecast) + add_changepoints_to_plot(model) + theme_bw() + xlab("Date") + ylab("Closing price")
model = prophet(train, n.changepoints = 50)
forecast = predict(model, future)
plot(model, forecast) + add_changepoints_to_plot(model) + theme_bw() + xlab("Date") + ylab("Closing Price")
prophet_plot_components(model, forecast)
forecast_plot_data = forecast %>% as_tibble() %>% mutate(ds = as.Date(ds)) %>% filter(ds >= ymd("2021-01-01"))
forecast_not_scaled = ggplot() + geom_line(aes(test$ds,test$y)) + xlab("Date") + ylab("Closing Price from Test Data") + geom_line(aes(forecast_plot_data$ds, forecast_plot_data$yhat), color = 'green')
forecast_scaled = forecast_not_scaled + ylim(0, 200)
forecast_scaled
prophet_plot_components(model, forecast)
additive = prophet(train)
add_fcst = predict(additive, future)
plot(additive, add_fcst) + ylim(0, 200)
prophet_plot_components(additive, add_fcst)
multi = prophet(train, seasonality.mode = 'multiplicative', yearly.seasonality = TRUE)
multi_fcst = predict(multi, future)
plot(multi, multi_fcst) + ylim(0,200)
prophet_plot_components(multi, multi_fcst)
model = prophet(train, fit = FALSE, yearly.seasonality = TRUE)
model = add_country_holidays(model, country_name = 'US')
model = fit.prophet(model, train)
forecast = predict(model, future)
prophet_plot_components(model, forecast)
forecast %>% filter(holidays != 0) %>% dplyr::select(-ds:-additive_terms_upper, -holidays:-holidays_upper, -weekly:-yhat, -contains("upper"), -contains("lower")) %>% mutate_all(~ if_else(. == 0, as.numeric(NA), .)) %>% summarize_if(is.numeric, ~ max(., na.rm = T)) %>% pivot_longer(cols = `Christmas Day`:`Washington's Birthday`, names_to = 'holiday', values_to = 'effect') %>% ggplot() + geom_col(aes(effect, holiday)) + theme_bw()
forecast_metric_data = forecast %>% as_tibble() %>% mutate(ds = as.Date(ds)) %>% filter(ds <= ymd("2021-01-01"))
RMSE = sqrt(mean((train$y - forecast_metric_data$yhat)^2))
MAE = mean(abs(train$y - forecast_metric_data$yhat))
MAPE = mean(abs((train$y - forecast_metric_data$yhat)/train$y))
print(paste("RMSE:", round(RMSE,2)))
## [1] "RMSE: 4.88"
print(paste("MAE:", round(MAE,2)))
## [1] "MAE: 2.69"
print(paste("MAPE:", round(MAPE,2)))
## [1] "MAPE: 0.07"
df.cv <- cross_validation(model, initial = 365, period = 30, horizon = 30, units = 'days')
head(df.cv)
## y ds yhat yhat_lower yhat_upper cutoff
## 1 14.5367 2013-02-12 15.14146 14.70668 15.58909 2013-02-11
## 2 14.5091 2013-02-13 15.24569 14.81442 15.71051 2013-02-11
## 3 14.4960 2013-02-14 15.23505 14.77577 15.68280 2013-02-11
## 4 14.2963 2013-02-15 15.19228 14.73189 15.61070 2013-02-11
## 5 14.2910 2013-02-19 15.38728 14.95277 15.84114 2013-02-11
## 6 13.9449 2013-02-20 15.41446 14.98357 15.87737 2013-02-11
unique(df.cv$cutoff)
## [1] "2013-02-11 GMT" "2013-03-13 GMT" "2013-04-12 GMT" "2013-05-12 GMT"
## [5] "2013-06-11 GMT" "2013-07-11 GMT" "2013-08-10 GMT" "2013-09-09 GMT"
## [9] "2013-10-09 GMT" "2013-11-08 GMT" "2013-12-08 GMT" "2014-01-07 GMT"
## [13] "2014-02-06 GMT" "2014-03-08 GMT" "2014-04-07 GMT" "2014-05-07 GMT"
## [17] "2014-06-06 GMT" "2014-07-06 GMT" "2014-08-05 GMT" "2014-09-04 GMT"
## [21] "2014-10-04 GMT" "2014-11-03 GMT" "2014-12-03 GMT" "2015-01-02 GMT"
## [25] "2015-02-01 GMT" "2015-03-03 GMT" "2015-04-02 GMT" "2015-05-02 GMT"
## [29] "2015-06-01 GMT" "2015-07-01 GMT" "2015-07-31 GMT" "2015-08-30 GMT"
## [33] "2015-09-29 GMT" "2015-10-29 GMT" "2015-11-28 GMT" "2015-12-28 GMT"
## [37] "2016-01-27 GMT" "2016-02-26 GMT" "2016-03-27 GMT" "2016-04-26 GMT"
## [41] "2016-05-26 GMT" "2016-06-25 GMT" "2016-07-25 GMT" "2016-08-24 GMT"
## [45] "2016-09-23 GMT" "2016-10-23 GMT" "2016-11-22 GMT" "2016-12-22 GMT"
## [49] "2017-01-21 GMT" "2017-02-20 GMT" "2017-03-22 GMT" "2017-04-21 GMT"
## [53] "2017-05-21 GMT" "2017-06-20 GMT" "2017-07-20 GMT" "2017-08-19 GMT"
## [57] "2017-09-18 GMT" "2017-10-18 GMT" "2017-11-17 GMT" "2017-12-17 GMT"
## [61] "2018-01-16 GMT" "2018-02-15 GMT" "2018-03-17 GMT" "2018-04-16 GMT"
## [65] "2018-05-16 GMT" "2018-06-15 GMT" "2018-07-15 GMT" "2018-08-14 GMT"
## [69] "2018-09-13 GMT" "2018-10-13 GMT" "2018-11-12 GMT" "2018-12-12 GMT"
## [73] "2019-01-11 GMT" "2019-02-10 GMT" "2019-03-12 GMT" "2019-04-11 GMT"
## [77] "2019-05-11 GMT" "2019-06-10 GMT" "2019-07-10 GMT" "2019-08-09 GMT"
## [81] "2019-09-08 GMT" "2019-10-08 GMT" "2019-11-07 GMT" "2019-12-07 GMT"
## [85] "2020-01-06 GMT" "2020-02-05 GMT" "2020-03-06 GMT" "2020-04-05 GMT"
## [89] "2020-05-05 GMT" "2020-06-04 GMT" "2020-07-04 GMT" "2020-08-03 GMT"
## [93] "2020-09-02 GMT" "2020-10-02 GMT" "2020-11-01 GMT" "2020-12-01 GMT"
df.cv %>% ggplot() + geom_point(aes(ds,y)) + geom_point(aes(ds, yhat, color = factor(cutoff))) + theme_bw() + xlab("Date") + ylab("Closing Price") + scale_color_discrete(name = 'Cutoff')
performance_metrics(df.cv)
## horizon mse rmse mae mape mdape smape
## 1 3 days 39.75505 6.305160 3.325812 0.06472524 0.05072974 0.06647809
## 2 4 days 40.21200 6.341293 3.445867 0.06827496 0.05312153 0.06996177
## 3 5 days 35.52350 5.960159 3.307464 0.06821516 0.05014260 0.06958385
## 4 6 days 35.38933 5.948893 3.384482 0.06942759 0.05009282 0.07082936
## 5 7 days 39.44923 6.280862 3.540424 0.07131226 0.05030320 0.07272675
## 6 8 days 44.41664 6.664581 3.768618 0.07450224 0.05688070 0.07619917
## 7 9 days 48.98105 6.998646 3.969377 0.07963435 0.06568888 0.08141239
## 8 10 days 48.07204 6.933400 3.910116 0.07922354 0.06742724 0.08123131
## 9 11 days 51.32232 7.163960 4.043606 0.08185248 0.07007794 0.08395039
## 10 12 days 47.83803 6.916504 3.907534 0.08007399 0.06189538 0.08184172
## 11 13 days 45.56370 6.750089 3.883158 0.08002143 0.06087173 0.08169688
## 12 14 days 43.68273 6.609291 3.850812 0.08081468 0.06087173 0.08211306
## 13 15 days 48.44418 6.960185 4.040590 0.08310397 0.06651563 0.08493101
## 14 16 days 53.31952 7.302021 4.224695 0.08703659 0.06726181 0.08900963
## 15 17 days 55.12138 7.424377 4.302720 0.08824665 0.06726181 0.09064999
## 16 18 days 56.17753 7.495167 4.226918 0.08767615 0.06921929 0.09031794
## 17 19 days 49.65979 7.046971 4.043715 0.08548973 0.06864963 0.08810995
## 18 20 days 43.97730 6.631538 3.857848 0.08320368 0.06659101 0.08595789
## 19 21 days 44.82548 6.695184 4.001558 0.08805706 0.06659101 0.09075817
## 20 22 days 52.17340 7.223116 4.224118 0.09299574 0.06970217 0.09560804
## 21 23 days 62.58906 7.911325 4.539521 0.09742956 0.07546466 0.10028650
## 22 24 days 62.26636 7.890904 4.514033 0.09729879 0.07657231 0.10018883
## 23 25 days 60.51640 7.779229 4.362385 0.09445292 0.07657231 0.09757277
## 24 26 days 50.03183 7.073318 4.064287 0.09187314 0.07150802 0.09482500
## 25 27 days 48.58968 6.970630 4.100951 0.09199260 0.06936275 0.09516313
## 26 28 days 54.82826 7.404611 4.392943 0.09780476 0.07591803 0.10123549
## 27 29 days 67.23275 8.199558 4.674757 0.10101966 0.08102599 0.10449830
## 28 30 days 77.40861 8.798216 4.897428 0.10336557 0.08664496 0.10682623
## coverage
## 1 0.3819073
## 2 0.3647671
## 3 0.3719075
## 4 0.3907188
## 5 0.4028926
## 6 0.3766772
## 7 0.3244207
## 8 0.3064236
## 9 0.3054070
## 10 0.3296197
## 11 0.3433590
## 12 0.3400155
## 13 0.3383838
## 14 0.3317503
## 15 0.3265993
## 16 0.3449648
## 17 0.3621288
## 18 0.3717021
## 19 0.3221016
## 20 0.2660196
## 21 0.2311171
## 22 0.2347354
## 23 0.2786690
## 24 0.3091856
## 25 0.3213675
## 26 0.2999109
## 27 0.2929293
## 28 0.2848179
plot_cross_validation_metric(df.cv, metric = 'rmse')
plot_cross_validation_metric(df.cv, metric = 'mae')
plot_cross_validation_metric(df.cv, metric = 'mape')
plot_cross_validation_metric(df.cv, metric = 'mdape')
plot_cross_validation_metric(df.cv, metric = 'smape')
train_log <- train %>% mutate(y_log = log1p(train$y)) %>% drop_na()
best_arima_mod <- auto.arima(train_log$y_log, stationary = FALSE, allowdrift = TRUE, seasonal = FALSE, stepwise = FALSE, approximation = FALSE)
checkresiduals(best_arima_mod)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)
## Q* = 20.179, df = 6, p-value = 0.002574
##
## Model df: 4. Total lags used: 10
test_pred = predict(best_arima_mod, 262)
test_arima_pred = test_pred$pred
arima_error = test$y - expm1(test_arima_pred)
out_arima_rmse <- sqrt(mean(arima_error^2, na.rm = T))
out_arima_rmse
## [1] 18.89076
best_prophet_mod <- prophet(train)
future <- make_future_dataframe(model, periods = 262)
forecast <- predict(best_prophet_mod, future)
forecast_metric_data <- forecast %>% as_tibble() %>% mutate(ds = as.Date(ds)) %>% filter(ds >= ymd("2021-01-01"))
out_prophet_rmse <- sqrt(mean((test$y - forecast_metric_data$yhat)^2))
out_prophet_rmse
## [1] 13.20644
Out-of-sample RMSE for best prophet model is 13.20644
tibble(`best_ARIMA` = round(out_arima_rmse,2), `best_Prophet` = round(out_prophet_rmse,2))
## # A tibble: 1 x 2
## best_ARIMA best_Prophet
## <dbl> <dbl>
## 1 18.9 13.2