The data set used for this time series analysis is Amazon (AMZN) stock price data, obtained with the quantmod package, which pulls data from Yahoo Finance. The time period covers the last two years, April 2020 to April 2022, at daily frequency. The goal of this project is to forecast Amazon's stock price.
I chose this data set because my MBA major is in Finance and Operations. While studying, I became interested in stock prices and how they perform over time, so this project will help me further my understanding of how stock prices behave over time.
## Loading required packages
library(quantmod)
library(data.table)
library(psych)
library(ggplot2)
library(tidyverse)
library(sjPlot)
library(dplyr)
library(tseries)
library(zoo)
library(forecast)
library(ymd)
library(bizdays)
options("getSymbols.warning4.0" = FALSE)  # suppress quantmod's getSymbols warning message
amazon_data <- getSymbols(Symbols = "AMZN", src = "yahoo",
from = "2020-04-02", to = "2022-04-02", auto.assign = FALSE)
amazon_data1 <- amazon_data
amazon_data1 <- as.data.table(amazon_data1, keep.rownames = "Date")
names(amazon_data1) <- c("Date", "Open", "High", "Low", "Close", "Volume", "Adjusted")
head(amazon_data1)
## Date Open High Low Close Volume Adjusted
## 1: 2020-04-02 1901.64 1927.53 1890.00 1918.83 4336000 1918.83
## 2: 2020-04-03 1911.15 1926.33 1889.15 1906.59 3609900 1906.59
## 3: 2020-04-06 1936.00 1998.52 1930.02 1997.59 5773200 1997.59
## 4: 2020-04-07 2017.11 2035.72 1997.62 2011.60 5114000 2011.60
## 5: 2020-04-08 2021.00 2044.00 2011.15 2043.00 3977300 2043.00
## 6: 2020-04-09 2044.30 2053.00 2017.66 2042.76 4646500 2042.76
attach(amazon_data1)
str(amazon_data1)
## Classes 'data.table' and 'data.frame': 466 obs. of 7 variables:
## $ Date : Date, format: "2020-04-02" "2020-04-03" ...
## $ Open : num 1902 1911 1936 2017 2021 ...
## $ High : num 1928 1926 1999 2036 2044 ...
## $ Low : num 1890 1889 1930 1998 2011 ...
## $ Close : num 1919 1907 1998 2012 2043 ...
## $ Volume : num 4336000 3609900 5773200 5114000 3977300 ...
## $ Adjusted: num 1919 1907 1998 2012 2043 ...
## - attr(*, ".internal.selfref")=<externalptr>
describe(amazon_data1)
## vars n mean sd median trimmed mad
## Date 1 466 NaN NA NA NaN NA
## Open 2 466 3163.41 343.92 3230.00 3213.89 197.61
## High 3 466 3199.02 343.02 3264.18 3249.47 203.10
## Low 4 466 3123.89 341.74 3192.38 3172.86 208.37
## Close 5 466 3161.25 340.44 3228.22 3210.71 213.63
## Volume 6 466 3999165.67 1628510.40 3580750.00 3777240.11 1272737.97
## Adjusted 7 466 3161.25 340.44 3228.22 3210.71 213.63
## min max range skew kurtosis se
## Date Inf -Inf -Inf NA NA NA
## Open 1901.64 3744.00 1842.36 -1.42 1.85 15.93
## High 1926.33 3773.08 1846.75 -1.42 1.88 15.89
## Low 1889.15 3696.79 1807.64 -1.37 1.67 15.83
## Close 1906.59 3731.41 1824.82 -1.38 1.72 15.77
## Volume 1451900.00 12640500.00 11188600.00 1.63 3.92 75439.29
## Adjusted 1906.59 3731.41 1824.82 -1.38 1.72 15.77
#t(describe(amazon_data1))
sum(is.na(amazon_data1))
## [1] 0
par(mfrow=c(3,3))
boxplot(amazon_data1$Open,main = "Open")
boxplot(amazon_data1$High,main = "High")
boxplot(amazon_data1$Low,main = "Low")
boxplot(amazon_data1$Close,main = "Close")
boxplot(amazon_data1$Volume,main = "Volume")
boxplot(amazon_data1$Adjusted,main = "Adjusted")
par(mfrow=c(3,3))
hist(amazon_data1$Open, freq = FALSE, main = "Open")
lines(density(amazon_data1$Open), lwd=5, col='blue')
hist(amazon_data1$High, freq = FALSE, main = "High")
lines(density(amazon_data1$High), lwd=5, col='blue')
hist(amazon_data1$Low, freq = FALSE, main = "Low")
lines(density(amazon_data1$Low), lwd=5, col='blue')
hist(amazon_data1$Close, freq = FALSE, main = "Close")
lines(density(amazon_data1$Close), lwd=5, col='blue')
hist(amazon_data1$Volume, freq = FALSE, main = "Volume")
lines(density(amazon_data1$Volume), lwd=5, col='blue')
hist(amazon_data1$Adjusted, freq = FALSE, main = "Adjusted")
lines(density(amazon_data1$Adjusted), lwd=5, col='blue')
#round(cor(amazon_data1),2)
GGally::ggpairs(
data = amazon_data1[, c(2,3,4,5,6,7)]
)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggplot(amazon_data1)+
geom_line(aes(Date,Adjusted))+
theme_bw()+
xlab("Date")+
ylab("Adjusted Closing Price")+
labs(
title = 'Adj.Closing Price over time',
subtitle = 'Apr 2020 - Apr 2022'
)+
geom_smooth(aes(Date,Adjusted),method='lm',color='red', size=0.1)
## `geom_smooth()` using formula 'y ~ x'
ggplot(amazon_data1)+
geom_line(aes(Date,Volume))+
theme_bw()+
xlab("Date")+
ylab("Volume Traded")+
labs(
title = 'Volume traded over time',
subtitle = 'Apr 2020 - Apr 2022'
)
amazon_data1_lm <- lm(Adjusted ~ Date, data = amazon_data1)
summary(amazon_data1_lm)
##
## Call:
## lm(formula = Adjusted ~ Date, data = amazon_data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -856.55 -112.06 34.22 166.16 588.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.897e+04 1.114e+03 -17.04 <2e-16 ***
## Date 1.184e+00 5.958e-02 19.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 250.5 on 464 degrees of freedom
## Multiple R-squared: 0.4599, Adjusted R-squared: 0.4587
## F-statistic: 395.1 on 1 and 464 DF, p-value: < 2.2e-16
tab_model(amazon_data1_lm)
Dependent variable: Adjusted

| Predictors | Estimates | CI | p |
|---|---|---|---|
| (Intercept) | -18973.98 | -21162.48 – -16785.48 | <0.001 |
| Date | 1.18 | 1.07 – 1.30 | <0.001 |
| Observations | 466 | | |
| R2 / R2 adjusted | 0.460 / 0.459 | | |
ggplot(amazon_data1)+
geom_line(aes(Date,Adjusted))+
theme_bw()+
xlab("Date")+
ylab("Adjusted Closing Price")+
labs(
title = 'Adj.Closing Price over time',
subtitle = 'Apr 2020 - Apr 2022'
)
The adjusted closing price in the graph above increases over time from April 2020 to April 2022. To check whether the time series is stationary, I compute the rolling mean and rolling standard deviation.
amazon_data1_roll <- amazon_data1 %>%
mutate(
adj_close_mean = zoo::rollmean(
Adjusted,
k=180,
fill = NA),
adj_close_sd = zoo::rollapply(
Adjusted,
FUN = sd,
width = 180,
fill = NA)
)
amazon_data1_rollmean <- amazon_data1_roll %>%
ggplot() +
geom_line(aes(Date, adj_close_mean)) +
theme_bw() +
ggtitle("Amazon Mean over Time (6 month rolling window)") +
ylab("Adj Closing Price") +
xlab("Date")
amazon_data1_rollsd <- amazon_data1_roll %>%
ggplot() +
geom_line(aes(Date, adj_close_sd)) +
theme_bw() +
ggtitle("Differenced Amazon SD over Time (6 month rolling window)") +
ylab("Adj Closing Price") +
xlab("Date")
amazon_data1_rollmean
amazon_data1_rollsd
The rolling mean and rolling standard deviation graphs above show that the adjusted closing price series is non-stationary in both mean and variance.
I perform the Augmented Dickey-Fuller (ADF) test and the KPSS test to check for stationarity. For stationary data, the ADF test should give a p-value < 0.05, while the KPSS test should give a p-value > 0.05.
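To make that decision rule explicit, here is a small illustrative helper (the function name and structure are my own, not part of the analysis itself) that runs both tests on a series and reports which way each one points:
# Illustrative only: encodes the rule above.
# ADF p-value < 0.05 and KPSS p-value > 0.05 both point towards stationarity.
check_stationarity <- function(x, alpha = 0.05) {
  adf_p  <- tseries::adf.test(x)$p.value
  kpss_p <- tseries::kpss.test(x, null = "Level")$p.value
  data.frame(
    adf_p  = adf_p,
    kpss_p = kpss_p,
    adf_says_stationary  = adf_p < alpha,
    kpss_says_stationary = kpss_p > alpha
  )
}
# e.g. check_stationarity(amazon_data1$Adjusted)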
adf.test(amazon_data1$Adjusted)
##
## Augmented Dickey-Fuller Test
##
## data: amazon_data1$Adjusted
## Dickey-Fuller = -2.6809, Lag order = 7, p-value = 0.2899
## alternative hypothesis: stationary
kpss.test(amazon_data1$Adjusted, null = c("Level"))
##
## KPSS Test for Level Stationarity
##
## data: amazon_data1$Adjusted
## KPSS Level = 4.0324, Truncation lag parameter = 5, p-value = 0.01
kpss.test(amazon_data1$Adjusted, null = c("Trend"))
##
## KPSS Test for Trend Stationarity
##
## data: amazon_data1$Adjusted
## KPSS Trend = 0.82615, Truncation lag parameter = 5, p-value = 0.01
The ADF p-value is 0.29 and the KPSS p-values are 0.01. Both tests suggest that the time series is non-stationary.
Since the data is variance non-stationary, I apply a log transformation, and since it is also mean non-stationary, I take the first difference of the logged series. (I use log1p() here and expm1() later to invert it; for prices in the thousands this is essentially equivalent to a plain log.)
amazon_data1_log_diff = amazon_data1 %>%
mutate(
amazon_log = log1p(amazon_data1$Adjusted),
amazon_diff = amazon_data1$Adjusted - lag(amazon_data1$Adjusted),
amazon_log_diff = amazon_log - lag(amazon_log)
)%>%
drop_na()
amazon_data1_log_diff %>%
ggplot() +
geom_line(aes(Date, amazon_log_diff)) +
theme_bw() +
ggtitle("Difference in Log Values, Amazon over Time") +
ylab("Adj Closing Price (Difference))") +
xlab("Date")
adf.test(amazon_data1_log_diff$amazon_log_diff)
##
## Augmented Dickey-Fuller Test
##
## data: amazon_data1_log_diff$amazon_log_diff
## Dickey-Fuller = -8.1374, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(amazon_data1_log_diff$amazon_log_diff, null = c("Level"))
##
## KPSS Test for Level Stationarity
##
## data: amazon_data1_log_diff$amazon_log_diff
## KPSS Level = 0.43492, Truncation lag parameter = 5, p-value = 0.0621
kpss.test(amazon_data1_log_diff$amazon_log_diff, null = c("Trend"))
##
## KPSS Test for Trend Stationarity
##
## data: amazon_data1_log_diff$amazon_log_diff
## KPSS Trend = 0.06855, Truncation lag parameter = 5, p-value = 0.1
After the log transformation and first difference, the series appears to be stationary, which the ADF and KPSS tests confirm: the ADF p-value is 0.01 (< 0.05) and the KPSS p-values are 0.06 and 0.1 (> 0.05).
Next, we can check whether the data follows an autoregressive or moving average process, and identify the order, by plotting the ACF and PACF.
par(mfrow = c(1,2))
acf(amazon_data1_log_diff$amazon_log_diff, lag.max = 20)
pacf(amazon_data1_log_diff$amazon_log_diff, lag.max = 20)
The ACF does not show a gradual dampening pattern, so the differenced series looks like a moving average process. None of the lags are clearly significant (lag 2 only just touches the significance bound), so the MA order is effectively 0, possibly 2. Combined with the single difference taken above, the ACF and PACF suggest an ARIMA order of (0,1,0).
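For context (not an extra modeling step): an ARIMA(0,1,0) on the log prices is simply a random walk, log(y_t) = log(y_(t-1)) + e_t with e_t white noise, so its one-step-ahead point forecast is just the last observed value. A minimal sketch, using the amazon_log column created above, that illustrates this:
rw_fit   <- arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 0))
last_log <- tail(amazon_data1_log_diff$amazon_log, 1)
one_step <- as.numeric(predict(rw_fit, n.ahead = 1)$pred)
all.equal(one_step, last_log)   # expected TRUE: the forecast equals the last log price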
Next, I fit several candidate ARIMA models and compare them using AIC and BIC.
AIC(
arima(amazon_data1_log_diff$amazon_log,order=c(0,1,0)),
arima(amazon_data1_log_diff$amazon_log,order=c(0,1,1)),
arima(amazon_data1_log_diff$amazon_log,order=c(0,1,2)),
arima(amazon_data1_log_diff$amazon_log,order=c(0,1,3)),
arima(amazon_data1_log_diff$amazon_log,order=c(1,1,0)),
arima(amazon_data1_log_diff$amazon_log,order=c(1,1,1)),
arima(amazon_data1_log_diff$amazon_log,order=c(2,1,0)),
arima(amazon_data1_log_diff$amazon_log,order=c(3,1,0))
)
## df AIC
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 0)) 1 -2300.224
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 1)) 2 -2298.931
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 2)) 3 -2296.935
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 3)) 4 -2298.841
## arima(amazon_data1_log_diff$amazon_log, order = c(1, 1, 0)) 2 -2298.931
## arima(amazon_data1_log_diff$amazon_log, order = c(1, 1, 1)) 3 -2297.299
## arima(amazon_data1_log_diff$amazon_log, order = c(2, 1, 0)) 3 -2296.935
## arima(amazon_data1_log_diff$amazon_log, order = c(3, 1, 0)) 4 -2299.113
According to AIC, the best model is ARIMA(0,1,0), since it has the lowest AIC among the candidates.
BIC(
arima(amazon_data1_log_diff$amazon_log,order=c(0,1,0)),
arima(amazon_data1_log_diff$amazon_log,order=c(0,1,1)),
arima(amazon_data1_log_diff$amazon_log,order=c(0,1,2)),
arima(amazon_data1_log_diff$amazon_log,order=c(0,1,3)),
arima(amazon_data1_log_diff$amazon_log,order=c(1,1,0)),
arima(amazon_data1_log_diff$amazon_log,order=c(1,1,1)),
arima(amazon_data1_log_diff$amazon_log,order=c(2,1,0)),
arima(amazon_data1_log_diff$amazon_log,order=c(3,1,0))
)
## df BIC
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 0)) 1 -2296.084
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 1)) 2 -2290.651
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 2)) 3 -2284.515
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 3)) 4 -2282.282
## arima(amazon_data1_log_diff$amazon_log, order = c(1, 1, 0)) 2 -2290.651
## arima(amazon_data1_log_diff$amazon_log, order = c(1, 1, 1)) 3 -2284.879
## arima(amazon_data1_log_diff$amazon_log, order = c(2, 1, 0)) 3 -2284.515
## arima(amazon_data1_log_diff$amazon_log, order = c(3, 1, 0)) 4 -2282.554
According to BIC as well, the best model is ARIMA(0,1,0), as it has the lowest BIC among the candidates.
auto.arima(amazon_data1_log_diff$amazon_log,stationary=FALSE,allowdrift=FALSE,
seasonal=FALSE,stepwise=FALSE,approximation=FALSE)
## Series: amazon_data1_log_diff$amazon_log
## ARIMA(0,1,0)
##
## sigma^2 = 0.00041: log likelihood = 1151.11
## AIC=-2300.22 AICc=-2300.22 BIC=-2296.08
The auto.arima() procedure also selects ARIMA(0,1,0), consistent with the AIC and BIC comparisons and with the ACF and PACF. I therefore choose ARIMA(0,1,0) as the model for forecasting.
best_mod = arima(amazon_data1_log_diff$amazon_log,order=c(0,1,0))
forecast::checkresiduals(best_mod)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0)
## Q* = 8.6668, df = 10, p-value = 0.564
##
## Model df: 0. Total lags used: 10
The residuals are approximately normally distributed, and the ACF plot above shows no autocorrelation among them. The residuals appear to be white noise, so the model is performing well.
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 1)
##
## Box-Ljung test
##
## data: best_mod$residuals
## X-squared = 0.66066, df = 1, p-value = 0.4163
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 2)
##
## Box-Ljung test
##
## data: best_mod$residuals
## X-squared = 0.66066, df = 2, p-value = 0.7187
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 3)
##
## Box-Ljung test
##
## data: best_mod$residuals
## X-squared = 4.5044, df = 3, p-value = 0.2119
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 5)
##
## Box-Ljung test
##
## data: best_mod$residuals
## X-squared = 5.7099, df = 5, p-value = 0.3355
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 10)
##
## Box-Ljung test
##
## data: best_mod$residuals
## X-squared = 8.6668, df = 10, p-value = 0.564
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 15)
##
## Box-Ljung test
##
## data: best_mod$residuals
## X-squared = 10.668, df = 15, p-value = 0.7757
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 20)
##
## Box-Ljung test
##
## data: best_mod$residuals
## X-squared = 11.84, df = 20, p-value = 0.9215
The Ljung-Box test for residual autocorrelation indicates autocorrelation at a given lag when its p-value is below 0.05. Since the p-values above are greater than 0.05 at every lag tested, we can conclude that there is no residual autocorrelation.
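As a compact alternative to the individual Box.test() calls above, the same checks can be collected in a single pass (a sketch; the lags simply mirror those used above):
lags <- c(1, 2, 3, 5, 10, 15, 20)
lb_p <- sapply(lags, function(l) {
  Box.test(best_mod$residuals, type = 'Ljung-Box', lag = l)$p.value
})
data.frame(lag = lags, p_value = round(lb_p, 4))   # all p-values > 0.05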
best_mod = arima(amazon_data1_log_diff$amazon_log,order=c(0,1,0))
resid = best_mod$residuals
pred = resid + amazon_data1_log_diff$amazon_log
ggplot() +
geom_line(aes(amazon_data1_log_diff$Date,amazon_data1_log_diff$amazon_log))+
geom_line(aes(amazon_data1_log_diff$Date,pred),color='blue',alpha=0.4)+
theme_bw()+
xlab("Date")+
ylab("Log Amazon")
The model seems to fit reasonably well, but there are many peaks that it does not capture.
RMSE = sqrt(mean((expm1(pred) - expm1(amazon_data1_log_diff$amazon_log))^2,na.rm=T))
RMSE
## [1] 62.84307
The RMSE for the model is 62.8, which suggests that, in-sample, the model predicts the Amazon adjusted closing price to within roughly $63 on average.
As I am using daily data, I forecast 150 days ahead, i.e. roughly five months beyond the end of the observed data.
best_mod %>%
forecast(h = 150) %>%
autoplot()
Because the ARIMA model was fit on log values, the forecasts need to be transformed back to the original scale (with expm1(), since log1p() was used for the transformation).
prediction = predict(best_mod,n.ahead=150)
pred_data = data.frame(
pred = prediction,
Date = as.Date(ymd(max(amazon_data1_log_diff$Date)):ymd(max(amazon_data1_log_diff$Date)+149))
)
pred_merge = amazon_data1 %>%
full_join(pred_data) %>%
mutate(
pred_high = expm1(pred.pred+2*pred.se),
pred_low = expm1(pred.pred-2*pred.se),
pred.pred = expm1(pred.pred),
error = pred.pred - Adjusted
)
## Joining, by = "Date"
pred_merge %>%
ggplot()+
geom_line(aes(Date,Adjusted))+
geom_line(aes(Date,pred.pred),color='blue')+
geom_ribbon(aes(x=Date,ymin=pred_low,ymax = pred_high),fill='blue',alpha=0.2)
The forecast generated by the ARIMA model looks reasonable, but the historical trend shows the stock price increasing over time. The forecast does not capture that trend: ARIMA(0,1,0) without drift is a random walk, so the point forecast stays flat at the last observed value for the entire five-month horizon.
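One way to let the forecast pick up that upward drift would be a random walk with drift on the log series, i.e. forecast::Arima() with include.drift = TRUE. The auto.arima() call above was run with allowdrift = FALSE, so the sketch below is only an illustration of this alternative, not the model selected in this project:
# Random walk with drift on the log prices (illustrative alternative)
drift_mod <- forecast::Arima(amazon_data1_log_diff$amazon_log,
                             order = c(0, 1, 0), include.drift = TRUE)
drift_fc  <- forecast::forecast(drift_mod, h = 150)
head(expm1(drift_fc$mean))   # back-transformed point forecasts now trend upward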