Amazon Stock Price Analysis

About the Data

The data set used for this time series analysis is Amazon stock price data, obtained through the quantmod package, which pulls data from Yahoo Finance. The time period chosen is the last two years, April 2020 to April 2022, at daily frequency. The goal of this project is to predict the stock price of Amazon.

Motivation

I chose this data set because my MBA major is in Finance and Operations. While studying, I developed an interest in stock prices and how they perform over time, so this project will help me further advance my knowledge of stock price behavior.

Data Description

  • Date - The date of the trading day
  • Open - The price at which the stock opened
  • High - The highest price during the given period
  • Low - The lowest price during the given period
  • Close - The price at which the stock closed
  • Volume - The total number of shares traded
  • Adj Close - The adjusted closing price, which factors in corporate actions such as stock splits, dividends, and rights offerings (see the sketch below)
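
For reference, quantmod can also recompute adjusted OHLC values itself. A minimal sketch (not part of the original analysis), assuming the same AMZN data fetched below; adjustOHLC() back-adjusts Open/High/Low/Close using the adjusted close column:

library(quantmod)  # also loaded with the other packages below
amzn <- getSymbols("AMZN", src = "yahoo",
                   from = "2020-04-02", to = "2022-04-02", auto.assign = FALSE)
amzn_adj <- adjustOHLC(amzn, use.Adjusted = TRUE)  # back-adjust OHLC columns
head(Ad(amzn_adj))  # Ad() extracts the adjusted close series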

Loading required packages

library(quantmod)
library(data.table)
library(psych)
library(ggplot2)
library(tidyverse)
library(sjPlot)
library(dplyr)
library(tseries)
library(zoo)
library(forecast)
library(ymd)
library(bizdays)
options("getSymbols.warning4.0" = FALSE)  # suppress getSymbols' legacy-behavior warning
amazon_data <- getSymbols(Symbols = "AMZN", src = "yahoo",
                          from = "2020-04-02", to = "2022-04-02", auto.assign = FALSE)
amazon_data1 <- as.data.table(amazon_data, keep.rownames = "Date")
names(amazon_data1) <- c("Date", "Open", "High", "Low", "Close", "Volume", "Adjusted")
head(amazon_data1)
##          Date    Open    High     Low   Close  Volume Adjusted
## 1: 2020-04-02 1901.64 1927.53 1890.00 1918.83 4336000  1918.83
## 2: 2020-04-03 1911.15 1926.33 1889.15 1906.59 3609900  1906.59
## 3: 2020-04-06 1936.00 1998.52 1930.02 1997.59 5773200  1997.59
## 4: 2020-04-07 2017.11 2035.72 1997.62 2011.60 5114000  2011.60
## 5: 2020-04-08 2021.00 2044.00 2011.15 2043.00 3977300  2043.00
## 6: 2020-04-09 2044.30 2053.00 2017.66 2042.76 4646500  2042.76
attach(amazon_data1)  # note: attach() is generally discouraged; columns are also referenced with $ below
str(amazon_data1)
## Classes 'data.table' and 'data.frame':   466 obs. of  7 variables:
##  $ Date    : Date, format: "2020-04-02" "2020-04-03" ...
##  $ Open    : num  1902 1911 1936 2017 2021 ...
##  $ High    : num  1928 1926 1999 2036 2044 ...
##  $ Low     : num  1890 1889 1930 1998 2011 ...
##  $ Close   : num  1919 1907 1998 2012 2043 ...
##  $ Volume  : num  4336000 3609900 5773200 5114000 3977300 ...
##  $ Adjusted: num  1919 1907 1998 2012 2043 ...
##  - attr(*, ".internal.selfref")=<externalptr>
describe(amazon_data1)
##          vars   n       mean         sd     median    trimmed        mad
## Date        1 466        NaN         NA         NA        NaN         NA
## Open        2 466    3163.41     343.92    3230.00    3213.89     197.61
## High        3 466    3199.02     343.02    3264.18    3249.47     203.10
## Low         4 466    3123.89     341.74    3192.38    3172.86     208.37
## Close       5 466    3161.25     340.44    3228.22    3210.71     213.63
## Volume      6 466 3999165.67 1628510.40 3580750.00 3777240.11 1272737.97
## Adjusted    7 466    3161.25     340.44    3228.22    3210.71     213.63
##                 min         max       range  skew kurtosis       se
## Date            Inf        -Inf        -Inf    NA       NA       NA
## Open        1901.64     3744.00     1842.36 -1.42     1.85    15.93
## High        1926.33     3773.08     1846.75 -1.42     1.88    15.89
## Low         1889.15     3696.79     1807.64 -1.37     1.67    15.83
## Close       1906.59     3731.41     1824.82 -1.38     1.72    15.77
## Volume   1451900.00 12640500.00 11188600.00  1.63     3.92 75439.29
## Adjusted    1906.59     3731.41     1824.82 -1.38     1.72    15.77
sum(is.na(amazon_data1))
## [1] 0

Boxplots - Outliers in Amazon Data

par(mfrow=c(2,3))  # 2 x 3 grid for the six variables
boxplot(amazon_data1$Open,main = "Open")
boxplot(amazon_data1$High,main = "High")
boxplot(amazon_data1$Low,main = "Low")
boxplot(amazon_data1$Close,main = "Close")
boxplot(amazon_data1$Volume,main = "Volume")
boxplot(amazon_data1$Adjusted,main = "Adjusted")

Amazon Data Distribution

par(mfrow=c(2,3))  # 2 x 3 grid for the six variables
hist(amazon_data1$Open, freq = FALSE, main = "Open")
lines(density(amazon_data1$Open), lwd=5, col='blue')
hist(amazon_data1$High, freq = FALSE, main = "High")
lines(density(amazon_data1$High), lwd=5, col='blue')
hist(amazon_data1$Low, freq = FALSE, main = "Low")
lines(density(amazon_data1$Low), lwd=5, col='blue')
hist(amazon_data1$Close, freq = FALSE, main = "Close")
lines(density(amazon_data1$Close), lwd=5, col='blue')
hist(amazon_data1$Volume, freq = FALSE, main = "Volume")
lines(density(amazon_data1$Volume), lwd=5, col='blue')
hist(amazon_data1$Adjusted, freq = FALSE, main = "Adjusted")
lines(density(amazon_data1$Adjusted), lwd=5, col='blue')

Correlation

GGally::ggpairs(
  data = amazon_data1[, c(2,3,4,5,6,7)]
)

ggplot(amazon_data1)+
  geom_line(aes(Date,Adjusted))+
  theme_bw()+
  xlab("Date")+
  ylab("Adjusted Closing Price")+
  labs(
    title = 'Adj.Closing Price over time',
    subtitle = 'Apr 2020 - Apr 2022'
  )+ 
  geom_smooth(aes(Date,Adjusted),method='lm',color='red', size=0.1)

ggplot(amazon_data1)+
  geom_line(aes(Date,Volume))+
  theme_bw()+
  xlab("Date")+
  ylab("Volume Traded")+
  labs(
    title = 'Volume traded over time',
    subtitle = 'Apr 2020 - Apr 2022'
  )

Linear Regression

amazon_data1_lm <- lm(Adjusted ~ Date, data = amazon_data1)
summary(amazon_data1_lm)
## 
## Call:
## lm(formula = Adjusted ~ Date, data = amazon_data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -856.55 -112.06   34.22  166.16  588.31 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.897e+04  1.114e+03  -17.04   <2e-16 ***
## Date         1.184e+00  5.958e-02   19.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 250.5 on 464 degrees of freedom
## Multiple R-squared:  0.4599, Adjusted R-squared:  0.4587 
## F-statistic: 395.1 on 1 and 464 DF,  p-value: < 2.2e-16
tab_model(amazon_data1_lm)

                      Adjusted
  Predictors          Estimates    CI                       p
  (Intercept)         -18973.98    -21162.48 – -16785.48    <0.001
  Date                     1.18    1.07 – 1.30              <0.001
  Observations        466
  R2 / R2 adjusted    0.460 / 0.459

Adjusted Closing Price Over Time

ggplot(amazon_data1)+
  geom_line(aes(Date,Adjusted))+
  theme_bw()+
  xlab("Date")+
  ylab("Adjusted Closing Price")+
  labs(
    title = 'Adj.Closing Price over time',
    subtitle = 'Apr 2020 - Apr 2022'
  )

The adjusted closing price in the graph above increases over time from April 2020 to April 2022. To check whether the time series is stationary, I will compute the rolling mean and rolling standard deviation.

Mean and Variance Stationarity Check

amazon_data1_roll <- amazon_data1 %>%
  mutate(
    adj_close_mean = zoo::rollmean(
      Adjusted,
      k=180,
      fill = NA),
    adj_close_sd = zoo::rollapply(
      Adjusted,
      FUN = sd,
      width = 180,
      fill = NA)
    )
amazon_data1_rollmean <- amazon_data1_roll %>%
  ggplot() +
  geom_line(aes(Date, adj_close_mean)) +
  theme_bw() +
  ggtitle("Amazon Mean over Time (6 month rolling window)") +
  ylab("Adj Closing Price") +
  xlab("Date")
amazon_data1_rollsd <- amazon_data1_roll %>%
  ggplot() +
  geom_line(aes(Date, adj_close_sd)) +
  theme_bw() +
  ggtitle("Differenced Amazon SD over Time (6 month rolling window)") +
  ylab("Adj Closing Price") +
  xlab("Date")
amazon_data1_rollmean

amazon_data1_rollsd

The rolling-window graphs above show that the adjusted closing price series is non-stationary in both mean and variance.

Next, I perform the Augmented Dickey-Fuller (ADF) test and the KPSS test for stationarity. Because the ADF test's null hypothesis is non-stationarity, it should give a p-value < 0.05 for stationary data; the KPSS test's null hypothesis is stationarity, so it should give a p-value > 0.05 for stationary data.

ADF and KPSS Tests for Stationarity

adf.test(amazon_data1$Adjusted)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  amazon_data1$Adjusted
## Dickey-Fuller = -2.6809, Lag order = 7, p-value = 0.2899
## alternative hypothesis: stationary
kpss.test(amazon_data1$Adjusted, null = c("Level"))
## 
##  KPSS Test for Level Stationarity
## 
## data:  amazon_data1$Adjusted
## KPSS Level = 4.0324, Truncation lag parameter = 5, p-value = 0.01
kpss.test(amazon_data1$Adjusted, null = c("Trend"))
## 
##  KPSS Test for Trend Stationarity
## 
## data:  amazon_data1$Adjusted
## KPSS Trend = 0.82615, Truncation lag parameter = 5, p-value = 0.01

The ADF test p-value is 0.29 and the KPSS p-values are 0.01. Both tests suggest that the time series is non-stationary.

Since the data is variance non-stationary, I apply a log transformation; since it is also mean non-stationary, I then take the first difference of the logged series.

Log Transformation and Differencing

amazon_data1_log_diff = amazon_data1 %>%
  mutate(
    amazon_log = log1p(Adjusted),                    # log1p pairs with expm1 used later
    amazon_diff = Adjusted - lag(Adjusted),
    amazon_log_diff = amazon_log - lag(amazon_log)
  ) %>%
  drop_na()

amazon_data1_log_diff %>%
  ggplot() +
  geom_line(aes(Date, amazon_log_diff)) +
  theme_bw() +
  ggtitle("Difference in Log Values, Amazon over Time") +
  ylab("Adj Closing Price (Difference))") +
  xlab("Date")

adf.test(amazon_data1_log_diff$amazon_log_diff)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  amazon_data1_log_diff$amazon_log_diff
## Dickey-Fuller = -8.1374, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(amazon_data1_log_diff$amazon_log_diff, null = c("Level"))
## 
##  KPSS Test for Level Stationarity
## 
## data:  amazon_data1_log_diff$amazon_log_diff
## KPSS Level = 0.43492, Truncation lag parameter = 5, p-value = 0.0621
kpss.test(amazon_data1_log_diff$amazon_log_diff, null = c("Trend"))
## 
##  KPSS Test for Trend Stationarity
## 
## data:  amazon_data1_log_diff$amazon_log_diff
## KPSS Trend = 0.06855, Truncation lag parameter = 5, p-value = 0.1

After the log transformation and first difference, the time series appears stationary, and the ADF and KPSS tests confirm it: the ADF p-value is 0.01 (< 0.05) and the KPSS p-values are 0.06 and 0.1 (> 0.05).

Now we can check whether the series follows an autoregressive or moving-average process, and determine the order, by plotting the ACF and PACF.

AR and MA Order

par(mfrow = c(1,2))
acf(amazon_data1_log_diff$amazon_log_diff, lag.max = 20)
pacf(amazon_data1_log_diff$amazon_log_diff, lag.max = 20)

The ACF shows no dampening pattern and no statistically significant lags, which suggests a moving-average order of 0 (an order of 2 is also conceivable, since the second lag just touches the significance bound). The PACF is similarly flat, suggesting an autoregressive order of 0. Together with the single difference already taken, the ACF and PACF point to an ARIMA order of (0,1,0).
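
As a cross-check on the differencing order, the forecast package's ndiffs() estimates how many differences a series needs to become stationary. A small sketch, not part of the original output; given the test results above, both calls should return 1 for the undifferenced log series:

# estimate the required differencing order using KPSS and ADF criteria
forecast::ndiffs(amazon_data1_log_diff$amazon_log, test = "kpss")
forecast::ndiffs(amazon_data1_log_diff$amazon_log, test = "adf")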

Best ARIMA Model

Fitting several candidate ARIMA models and comparing them on AIC and BIC to find the best one.

AIC(
  arima(amazon_data1_log_diff$amazon_log,order=c(0,1,0)),
  arima(amazon_data1_log_diff$amazon_log,order=c(0,1,1)),
  arima(amazon_data1_log_diff$amazon_log,order=c(0,1,2)),
  arima(amazon_data1_log_diff$amazon_log,order=c(0,1,3)),
  arima(amazon_data1_log_diff$amazon_log,order=c(1,1,0)),
  arima(amazon_data1_log_diff$amazon_log,order=c(1,1,1)),
  arima(amazon_data1_log_diff$amazon_log,order=c(2,1,0)),
  arima(amazon_data1_log_diff$amazon_log,order=c(3,1,0))
)
##                                                             df       AIC
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 0))  1 -2300.224
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 1))  2 -2298.931
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 2))  3 -2296.935
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 3))  4 -2298.841
## arima(amazon_data1_log_diff$amazon_log, order = c(1, 1, 0))  2 -2298.931
## arima(amazon_data1_log_diff$amazon_log, order = c(1, 1, 1))  3 -2297.299
## arima(amazon_data1_log_diff$amazon_log, order = c(2, 1, 0))  3 -2296.935
## arima(amazon_data1_log_diff$amazon_log, order = c(3, 1, 0))  4 -2299.113

As per AIC, the best model is ARIMA(0,1,0), since it has the lowest value among the candidates.

BIC(
  arima(amazon_data1_log_diff$amazon_log,order=c(0,1,0)),
  arima(amazon_data1_log_diff$amazon_log,order=c(0,1,1)),
  arima(amazon_data1_log_diff$amazon_log,order=c(0,1,2)),
  arima(amazon_data1_log_diff$amazon_log,order=c(0,1,3)),
  arima(amazon_data1_log_diff$amazon_log,order=c(1,1,0)),
  arima(amazon_data1_log_diff$amazon_log,order=c(1,1,1)),
  arima(amazon_data1_log_diff$amazon_log,order=c(2,1,0)),
  arima(amazon_data1_log_diff$amazon_log,order=c(3,1,0))
)
##                                                             df       BIC
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 0))  1 -2296.084
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 1))  2 -2290.651
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 2))  3 -2284.515
## arima(amazon_data1_log_diff$amazon_log, order = c(0, 1, 3))  4 -2282.282
## arima(amazon_data1_log_diff$amazon_log, order = c(1, 1, 0))  2 -2290.651
## arima(amazon_data1_log_diff$amazon_log, order = c(1, 1, 1))  3 -2284.879
## arima(amazon_data1_log_diff$amazon_log, order = c(2, 1, 0))  3 -2284.515
## arima(amazon_data1_log_diff$amazon_log, order = c(3, 1, 0))  4 -2282.554

As per BIC as well, the best model is ARIMA(0,1,0), since it has the lowest value among the candidates.

auto.arima(amazon_data1_log_diff$amazon_log, stationary = FALSE, allowdrift = FALSE,
           seasonal = FALSE, stepwise = FALSE, approximation = FALSE)
## Series: amazon_data1_log_diff$amazon_log 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.00041:  log likelihood = 1151.11
## AIC=-2300.22   AICc=-2300.22   BIC=-2296.08

The auto.arima procedure also selects (0,1,0) as the best model, consistent with the AIC and BIC comparisons and with the ACF and PACF plots, so I choose ARIMA(0,1,0) as the model for forecasting. Note that ARIMA(0,1,0) on log prices is simply a random walk: each log price equals the previous log price plus white noise.

best_mod = arima(amazon_data1_log_diff$amazon_log,order=c(0,1,0))
forecast::checkresiduals(best_mod)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 8.6668, df = 10, p-value = 0.564
## 
## Model df: 0.   Total lags used: 10

The residuals appear approximately normally distributed. From the ACF plot above, there is no autocorrelation within the residuals. The residuals seem to be white noise, so the model is performing well.
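
The normality of the residuals can also be checked formally. A sketch, not part of the original output; note that daily stock returns are often heavy-tailed, so strict normality may be rejected even when the model is otherwise adequate:

# Q-Q plot plus Shapiro-Wilk test of residual normality
qqnorm(best_mod$residuals)
qqline(best_mod$residuals)
shapiro.test(best_mod$residuals)  # p > 0.05 would support normality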

Ljung-Box Test for Residual Autocorrelation

Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 1)
## 
##  Box-Ljung test
## 
## data:  best_mod$residuals
## X-squared = 0.66066, df = 1, p-value = 0.4163
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 2)
## 
##  Box-Ljung test
## 
## data:  best_mod$residuals
## X-squared = 0.66066, df = 2, p-value = 0.7187
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 3)
## 
##  Box-Ljung test
## 
## data:  best_mod$residuals
## X-squared = 4.5044, df = 3, p-value = 0.2119
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 5)
## 
##  Box-Ljung test
## 
## data:  best_mod$residuals
## X-squared = 5.7099, df = 5, p-value = 0.3355
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 10)
## 
##  Box-Ljung test
## 
## data:  best_mod$residuals
## X-squared = 8.6668, df = 10, p-value = 0.564
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 15)
## 
##  Box-Ljung test
## 
## data:  best_mod$residuals
## X-squared = 10.668, df = 15, p-value = 0.7757
Box.test(best_mod$residuals, type = 'Ljung-Box', lag = 20)
## 
##  Box-Ljung test
## 
## data:  best_mod$residuals
## X-squared = 11.84, df = 20, p-value = 0.9215

The Ljung-Box test indicates residual autocorrelation at a given lag when its p-value is below 0.05. From the results above, the p-values at all tested lags are greater than 0.05, so we can conclude there is no significant autocorrelation at any of them.
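
The repeated calls above can be collapsed into a single vectorized check; an equivalent sketch:

# Ljung-Box p-values across several lags in one call
lags <- c(1, 2, 3, 5, 10, 15, 20)
sapply(lags, function(l) Box.test(best_mod$residuals, type = "Ljung-Box", lag = l)$p.value)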

Actual vs Predicted Values from the Model

best_mod = arima(amazon_data1_log_diff$amazon_log, order = c(0,1,0))
resid = best_mod$residuals
pred = amazon_data1_log_diff$amazon_log - resid  # fitted values: actual minus residuals
ggplot() +
  geom_line(aes(amazon_data1_log_diff$Date,amazon_data1_log_diff$amazon_log))+
  geom_line(aes(amazon_data1_log_diff$Date,pred),color='blue',alpha=0.4)+
  theme_bw()+
  xlab("Date")+
  ylab("Log Amazon")

The model seems to track the series well, but there are many peaks it does not capture.

RMSE = sqrt(mean((expm1(pred) - expm1(amazon_data1_log_diff$amazon_log))^2,na.rm=T))
RMSE
## [1] 62.84307

The RMSE for the model is about 62.8, which suggests the model's in-sample predictions of Amazon's adjusted closing stock price are off by roughly $63 on average.
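
RMSE is scale-dependent, so a scale-free error measure is a useful complement. A sketch (not in the original analysis) computing MAE and MAPE from the same fitted values:

# mean absolute error and mean absolute percentage error, in-sample
actual <- expm1(amazon_data1_log_diff$amazon_log)
fitted_vals <- expm1(pred)
MAE  <- mean(abs(fitted_vals - actual), na.rm = TRUE)
MAPE <- 100 * mean(abs((fitted_vals - actual) / actual), na.rm = TRUE)
c(MAE = MAE, MAPE = MAPE)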

Building Forecast - 150 Days (~5 Months)

Since the data has daily frequency, I forecast 150 steps ahead, i.e. about 5 months beyond the end of the observed data. (Strictly, the model's steps are trading days, so mapping them onto calendar days below is an approximation.)

best_mod %>% 
forecast(h = 150) %>%
  autoplot()

As the ARIMA model was fit on log-transformed values, we need to invert the transformation (with expm1, the inverse of log1p) to return to the original price scale.

prediction = predict(best_mod, n.ahead = 150)
pred_data = data.frame(
  pred = prediction,
  # forecast dates start the day after the last observed date;
  # calendar days are used as an approximation of trading days
  Date = seq.Date(from = max(amazon_data1_log_diff$Date) + 1,
                  by = "day", length.out = 150)
)

pred_merge = amazon_data1 %>% 
  full_join(pred_data) %>%
  mutate(
    pred_high = expm1(pred.pred+2*pred.se),
    pred_low = expm1(pred.pred-2*pred.se),
    pred.pred = expm1(pred.pred),
    error = pred.pred - Adjusted
  )
## Joining, by = "Date"
pred_merge %>% 
  ggplot()+
  geom_line(aes(Date,Adjusted))+
  geom_line(aes(Date,pred.pred),color='blue')+
  geom_ribbon(aes(x=Date,ymin=pred_low,ymax = pred_high),fill='blue',alpha=0.2)

The forecast generated from the ARIMA(0,1,0) model seems reasonable, but the historical trend shows the stock price increasing over time. Because a random walk without drift forecasts a constant value, the predictions for the next 5 months are flat and do not capture that upward trend.
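
One way to capture the upward trend would be a random walk with drift, i.e. ARIMA(0,1,0) plus a drift term (the auto.arima call above was run with allowdrift = FALSE, so this variant was excluded by construction). A hedged sketch using forecast::Arima:

# random walk with drift: the forecast inherits the series' average trend
drift_mod <- forecast::Arima(amazon_data1_log_diff$amazon_log,
                             order = c(0, 1, 0), include.drift = TRUE)
forecast::forecast(drift_mod, h = 150) %>% autoplot()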