ARIMA Trading Revised

library(quantmod)

## 필요한 패키지를 로딩중입니다: xts

## 필요한 패키지를 로딩중입니다: zoo

## 
## 다음의 패키지를 부착합니다: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 필요한 패키지를 로딩중입니다: TTR

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(magrittr)

getSymbols(Symbols = 'NVDA', from = '2016-01-01')

## [1] "NVDA"

head(NVDA)

##            NVDA.Open NVDA.High NVDA.Low NVDA.Close NVDA.Volume NVDA.Adjusted
## 2016-01-04    8.0725    8.1450   8.0100     8.0925    35807600      7.902439
## 2016-01-05    8.2450    8.3600   8.1250     8.2225    49027200      8.029387
## 2016-01-06    8.0875    8.1250   7.7900     7.8825    44934400      7.697371
## 2016-01-07    7.6850    7.7375   7.4700     7.5700    64530400      7.392212
## 2016-01-08    7.6675    7.6750   7.3925     7.4075    39847200      7.233527
## 2016-01-11    7.4150    7.4725   7.2875     7.4200    40937200      7.245734

nvda <- NVDA$NVDA.Adjusted
tail(nvda)

##            NVDA.Adjusted
## 2023-05-05        286.80
## 2023-05-08        291.51
## 2023-05-09        285.71
## 2023-05-10        288.85
## 2023-05-11        285.78
## 2023-05-12        283.40

#Visualize Stock Price
library(ggplot2)
nvda %>%
  ggplot(aes(x=index(nvda),y=NVDA.Adjusted))+
  geom_line(color='#4373B6')+
  labs(x='Date',y='Price',title='NVDA Daily Price (2016 - 2023)')+
  theme_bw()

#Compute return using diff log
nvda_return <- diff(log(nvda)) %>% na.omit()
head(nvda_return)

##            NVDA.Adjusted
## 2016-01-05   0.015936740
## 2016-01-06  -0.042229344
## 2016-01-07  -0.040451828
## 2016-01-08  -0.021700268
## 2016-01-11   0.001686136
## 2016-01-12   0.016706137

#Visualize Stock Return
nvda_return %>%
  ggplot(aes(x=index(nvda_return),y=NVDA.Adjusted))+
  geom_line(color='#4373B6')+
  labs(x='Date',y='Return',title='NVDA Daily Return (2016 - 2023)')+
  theme_bw()+
  scale_y_continuous(labels = scales::percent)

length(nvda_return)

## [1] 1852

#Use data from 2016 to 2022 define the order of (p,d,q) for arima
nvda_return[1761,]

##            NVDA.Adjusted
## 2022-12-30  0.0007530183

train <- nvda_return[1:1761,]

head(train)

##            NVDA.Adjusted
## 2016-01-05   0.015936740
## 2016-01-06  -0.042229344
## 2016-01-07  -0.040451828
## 2016-01-08  -0.021700268
## 2016-01-11   0.001686136
## 2016-01-12   0.016706137

tail(train)

##            NVDA.Adjusted
## 2022-12-22 -0.0730223754
## 2022-12-23 -0.0087085104
## 2022-12-27 -0.0740270523
## 2022-12-28 -0.0060375816
## 2022-12-29  0.0396015280
## 2022-12-30  0.0007530183

library(forecast)

#Custom code for ACF and PACF Plot
#Refernce: https://rh8liuqy.github.io/ACF_PACF_by_ggplot2.html#correlogram-formula
ggplot.corr <- function(data, lag.max = 24, ci = 0.95, large.sample.size = TRUE, horizontal = TRUE,...) {
  options(dplyr.banner_expression = FALSE)
  
  require(ggplot2)
  require(dplyr)
  require(cowplot)
  
  if(horizontal == TRUE) {numofrow <- 1} else {numofrow <- 2}
  
  list.acf <- acf(data, lag.max = lag.max, type = "correlation", plot = FALSE)
  N <- as.numeric(list.acf$n.used)
  df1 <- data.frame(lag = list.acf$lag, acf = list.acf$acf)
  df1$lag.acf <- dplyr::lag(df1$acf, default = 0)
  df1$lag.acf[2] <- 0
  df1$lag.acf.cumsum <- cumsum((df1$lag.acf)^2)
  df1$acfstd <- sqrt(1/N * (1 + 2 * df1$lag.acf.cumsum))
  df1$acfstd[1] <- 0
  df1 <- dplyr::select(df1, lag, acf, acfstd)
  
  list.pacf <- acf(data, lag.max = lag.max, type = "partial", plot = FALSE)
  df2 <- data.frame(lag = list.pacf$lag,pacf = list.pacf$acf)
  df2$pacfstd <- sqrt(1/N)
  
  if(large.sample.size == TRUE) {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  else {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  cowplot::plot_grid(plot.acf, plot.pacf, nrow = numofrow)
}

#Custom code for ARIMA Model ACF PACF reisduals
#Refernce: https://rh8liuqy.github.io/ACF_PACF_by_ggplot2.html#correlogram-formula
library(tidyr)

## 
## 다음의 패키지를 부착합니다: 'tidyr'

## The following object is masked from 'package:magrittr':
## 
##     extract

arima.order <- c(2,0,2)

p <- 0:arima.order[1]
d <- 0:arima.order[2] #Revise this part depending on how many differencing was conducted
q <- 0:arima.order[3]

df.arima.order <- data.frame(t(expand.grid(p=p,d=d,q=q)))
df.colnames <- data.frame(expand.grid(p=p,d=d,q=q))
df.colnames$type <- with(df.colnames,paste0
                         ("ARIMA(",p,",",d,",",q,")"))

list.arima.model <- lapply(df.arima.order, 
                           function(data, x, ...) { 
                             arima(x = data, order = x, ...)
                             }, 
                           data = train) # Data has to be in ts or xts format

df.arima.residual <- sapply(list.arima.model, 
                            function(x) {
                              x$residuals
                              })
df.arima.residual <- data.frame(df.arima.residual)
colnames(df.arima.residual) <- df.colnames$type
df.arima.residual$time <- time(train) # Data has to be in ts or xts format


df.arima.residual.plot <- gather(data = df.arima.residual, value = "residual", key = "type",-time)


df.acf <- sapply(subset(df.arima.residual, select = -time), 
       function(x, lag.max,...) {
         acf(x = x, lag.max = lag.max, plot = FALSE)$acf
       },
       lag.max = 24)
df.acf <- data.frame(df.acf)
colnames(df.acf) <- df.colnames$type

df.acf$lag <- 0:(dim(df.acf)[1]-1)
df.acf.plot <- gather(df.acf, 
                      value = "value", 
                      key = "Model",
                      -lag)


df.pacf <- sapply(subset(df.arima.residual, select = -time),
                  function(x, lag.max,...) {
                    acfvalue <- as.numeric(acf(x = x, 
                                               lag.max = lag.max,
                                               type = "partial",
                                               plot = FALSE)$acf)
                    return(c(1,acfvalue))
                    },
                  lag.max = 24)
df.pacf <- data.frame(df.pacf)
colnames(df.pacf) <- df.colnames$type

df.pacf$lag <- 0:(dim(df.pacf)[1]-1)
df.pacf.plot <- gather(df.pacf, 
                       value = "value", 
                       key = "Model",
                       -lag)

fig.acf <- ggplot(data = df.acf.plot, aes(x = lag, y = value)) +
  geom_col(width = 0.4, fill = "#667fa2") +
  geom_hline(yintercept = qnorm((1+0.95)/2)/sqrt(dim(df.acf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  geom_hline(yintercept = -qnorm((1+0.95)/2)/sqrt(dim(df.acf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  scale_y_continuous(name = element_blank()) +
  scale_x_continuous(breaks = seq(0,24,4)) +
  ggtitle("ACF of residuals") +
  facet_wrap(~Model) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))

fig.pacf <- ggplot(data = df.pacf.plot, aes(x = lag, y = value)) +
  geom_col(width = 0.4, fill = "#667fa2") +
  geom_hline(yintercept = qnorm((1+0.95)/2)/sqrt(dim(df.pacf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  geom_hline(yintercept = -qnorm((1+0.95)/2)/sqrt(dim(df.pacf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  scale_y_continuous(name = element_blank()) +
  scale_x_continuous(breaks = seq(0,24,4)) +
  ggtitle("PACF of residuals") +
  facet_wrap(~Model) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))

fig.residual <- ggplot(data = df.arima.residual.plot, aes(x = time, y = residual)) +
  geom_line(colour = "#667fa2") +
  ggtitle("Residuals") +
  facet_wrap(~type) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))

ggplot.corr(train,large.sample.size = FALSE)

## 필요한 패키지를 로딩중입니다: dplyr

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:xts':
## 
##     first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## 필요한 패키지를 로딩중입니다: cowplot

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## i Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

From the correlogram, it seems that AR(1), MA(1), or ARIMA(1,0,1) is adequate for forecast. We will see AIC for final order for ARIMA.

fig.acf

fig.pacf

fig.residual

#Custom Code for AIC 
aic <- sapply(list.arima.model, function(X) {X$aic})
aic <- round(as.numeric(aic),2)
df.aic.plot <- data.frame(aic = aic, p = df.colnames$p, q = df.colnames$q)

ggplot(data = df.aic.plot, aes(x = p, y = q)) + 
  geom_raster(aes(fill = aic)) +
  geom_text(aes( label = aic), colour = "black") + 
  scale_x_continuous(expand = c(0.05,0.05)) +
  scale_color_continuous( high = "#FFA800", low = "#4AC200") +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) + 
  scale_fill_distiller(palette = "Spectral", direction = -1) +
  ggtitle('Akaike Information Criterion (AIC)')

Although ARIMA(0,0,2) gives the lowest AIC, we will adopt AR(1) due to parismony principle.

First, let’s predict 2023 January 3rd NVDA return using AR(1) and compare to actual value

#Predict 2023-01-05 Stock Return
a <- arima(train,order = c(1,0,0)) %>% forecast
a$mean[1]

## [1] 0.001728894

#Actual Value
nvda_return[1762,]

##            NVDA.Adjusted
## 2023-01-03    -0.0206721

Now, starting from 2023-01-03, we will predict the stock return as actual observation gets updated. The following code below shows

#Predict 2023-01-05 Return
train1 <- nvda_return[1:1761,] #Data until 2022-12-30
a1 <- arima(train1,order=c(1,0,0)) %>% forecast
a1$mean[1]

## [1] 0.001728894

#Predict 2023-01-06 Return
train2 <- nvda_return[1:1762,] #Data until 2022-01-05
a2 <- arima(train2,order=c(1,0,0)) %>% forecast
a2$mean[1]

## [1] 0.003464433

#Predict 2023-01-07 Return
train3 <- nvda_return[1:1763,]#Data until 2022-01-06
a3 <- arima(train3,order=c(1,0,0)) %>% forecast
a3$mean[1]

## [1] -0.0006542375

Using this logic, we will automate the process and predict all the Y2023 returns from ARIMA model.

# Set the ARIMA order
# Change the value of order for diferent outcome for cumulative return
order <- c(1, 0, 0)

# Set the length of each training set
train_length <- 1761

# Create an empty vector to store the results
a_means <- numeric()

# Iterate over the different subsets of nvda_return
for (i in seq(train_length, length(nvda_return)-1, by = 1)) {
  # Extract the current training set
  train <- nvda_return[1:i, ]
  
  # Fit an ARIMA model to the training set and generate a forecast
  a <- arima(train, order = order) %>% forecast
  
  # Store the first mean forecast in the results vector
  a_means[i - train_length + 1] <- a$mean[1]
}

# Print the results
a_means

##  [1]  1.728894e-03  3.464433e-03 -6.542375e-04  4.527535e-03 -1.597737e-03
##  [6] -2.314524e-03  3.834736e-04  1.368736e-03 -6.971491e-04 -1.198816e-05
## [11] -1.837152e-03  3.381323e-03  4.741396e-03 -3.168510e-03 -3.838217e-03
## [16]  1.643691e-03  1.702481e-03  2.732159e-05 -2.261552e-04  6.752674e-03
## [21]  3.996825e-04 -3.502117e-03 -7.800693e-04  4.201025e-03  2.022231e-03
## [26] -1.895142e-03  1.898210e-03  1.551586e-03  5.816565e-03  8.979662e-05
## [31] -2.087561e-03  2.726395e-03  4.646644e-03  4.161445e-03  4.621498e-03
## [36]  1.591768e-03 -7.969210e-03  3.261831e-03  1.322649e-03  2.959700e-03
## [41]  3.737523e-03 -4.186473e-05  1.620211e-04  3.114134e-03  2.875768e-03
## [46] -8.593897e-04  4.449467e-03  3.572742e-03  2.001527e-03 -1.569207e-03
## [51]  1.508273e-03 -1.989517e-03  1.517240e-03  1.550714e-03  1.197009e-03
## [56]  1.298960e-03  3.714267e-05  3.253827e-03  2.789219e-03  2.423277e-03
## [61]  4.417127e-04  9.729408e-04  1.011332e-03  1.589039e-03  3.499460e-03
## [66]  3.675943e-03  1.639513e-03  5.824284e-04  3.219647e-03  3.968260e-03
## [71]  2.153093e-03  1.227094e-03  1.383743e-03  2.440439e-04  1.371134e-03
## [76]  4.343375e-03  2.027029e-03  2.280930e-03  4.316984e-03  2.239228e-05
## [81]  1.311998e-03  6.369718e-04 -9.870499e-04  3.942354e-03  3.174607e-03
## [86]  2.721769e-03 -9.171190e-04  8.718254e-04  3.601667e-03  1.265322e-03
## [91]  2.891395e-03

#Create a new dataframe that shows 2023 predicted value and actual value
df <- data.frame(Date=index(nvda_return[1762:nrow(nvda_return),]),
           Predict = a_means,
           Actual = nvda_return[1762:nrow(nvda_return),]%>%as.numeric())

library(DT)
datatable(df)

# create a new column 'signal' based on the predict column
# If the Predicted value is negative, we are going to short
# If the Predicted Valye is positive, we are going to long
df$signal <- ifelse(df$Predict > 0, "Long", "Short")

datatable(df)

#Create signal direction to calculate return
df$direction <- ifelse(df$signal == "Short", -1, 1)

#We are going to multiply -1 * actual return for short position
#Multiplu 1 for long position
df$return <- df$direction*df$Actual

datatable(df)

#Calculate cumulative return for buy&hold strategy Vs. ARIMA strategy
#Buy&Hold return
df$buy_hold <- cumprod(1+df$Actual)

#ARIMA return
df$arima <- cumprod(1+df$return)

# plot the total returns on a graph
ggplot(df, aes(x = Date)) +
  geom_line(aes(y = buy_hold, color = "Buy&Hold"),size=1) +
  geom_line(aes(y = arima, color = "ARIMA"),size=1) +
  scale_color_manual(values = c("Buy&Hold" = "grey30", "ARIMA" = '#4373B6')) +
  labs(x = "Date", y = "Cumulative Return", color = "",title='Buy&Hold vs ARIMA') +
  scale_x_date(breaks='2 month')+
  theme_bw()+
  theme(legend.position = "top")

From the result, it seems that Buy&Hold strategy gave more returns for NVDA. However, it still gave a positive return, which means that it is still a valiud strategy.

Now, let’s predict how the stock price would look like in the future period. To decide the order, we will use all the data points from the past and use auto.arima

unloadNamespace('ggfortify')
unloadNamespace('forecast')
library('forecast')

#Convert to ts format 
ts_nvda <- nvda%>%ts()
#Forecast the stock price using auto.arima
fit <- auto.arima(ts_nvda) %>% forecast(h=300)


#Plot ARIMA model prediction
autoplot(ts_nvda,color='grey30') +
  autolayer(fit,series = 'ARIMA') +

  labs(x='Time', y='Return', title='NVDA Stock Price Prediction from ARIMA') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('red'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))

ARIMA Trading Revised

Jaehyun Ahn

2023-05-12