library(tidyquant)
## 필요한 패키지를 로딩중입니다: lubridate
## 
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 필요한 패키지를 로딩중입니다: PerformanceAnalytics
## 필요한 패키지를 로딩중입니다: xts
## 필요한 패키지를 로딩중입니다: zoo
## 
## 다음의 패키지를 부착합니다: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 다음의 패키지를 부착합니다: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## 필요한 패키지를 로딩중입니다: quantmod
## 필요한 패키지를 로딩중입니다: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(DT)
library(magrittr)
library(tseries)
library(timetk)
library(stats)
library(forecast)
library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rsample)
library(ggplot2)
library(stats)
# Set the start and end dates
start_date <- "2013-01-01"
end_date <- "2023-04-21"

# Get the S&P 500 monthly price data
spy_data <- tq_get("SPY", 
                     from = start_date,
                     to = end_date,
                     get = "stock.prices",
                     periodicity = "monthly")

tail(spy_data)
## # A tibble: 6 x 8
##   symbol date        open  high   low close     volume adjusted
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>      <dbl>    <dbl>
## 1 SPY    2022-11-01  390.  408.  369.  408. 1745985300     404.
## 2 SPY    2022-12-01  409.  410.  375.  382. 1735973600     379.
## 3 SPY    2023-01-01  384.  408.  378.  406. 1575450100     405.
## 4 SPY    2023-02-01  405.  418.  394.  396. 1603094700     395.
## 5 SPY    2023-03-01  395.  410.  381.  409. 2515907800     408.
## 6 SPY    2023-04-01  409.  416.  406.  406. 1132698300     406.
spy <- tibble::tibble(date=spy_data$date,spy=spy_data$close)
head(spy)
## # A tibble: 6 x 2
##   date         spy
##   <date>     <dbl>
## 1 2013-01-01  150.
## 2 2013-02-01  152.
## 3 2013-03-01  157.
## 4 2013-04-01  160.
## 5 2013-05-01  163.
## 6 2013-06-01  160.
spy %>%
  ggplot(aes(x=date,y=spy))+
  geom_line(color='#4373B6') +
  theme_bw() +
  ggtitle('SPY from 2013(M1) - 2023(M4)')+
  xlab('Date')+
  ylab('Close Price')

#Compute log return of S&P 500
spy_return <- spy_data %>%
  tq_transmute(select = close,
               mutate_fun = periodReturn,
               type = 'log',
               col_rename = 'returns')

head(spy_return)
## # A tibble: 6 x 2
##   date       returns
##   <date>       <dbl>
## 1 2013-01-01  0     
## 2 2013-02-01  0.0127
## 3 2013-03-01  0.0328
## 4 2013-04-01  0.0190
## 5 2013-05-01  0.0233
## 6 2013-06-01 -0.0187
#Graph the return
spy_return %>%
  ggplot(aes(x=date,y=returns))+
  geom_line(color='#4373B6') +
  theme_bw() +
  ggtitle('SPY Returns from 2013(M1) - 2023(M4) ')+
  xlab('Date')+
  ylab('Returns')+
  scale_y_continuous(labels = scales::percent)

ts_spy_return <- ts(spy_return$returns,start=c(2013,1),frequency = 12)
head(ts_spy_return)
##              Jan         Feb         Mar         Apr         May         Jun
## 2013  0.00000000  0.01267817  0.03283023  0.01903010  0.02333535 -0.01871175
train <- window(ts_spy_return,start=2013,end=c(2021,12))
train1 <- window(ts_spy_return,start=c(2013,1),end=c(2022,1))
train2 <- window(ts_spy_return,start=c(2013,1),end=c(2022,2))
train3 <- window(ts_spy_return,start=c(2013,1),end=c(2022,3))
train4 <- window(ts_spy_return,start=c(2013,1),end=c(2022,4))
train5 <- window(ts_spy_return,start=c(2013,1),end=c(2022,5))
train6 <- window(ts_spy_return,start=c(2013,1),end=c(2022,6))
train7 <- window(ts_spy_return,start=c(2013,1),end=c(2022,7))
train8 <- window(ts_spy_return,start=c(2013,1),end=c(2022,8))

test <- window(ts_spy_return,start=c(2022,1))
tail(train)
##               Jul          Aug          Sep          Oct          Nov
## 2021  0.024119271  0.029325611 -0.050925009  0.067811483 -0.008067297
##               Dec
## 2021  0.041703144
test
##               Jan          Feb          Mar          Apr          May
## 2022 -0.054182999 -0.029961406  0.033799270 -0.091862086  0.002254721
## 2023  0.060989143 -0.025464168  0.032597697 -0.008118131             
##               Jun          Jul          Aug          Sep          Oct
## 2022 -0.090369972  0.088090955 -0.041657713 -0.101101503  0.078141403
## 2023                                                                 
##               Nov          Dec
## 2022  0.054101296 -0.063936924
## 2023
#Custome code for ACF and PACF Plot
ggplot.corr <- function(data, lag.max = 24, ci = 0.95, large.sample.size = TRUE, horizontal = TRUE,...) {
  options(dplyr.banner_expression = FALSE)
  
  require(ggplot2)
  require(dplyr)
  require(cowplot)
  
  if(horizontal == TRUE) {numofrow <- 1} else {numofrow <- 2}
  
  list.acf <- acf(data, lag.max = lag.max, type = "correlation", plot = FALSE)
  N <- as.numeric(list.acf$n.used)
  df1 <- data.frame(lag = list.acf$lag, acf = list.acf$acf)
  df1$lag.acf <- dplyr::lag(df1$acf, default = 0)
  df1$lag.acf[2] <- 0
  df1$lag.acf.cumsum <- cumsum((df1$lag.acf)^2)
  df1$acfstd <- sqrt(1/N * (1 + 2 * df1$lag.acf.cumsum))
  df1$acfstd[1] <- 0
  df1 <- dplyr::select(df1, lag, acf, acfstd)
  
  list.pacf <- acf(data, lag.max = lag.max, type = "partial", plot = FALSE)
  df2 <- data.frame(lag = list.pacf$lag,pacf = list.pacf$acf)
  df2$pacfstd <- sqrt(1/N)
  
  if(large.sample.size == TRUE) {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  else {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  cowplot::plot_grid(plot.acf, plot.pacf, nrow = numofrow)
}
#Custom code for ARIMA Model ACF PACF reisduals

arima.order <- c(2,0,2)

p <- 0:arima.order[1]
d <- 0:arima.order[2] #Revise this part depending on how many differencing was conducted
q <- 0:arima.order[3]

df.arima.order <- data.frame(t(expand.grid(p=p,d=d,q=q)))
df.colnames <- data.frame(expand.grid(p=p,d=d,q=q))
df.colnames$type <- with(df.colnames,paste0
                         ("ARIMA(",p,",",d,",",q,")"))

list.arima.model <- lapply(df.arima.order, 
                           function(data, x, ...) { 
                             arima(x = data, order = x, ...)
                             }, 
                           data = train) # Data has to be in ts format

df.arima.residual <- sapply(list.arima.model, 
                            function(x) {
                              x$residuals
                              })
df.arima.residual <- data.frame(df.arima.residual)
colnames(df.arima.residual) <- df.colnames$type
df.arima.residual$time <- time(train) # Data has to be in ts format


df.arima.residual.plot <- gather(data = df.arima.residual, value = "residual", key = "type",-time)


df.acf <- sapply(subset(df.arima.residual, select = -time), 
       function(x, lag.max,...) {
         acf(x = x, lag.max = lag.max, plot = FALSE)$acf
       },
       lag.max = 24)
df.acf <- data.frame(df.acf)
colnames(df.acf) <- df.colnames$type

df.acf$lag <- 0:(dim(df.acf)[1]-1)
df.acf.plot <- gather(df.acf, 
                      value = "value", 
                      key = "Model",
                      -lag)


df.pacf <- sapply(subset(df.arima.residual, select = -time),
                  function(x, lag.max,...) {
                    acfvalue <- as.numeric(acf(x = x, 
                                               lag.max = lag.max,
                                               type = "partial",
                                               plot = FALSE)$acf)
                    return(c(1,acfvalue))
                    },
                  lag.max = 24)
df.pacf <- data.frame(df.pacf)
colnames(df.pacf) <- df.colnames$type

df.pacf$lag <- 0:(dim(df.pacf)[1]-1)
df.pacf.plot <- gather(df.pacf, 
                       value = "value", 
                       key = "Model",
                       -lag)

fig.acf <- ggplot(data = df.acf.plot, aes(x = lag, y = value)) +
  geom_col(width = 0.4, fill = "#667fa2") +
  geom_hline(yintercept = qnorm((1+0.95)/2)/sqrt(dim(df.acf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  geom_hline(yintercept = -qnorm((1+0.95)/2)/sqrt(dim(df.acf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  scale_y_continuous(name = element_blank()) +
  scale_x_continuous(breaks = seq(0,24,4)) +
  ggtitle("ACF of residuals") +
  facet_wrap(~Model) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))

fig.pacf <- ggplot(data = df.pacf.plot, aes(x = lag, y = value)) +
  geom_col(width = 0.4, fill = "#667fa2") +
  geom_hline(yintercept = qnorm((1+0.95)/2)/sqrt(dim(df.pacf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  geom_hline(yintercept = -qnorm((1+0.95)/2)/sqrt(dim(df.pacf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  scale_y_continuous(name = element_blank()) +
  scale_x_continuous(breaks = seq(0,24,4)) +
  ggtitle("PACF of residuals") +
  facet_wrap(~Model) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))

fig.residual <- ggplot(data = df.arima.residual.plot, aes(x = time, y = residual)) +
  geom_line(colour = "#667fa2") +
  ggtitle("Residuals") +
  facet_wrap(~type) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))
ggplot.corr(spy_return[1:108,2],large.sample.size = FALSE)
## 필요한 패키지를 로딩중입니다: cowplot
## 
## 다음의 패키지를 부착합니다: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## i Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1 rows containing missing values (`geom_hline()`).
## Removed 1 rows containing missing values (`geom_hline()`).

fig.acf

fig.pacf

fig.residual
## 
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

#Custom Code for AIC 
aic <- sapply(list.arima.model, function(X) {X$aic})
aic <- round(as.numeric(aic),2)
df.aic.plot <- data.frame(aic = aic, p = df.colnames$p, q = df.colnames$q)

ggplot(data = df.aic.plot, aes(x = p, y = q)) + 
  geom_raster(aes(fill = aic)) +
  geom_text(aes( label = aic), colour = "black") + 
  scale_x_continuous(expand = c(0.05,0.05)) +
  scale_color_continuous( high = "#FFA800", low = "#4AC200") +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) + 
  scale_fill_distiller(palette = "Spectral", direction = -1) +
  ggtitle('Akaike Information Criterion (AIC)')

#From the Plot, it seems that MA(1) model is suitable.
#However auto.arima gives MA(2)
#We won't use auto arima 
auto.arima(train)
## Series: train 
## ARIMA(0,0,2) with non-zero mean 
## 
## Coefficients:
##           ma1      ma2    mean
##       -0.1995  -0.1745  0.0106
## s.e.   0.0947   0.0925  0.0023
## 
## sigma^2 = 0.001437:  log likelihood = 201.66
## AIC=-395.32   AICc=-394.93   BIC=-384.59
fit_train <- arima(train,c(0,0,1))%>% forecast()
fit_train
##          Point Forecast       Lo 80      Hi 80       Lo 95      Hi 95
## Jan 2022    0.004094099 -0.04456373 0.05275193 -0.07032163 0.07850982
## Feb 2022    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Mar 2022    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Apr 2022    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## May 2022    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Jun 2022    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Jul 2022    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Aug 2022    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Sep 2022    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Oct 2022    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Nov 2022    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Dec 2022    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Jan 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Feb 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Mar 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Apr 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## May 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Jun 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Jul 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Aug 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Sep 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Oct 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Nov 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
## Dec 2023    0.010650051 -0.03922811 0.06052821 -0.06563201 0.08693212
train
##                Jan           Feb           Mar           Apr           May
## 2013  0.0000000000  0.0126781693  0.0328302290  0.0190300991  0.0233353461
## 2014 -0.0358845187  0.0445103257  0.0038575029  0.0069274658  0.0229412151
## 2015 -0.0300770648  0.0546819151 -0.0202841404  0.0097858719  0.0127742059
## 2016 -0.0510686886 -0.0008262911  0.0599558407  0.0039334662  0.0168684780
## 2017  0.0177364647  0.0385392608 -0.0030918302  0.0098772360  0.0140142433
## 2018  0.0548282567 -0.0370379259 -0.0317902619  0.0051549066  0.0240183060
## 2019  0.0770217815  0.0319015049  0.0135436455  0.0400399643 -0.0658953606
## 2020 -0.0004039031 -0.0824752378 -0.1392473580  0.1195446421  0.0465450374
## 2021 -0.0102427078  0.0274259374  0.0411290315  0.0515581941  0.0065446074
##                Jun           Jul           Aug           Sep           Oct
## 2013 -0.0187117513  0.0503859426 -0.0304513418  0.0262935074  0.0452665893
## 2014  0.0156543261 -0.0135286908  0.0387047343 -0.0185558483  0.0232778641
## 2015 -0.0253736385  0.0223378758 -0.0628866896 -0.0310325079  0.0816349954
## 2016 -0.0017170662  0.0358219227  0.0011968241 -0.0049806503 -0.0174890779
## 2017  0.0014899473  0.0203457943  0.0029134524  0.0149986409  0.0232907106
## 2018  0.0012540926  0.0363768172  0.0314209994  0.0014112974 -0.0716080153
## 2019  0.0624202148  0.0150062706 -0.0168851267  0.0146636380  0.0218638880
## 2020  0.0131880806  0.0572232859  0.0674686055 -0.0421576099 -0.0252496842
## 2021  0.0189134017  0.0241192714  0.0293256106 -0.0509250090  0.0678114831
##                Nov           Dec
## 2013  0.0292069703  0.0201817235
## 2014  0.0271013651 -0.0080438678
## 2015  0.0036484559 -0.0233673956
## 2016  0.0361760828  0.0141922770
## 2017  0.0301080500  0.0069565212
## 2018  0.0183793305 -0.0979910646
## 2019  0.0355584748  0.0237368681
## 2020  0.1032574582  0.0321249653
## 2021 -0.0080672971  0.0417031440
#Plot ARIMA model prediction
autoplot(train,color='grey30') +
  autolayer(fit_train$fitted,series = 'ARIMA')+
  autolayer(fit_train,series = 'ARIMA') +
   labs(x='Time', y='Return', title='ARIMA') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('red'))+
  scale_y_continuous(labels = scales::percent) +
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))

fit_train$mean
##              Jan         Feb         Mar         Apr         May         Jun
## 2022 0.004094099 0.010650051 0.010650051 0.010650051 0.010650051 0.010650051
## 2023 0.010650051 0.010650051 0.010650051 0.010650051 0.010650051 0.010650051
##              Jul         Aug         Sep         Oct         Nov         Dec
## 2022 0.010650051 0.010650051 0.010650051 0.010650051 0.010650051 0.010650051
## 2023 0.010650051 0.010650051 0.010650051 0.010650051 0.010650051 0.010650051
# define start and end dates
start_date <- c(2013,1)
end_dates1 <- c(2022,1:12)
end_dates2 <- c(2023,1:3)

# create a list to store the forecasts
forecasts <- list()

# loop over end dates and fit ARIMA models
for (i in 1:length(end_dates1)) {
  train <- window(ts_spy_return, start = start_date, end = c(2022, end_dates1[i]))
  fit <- arima(train, c(0,0,1)) %>% forecast()
  forecasts[[i]] <- fit$mean[1]
}
## Warning in window.default(x, ...): 'end' value not changed
for (i in 1:length(end_dates2)) {
  train <- window(ts_spy_return, start = start_date, end = c(2023,end_dates2[i]))
  fit <- arima(train, c(0,0,1)) %>% forecast()
  forecasts[[i + length(end_dates1)]] <- fit$mean[1]
}
## Warning in window.default(x, ...): 'end' value not changed
# combine results into a data frame
results <- data.frame(end_date = c(end_dates1, end_dates2), forecast = unlist(forecasts))


results
##    end_date     forecast
## 1      2022  0.011033389
## 2         1  0.024064490
## 3         2  0.021081002
## 4         3  0.007241615
## 5         4  0.030936546
## 6         5  0.014279113
## 7         6  0.025400795
## 8         7 -0.003371496
## 9         8  0.016683643
## 10        9  0.030475868
## 11       10 -0.001945132
## 12       11 -0.004081110
## 13       12  0.022091153
## 14     2023  0.011033389
## 15        1 -0.001435208
## 16        2  0.014098649
## 17        3  0.003444871
predicted_return <- c(fit_train$mean[1], results[c(-1,-14),]$forecast)
predicted_return
##  [1]  0.004094099  0.024064490  0.021081002  0.007241615  0.030936546
##  [6]  0.014279113  0.025400795 -0.003371496  0.016683643  0.030475868
## [11] -0.001945132 -0.004081110  0.022091153 -0.001435208  0.014098649
## [16]  0.003444871
library(tibble)

test_df <- tibble(date=spy[109:124,]$date,spy=spy[109:124,]$spy,return=spy_return[109:124,]$returns,predict=predicted_return)

test_df
## # A tibble: 16 x 4
##    date         spy   return  predict
##    <date>     <dbl>    <dbl>    <dbl>
##  1 2022-01-01  450. -0.0542   0.00409
##  2 2022-02-01  437. -0.0300   0.0241 
##  3 2022-03-01  452.  0.0338   0.0211 
##  4 2022-04-01  412  -0.0919   0.00724
##  5 2022-05-01  413.  0.00225  0.0309 
##  6 2022-06-01  377. -0.0904   0.0143 
##  7 2022-07-01  412.  0.0881   0.0254 
##  8 2022-08-01  395. -0.0417  -0.00337
##  9 2022-09-01  357. -0.101    0.0167 
## 10 2022-10-01  386.  0.0781   0.0305 
## 11 2022-11-01  408.  0.0541  -0.00195
## 12 2022-12-01  382. -0.0639  -0.00408
## 13 2023-01-01  406.  0.0610   0.0221 
## 14 2023-02-01  396. -0.0255  -0.00144
## 15 2023-03-01  409.  0.0326   0.0141 
## 16 2023-04-01  406. -0.00812  0.00344
# Initialize signal vector
signal1 <- rep("Hold", nrow(test_df))

# Implement signal rules
for(i in 1:nrow(test_df)) {
  if(i == 1) {
    if(test_df$predict[i] > 0) {
      signal1[i] <- "Buy"
    }
  } else {
    if(test_df$predict[i] > 0) {
      if(test_df$predict[i-1] < 0) {
        signal1[i] <- "Buy"
      } else {
        signal1[i] <- "Hold"
      }
    } else if(test_df$predict[i] < 0) {
      if(test_df$predict[i-1] > 0) {
        signal1[i] <- "Sell"
      } else {
        signal1[i] <- "Hold"
      }
    }
  }
}

signal1
##  [1] "Buy"  "Hold" "Hold" "Hold" "Hold" "Hold" "Hold" "Sell" "Buy"  "Hold"
## [11] "Sell" "Hold" "Buy"  "Sell" "Buy"  "Hold"
test_df$signal1 <- signal1

# create a new column 'signal2' based on the predict column
test_df$signal2 <- ifelse(test_df$predict > 0, "Long", "Short")

datatable(test_df)
test_df$signal2_direction <- ifelse(test_df$signal2 == "Short", -1, 1)
test_df$trading_return <- test_df$return*test_df$signal2_direction
datatable(test_df)
return_df <- test_df[,c(1,3,8)] 
return_df
## # A tibble: 16 x 3
##    date         return trading_return
##    <date>        <dbl>          <dbl>
##  1 2022-01-01 -0.0542        -0.0542 
##  2 2022-02-01 -0.0300        -0.0300 
##  3 2022-03-01  0.0338         0.0338 
##  4 2022-04-01 -0.0919        -0.0919 
##  5 2022-05-01  0.00225        0.00225
##  6 2022-06-01 -0.0904        -0.0904 
##  7 2022-07-01  0.0881         0.0881 
##  8 2022-08-01 -0.0417         0.0417 
##  9 2022-09-01 -0.101         -0.101  
## 10 2022-10-01  0.0781         0.0781 
## 11 2022-11-01  0.0541        -0.0541 
## 12 2022-12-01 -0.0639         0.0639 
## 13 2023-01-01  0.0610         0.0610 
## 14 2023-02-01 -0.0255         0.0255 
## 15 2023-03-01  0.0326         0.0326 
## 16 2023-04-01 -0.00812       -0.00812
return_df_pivot <- return_df %>%
  tidyr::pivot_longer(cols=-date,
                      names_to = 'type',
                      values_to = 'returns')
head(return_df_pivot)
## # A tibble: 6 x 3
##   date       type           returns
##   <date>     <chr>            <dbl>
## 1 2022-01-01 return         -0.0542
## 2 2022-01-01 trading_return -0.0542
## 3 2022-02-01 return         -0.0300
## 4 2022-02-01 trading_return -0.0300
## 5 2022-03-01 return          0.0338
## 6 2022-03-01 trading_return  0.0338
# Plot bar graph
a1 <- ggplot(return_df, aes(x = date, y = trading_return)) + 
  geom_bar(position = "dodge",stat='identity',fill='#4373B6') + 
  labs(x = "Date", y = "Returns", 
       title = "ARIMA Trading Returns") + 
  scale_y_continuous(labels = scales::percent_format()) +
  scale_x_date(breaks='2 month')+
  theme_bw()+
  theme(legend.position = "top") 

a2 <- ggplot(return_df, aes(x = date, y = return)) + 
  geom_bar(position = "dodge",stat='identity',fill='grey30') + 
  labs(x = "Date", y = "Returns", 
       title = "Buy&Hold") + 
  scale_y_continuous(labels = scales::percent_format()) +
  scale_x_date(breaks='2 month')+
  theme_bw()+
  theme(legend.position = "top") 

gridExtra::grid.arrange(a1,a2)

return_df
## # A tibble: 16 x 3
##    date         return trading_return
##    <date>        <dbl>          <dbl>
##  1 2022-01-01 -0.0542        -0.0542 
##  2 2022-02-01 -0.0300        -0.0300 
##  3 2022-03-01  0.0338         0.0338 
##  4 2022-04-01 -0.0919        -0.0919 
##  5 2022-05-01  0.00225        0.00225
##  6 2022-06-01 -0.0904        -0.0904 
##  7 2022-07-01  0.0881         0.0881 
##  8 2022-08-01 -0.0417         0.0417 
##  9 2022-09-01 -0.101         -0.101  
## 10 2022-10-01  0.0781         0.0781 
## 11 2022-11-01  0.0541        -0.0541 
## 12 2022-12-01 -0.0639         0.0639 
## 13 2023-01-01  0.0610         0.0610 
## 14 2023-02-01 -0.0255         0.0255 
## 15 2023-03-01  0.0326         0.0326 
## 16 2023-04-01 -0.00812       -0.00812
return_df$total_return <- cumprod(1 + return_df$return)
return_df$total_trading_return <- cumprod(1 + return_df$trading_return)

# plot the total returns on a graph
ggplot(return_df, aes(x = date)) +
  geom_line(aes(y = total_return, color = "Buy&Hold"),size=1) +
  geom_line(aes(y = total_trading_return, color = "Trading Return"),size=1) +
  scale_color_manual(values = c("Buy&Hold" = "grey30", "Trading Return" = '#4373B6')) +
  labs(x = "Date", y = "Cumulative Return", color = "",title='Buy&Hold vs ARIMA') +
  scale_x_date(breaks='2 month')+
  theme_bw()+
  theme(legend.position = "top") 

return_df
## # A tibble: 16 x 5
##    date         return trading_return total_return total_trading_return
##    <date>        <dbl>          <dbl>        <dbl>                <dbl>
##  1 2022-01-01 -0.0542        -0.0542         0.946                0.946
##  2 2022-02-01 -0.0300        -0.0300         0.917                0.917
##  3 2022-03-01  0.0338         0.0338         0.948                0.948
##  4 2022-04-01 -0.0919        -0.0919         0.861                0.861
##  5 2022-05-01  0.00225        0.00225        0.863                0.863
##  6 2022-06-01 -0.0904        -0.0904         0.785                0.785
##  7 2022-07-01  0.0881         0.0881         0.854                0.854
##  8 2022-08-01 -0.0417         0.0417         0.819                0.890
##  9 2022-09-01 -0.101         -0.101          0.736                0.800
## 10 2022-10-01  0.0781         0.0781         0.794                0.863
## 11 2022-11-01  0.0541        -0.0541         0.837                0.816
## 12 2022-12-01 -0.0639         0.0639         0.783                0.868
## 13 2023-01-01  0.0610         0.0610         0.831                0.921
## 14 2023-02-01 -0.0255         0.0255         0.810                0.944
## 15 2023-03-01  0.0326         0.0326         0.836                0.975
## 16 2023-04-01 -0.00812       -0.00812        0.829                0.967
# calculate total return for each trading method
buy_hold_total_return <- tail(return_df$total_return,1)-1
trading_total_return <- tail(return_df$total_trading_return,1)-1
# create a data frame with the total returns for each method
total_returns <- data.frame(method = c("Buy&Hold", "ARIMA Trading"),
                             total_return = c(buy_hold_total_return, trading_total_return))


# create bar graph
library(ggplot2)
ggplot(total_returns, aes(x = method,y=total_return,fill=method)) +
  geom_bar(stat = "identity",width = 0.3)+
  ggtitle("Total Return by Trading Method") +
  scale_fill_manual(values = c('#4373B6',"grey30")) +
  xlab("Trading Method") +
  ylab("Total Return")+
  theme_bw()+
  theme(legend.position = "top") +
  geom_text(aes(label=scales::percent(total_return)), vjust=1.2, size=5)