#Import Libraries
library(DT)
library(tidyquant)
## 필요한 패키지를 로딩중입니다: lubridate
## 
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 필요한 패키지를 로딩중입니다: PerformanceAnalytics
## 필요한 패키지를 로딩중입니다: xts
## 필요한 패키지를 로딩중입니다: zoo
## 
## 다음의 패키지를 부착합니다: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 다음의 패키지를 부착합니다: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## 필요한 패키지를 로딩중입니다: quantmod
## 필요한 패키지를 로딩중입니다: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(magrittr)
library(tseries)
library(timetk)
library(stats)
library(forecast)
library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rsample)
library(ggplot2)
library(ggfortify)
## Registered S3 methods overwritten by 'ggfortify':
##   method                 from    
##   autoplot.Arima         forecast
##   autoplot.acf           forecast
##   autoplot.ar            forecast
##   autoplot.bats          forecast
##   autoplot.decomposed.ts forecast
##   autoplot.ets           forecast
##   autoplot.forecast      forecast
##   autoplot.glmnet        parsnip 
##   autoplot.stl           forecast
##   autoplot.ts            forecast
##   fitted.ar              forecast
##   fortify.ts             forecast
##   residuals.ar           forecast
# Set the start and end dates
start_date <- "2013-01-01"
end_date <- "2023-03-28"

# Get the S&P 500 monthly price data
sp500_data <- tq_get("^GSPC", 
                     from = start_date,
                     to = end_date,
                     get = "stock.prices",
                     periodicity = "monthly")

tail(sp500_data)
## # A tibble: 6 x 8
##   symbol date        open  high   low close       volume adjusted
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>        <dbl>    <dbl>
## 1 ^GSPC  2022-10-01 3610. 3905. 3492. 3872.  95823760000    3872.
## 2 ^GSPC  2022-11-01 3902. 4080. 3698. 4080.  92671910000    4080.
## 3 ^GSPC  2022-12-01 4087. 4101. 3764. 3840.  85249330000    3840.
## 4 ^GSPC  2023-01-01 3853. 4094. 3794. 4077.  80763810000    4077.
## 5 ^GSPC  2023-02-01 4070. 4195. 3943. 3970.  80392280000    3970.
## 6 ^GSPC  2023-03-01 3963. 4111. 3809. 4109. 113094800000    4109.
#Create a new dataframe for analysis
sp500 <- tibble::tibble(date=sp500_data$date,sp500=sp500_data$adjusted)
head(sp500)
## # A tibble: 6 x 2
##   date       sp500
##   <date>     <dbl>
## 1 2013-01-01 1498.
## 2 2013-02-01 1515.
## 3 2013-03-01 1569.
## 4 2013-04-01 1598.
## 5 2013-05-01 1631.
## 6 2013-06-01 1606.
sp500 %>%
  ggplot(aes(x=date,y=sp500))+
  geom_line(color='#4373B6') +
  theme_bw() +
  ggtitle('S&P500 from 2013(M1) - 2023(M3) ')+
  xlab('Date')+
  ylab('Index')

#Create a ts format data 
ts_sp500 <- ts(sp500$sp500, start=c(2013,1),frequency = 12)

#Decomposition
fit_sp500 <- stl(ts_sp500,s.window = 'periodic')
#Plot Decomposition
autoplot(fit_sp500, ts.colour = '#4373B6',size=0.7) +
  theme_bw()

ACF & PACF

Reference: https://rh8liuqy.github.io/ACF_PACF_by_ggplot2.html

#Custome code for ACF and PACF Plot
ggplot.corr <- function(data, lag.max = 24, ci = 0.95, large.sample.size = TRUE, horizontal = TRUE,...) {
  options(dplyr.banner_expression = FALSE)
  
  require(ggplot2)
  require(dplyr)
  require(cowplot)
  
  if(horizontal == TRUE) {numofrow <- 1} else {numofrow <- 2}
  
  list.acf <- acf(data, lag.max = lag.max, type = "correlation", plot = FALSE)
  N <- as.numeric(list.acf$n.used)
  df1 <- data.frame(lag = list.acf$lag, acf = list.acf$acf)
  df1$lag.acf <- dplyr::lag(df1$acf, default = 0)
  df1$lag.acf[2] <- 0
  df1$lag.acf.cumsum <- cumsum((df1$lag.acf)^2)
  df1$acfstd <- sqrt(1/N * (1 + 2 * df1$lag.acf.cumsum))
  df1$acfstd[1] <- 0
  df1 <- dplyr::select(df1, lag, acf, acfstd)
  
  list.pacf <- acf(data, lag.max = lag.max, type = "partial", plot = FALSE)
  df2 <- data.frame(lag = list.pacf$lag,pacf = list.pacf$acf)
  df2$pacfstd <- sqrt(1/N)
  
  if(large.sample.size == TRUE) {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  else {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  cowplot::plot_grid(plot.acf, plot.pacf, nrow = numofrow)
}
#Plot ACF and PACF
#ACF and PACF tell that the data is non-stationary
ggplot.corr(data = sp500$sp500,  
            lag.max = 24, 
            ci= 0.95, 
            large.sample.size = FALSE, 
            horizontal = TRUE)+ 
  ggtitle("S&P 500") + 
  theme(plot.title = element_text(hjust = 0.5,size=12,face = 'bold'))
## 필요한 패키지를 로딩중입니다: cowplot
## 
## 다음의 패키지를 부착합니다: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## i Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1 rows containing missing values (`geom_hline()`).
## Removed 1 rows containing missing values (`geom_hline()`).

#Conduct adf test
#Accept null = non-stationary
adf.test(sp500$sp500)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  sp500$sp500
## Dickey-Fuller = -2.285, Lag order = 4, p-value = 0.4582
## alternative hypothesis: stationary
#Difference the data
diff <- diff(sp500$sp500)
head(diff)
## [1]  16.57007  54.50989  28.38000  33.17004 -24.45996  79.44995
#Add diff_sp500 into sp500 dataframe
diff_sp500 <- c(0,diff)
sp500$dsp500<- diff_sp500
head(sp500)
## # A tibble: 6 x 3
##   date       sp500 dsp500
##   <date>     <dbl>  <dbl>
## 1 2013-01-01 1498.    0  
## 2 2013-02-01 1515.   16.6
## 3 2013-03-01 1569.   54.5
## 4 2013-04-01 1598.   28.4
## 5 2013-05-01 1631.   33.2
## 6 2013-06-01 1606.  -24.5
#The graph tells that tells that the variance is not stationary over time
#May not use ARIMA model but applied for coding practice purpose
sp500 %>%
  ggplot(aes(x=date,y=dsp500))+
  geom_line(color='#4373B6') +
  theme_bw() +
  ggtitle('S&P500 (1st Order Differencing) ')+
  xlab('Date')+
  ylab('Value')

#Plot ACF and PACF
ggplot.corr(data = sp500$dsp500,  
            lag.max = 24, 
            ci= 0.95, 
            large.sample.size = FALSE, 
            horizontal = TRUE)+ 
  ggtitle("S&P 500 (1st Order Differencing)") + 
  theme(plot.title = element_text(hjust = 0.5,size=12,face = 'bold'))

#Conduct adf test for differenced sp500
#Verified stationarity
adf.test(sp500$dsp500)
## Warning in adf.test(sp500$dsp500): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  sp500$dsp500
## Dickey-Fuller = -4.3071, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
#Convert dsp500 to ts format
ts_dsp500 <- ts(sp500$dsp500, start=c(2013,1),frequency = 12)
head(ts_dsp500)
##            Jan       Feb       Mar       Apr       May       Jun
## 2013   0.00000  16.57007  54.50989  28.38000  33.17004 -24.45996

Reference: https://rh8liuqy.github.io/Data_Visulization_of_ARIMA_Model_Selection.html

#Custom code for ARIMA Model ACF PACF reisduals

arima.order <- c(2,0,2)

p <- 0:arima.order[1]
d <- 1 #Revise this part depending on how many differencing was conducted
q <- 0:arima.order[3]

df.arima.order <- data.frame(t(expand.grid(p=p,d=d,q=q)))
df.colnames <- data.frame(expand.grid(p=p,d=d,q=q))
df.colnames$type <- with(df.colnames,paste0
                         ("ARIMA(",p,",",d,",",q,")"))

list.arima.model <- lapply(df.arima.order, 
                           function(data, x, ...) { 
                             arima(x = data, order = x, ...)
                             }, 
                           data = ts_dsp500) # Data has to be in ts format

df.arima.residual <- sapply(list.arima.model, 
                            function(x) {
                              x$residuals
                              })
df.arima.residual <- data.frame(df.arima.residual)
colnames(df.arima.residual) <- df.colnames$type
df.arima.residual$time <- time(ts_dsp500) # Data has to be in ts format


df.arima.residual.plot <- gather(data = df.arima.residual, value = "residual", key = "type",-time)


df.acf <- sapply(subset(df.arima.residual, select = -time), 
       function(x, lag.max,...) {
         acf(x = x, lag.max = lag.max, plot = FALSE)$acf
       },
       lag.max = 24)
df.acf <- data.frame(df.acf)
colnames(df.acf) <- df.colnames$type

df.acf$lag <- 0:(dim(df.acf)[1]-1)
df.acf.plot <- gather(df.acf, 
                      value = "value", 
                      key = "Model",
                      -lag)


df.pacf <- sapply(subset(df.arima.residual, select = -time),
                  function(x, lag.max,...) {
                    acfvalue <- as.numeric(acf(x = x, 
                                               lag.max = lag.max,
                                               type = "partial",
                                               plot = FALSE)$acf)
                    return(c(1,acfvalue))
                    },
                  lag.max = 24)
df.pacf <- data.frame(df.pacf)
colnames(df.pacf) <- df.colnames$type

df.pacf$lag <- 0:(dim(df.pacf)[1]-1)
df.pacf.plot <- gather(df.pacf, 
                       value = "value", 
                       key = "Model",
                       -lag)

fig.acf <- ggplot(data = df.acf.plot, aes(x = lag, y = value)) +
  geom_col(width = 0.4, fill = "#667fa2") +
  geom_hline(yintercept = qnorm((1+0.95)/2)/sqrt(dim(df.acf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  geom_hline(yintercept = -qnorm((1+0.95)/2)/sqrt(dim(df.acf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  scale_y_continuous(name = element_blank()) +
  scale_x_continuous(breaks = seq(0,24,4)) +
  ggtitle("ACF of residuals") +
  facet_wrap(~Model) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))

fig.pacf <- ggplot(data = df.pacf.plot, aes(x = lag, y = value)) +
  geom_col(width = 0.4, fill = "#667fa2") +
  geom_hline(yintercept = qnorm((1+0.95)/2)/sqrt(dim(df.pacf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  geom_hline(yintercept = -qnorm((1+0.95)/2)/sqrt(dim(df.pacf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  scale_y_continuous(name = element_blank()) +
  scale_x_continuous(breaks = seq(0,24,4)) +
  ggtitle("PACF of residuals") +
  facet_wrap(~Model) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))

fig.residual <- ggplot(data = df.arima.residual.plot, aes(x = time, y = residual)) +
  geom_line(colour = "#667fa2") +
  ggtitle("Residuals") +
  facet_wrap(~type) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))
#Plot ACF of reiduals
fig.acf

#Plot PACF of reiduals
fig.pacf

#Plot reiduals
fig.residual
## 
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

#Custom Code for AIC 
aic <- sapply(list.arima.model, function(X) {X$aic})
aic <- round(as.numeric(aic),2)
df.aic.plot <- data.frame(aic = aic, p = df.colnames$p, q = df.colnames$q)

ggplot(data = df.aic.plot, aes(x = p, y = q)) + 
  geom_raster(aes(fill = aic)) +
  geom_text(aes( label = aic), colour = "black") + 
  scale_x_continuous(expand = c(0.05,0.05)) +
  scale_color_continuous( high = "#FFA800", low = "#4AC200") +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) + 
  scale_fill_distiller(palette = "Spectral", direction = -1) +
  ggtitle('Akaike Information Criterion (AIC)')

# Compare with auto.arima
auto.arima(ts_sp500)
## Series: ts_sp500 
## ARIMA(0,1,1) with drift 
## 
## Coefficients:
##           ma1    drift
##       -0.2454  21.1996
## s.e.   0.0855   9.1050
## 
## sigma^2 = 17960:  log likelihood = -769.68
## AIC=1545.37   AICc=1545.57   BIC=1553.78

Forecasting Application

#Unload ggfority and forecast package
#Reload forecast 
#Have to do this since ggfortify & forecast don't work well with visualization when loaded simulatneously

unloadNamespace('ggfortify')
unloadNamespace('forecast')
library('forecast')
## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify
#Split the data
train <- window(ts_sp500,start=2013,end=c(2022,6))
test <- window(ts_sp500,start=c(2022,7))
#Fit the model and implement 8month forecast
f_fit_naive <- naive(train)
fore_fit_naive <- forecast(f_fit_naive,h=length(test))

f_fit_ses <- ses(train)
fore_fit_ses <- forecast(f_fit_ses,h=length(test))

f_fit_hwa <- hw(train,seasonal='additive')
fore_fit_hwa <- forecast(f_fit_hwa,h=length(test))

f_fit_hwm <- hw(train,seasonal='multiplicative')
fore_fit_hwm <- forecast(f_fit_hwm,h=length(test))

f_fit_ets <- ets(train)
fore_fit_ets <- forecast(f_fit_ets,h=length(test))

f_fit_arima <- auto.arima(train)
fore_fit_arima <- forecast(f_fit_arima,h=length(test))
ff1 <- autoplot(ts_sp500,color='grey40',size=1)+
  autolayer(fore_fit_naive,PI=TRUE,series = 'Naive Forecast',size=1.2)+
  autolayer(test,series=NA,color='black',linetype='longdash')+
  labs(x='Date', y='Index',title='Actual vs Naive Forecast')+
  theme_bw()+
  scale_colour_manual(name='Method',values=c('#4373B6'))+
  theme(legend.position=c(0.001, 0.98),
        legend.justification=c('left', 'top'))

ff2 <- autoplot(ts_sp500,color='grey40',size=1)+
  autolayer(fore_fit_ses,PI=TRUE,series = 'Simple Exponential Smoothing',size=1.2)+
  autolayer(test,series=NA,color='black',linetype='longdash')+
  labs(x='Date', y='Index',title='Actual vs Simple Exponenetial Smoothing')+
  theme_bw()+
  scale_colour_manual(name='Method',values=c('orangered'))+
  theme(legend.position=c(0.001, 0.98),
        legend.justification=c('left', 'top'))
ff3 <- autoplot(ts_sp500,color='grey40',size=1)+
  autolayer(fore_fit_hwa,PI=TRUE,series = 'Holt-Winter Additive',size=1.2)+
  autolayer(test,series=NA,color='black',linetype='longdash')+
  labs(x='Date', y='Index',title='Actual vs Holt-Winter Additive')+
  theme_bw()+
  scale_colour_manual(name='Method',values=c('deepskyblue4'))+
  theme(legend.position=c(0.001, 0.98),
        legend.justification=c('left', 'top'))
ff4 <- autoplot(ts_sp500,color='grey40',size=1)+
  autolayer(fore_fit_hwm,PI=TRUE,series = 'Holt-Winter Multiplicative',size=1.2)+
  autolayer(test,series=NA,color='black',linetype='longdash')+
  labs(x='Date', y='Index',title='Actual vs Holt-Winter Multiplicative')+
  theme_bw()+
  scale_colour_manual(name='Method',values=c('purple'))+
  theme(legend.position=c(0.001, 0.98),
        legend.justification=c('left', 'top'))


ff5 <- autoplot(ts_sp500,color='grey40',size=1)+
  autolayer(fore_fit_ets,PI=TRUE,series = 'ETS',size=1.2)+
  autolayer(test,series=NA,color='black',linetype='longdash')+
  labs(x='Date', y='Index',title='Actual vs ETS')+
  theme_bw()+
  scale_colour_manual(name='Method',values=c('brown4'))+
  theme(legend.position=c(0.001, 0.98),
        legend.justification=c('left', 'top'))
ff6 <- autoplot(ts_sp500,color='grey40',size=1)+
  autolayer(fore_fit_arima,PI=TRUE,series = 'ARIMA',size=1.2)+
  autolayer(test,series=NA,color='black',linetype='longdash')+
  labs(x='Date', y='Index',title='Actual vs ARIMA')+
  theme_bw()+
  scale_colour_manual(name='Method',values=c('red'))+
  theme(legend.position=c(0.001, 0.98),
        legend.justification=c('left', 'top'))
library(gridExtra)
## 
## 다음의 패키지를 부착합니다: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(ff1, ff2, ff3, ff4, ff5,ff6, ncol=2)

#Fit several forecasting methods into S&P 500
library(prophet)
## 필요한 패키지를 로딩중입니다: Rcpp
## 
## 다음의 패키지를 부착합니다: 'Rcpp'
## The following object is masked from 'package:rsample':
## 
##     populate
## 필요한 패키지를 로딩중입니다: rlang
## 
## 다음의 패키지를 부착합니다: 'rlang'
## The following object is masked from 'package:magrittr':
## 
##     set_names
#Naive Forecast
fit_naive <- naive(ts_sp500)

#Simple Exponential Smoothing
fit_ses <- ses(ts_sp500)

#Holt-Winter method
fit_hwa <- hw(ts_sp500,seasonal='additive')
fit_hwm <- hw(ts_sp500,seasonal='multiplicative')


#ETS
fit_ets <- ets(ts_sp500)
fit_ets1 <- fit_ets%>%forecast()


#ARIMA(0,1,1)
fit_arima <- auto.arima(ts_sp500)
fit_arima1 <- fit_arima%>%forecast()
#Plot the result
f1 <- autoplot(ts_sp500,color='grey40',size=1) +
  autolayer(fit_naive$fitted,series = 'Naive Forecast',size=1)+
  autolayer(fit_naive,series = 'Naive Forecast',size=1) +
   labs(x='Time', y='Index', title='Naive Forecast') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('#4373B6'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))
f2 <- autoplot(ts_sp500,color='grey40',size=1) +
  autolayer(fit_ses$fitted,series = 'Simple Exponential Smoothing',size=1)+
  autolayer(fit_ses,series = 'Simple Exponential Smoothing',size=1) +
   labs(x='Time', y='Index', title='Simple Exponential Smoothing') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('orangered1'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))
f3 <- autoplot(ts_sp500,color='grey40',size=1) +
  autolayer(fit_hwa$fitted,series = 'Holt-Winter Additive',size=1)+
  autolayer(fit_hwa,series = 'Holt-Winter Additive',size=1) +
   labs(x='Time', y='Index', title='Holt-Winter Additive') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('deepskyblue4'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))
f4 <- autoplot(ts_sp500,color='grey40',size=1) +
  autolayer(fit_hwm$fitted,series = 'Holt-Winter Multiplicative',size=1)+
  autolayer(fit_hwm,series = 'Holt-Winter Multiplicative',size=1) +
   labs(x='Time', y='Index', title='Holt-Winter Multiplicative') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('purple'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))
f5 <- autoplot(ts_sp500,color='grey40',size=1) +
  autolayer(fit_ets1$fitted,series = 'ETS',size=1)+
  autolayer(fit_ets1,series = 'ETS',size=1) +
   labs(x='Time', y='Index', title='ETS') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('brown4'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))
f6 <- autoplot(ts_sp500,color='grey40',size=1) +
  autolayer(fit_arima1$fitted,series = 'ARIMA',size=1)+
  autolayer(fit_arima1,series = 'ARIMA',size=1) +
   labs(x='Time', y='Index', title='ARIMA') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('red'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))
library(gridExtra)
grid.arrange(f1, f2, f3, f4, f5,f6, ncol=2)
## Warning: Removed 1 row containing missing values (`geom_line()`).

grid.arrange(ff6,f6, ncol=1)

#Expected Index from ARIMA
fore_fit_arima$mean
##           Jan      Feb      Mar Apr May Jun      Jul      Aug      Sep      Oct
## 2022                                        3805.621 3825.863 3846.104 3866.345
## 2023 3927.069 3947.310 3967.552                                                
##           Nov      Dec
## 2022 3886.587 3906.828
## 2023
#Actual Index
test
##          Jan     Feb     Mar Apr May Jun     Jul     Aug     Sep     Oct
## 2022                                     4130.29 3955.00 3585.62 3871.98
## 2023 4076.60 3970.15 4109.31                                            
##          Nov     Dec
## 2022 4080.11 3839.50
## 2023

S&P 500 Return Forecast

Since we had to difference S&P 500 to make it stationary, we are going to use the log return so that we don’t have to difference the data and forecast under parsimony principle

unloadNamespace('ggfortify')
unloadNamespace('forecast')
library(forecast)
library(ggfortify)
## Registered S3 methods overwritten by 'ggfortify':
##   method                 from    
##   autoplot.Arima         forecast
##   autoplot.acf           forecast
##   autoplot.ar            forecast
##   autoplot.bats          forecast
##   autoplot.decomposed.ts forecast
##   autoplot.ets           forecast
##   autoplot.forecast      forecast
##   autoplot.stl           forecast
##   autoplot.ts            forecast
##   fitted.ar              forecast
##   fortify.ts             forecast
##   residuals.ar           forecast
#Compute log return of S&P 500
sp500_return <- sp500_data %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               type = 'log',
               col_rename = 'returns')

head(sp500_return)
## # A tibble: 6 x 2
##   date       returns
##   <date>       <dbl>
## 1 2013-01-01  0     
## 2 2013-02-01  0.0110
## 3 2013-03-01  0.0354
## 4 2013-04-01  0.0179
## 5 2013-05-01  0.0206
## 6 2013-06-01 -0.0151
#Graph the return
sp500_return %>%
  ggplot(aes(x=date,y=returns))+
  geom_line(color='#4373B6') +
  theme_bw() +
  ggtitle('S&P500 Returns from 2013(M1) - 2023(M3) ')+
  xlab('Date')+
  ylab('Returns')+
  scale_y_continuous(labels = scales::percent)

#Plot ACF and PACF of S&P 500 return
ggplot.corr(data = sp500_return$returns,  
            lag.max = 24, 
            ci= 0.95, 
            large.sample.size = FALSE, 
            horizontal = TRUE)+ 
  ggtitle("S&P 500 Log Return") + 
  theme(plot.title = element_text(hjust = 0.5,size=12,face = 'bold'))

#adf test
#Verified Stationarity
adf.test(sp500_return$returns)
## Warning in adf.test(sp500_return$returns): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  sp500_return$returns
## Dickey-Fuller = -5.0361, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
#Convert the return into ts format
ts_dsp500_return <- ts(sp500_return$returns,start = c(2013,1),frequency = 12)
head(ts_dsp500_return)
##              Jan         Feb         Mar         Apr         May         Jun
## 2013  0.00000000  0.01099993  0.03535529  0.01792417  0.02055020 -0.01511293
#Custom code for ARIMA Model ACF PACF reisduals

arima.order <- c(2,0,2)

p <- 0:arima.order[1]
d <- 0:arima.order[2] #Revise this part depending on how many differencing was conducted
q <- 0:arima.order[3]

df.arima.order <- data.frame(t(expand.grid(p=p,d=d,q=q)))
df.colnames <- data.frame(expand.grid(p=p,d=d,q=q))
df.colnames$type <- with(df.colnames,paste0
                         ("ARIMA(",p,",",d,",",q,")"))

list.arima.model <- lapply(df.arima.order, 
                           function(data, x, ...) { 
                             arima(x = data, order = x, ...)
                             }, 
                           data = ts_dsp500_return) # Data has to be in ts format

df.arima.residual <- sapply(list.arima.model, 
                            function(x) {
                              x$residuals
                              })
df.arima.residual <- data.frame(df.arima.residual)
colnames(df.arima.residual) <- df.colnames$type
df.arima.residual$time <- time(ts_dsp500_return) # Data has to be in ts format


df.arima.residual.plot <- gather(data = df.arima.residual, value = "residual", key = "type",-time)


df.acf <- sapply(subset(df.arima.residual, select = -time), 
       function(x, lag.max,...) {
         acf(x = x, lag.max = lag.max, plot = FALSE)$acf
       },
       lag.max = 24)
df.acf <- data.frame(df.acf)
colnames(df.acf) <- df.colnames$type

df.acf$lag <- 0:(dim(df.acf)[1]-1)
df.acf.plot <- gather(df.acf, 
                      value = "value", 
                      key = "Model",
                      -lag)


df.pacf <- sapply(subset(df.arima.residual, select = -time),
                  function(x, lag.max,...) {
                    acfvalue <- as.numeric(acf(x = x, 
                                               lag.max = lag.max,
                                               type = "partial",
                                               plot = FALSE)$acf)
                    return(c(1,acfvalue))
                    },
                  lag.max = 24)
df.pacf <- data.frame(df.pacf)
colnames(df.pacf) <- df.colnames$type

df.pacf$lag <- 0:(dim(df.pacf)[1]-1)
df.pacf.plot <- gather(df.pacf, 
                       value = "value", 
                       key = "Model",
                       -lag)

fig.acf <- ggplot(data = df.acf.plot, aes(x = lag, y = value)) +
  geom_col(width = 0.4, fill = "#667fa2") +
  geom_hline(yintercept = qnorm((1+0.95)/2)/sqrt(dim(df.acf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  geom_hline(yintercept = -qnorm((1+0.95)/2)/sqrt(dim(df.acf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  scale_y_continuous(name = element_blank()) +
  scale_x_continuous(breaks = seq(0,24,4)) +
  ggtitle("ACF of residuals") +
  facet_wrap(~Model) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))

fig.pacf <- ggplot(data = df.pacf.plot, aes(x = lag, y = value)) +
  geom_col(width = 0.4, fill = "#667fa2") +
  geom_hline(yintercept = qnorm((1+0.95)/2)/sqrt(dim(df.pacf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  geom_hline(yintercept = -qnorm((1+0.95)/2)/sqrt(dim(df.pacf.plot)[1]),
             linetype = "dashed",
             colour = "grey50") +
  scale_y_continuous(name = element_blank()) +
  scale_x_continuous(breaks = seq(0,24,4)) +
  ggtitle("PACF of residuals") +
  facet_wrap(~Model) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))

fig.residual <- ggplot(data = df.arima.residual.plot, aes(x = time, y = residual)) +
  geom_line(colour = "#667fa2") +
  ggtitle("Residuals") +
  facet_wrap(~type) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        strip.background = element_rect(fill = "#f8ddc4"))
fig.acf

fig.pacf

fig.residual
## 
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

#Custom Code for AIC 
aic <- sapply(list.arima.model, function(X) {X$aic})
aic <- round(as.numeric(aic),2)
df.aic.plot <- data.frame(aic = aic, p = df.colnames$p, q = df.colnames$q)

ggplot(data = df.aic.plot, aes(x = p, y = q)) + 
  geom_raster(aes(fill = aic)) +
  geom_text(aes( label = aic), colour = "black") + 
  scale_x_continuous(expand = c(0.05,0.05)) +
  scale_color_continuous( high = "#FFA800", low = "#4AC200") +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) + 
  scale_fill_distiller(palette = "Spectral", direction = -1) +
  ggtitle('Akaike Information Criterion (AIC)')

auto.arima(ts_dsp500_return)
## Series: ts_dsp500_return 
## ARIMA(0,0,1) with non-zero mean 
## 
## Coefficients:
##           ma1    mean
##       -0.2445  0.0082
## s.e.   0.0941  0.0028
## 
## sigma^2 = 0.001732:  log likelihood = 217.51
## AIC=-429.01   AICc=-428.81   BIC=-420.58
#Unload ggfority and forecast package
#Reload forecast 
#Have to do this since ggfortify & forecast don't work well with visualization when loaded simulatneously

unloadNamespace('ggfortify')
unloadNamespace('forecast')
library('forecast')
## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify
#Split the data
dtrain <- window(ts_dsp500_return,start=2013,end=c(2022,6))
dtest <- window(ts_dsp500_return,start=c(2022,7))
#Fit the model and implement 8month forecast
fd_fit_naive <- naive(dtrain)
dfore_fit_naive <- forecast(fd_fit_naive,h=length(dtest))

fd_fit_ses <- ses(dtrain)
dfore_fit_ses <- forecast(fd_fit_ses,h=length(dtest))

fd_fit_hwa <- hw(dtrain,seasonal='additive')
dfore_fit_hwa <- forecast(fd_fit_hwa,h=length(dtest))


fd_fit_ets <- ets(dtrain)
dfore_fit_ets <- forecast(fd_fit_ets,h=length(dtest))

fd_fit_arima <- auto.arima(dtrain)
dfore_fit_arima <- forecast(fd_fit_arima,h=length(dtest))
#Plot the result
fdd1 <- autoplot(ts_dsp500_return,color='grey30') +
  autolayer(dfore_fit_naive,PI=TRUE,series='Naive Forecast',size=1.2)+
  autolayer(dtest,series=NA,color='black',linetype='longdash')+
  labs(x='Time', y='Return', title='Actual vs Naive Forecast') +
  theme_bw() +
  scale_y_continuous(labels = scales::percent,
                     breaks = seq(-0.2,0.2,0.1)) +
  scale_colour_manual(name='Method', values=c('#4373B6'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))
fdd2 <- autoplot(ts_dsp500_return,color='grey30') +
  autolayer(dfore_fit_ses,PI=TRUE,series='Simple Exponential Smoothing',size=1.2)+
  autolayer(dtest,series=NA,color='black',linetype='longdash')+
  labs(x='Time', y='Return', title='Actual vs Simple Exponential Smoothing') +
  theme_bw() +
  scale_y_continuous(labels = scales::percent) +
  scale_colour_manual(name='Method', values=c('orangered1'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))



fdd3 <- autoplot(ts_dsp500_return,color='grey30') +
  autolayer(dfore_fit_hwa,PI=TRUE,series='Holt-Winter Additive',size=1.2)+
  autolayer(dtest,series=NA,color='black',linetype='longdash')+
  labs(x='Time', y='Return', title='Actual vs Holt-Winter Additive') +
  theme_bw() +
  scale_y_continuous(labels = scales::percent) +
  scale_colour_manual(name='Method', values=c('deepskyblue4'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))
  

fdd5 <- autoplot(ts_dsp500_return,color='grey30') +
  autolayer(dfore_fit_ets,PI=TRUE,series='ETS',size=1.2)+
  autolayer(dtest,series=NA,color='black',linetype='longdash')+
  labs(x='Time', y='Return', title='Actual vs ETS') +
  theme_bw() +
  scale_y_continuous(labels = scales::percent) +
  scale_colour_manual(name='Method', values=c('brown4'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))
  

fdd6 <- autoplot(ts_dsp500_return,color='grey30') +
  autolayer(dfore_fit_arima,PI=TRUE,series='ARIMA',size=1.2)+
  autolayer(dtest,series=NA,color='black',linetype='longdash')+
  labs(x='Time', y='Return', title='Actual vs ARIMA') +
  theme_bw() +
  scale_y_continuous(labels = scales::percent) +
  scale_colour_manual(name='Method', values=c('red'))+
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))
library(gridExtra)
grid.arrange(fdd1, fdd2, fdd3, fdd5,fdd6, ncol=2)

#Fit several forecasting methods into S&P 500

#Naive Forecast
fit_dnaive <- naive(ts_dsp500_return)

#Simple Exponential Smoothing
fit_dses <- ses(ts_dsp500_return)

#Holt-Winter method
fit_dhwa <- hw(ts_dsp500_return,seasonal='additive')


#ETS
fit_dets <- ets(ts_dsp500_return)
fit_dets1 <- fit_dets%>%forecast()


#ARIMA(0,1,1)
fit_darima <- auto.arima(ts_dsp500_return)
fit_darima1 <- fit_darima%>%forecast()
#Plot the result
fd1 <- autoplot(ts_dsp500_return,color='grey30') +
  autolayer(fit_dnaive$fitted,series = 'Naive Forecast')+
  autolayer(fit_dnaive,series = 'Naive Forecast') +
   labs(x='Time', y='Return', title='Naive Forecast') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('#4373B6'))+
  scale_y_continuous(labels = scales::percent) +
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))


fd2 <- autoplot(ts_dsp500_return,color='grey30') +
  autolayer(fit_dses$fitted,series = 'Simple Exponential Smoothing')+
  autolayer(fit_dses,series = 'Simple Exponential Smoothing') +
   labs(x='Time', y='Return', title='Simple Exponential Smoothing') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('orangered1'))+
  scale_y_continuous(labels = scales::percent) +
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))

fd3 <- autoplot(ts_dsp500_return,color='grey30') +
  autolayer(fit_dhwa$fitted,series = 'Holt-Winter Additive')+
  autolayer(fit_dhwa,series = 'Holt-Winter Additive') +
   labs(x='Time', y='Return', title='Holt-Winter Additive') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('deepskyblue4'))+
  scale_y_continuous(labels = scales::percent) +
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))

fd5 <- autoplot(ts_dsp500_return,color='grey30') +
  autolayer(fit_dets1$fitted,series = 'ETS')+
  autolayer(fit_dets1,series = 'ETS') +
   labs(x='Time', y='Return', title='ETS') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('brown4'))+
  scale_y_continuous(labels = scales::percent) +
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))

fd6 <- autoplot(ts_dsp500_return,color='grey30') +
  autolayer(fit_darima1$fitted,series = 'ARIMA')+
  autolayer(fit_darima1,series = 'ARIMA') +
   labs(x='Time', y='Return', title='ARIMA') +
  theme_bw() +
  scale_colour_manual(name='Method', values=c('red'))+
  scale_y_continuous(labels = scales::percent) +
  theme(legend.position=c(0.02, 0.98),
        legend.justification=c('left', 'top'))
grid.arrange(fd1, fd2, fd3,fd5,fd6, ncol=2)
## Warning: Removed 1 row containing missing values (`geom_line()`).

grid.arrange(fdd6,fd6, ncol=1)

# Forecasted return from Arima
# 2.3%
dfore_fit_arima$mean
##              Jan         Feb         Mar Apr May Jun         Jul         Aug
## 2022                                                 0.023658837 0.008275137
## 2023 0.008275137 0.008275137 0.008275137                                    
##              Sep         Oct         Nov         Dec
## 2022 0.008275137 0.008275137 0.008275137 0.008275137
## 2023
# Actual Return
# 8.7%
dtest
##              Jan         Feb         Mar Apr May Jun         Jul         Aug
## 2022                                                  0.08720138 -0.04336703
## 2023  0.05992118 -0.02645948  0.03445129                                    
##              Sep         Oct         Nov         Dec
## 2022 -0.09804917  0.07683456  0.05235798 -0.06078183
## 2023
#ARIMA model forecasts that the return for Aprial will be 0.3%
fit_darima1[4]
## $mean
##              Jan         Feb         Mar         Apr         May         Jun
## 2023                                     0.003262986 0.008174623 0.008174623
## 2024 0.008174623 0.008174623 0.008174623 0.008174623 0.008174623 0.008174623
## 2025 0.008174623 0.008174623 0.008174623                                    
##              Jul         Aug         Sep         Oct         Nov         Dec
## 2023 0.008174623 0.008174623 0.008174623 0.008174623 0.008174623 0.008174623
## 2024 0.008174623 0.008174623 0.008174623 0.008174623 0.008174623 0.008174623
## 2025
fit_darima$fitted %>% tk_tbl
## # A tibble: 123 x 2
##    index         value
##    <yearmon>     <dbl>
##  1 1 2013     0.00794 
##  2 2 2013     0.0101  
##  3 3 2013     0.00795 
##  4 4 2013     0.00147 
##  5 5 2013     0.00415 
##  6 6 2013     0.00417 
##  7 7 2013     0.0129  
##  8 8 2013    -0.000479
##  9 9 2013     0.0158  
## 10 10 2013    0.00488 
## # ... with 113 more rows
ts_dsp500_return %>% tk_tbl
## # A tibble: 123 x 2
##    index       value
##    <yearmon>   <dbl>
##  1 1 2013     0     
##  2 2 2013     0.0110
##  3 3 2013     0.0354
##  4 4 2013     0.0179
##  5 5 2013     0.0206
##  6 6 2013    -0.0151
##  7 7 2013     0.0483
##  8 8 2013    -0.0318
##  9 9 2013     0.0293
## 10 10 2013    0.0436
## # ... with 113 more rows