This is short personal reminder and some WEB links on ARIMA analysis. The question for this the ARIMA analysis is:

What will the Dow Jones Industrial Average daily closing value be on 18 August 2014?

My goal is to forecast Dow Jones Industrial Average index value in 4 days with the following data:
1. All available historical data of the index.

1. Downloading

Dow Jones Industrial Average data I have downloaded from Yahoo.

library(quantmod)

# Dow Jones Industrial Average
getSymbols("DJIA",src="yahoo")
## [1] "DJIA"

The data in CSV format also can be obtained from Federal Reserve bank of St. Luise and also from other sources. For manually double check the data and my generated plots I used Google Finance.

2. Descriptive analysis

library(UsingR)
library(ggplot2)
library(tseries)
library(psych)

# as I understood :
# zoo package mask as.Date form base function and do not import namespaces of other packages which extend it. 
# a workaround founded in internet 
# http://r.789695.n4.nabble.com/issues-with-zoo-masking-as-Date-function-resulting-in-issues-with-as-Date-td3905640.html

# this issue is only for Rmd file. 

as.Date.timeDate <- timeDate:::as.Date.timeDate 

# saving all data as data frame
duomenys.org <- as.data.frame(DJIA[,1:6])

# copy of the original data
duomenys <- duomenys.org[1:(dim(duomenys.org)[1] - 4), ]
duomenys$DJIA.Date <- as.Date(as.character(rownames(duomenys.org)), format="%Y-%m-%d")[1:(dim(duomenys.org)[1] - 4)]

Some summary statistics and plots.

str(duomenys)
## 'data.frame':    1927 obs. of  7 variables:
##  $ DJIA.Open    : num  12460 12467 12480 12394 12425 ...
##  $ DJIA.High    : num  12630 12510 12504 12445 12517 ...
##  $ DJIA.Low     : num  12374 12405 12327 12338 12338 ...
##  $ DJIA.Close   : num  12475 12481 12398 12423 12417 ...
##  $ DJIA.Volume  : num  3.43e+09 3.00e+09 2.92e+09 2.76e+09 3.04e+09 ...
##  $ DJIA.Adjusted: num  12475 12481 12398 12423 12417 ...
##  $ DJIA.Date    : Date, format: "2007-01-03" "2007-01-04" ...
summary(duomenys)
##    DJIA.Open       DJIA.High        DJIA.Low       DJIA.Close   
##  Min.   : 6547   Min.   : 6758   Min.   : 6440   Min.   : 6547  
##  1st Qu.:10837   1st Qu.:10965   1st Qu.:10720   1st Qu.:10836  
##  Median :12475   Median :12592   Median :12375   Median :12480  
##  Mean   :12369   Mean   :12487   Mean   :12252   Mean   :12373  
##  3rd Qu.:13545   3rd Qu.:13638   3rd Qu.:13440   3rd Qu.:13549  
##  Max.   :17133   Max.   :17228   Max.   :17056   Max.   :17138  
##   DJIA.Volume       DJIA.Adjusted     DJIA.Date         
##  Min.   :2.20e+06   Min.   : 6547   Min.   :2007-01-03  
##  1st Qu.:3.30e+09   1st Qu.:10836   1st Qu.:2008-11-29  
##  Median :3.88e+09   Median :12480   Median :2010-10-28  
##  Mean   :4.14e+09   Mean   :12373   Mean   :2010-10-29  
##  3rd Qu.:4.72e+09   3rd Qu.:13549   3rd Qu.:2012-09-25  
##  Max.   :1.15e+10   Max.   :17138   Max.   :2014-08-27
describe(duomenys)
##               vars    n      mean        sd    median   trimmed       mad
## DJIA.Open        1 1927 1.237e+04 2.301e+03 1.248e+04 1.237e+04      2045
## DJIA.High        2 1927 1.249e+04 2.289e+03 1.259e+04 1.249e+04      2044
## DJIA.Low         3 1927 1.225e+04 2.316e+03 1.237e+04 1.226e+04      2079
## DJIA.Close       4 1927 1.237e+04 2.303e+03 1.248e+04 1.238e+04      2050
## DJIA.Volume      5 1927 4.135e+09 1.288e+09 3.878e+09 4.003e+09 999820962
## DJIA.Adjusted    6 1927 1.237e+04 2.303e+03 1.248e+04 1.238e+04      2050
## DJIA.Date*       7 1927 1.491e+04 8.071e+02 1.491e+04 1.491e+04      1036
##                   min       max     range  skew kurtosis        se
## DJIA.Open        6547 1.713e+04 1.059e+04 -0.01    -0.41 5.241e+01
## DJIA.High        6758 1.723e+04 1.047e+04  0.00    -0.42 5.215e+01
## DJIA.Low         6440 1.706e+04 1.062e+04 -0.02    -0.40 5.277e+01
## DJIA.Close       6547 1.714e+04 1.059e+04 -0.01    -0.41 5.245e+01
## DJIA.Volume   2200000 1.146e+10 1.145e+10  1.19     2.46 2.933e+07
## DJIA.Adjusted    6547 1.714e+04 1.059e+04 -0.01    -0.41 5.245e+01
## DJIA.Date*      13516 1.631e+04 2.793e+03  0.00    -1.20 1.839e+01

Main plot

mindate <- duomenys$DJIA.Date[1]
maxdate <- duomenys$DJIA.Date[length(duomenys$DJIA.Date)]
ttl <- paste("Dow Jones Industrial Average between", 
             mindate, "and", maxdate, sep = " ")

ggplot(data=duomenys, aes(x=DJIA.Date, y=DJIA.Close)) + 
    geom_line() + 
    xlab("") + 
    ylab("DJIA") +
    ggtitle(ttl)

plot of chunk main_plot

ARIMA model is very good explained in Econometrics Academy internet page.

Making Augmented Dickey-Fuller Test

adf.test(duomenys$DJIA.Close, alternative="stationary", k=0)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  duomenys$DJIA.Close
## Dickey-Fuller = -1.609, Lag order = 0, p-value = 0.7438
## alternative hypothesis: stationary
adf.test(duomenys$DJIA.Close, alternative="stationary")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  duomenys$DJIA.Close
## Dickey-Fuller = -1.424, Lag order = 12, p-value = 0.8221
## alternative hypothesis: stationary

We can not use these data, because they are not stationary. We are calculating distance and making Augmented Dickey-Fuller Test again.

close.diff <- diff(duomenys$DJIA.Close)
adf.test(close.diff, alternative="stationary", k=0)
## Warning: p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  close.diff
## Dickey-Fuller = -48.67, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary
adf.test(close.diff, alternative="stationary")
## Warning: p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  close.diff
## Dickey-Fuller = -11.8, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary

Now the data are stationary. This is important for the ARIMA model.

Now I’m making ACF and PACF with original data and with the differences.

# ACF and PACF
acf(duomenys$DJIA.Close, main="all data")

plot of chunk ACF

pacf(duomenys$DJIA.Close, main="all data")

plot of chunk ACF

acf(close.diff, main="differences, all data")

plot of chunk ACF

pacf(close.diff, main="differences, all data")

plot of chunk ACF

The results are the same. For calculation of the ARIMA model - I must to use differences.

I’m making possibles/interesting orders for me ARIMA models and aic data frame for answers

# order list

order.list <- list(
    "1,0,0" = c(1,0,0), # ARIMA(1,0,0) or AR(1)
    "2,0,0" = c(2,0,0), # ARIMA(2,0,0) or AR(2)
    "0,0,1" = c(0,0,1), # ARIMA(0,0,1) or MA(1)
    "1,0,1" = c(1,0,1), # ARIMA(1,0,1) or AR(1) MA(1)
    "1,1,0" = c(1,1,0), # ARIMA(1,1,0)
    "0,1,1" = c(0,1,1), # ARIMA(0,1,1)
    "1,1,1" = c(1,1,1), # ARIMA(1,1,1)
    "1,1,3" = c(1,1,3), # ARIMA(1,1,3)
    "2,1,3" = c(2,1,3) # ARIMA(2,1,3)
)


aic.df <- data.frame(ARIMA.mod=character(length(order.list)), 
                     AIC=numeric(length(order.list)), 
                     stringsAsFactors=F)

Now with the created orders I’m calculating ARIMA models and saving AIC value to the aic.df. Then I’m plotting aic values and selecting the minimum value. I will use this order to make my final model and to predict future values.

for(i in 1:length(order.list)){
    aic.df[i, 1] <- names(order.list)[i]
    aic.df[i, 2] <- arima(duomenys$DJIA.Close, order = order.list[[i]])$aic
}

aic.df$ARIMA.mod <- as.factor(aic.df$ARIMA.mod)

ggplot(data=subset(x = aic.df, subset = aic.df$AIC<max(aic.df$AIC)), aes(x=ARIMA.mod, y=AIC)) + 
    geom_bar(stat="identity") + 
    coord_cartesian(ylim=c(min(aic.df$AIC) - 50, min(aic.df$AIC) + 75)) +
    geom_hline(yintercept=min(aic.df$AIC), color="red") +
    ggtitle("AIC, all data")

plot of chunk Arima

As we can see the best order for ARIMA model is 0,1,1.

For this analysis I will make also model with simplest order 1,0,0, because I’m expecting the best result from it.

model1 <- arima(duomenys$DJIA.Close, order = order.list[[match(min(aic.df$AIC),aic.df$AIC)]])

model2 <- arima(duomenys$DJIA.Close, order = order.list[[1]])

Now making prediction with model2 and printing next 4 predicted values and expected values.

DJIA.pred1 <- predict(model1, n.ahead = 4)
DJIA.pred2 <- predict(model2, n.ahead = 4)
# prog.visi_old<- data.frame(minus_se = DJIA.pred$pred - 1 * DJIA.pred$se, 
#                        pred = DJIA.pred$pred,
#                        plus_se = DJIA.pred$pred + 1 * DJIA.pred$se, 
#                        expected = duomenys.org$DJIA.Close[(dim(duomenys.org)[1] - 3):dim(duomenys.org)[1]])

prog.visi<- data.frame(pred_min_aic = DJIA.pred1$pred, 
                       pred_simle = DJIA.pred2$pred, 
                       expected = duomenys.org$DJIA.Close[(dim(duomenys.org)[1] - 3):dim(duomenys.org)[1]])


prog.visi$diff_aic_exp <- prog.visi$pred_min_aic/prog.visi$expected - 1
prog.visi$diff_siml_exp <- prog.visi$pred_simle/prog.visi$expected - 1

knitr::kable(prog.visi) 
pred_min_aic pred_simle expected diff_aic_exp diff_siml_exp
17120 17116 17080 0.0024 0.0021
17120 17109 17098 0.0013 0.0006
17120 17103 17068 0.0031 0.0021
17120 17096 17078 0.0024 0.0011

As we can see from the table above, the simplest model give the better result as that with the smallest aic.