Time Series Data Analysis

Nitin Mittapally

Dataset Introduction

The data we have contains the quarterly disposable income in Japan during the period of 1961 to 1981 Reading first this data into R Studio

data = read.csv("./data.csv")
testData = data[81:84,]
data = data[1:80,]
data = ts(data)
Visualizing & Transforming the data
data %>% ggtsdisplay()

we see non-constant variance across the given time period, applying the log transformation on the data

data %>% log() %>%  ggtsdisplay()

Now the variance looks almost constant across.we see seasonality in the data with cycle period of 4 removing the seasonality

data %>% log() %>% diff(lag = 4) %>%  ggtsdisplay()

Checking if the series is stationary
data %>% log() %>%  diff(lag=4) %>%  adf.test()
## 
##  Augmented Dickey-Fuller Test
## 
## data:  .
## Dickey-Fuller = -3.0763, Lag order = 4, p-value = 0.1362
## alternative hypothesis: stationary

we see series is not stationary yet. Trying diff on the series

data %>% log() %>%  diff(lag=4) %>% diff() %>%  adf.test()
## Warning in adf.test(.): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  .
## Dickey-Fuller = -6.6486, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary

The test confirms that series is now stationary

Fitting the model

modelPerfomance = data.frame()
data %>% log() %>%  diff(lag=4) %>% diff() %>% ggtsdisplay()

From the above graph we see, for seasonality, acf gradually decreases and have spike at lag 4. we may try fitting ar(1) and for non seasonal we can try arma(1,1)

(fit1 = data %>%  log() %>%  
    Arima(order=c(1,1,1), seasonal = list(order=c(1,1,0), period = 4)))
## Series: . 
## ARIMA(1,1,1)(1,1,0)[4] 
## 
## Coefficients:
##           ar1      ma1     sar1
##       -0.0035  -0.6777  -0.3222
## s.e.   0.2265   0.1833   0.1362
## 
## sigma^2 estimated as 0.008633:  log likelihood=72.72
## AIC=-137.44   AICc=-136.87   BIC=-128.17
coeftest(fit1)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ar1  -0.003455   0.226517 -0.0153 0.9878305    
## ma1  -0.677694   0.183293 -3.6973 0.0002179 ***
## sar1 -0.322192   0.136191 -2.3657 0.0179946 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelPerfomance = rbind(modelPerfomance, list("order" = "111*110", "AIC"= fit1$aic, "BIC" = fit1$bic))

we see coefficients are not significant. Trying by removing non significant coeffs

(fit1 = data %>%  log() %>%  
    Arima(order=c(0,1,1), seasonal = list(order=c(1,1,0), period = 4)))
## Series: . 
## ARIMA(0,1,1)(1,1,0)[4] 
## 
## Coefficients:
##           ma1     sar1
##       -0.6800  -0.3211
## s.e.   0.1002   0.1161
## 
## sigma^2 estimated as 0.008515:  log likelihood=72.72
## AIC=-139.44   AICc=-139.1   BIC=-132.49
coeftest(fit1)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value Pr(>|z|)    
## ma1  -0.68003    0.10022 -6.7851 1.16e-11 ***
## sar1 -0.32110    0.11613 -2.7651 0.005691 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
checkresiduals(fit1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)(1,1,0)[4]
## Q* = 7.0015, df = 8, p-value = 0.5365
## 
## Model df: 2.   Total lags used: 10
modelPerfomance = rbind(modelPerfomance, list("order" = "011*110", "AIC"= fit1$aic, "BIC" = fit1$bic))

this model looks good. we don’t see any kind of pattern in the residuals. The ACF of residuals is similar to that of white noise and the distribution looks similar to normal distribution. This may be a potential model we are looking for. lets try some other models

(fit1 = data %>%  log() %>%  
    Arima(order=c(0,1,1), seasonal = list(order=c(1,1,1), period = 4)))
## Series: . 
## ARIMA(0,1,1)(1,1,1)[4] 
## 
## Coefficients:
##           ma1     sar1     sma1
##       -0.6752  -0.2073  -0.1316
## s.e.   0.1003   0.3134   0.3174
## 
## sigma^2 estimated as 0.008611:  log likelihood=72.81
## AIC=-137.61   AICc=-137.04   BIC=-128.34
coeftest(fit1)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ma1  -0.67520    0.10026 -6.7343 1.647e-11 ***
## sar1 -0.20730    0.31343 -0.6614    0.5084    
## sma1 -0.13157    0.31740 -0.4145    0.6785    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelPerfomance = rbind(modelPerfomance, list("order" = "011*111", "AIC"= fit1$aic, "BIC" = fit1$bic))

(fit1 = data %>%  log() %>%  
    Arima(order=c(0,1,2), seasonal = list(order=c(1,1,0), period = 4)))
## Series: . 
## ARIMA(0,1,2)(1,1,0)[4] 
## 
## Coefficients:
##           ma1     ma2     sar1
##       -0.6806  0.0013  -0.3217
## s.e.   0.1149  0.1171   0.1282
## 
## sigma^2 estimated as 0.008633:  log likelihood=72.72
## AIC=-137.44   AICc=-136.87   BIC=-128.17
coeftest(fit1)
## 
## z test of coefficients:
## 
##        Estimate Std. Error z value Pr(>|z|)    
## ma1  -0.6806280  0.1149013 -5.9236 3.15e-09 ***
## ma2   0.0012993  0.1170972  0.0111  0.99115    
## sar1 -0.3216522  0.1282378 -2.5082  0.01213 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelPerfomance = rbind(modelPerfomance, list("order" = "012*110", "AIC"= fit1$aic, "BIC" = fit1$bic))

(fit1 = data %>%  log() %>%  
    Arima(order=c(0,1,1), seasonal = list(order=c(2,1,0), period = 4)))
## Series: . 
## ARIMA(0,1,1)(2,1,0)[4] 
## 
## Coefficients:
##           ma1     sar1     sar2
##       -0.6711  -0.3465  -0.0664
## s.e.   0.1012   0.1257   0.1270
## 
## sigma^2 estimated as 0.008598:  log likelihood=72.86
## AIC=-137.71   AICc=-137.14   BIC=-128.44
coeftest(fit1)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ma1  -0.671053   0.101246 -6.6280 3.404e-11 ***
## sar1 -0.346455   0.125683 -2.7566  0.005841 ** 
## sar2 -0.066438   0.127001 -0.5231  0.600885    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelPerfomance = rbind(modelPerfomance, list("order" = "011*210", "AIC"= fit1$aic, "BIC" = fit1$bic))

(fit1 = data %>%  log() %>%  
    Arima(order=c(0,1,2), seasonal = list(order=c(2,1,0), period = 4)))
## Series: . 
## ARIMA(0,1,2)(2,1,0)[4] 
## 
## Coefficients:
##           ma1     ma2     sar1     sar2
##       -0.6722  0.0024  -0.3476  -0.0665
## s.e.   0.1164  0.1169   0.1378   0.1270
## 
## sigma^2 estimated as 0.008719:  log likelihood=72.86
## AIC=-135.71   AICc=-134.84   BIC=-124.13
coeftest(fit1)
## 
## z test of coefficients:
## 
##        Estimate Std. Error z value  Pr(>|z|)    
## ma1  -0.6722455  0.1164044 -5.7751 7.691e-09 ***
## ma2   0.0024399  0.1169191  0.0209   0.98335    
## sar1 -0.3476439  0.1377838 -2.5231   0.01163 *  
## sar2 -0.0664804  0.1269763 -0.5236   0.60058    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelPerfomance = rbind(modelPerfomance, list("order" = "012*210", "AIC"= fit1$aic, "BIC" = fit1$bic))
modelPerfomance
##     order       AIC       BIC
## 1 111*110 -137.4412 -128.1712
## 2 011*110 -139.4410 -132.4885
## 3 011*111 -137.6139 -128.3440
## 4 012*110 -137.4411 -128.1711
## 5 011*210 -137.7135 -128.4436
## 6 012*210 -135.7140 -124.1265

from the above metrics we see 011*110 is the best model

(model = data %>%   
    Arima(order=c(0,1,1), seasonal = list(order=c(1,1,0), period = 4), lambda = 0))
## Series: . 
## ARIMA(0,1,1)(1,1,0)[4] 
## Box Cox transformation: lambda= 0 
## 
## Coefficients:
##           ma1     sar1
##       -0.6800  -0.3211
## s.e.   0.1002   0.1161
## 
## sigma^2 estimated as 0.008515:  log likelihood=72.72
## AIC=-139.44   AICc=-139.1   BIC=-132.49

Forecasting data

(fVals = model %>%  forecast(h=4))
##    Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## 81       15.99057 14.20709 17.99794 13.344959 19.16067
## 82       15.21812 13.44117 17.22999 12.586109 18.40054
## 83       16.73152 14.69478 19.05056 13.718968 20.40560
## 84       11.57187 10.10846 13.24714  9.410249 14.23004

comparing actual values with the forecasted values

eval  = data.frame()
eval  = rbind(eval,c(  15.99,   16.20,   (15.99 - 16.20)/16.20*100 ))
eval  = rbind(eval,c(   15.21,   14.67,   (15.21 - 14.67)/14.67*100 ))
eval  = rbind(eval,c(   16.73,   16.02,   (16.73 - 16.02)/16.02*100 ))
eval  = rbind(eval,c(   11.57,   11.61,   (11.57 - 11.61)/11.61*100 ))
colnames(eval) =  c("forecasted", "actual", "pct%")
eval
##   forecasted actual       pct%
## 1      15.99  16.20 -1.2962963
## 2      15.21  14.67  3.6809816
## 3      16.73  16.02  4.4319600
## 4      11.57  11.61 -0.3445306

we see the forecasted values are closed to the actual values.

Forecasting and Ploting the data

model %>%  forecast(h=4) %>% autoplot()

Generated Model Equation

\[ (1-B) (1-B^4) X_{t} = -0.32 * X_{t-4} + a_{t} - 0.68 * a_{t-1} \]