時系列分析(ARIMA)

y <- c(AirPassengers) # 月間航空旅客数
n <- length(y)        # 月数
n.tr <- 120           # 訓練月数
n.te <- n - n.tr      # テスト月数
t <- 1:n              # 月
ii.tr <- 1:120        # 訓練期間インデックス
ii.te <- 121:n        # テスト期間インデックス
matplot(y, type = 'o', pch = 1, col = 2)

fq <- 12

library(MASS)
bc <- boxcox(y ~ t, data = data.frame(y = y[ii.tr], t = t[ii.tr]))

lambda <- round(bc$x[which.max(bc$y)], 1)
lambda
## [1] 0
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

y.bc <- BoxCox(y[ii.tr], lambda)

matplot(y.bc, type = 'o', pch = 1, col = 2, ylab = 'y')

d <- 1

dy <- diff(y.bc, diff = d)

matplot(dy, type = 'o', pch = 1, col = 2, ylab = 'y')

ys <- ts(10 * sin(2 * pi * t / 10) + rnorm(n))

dir.create('acf_lag', showWarnings = F)

for (k in 1:25)
{
  png(paste0('acf_lag/acf_lag', k, '.png'))

  ys.lag <- lag(ys, k = -k)
  ts2 <- cbind(ys, ys.lag)

  par(mfrow = c(3, 1), mar = c(2, 4, 1, 1))
  matplot(ts2, type = 'o', pch = 1, col = c(1, 4), 
          xlab = '', ylab = '', xlim = c(0, 120))
  abline(v = c(0, k), col = 2)
  arrows( 0, 0, k, 0, length = 0.1, col = 2)
  mtext(paste(k), side = 1, at = k, line = 1, col = 2)
  legend('topright', col = c(1, 4), bg = 'white', lty = 1,
        legend = c('原系列(周期10の正弦波+ノイズ)', paste0(k, '次ラグ系列')))
  grid()
   acf(ys, lag.max = k, xlim = c(0, 120), ylim = c(-1, 1), main = '')
  pacf(ys, lag.max = k, xlim = c(0, 120), ylim = c(-1, 1), main = '')

  dev.off()
}

a <- acf(dy, lag.max = 50) # MA(q)
grid()
abline(v = fq, col = 3, lty = 3)

q <- 12

pa <- pacf(dy, lag.max = 50) # AR(p)【注意】acfと違いグラフが1次ラグから始まる。
grid()
abline(v = fq, col = 3, lty = 3)

p <- 12

fit.arima <- Arima(ts(y[ii.tr], frequency = fq), lambda = 'auto',
                   order    = c(p, d, q),
                   seasonal = c(1, 1, 0)) # SARIMA

# c.f.
#fit.arima <- auto.arima(ts(y[ii.tr], frequency = fq), lambda = 'auto')
fit.arima
## Series: ts(y[ii.tr], frequency = fq) 
## ARIMA(12,1,12)(1,1,0)[12] 
## Box Cox transformation: lambda= -0.3096643 
## 
## Coefficients:
##           ar1     ar2      ar3     ar4      ar5      ar6     ar7      ar8
##       -0.2542  0.0940  -0.0545  0.0567  -0.0594  -0.0297  0.1077  -0.0028
## s.e.   0.1417  0.1441   0.1368  0.1352   0.1335   0.1306  0.1425   0.1334
##          ar9    ar10     ar11     ar12      ma1      ma2      ma3      ma4
##       0.1405  0.0729  -0.2619  -0.2848  -0.0736  -0.1363  -0.0955  -0.2205
## s.e.  0.1335  0.1311   0.1333   0.1807   0.1472   0.1384   0.1264   0.1472
##          ma5     ma6      ma7      ma8      ma9    ma10    ma11     ma12
##       0.1913  0.2118  -0.1829  -0.0367  -0.1446  0.0096  0.3593  -0.6336
## s.e.  0.1458  0.1479   0.1603   0.1385   0.1426  0.1516  0.1519   0.1503
##         sar1
##       0.1202
## s.e.  0.2297
## 
## sigma^2 = 5.446e-05:  log likelihood = 381.33
## AIC=-710.67   AICc=-693.12   BIC=-641.17
checkresiduals(fit.arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(12,1,12)(1,1,0)[12]
## Q* = 6.3161, df = 3, p-value = 0.09721
## 
## Model df: 25.   Total lags used: 28
pred <- forecast(fit.arima, h = n.te, level = 95)

matplot(t, y, type = 'o', pch = 16, col = 2, main = pred$method)
grid()
abline(v = n.tr + 1, lty  = 3, col = 3)
matlines(t[ii.tr], fit.arima$fitted, type = 'o', pch = 16, lty = 2, col = 4)
matlines(t[ii.te], pred$mean,        type = 'o', pch =  1, lty = 2, col = 4)
matlines(t[ii.te], pred$lower, lty = 3, col = 'gray')
matlines(t[ii.te], pred$upper, lty = 3, col = 'gray')
legend('topleft', bg = 'white', lty = c(1, 2, 2, 3),
       col = c(2, 4, 4, 'gray'), pch = c(16, 16, 1, NA),
       legend = c('原系列', '1期先予測値', '予測値', '95%予測区間'))

get.accuracy <- function(yhat, y)
{
  data.frame(MBE  = mean(yhat - y),
             MAE  = mean(abs(yhat - y)),
             MAPE = mean(abs((yhat - y) / y)) * 100,
             RMSE = sqrt(mean((yhat - y)^2))
  )
}

get.accuracy(yhat = pred$mean, y = y[ii.te])
##         MBE      MAE     MAPE     RMSE
## 1 -14.07091 14.92617 3.421171 17.18222
n <- 24*7*2            # 時間数
t <- 1:n               # 時間
n.tr <- floor(n * 0.8) # 訓練期間数
n.te <- n - n.tr       # テスト期間数
ii.tr <- 1:n.tr        # 訓練期間インデックス
ii.te <- (n.tr+1):n    # テスト期間インデックス

set.seed(5)

y <- 100 + 0.1 * t + 0.02 * t^2 + 100 * sin(2 * pi * t / 24) + rnorm(n, sd = 5)

matplot(y, type = 'o', pch = 1, col = 2)

d <- 2

dy <- diff(y[ii.tr], diff = d)

matplot(dy, type = 'o', pch = 1, col = 2, ylab = 'y')

a <- acf(dy, lag.max = 50) # MA(q)
grid()
abline(v = fq, col = 3, lty = 3)

q <- 2

pa <- pacf(dy, lag.max = 50) # AR(p)【注意】acfと違いグラフが1次ラグから始まる。
grid()
abline(v = fq, col = 3, lty = 3)

p <- 4

fit.arima <- Arima(ts(y[ii.tr], frequency = fq), lambda = 'auto',
                   order    = c(p, d, q),
                   seasonal = c(1, 0, 0))
## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
fit.arima
## Series: ts(y[ii.tr], frequency = fq) 
## ARIMA(4,2,2)(1,0,0)[12] 
## Box Cox transformation: lambda= 0.9190757 
## 
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): 計算結果が NaN になりました
##          ar1     ar2      ar3      ar4      ma1     ma2    sar1
##       0.9297  0.4478  -0.0589  -0.4882  -1.9251  0.9989  0.1034
## s.e.  0.0543  0.0797   0.0799   0.0545      NaN     NaN  0.0639
## 
## sigma^2 = 24.85:  log likelihood = -807.37
## AIC=1630.75   AICc=1631.31   BIC=1659.41
checkresiduals(fit.arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,2,2)(1,0,0)[12]
## Q* = 102.25, df = 17, p-value = 3.397e-14
## 
## Model df: 7.   Total lags used: 24
pred <- forecast(fit.arima, h = n.te, level = 95)

matplot(t, y, type = 'o', pch = 16, col = 2, main = pred$method)
grid()
abline(v = n.tr + 1, lty  = 3, col = 3)
matlines(t[ii.tr], fit.arima$fitted, type = 'o', pch = 16, lty = 2, col = 4)
matlines(t[ii.te], pred$mean,        type = 'o', pch =  1, lty = 2, col = 4)
matlines(t[ii.te], pred$lower, lty = 3, col = 'gray')
matlines(t[ii.te], pred$upper, lty = 3, col = 'gray')
legend('topleft', bg = 'white', lty = c(1, 2, 2, 3),
       col = c(2, 4, 4, 'gray'), pch = c(16, 16, 1, NA),
       legend = c('原系列', '1期先予測値', '予測値', '95%予測区間'))