First test for ARIMA models: are the coefficients significant?
H0: theta = 0 (or phi = 0); Ha: theta != 0 (or phi != 0)
test stat: theta-hat/SE(theta-hat) or phi-hat/SE(phi-hat), compared to a t distribution with
df = n - (# of theta's + # of phi's)
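For reference, a minimal sketch of this t-test in R, assuming a fitted arima() object mod and a series length n (both placeholders here; the real fits come later in these notes):
est   <- coef(mod)["ma1"]                # theta-hat for the MA(1) term
se    <- sqrt(diag(mod$var.coef))["ma1"] # its standard error
tstat <- est / se                        # Wald test statistic
2 * pt(abs(tstat), df = n - length(coef(mod)), lower.tail = FALSE) # two-sided p-value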
Diagnostics: residual analysis. (If the ARIMA model is doing well, the residuals will look like white noise: the Box test statistic will be small and the p-value high.)
H0: “The model adequately models our time series data.” Ha: “The model is inadequate.”
test stats: Box-Pierce stat (the original) or Ljung-Box stat (better); in R: Box.test(mod1$residuals, type = "Ljung")
To improve the model, go back to the ACF & PACF of the residuals, as sketched below.
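A minimal sketch of that residual check, assuming the forecast package is loaded and a fitted model mod1 like the one further down:
Acf(residuals(mod1))  # leftover spikes ==> MA terms still missing
Pacf(residuals(mod1)) # leftover spikes ==> AR terms still missing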
seasonal ARIMA:
ARIMA(p,d,q)x(P,D,Q)S # the second half is the seasonal component
S = 12 for monthly, S = 4 for quarterly; P ~ seasonal AR piece (like p), D ~ seasonal differencing (like d), Q ~ seasonal MA piece (like q)
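As a preview, a sketch of the matching arima() call on a built-in monthly series (AirPassengers; the orders here are placeholders chosen only for illustration):
fit <- arima(log(AirPassengers), order = c(1, 0, 1),
             seasonal = list(order = c(1, 1, 1), period = 12)) # S = 12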
Heavily references TSDecomp.R; code courtesy of Dr. Knudson.
# n births per month in NYC
births <- scan("http://robjhyndman.com/tsdldata/data/nybirths.dat")
birthstimeseries <- ts(births, frequency=12, start=c(1946,1))
TScomp <- decompose(birthstimeseries)
plot(TScomp)
# subtract seasonal component
noseason <- birthstimeseries - TScomp$seasonal
plot(noseason)
# this is what we removed
plot(TScomp$seasonal)
library(forecast)
library(quantmod)
library(tseries)
library(timeSeries)
library(xts)
Seasonal vs. nonseasonal differences:
1st: seasonal differences: y(t) - y(t-S), e.g., y(t) - y(t-4) for quarterly or y(t) - y(t-12) for monthly. These first-order seasonal differences are used to pick D.
2nd: nonseasonal differences (done if a non-flat trend remains after the seasonal differences) to remove the upward/downward trend. Done second because the seasonal differences may already take care of the trend.
Steps:
1. Plot the data to see if you have seasonality.
- If there is seasonality and the data has an increasing trend, transform before going forward (log() seems to work nicely).
2. Take differences to identify D and d.
- If seasonality and no trend: use D >= 1, d = 0.
- If trend but no seasonality: use D = 0, d >= 1.
- If seasonality and trend: take D >= 1 seasonal differences, then check whether the resulting differences still have a trend (if yes, d >= 1). A sketch of this recipe follows.
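A minimal sketch of the recipe on a built-in series (AirPassengers: monthly, with a trend and seasonal swings that grow with the level):
lp  <- log(AirPassengers) # step 1: transform, since the swings grow with the level
sd1 <- diff(lp, lag = 12) # D = 1: first seasonal difference
plot.ts(sd1)              # any trend left?
d1  <- diff(sd1)          # if yes, d = 1: add a nonseasonal difference
plot.ts(d1)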
#### STEP 1: PLOT DATA
births <- scan("http://robjhyndman.com/tsdldata/data/nybirths.dat")
births <- births[-c(1:24)] # drop the first 24 months (1946-1947)
births <- ts(births, frequency=12, start=c(1948,1)) # so the series now starts Jan 1948
par(mfrow=c(2,1))
plot.ts(births) # hard to see seasonality due to noise
#this filter helps smooth the noise, seasonality sticks out
plot.ts(filter(births, sides=2, filter=rep(1/3,3)))
#this filter mostly smooths out the seasonality
par(mfrow=c(1,1))
(wgts <- c(.5, rep(1,11), .5)/12)
## [1] 0.04166667 0.08333333 0.08333333 0.08333333 0.08333333 0.08333333
## [7] 0.08333333 0.08333333 0.08333333 0.08333333 0.08333333 0.08333333
## [13] 0.04166667
plot.ts(filter(births, sides=2, filter = wgts))
#### STEP 2: DIFFS
## First: diffs of lag S to remove seasonality
## Second: diffs to remove trend
## (Skip either step if deemed unnecessary in step 1)
# Step 2, part A:
diff1 <- diff(births, lag = 12)
plot.ts(diff1)
adf.test(diff1) # stationary? p = 0.055 is borderline, but good enough for me.
##
## Augmented Dickey-Fuller Test
##
## data: diff1
## Dickey-Fuller = -3.4166, Lag order = 5, p-value = 0.05488
## alternative hypothesis: stationary
# if a trend remained, we'd take a nonseasonal difference next:
diff2 <- diff(diff1)
# Step 2, part B: doesn't look necessary anymore
# (no trend remaining in diff1)
#### STEP 3: ACF and PACF
## Let's look at the nonseasonal p and q first
Acf(diff1)  # spikes at low lags ==> nonseasonal MA (q >= 1)
Pacf(diff1) # spike at low lag  ==> nonseasonal AR (p >= 1)
# Our model thus far is p>=1, d=0, q>=1
#### STEP 4: ESTIMATE MODEL
# ARIMA (p, d, q) x (P, D, Q) S
# ARIMA (1, 0, 1) x (1, 1, 1) 12
mod1 <- arima(births, order = c(1,0,1), seasonal = list(order = c(1,1,1), period = 12))
# order = the nonseasonal (p,d,q); seasonal = a list with the seasonal (P,D,Q) and S ~ period
#### STEP 5: EXAMINE MODELS, and maybe fit more ~ Box test here
# ARIMA (1, 0, 1) x (1, 1, 1) 12
summary(mod1) # gets coefficients; test the worst one first, i.e., ma1: -0.1564/0.1747 is the Wald stat closest to 0
##
## Call:
## arima(x = births, order = c(1, 0, 1), seasonal = list(order = c(1, 1, 1), period = 12))
##
## Coefficients:
## ar1 ma1 sar1 sma1
## 0.9884 -0.1564 -0.2199 -0.8820
## s.e. 0.0287 0.1747 0.1004 0.1498
##
## sigma^2 estimated as 0.3924: log likelihood = -136.81, aic = 283.61
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.06656964 0.5998188 0.4452406 0.2299257 1.727419 0.3675936
## ACF1
## Training set 0.0410978
Box.test(mod1$residuals, type = "Ljung")
##
## Box-Ljung test
##
## data: mod1$residuals
## X-squared = 0.24832, df = 1, p-value = 0.6183
# no evidence the model is inadequate! (resids small enough)
# but mod1's ma1 is not significantly diff from 0, so let's try removing it
# ARIMA (1, 0, 0) x (1, 1, 1) 12
mod2 <- arima(births, order = c(1,0,0), seasonal = list(order = c(1,1,1), period = 12))
Box.test(mod2$residuals, type = "Ljung") # yay
##
## Box-Ljung test
##
## data: mod2$residuals
## X-squared = 0.62922, df = 1, p-value = 0.4276
summary(mod2) # most Wald test stats look decently far from 0
##
## Call:
## arima(x = births, order = c(1, 0, 0), seasonal = list(order = c(1, 1, 1), period = 12))
##
## Coefficients:
## ar1 sar1 sma1
## 0.9750 -0.2263 -0.8595
## s.e. 0.0326 0.0998 0.1280
##
## sigma^2 estimated as 0.3986: log likelihood = -137.27, aic = 282.55
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.07991759 0.6045416 0.450681 0.28466 1.748639 0.3720852
## ACF1
## Training set -0.06542041
# let's test the parameter with Wald test stat closest to 0
(teststat <- -0.2263/0.0998)
## [1] -2.267535
2*pt(abs(teststat), df = 144-3, lower.tail = FALSE) #p value
## [1] 0.02487795
# therefore, reject null that this parameter = 0
# we should keep this parameter estimate in the model
# and not remove the seasonal ar parameter
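A little helper makes this "test the weakest coefficient" step quicker; a sketch (wald_table is a made-up name, not a library function):
wald_table <- function(mod, df) {
  est   <- coef(mod)                # coefficient estimates
  se    <- sqrt(diag(mod$var.coef)) # their standard errors
  tstat <- est / se                 # Wald test statistics
  pval  <- 2 * pt(abs(tstat), df = df, lower.tail = FALSE)
  cbind(estimate = est, se = se, t = tstat, p = pval)
}
wald_table(mod2, df = 144 - 3) # reproduces the sar1 test above, plus the rest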
### Let's see if we can improve our model.
## we said p >=1, P >=1, Q >=1 and we tried all EQUAL to 1
# ARIMA (2, 0, 0) x (1, 1, 1) 12
mod3 <- arima(births, order = c(2,0,0), seasonal = list(order = c(1,1,1), period = 12))
mod3 # ar2 not significantly diff from 0
##
## Call:
## arima(x = births, order = c(2, 0, 0), seasonal = list(order = c(1, 1, 1), period = 12))
##
## Coefficients:
## ar1 ar2 sar1 sma1
## 0.9275 0.0513 -0.2237 -0.8671
## s.e. 0.0876 0.0881 0.0999 0.1333
##
## sigma^2 estimated as 0.3965: log likelihood = -137.1, aic = 284.21
# ARIMA (1, 0, 0) x (2, 1, 1) 12
mod4 <- arima(births, order = c(1,0,0), seasonal = list(order = c(2,1,1), period = 12))
mod4 # sar2 IS significantly diff from 0
##
## Call:
## arima(x = births, order = c(1, 0, 0), seasonal = list(order = c(2, 1, 1), period = 12))
##
## Coefficients:
## ar1 sar1 sar2 sma1
## 0.9677 -0.4796 -0.3846 -0.6243
## s.e. 0.0268 0.1138 0.1022 0.1227
##
## sigma^2 estimated as 0.3673: log likelihood = -131.72, aic = 273.44
AIC(mod4); AIC(mod2) # we improved! mod4 better than mod2
## [1] 273.4391
## [1] 282.5484
# should we add even more?
# ARIMA (1, 0, 0) x (2, 1, 2) 12
mod5 <- arima(births, order = c(1,0,0), seasonal = list(order = c(2,1,2), period = 12))
mod5 # sma2 (and sar1) NOT significantly diff from 0
##
## Call:
## arima(x = births, order = c(1, 0, 0), seasonal = list(order = c(2, 1, 2), period = 12))
##
## Coefficients:
## ar1 sar1 sar2 sma1 sma2
## 0.9612 -0.2486 -0.3563 -0.8751 0.2615
## s.e. 0.0290 0.2787 0.1310 0.3022 0.2498
##
## sigma^2 estimated as 0.3631: log likelihood = -131.09, aic = 274.19
# so forget mod5. Simpler (mod4) is better
AIC(mod5); AIC(mod4) # roughly the same AIC anyway, so take the simpler model
## [1] 274.1868
## [1] 273.4391
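To line up the AICs of every candidate at once, a quick sketch:
sapply(list(mod1 = mod1, mod2 = mod2, mod3 = mod3,
            mod4 = mod4, mod5 = mod5), AIC) # mod4 should come out lowest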
# Diagnostics for ARIMA (1, 0, 0) x (2, 1, 1) 12
mod4
##
## Call:
## arima(x = births, order = c(1, 0, 0), seasonal = list(order = c(2, 1, 1), period = 12))
##
## Coefficients:
## ar1 sar1 sar2 sma1
## 0.9677 -0.4796 -0.3846 -0.6243
## s.e. 0.0268 0.1138 0.1022 0.1227
##
## sigma^2 estimated as 0.3673: log likelihood = -131.72, aic = 273.44
Box.test(mod4$residuals, type = "Ljung") # model not inadequate :)
##
## Box-Ljung test
##
## data: mod4$residuals
## X-squared = 1.3714, df = 1, p-value = 0.2416
# So our final mod is mod4
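For a fuller look at mod4's residuals, base R's tsdiag() bundles the standardized residuals, their ACF, and Ljung-Box p-values at several lags; a normal Q-Q plot is a reasonable add-on:
tsdiag(mod4) # three diagnostic panels in one shot
qqnorm(mod4$residuals); qqline(mod4$residuals) # roughly normal?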
######### For fun, let's compare what we got to auto.arima
automod <- auto.arima(births)
automod # pretty similar
## Series: births
## ARIMA(1,0,0)(2,1,1)[12] with drift
##
## Coefficients:
## ar1 sar1 sar2 sma1 drift
## 0.6988 -0.4160 -0.3503 -0.6846 0.0447
## s.e. 0.0631 0.1222 0.1093 0.1281 0.0033
##
## sigma^2 estimated as 0.3287: log likelihood=-122.05
## AIC=256.09 AICc=256.77 BIC=273.39
AIC(automod); AIC(mod4)
## [1] 256.0934
## [1] 273.4391
Box.test(automod$residuals)
##
## Box-Pierce test
##
## data: automod$residuals
## X-squared = 0.23843, df = 1, p-value = 0.6253
######### Forecasting
par(mfrow=c(2,1))
plot(forecast(automod, h = 12), main = "auto.arima")
plot(forecast(mod4, h = 12), main = "home-cooked")
# til we're better chefs, let's just eat at the restaurant
# auto.arima isn't Michelin rated, but it's better than we are
# after just 2 days of training
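Either way, if we want numbers rather than pictures, the forecast object holds them; a sketch:
fc <- forecast(mod4, h = 12) # next 12 months
fc$mean                      # point forecasts
fc$lower; fc$upper           # 80% and 95% prediction interval bounds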