Introduction
For this model development we will follow the Box-Jenkins methodology to build a time series model for Housing Starts (New Privately Owned Housing Units). The problem requires that we subset the data into two parts, the first will be the estimation (training) sample and the second the hold-out (out-of-sample) set.
I will import the time series data for Housing Starts (New Privately Owned Housing Units) from the Quandl website.
Before I get started with loading the data I need to take care of a few house keeping issues. I will load/require the necessary packages that will be used during this model development.
require(forecast)
require(Quandl)
require(dygraphs)
require(DT)
Next I will load the data.
Quandl.api_key('Ltw-PAye5rkz6MwzLNx-')
HOUSTNSA_all <- Quandl("FRED/HOUSTNSA", type="zoo")
A quick inspection shows that this data of Housing Starts (New Privately Owned Housing Units) is at monthly frequency and the data is available for the periods January 1959 to December 2015.
str(HOUSTNSA_all)
## 'zooreg' series from Jan 1959 to Dec 2015
## Data: num [1:684] 96.2 99 127.7 150.8 152.5 ...
## Index: Class 'yearmon' num [1:684] 1959 1959 1959 1959 1959 ...
## Frequency: 12
The problem call for us to subset the data into two parts. The first part of the data will go from January 1959 to December 2013. The hold-out portion will go from January 2014 until the end of the data set. However, we will use our model to forecast until the end of 2016.
HOUSTNSA_1 <- window(HOUSTNSA_all, end="2013-12")
HOUSTNSA_2 <- window(HOUSTNSA_all, start=2014)
HOUSTNSA <- HOUSTNSA_1
A plot of the original time series data will allow us to quickly determine if any transformations of the data are necessary. A note has been made on the plot indicating the region of the data that will be held-out.
# transformations on time series data
log_HOUSTNSA <- log(HOUSTNSA)
diff_HOUSTNSA <- diff(HOUSTNSA)
diff12_diff_HOUSTNSA <- diff(diff_HOUSTNSA, lag=12)
# Model 1
m1 <- Arima(HOUSTNSA, order=c(2,1,0), seasonal= list(order=c(2,1,0), period= 12))
tsdiag(m1, gof.lag=12)
m1
## Series: HOUSTNSA
## ARIMA(2,1,0)(2,1,0)[12]
##
## Coefficients:
## ar1 ar2 sar1 sar2
## -0.3656 -0.0988 -0.5122 -0.2385
## s.e. 0.0395 0.0393 0.0387 0.0384
##
## sigma^2 estimated as 139.8: log likelihood=-2518.07
## AIC=5046.14 AICc=5046.23 BIC=5068.5
m1.fcast <- forecast(m1, h=36)
# Model 2
m2 <- Arima(HOUSTNSA, order=c(2,1,0), seasonal= list(order=c(2,1,1), period= 12))
tsdiag(m2, gof.lag=12)
m2
## Series: HOUSTNSA
## ARIMA(2,1,0)(2,1,1)[12]
##
## Coefficients:
## ar1 ar2 sar1 sar2 sma1
## -0.3345 -0.0914 0.1432 -0.0113 -0.8851
## s.e. 0.0400 0.0394 0.0460 0.0440 0.0249
##
## sigma^2 estimated as 114.6: log likelihood=-2459.89
## AIC=4931.79 AICc=4931.92 BIC=4958.62
m2.fcast <- forecast(m2, h=36)
# Model 3
m3 <- Arima(HOUSTNSA, order=c(2,1,1), seasonal= list(order=c(2,1,1), period= 12))
tsdiag(m3, gof.lag=12)
m3
## Series: HOUSTNSA
## ARIMA(2,1,1)(2,1,1)[12]
##
## Coefficients:
## ar1 ar2 ma1 sar1 sar2 sma1
## -0.5063 -0.1446 0.1734 0.1401 -0.0101 -0.8843
## s.e. 0.3317 0.1038 0.3345 0.0465 0.0441 0.0250
##
## sigma^2 estimated as 114.6: log likelihood=-2459.78
## AIC=4933.56 AICc=4933.73 BIC=4964.86
m3.fcast <- forecast(m3, h=36)
# Model 4
m4 <- Arima(HOUSTNSA, order=c(2,1,2), seasonal= list(order=c(2,1,2), period= 12))
tsdiag(m4, gof.lag=12)
m4
## Series: HOUSTNSA
## ARIMA(2,1,2)(2,1,2)[12]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sar1 sar2 sma1 sma2
## -0.4297 -0.178 0.1009 0.0568 -0.7163 0.1478 -0.0322 -0.7619
## s.e. 0.4282 0.134 0.4301 0.1618 0.1605 0.0465 0.1576 0.1363
##
## sigma^2 estimated as 114.3: log likelihood=-2458.99
## AIC=4935.98 AICc=4936.26 BIC=4976.23
m4.fcast <- forecast(m4, h=36)
# Model 4.1
m4.1 <- Arima(HOUSTNSA, order=c(2,1,0), seasonal= list(order=c(2,1,2), period= 12), fixed=c(NA,NA,NA,NA,0,NA))
tsdiag(m4.1, gof.lag=12)
m4.1
## Series: HOUSTNSA
## ARIMA(2,1,0)(2,1,2)[12]
##
## Coefficients:
## ar1 ar2 sar1 sar2 sma1 sma2
## -0.3324 -0.0973 -0.7448 0.1517 0 -0.7910
## s.e. 0.0400 0.0394 0.0403 0.0454 0 0.0386
##
## sigma^2 estimated as 114.4: log likelihood=-2459.23
## AIC=4930.45 AICc=4930.58 BIC=4957.28
m4.1.fcast <- forecast(m4.1, h=36)
# Model 4.2
m4.2 <- Arima(HOUSTNSA, order=c(2,1,1), seasonal= list(order=c(2,1,2), period= 12), fixed=c(NA,NA,NA,NA,NA,0, NA))
tsdiag(m4.2, gof.lag=12)
m4.2
## Series: HOUSTNSA
## ARIMA(2,1,1)(2,1,2)[12]
##
## Coefficients:
## ar1 ar2 ma1 sar1 sar2 sma1 sma2
## -0.5182 -0.1542 0.1878 -0.7473 0.1489 0 -0.7891
## s.e. 0.3121 0.0967 0.3152 0.0405 0.0457 0 0.0388
##
## sigma^2 estimated as 114.4: log likelihood=-2459.08
## AIC=4932.16 AICc=4932.33 BIC=4963.46
m4.2.fcast <- forecast(m4.2, h=36)
Plotting Forecasted Data
Based on the model explorations above it looks like our initial ideals about \(ARIMA(2,1,0)X(2,1,0)_{12}\) was a reasonable starting point but there was much room for improvement. There was indeed an MA component to both the seasonal and non-seasonal. We will plot the forecasting models of all six model explorations and then build a table comparing the AIC, AICc, and BIC results to help us determine which model performs the best.
par(mfrow=c(1,2), cex=0.65)
plot.splineforecast(m1.fcast, type="o", pch=16, xlim=c(2012,2017), main="Model 1: Forecasts for ARIMA(2,1,0)(2,1,0)[12]")
legend(2012, 225,legend=c("Original Data","Forecasted Data","Fitted Data"), col=c("black", "blue", "red"), lty=c(1,1,1))
lines(m1.fcast$mean, type="p", pch=16, col="blue")
lines(HOUSTNSA_all, type="o", pch=16)
plot.splineforecast(m2.fcast, type="o", pch=16, xlim=c(2012,2017), main="Model 2: Forecasts for ARIMA(2,1,0)(2,1,1)[12]")
legend(2012, 225,legend=c("Original Data","Forecasted Data","Fitted Data"), col=c("black", "blue", "red"), lty=c(1,1,1))
lines(m2.fcast$mean, type="p", pch=16, col="blue")
lines(HOUSTNSA_all, type="o", pch=16)
par(mfrow=c(1,2), cex=0.65)
plot.splineforecast(m3.fcast, type="o", pch=16, xlim=c(2012,2017), main="Model 3: Forecasts for ARIMA(2,1,1)(2,1,1)[12]")
legend(2012, 225,legend=c("Original Data","Forecasted Data","Fitted Data"), col=c("black", "blue", "red"), lty=c(1,1,1))
lines(m3.fcast$mean, type="p", pch=16, col="blue")
lines(HOUSTNSA_all, type="o", pch=16)
plot.splineforecast(m4.fcast, type="o", pch=16, xlim=c(2012,2017), main="Model 4: Forecasts for ARIMA(2,1,2)(2,1,2[12]")
legend(2012, 225,legend=c("Original Data","Forecasted Data","Fitted Data"), col=c("black", "blue", "red"), lty=c(1,1,1))
lines(m4.fcast$mean, type="p", pch=16, col="blue")
lines(HOUSTNSA_all, type="o", pch=16)
par(mfrow=c(1,2), cex=0.65)
plot.splineforecast(m4.1.fcast, type="o", pch=16, xlim=c(2012,2017), main="Model 4.1: Forecasts for ARIMA(2,1,0)(2,1,2)[12]: sma1 fixed")
legend(2012, 225,legend=c("Original Data","Forecasted Data","Fitted Data"), col=c("black", "blue", "red"), lty=c(1,1,1))
lines(m4.1.fcast$mean, type="p", pch=16, col="blue")
lines(HOUSTNSA_all, type="o", pch=16)
plot.splineforecast(m4.2.fcast, type="o", pch=16, xlim=c(2012,2017), main="Model 4.2: Forecasts for ARIMA(2,1,1)(2,1,2)[12]: sma1 fixed")
legend(2012, 225,legend=c("Original Data","Forecasted Data","Fitted Data"), col=c("black", "blue", "red"), lty=c(1,1,1))
lines(m4.2.fcast$mean, type="p", pch=16, col="blue")
lines(HOUSTNSA_all, type="o", pch=16)
## Series: HOUSTNSA
## ARIMA(2,1,0)(2,1,2)[12]
##
## Coefficients:
## ar1 ar2 sar1 sar2 sma1 sma2
## -0.3324 -0.0973 -0.7448 0.1517 0 -0.7910
## s.e. 0.0400 0.0394 0.0403 0.0454 0 0.0386
##
## sigma^2 estimated as 114.4: log likelihood=-2459.23
## AIC=4930.45 AICc=4930.58 BIC=4957.28