library(tseries)
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(forecast)
## Loading required package: timeDate
## This is forecast 6.2
# PROBLEM 3
  # Consider the monthly time series for Housing Starts (New Privately Owned Housing Units), FRED/HOUSTNSA.
  # Follow the Box-Jenkins methodology to build a time series model based on the data until the end of 2013.
  # After you check the model for adequacy, use it to produce and plot a forecast until the end of 2016.

METHODOLOGY

hs_d <- read.csv(file="C:\\Users\\t420\\Desktop\\Housing.csv", header=TRUE, sep=",")

hs <- ts(hs_d[,2],start=c(1959,1), frequency=12)
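
The same series can also be pulled straight from FRED rather than from a local CSV. A minimal sketch, assuming the quantmod package is installed and an internet connection is available (hs_fred is introduced only for illustration):

library(quantmod)
getSymbols("HOUSTNSA", src = "FRED")                  # downloads HOUSTNSA from FRED as an xts object
hs_fred <- ts(as.numeric(HOUSTNSA), start = c(1959, 1), frequency = 12)
hs_fred <- window(hs_fred, end = c(2016, 1))          # restrict to the same span as the CSV file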

str(hs)
##  Time-Series [1:685] from 1959 to 2016: 73.6 77.5 89.9 90.9 111.6 ...
head(hs)
## [1]  73.6  77.5  89.9  90.9 111.6  99.2
tail(hs)
## [1] 147.8 152.5 150.8 127.7  99.0  96.2
tsdisplay(hs, lag.max = 100, xlab=" Years 1959-2015", ylab="Housing Units", main="Monthly Privately Owned Housing Units")

The number of privately owned housing units fluctuates strongly over the period. Let’s check its stationarity using the ADF and KPSS tests.

adf.test(hs)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  hs
## Dickey-Fuller = -2.6015, Lag order = 8, p-value = 0.3237
## alternative hypothesis: stationary
kpss.test(hs,null="Trend")
## Warning in kpss.test(hs, null = "Trend"): p-value smaller than printed p-
## value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  hs
## KPSS Trend = 0.37489, Truncation lag parameter = 6, p-value = 0.01
#The p-value of the ADF test is 0.3237, greater than alpha, so we fail to reject the null hypothesis of a unit root: the data are not stationary. For the KPSS test the p-value is below alpha, so we reject the null hypothesis of trend stationarity, which also indicates non-stationarity. Therefore, let's look at the first difference.
dhs <- diff(hs)
tsdisplay(dhs, lag.max = 100, xlab="Years 1959-2015", ylab="Change in Housing Units", main="First-Differenced Housing Units from 1959 to 2015")

#Let's run the ADF and KPSS tests on the differenced series.
adf.test(dhs)
## Warning in adf.test(dhs): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  dhs
## Dickey-Fuller = -21.289, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
#The ADF test p-value (at most 0.01) is below alpha, so we reject the null hypothesis of a unit root: the first-differenced data are stationary.
kpss.test(dhs)
## Warning in kpss.test(dhs): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  dhs
## KPSS Level = 0.015847, Truncation lag parameter = 6, p-value = 0.1
#The KPSS test p-value (at least 0.1) is above alpha, so we fail to reject the null hypothesis of level stationarity: the first-differenced data are stationary.
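
As a cross-check on the manual tests, the forecast package can suggest the number of differences automatically. A quick sketch (the suggested values depend on the underlying unit-root and seasonal tests and on the package version, so treat them as a sanity check):

ndiffs(hs)    # suggested number of regular (non-seasonal) differences
nsdiffs(hs)   # suggested number of seasonal differences for the monthly series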

Seasonality Adjustment

#The data are stationary after first differencing, but the plots still show a strong seasonal pattern at lag 12. Here we take one more regular difference; an alternative seasonal (lag-12) difference is sketched after the plot below.
sdhs <- diff(diff(hs))
tsdisplay(sdhs, lag.max = 12, xlab="Years 1959-2015", ylab="Housing Units", main="Twice-Differenced Housing Units from 1959 to 2015")
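
Note that diff(diff(hs)) is a second regular difference rather than a seasonal one. The textbook Box-Jenkins seasonal adjustment differences at the seasonal lag; a minimal sketch of that alternative (the object shs is introduced only for illustration and is not used in the fits below):

shs <- diff(diff(hs), lag = 12)   # first regular difference, then a lag-12 seasonal difference
tsdisplay(shs, lag.max = 48, main="First- and Seasonally-Differenced Housing Units")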

#Split the series into a training part (through December 2013) and a hold-out part (January 2014 onward)
sdhsall <- sdhs
sdhs_2013 <- window(sdhsall, end=c(2013,12))
sdhs_2014 <- window(sdhsall, start=c(2014,1))
adf.test(sdhs_2013)
## Warning in adf.test(sdhs_2013): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  sdhs_2013
## Dickey-Fuller = -17.714, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(sdhs_2013,null="Trend")
## Warning in kpss.test(sdhs_2013, null = "Trend"): p-value greater than
## printed p-value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  sdhs_2013
## KPSS Trend = 0.0058439, Truncation lag parameter = 5, p-value =
## 0.1

Here both tests suggest that the data are stationary. Let’s analyse the ACF and PACF.

tsdisplay(sdhs_2013, lag.max = 12, xlab="Years 1959-2013", ylab="Housing Units", main="Twice-Differenced Housing Units from 1959 to 2013")

Based on the analysis of the spikes in the ACF and PACF plots, let’s estimate several candidate models. There are no clearly significant spikes in the PACF, while in the ACF the probable significant spikes are around lags 5, 6 and 12. Let’s check them.
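
As a quick sanity check on these hand-picked orders, an automatic search can also be run on the already twice-differenced training series. A sketch, assuming forecast::auto.arima (no further differencing is allowed here, and its selection may differ from the models fitted below):

auto.arima(sdhs_2013, d = 0, D = 0)   # automatic ARMA / seasonal-ARMA order search on the differenced data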

arm <- arima(sdhs_2013, order=c(0,0,5), seasonal=list(order=c(0,0,1),period=12))
tsdisplay(residuals(arm))

arm
## 
## Call:
## arima(x = sdhs_2013, order = c(0, 0, 5), seasonal = list(order = c(0, 0, 1), 
##     period = 12))
## 
## Coefficients:
##           ma1     ma2      ma3      ma4     ma5    sma1  intercept
##       -1.1109  0.1053  -0.1783  -0.1204  0.3042  0.4159    -0.0007
## s.e.   0.0400  0.0612   0.0541   0.0652  0.0410  0.0308     0.0017
## 
## sigma^2 estimated as 197.1:  log likelihood = -2677.24,  aic = 5370.47
tsdiag(arm,gof.lag=12)

arm_1 <- arima(sdhs_2013, order=c(0,0,7), seasonal=list(order=c(0,0,1),period=12))
tsdisplay(residuals(arm_1))

arm_1
## 
## Call:
## arima(x = sdhs_2013, order = c(0, 0, 7), seasonal = list(order = c(0, 0, 1), 
##     period = 12))
## 
## Coefficients:
##           ma1     ma2      ma3      ma4     ma5     ma6     ma7    sma1
##       -1.0834  0.1123  -0.2288  -0.0721  0.1630  0.1057  0.0033  0.4182
## s.e.   0.0421  0.0596   0.0604   0.0677  0.0678  0.0685  0.0432  0.0309
##       intercept
##         -0.0007
## s.e.     0.0015
## 
## sigma^2 estimated as 195:  log likelihood = -2673.69,  aic = 5367.39
tsdiag(arm_1,gof.lag=12)

arm_2 <- arima(sdhs_2013, order=c(0,0,12), seasonal=list(order=c(0,0,1),period=12))
tsdisplay(residuals(arm_2))

arm_2
## 
## Call:
## arima(x = sdhs_2013, order = c(0, 0, 12), seasonal = list(order = c(0, 0, 1), 
##     period = 12))
## 
## Coefficients:
##           ma1     ma2      ma3      ma4     ma5      ma6      ma7      ma8
##       -1.1305  0.1067  -0.0992  -0.0349  0.1798  -0.0819  -0.0136  -0.1805
## s.e.   0.0377  0.0570   0.0570   0.0579  0.0576   0.0569   0.0551   0.0535
##          ma9    ma10    ma11     ma12    sma1  intercept
##       0.2715  0.0172  0.2412  -0.2758  0.4300    -0.0007
## s.e.  0.0578  0.0612  0.0561   0.0359  0.0306     0.0021
## 
## sigma^2 estimated as 176.7:  log likelihood = -2641.58,  aic = 5313.17
tsdiag(arm_2,gof.lag=12)

Of the models estimated, ARIMA(0,0,12)(0,0,1)[12] has the lowest AIC (5313.17), and almost all of the Ljung-Box p-values are close to 0.6, well above 0.05. That makes it the best of these candidates for forecasting.
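
The same comparison can be made explicit. A short sketch: tabulate the three AICs and run a Ljung-Box test on the residuals of the chosen model (fitdf = 13 counts the 12 MA plus 1 seasonal MA coefficients):

AIC(arm, arm_1, arm_2)                                                 # side-by-side AIC table for the three fits
Box.test(residuals(arm_2), lag = 24, type = "Ljung-Box", fitdf = 13)   # residual whiteness check for the chosen model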

ARMA.fcast1 <- forecast(arm_2, h=36)
plot(ARMA.fcast1, xlim=c(2000,2016))
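
Because the model was fitted to the twice-differenced series, the forecast above is on that differenced scale, as is the 2014-onward hold-out sdhs_2014. Out-of-sample accuracy can therefore be checked directly; a sketch using forecast::accuracy (only the months where the forecast and the hold-out overlap are compared):

accuracy(ARMA.fcast1, sdhs_2014)   # training- and test-set error measures (ME, RMSE, MAE, ...)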