# load necessary packages

library(Quandl)
## Warning: package 'Quandl' was built under R version 3.3.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.3.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 3.3.3
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 3.3.3
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
## Loading required package: PerformanceAnalytics
## Warning: package 'PerformanceAnalytics' was built under R version 3.3.3
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Warning: package 'quantmod' was built under R version 3.3.3
## Loading required package: TTR
## Warning: package 'TTR' was built under R version 3.3.3
## Version 0.4-0 included new data defaults. See ?getSymbols.
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 3.3.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.8.0     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## Warning: package 'ggplot2' was built under R version 3.3.3
## Warning: package 'tibble' was built under R version 3.3.3
## Warning: package 'tidyr' was built under R version 3.3.3
## Warning: package 'readr' was built under R version 3.3.3
## Warning: package 'purrr' was built under R version 3.3.3
## Warning: package 'dplyr' was built under R version 3.3.3
## Warning: package 'stringr' was built under R version 3.3.3
## Warning: package 'forcats' was built under R version 3.3.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x dplyr::first()           masks xts::first()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x dplyr::last()            masks xts::last()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
## 
## Attaching package: 'tidyquant'
## The following object is masked from 'package:dplyr':
## 
##     as_tibble
## The following object is masked from 'package:tibble':
## 
##     as_tibble
library(forecast)
## Warning: package 'forecast' was built under R version 3.3.3
library(readr)
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 3.3.3
library(zoo)
library(timetk)
## Warning: package 'timetk' was built under R version 3.3.3
library(tibbletime)
## Warning: package 'tibbletime' was built under R version 3.3.3
## 
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
## 
##     filter
library(lubridate)
library(forecast)
library(broom)
## Warning: package 'broom' was built under R version 3.3.3
library(sweep)
## Warning: package 'sweep' was built under R version 3.3.3
# Read The Data

# obtain data on Total Nonfarm Payroll Employment, Not Seasonally Adjusted

data.tbl <- 
  tq_get("FRED/PAYNSA",
         get = "Quandl", 
         from = "1975-01-01", 
         to = "2017-12-12") %>% 
  rename(E = value)

str(data.tbl)
## Classes 'tbl_df', 'tbl' and 'data.frame':    516 obs. of  2 variables:
##  $ date: Date, format: "1975-01-01" "1975-02-01" ...
##  $ E   : num  76189 75764 75808 76142 76774 ...
##  - attr(*, "freq")= chr "monthly"
# Connstruct the following transformed time series 

# 1. change in Total Nonfarm Payroll Employment dE = E-E(-1)

# 2. log of Total Nonfarm Payroll Employment lE=log(E) 

# 3. log change in Total Nonfarm Payroll Employment dlE =lE-lE(-1) 

# 4. 12 month log change in Total Nonfarm Payroll Employment d12lE =lE-lE(-12) 

# 5. twice differenced Total Nonfarm Payroll Employment dd12lE=d12lE-d12lE(-1) 

data.tbl <-
  tq_get("FRED/PAYNSA",
         get = "Quandl",
         from = "1975-01-01",
         to = "2017-12-12") %>%
  rename(E = value) %>%
  mutate(dE = E - lag(E),
         lE = log(E),
         dlE = lE - lag(lE),
         d12lE = lE - lag(lE, 12),
         dd12lE = d12lE - lag(d12lE))
## Warning: package 'bindrcpp' was built under R version 3.3.3
str(data.tbl)
## Classes 'tbl_df', 'tbl' and 'data.frame':    516 obs. of  7 variables:
##  $ date  : Date, format: "1975-01-01" "1975-02-01" ...
##  $ E     : num  76189 75764 75808 76142 76774 ...
##  $ dE    : num  NA -425 44 334 632 496 -678 528 606 557 ...
##  $ lE    : num  11.2 11.2 11.2 11.2 11.2 ...
##  $ dlE   : num  NA -0.005594 0.000581 0.004396 0.008266 ...
##  $ d12lE : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ dd12lE: num  NA NA NA NA NA NA NA NA NA NA ...
# Plot the origion and the transformed time series, command on thier trends, volatility, and seasonal patterns

# set default theme for ggplot2

theme_set(theme_bw())

# plot time series: levels, logs, differences

data.tbl %>%
  gather(variable, value, -date) %>%
  mutate(variable.f = factor(variable, ordered = TRUE,
                             levels = c("E", "lE", "dE", "dlE", "d12lE", "dd12lE"),
                             labels = c("E", "log(E)",
                                        expression(paste(Delta,"E")),
                                        expression(paste(Delta,"log(E)")),
                                        expression(paste(Delta[12],"log(E)")),
                                        expression(paste(Delta,Delta[12],"log(E)"))))) %>%
  ggplot(aes(x = date, y = value)) +
  geom_hline(aes(yintercept = 0), linetype = "dotted") +
  geom_line() +
  scale_x_yearmon() +
  labs(x = "", y = "") +
  # facet_wrap(~variable.f, ncol = 3, scales = "free_y")
  facet_wrap(~variable.f, ncol = 3, scales = "free_y", labeller = label_parsed)

# Plot ACF and PACF for lE, dlE, d12lE, dd12lE. Comment on their shape

maxlag <-60

par(mfrow=c(2,4))

Acf(data.tbl$lE, type="correlation", lag.max = maxlag, ylab = "", main = expression(paste("ACF for log(E)")))
Acf(data.tbl$dlE, type="correlation", lag.max = maxlag, ylab = "", main = expression(paste("ACF for ", Delta,"log(E)")))
Acf(data.tbl$d12lE, type="correlation", lag.max = maxlag, ylab = "", main = expression(paste("ACF for ", Delta[12], "log(E)")))
Acf(data.tbl$dd12lE, type="correlation", lag.max = maxlag, ylab = "", main = expression(paste("ACF for ", Delta, Delta[12], "log(E)")))

Acf(data.tbl$lE, type="partial", lag.max = maxlag, ylab = "", main = expression(paste("PACF for log(E)")))
Acf(data.tbl$dlE, type="partial", lag.max = maxlag, ylab = "", main = expression(paste("PACF for ", Delta, "log(E)")))
Acf(data.tbl$d12lE, type="partial", lag.max = maxlag, ylab = "", main = expression(paste("PACF for ", Delta[12], "log( E)")))
Acf(data.tbl$dd12lE, type="partial", lag.max = maxlag, ylab = "", main = expression(paste("PACF for ", Delta,Delta[12], "log(E)")))

# Perform the ADF and KPSS tests on lE, d12lE, dd12lE. Summarize the results

# for lE

lE = ts(data.tbl$E, start=1975, frequency=12)

library(urca)
## Warning: package 'urca' was built under R version 3.3.3
ur.df(lE, type="trend") %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3686.7   -96.0   274.8   593.5  1454.3 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  2.174e+03  7.365e+02   2.952  0.00331 **
## z.lag.1     -2.439e-02  8.914e-03  -2.736  0.00643 **
## tt           3.035e+00  1.203e+00   2.522  0.01198 * 
## z.diff.lag   1.021e-01  4.396e-02   2.323  0.02059 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 945 on 510 degrees of freedom
## Multiple R-squared:  0.02339,    Adjusted R-squared:  0.01764 
## F-statistic: 4.071 on 3 and 510 DF,  p-value: 0.007112
## 
## 
## Value of test-statistic is: -2.7361 5.7223 3.9196 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.96 -3.41 -3.12
## phi2  6.09  4.68  4.03
## phi3  8.27  6.25  5.34
ur.kpss(lE, type = "mu") %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 6 lags. 
## 
## Value of test-statistic is: 7.2113 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
# for d12lE

d12lE = ts(data.tbl$d12lE, start=1975, frequency=12)

ur.df(na.omit(d12lE), type="drift") %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0116788 -0.0011131  0.0000672  0.0010520  0.0150024 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0001836  0.0001221   1.503   0.1335    
## z.lag.1     -0.0130988  0.0051238  -2.556   0.0109 *  
## z.diff.lag   0.4829601  0.0385611  12.525   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.002097 on 499 degrees of freedom
## Multiple R-squared:  0.2426, Adjusted R-squared:  0.2396 
## F-statistic: 79.93 on 2 and 499 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -2.5565 3.2843 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.43 -2.86 -2.57
## phi1  6.43  4.59  3.78
ur.kpss(d12lE, type = "mu") %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 1.1682 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
# for dd12lE

dd12lE = ts(data.tbl$dd12lE, start=1975, frequency=12)

ur.df(na.omit(dd12lE), type="drift") %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0106539 -0.0010717  0.0000114  0.0010181  0.0117633 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.589e-05  8.778e-05  -0.295    0.768    
## z.lag.1     -3.501e-01  4.213e-02  -8.310 9.16e-16 ***
## z.diff.lag  -3.576e-01  4.115e-02  -8.689  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001965 on 498 degrees of freedom
## Multiple R-squared:  0.3671, Adjusted R-squared:  0.3645 
## F-statistic: 144.4 on 2 and 498 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -8.3098 34.5598 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.43 -2.86 -2.57
## phi1  6.43  4.59  3.78
ur.kpss(dd12lE, type = "mu") %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0356 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
# Summary The Results

# The results of ADF test and KPSS show that LE is stationary at level, and d12lE is not stationary.

# The results of ADF test for dd12lE  show that it is stationary at level, but the results of 

# KPSS aaccept the null hypothesis menaing that the series is non-stationary at level and need to take difference. 
# Split the sample into two parts: estimation sample from 1975M1 to 2014M12, and prediction sample from 2015M1 to 2017M12. 

# Use ACF and PACF from (c) to identify and estimate a suitable model for dd12lE using Arima.

# Check the estimated model for adequacy 

# plot inverted AR and MA roots to check stationarity and invertibility using plot, diagnose residuals using ggtsdiag. 

## split sample into two parts - estimation sample and prediction sample

dd12lE1 <-window(dd12lE, start=1976 +(2/12), end = 2014, frequency=12)  # Series in  Sub-Sample One

autoplot(dd12lE1, main = "Twice differenced Total Nonfarm Payroll Employment in Sub-Sample One, 1976M2-2014M12")

dd12lE2 <-window(dd12lE, start=2015, end = 2017, frequency=12)  # Series in  Sub-Sample two

autoplot(dd12lE2, main = "Twice differenced Total Nonfarm Payroll Employment in Sub-Sample two, 2015M1-2017M12")

## find the best model using ARIMA model 

maxlag <-60

par(mfrow=c(2,1))

Acf(dd12lE1, type="correlation", lag.max = maxlag, ylab = "", main = expression(paste("ACF for ", Delta, Delta[12], "log(E)")))

Acf(dd12lE1, type="partial", lag.max = maxlag, ylab = "", main = expression(paste("PACF for ", Delta,Delta[12], "log(E)")))

ur.df(na.omit(dd12lE1), type="drift") %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0106809 -0.0011277  0.0000061  0.0010229  0.0117647 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0000290  0.0000953  -0.304    0.761    
## z.lag.1     -0.3436655  0.0443251  -7.753 6.03e-14 ***
## z.diff.lag  -0.3587606  0.0434891  -8.249 1.77e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.002028 on 450 degrees of freedom
## Multiple R-squared:  0.3631, Adjusted R-squared:  0.3602 
## F-statistic: 128.2 on 2 and 450 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -7.7533 30.0819 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1  6.47  4.61  3.79
ur.kpss(dd12lE1, type = "mu") %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0415 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
library(tseries)

m1 <- arima(dd12lE1, order=c(1,0,0), seasonal=list(order=c(1,0,1), period = 12))

m1
## 
## Call:
## arima(x = dd12lE1, order = c(1, 0, 0), seasonal = list(order = c(1, 0, 1), period = 12))
## 
## Coefficients:
##          ar1    sar1     sma1  intercept
##       0.5322  0.1803  -0.8287      0e+00
## s.e.  0.0421  0.0610   0.0330      1e-04
## 
## sigma^2 estimated as 3.382e-06:  log likelihood = 2214.83,  aic = -4419.65
m2 <- arima(dd12lE1, order=c(2,0,0), seasonal=list(order=c(1,0,1), period = 12))

m2
## 
## Call:
## arima(x = dd12lE1, order = c(2, 0, 0), seasonal = list(order = c(1, 0, 1), period = 12))
## 
## Coefficients:
##          ar1     ar2    sar1     sma1  intercept
##       0.3426  0.3802  0.1412  -0.8071      0e+00
## s.e.  0.0441  0.0442  0.0610   0.0334      1e-04
## 
## sigma^2 estimated as 2.911e-06:  log likelihood = 2249.02,  aic = -4486.03
m3 <- arima(dd12lE1, order=c(3,0,0), seasonal=list(order=c(1,0,1), period = 12))

m3 
## 
## Call:
## arima(x = dd12lE1, order = c(3, 0, 0), seasonal = list(order = c(1, 0, 1), period = 12))
## 
## Coefficients:
##          ar1     ar2     ar3    sar1     sma1  intercept
##       0.2594  0.3103  0.2134  0.1336  -0.8243      0e+00
## s.e.  0.0469  0.0460  0.0478  0.0593   0.0318      1e-04
## 
## sigma^2 estimated as 2.783e-06:  log likelihood = 2258.77,  aic = -4503.55
# Based on AIC, model 3, ARIMA(3,0,0)(1,0,1)[12] is the best model.

library(lmtest)
## Warning: package 'lmtest' was built under R version 3.3.3
# M3 has significant coefficients, then we do not need to restrict model.

coeftest(m1)
## 
## z test of coefficients:
## 
##              Estimate  Std. Error  z value  Pr(>|z|)    
## ar1        5.3220e-01  4.2133e-02  12.6313 < 2.2e-16 ***
## sar1       1.8026e-01  6.1048e-02   2.9528  0.003149 ** 
## sma1      -8.2871e-01  3.3029e-02 -25.0905 < 2.2e-16 ***
## intercept -4.4195e-05  6.4474e-05  -0.6855  0.493051    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(m2)
## 
## z test of coefficients:
## 
##              Estimate  Std. Error  z value  Pr(>|z|)    
## ar1        3.4258e-01  4.4071e-02   7.7733 7.645e-15 ***
## ar2        3.8019e-01  4.4209e-02   8.5997 < 2.2e-16 ***
## sar1       1.4124e-01  6.0956e-02   2.3171    0.0205 *  
## sma1      -8.0710e-01  3.3438e-02 -24.1375 < 2.2e-16 ***
## intercept -3.0683e-05  8.6363e-05  -0.3553    0.7224    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(m3)
## 
## z test of coefficients:
## 
##              Estimate  Std. Error  z value  Pr(>|z|)    
## ar1        2.5938e-01  4.6929e-02   5.5270 3.257e-08 ***
## ar2        3.1028e-01  4.5997e-02   6.7456 1.524e-11 ***
## ar3        2.1339e-01  4.7789e-02   4.4652 8.001e-06 ***
## sar1       1.3358e-01  5.9330e-02   2.2515   0.02435 *  
## sma1      -8.2431e-01  3.1771e-02 -25.9450 < 2.2e-16 ***
## intercept -2.4330e-05  9.5052e-05  -0.2560   0.79798    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Accuracy 

accuracy(m3)
##                         ME        RMSE         MAE      MPE     MAPE
## Training set -8.931781e-07 0.001668286 0.001200889 31.87307 252.4614
##                   MASE         ACF1
## Training set 0.6797329 -0.009641745
# Inverted AR and MA roots 

plot(m3)

# Diagnose residulas using ggtsdiag 

nlags <- 60

ggtsdiag(m3, gof.lag = nlags)

# Based on the residuals, m3 has highly significant p-value. 

# Then, the best model is 

# ARIMA(3,0,0)(1,0,1)[12]
# Split the sample into two parts: estimation sample from 1975M1 to 2014M12, and prediction sample from 2015M1 to 2017M12. 

# Use ACF and PACF from (c) to identify and estimate a suitable model for lE using Arima.

# Check the estimated model for adequacy 

# plot inverted AR and MA roots to check stationarity and invertibility using plot, diagnose residuals using ggtsdiag. 

## split sample into two parts - estimation sample and prediction sample

lE1 <-window(lE, start=1975, end = 2014, frequency=12)  # Series in  Sub-Sample One

autoplot(lE1, main = "log of Total Nonfarm Payroll Employment in Sub-Sample One, 1975M1-2014M12")

lE2 <-window(lE, start=2015, end = 2017, frequency=12)  # Series in  Sub-Sample two

autoplot(lE2, main = "log of Total Nonfarm Payroll Employment in Sub-Sample two, 2015M1-2017M12")

## find the best model using ARIMA model 

# We need to test ACF, PACF, ADF, KPSS test in sub-sample one.   

maxlag <-60

par(mfrow=c(2,1))

Acf(lE1, type="correlation", lag.max = maxlag, ylab = "", main = expression(paste("ACF for log(E)")))

Acf(lE1, type="partial", lag.max = maxlag, ylab = "", main = expression(paste("ACF for log(E)")))

ur.df(na.omit(lE1), type="drift") %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3615.9  -110.3   279.7   589.4  1541.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 558.304280 263.807269   2.116   0.0348 *
## z.lag.1      -0.003911   0.002293  -1.706   0.0887 .
## z.diff.lag    0.098674   0.046525   2.121   0.0345 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 941.4 on 464 degrees of freedom
## Multiple R-squared:  0.01604,    Adjusted R-squared:  0.0118 
## F-statistic: 3.781 on 2 and 464 DF,  p-value: 0.02349
## 
## 
## Value of test-statistic is: -1.706 4.8441 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1  6.47  4.61  3.79
ur.kpss(lE1, type = "mu") %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 7.6967 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
# Series: lE1 

# ARIMA(1,0,4) with non-zero mean 

m4 <- auto.arima(lE1, ic="aic", seasonal=TRUE, stationary=TRUE, stepwise=FALSE, approximation=FALSE)

m4
## Series: lE1 
## ARIMA(1,0,4) with non-zero mean 
## 
## Coefficients:
##          ar1     ma1     ma2     ma3     ma4        mean
##       0.9668  1.0063  1.2164  1.0006  0.8593  112305.553
## s.e.  0.0118  0.0304  0.0359  0.0273  0.0274    6694.842
## 
## sigma^2 estimated as 1024561:  log likelihood=-3914.27
## AIC=7842.55   AICc=7842.79   BIC=7871.6
# Based on auto.arima, model ARIMA(1,0,4) is the best model in sub-sample one.

# M4 has significant coefficients, then we dont need to restrict model.

coeftest(m4)
## 
## z test of coefficients:
## 
##             Estimate Std. Error z value  Pr(>|z|)    
## ar1       9.6682e-01 1.1790e-02  82.005 < 2.2e-16 ***
## ma1       1.0063e+00 3.0423e-02  33.076 < 2.2e-16 ***
## ma2       1.2164e+00 3.5923e-02  33.861 < 2.2e-16 ***
## ma3       1.0006e+00 2.7269e-02  36.692 < 2.2e-16 ***
## ma4       8.5925e-01 2.7432e-02  31.323 < 2.2e-16 ***
## intercept 1.1231e+05 6.6948e+03  16.775 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Accuracy 

accuracy(m4)
##                    ME     RMSE      MAE         MPE      MAPE      MASE
## Training set 26.75893 1005.711 749.3217 0.006342968 0.6700129 0.3294454
##                    ACF1
## Training set -0.2791811
# Inverted AR and MA roots 

plot(m4)

# Diagnose residulas using ggtsdiag 

nlags <- 60

ggtsdiag(m4, gof.lag = nlags)

# Based on the residuals, m4 has not highly significant p-value, then we need to find the best model

m5 <- arima(lE1, order=c(1,2,1), seasonal=list(order=c(2,1,1), period = 12))

m5
## 
## Call:
## arima(x = lE1, order = c(1, 2, 1), seasonal = list(order = c(2, 1, 1), period = 12))
## 
## Coefficients:
##           ar1      ma1    sar1     sar2     sma1
##       -0.1659  -0.5140  0.0595  -0.0260  -0.6735
## s.e.   0.0766   0.0689  0.0701   0.0583   0.0517
## 
## sigma^2 estimated as 34425:  log likelihood = -3028.66,  aic = 6069.31
# Accuracy in both Sub-Samples

accuracy(m5)
##                     ME     RMSE     MAE        MPE      MAPE      MASE
## Training set -5.640952 182.8673 137.406 -0.0064589 0.1265387 0.1872851
##                      ACF1
## Training set -0.007198373
# Inverted AR and MA roots in Sub-Sample One 

plot(m5)

# Diagnose residulas using ggtsdiag in both Sub-Samples 

nlags <- 60

ggtsdiag(m5, gof.lag = nlags)

# Based on the residuals, m5 has highly significant p-value than m4. 

# Then the best model is ARIMA(1,2,1)(2,1,1)[12]
# Use the rolling scheme to generate a sequence of 1 period ahead forecasts for the prediction subsample, 2015M1-2018M12 using the same model specification as in (f).


# Multistep Ahead Forecasting 

end <- 2014 

forecasting.length <- window(lE, start = end + (1/12))

m5.f.h <- forecast(m5, length(forecasting.length))

autoplot(m5.f.h)

# 1 month Ahead Recursive Forecasting

m5.Rol.F <- zoo()

fstQ <- 1975

lstQ <- 2014

for(i in 1:length(lE2)) {
  temp <- window(lE, start=fstQ + (i-1)/12, end = lstQ + (i-1)/12)
  m5.update <- arima(temp, order = c(1,2,1), seasonal = list(order = c(2,1,1), period = 12))
  m5.Rol.F <- c(m5.Rol.F, forecast(m5.update, 1)$mean)
}

m5.Rol.F <- as.ts(m5.Rol.F)

plot(m5.Rol.F)

# Plot the forecast from (g) together with its confidence intervals and the actual data for the period 2008M1-2017M12.

par(mfrow=c(2,1))

plot(m5.f.h, xlim=c(2008,2017), main="Multistep Ahead Forecasts")
lines(m5.f.h$mean, type="p", pch=16, lty="dashed", col="blue")
lines(lE, pch=16, lty="dashed")

plot(m5.Rol.F, xlim=c(2008,2017), main = "1-step Ahead Recursive Forecasts")
lines(m5.Rol.F, type="p", pch=16, lty="solid", col="red")
lines(lE,pch=16, lty="dashed")

# Forecast Error

par(mfrow=c(2,1))

plot(lE - m5.f.h$mean, main = "Forecast Error: Multistep Ahead Forecasts")
abline(h=0, lty="dashed")

plot(lE - m5.Rol.F, main = "Forecast Error: 1-Month Ahead Recursive Forecasts")
abline(h=0, lty="dashed")

# test for accuracy

accuracy(m5.f.h$mean, lE)
##                ME     RMSE      MAE        MPE      MAPE      ACF1
## Test set 84.94892 552.2811 469.5141 0.06464511 0.3263059 0.9066706
##          Theil's U
## Test set 0.5260941
accuracy(m5.Rol.F, lE)
##                 ME     RMSE      MAE         MPE       MAPE      ACF1
## Test set -8.024886 127.0852 106.4006 -0.00590278 0.07569034 -0.234191
##          Theil's U
## Test set 0.1121559
# The results of forecasting show that 1-month Ahead Recursive Forecasting is better than multistep Ahead Forecasting.