par(mar=c(3,3,3,3))
inflation %>% autoplot()
par(mar=c(3,3,3,3)) ##This margin command should do the trick
acf(inflation)
pacf(inflation)
Based on the plot of the inflation rate over time and its ACF and PACF, inflation does not appear to be stationary. To examine this more formally, we use the augmented Dickey-Fuller (ADF) test.
adf_trend <- ur.df(inflation, type = 'trend', selectlags = 'AIC')
adf <- ur.df(inflation, type = 'none', selectlags = 'AIC')
adf_drift <- ur.df(inflation,type='drift',selectlags='AIC')
adf %>% summary
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0982 -0.5181 0.3025 1.2239 4.7719
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 -0.07584 0.04625 -1.640 0.107
## z.diff.lag 0.19744 0.12996 1.519 0.134
##
## Residual standard error: 1.63 on 57 degrees of freedom
## Multiple R-squared: 0.06924, Adjusted R-squared: 0.03658
## F-statistic: 2.12 on 2 and 57 DF, p-value: 0.1294
##
##
## Value of test-statistic is: -1.6398
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.6 -1.95 -1.61
adf_drift %>% summary
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4471 -0.9355 -0.2778 0.6800 4.6391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.86834 0.35418 2.452 0.01736 *
## z.lag.1 -0.23069 0.07717 -2.989 0.00415 **
## z.diff.lag 0.27353 0.12841 2.130 0.03756 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.563 on 56 degrees of freedom
## Multiple R-squared: 0.1595, Adjusted R-squared: 0.1294
## F-statistic: 5.312 on 2 and 56 DF, p-value: 0.007721
##
##
## Value of test-statistic is: -2.9893 4.4681
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.51 -2.89 -2.58
## phi1 6.70 4.71 3.86
adf_trend %>% summary
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0107 -1.0733 -0.0886 0.6873 4.3343
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.84949 0.61073 3.028 0.003740 **
## z.lag.1 -0.28932 0.08110 -3.567 0.000756 ***
## tt -0.02454 0.01259 -1.949 0.056445 .
## z.diff.lag 0.28369 0.12542 2.262 0.027682 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.526 on 55 degrees of freedom
## Multiple R-squared: 0.2137, Adjusted R-squared: 0.1709
## F-statistic: 4.984 on 3 and 55 DF, p-value: 0.003951
##
##
## Value of test-statistic is: -3.5674 4.3933 6.5898
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -4.04 -3.45 -3.15
## phi2 6.50 4.88 4.16
## phi3 8.73 6.49 5.47
At the 1% significance level, none of the three specifications rejects the unit-root null hypothesis (the drift and trend specifications reject only marginally at 5%). We therefore treat inflation as non-stationary and take the first difference.
par(mar=c(3,3,3,3))
inflation[1:48] %>% diff %>% acf(main='ACF')
inflation[1:48] %>% diff %>% pacf(main='PACF')
adf_trend <- ur.df(diff(inflation), type = 'trend', selectlags = 'AIC')
adf <- ur.df(diff(inflation), type = 'none', selectlags = 'AIC')
adf_drift <- ur.df(diff(inflation),type='drift',selectlags='AIC')
adf %>% summary
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5659 -0.6336 -0.1460 0.7359 3.8154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 -1.1813 0.1589 -7.436 6.58e-10 ***
## z.diff.lag 0.4046 0.1224 3.305 0.00166 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.54 on 56 degrees of freedom
## Multiple R-squared: 0.5144, Adjusted R-squared: 0.497
## F-statistic: 29.66 on 2 and 56 DF, p-value: 1.645e-09
##
##
## Value of test-statistic is: -7.4363
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.6 -1.95 -1.61
adf_drift %>% summary
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5705 -0.6382 -0.1506 0.7313 3.8108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.004614 0.204004 0.023 0.98204
## z.lag.1 -1.181330 0.160301 -7.369 9.36e-10 ***
## z.diff.lag 0.404599 0.123519 3.276 0.00183 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.554 on 55 degrees of freedom
## Multiple R-squared: 0.5144, Adjusted R-squared: 0.4967
## F-statistic: 29.13 on 2 and 55 DF, p-value: 2.362e-09
##
##
## Value of test-statistic is: -7.3694 27.1563
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.51 -2.89 -2.58
## phi1 6.70 4.71 3.86
adf_trend %>% summary
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3883 -0.6820 -0.0498 0.7663 3.6504
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.31316 0.42704 0.733 0.46653
## z.lag.1 -1.19441 0.16156 -7.393 9.47e-10 ***
## tt -0.01011 0.01228 -0.823 0.41404
## z.diff.lag 0.41002 0.12406 3.305 0.00169 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.558 on 54 degrees of freedom
## Multiple R-squared: 0.5204, Adjusted R-squared: 0.4938
## F-statistic: 19.53 on 3 and 54 DF, p-value: 1.055e-08
##
##
## Value of test-statistic is: -7.3931 18.2239 27.3339
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -4.04 -3.45 -3.15
## phi2 6.50 4.88 4.16
## phi3 8.73 6.49 5.47
From the test results and the ACF and PACF plots, the first difference is stationary. The ACF is significant through lag 4, suggesting an ARIMA(0,1,4); the PACF is significant through lag 2, suggesting an ARIMA(2,1,0). We also let the ARIMA() function from the fable package choose a model automatically.
infl <- infl %>% as_tsibble(index=Dte)
fit <- infl %>% dplyr::filter(Dte < 2008) %>%
model(arima014 = ARIMA(Rate ~ pdq(0,1,4)),
arima210 = ARIMA(Rate ~ pdq(2,1,0)),
stepwise = ARIMA(Rate),
search = ARIMA(Rate,stepwise=FALSE))
glance(fit) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 4 x 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima210 1.93 -81.5 169. 170. 175.
## 2 stepwise 1.86 -82.5 173. 174. 180.
## 3 search 1.86 -82.5 173. 174. 180.
## 4 arima014 2.00 -81.2 172. 174. 182.
Based on the above results, we choose the ARIMA(2,1,0) model for inflation, since it has the lowest AIC and AICc. We then test the residuals for autocorrelation using the Box-Pierce test.
resid <- augment(fit) %>% dplyr::filter(.model == 'arima210') %>% select(.resid)
resid$.resid %>% Box.test()
##
## Box-Pierce test
##
## data: .
## X-squared = 0.016279, df = 1, p-value = 0.8985
The Box-Pierce test gives a p-value of 0.8985, so we fail to reject the null hypothesis of no autocorrelation; the residuals are consistent with white noise.
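As a complementary check, a Ljung-Box test (better behaved in small samples) can be run directly on the fable residuals. A sketch, assuming the feasts package is loaded and using conventional choices of lag = 10 and dof = 2 for the two estimated AR parameters:
augment(fit) %>%
  dplyr::filter(.model == 'arima210') %>%
  features(.resid, ljung_box, lag = 10, dof = 2)  # portmanteau test on ARIMA(2,1,0) residuals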
infl %>% dplyr::filter(Dte < 2008) %>%
model(arima014 = ARIMA(Rate ~ pdq(0,1,4)),
arima210 = ARIMA(Rate ~ pdq(2,1,0)),
stepwise = ARIMA(Rate),
search = ARIMA(Rate,stepwise=FALSE)) %>% accuracy
## # A tibble: 4 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima014 Training 0.0409 1.34 0.986 -3.09 24.4 0.815 0.800 0.00333
## 2 arima210 Training 0.0334 1.35 0.994 -2.59 24.1 0.821 0.805 -0.0184
## 3 stepwise Training 0.0311 1.32 1.01 -9.25 27.5 0.832 0.790 0.0333
## 4 search Training 0.0311 1.32 1.01 -9.25 27.5 0.832 0.790 0.0333
The table above reports in-sample (training) accuracy for the models fitted on the pre-2008 data. The automatically selected models have the lowest training RMSE (1.32), with ARIMA(0,1,4) close behind (1.34). Next, we forecast twelve periods ahead and evaluate out-of-sample accuracy on the held-out 2008-2019 observations.
infl %>% dplyr::filter(Dte < 2008) %>%
model(arima014 = ARIMA(Rate ~ pdq(0,1,4)),
arima210 = ARIMA(Rate ~ pdq(2,1,0)),
stepwise = ARIMA(Rate),
search = ARIMA(Rate,stepwise=FALSE)) %>% forecast(h=12) %>% accuracy(infl)
## # A tibble: 4 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima014 Test -1.14 1.58 1.33 -160. 319. 1.10 0.945 -0.189
## 2 arima210 Test -1.18 1.63 1.38 -168. 328. 1.14 0.976 -0.197
## 3 search Test -2.09 2.40 2.19 -269. 453. 1.81 1.44 -0.169
## 4 stepwise Test -2.09 2.40 2.19 -269. 453. 1.81 1.44 -0.169
On the out-of-sample forecasts, ARIMA(0,1,4) has the lowest RMSE of all four models, so it offers the best forecast accuracy.
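To visualise the preferred model against the held-out data, one can forecast from the fitted mable and plot the result. A sketch, assuming a single model column can be selected from the mable as in the fable documentation:
fit %>% dplyr::select(arima014) %>% forecast(h = 12) %>% autoplot(infl)  # forecasts with prediction intervals over the test period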
We next build a time series from a macroeconomic variable: annual US GDP from 1929 to 2021. The values are observed at a fixed frequency (yearly), and each observation is a random variable, so GDP can be viewed as a stochastic process.
set.seed(123)
GDP <- read.csv('GDPA.csv')
gdp=GDP$GDPA
year=year(GDP$DATE)
plot(year,gdp,type="l")
sprintf('Mean of GDP is %.3f',mean(gdp))
## [1] "Mean of GDP is 5361.377"
sprintf('Variance of GDP is %.3f',var(gdp))
## [1] "Variance of GDP is 43274256.528"
sprintf('Kurtosis of GDP is %.3f',kurtosis(gdp))
## [1] "Kurtosis of GDP is -0.069"
The graph shows a pronounced increasing trend, so the series appears nonstationary. To check nonstationarity, we run the ADF test three ways: with trend and drift, with drift only, and with neither. In each case the result shows that we cannot reject the null hypothesis, so the GDP series is nonstationary. The mean of GDP is 5361.377, its variance is 43274257, and its kurtosis is -0.069.
par(mar=c(3,3,3,3))
acf(gdp)
pacf(gdp)
The autocorrelation coefficients start very high (near 1) and decline very slowly, again indicating that the GDP series is nonstationary; we need a transformation to make it stationary. We first run the ADF tests on the level series, then plot log(GDP).
summary(ur.df(gdp, type = "trend", selectlags = "AIC"))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1190.90 -60.50 -0.38 60.16 858.10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -96.179634 64.237451 -1.497 0.13795
## z.lag.1 0.026530 0.008573 3.095 0.00265 **
## tt 6.428301 2.120927 3.031 0.00321 **
## z.diff.lag -0.406606 0.132164 -3.077 0.00280 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 225 on 87 degrees of freedom
## Multiple R-squared: 0.5707, Adjusted R-squared: 0.5559
## F-statistic: 38.55 on 3 and 87 DF, p-value: 6.076e-16
##
##
## Value of test-statistic is: 3.0948 33.9832 40.9271
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -4.04 -3.45 -3.15
## phi2 6.50 4.88 4.16
## phi3 8.73 6.49 5.47
summary(ur.df(gdp, type = "drift", selectlags = "AIC"))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1306.99 -64.34 -34.01 100.40 920.18
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.32559 33.04028 2.219 0.0290 *
## z.lag.1 0.04656 0.00571 8.154 2.25e-12 ***
## z.diff.lag -0.28415 0.13156 -2.160 0.0335 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 235.3 on 88 degrees of freedom
## Multiple R-squared: 0.5253, Adjusted R-squared: 0.5145
## F-statistic: 48.7 on 2 and 88 DF, p-value: 5.767e-15
##
##
## Value of test-statistic is: 8.1537 42.4341
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.51 -2.89 -2.58
## phi1 6.70 4.71 3.86
summary(ur.df(gdp, type = "none", selectlags = "AIC"))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1359.96 -2.79 22.49 128.22 965.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 0.049580 0.005666 8.750 1.24e-13 ***
## z.diff.lag -0.211173 0.130163 -1.622 0.108
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 240.4 on 89 degrees of freedom
## Multiple R-squared: 0.6791, Adjusted R-squared: 0.6718
## F-statistic: 94.15 on 2 and 89 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: 8.7502
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.6 -1.95 -1.61
lgdp=log(gdp)
plot(year,lgdp,type="l")
summary(ur.df(lgdp, type = "trend", selectlags = "AIC"))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.179905 -0.022845 0.005325 0.021119 0.125939
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.287612 0.102803 2.798 0.00634 **
## z.lag.1 -0.062449 0.024974 -2.501 0.01427 *
## tt 0.004180 0.001736 2.407 0.01819 *
## z.diff.lag 0.636625 0.078670 8.092 3.21e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04885 on 87 degrees of freedom
## Multiple R-squared: 0.4357, Adjusted R-squared: 0.4163
## F-statistic: 22.39 on 3 and 87 DF, p-value: 7.786e-11
##
##
## Value of test-statistic is: -2.5006 7.0058 3.3656
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -4.04 -3.45 -3.15
## phi2 6.50 4.88 4.16
## phi3 8.73 6.49 5.47
The log series is still nonstationary: the ADF test again fails to reject the null hypothesis. We therefore take the first difference of the log(GDP) series.
dlgdp=diff(lgdp)
plot(dlgdp,type="l")
summary(ur.df(dlgdp, type = "trend", selectlags = "AIC"))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.216801 -0.017395 0.001436 0.018180 0.135289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0452356 0.0117602 3.846 0.000229 ***
## z.lag.1 -0.5307719 0.0855842 -6.202 1.88e-08 ***
## tt -0.0002247 0.0001946 -1.155 0.251455
## z.diff.lag 0.1765792 0.0990315 1.783 0.078105 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04768 on 86 degrees of freedom
## Multiple R-squared: 0.3146, Adjusted R-squared: 0.2907
## F-statistic: 13.16 on 3 and 86 DF, p-value: 3.775e-07
##
##
## Value of test-statistic is: -6.2018 13.1752 19.5768
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -4.04 -3.45 -3.15
## phi2 6.50 4.88 4.16
## phi3 8.73 6.49 5.47
summary(ur.df(dlgdp, type = "drift", selectlags = "AIC"))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.205215 -0.020543 -0.002389 0.020181 0.142866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.034469 0.007179 4.801 6.51e-06 ***
## z.lag.1 -0.525611 0.085631 -6.138 2.41e-08 ***
## z.diff.lag 0.184147 0.099003 1.860 0.0663 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04777 on 87 degrees of freedom
## Multiple R-squared: 0.304, Adjusted R-squared: 0.288
## F-statistic: 19 on 2 and 87 DF, p-value: 1.423e-07
##
##
## Value of test-statistic is: -6.1381 19.0234
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.51 -2.89 -2.58
## phi1 6.70 4.71 3.86
summary(ur.df(dlgdp, type = "none", selectlags = "AIC"))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.13083 -0.00315 0.01506 0.03529 0.17403
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 -0.23264 0.06718 -3.463 0.000827 ***
## z.diff.lag 0.05890 0.10680 0.551 0.582721
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05342 on 88 degrees of freedom
## Multiple R-squared: 0.1221, Adjusted R-squared: 0.1022
## F-statistic: 6.12 on 2 and 88 DF, p-value: 0.003246
##
##
## Value of test-statistic is: -3.4629
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.6 -1.95 -1.61
In this case the ADF test rejects the null hypothesis, so the first-differenced series is stationary; i.e., the log(GDP) series is I(1). The following two figures show the ACF and PACF of the first-differenced log(GDP).
par(mar=c(3,3,3,3))
acf(dlgdp)
pacf(dlgdp)
From the ACF plot, the first three lags are significant, suggesting q = 3; from the PACF plot, the first two lags are significant, suggesting p = 2. ARMA(2,3) is therefore a natural candidate. We can also use auto.arima, which returns the best ARIMA model according to AIC, AICc, or BIC, to confirm the choice.
arma_model <- arima(dlgdp, order=c(2,0,3))
arma_model
##
## Call:
## arima(x = dlgdp, order = c(2, 0, 3))
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 intercept
## 1.1609 -0.9171 -0.4871 0.5077 0.3139 0.0608
## s.e. 0.1097 0.0869 0.1650 0.1228 0.1406 0.0087
##
## sigma^2 estimated as 0.002238: log likelihood = 148.98, aic = -283.97
The AIC and BIC information criteria agree: the minimum is reached at p = 2 and q = 3, so the selected model is an ARMA(2,3).
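To confirm this selection, one can sketch a small grid search over candidate orders and compare the information criteria directly (the 0-3 grid for p and q is an arbitrary choice):
ic <- expand.grid(p = 0:3, q = 0:3)
ic$AIC <- NA; ic$BIC <- NA
for (i in seq_len(nrow(ic))) {
  m <- try(arima(dlgdp, order = c(ic$p[i], 0, ic$q[i])), silent = TRUE)  # some orders may fail to converge
  if (!inherits(m, "try-error")) { ic$AIC[i] <- AIC(m); ic$BIC[i] <- BIC(m) }
}
ic[order(ic$AIC), ][1:3, ]  # smallest AIC first; order by BIC to cross-check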
Next, we do the Box-Pierce test.
res_arma_model=arma_model$residuals
Box.test(res_arma_model, lag = 1, type = c("Box-Pierce"), fitdf = 0)
##
## Box-Pierce test
##
## data: res_arma_model
## X-squared = 0.0034134, df = 1, p-value = 0.9534
The p-value is greater than 0.05, so we cannot reject the null hypothesis of no autocorrelation among the residuals; the residuals are consistent with white noise.
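For a broader residual diagnostic, base R's tsdiag() on the fitted model plots the standardised residuals, their ACF, and Ljung-Box p-values at several lags:
tsdiag(arma_model, gof.lag = 10)  # three diagnostic panels for the ARMA(2,3) fit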
We use 80% of the data (75 years) as the training sample and 20% (18 years) as the test sample, and then compare the forecast accuracy of an AR(2) against the ARMA(2,3).
train.ids <- sample(1:93, 75, replace=FALSE)
test.ids <- (1:93)[-train.ids]
dlgdp.train <- dlgdp[train.ids]
dlgdp.test <- dlgdp[test.ids]
fit.train=auto.arima(dlgdp.train)
ar2.train <- Arima(dlgdp.train, order=c(2, 0, 0))
accuracy(forecast(fit.train, length(dlgdp.test))$mean, dlgdp.test)
## ME RMSE MAE MPE MAPE
## Test set -0.02054019 0.07828496 0.04614392 35.36575 64.11258
accuracy(forecast(ar2.train,length(dlgdp.test))$mean, dlgdp.test)
## ME RMSE MAE MPE MAPE
## Test set -0.02074355 0.07894985 0.04636353 35.58854 64.18822
From the output above, the RMSE of the auto.arima fit (0.0783) and of the AR(2) (0.0789) are nearly identical. Even though the ARMA(2,3) was selected as the best in-sample model, the simpler AR(2) forecasts almost as well; the richer ARMA(2,3) has more parameters and is more prone to overfitting.
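Note that the split above samples years at random; for time series, a chronological split is usually preferred so that all test observations lie after the training ones. A sketch of that alternative:
n <- length(dlgdp)
cut <- floor(0.8 * n)                            # first 80% of years for training
fit.chron <- Arima(dlgdp[1:cut], order = c(2, 0, 3))
fc <- forecast(fit.chron, h = n - cut)
accuracy(fc$mean, dlgdp[(cut + 1):n])            # out-of-sample accuracy on the last 20%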
The following figure is out-of-sample forecast of first differenced log (GDP). The blue line is the forecast for 2022-2031.
fit=auto.arima(dlgdp)
plot(forecast(fit))
Comparison: Forecasts using AR(2) vs. Forecasts using ARMA(2,3)
ar2 <- Arima(dlgdp, order=c(2, 0, 0))
plot(forecast(ar2))
Here we cannot compute forecast accuracy: there are no realised observations for the 2022-2031 horizon to compare against.
rate <- read.csv("Interest Rate.csv", header = T, na.strings = "")
rate$DATE <- as.Date(rate$DATE)
names(rate) <- c("Date","rate")
kurtosis(rate$rate) # excess kurtosis (raw kurtosis is about 3.95)
## [1] 0.9538737
## attr(,"method")
## [1] "excess"
rate.ts<-ts(rate)
plot(rate.ts[,2],ylab="Rate")
adf.test(rate.ts[,2])
##
## Augmented Dickey-Fuller Test
##
## data: rate.ts[, 2]
## Dickey-Fuller = -2.6326, Lag order = 9, p-value = 0.3105
## alternative hypothesis: stationary
The plot clearly shows that the interest rate is not stationary. The ADF test confirms this: with a p-value of 0.3105 we cannot reject the null hypothesis, so the interest rate is nonstationary, as expected. We then take the first difference, plot the I(1) series, and run the ADF test again:
d_rate <- diff(rate.ts)
d_date <-ts(d_rate)
plot(d_rate[,-1],ylab="Rate I(1)")
adf.test(d_rate[-1])
##
## Augmented Dickey-Fuller Test
##
## data: d_rate[-1]
## Dickey-Fuller = -1.5483, Lag order = 11, p-value = 0.7695
## alternative hypothesis: stationary
Secondly, we plot the ACF and PACF of the differenced series to identify candidate p and q orders for the interest-rate data:
par(mar=c(3,3,3,3))
acf(d_rate[,-1],main="ACF")
pacf(d_rate[,-1],main="PACF")
ar(d_rate[,-1],method = "mle")
##
## Call:
## ar(x = d_rate[, -1], method = "mle")
##
## Coefficients:
## 1 2 3 4 5 6 7 8
## 0.3360 0.1270 0.0155 -0.0458 0.0613 -0.0120 0.0542 -0.0467
## 9
## 0.0932
##
## Order selected 9 sigma^2 estimated as 0.0405
From these two plots, if we model the I(1) series with an AR(p), the PACF suggests a very high order (significant spikes out to about lag 20), so we instead select p by AIC via ar(), which gives p = 9. Using 9 lags, the ADF test on the level series returns a Dickey-Fuller statistic of -2.4248 with p-value 0.1543; this exceeds both 0.05 and 0.10, so we fail to reject the null hypothesis, i.e. the interest-rate series contains a unit root. We test three different specifications of the test regression:
adfTest(rate.ts[,2], lags=9,type="c")
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 9
## STATISTIC:
## Dickey-Fuller: -2.4248
## P VALUE:
## 0.1543
##
## Description:
## Fri Apr 15 22:44:08 2022 by user: User
adfTest(rate.ts[,2], lags=9,type="nc")
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 9
## STATISTIC:
## Dickey-Fuller: -1.4011
## P VALUE:
## 0.1698
##
## Description:
## Fri Apr 15 22:44:08 2022 by user: User
adfTest(rate.ts[,2], lags=9,type="ct")
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 9
## STATISTIC:
## Dickey-Fuller: -2.6326
## P VALUE:
## 0.3105
##
## Description:
## Fri Apr 15 22:44:08 2022 by user: User
In all three specifications we fail to reject the null hypothesis. We then fit a deterministic linear trend and test the residuals for a unit root; the resulting p-value is 0.06341, which is greater than 0.05, so we still cannot reject the null hypothesis.
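The detrending step just described is not shown above; a minimal sketch of it, using tseries::adf.test as elsewhere in this document:
tt <- seq_along(rate.ts[, 2])              # deterministic time trend
trend_fit <- lm(rate.ts[, 2] ~ tt)         # fit a linear trend to the level series
adf.test(residuals(trend_fit))             # unit-root test on the detrended residuals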
From the PACF, we first specify an ARIMA(2,1,0). Guided by auto.arima, which returns the best ARIMA model according to AIC, AICc, or BIC, we also fit ARIMA(3,1,0), ARIMA(2,1,1), ARIMA(1,1,0), ARIMA(2,1,2), and ARIMA(3,1,1). ARIMA(2,1,2) has the smallest AIC value, so we take ARIMA(2,1,2) as the best model for the US interest rate.
arma_model1 <- arima(d_rate[-1], order=c(2,1,0));arma_model1
##
## Call:
## arima(x = d_rate[-1], order = c(2, 1, 0))
##
## Coefficients:
## ar1 ar2
## -0.4825 -0.0550
## s.e. 0.0241 0.0241
##
## sigma^2 estimated as 1.237: log likelihood = -2617.84, aic = 5241.68
arma_model2 <- arima(d_rate[-1], order=c(3,1,0));arma_model2
##
## Call:
## arima(x = d_rate[-1], order = c(3, 1, 0))
##
## Coefficients:
## ar1 ar2 ar3
## -0.4797 -0.0302 0.0514
## s.e. 0.0241 0.0268 0.0241
##
## sigma^2 estimated as 1.234: log likelihood = -2615.58, aic = 5239.16
arma_model3 <- arima(d_rate[-1], order=c(2,1,1));arma_model3
##
## Call:
## arima(x = d_rate[-1], order = c(2, 1, 1))
##
## Coefficients:
## ar1 ar2 ma1
## -1.1872 -0.4131 0.6975
## s.e. 0.0423 0.0253 0.0393
##
## sigma^2 estimated as 1.217: log likelihood = -2603.83, aic = 5215.65
arma_model4 <- arima(d_rate[-1], order=c(1,1,0));arma_model4
##
## Call:
## arima(x = d_rate[-1], order = c(1, 1, 0))
##
## Coefficients:
## ar1
## -0.4574
## s.e. 0.0215
##
## sigma^2 estimated as 1.241: log likelihood = -2620.43, aic = 5244.87
arma_model5 <- arima(d_rate[-1], order=c(2,1,2));arma_model5
##
## Call:
## arima(x = d_rate[-1], order = c(2, 1, 2))
##
## Coefficients:
## ar1 ar2 ma1 ma2
## -1.7304 -0.9983 1.6946 0.9801
## s.e. 0.0012 0.0012 0.0054 0.0047
##
## sigma^2 estimated as 0.9948: log likelihood = -2433.11, aic = 4876.22
arma_model6 <- arima(d_rate[-1], order=c(3,1,1));arma_model6
##
## Call:
## arima(x = d_rate[-1], order = c(3, 1, 1))
##
## Coefficients:
## ar1 ar2 ar3 ma1
## -1.1135 -0.3086 0.0767 0.6512
## s.e. 0.0471 0.0415 0.0251 0.0411
##
## sigma^2 estimated as 1.211: log likelihood = -2599.36, aic = 5208.71
Finally, we run the Box-Pierce test on the ARIMA(2,1,1) residuals to validate the model. The p-value is 0.2461, which is greater than 0.05, so we cannot reject the null hypothesis of no autocorrelation among the residuals; they behave like white noise.
res_arma_model=arma_model3$residuals
Box.test(res_arma_model, lag = 1, type = c("Box-Pierce"), fitdf = 0)
##
## Box-Pierce test
##
## data: res_arma_model
## X-squared = 1.3454, df = 1, p-value = 0.2461
We use 85% of the data as the training sample and the remaining 15% as the test sample. We then compare the forecast accuracy of ARIMA(2,1,2) and ARIMA(3,1,1) with that of ARIMA(2,1,1), the best model we found in question 1.
n <- 860
ntrain <- 860*0.85
ntest <- n-ntrain
train.ids <- sample(1:n, ntrain, replace=FALSE)
test.ids <- (1:n)[-train.ids]
train.set <-rate[train.ids, ]
train.set <- diff(train.set[,-1])
test.set <- rate[test.ids, ]
test.set <- diff(test.set[,-1])
arma_model3.train <- arima(train.set, order=c(2,1,1));arma_model3.train
##
## Call:
## arima(x = train.set, order = c(2, 1, 1))
##
## Coefficients:
## ar1 ar2 ma1
## -0.6561 -0.2571 -1.0000
## s.e. 0.0358 0.0358 0.0035
##
## sigma^2 estimated as 10.95: log likelihood = -1910.87, aic = 3829.75
arma_model5.train <- arima(train.set, order=c(2,1,2));arma_model5.train
##
## Call:
## arima(x = train.set, order = c(2, 1, 2))
##
## Coefficients:
## ar1 ar2 ma1 ma2
## -0.0148 0.0298 -1.9919 0.9924
## s.e. 0.0374 0.0374 0.0092 0.0088
##
## sigma^2 estimated as 8.001: log likelihood = -1800.35, aic = 3610.7
arma_model6.train <- arima(train.set, order=c(3,1,1));arma_model6.train
##
## Call:
## arima(x = train.set, order = c(3, 1, 1))
##
## Coefficients:
## ar1 ar2 ar3 ma1
## -0.7294 -0.4445 -0.2838 -1.0000
## s.e. 0.0355 0.0416 0.0356 0.0035
##
## sigma^2 estimated as 10.06: log likelihood = -1880.44, aic = 3770.88
accuracy(forecast(arma_model3.train,length(test.set))$mean,test.set)
## ME RMSE MAE MPE MAPE
## Test set 0.002067387 1.04918 0.5558423 NaN Inf
accuracy(forecast(arma_model5.train,length(test.set))$mean,test.set)
## ME RMSE MAE MPE MAPE
## Test set -0.004711415 1.045894 0.5546047 NaN Inf
accuracy(forecast(arma_model6.train,length(test.set))$mean,test.set)
## ME RMSE MAE MPE MAPE
## Test set -0.005624183 1.045056 0.5622392 NaN Inf
From the accuracy(forecast()) results of the forecast package, RMSE(ARIMA(3,1,1)) < RMSE(ARIMA(2,1,2)) < RMSE(ARIMA(2,1,1)), so ARIMA(3,1,1) is the most accurate of the three models here, although the differences are very small.
ARIMA3Forecast <- forecast((arma_model3),h=10)
plot(ARIMA3Forecast)
ARIMA5Forecast <- forecast((arma_model5),h=10)
plot(ARIMA5Forecast)
ARIMA6Forecast <- forecast((arma_model6),h=10)
plot(ARIMA6Forecast)
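The helper rep_sim() called below is not defined in this section. A minimal sketch consistent with its printed output, mirroring the data-generating process of simulate_data_new() further down (the regression of y_t on x_t and the parameter names are assumptions):
rep_sim <- function(sigma_uv = -0.001, rho = 0.1, rep_time = 1000) {
  for (n in c(100, 500, 1000)) {
    coefs <- replicate(rep_time, {
      # correlated errors: Var(v)=1, Var(u)=0.01, Cov(u,v)=sigma_uv
      Sigma <- matrix(c(1, sigma_uv, sigma_uv, 0.01), nrow = 2, byrow = TRUE)
      err <- MASS::mvrnorm(n = n, mu = c(0, 0), Sigma = Sigma)
      # x_t is an AR(1) with coefficient rho; y_t = beta * x_t + v_t with beta = 0
      x_t <- arima.sim(list(order = c(1, 0, 0), ar = rho), n = n,
                       innov = err[, 2], n.start = 1, start.innov = 0)
      y_t <- err[, 1]
      unname(coef(lm(y_t ~ x_t))[2])       # OLS slope estimate
    })
    print(sprintf("Sample size is %d", n))
    print(sprintf("Mean of coefficient is %.2f", mean(coefs)))
    print(sprintf("Variance of coefficient is %.2f", var(coefs)))
  }
}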
rep_sim(sigma_uv=-0.001,rho=0.1,rep_time=1000)
## [1] "Sample size is 100"
## [1] "Mean of coefficient is -0.04"
## [1] "Variance of coefficient is 0.93"
## [1] "Sample size is 500"
## [1] "Mean of coefficient is 0.02"
## [1] "Variance of coefficient is 0.18"
## [1] "Sample size is 1000"
## [1] "Mean of coefficient is -0.02"
## [1] "Variance of coefficient is 0.09"
rep_sim(sigma_uv=-0.001,rho=0.9,rep_time=1000)
## [1] "Sample size is 100"
## [1] "Mean of coefficient is 0.01"
## [1] "Variance of coefficient is 0.29"
## [1] "Sample size is 500"
## [1] "Mean of coefficient is 0.01"
## [1] "Variance of coefficient is 0.04"
## [1] "Sample size is 1000"
## [1] "Mean of coefficient is -0.00"
## [1] "Variance of coefficient is 0.02"
rep_sim(sigma_uv=-0.009,rho=0.9,rep_time=1000)
## [1] "Sample size is 100"
## [1] "Mean of coefficient is 0.03"
## [1] "Variance of coefficient is 0.30"
## [1] "Sample size is 500"
## [1] "Mean of coefficient is 0.00"
## [1] "Variance of coefficient is 0.04"
## [1] "Sample size is 1000"
## [1] "Mean of coefficient is 0.01"
## [1] "Variance of coefficient is 0.02"
rep_sim(sigma_uv=-0.009,rho=0.1,rep_time=1000)
## [1] "Sample size is 100"
## [1] "Mean of coefficient is -0.02"
## [1] "Variance of coefficient is 1.03"
## [1] "Sample size is 500"
## [1] "Mean of coefficient is 0.01"
## [1] "Variance of coefficient is 0.21"
## [1] "Sample size is 1000"
## [1] "Mean of coefficient is -0.01"
## [1] "Variance of coefficient is 0.10"
From the results above, we can see that as the sample size increases, the mean of the estimated coefficient converges to the true value of \(\beta\) (here 0) and its variance shrinks.
The population value of the slope parameter is 0: since \(\beta = 0\), \[y_{t+1} + y_{t+2} = \beta x_t + v_t + \beta x_{t+1} + v_{t+1} = v_t + v_{t+1} = 0 \times x_t + v_t + v_{t+1}.\]
simulate_data_new <- function(sample_size=100, sigma_uv=-0.001, rho=0.1){
  # joint error draw: Var(v)=1, Var(u)=0.01, Cov(u,v)=sigma_uv
  Sigma <- matrix(c(1, sigma_uv, sigma_uv, 0.01), nrow=2, byrow=TRUE)
  err <- MASS::mvrnorm(n=sample_size, mu=c(0,0), Sigma=Sigma)
  # x_t follows an AR(1) with coefficient rho, driven by the second error
  x_t <- arima.sim(list(order=c(1,0,0), ar=rho), n=sample_size,
                   innov=err[,2], n.start=1, start.innov=0)
  y_t <- err[,1]   # beta = 0, so y_t is just the first error
  df <- cbind(y_t, x_t)
  # regress y_t + y_{t-1} on x_{t-2}, i.e. y_{t+2} + y_{t+1} on x_t (needs dynlm)
  mod.L21 <- dynlm(y_t + L(y_t,1) ~ L(x_t,2), data=df)
  return(unname(mod.L21$coefficients[2]))  # slope estimate on lagged x
}
The estimated equation is \[y_{t+2} + y_{t+1} = \beta_0 + \beta_1 x_t + \varepsilon_t.\]
coef_100 <- replicate(1000,simulate_data_new(sample_size=100,sigma_uv = -0.009,rho=0.9))
coef_500 <- replicate(1000,simulate_data_new(sample_size=500,sigma_uv = -0.009,rho=0.9))
coef_1000 <- replicate(1000,simulate_data_new(sample_size=1000,sigma_uv = -0.009,rho=0.9))
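A quick summary sketch of these replications (the means should approach 0 and the variances shrink as the sample size grows):
sapply(list(n100 = coef_100, n500 = coef_500, n1000 = coef_1000),
       function(b) c(mean = mean(b), variance = var(b)))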
From these summaries, we can see that as the sample size increases, the mean of the coefficient converges to the true value and its variance decreases.
sigma_uv = -0.009
rho=0.9
sample_size = 1000
Sigma = matrix(c(1,sigma_uv,sigma_uv,0.01),nrow=2,byrow=TRUE)
err <- MASS::mvrnorm(n=sample_size,mu=c(0,0),Sigma=Sigma)
x_t <- arima.sim(list(order=c(1,0,0),ar=rho),n=sample_size,innov=err[,2],n.start=1,start.innov=0)
y_t <- err[,1]
df <- cbind(y_t,x_t)
mod.L21 <- dynlm(y_t + L(y_t,1)~L(x_t,2),data=df)
summ <- summary(mod.L21)
naive <- summ$coefficients[2,2]           # conventional OLS standard error of the slope
NW <- sqrt(diag(NeweyWest(mod.L21)))[2]   # HAC standard error via sandwich::NeweyWest
sprintf('Standard deviation of the OLS estimator calculated by naive method is %.4f',naive)
## [1] "Standard deviation of the OLS estimator calculated by naive method is 0.1889"
sprintf('Standard deviation of the OLS estimator calculated by NeweyWest method is %.4f',NW)
## [1] "Standard deviation of the OLS estimator calculated by NeweyWest method is 0.2982"