#install.packages(c("ggplot2","dplyr", "forecast", "nlme", "readr", "readxl", "vars", "tseries", "fastmatch", "seasonal", "urca", "GGally", "nnet", "stargazer", "Rmisc", "sweep", "timetk", "tsDyn", "tsfknn", "forecastHybrid", "tsensembler", "BigVAR", "kableExtra", "broom", "astsa", "dynlm", "fGarch", "e1071"), lib="/usr/lib/R/library")
#install.packages(c("later", "promises")) # packages required to load rnn
library(ggplot2)
library(forecast)
library(nlme) # has the function gls
library(readr)
library(readxl) # determine if path is xls or xlsx,
library(broom) # has tidy function to reshape, combine untidy to cleaner formats
library(vars) # estimate standard errors of fitted VAR parameters.
library(tseries) # for GARCH models and unit root tests
library(fastmatch) # to get column number
library(dynlm) # dynamic linear models and time series regression
library(seasonal) # contains the X11 method for decomposition
library(urca) # for unit root test
library(astsa) # applied stat analysis: has sarima() function
library(sarima) # has the whiteNoiseTest function, ACF, PACF etc
library(GGally) # Creates scatterplot matrix
library(nnet)
library(stargazer) # reports results of VAR estimation
library(Rmisc)
library(sweep)
library(timetk) # has tk_tbl to coerce time series objects (e.g. xts, zoo, ts, timeSeries, etc) to tibble objects.
library(tsDyn) # has VECM function
library(tsfknn) # time series forecasting using KNN
library(kableExtra) # generates customized tables
library(forecastHybrid) # creates ensemble forecasts
library(fGarch) # fits GARCH models
library(egcm) # for Engle-Granger two-step procedure for identifying pairs of cointegrated series.
library(e1071) # builds support vector regressions
library(tsensembler) # models ensemble forecasts from machine learning tools
# reads the dataset called FinCrisis from Excel
FinCrisis = read_xlsx("FinCrisis.xlsx")
FinCrisis = FinCrisis[,-1] # drops the 1st column: Date
# converts from class data.frame to class ts
time_ser=ts(FinCrisis,frequency=12,start=c(1976,6), end=c(2018,12))
All the series become stationary after differencing up to three times.
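A quick way to verify this is with ndiffs() from the forecast package (already loaded above); the column-wise loop below is only a convenience sketch, and max.d is raised from its default because some series need more than two differences.
# estimated number of differences each series needs to become stationary
sapply(colnames(time_ser), function(v) ndiffs(time_ser[, v], max.d = 3))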
# store the differenced variables as new variables and combine those new variables to the time_ser dataset
# ts.union() and cbind() bind two or more time series that share a common frequency; they can also construct a data frame
time_ser_diff = cbind(time_ser,
hous_st_d1 = diff(time_ser[,1]),
income_d2 = diff(diff(time_ser[,2])),
fed_fundsR_d1 = diff(time_ser[,3]),
yield_sp_d1 = diff(time_ser[,4]),
sec_conL_d2 = diff(diff(time_ser[,5])),
unempR_d1 = diff(time_ser[,6]),
CPI_d3 = diff(diff(diff(time_ser[,7]))),
pvt_house_comp_d1 = diff(time_ser[,8]),
mortgR_d1 = diff(time_ser[,9]),
real_estL_d2 = diff(diff(time_ser[,10])),
house_supply_d1 = (diff(time_ser[,11]))
)
# rename variables or column names using plyr
time_ser_diff = time_ser_diff %>% as.data.frame() %>%
plyr::rename(c("time_ser.hous_st" = "hous_st",
"time_ser.income" = "income",
"time_ser.fed_fundsR" = "fed_fundsR",
"time_ser.yield_sp" = "yield_sp",
"time_ser.sec_conL" = "sec_conL",
"time_ser.unempR" = "unempR",
"time_ser.CPI" = "CPI",
"time_ser.pvt_house_comp" = "pvt_house_comp",
"time_ser.mortgR" = "mortgR",
"time_ser.real_estL" = "real_estL",
"time_ser.house_supply" = "house_supply")) %>%
ts(frequency = 12, start = c(1976,6), end =c(2018,12))
# Training and test sets of the FinCrisis dataset
train_set = FinCrisis[1:409, ]
test_set = FinCrisis[-(1:409),]
# Training and test sets of the time_ser_diff data frame
train_set_diff = ts(time_ser_diff[1:409, ], frequency=12, start = c(1976,6))
test_set_diff = ts(time_ser_diff[-(1:409),], frequency = 12, start = c(2010,7))
PCA reduces a p-dimensional dataset to an m-dimensional dataset, where p > m. It describes the original data using fewer variables (dimensions) than were initially measured. We project the original time_ser_diff data onto a new, orthogonal basis, which removes multicollinearity. R computes PCA via singular value decomposition of the centered and scaled data matrix (rather than an eigen decomposition of the covariance matrix) because this is more numerically accurate.
time_ser.pca = prcomp(time_ser_diff[,2:11], # numeric explanatory variables (columns 2-11); hous_st is excluded as the response
center = TRUE, # shift the variables to have zero mean before the analysis
scale. = TRUE) # scale the variables to unit variance; advisable when variables are on different scales
summary(time_ser.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4645 1.3724 1.0866 0.65482 0.49411 0.36791 0.1731
## Proportion of Variance 0.6074 0.1883 0.1181 0.04288 0.02441 0.01354 0.0030
## Cumulative Proportion 0.6074 0.7957 0.9138 0.95669 0.98111 0.99464 0.9976
## PC8 PC9 PC10
## Standard deviation 0.13519 0.06179 0.03869
## Proportion of Variance 0.00183 0.00038 0.00015
## Cumulative Proportion 0.99947 0.99985 1.00000
Proportion of variance shows how much of the variance is explained by that principal component. The components are always sorted by how important they are, so the most important components will always be the first few.
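A scree plot makes this drop-off in explained variance easy to see; a minimal sketch using screeplot() from base R:
# scree plot of the component variances of time_ser.pca
screeplot(time_ser.pca, type = "lines", main = "Scree plot of time_ser.pca")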
# selecting the first 6 principal components to use in the model
x_pca <- time_ser.pca$x[, 1:6]
head(x_pca)
## PC1 PC2 PC3 PC4 PC5 PC6
## [1,] -2.532253 0.9589985 -0.6856539 -0.4194933 0.6971565 -0.9574271
## [2,] -2.417471 1.1089015 -0.9856733 -0.2229814 0.6730305 -0.8642424
## [3,] -2.397437 1.0653462 -1.1664993 -0.3159122 0.5733262 -0.7703319
## [4,] -2.369972 0.9977712 -1.1642373 -0.3433385 0.6511079 -0.7066144
## [5,] -2.267717 1.0412444 -1.4965310 -0.3725687 0.5711167 -0.6006079
## [6,] -2.265047 1.0854381 -1.4703869 -0.5285890 0.5150894 -0.6393254
class(x_pca)
## [1] "matrix"
# combine PC1-PC6 as explanatory variables with hous_st as the response variable in a new dataset:
pca_data = ts.union(time_ser[,1], x_pca)
# renaming variable names
pca_data = pca_data %>% as.data.frame() %>%
plyr::rename(c("time_ser[, 1]" = "hous_st", "x_pca.PC1" = "PC1",
"x_pca.PC2" = "PC2", "x_pca.PC3" = "PC3", "x_pca.PC4" = "PC4",
"x_pca.PC5" = "PC5", "x_pca.PC6" = "PC6")) %>%
ts(frequency = 12, start = c(1976,6), end =c(2018,12))
# Training and test sets of the pca_data
train_pca.ts = ts(pca_data[1:409, ], frequency=12, start = c(1976,6))
test_pca.ts = ts(pca_data[-(1:409),], frequency = 12, start = c(2010,7))
When the trends and patterns of two series are similar, they may be cointegrated. The cointegration test checks whether the residuals from a regression of one series on the other are stationary; if the residuals are stationary, the series are cointegrated, and the correlation between them is statistically meaningful rather than spurious. It is essentially a Dickey-Fuller stationarity test on the residuals in which the null hypothesis is that the series are not cointegrated. For cointegration, we should therefore reject the null hypothesis of no cointegration.
The concept of cointegrated time series arises from the idea that housing prices, securities' prices, interest rates and other economic indicators return to their long-term average levels after significant short-term movements. Beyond imbalances in the demand for and supply of houses, prices revert to their means because housing prices are highly correlated with inflation, and inflation rates in turn are highly correlated with wages and real disposable income.
Given two series x(t) and y(t), R searches for parameters α, β, and ρ such that
y(t) = α + β * x(t) + r(t)
r(t) = ρ * r(t−1) + ϵ(t)
where r(t) is the residual and ϵ(t) is a series of independently and identically distributed (i.i.d.) innovations with mean 0.
If |ρ| < 1, then x(t) and y(t) are cointegrated (i.e., r(t) does not contain a unit root). If |ρ| = 1, the residual series r(t) has a unit root and follows a random walk.
# Constructs an Engle-Granger cointegration model from pvt_house_comp (X) and hous_st (Y)
# identifies a pair of cointegrated series
# procedure selects the appropriate values for α, β, and ρ that best fit the following model:
fit_egcm = egcm(time_ser_diff[,8], time_ser_diff[,1])
summary(fit_egcm)
## Y[i] = 0.9736 X[i] + 66.8147 + R[i], R[i] = 0.7482 R[i-1] + eps[i], eps ~ N(0,125.2861^2)
## (0.0215) (69.6048) (0.0314)
##
## R[511] = -864.9835 (t = -4.788)
##
## WARNING: The series seem cointegrated but the residuals are not AR(1).
##
## Unit Root Tests of Residuals
## Statistic p-value
## Augmented Dickey Fuller (ADF) -4.276 0.00583
## Phillips-Perron (PP) -133.946 0.00010
## Pantula, Gonzales-Farias and Fuller (PGFF) 0.736 0.00010
## Elliott, Rothenberg and Stock DF-GLS (ERSD) -4.261 0.00014
## Johansen's Trace Test (JOT) -79.007 0.00010
## Schmidt and Phillips Rho (SPR) -86.676 0.00010
##
## Variances
## SD(diff(X)) = 95.289345
## SD(diff(Y)) = 112.092429
## SD(diff(residuals)) = 133.568139
## SD(residuals) = 180.670994
## SD(innovations) = 125.286102
##
## Half life = 2.389727
## R[last] = -864.983531 (t=-4.79)
plot(fit_egcm)
This test is different from the Augmented Dickey-Fuller and Phillips-Perron unit root tests: it measures the evidence of cointegration in the residuals of two time series. In this case, I have regressed housing starts on the private houses completed series.
We cannot apply standard ADF critical values to the residuals, because under the null hypothesis of no cointegration they do not follow Dickey-Fuller distributions; instead, the residuals follow Phillips-Ouliaris distributions.
# Phillips-Ouliaris cointegration test with hous_st regressed on pvt_house_comp:
# Available in po.test() from tseries
# Computes the Phillips-Ouliaris test for the null hypothesis that the matrix or multivariate time series is not cointegrated.
po.test(log(time_ser_diff[,c(1,8)]))
##
## Phillips-Ouliaris Cointegration Test
##
## data: log(time_ser_diff[, c(1, 8)])
## Phillips-Ouliaris demeaned = -94.948, Truncation lag parameter =
## 5, p-value = 0.01
H0: the two series are not cointegrated; Ha: the two series are cointegrated.
We reject the null hypothesis and conclude that the two series are cointegrated.
The Johansen test determines whether three or more time series are cointegrated; we then take a linear combination of the underlying series to form a stationary series. A VAR(p) model is of the form:
x_t = μ + A_1 * x_(t−1) + … + A_p * x_(t−p) + w_t
μ = vector-valued mean of the series, A_i = coefficient matrices for each lag, w_t = multivariate Gaussian noise term with mean zero.
By differencing the series, we can form a Vector Error Correction model (VECM):
Δx_t = μ + A * x_(t−1) + Γ_1 * Δx_(t−1) +…+ Γ_p * Δx_(t−p) + w_t
Δx_t = x_t − x_(t−1) : differencing operator, A = coefficient matrix for the first lag, Γ_i = matrices for each differenced lag.
When the matrix A=0, the series are not cointegrated.
We perform an eigenvalue decomposition of A. The Johansen test checks the rank r of A sequentially, starting with r = 0, then r ≤ 1, and so on.
The maximum rank considered is r = n − 1, where n is the number of time series under test.
H0: r = 0 implies that no cointegration is present. When the rank r > 0, there is a cointegrating relationship between at least two of the time series.
The eigenvalue decomposition produces a set of eigenvectors. The components of the eigenvector associated with the largest eigenvalue are used as the coefficients of the linear combination of the time series, and this combination is stationary. We should run the Johansen test of cointegration on variables that are I(1) before building an ECM; if the series are not cointegrated, we do not need an ECM.
fit_jo = ca.jo(log(time_ser_diff[,c(1,8,11)]),
ecdet = "none", # whether to use constant or drift term in the model
type = "trace", # use the trace test statistic or the maximum eigenvalue test statistic
spec = "longrun")
summary(fit_jo)
##
## ######################
## # Johansen-Procedure #
## ######################
##
## Test type: trace statistic , with linear trend
##
## Eigenvalues (lambda):
## [1] 0.14624891 0.05501741 0.01938735
##
## Values of teststatistic and critical values of test:
##
## test 10pct 5pct 1pct
## r <= 2 | 9.97 6.50 8.18 11.65
## r <= 1 | 38.77 15.66 17.95 23.52
## r = 0 | 119.25 28.71 31.52 37.22
##
## Eigenvectors, normalised to first column:
## (These are the cointegration relations)
##
## hous_st.l2 pvt_house_comp.l2 house_supply.l2
## hous_st.l2 1.0000000 1.0000000 1.000000
## pvt_house_comp.l2 -0.9976467 -0.6774901 -12.345973
## house_supply.l2 0.1172957 0.9884285 3.759475
##
## Weights W:
## (This is the loading matrix)
##
## hous_st.l2 pvt_house_comp.l2 house_supply.l2
## hous_st.d 0.02084387 -0.083670630 0.0010979229
## pvt_house_comp.d 0.21339732 -0.002545864 0.0006017081
## house_supply.d 0.04838887 -0.011902500 -0.0027458166
The largest eigenvalue generated by the test is 0.14624891.
Next, the output shows the trace test statistic for the three hypotheses r ≤ 2, r ≤ 1 and r = 0. In each case the statistic exceeds the 5% critical value: for instance, when r = 0, 119.25 > 31.52, and for the null hypothesis r ≤ 1 against the alternative r > 1, 38.77 > 17.95, so we reject r ≤ 1, i.e. the null of at most one cointegrating relationship. At the 1% level, however, we cannot reject r ≤ 2 (9.97 < 11.65), so we take the rank of the matrix to be 2, and a linear combination of the three time series will be stationary.
We can form that linear combination using the components of the eigenvector associated with the largest eigenvalue, 0.14624891. Correspondingly, we use the vector under the column hous_st.l2, (1.0000000, -0.9976467, 0.1172957), to obtain a stationary series.
linear_series = 1.000*time_ser_diff[,1] - 0.9976467*time_ser_diff[,8] + 0.1172957*time_ser_diff[,11]
autoplot(linear_series, type="l") + xlab("Years") + ggtitle("Linear Combination of hous_st, pvt_house_comp and house_supply")
adf.test(linear_series)
##
## Augmented Dickey-Fuller Test
##
## data: linear_series
## Dickey-Fuller = -4.2937, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
The p-value in the Dickey-Fuller test is 0.01 < 0.05. So, we reject the null hypothesis of unit root and conclude that the series formed from the linear combination is stationary.
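As a cross-check, a KPSS test from the tseries package, whose null hypothesis is stationarity, can be applied to the same series; a minimal sketch (output not shown):
# KPSS test: failing to reject the null here would corroborate the ADF conclusion
kpss.test(linear_series, null = "Level")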
http://blog.mindymallory.com/2018/02/basic-time-series-analysis-the-var-model-explained/
Variables that are I(0) have a short-run relationship, whereas cointegrated I(1) variables have a long-run relationship. Both short-run and long-run effects are present in the error correction model. The first equation below is the ARDL(1,1) model, which presumes a long-run or steady-state association between x and y. When deriving the error correction model, we can add further lagged differences of the regressor (the x variable) to remove serial correlation.
y_t = δ + θ_1* y_(t−1) + δ_0 * x_t + δ_1 * x_(t−1) + ν_t,
Δy_t = −α * [y_(t−1) − β_1 − β_2 * x_(t−1)] + δ_0 * Δx_t + ν_t
We can construct the ECM for US housing starts and housing supply as:
Δb_t = −α * [b_(t−1) − β_1 − β_2 * f_(t−1)] + δ_0 * Δf_t + δ_1 * Δf_(t−1) + ν_t (estimated using the code below)
We can estimate a vector autoregression model of order 1, VAR(1), if both series are I(0). If they are I(1), we can estimate the same equations by taking first differences.
y_t = β_10 + β_11 * y_(t−1) + β_12 * x_(t−1) + ν
x_t = β_20 + β_21 * y_(t−1) + β_22 * x_(t−1) + ν
If both the variables in the above equations are cointegrated, we have to include the cointegration relationship in the model. This model is known as the vector error correction model. The equation below displays the cointegration relationship with stationary error terms.
y_t = β_0 + β_1 * x_t + e_t
In the code below, I have estimated the cointegrating relationship between hous_st, house_supply and pvt_house_comp:
# dynlm () fits Dynamic Linear Models and Time Series Regression
fit_dyn = dynlm(hous_st_d1 ~ house_supply_d1 + pvt_house_comp_d1 -1, data = train_set_diff, na.action = na.omit) # -1 indicates no intercept
resid_fit = resid(fit_dyn)
ndiffs(resid_fit) # result: 0, i.e. the residuals do not need differencing
## [1] 0
tidy(fit_dyn) # The results of the cointegration equation fit_dyn
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 house_supply_d1 -44.4 10.2 -4.35 0.0000173
## 2 pvt_house_comp_d1 0.223 0.0639 3.49 0.000545
augment(fit_dyn) # shows the fitted values and residuals for each of the original points in the regression.
## # A tibble: 408 x 11
## .rownames hous_st_d1 house_supply_d1 pvt_house_comp_… .fitted .se.fit
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Jul 1976 -94 -0.5 -68 7.08 6.93
## 2 Aug 1976 149 -0.2 52 20.5 3.78
## 3 Sep 1976 170 0 -3 -0.668 0.192
## 4 Oct 1976 -91 -0.4 17 21.6 4.16
## 5 Nov 1976 12 0.2 40 0.0156 3.38
## 6 Dec 1976 163 -0.4 29 24.2 4.37
## 7 Jan 1977 -277 -0.300 0 13.3 3.07
## 8 Feb 1977 416 -0.2 198 53.0 12.7
## 9 Mar 1977 120 -0.1000 -36 -3.57 2.58
## 10 Apr 1977 -171 0.5 -71 -38.0 6.59
## # … with 398 more rows, and 5 more variables: .resid <dbl>, .hat <dbl>,
## # .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>
output = dynlm(d(resid_fit)~L(resid_fit)+L(d(resid_fit))-1) #no constant
tidy(output)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 L(resid_fit) -1.62 0.0807 -20.1 1.08e-62
## 2 L(d(resid_fit)) 0.189 0.0488 3.86 1.31e- 4
The relevant statistic is the t-value on L(resid_fit), −20.1. We reject the null hypothesis that the residuals have a unit root; therefore the series are cointegrated.
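As an additional check on the cointegrating regression, adf.test() from tseries can be applied directly to its residuals; a small sketch (the reported p-value uses Dickey-Fuller rather than Phillips-Ouliaris critical values, so it is only indicative):
# ADF test on the residuals of the cointegrating regression fit_dyn
adf.test(na.omit(resid_fit))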
If the p-value is below 0.05, we reject the null hypothesis of no cointegration; the PO test above rejects this null at the 5 percent level. When series are cointegrated, we apply the vector error correction model to better understand the causal relationship between the variables.
vec_hous_st<- dynlm(d(hous_st)~L(resid_fit), data = train_set_diff)
vec_house_supply <- dynlm(d(house_supply)~L(resid_fit), data = train_set_diff)
# generates the results from both equations
tidy(vec_hous_st)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2.71 5.48 -0.494 6.21e- 1
## 2 L(resid_fit) -0.362 0.0485 -7.47 4.96e-13
tidy(vec_house_supply)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.00460 0.0273 0.168 0.866
## 2 L(resid_fit) -0.0000399 0.000242 -0.165 0.869
In the hous_st equation, the coefficient on the error correction term e_(t−1) is significant, implying that housing starts adjust to deviations from the long-run equilibrium, i.e., changes in housing supply influence housing starts.
In the house_supply equation, by contrast, the error correction coefficient is statistically insignificant, indicating that housing supply does not respond to these deviations, so changes in housing starts do not appear to drive housing supply.
# time series regression of lagged variables: hous_st and house_supply
fit_lag_dyn = dynlm(L(train_set_diff[,1])~L(train_set_diff[,11]))
coef_b1 = coef(fit_lag_dyn)[[1]] # coefficient of the intercept from fit_lag_dyn model
coef_b2 = coef(fit_lag_dyn)[[2]] # coefficient of the lagged value of the variable: house_supply
# regress hous_st (y variable) with lagged hous_st, levelled and lagged values of house_supply
fit_dyn_ols = dynlm(train_set_diff[,1]~L(train_set_diff[,1])+train_set_diff[,11]+L(train_set_diff[,11]))
coef_a <- 1-coef(fit_dyn_ols)[[2]] # subtract the coefficient of lagged hous_st from 1
coef_d0 <- coef(fit_dyn_ols)[[3]] # coefficient of house_supply in fit_dyn_ols equation
coef_d1 <- coef(fit_dyn_ols)[[4]] # coefficient of lagged house_supply in fit_dyn_ols equation
D_hous_st <- diff(train_set_diff[,1]) # first differencing of hous_st series
D_house_supply <- diff(train_set_diff[,11]) # first differencing of house_supply series
L_hous_st <- stats::lag(train_set_diff[,1],-1) # takes the first lag of hous_st
L_house_supply <- stats::lag(train_set_diff[,11],-1) # takes the first lag of house_supply
LD_house_supply <- stats::lag(diff(train_set_diff[,11]),-1) # takes the first lag of the difference of house_supply
st_supply_combined = data.frame(ts.union(cbind(
train_set_diff[,1],train_set_diff[,11],L_hous_st,L_house_supply,D_hous_st,D_house_supply,LD_house_supply)))
# estimate non-linear least squares (nls) model using the following arguments:
# 1. formula = regression model to be estimated written using regular text mathematical operators
# 2. start = list of guessed or otherwise approximated values of the estimated parameters to initiate a Gauss-Newton numerical optimization process
# 3. data = a data frame, list, or environment data source.
formula = D_hous_st ~ -a*(L_hous_st-b1-b2*L_house_supply) + d0 * D_house_supply+d1*LD_house_supply
fit_nls = nls(formula, na.action=na.omit, data=st_supply_combined,
start=list(a=coef_a, b1 = coef_b1, b2 = coef_b2,
d0 = coef_d0, d1 = coef_d1))
tidy(fit_nls) # parameter estimates in the error correction model
## # A tibble: 5 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 a 0.0964 0.0201 4.80 2.21e- 6
## 2 b1 2814. 229. 12.3 1.06e-29
## 3 b2 -216. 36.2 -5.97 5.14e- 9
## 4 d0 -49.2 10.2 -4.83 1.92e- 6
## 5 d1 -9.89 10.5 -0.942 3.47e- 1
We can also use the error correction model to assess whether the two series are cointegrated by testing the errors from the correction part for stationarity. The errors are estimated as:
e_(t−1) = b_(t−1) − β_1 − β_2 * f_(t−1)
ehat = st_supply_combined$L_hous_st - coef(fit_nls)[[2]] -
coef(fit_nls)[[3]] * st_supply_combined$L_house_supply # e_(t-1) = b_(t-1) - b1 - b2 * f_(t-1)
ehat = ts(ehat)
ehat.adf <- dynlm(d(ehat)~L(ehat)+L(d(ehat))-1)
tidy(ehat.adf)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 L(ehat) -0.00327 0.00386 -0.846 0.398
## 2 L(d(ehat)) -0.138 0.0493 -2.80 0.00537
#caption="Stationarity test within the error correction model")
We compare the t-statistic of the lagged term, t = −0.846343, to test for cointegration. As the p-value of 0.3978 > 0.05, we fail to reject the null of no cointegration.
In a VAR we treat all variables symmetrically, i.e., we model them so that the endogenous variables can each affect one another.
As a generalization of the univariate autoregressive model, a VAR forecasts a vector of time series. The system has one equation per variable, and the right-hand side of each equation contains a constant and lags of all the variables.
For stationary series, we can fit the VAR directly to the data and forecast; this is called “VAR in levels”. Otherwise, we difference the non-stationary data first and then fit the model; the result is a “VAR in differences”. Using non-stationary level variables in a VAR can produce spurious regressions, and differencing the variables remedies the problem. In both cases, the model is estimated by least squares.
Moreover, non-stationary series may be cointegrated, meaning that some linear combination of the variables is stationary. In that case we should build a vector error correction model, i.e. a VAR model with an error correction mechanism.
The VAR model can be used when the variables under study are I(1) but not cointegrated.
Δy_t = β_11 * Δy_(t−1) + β_12 * Δx_(t−1) + ν
Δx_t = β_21 * Δy_(t−1) + β_22 * Δx_(t−1) + ν
# The R output shows the lag length selected by each of the information criteria available in the vars package.
# selection criteria summary
VARselect(time_ser_diff[,1:2], # Data item containing the endogenous variables
type="both", # Type of deterministic regressors to include: constant, trend, both or none
exogen = time_ser_diff[,3:7])[["selection"]]
## AIC(n) HQ(n) SC(n) FPE(n)
## 5 4 3 5
The R output shows the lag length selected by each of the information criteria available in the vars package: AIC and FPE select p = 5, HQ selects p = 4, and SC selects p = 3. The VAR() call below, however, uses the default lag order p = 1, so the estimated model includes only first lags of the endogenous variables.
R estimates the VAR by OLS, equation by equation, where the model is of the form: y_t = A_1 * y_(t−1) + … + A_p * y_(t−p) + C * D_t + u_t
where y_t is a K × 1 vector of endogenous variables and u_t is a spherical disturbance term of the same dimension. The coefficient matrices A_1, …, A_p are of dimension K × K, and D_t contains the deterministic and exogenous regressors.
# estimates VAR by using OLS per equation
var1 = VAR(time_ser_diff[,1:2], type="both", exogen = na.omit(time_ser_diff[, 3:7]))
stargazer(var1$varresult, type="text", dep.var.labels = c("Housing Starts, Income"), align = TRUE)
##
## ===========================================================
## Dependent variable:
## ----------------------------
## Housing Starts, Income
## (1) (2)
## -----------------------------------------------------------
## hous_st.l1 0.924*** 0.026**
## (0.019) (0.012)
##
## income.l1 -0.006 0.871***
## (0.034) (0.021)
##
## const 369.634* 642.167***
## (199.954) (122.081)
##
## trend 0.252 3.050***
## (1.280) (0.781)
##
## fed_fundsR -6.722 1.831
## (4.557) (2.782)
##
## yield_sp 3.740 1.359
## (13.131) (8.017)
##
## sec_conL 0.013 0.121***
## (0.048) (0.029)
##
## unempR -8.297 2.309
## (5.497) (3.356)
##
## CPI -1.348 -3.233**
## (2.513) (1.534)
##
## -----------------------------------------------------------
## Observations 510 510
## R2 0.928 1.000
## Adjusted R2 0.926 0.999
## Residual Std. Error (df = 501) 110.069 67.202
## F Statistic (df = 8; 501) 802.658*** 125,643.400***
## ===========================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# function computes the multivariate Portmanteau- and Breusch-Godfrey test for serially correlated errors.
serial.test(var1, lags.pt=10)
##
## Portmanteau Test (asymptotic)
##
## data: Residuals of VAR object var1
## Chi-squared = 106.87, df = 36, p-value = 5.972e-09
H0: no serial correlation; H1: serial correlation is present.
With a p-value of 5.97e-09, we reject the null hypothesis of no serial correlation, so the residuals of this model fail the test; some autocorrelation remains. The forecasts generated by the fitted VAR are plotted below:
# n.ahead - number of forecast steps
# dumvar - matrix which has the same column dimension as in the call to VAR() and row dimension equal to n.ahead. It is used when we have "exogen" in VAR
regressors = tail(time_ser_diff[,3:7], 50)
fct =predict(var1, dumvar = regressors ,n.ahead = nrow(regressors)) # gives 50 forecast values of hous_st
# Time Series plots of VAR forecasts with differently shaded confidence regions (fanchart) for each endogenous variable.
fanchart(fct, colors = "red", cis = 0.95, plot.type = "multiple", main = "Forecasts")
We implement the Granger causality test with an F-test on the lags of the other variables. It tests whether lags of one variable are useful in forecasting housing starts, and vice versa.
For instance, fed_fundsR Granger-causes hous_st if hous_st can be predicted more accurately from the lagged values of both hous_st and fed_fundsR than from the lagged values of hous_st alone. Thus, the Granger causality test examines whether lagged values of one variable enhance the forecasts of another variable.
# Granger causality
causality(var1, cause="hous_st")
## $Granger
##
## Granger causality H0: hous_st do not Granger-cause income
##
## data: VAR object var1
## F-Test = 4.9632, df1 = 1, df2 = 1002, p-value = 0.02611
##
##
## $Instant
##
## H0: No instantaneous causality between: hous_st and income
##
## data: VAR object var1
## Chi-squared = 4.6197, df = 1, p-value = 0.03161
causality(var1, cause="income")
## $Granger
##
## Granger causality H0: income do not Granger-cause hous_st
##
## data: VAR object var1
## F-Test = 0.027988, df1 = 1, df2 = 1002, p-value = 0.8672
##
##
## $Instant
##
## H0: No instantaneous causality between: income and hous_st
##
## data: VAR object var1
## Chi-squared = 4.6197, df = 1, p-value = 0.03161
Three of the four tests have sufficiently small p-values, so we reject their null hypotheses: housing starts Granger-cause income, and there is instantaneous causality between housing starts and income. Lagged housing starts therefore carry information for predicting today's income.
Conversely, we fail to reject the null hypothesis that income does not Granger-cause hous_st, so income does not play much of a role in predicting today's housing starts.
The ARCH-LM test with q lags checks for the presence of ARCH effects at lags 1 to q. It tests whether the coefficients α_1, …, α_q in the equation below are jointly zero:
x^2_t = α_0 + α_1 * x^2_(t-1) +….+ α_q * x^2_(t-q) + ϵ_t
# compute univariate and multivariate ARCH-LM tests for a VAR(p).
arch.test(var1, lags.multi = 2) # uses 2 lags while computing multivariate test statistic
##
## ARCH (multivariate)
##
## data: Residuals of VAR object var1
## Chi-squared = 144.39, df = 18, p-value < 2.2e-16
When q = 2, we test for ARCH effects jointly at lags 1 and 2, with H0: α_1 = α_2 = 0. As the p-value is very small, we reject the null hypothesis and conclude that ARCH effects are present at lags 1 and 2 jointly. ARCH effects are also present at higher lag orders, implying that the data are conditionally heteroskedastic.
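Given these ARCH effects, the conditional variance of a single equation's residuals could be modelled with a GARCH(1,1). A minimal sketch using garchFit() from fGarch (loaded above); pulling the residuals of the hous_st equation out of var1$varresult is an assumption about which series to model:
# residuals of the housing-starts equation of the fitted VAR
res_hous = resid(var1$varresult$hous_st)
# fit a GARCH(1,1) to these residuals; trace = FALSE suppresses optimizer output
fit_garch = garchFit(~ garch(1, 1), data = res_hous, trace = FALSE)
summary(fit_garch)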
In an IRF, we shock one variable, say income, and propagate the shock through the fitted VAR model for a number of periods. We can trace it through the VAR model and see whether it affects the other variables in a statistically significant way.
IRF_hous_st = irf(var1, impulse = 'hous_st', response = 'income', n.ahead = 5, ci = 0.95) # the impulse and response names come from the set of endogenous variables
IRF_income = irf(var1, impulse = 'income', response = 'hous_st', n.ahead = 5, ci = 0.95)
plot(IRF_hous_st)
plot(IRF_income)
An impulse (shock) to housing starts at time zero has large effects in the next period, and the effects grow over time. The dotted lines show the 95 percent interval estimates of these effects. The irf objects store the numerical values behind these graphs.
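The numbers behind these graphs can be read off the irf objects directly; a small sketch, assuming the usual varirf layout with $irf, $Lower and $Upper components:
# response of income to a shock in hous_st, with 95% lower and upper bounds
cbind(IRF_hous_st$irf$hous_st, IRF_hous_st$Lower$hous_st, IRF_hous_st$Upper$hous_st)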
Using the VAR model, a Forecast Error Variance Decomposition examines the impact of variables on one another. We use the forecast errors of each equation in the fitted VAR model to compute FEVD. Then, the fitted VAR model determines how much of each error realization is coming from unexpected changes (forecast errors) in the other variable.
Variance decomposition helps to interpret the VAR model. We can determine how much of the variation in each dependent variable is explained by each independent variable. The FEVD shows how a shock to one time series changes the future uncertainty of the other time series in the system. This effect evolves over time, so a shock to a series may matter little in the short run but be very significant in the long run.
# We can calculate FEVD with fevd() from the vars package.
FEVD = fevd(var1, n.ahead = 10)
plot(FEVD)
In the first plot, we see the FEVD for housing starts: most of its forecast error variance is attributed to its own shocks, and the contribution of income is small, consistent with the earlier finding that income does not Granger-cause housing starts.
In the second plot, we see the FEVD for income: shocks to housing starts account for part of income's forecast error variance, consistent with housing starts Granger-causing income.
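The numeric shares behind the FEVD plots can also be printed; a small sketch, assuming fevd() returns one matrix per endogenous variable:
# share of the forecast error variance of hous_st explained by each variable at horizons 1, 5 and 10
round(FEVD$hous_st[c(1, 5, 10), ], 3)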
# series may be non-stationary but they are cointegrated, which means that there
# exists a linear combination of them that is stationary. In this case a VAR
# specification that includes an error correction mechanism (usually referred to
# as a vector error correction model) should be included
VECmodel = VECM(train_set_diff[,1:5],
lag = 3, # Number of lags
r = 1, # Number of cointegrating relationships
include = "const", # Type of deterministic regressors to include
estim = "ML") # Type of estimator: 2OLS for the two-step approach or ML for Johansen MLE
summary(VECmodel)
## #############
## ###Model VECM
## #############
## Full sample size: 409 End sample size: 405
## Number of variables: 5 Number of estimated slope parameters 85
## AIC 6486.702 BIC 6843.048 SSR 5985445
## Cointegrating vector (estimated by ML):
## hous_st income fed_fundsR yield_sp sec_conL
## r1 1 0.891575 49.52901 -79.69626 -2.224757
##
##
## ECT Intercept
## Equation hous_st -0.0223(0.0190) 125.8165(108.1263)
## Equation income 0.0127(0.0103) -48.1334(58.5937)
## Equation fed_fundsR 4.3e-05(9.0e-05) -0.2698(0.5148)
## Equation yield_sp -5.4e-05(3.5e-05) 0.3205(0.1981)
## Equation sec_conL 0.0061(0.0009)*** -33.4027(5.2147)***
## hous_st -1 income -1
## Equation hous_st -0.3824(0.0517)*** 0.0734(0.0941)
## Equation income -0.0091(0.0280) -0.3562(0.0510)***
## Equation fed_fundsR 3.8e-05(0.0002) -2.7e-05(0.0004)
## Equation yield_sp -0.0002(9.5e-05) -0.0001(0.0002)
## Equation sec_conL -0.0074(0.0025)** -0.0062(0.0045)
## fed_fundsR -1 yield_sp -1
## Equation hous_st -22.5685(13.5130). -14.2490(35.1534)
## Equation income -10.3410(7.3227) -30.8697(19.0496)
## Equation fed_fundsR 0.2211(0.0643)*** -1.1347(0.1674)***
## Equation yield_sp 0.0047(0.0248) 0.3397(0.0644)***
## Equation sec_conL -0.3420(0.6517) -0.2964(1.6954)
## sec_conL -1 hous_st -2
## Equation hous_st -0.2307(1.0368) -0.1705(0.0539)**
## Equation income 0.3111(0.5618) 0.0368(0.0292)
## Equation fed_fundsR 0.0040(0.0049) 0.0005(0.0003).
## Equation yield_sp 2.5e-05(0.0019) -0.0002(9.9e-05)*
## Equation sec_conL 0.1392(0.0500)** -0.0008(0.0026)
## income -2 fed_fundsR -2
## Equation hous_st 0.0762(0.0966) -11.5409(13.5994)
## Equation income -0.2787(0.0523)*** 0.2188(7.3695)
## Equation fed_fundsR 0.0003(0.0005) -0.1286(0.0647)*
## Equation yield_sp -0.0002(0.0002) -0.0420(0.0249).
## Equation sec_conL 0.0054(0.0047) -0.1984(0.6559)
## yield_sp -2 sec_conL -2
## Equation hous_st -26.3782(36.1831) 0.6551(1.0374)
## Equation income -18.6359(19.6077) 0.3534(0.5622)
## Equation fed_fundsR 0.4183(0.1723)* -0.0032(0.0049)
## Equation yield_sp -0.3196(0.0663)*** 0.0007(0.0019)
## Equation sec_conL 1.2717(1.7450) 0.0900(0.0500).
## hous_st -3 income -3
## Equation hous_st -0.0117(0.0508) -0.0403(0.0943)
## Equation income 0.0093(0.0275) -0.2130(0.0511)***
## Equation fed_fundsR -0.0002(0.0002) 0.0001(0.0004)
## Equation yield_sp -2.2e-05(9.3e-05) -0.0002(0.0002)
## Equation sec_conL -0.0025(0.0024) 0.0042(0.0045)
## fed_fundsR -3 yield_sp -3
## Equation hous_st -15.6448(12.4366) 28.2351(36.7446)
## Equation income 5.8998(6.7394) 25.0770(19.9119)
## Equation fed_fundsR -0.1635(0.0592)** -0.5129(0.1749)**
## Equation yield_sp 0.0724(0.0228)** 0.1327(0.0673)*
## Equation sec_conL -0.1802(0.5998) 1.5537(1.7721)
## sec_conL -3
## Equation hous_st -0.1031(1.0183)
## Equation income 0.2590(0.5518)
## Equation fed_fundsR -0.0019(0.0048)
## Equation yield_sp 0.0023(0.0019)
## Equation sec_conL 0.1732(0.0491)***
In the VECM output above, the error correction term (ECT) is statistically significant only in the sec_conL equation (0.0061, p < 0.001), suggesting that securitized consumer loans adjust toward the long-run equilibrium while the other variables show no significant adjustment. Among the short-run dynamics, each variable's own first lag is typically the most significant term (for example, hous_st −1 in the hous_st equation and income −1 in the income equation).
(VECforecast = predict(VECmodel, n.ahead = 60))
## hous_st income fed_fundsR yield_sp sec_conL
## 410 587.9733 11821.40 0.1527269896 2.514954 2517.559
## 411 575.7852 11816.72 0.1580969836 2.539636 2514.938
## 412 571.1187 11837.63 0.1995570508 2.540979 2512.559
## 413 582.6378 11856.18 0.1455268743 2.567269 2510.315
## 414 588.6964 11862.51 0.0569583212 2.593328 2508.541
## 415 591.6404 11868.99 0.0001405286 2.609843 2507.139
## 416 597.7493 11879.74 -0.0415946555 2.625940 2505.866
## 417 603.9747 11891.66 -0.0845962170 2.643446 2504.715
## 418 608.7993 11902.33 -0.1283099531 2.660230 2503.752
## 419 613.4194 11912.64 -0.1718928908 2.676389 2502.960
## 420 618.0799 11923.50 -0.2160591601 2.692935 2502.308
## 421 622.6389 11934.79 -0.2604974242 2.709835 2501.789
## 422 627.0699 11946.12 -0.3043989881 2.726406 2501.406
## 423 631.3639 11957.48 -0.3472241487 2.742395 2501.155
## 424 635.5064 11969.00 -0.3891319259 2.758047 2501.033
## 425 639.5059 11980.72 -0.4304770971 2.773539 2501.031
## 426 643.3612 11992.57 -0.4713458867 2.788835 2501.148
## 427 647.0764 12004.53 -0.5116523319 2.803873 2501.383
## 428 650.6600 12016.62 -0.5513644743 2.818658 2501.732
## 429 654.1154 12028.85 -0.5905061698 2.833212 2502.192
## 430 657.4427 12041.20 -0.6290940927 2.847538 2502.761
## 431 660.6434 12053.66 -0.6671302519 2.861634 2503.437
## 432 663.7205 12066.25 -0.7046225748 2.875502 2504.218
## 433 666.6770 12078.95 -0.7415868946 2.889152 2505.100
## 434 669.5153 12091.77 -0.7780372767 2.902589 2506.083
## 435 672.2377 12104.69 -0.8139830267 2.915816 2507.163
## 436 674.8467 12117.73 -0.8494327894 2.928836 2508.339
## 437 677.3447 12130.87 -0.8843967199 2.941654 2509.610
## 438 679.7340 12144.12 -0.9188853911 2.954275 2510.971
## 439 682.0169 12157.47 -0.9529087690 2.966703 2512.423
## 440 684.1955 12170.92 -0.9864764089 2.978941 2513.963
## 441 686.2720 12184.47 -1.0195978012 2.990994 2515.589
## 442 688.2487 12198.12 -1.0522823270 3.002865 2517.300
## 443 690.1275 12211.85 -1.0845391206 3.014558 2519.093
## 444 691.9105 12225.68 -1.1163770838 3.026077 2520.967
## 445 693.5997 12239.60 -1.1478049524 3.037426 2522.920
## 446 695.1971 12253.61 -1.1788313062 3.048608 2524.951
## 447 696.7046 12267.71 -1.2094645456 3.059626 2527.057
## 448 698.1241 12281.89 -1.2397128869 3.070485 2529.238
## 449 699.4574 12296.15 -1.2695843740 3.081187 2531.492
## 450 700.7063 12310.49 -1.2990868860 3.091735 2533.817
## 451 701.8726 12324.92 -1.3282281381 3.102133 2536.212
## 452 702.9580 12339.42 -1.3570156834 3.112385 2538.675
## 453 703.9643 12353.99 -1.3854569169 3.122492 2541.205
## 454 704.8929 12368.64 -1.4135590795 3.132458 2543.801
## 455 705.7457 12383.37 -1.4413292609 3.142287 2546.460
## 456 706.5241 12398.16 -1.4687744027 3.151980 2549.183
## 457 707.2297 12413.03 -1.4959013009 3.161541 2551.966
## 458 707.8641 12427.96 -1.5227166099 3.170972 2554.811
## 459 708.4287 12442.96 -1.5492268446 3.180277 2557.713
## 460 708.9250 12458.03 -1.5754383839 3.189457 2560.674
## 461 709.3544 12473.16 -1.6013574733 3.198516 2563.691
## 462 709.7183 12488.36 -1.6269902277 3.207456 2566.764
## 463 710.0181 12503.61 -1.6523426343 3.216280 2569.890
## 464 710.2552 12518.93 -1.6774205547 3.224989 2573.070
## 465 710.4308 12534.30 -1.7022297284 3.233586 2576.301
## 466 710.5462 12549.73 -1.7267757745 3.242075 2579.584
## 467 710.6027 12565.22 -1.7510641951 3.250456 2582.916
## 468 710.6015 12580.77 -1.7751003767 3.258732 2586.297
## 469 710.5438 12596.37 -1.7988895935 3.266906 2589.725
plot(VECforecast)
The code below builds feed-forward neural networks. I have used a Box-Cox transformation to obtain homoskedastic errors.
# nnetar function in the forecast package for R fits a neural network model to a time series with lagged values of the time series as inputs (and possibly some other exogenous inputs).
# This is a nonlinear autoregressive model, so prediction intervals cannot be derived analytically.
(fit_nnr = nnetar(train_pca.ts[,1],
xreg = train_pca.ts[,2:7], # external regressors, which must have the same number of rows
lambda="auto")) # transformation is automatically selected using BoxCox.lambda.
## Series: train_pca.ts[, 1]
## Model: NNAR(25,1,16)[12]
## Call: nnetar(y = train_pca.ts[, 1], xreg = train_pca.ts[, 2:7], lambda = "auto")
##
## Average of 20 networks, each of which is
## a 31-16-1 network with 529 weights
## options were - linear output units
##
## sigma^2 estimated as 41.88
autoplot(forecast::forecast(fit_nnr, xreg = train_pca.ts[,2:7], h=300) ) + xlab("Years") + ylab("Housing Starts (in 1000s of units)")
Neural networks are not based on a well-defined stochastic model, so it is not straightforward to derive prediction intervals for the resulting forecasts. However, we can still compute prediction intervals by simulation, generating future sample paths with bootstrapped residuals.
Here is a simulation of 9 possible future sample paths for the housing starts data, each covering the next 40 periods after the observed data. By repeatedly simulating sample paths, we build up knowledge of the distribution of all future values implied by the fitted neural network.
sim = ts(matrix(0, nrow=40L, ncol=9L), # generates values for 9 scenarios (columns)
start=end(train_set_diff[,1])[1L]+1L)
for(i in seq(9))
sim[,i] = simulate(fit_nnr,xreg = tail(train_pca.ts[,2:7], 40), nsim=40L) # predict 40 observations (rows)
autoplot(train_set_diff[,1]) + autolayer(sim) + ggtitle("Future Sample Paths of the Housing Starts Data") + xlab("Years") + ylab("Housing Starts (in 1000s of units)")
Repeating the simulation many times gives a clearer picture of the forecast distribution produced by the neural network.
# forecast() function produces prediction intervals for NNAR models:
fcast = forecast::forecast(fit_nnr,xreg = train_pca.ts[,2:7], PI=FALSE, h=40) # computes prediction intervals when TRUE
autoplot(fcast) + ggtitle("Forecasts with prediction intervals for the monthly US housing starts data.")
sweep::sw_tidy(fit_nnr)
## # A tibble: 4 x 2
## term estimate
## <chr> <dbl>
## 1 m 12
## 2 p 25
## 3 P 1
## 4 size 16
sweep::sw_glance(fit_nnr) %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
| model.desc | sigma | logLik | AIC | BIC | ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| NNAR(25,1,16)[12] | 5.884411 | NA | NA | NA | 0.0168485 | 5.884411 | 3.743895 | -0.0099673 | 0.2893068 | 0.0172308 | -0.1286103 |
# compares predicted vs actual values
sweep::sw_augment(fit_nnr) %>% tail() %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
| index | .actual | .fitted | .resid |
|---|---|---|---|
| Jan 2010 | 614 | 598.4449 | 15.555101 |
| Feb 2010 | 604 | 600.4611 | 3.538882 |
| Mar 2010 | 636 | 637.6933 | -1.693335 |
| Apr 2010 | 687 | 681.1662 | 5.833824 |
| May 2010 | 583 | 577.9268 | 5.073236 |
| Jun 2010 | 536 | 544.4407 | -8.440703 |
# monthly forecasts with 90% and 95% prediction levels, coerced to a tibble with tk_tbl()
forecast::forecast(fit_nnr,xreg = train_pca.ts[,2:7],h=12,level = c(90, 95)) %>% tk_tbl()
## # A tibble: 35 x 13
## index Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## <dbl> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 2010 "" "" "" "" "" "" " 98… 1075… 1107… 1171… 1249…
## 2 2011 1392.… 1449… 1580… 1599… 1535… 1585… 1510… 1456… 1497… 1663… 1587…
## 3 2012 1565.… 1663… 1631… 1804… 1892… 1859… 1888… 1856… 1871… 1923… 1892…
## 4 2013 1801.… 1791… 1719… 1760… 1820… 1855… 1877… 1891… 1950… 1945… 1843…
## 5 2014 1604.… 1502… 1338… 1077… 1037… 1050… 1112… 1256… 1407… 1372… 1501…
## 6 2015 1487.… 1472… 1360… 1339… 1348… 1161… 1086… " 98… " 91… " 90… " 83…
## 7 2016 " 848… " 73… " 84… " 91… " 90… 1000… " 98… 1098… 1084… 1184… 1186…
## 8 2017 1388.… 1657… 1632… 1594… 1533… 1720… 1709… 1841… 1888… 1729… 1724…
## 9 2018 1772.… 1994… 2131… 1776… 1906… 1795… 1840… 1816… 1622… 1675… 1642…
## 10 2019 1615.… 1655… 1648… 1814… 1842… 1732… 1708… 1717… 1807… 1743… 1823…
## # … with 25 more rows, and 1 more variable: Dec <fct>
KNN is an algorithm used for regression and classification. It stores a collection of examples; each example is a vector of features describing it, together with an associated target value. For a new example, KNN finds the k stored examples that are most similar, called the nearest neighbors, using a distance metric such as Euclidean distance; in regression, the prediction is an aggregation of the neighbors' target values.
pred = knn_forecasting(time_ser_diff[,1], k=5, # the number of nearest neighbors used
lags = 1:12, # lagged values of the target variable used as features; here the first 12 lags
h=60) # forecast horizon: build a KNN model to predict the next 60 values
#knn_examples(pred)
pred$prediction
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct
## 2019 1299.8 1324.2 1345.4 1366.2 1355.6 1397.2 1378.4 1438.4 1461.4 1446.2
## 2020 1417.0 1444.4 1490.8 1457.6 1443.0 1367.2 1335.6 1319.2 1317.0 1354.6
## 2021 1404.4 1384.4 1374.0 1397.2 1431.0 1418.8 1471.8 1470.2 1456.0 1465.6
## 2022 1472.6 1480.8 1430.0 1424.4 1440.8 1427.0 1452.4 1486.8 1472.0 1451.4
## 2023 1449.6 1490.4 1519.8 1522.6 1525.4 1547.2 1530.6 1523.4 1566.0 1587.0
## Nov Dec
## 2019 1440.0 1461.0
## 2020 1396.6 1396.4
## 2021 1490.4 1480.4
## 2022 1450.4 1447.4
## 2023 1599.8 1620.2
autoplot(pred) + xlab("Years") + ylab("Housing Starts (1000s of units)") + ggtitle("Forecast of housing starts")
We can also consult how the prediction was made. That is, we can consult the instance whose target was predicted and its nearest neighbors.
# nearest_neighbors() function obtains information on consultation when applied to knnForecast object
nearest_neighbors(pred)
## $instance
## Lag 12 Lag 11 Lag 10 Lag 9 Lag 8 Lag 7 Lag 6 Lag 5 Lag 4 Lag 3
## 1334 1290 1327 1276 1329 1177 1184 1280 1237 1217
## Lag 2 Lag 1
## 1256 1260
##
## $nneighbors
## Lag 12 Lag 11 Lag 10 Lag 9 Lag 8 Lag 7 Lag 6 Lag 5 Lag 4 Lag 3 Lag 2
## 1 1244 1214 1227 1210 1210 1083 1258 1260 1280 1254 1300
## 2 1186 1244 1214 1227 1210 1210 1083 1258 1260 1280 1254
## 3 1176 1250 1297 1099 1214 1145 1139 1226 1186 1244 1214
## 4 1226 1186 1244 1214 1227 1210 1210 1083 1258 1260 1280
## 5 1139 1226 1186 1244 1214 1227 1210 1210 1083 1258 1260
## Lag 1 H1 H2 H3 H4 H5 H6 H7 H8 H9 H10 H11 H12 H13
## 1 1343 1392 1376 1533 1272 1337 1564 1465 1526 1409 1439 1450 1474 1450
## 2 1300 1343 1392 1376 1533 1272 1337 1564 1465 1526 1409 1439 1450 1474
## 3 1227 1210 1210 1083 1258 1260 1280 1254 1300 1343 1392 1376 1533 1272
## 4 1254 1300 1343 1392 1376 1533 1272 1337 1564 1465 1526 1409 1439 1450
## 5 1280 1254 1300 1343 1392 1376 1533 1272 1337 1564 1465 1526 1409 1439
## H14 H15 H16 H17 H18 H19 H20 H21 H22 H23 H24 H25 H26 H27
## 1 1511 1455 1407 1316 1249 1267 1314 1281 1461 1416 1369 1369 1452 1431
## 2 1450 1511 1455 1407 1316 1249 1267 1314 1281 1461 1416 1369 1369 1452
## 3 1337 1564 1465 1526 1409 1439 1450 1474 1450 1511 1455 1407 1316 1249
## 4 1474 1450 1511 1455 1407 1316 1249 1267 1314 1281 1461 1416 1369 1369
## 5 1450 1474 1450 1511 1455 1407 1316 1249 1267 1314 1281 1461 1416 1369
## H28 H29 H30 H31 H32 H33 H34 H35 H36 H37 H38 H39 H40 H41
## 1 1467 1491 1424 1516 1504 1467 1472 1557 1475 1392 1489 1370 1355 1486
## 2 1431 1467 1491 1424 1516 1504 1467 1472 1557 1475 1392 1489 1370 1355
## 3 1267 1314 1281 1461 1416 1369 1369 1452 1431 1467 1491 1424 1516 1504
## 4 1452 1431 1467 1491 1424 1516 1504 1467 1472 1557 1475 1392 1489 1370
## 5 1369 1452 1431 1467 1491 1424 1516 1504 1467 1472 1557 1475 1392 1489
## H42 H43 H44 H45 H46 H47 H48 H49 H50 H51 H52 H53 H54 H55
## 1 1457 1492 1442 1494 1437 1390 1546 1520 1510 1566 1525 1584 1567 1540
## 2 1486 1457 1492 1442 1494 1437 1390 1546 1520 1510 1566 1525 1584 1567
## 3 1467 1472 1557 1475 1392 1489 1370 1355 1486 1457 1492 1442 1494 1437
## 4 1355 1486 1457 1492 1442 1494 1437 1390 1546 1520 1510 1566 1525 1584
## 5 1370 1355 1486 1457 1492 1442 1494 1437 1390 1546 1520 1510 1566 1525
## H56 H57 H58 H59 H60
## 1 1536 1641 1698 1614 1582
## 2 1540 1536 1641 1698 1614
## 3 1390 1546 1520 1510 1566
## 4 1567 1540 1536 1641 1698
## 5 1584 1567 1540 1536 1641
Because we have used lags 1-12 as features, the features associated with the next future value of the time series are the last twelve values of the series (the $instance above). The five most similar examples (nearest neighbors) are listed under $nneighbors, and their targets are aggregated to produce the prediction. We can also obtain a graph showing the instance, its nearest neighbors and the prediction.
Below, each nearest neighbor is plotted in a different panel. R sorts the neighbors according to their distance from the instance, with the nearest neighbor in the top panel.
autoplot(pred, highlight = "neighbors")
Commonly used in KNN, the MIMO (multiple-input, multiple-output) strategy uses a vector of target values whose length equals the forecasting horizon. For instance, to forecast housing starts for the next 24 months, the model uses lags 1-24, i.e., housing starts for the past 24 consecutive months, as the feature vector. We then look for the stretches of housing starts most similar to the last 24 months of the series and predict an aggregation of their subsequent 24 months.
pred_mimo = knn_forecasting(time_ser_diff[,1], h = 24, lags = 1:24, k = 5, msas = "MIMO")
autoplot(pred_mimo, highlight = "neighbors", faceting = FALSE)
The average of the target vectors of the five nearest neighbors gives the prediction. The plot above presents all the nearest neighbors in one graph. The previous 24 values of the time series form the new instance whose target has to be predicted; the points shown in blue are the sequences of 24 consecutive values most similar to this instance, and the model averages their subsequent 24 points, shown in green, to obtain the prediction (in red).
Another strategy is to build several KNN models, each with a different value of k, and average the forecasts from the models to obtain the final forecasts of housing starts.
pred_k <- knn_forecasting(time_ser_diff[,1], h = 60, lags = 1:24, k = c(2, 4))
pred_k$prediction
## Jan Feb Mar Apr May Jun Jul
## 2019 1431.875 1362.500 1431.375 1445.875 1385.875 1505.750 1454.500
## 2020 1438.125 1431.625 1386.375 1342.125 1304.625 1308.250 1288.625
## 2021 1407.500 1443.750 1466.750 1451.000 1494.375 1468.375 1491.250
## 2022 1405.875 1422.625 1443.625 1406.000 1472.875 1452.500 1475.625
## 2023 1548.500 1524.000 1562.500 1547.250 1567.250 1573.875 1590.500
## Aug Sep Oct Nov Dec
## 2019 1479.250 1444.125 1457.375 1449.250 1476.500
## 2020 1370.125 1346.750 1402.875 1405.875 1418.250
## 2021 1489.875 1496.000 1474.625 1461.125 1475.500
## 2022 1459.375 1451.750 1494.125 1466.625 1510.375
## 2023 1603.000 1627.000 1642.625 1684.625 1636.000
autoplot(pred_k) + xlab("Years") + ylab("Housing Starts (in 1000s of units)") + ggtitle("Forecasts of Housing Starts")
Cross-validation of time series data is more involved than ordinary leave-one-out or k-fold cross-validation, because the observations are serially correlated: x_t and x_(t+n) are not independent.
Resembling the k-fold cross-validation method, we can use a non-rolling scheme with a training set of fixed length; after each iteration, the training window is shifted forward by the forecast horizon.
# Perform cross validation on a time series.
# By default, the models combined are from the auto.arima, ets, nnetar, stlm and tbats functions.
ts_cv = cvts(time_ser_diff[,1],
FUN = nnetar, # the modeling function used to fit each training window
rolling = FALSE, # If TRUE, non-overlapping windows of size maxHorizon will be used for fitting each model.
windowSize = 12, # length of the window to build each model
maxHorizon = 60, # maximum length of the forecast horizon to use for computing errors.
xreg = time_ser_diff[,2:5]) # External regressors to be used to fit the model.
# Returns range of summary measures of the cross-validated forecast accuracy for cvts objects.
# method implements ME, RMSE, and MAE.
# accuracy is calculated for each forecast horizon up to maxHorizon
accuracy(ts_cv)
## ME RMSE MAE
## Forecast Horizon 1 -60.630109 110.01173 80.85777
## Forecast Horizon 2 -18.495150 69.59350 59.51493
## Forecast Horizon 3 -61.968309 100.91097 74.84686
## Forecast Horizon 4 9.503759 85.94281 78.18174
## Forecast Horizon 5 4.194681 79.53472 73.55715
## Forecast Horizon 6 41.918388 75.39679 59.73669
## Forecast Horizon 7 4.072999 159.54219 139.54753
## Forecast Horizon 8 -47.693499 231.07522 203.55442
## Forecast Horizon 9 -39.406806 236.88779 212.22349
## Forecast Horizon 10 -3.722142 219.03465 183.31575
## Forecast Horizon 11 -21.211138 212.36812 161.60276
## Forecast Horizon 12 13.060942 282.44677 217.21376
## Forecast Horizon 13 62.917179 250.98025 198.69588
## Forecast Horizon 14 77.897245 295.12087 246.73593
## Forecast Horizon 15 66.831918 337.24754 242.21510
## Forecast Horizon 16 45.082476 273.33965 204.43128
## Forecast Horizon 17 94.324111 312.11714 242.53741
## Forecast Horizon 18 133.505566 400.55255 324.99131
## Forecast Horizon 19 133.838601 414.19021 340.18166
## Forecast Horizon 20 45.452069 463.91678 383.89302
## Forecast Horizon 21 61.473242 531.94486 393.39181
## Forecast Horizon 22 64.874706 430.09793 329.15941
## Forecast Horizon 23 41.449956 484.48499 351.81480
## Forecast Horizon 24 68.394053 454.61761 334.07704
## Forecast Horizon 25 44.598589 443.59502 299.64173
## Forecast Horizon 26 79.288187 468.82204 374.27548
## Forecast Horizon 27 40.130841 441.40161 333.55681
## Forecast Horizon 28 51.724670 436.33708 329.46589
## Forecast Horizon 29 40.856948 456.41658 373.92112
## Forecast Horizon 30 -2.949442 469.70686 372.45776
## Forecast Horizon 31 12.362397 481.53855 406.86980
## Forecast Horizon 32 52.078144 522.62797 451.32157
## Forecast Horizon 33 4.530665 529.47800 433.06638
## Forecast Horizon 34 -76.050787 572.06611 435.42710
## Forecast Horizon 35 -15.475664 613.17562 503.47542
## Forecast Horizon 36 -65.892046 607.77530 492.03910
## Forecast Horizon 37 -23.511715 586.80531 486.93472
## Forecast Horizon 38 -8.922999 575.05667 487.96236
## Forecast Horizon 39 37.811661 552.95300 489.89184
## Forecast Horizon 40 39.119280 554.93572 489.56371
## Forecast Horizon 41 35.446097 553.38486 485.93733
## Forecast Horizon 42 46.734214 579.89785 515.50911
## Forecast Horizon 43 22.037815 605.69323 541.95514
## Forecast Horizon 44 71.196174 637.99577 591.73075
## Forecast Horizon 45 20.462650 661.80168 608.99046
## Forecast Horizon 46 -4.261806 611.35762 550.87043
## Forecast Horizon 47 11.536065 629.34823 561.81134
## Forecast Horizon 48 -33.478926 662.76496 589.24399
## Forecast Horizon 49 -54.419564 662.26115 579.27873
## Forecast Horizon 50 -64.892171 662.10873 575.81813
## Forecast Horizon 51 -96.347914 680.52597 568.86033
## Forecast Horizon 52 -114.668870 629.66583 524.36028
## Forecast Horizon 53 -136.523842 636.97415 515.72334
## Forecast Horizon 54 -124.244372 615.23027 500.41905
## Forecast Horizon 55 -91.961385 631.72812 534.77422
## Forecast Horizon 56 -118.885786 623.11038 514.62540
## Forecast Horizon 57 -56.577251 635.11641 535.13372
## Forecast Horizon 58 -94.331285 599.01287 477.23017
## Forecast Horizon 59 -135.975997 596.74556 480.78794
## Forecast Horizon 60 -107.443108 570.95648 480.83297
Time series cross validation generates training and test indices.
ts_partition = tsPartition(time_ser_diff[,1], rolling = FALSE, windowSize = 12, maxHorizon = 60)
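An alternative is a rolling-origin evaluation with tsCV() from the forecast package, which re-forecasts from every origin and returns the forecast errors by horizon; a minimal sketch using the seasonal naive method as a simple benchmark forecaster:
# rolling-origin cross-validation errors for a seasonal naive benchmark
cv_err = tsCV(time_ser_diff[,1], snaive, h = 12)
# RMSE at each forecast horizon
sqrt(colMeans(cv_err^2, na.rm = TRUE))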
The hybridModel function fits multiple individual model specifications and ensembles them to make forecasts. In the code below, I fit auto.arima, ets, neural network (nnetar), Seasonal and Trend decomposition using Loess (STLM) and TBATS models. TBATS is an acronym for:
T for trigonometric regressors to model multiple seasonalities, B for Box-Cox transformations, A for ARMA errors, T for trend, and S for seasonality.
# Unlike the usage in the “forecast” package, the xreg argument should be passed as a matrix, not a dataframe.
train_xreg = matrix(train_pca.ts[,2]) # 2nd variable is PC1
test_xreg = matrix(test_pca.ts[,2])
# fit hybrid model, passing arguments to auto.arima with a.args and to
# nnetar with n.args
fit_hybrid = hybridModel(train_pca.ts[,1], models = "aenst",
a.args = list(xreg = train_xreg),
n.args = list(xreg = train_xreg),
s.args = list(xreg = train_xreg, method = "arima"))
summary(fit_hybrid)
## Length Class Mode
## auto.arima 19 ARIMA list
## ets 19 ets list
## nnetar 17 nnetar list
## stlm 9 stlm list
## tbats 21 bats list
## weights 5 -none- numeric
## frequency 1 -none- numeric
## x 409 ts numeric
## xreg 3 -none- list
## models 5 -none- character
## fitted 409 ts numeric
## residuals 409 ts numeric
# If a xreg is used in training, it must also be supplied to the forecast() function in the xreg argument.
# fit the forecast
fc = forecast::forecast(fit_hybrid, xreg = test_xreg)
autoplot(fc) + ggtitle("Forecast from auto.arima, ets, nnetar model")
# measures the accuracy of the fitted model on the test set
accuracy(fc, test_pca.ts[,1])
## ME RMSE MAE MPE MAPE MASE
## Training set -3.235732 84.58448 63.78808 -0.6829953 4.56599 0.2935760
## Test set 188.181337 247.32413 206.74274 15.7040745 18.91216 0.9515055
## ACF1 Theil's U
## Training set -0.1231626 NA
## Test set 0.8404488 2.446051
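Before looking at the component models, the weights the ensemble assigns to each of them can be inspected; a one-line sketch (the weights slot appears in the summary above):
# weights given to auto.arima, ets, nnetar, stlm and tbats in the ensemble
fit_hybrid$weights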
# View the individual models
fit_hybrid$auto.arima
## Series: y
## Regression with ARIMA(1,0,0)(1,0,0)[12] errors
##
## Coefficients:
## ar1 sar1 intercept xreg
## 0.9587 -0.0412 1450.1174 5.8013
## s.e. 0.0159 0.0520 129.8661 36.5893
##
## sigma^2 estimated as 13637: log likelihood=-2526.53
## AIC=5063.05 AICc=5063.2 BIC=5083.12
fit_hybrid$ets
## ETS(M,N,N)
##
## Call:
## getModel(modelCode)(y = y, lambda = ..1)
##
## Smoothing parameters:
## alpha = 0.7036
##
## Initial states:
## l = 1477.5855
##
## sigma: 0.0754
##
## AIC AICc BIC
## 6300.626 6300.685 6312.667
fit_hybrid$nnetar
## Series: y
## Model: NNAR(25,1,14)[12]
## Call: getModel(modelCode)(y = y, xreg = c(-2.53225340205507, -2.41747073146128,
## ...[the remaining xreg values echoed in the call are omitted here for brevity]...
## ))
##
## Average of 20 networks, each of which is
## a 26-14-1 network with 393 weights
## options were - linear output units
##
## sigma^2 estimated as 118
# See forecasts from individual model
autoplot(forecast::forecast(fit_hybrid$nnetar, xreg = test_xreg)) + ggtitle("Forecast from neural network model")
# The print() and summary() methods print information about the ensemble model including the weights assigned to each individual component model.
print(fit_hybrid)
## Hybrid forecast model comprised of the following models: auto.arima, ets, nnetar, stlm, tbats
## ############
## auto.arima with weight 0.2
## ############
## ets with weight 0.2
## ############
## nnetar with weight 0.2
## ############
## stlm with weight 0.2
## ############
## tbats with weight 0.2
summary(fit_hybrid)
## Length Class Mode
## auto.arima 19 ARIMA list
## ets 19 ets list
## nnetar 17 nnetar list
## stlm 9 stlm list
## tbats 21 bats list
## weights 5 -none- numeric
## frequency 1 -none- numeric
## x 409 ts numeric
## xreg 3 -none- list
## models 5 -none- character
## fitted 409 ts numeric
## residuals 409 ts numeric
We can create two types of plots of the fitted ensemble model:
plot(fit_hybrid, type = "fit")
plot(fit_hybrid, type = "fit", # plot the original series and the individual fitted models.
ggplot = TRUE) + ggtitle("Representation of the Hybrid Model")
By default, hybridModel() assigns equal weight to each component model in the final ensemble. Although this simple approach often performs well empirically, we can also weight the components by inverse mean absolute error (MAE), inverse root mean square error (RMSE), or inverse mean absolute scaled error (MASE).
# To apply one of these weighting schemes, pass the error measure (e.g. "MASE") to the errorMethod argument and pass either "insample.errors" or "cv.errors" to the weights argument.
(fit_hybrid2 = hybridModel(train_pca.ts[,1],
a.args = list(xreg = train_pca.ts[ , 2:7]),
n.args = list(xreg = train_pca.ts[ , 2:7]),
weights = "insample.errors", errorMethod = "MASE", models = "aenst"))
## Hybrid forecast model comprised of the following models: auto.arima, ets, nnetar, stlm, tbats
## ############
## auto.arima with weight 0.038
## ############
## ets with weight 0.034
## ############
## nnetar with weight 0.879
## ############
## stlm with weight 0.035
## ############
## tbats with weight 0.014
After fitting the model we can inspect and manipulate the weights stored in it. The hybridModel() function normalises the weights so that they sum to 1; if we adjust them by hand we should rescale them in the same way so that the combined forecast remains a proper weighted average, as sketched after the table below.
fit_hybrid2$weights %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
| Model | Weight |
|---|---|
| auto.arima | 0.0376052 |
| ets | 0.0342575 |
| nnetar | 0.8789485 |
| stlm | 0.0354328 |
| tbats | 0.0137560 |
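For illustration, a minimal sketch of adjusting these weights by hand and renormalising them so they still sum to 1; the 0.5 value assigned to nnetar is purely hypothetical, and the copy keeps the fitted object used later unchanged.
fit_hybrid2_adj = fit_hybrid2                 # work on a copy so later forecasts are unaffected
fit_hybrid2_adj$weights["nnetar"] = 0.5       # hypothetical manual adjustment
fit_hybrid2_adj$weights = fit_hybrid2_adj$weights / sum(fit_hybrid2_adj$weights) # renormalise to sum to 1
sum(fit_hybrid2_adj$weights)                  # check: equals 1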
# In-sample errors and various accuracy measures can be extracted with the accuracy() method.
accuracy(fit_hybrid2) %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
|  | ME | RMSE | MAE | MPE | MAPE | ACF1 | Theil's U |
|---|---|---|---|---|---|---|---|
| Test set | -0.4505312 | 13.68712 | 10.64188 | -0.0944263 | 0.7671246 | -0.1043302 | 0.1214451 |
# In addition to retrieving the ensemble's accuracy, the individual component models' accuracies can be easily viewed by using the individual = TRUE argument.
accuracy(fit_hybrid2, individual = TRUE)
## $auto.arima
## ME RMSE MAE MPE MAPE MASE
## Training set 0.8325056 97.90184 74.99814 -0.433984 5.252132 0.3451688
## ACF1
## Training set 0.005580162
##
## $ets
## ME RMSE MAE MPE MAPE MASE
## Training set -3.19712 110.5551 82.32716 -0.7161807 5.766845 0.3788996
## ACF1
## Training set -0.06714796
##
## $nnetar
## ME RMSE MAE MPE MAPE MASE
## Training set 0.04690901 4.789925 3.208741 -0.006691157 0.251083 0.0147678
## ACF1
## Training set -0.1695744
##
## $stlm
## ME RMSE MAE MPE MAPE MASE
## Training set -3.185936 105.2861 79.59628 -0.6955999 5.583269 0.3663311
## ACF1
## Training set -0.08211982
##
## $tbats
## ME RMSE MAE MPE MAPE MASE
## Training set -2.78931 110.0513 82.1044 -0.6000222 5.735443 0.9435958
## ACF1
## Training set -0.03045211
Below, I forecast housing starts from the ensemble model and plot the result. Because the forecast horizon is tied to the rows of the exogenous regressors passed to forecast(), and tail() keeps only the last six rows by default, the forecast below covers six months. The individual component models produce their own prediction intervals, and the ensemble takes the most extreme values from those intervals, giving a conservative estimate of the ensemble model's uncertainty; a comparison sketch follows the forecast plot below.
# tail() keeps the last 6 rows of the regressors by default, so the horizon here is 6 periods
(fit_hybrid2_forecast = forecast::forecast(fit_hybrid2, xreg = tail(test_pca.ts[,2:7]),
h = nrow(tail(test_set_diff[,2:7]))))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jul 2010 972.8435 488.2271 1216.614 451.3625 1284.113
## Aug 2010 1046.7746 470.7806 1242.765 427.7969 1317.131
## Sep 2010 1058.0535 454.7032 1240.907 406.0663 1321.229
## Oct 2010 1098.9528 439.8925 1162.993 386.0722 1248.569
## Nov 2010 1088.6573 426.2288 1172.830 367.6747 1263.097
## Dec 2010 1184.1473 413.5928 1485.047 350.7236 1579.536
autoplot(fit_hybrid2_forecast)
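To see how conservative the combined band is, here is a hedged sketch comparing the ensemble's 95% limits with those of the auto.arima component alone at the first forecast step; indexing column 2 of lower/upper assumes the default 80%/95% levels.
# forecast the auto.arima component with the same exogenous regressors
fc_arima = forecast::forecast(fit_hybrid2$auto.arima, xreg = tail(test_pca.ts[, 2:7]))
c(ensemble_lo95 = fit_hybrid2_forecast$lower[1, 2], arima_lo95 = fc_arima$lower[1, 2])
c(ensemble_hi95 = fit_hybrid2_forecast$upper[1, 2], arima_hi95 = fc_arima$upper[1, 2])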
Generalized Autoregressive Conditional Heteroskedastic (GARCH) models are useful for analysing and forecasting volatility in time series data. A univariate GARCH(1,1) model captures volatility and its clustering.
Financial time series exhibit volatility clustering: the volatility of the variable changes over time. Technically, this behaviour is called conditional heteroskedasticity. ARMA models do not account for volatility clustering, i.e. they are not conditionally heteroskedastic, so we turn to ARCH and GARCH models for such predictions.
These include the Autoregressive Conditional Heteroskedastic (ARCH) model and the Generalised Autoregressive Conditional Heteroskedastic (GARCH) model. Different sources of volatility, such as sell-offs during a financial crisis, can cause serially correlated heteroskedasticity, which suggests the time_ser data may be conditionally heteroskedastic.
GARCH models are typically estimated by maximum likelihood, and volatility modelling is most natural for returns, i.e. the relative loss or profit over a period. If x_t is the value of housing starts at time t, then r_t = (x_t − x_(t−1))/x_(t−1) is the return; for example, a move from 1,200 to 1,260 units is a return of 60/1200 = 0.05. The returns are mostly noise with short periods of large variability, and we observe pronounced volatility around the 2008 financial crisis.
hous_st_return = diff(time_ser_diff[,1])/stats::lag(time_ser_diff[,1], -1) # simple returns (x_t - x_(t-1))/x_(t-1); stats::lag(x, -1) aligns the denominator with x_(t-1)
# We can also calculate approximate returns as log(x_t) - log(x_(t-1))
hous_st_approx_return = diff(log(time_ser_diff[,1])) # log-returns
autoplot(hous_st_approx_return) + ylab("Returns of housing starts") + xlab("Years") + ggtitle("Housing starts returns over time")
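Before examining the correlograms, a quick hedged check for ARCH effects is a Ljung-Box test on the squared log-returns; significant autocorrelation in the squares is the classic signature of volatility clustering (the lag of 12 is an illustrative choice for monthly data).
# Ljung-Box test on squared log-returns as an informal ARCH-effects check
Box.test(hous_st_approx_return^2, lag = 12, type = "Ljung-Box")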
We also calculate the autocorrelations and partial autocorrelations for the log returns.
(hous_return.acf = autocorrelations(hous_st_approx_return) )
## An object of class "SampleAutocorrelations"
## Slot *data*:
## An object of class "Lagged1d"
## Slot *data*:
## Lag_0 Lag_1 Lag_2 Lag_3 Lag_4
## 1.000000000 -0.337696071 0.020669913 0.060895883 0.026866374
## Lag_5 Lag_6 Lag_7 Lag_8 Lag_9
## 0.014099809 0.027820931 -0.060173170 0.046525598 0.021256194
## Lag_10 Lag_11 Lag_12 Lag_13 Lag_14
## -0.061764020 0.059106842 -0.112716964 0.129317611 -0.025322512
## Lag_15 Lag_16 Lag_17 Lag_18 Lag_19
## 0.025161904 0.040911800 -0.061328537 0.003417615 0.077785312
## Lag_20 Lag_21 Lag_22 Lag_23 Lag_24
## -0.072004007 0.009488142 0.056320013 0.031118713 -0.142468122
## Lag_25 Lag_26 Lag_27
## 0.077254000 0.047208565 -0.011071597
## Slot n:
## [1] 510
## Slot varnames:
## character(0)
## Slot objectname:
## [1] "x"
(hous_return.pacf = partialAutocorrelations(hous_st_approx_return))
## Slot *data*:
## An object of class "Lagged1d"
## Slot *data*:
## Lag_0 Lag_1 Lag_2 Lag_3 Lag_4
## 1.000000000 -0.337696071 -0.105386902 0.037692151 0.073121144
## Lag_5 Lag_6 Lag_7 Lag_8 Lag_9
## 0.060452379 0.057317377 -0.044570006 0.002237136 0.031374239
## Lag_10 Lag_11 Lag_12 Lag_13 Lag_14
## -0.043581610 0.027240530 -0.104315715 0.072712156 0.041212694
## Lag_15 Lag_16 Lag_17 Lag_18 Lag_19
## 0.061616800 0.079374862 -0.044826188 -0.043220838 0.040111390
## Lag_20 Lag_21 Lag_22 Lag_23 Lag_24
## -0.024061621 -0.014164097 0.038575007 0.099078391 -0.136704372
## Lag_25 Lag_26 Lag_27
## -0.002251135 0.072969266 0.037366431
Routine portmanteau tests, such as Ljung-Box, also reject the IID hypothesis.
checkresiduals(ets(hous_st_approx_return))
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 106.72, df = 22, p-value = 4.226e-13
##
## Model df: 2. Total lags used: 24
Here we carry out IID tests using the method of Li-McLeod:
hous_LM = whiteNoiseTest(hous_return.acf, h0 = "iid", nlags = c(5,10,20), x = hous_st_approx_return, method = "LiMcLeod")
hous_LM$test
## ChiSq DF pvalue
## [1,] 60.76776 5 8.434054e-12
## [2,] 66.36748 10 2.217507e-10
## [3,] 92.61914 20 2.573373e-11
## attr(,"method")
## [1] "LiMcLeod"
plot(hous_return.acf, data = hous_st_approx_return , main="Autocorrelation test of the log returns of housing starts")
Small p-values reject the null hypothesis at the 0.05 level; rejection is usually taken to mean that the data are autocorrelated. Before fitting a GARCH-type model under the assumption that the log-returns follow a GARCH process, I change the null hypothesis to "garch" (one possible weak white noise hypothesis) and re-run the test:
hous_iid = whiteNoiseTest(hous_return.acf, h0 = "garch", nlags = c(5,10,20), x = hous_st_approx_return)
hous_iid$test
## h Q pval
## [1,] 5 44.44354 1.882400e-08
## [2,] 10 46.52845 1.150233e-06
## [3,] 20 58.15016 1.371371e-05
plot(hous_return.pacf, data = hous_st_approx_return, main="Partial Autocorrelation test of the log returns of housing starts")
The low p-values give reason to reject the hypothesis that the log-returns are a GARCH white noise process, so some ARMA structure in the mean equation is warranted.
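One hedged way to accommodate both features is fGarch's joint mean/variance specification; the ARMA(1,1) order below is an illustrative choice rather than a selected model.
# sketch: ARMA(1,1) for the mean plus GARCH(1,1) for the variance, fitted jointly
fit_armagarch = garchFit(~ arma(1,1) + garch(1,1), data = hous_st_approx_return, trace = FALSE)
summary(fit_armagarch)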
We now fit GARCH models, starting with a GARCH(1,1) model with Gaussian innovations. GARCH(1,1) uses a single lag each of the squared shocks and of the conditional variance. The model is:
ϵ_t = σ_t * w_t
σ^2_t = α_0 + α_1 * ϵ^2_(t−1) + β_1 * σ^2_(t−1)
Note that we need α_1 + β_1 < 1; otherwise the volatility process becomes non-stationary.
fit_garch = garchFit(~garch(1,1), data = hous_st_approx_return, trace = FALSE)
summary(fit_garch)
##
## Title:
## GARCH Modelling
##
## Call:
## garchFit(formula = ~garch(1, 1), data = hous_st_approx_return,
## trace = FALSE)
##
## Mean and Variance Equation:
## data ~ garch(1, 1)
## <environment: 0x5654aa414520>
## [data = hous_st_approx_return]
##
## Conditional Distribution:
## norm
##
## Coefficient(s):
## mu omega alpha1 beta1
## 7.2564e-06 6.5094e-05 5.4054e-02 9.3618e-01
##
## Std. Errors:
## based on Hessian
##
## Error Analysis:
## Estimate Std. Error t value Pr(>|t|)
## mu 7.256e-06 3.183e-03 0.002 0.99818
## omega 6.509e-05 5.483e-05 1.187 0.23512
## alpha1 5.405e-02 1.911e-02 2.829 0.00468 **
## beta1 9.362e-01 2.128e-02 43.993 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log Likelihood:
## 573.6862 normalized: 1.124875
##
## Description:
## Wed Feb 6 02:10:13 2019 by user:
##
##
## Standardised Residuals Tests:
## Statistic p-Value
## Jarque-Bera Test R Chi^2 3.509322 0.1729658
## Shapiro-Wilk Test R W 0.9939851 0.04105379
## Ljung-Box Test R Q(10) 61.96807 1.534529e-09
## Ljung-Box Test R Q(15) 79.20523 9.757384e-11
## Ljung-Box Test R Q(20) 88.16705 1.547757e-10
## Ljung-Box Test R^2 Q(10) 30.61482 0.0006790964
## Ljung-Box Test R^2 Q(15) 44.02928 0.0001088094
## Ljung-Box Test R^2 Q(20) 48.81851 0.0003261925
## LM Arch Test R TR^2 25.59477 0.01224274
##
## Information Criterion Statistics:
## AIC BIC SIC HQIC
## -2.234063 -2.200852 -2.234185 -2.221043
The diagnostics imply that the standardised residuals and their squares are not IID, and the LM ARCH test indicates that the model does not fully capture the ARCH effects. The Jarque-Bera test does not reject normality of the standardised residuals, although the Shapiro-Wilk p-value is borderline; if the residuals were clearly non-Gaussian, we could try another conditional distribution.
Another possible concern is that alpha_1 + beta_1 is roughly 0.99, i.e. the estimated persistence is very close to 1.
The persistence of a GARCH model governs how quickly large volatilities decay after a shock; for GARCH(1,1) the key statistic is the sum alpha_1 + beta_1.
Ideally alpha_1 + beta_1 < 1, in which case the effect of a shock on volatility decays geometrically at rate alpha_1 + beta_1. If alpha_1 + beta_1 > 1, the volatility forecasts are explosive, and if alpha_1 + beta_1 = 1 (the integrated, or IGARCH, case) shocks to volatility never die out.
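As a small sketch under these definitions, the persistence and the approximate half-life of a volatility shock can be read off the fitted Gaussian GARCH(1,1):
persistence = sum(coef(fit_garch)[c("alpha1", "beta1")]) # alpha_1 + beta_1, about 0.99 here
half_life = log(0.5) / log(persistence)                  # periods until a shock's effect halves
c(persistence = persistence, half_life = half_life)
Next, I refit the GARCH(1,1) on the simple returns with a skewed Student-t conditional distribution ("sstd"):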
fit_garch2 = garchFit(~garch(1,1), cond.dist = c("sstd"), data = hous_st_return, trace = FALSE)
summary(fit_garch2)
##
## Title:
## GARCH Modelling
##
## Call:
## garchFit(formula = ~garch(1, 1), data = hous_st_return, cond.dist = c("sstd"),
## trace = FALSE)
##
## Mean and Variance Equation:
## data ~ garch(1, 1)
## <environment: 0x5654a6a26020>
## [data = hous_st_return]
##
## Conditional Distribution:
## sstd
##
## Coefficient(s):
## mu omega alpha1 beta1 skew shape
## 0.00038759 0.00249668 0.31156371 0.35249518 0.94923888 7.63204945
##
## Std. Errors:
## based on Hessian
##
## Error Analysis:
## Estimate Std. Error t value Pr(>|t|)
## mu 0.0003876 0.0029247 0.133 0.894571
## omega 0.0024967 0.0006917 3.610 0.000307 ***
## alpha1 0.3115637 0.0945111 3.297 0.000979 ***
## beta1 0.3524952 0.1200780 2.936 0.003330 **
## skew 0.9492389 0.0637039 14.901 < 2e-16 ***
## shape 7.6320494 2.4286593 3.142 0.001675 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log Likelihood:
## 573.4515 normalized: 1.126624
##
## Description:
## Wed Feb 6 02:10:13 2019 by user:
##
##
## Standardised Residuals Tests:
## Statistic p-Value
## Jarque-Bera Test R Chi^2 39.89308 2.174341e-09
## Shapiro-Wilk Test R W 0.9856362 6.450794e-05
## Ljung-Box Test R Q(10) 50.9149 1.810721e-07
## Ljung-Box Test R Q(15) 64.03935 5.031556e-08
## Ljung-Box Test R Q(20) 75.97338 1.873609e-08
## Ljung-Box Test R^2 Q(10) 4.51276 0.921266
## Ljung-Box Test R^2 Q(15) 50.02022 1.194994e-05
## Ljung-Box Test R^2 Q(20) 51.30893 0.0001434862
## LM Arch Test R TR^2 23.901 0.02098089
##
## Information Criterion Statistics:
## AIC BIC SIC HQIC
## -2.229672 -2.179781 -2.229946 -2.210110
plot(fit_garch2, which = 13)
fit_garch3 = garchFit(~aparch(1,1), cond.dist = c("sstd"), data = hous_st_return, trace = FALSE)
summary(fit_garch3)
##
## Title:
## GARCH Modelling
##
## Call:
## garchFit(formula = ~aparch(1, 1), data = hous_st_return, cond.dist = c("sstd"),
## trace = FALSE)
##
## Mean and Variance Equation:
## data ~ aparch(1, 1)
## <environment: 0x5654a068d970>
## [data = hous_st_return]
##
## Conditional Distribution:
## sstd
##
## Coefficient(s):
## mu omega alpha1 gamma1 beta1 delta
## -0.0012071 0.0288430 0.2863702 0.1519114 0.3885731 1.0395950
## skew shape
## 0.9169429 8.1930362
##
## Std. Errors:
## based on Hessian
##
## Error Analysis:
## Estimate Std. Error t value Pr(>|t|)
## mu -0.001207 0.003660 -0.330 0.741536
## omega 0.028843 0.007884 3.659 0.000254 ***
## alpha1 0.286370 0.078956 3.627 0.000287 ***
## gamma1 0.151911 0.224820 0.676 0.499230
## beta1 0.388573 0.121144 3.208 0.001339 **
## delta 1.039595 0.536613 1.937 0.052705 .
## skew 0.916943 0.075564 12.135 < 2e-16 ***
## shape 8.193036 2.797200 2.929 0.003400 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log Likelihood:
## 573.9265 normalized: 1.127557
##
## Description:
## Wed Feb 6 02:10:13 2019 by user:
##
##
## Standardised Residuals Tests:
## Statistic p-Value
## Jarque-Bera Test R Chi^2 38.65526 4.03755e-09
## Shapiro-Wilk Test R W 0.985954 8.016392e-05
## Ljung-Box Test R Q(10) 53.03175 7.345548e-08
## Ljung-Box Test R Q(15) 64.66971 3.903785e-08
## Ljung-Box Test R Q(20) 76.41239 1.581817e-08
## Ljung-Box Test R^2 Q(10) 4.479591 0.9231299
## Ljung-Box Test R^2 Q(15) 51.58351 6.617759e-06
## Ljung-Box Test R^2 Q(20) 52.54272 9.485389e-05
## LM Arch Test R TR^2 22.86343 0.02890798
##
## Information Criterion Statistics:
## AIC BIC SIC HQIC
## -2.223680 -2.157158 -2.224164 -2.197596
Once a GARCH model is fitted, we can forecast both the returns and the volatility; the mean square prediction error depends on the size of the forecast volatility.
predict(fit_garch3, n.ahead = 48, plot=TRUE) # n.ahead is the number of time periods to forecast.
## meanForecast meanError standardDeviation lowerInterval upperInterval
## 1 -0.001207079 0.06516409 0.06516409 -0.1368431 0.1230186
## 2 -0.001207079 0.07154674 0.07154674 -0.1501283 0.1351862
## 3 -0.001207079 0.07542169 0.07542169 -0.1581938 0.1425732
## 4 -0.001207079 0.07777708 0.07777708 -0.1630964 0.1470634
## 5 -0.001207079 0.07920978 0.07920978 -0.1660786 0.1497947
## 6 -0.001207079 0.08008158 0.08008158 -0.1678932 0.1514566
## 7 -0.001207079 0.08061220 0.08061220 -0.1689976 0.1524682
## 8 -0.001207079 0.08093520 0.08093520 -0.1696699 0.1530839
## 9 -0.001207079 0.08113183 0.08113183 -0.1700792 0.1534588
## 10 -0.001207079 0.08125155 0.08125155 -0.1703284 0.1536870
## 11 -0.001207079 0.08132443 0.08132443 -0.1704801 0.1538259
## 12 -0.001207079 0.08136880 0.08136880 -0.1705725 0.1539105
## 13 -0.001207079 0.08139582 0.08139582 -0.1706287 0.1539620
## 14 -0.001207079 0.08141227 0.08141227 -0.1706629 0.1539934
## 15 -0.001207079 0.08142228 0.08142228 -0.1706838 0.1540125
## 16 -0.001207079 0.08142838 0.08142838 -0.1706965 0.1540241
## 17 -0.001207079 0.08143209 0.08143209 -0.1707042 0.1540312
## 18 -0.001207079 0.08143436 0.08143436 -0.1707089 0.1540355
## 19 -0.001207079 0.08143573 0.08143573 -0.1707118 0.1540381
## 20 -0.001207079 0.08143657 0.08143657 -0.1707135 0.1540397
## 21 -0.001207079 0.08143708 0.08143708 -0.1707146 0.1540407
## 22 -0.001207079 0.08143739 0.08143739 -0.1707152 0.1540413
## 23 -0.001207079 0.08143758 0.08143758 -0.1707156 0.1540416
## 24 -0.001207079 0.08143769 0.08143769 -0.1707158 0.1540418
## 25 -0.001207079 0.08143776 0.08143776 -0.1707160 0.1540420
## 26 -0.001207079 0.08143781 0.08143781 -0.1707161 0.1540421
## 27 -0.001207079 0.08143783 0.08143783 -0.1707161 0.1540421
## 28 -0.001207079 0.08143785 0.08143785 -0.1707162 0.1540421
## 29 -0.001207079 0.08143786 0.08143786 -0.1707162 0.1540422
## 30 -0.001207079 0.08143786 0.08143786 -0.1707162 0.1540422
## 31 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 32 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 33 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 34 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 35 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 36 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 37 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 38 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 39 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 40 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 41 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 42 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 43 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 44 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 45 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 46 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 47 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
## 48 -0.001207079 0.08143787 0.08143787 -0.1707162 0.1540422
The tsensembler framework uses metalearning to dynamically combine forecasts from different machine learning models with state-of-the-art forecast combination methods. The input is a data matrix or data frame representing a time series, and the output is a predictive ensemble model.
# setting up base model parameters
# learner is a character vector with the base learners to be trained.
# bm_glm = Generalized Linear Models, from the glmnet package
# bm_gbm = Generalized Boosted Regression models, from the gbm package
# bm_randomforest = Random Forest models, from the ranger package.
# bm_mars = Multivariate Adaptive Regression Splines models, from the earth package.
# bm_svr = Support Vector Regression models, from the kernlab package.
# bm_ffnn = Feedforward Neural Network models, from the nnet package.
# bm_pls_pcr = Partial Least Regression and Principal Component Regression models, from the pls package.
specs = model_specs(
learner = c("bm_ppr","bm_glm", "bm_gbm", "bm_mars","bm_svr", "bm_ffnn", "bm_pls_pcr"),
learner_pars = list(
bm_glm = list(alpha = c(0, .5, 1)), # elasticnet mixing parameter with 0 ≤ α ≤ 1, used to estimate penalty
# alpha=1 is the lasso penalty, and alpha=0 the ridge penalty.
bm_gbm = list(shrinkage = c(0.01, 0.5), n.trees = 100),
bm_mars = list(thresh = 0.003) ,
bm_svr = list(kernel = c("rbfdot", "polydot"), C = c(1,3)),
bm_ffnn = list(decay = 0.5),
bm_pls_pcr = list(method = c("kernelpls", "simpls", "cppls"))
))
# building the ensemble
(fit_ensemble = ADE(hous_st ~ hous_st_d1 + yield_sp_d1 + mortgR_d1 + real_estL + house_supply_d1 + sec_conL_d2 + CPI_d3 ,
data = data.frame(na.omit(time_ser_diff)), specs))
## Setting up meta data
## Building base ensemble...
##
## Training the base models...
## Base model: bm_ppr
## Base model: bm_glm
## Base model: bm_gbm
## Base model: bm_mars
## Base model: bm_svr
## Base model: bm_ffnn
## Base model: bm_pls_pcr
## Training meta models (an arbiter for each expert
## ## Arbitrated Dynamic Ensemble ##
##
## Base ensemble specs:
## Time Series Ensemble Model
## Ensemble composed by 15 base Models.
## The distribution of the base Models is the following:
## gbm glm mars mvr nnet ppr svm
## 2 3 1 3 1 1 4
## The formula is: hous_st ~ hous_st_d1 + yield_sp_d1 + mortgR_d1 + real_estL + house_supply_d1 + sec_conL_d2 + CPI_d3
##
## Lambda is set to: 50
## Omega is set to: 0.5
tsensembler::forecast(fit_ensemble, h = 24)
## t+1 t+2 t+3 t+4 t+5 t+6 t+7 t+8
## 1764.024 1210.495 1188.149 1207.792 1210.495 1205.656 1207.804 1106.697
## t+9 t+10 t+11 t+12 t+13 t+14 t+15 t+16
## 1188.029 1188.029 1188.030 1188.029 1188.029 1188.030 1188.029 1188.029
## t+17 t+18 t+19 t+20 t+21 t+22 t+23 t+24
## 1188.029 1188.033 1188.030 1188.030 1188.030 1188.030 1188.030 1188.030