knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

The dataset “FinCrisis” consists of economic variables that affect the number of housing starts in the US.

The following are the variables in the dataset:

hous_st

  1. Housing Starts: Total: New Privately Owned Housing Units Started, Thousands of Units, Monthly, Seasonally Adjusted Annual Rate

CPI

  1. Consumer Price Index for All Urban Consumers: All Items, Index 1982-1984=100, Monthly, Seasonally Adjusted

unempR

  1. Civilian Unemployment Rate, Percent, Monthly, Seasonally Adjusted

sec_conL

  1. Total Consumer Credit Owned and Securitized, Outstanding, Billions of Dollars, Monthly, Seasonally Adjusted

yield_sp

  1. 10-Year Treasury Constant Maturity Minus 2-Year Treasury Constant Maturity, Percent, Monthly, Not Seasonally Adjusted

real_estL

  1. Real Estate Loans, All Commercial Banks, Billions of U.S. Dollars, Monthly, Seasonally Adjusted (1945-07-01 to 2018-08-01)

mortgR

  1. 30-Year Conventional Mortgage Rate, Percent, Monthly, Not Seasonally Adjusted

fed_fundsR

  1. Effective Federal Funds Rate, Percent, Monthly, Not Seasonally Adjusted

income

  1. Real Disposable Personal Income, Billions of Chained 2012 Dollars, Monthly, Seasonally Adjusted Annual Rate

pvt_house_comp

  1. New Privately-Owned Housing Units Completed: Total, Thousands of Units, Monthly, Seasonally Adjusted Annual Rate

house_supply

  1. Monthly Supply of Houses in the United States, Months’ Supply, Monthly, Seasonally Adjusted

Loading R packages

library(readxl)
library(readr)
library(dplyr)
library(ggplot2)
library(GGally)
library(gridExtra)
library(purrr)
library(glmnet) # computes penalized linear regression models such as lasso and ridge
library(tidyverse)
library(caret)
library(data.table)
library("NeuralNetTools") # plots neural nets using nnet package
library(nnet)
library(forecast)
library(LiblineaR) # estimates predictive linear models for classification and regression
library(kableExtra) # generates customized tables
library(ggbiplot) # plots PCs and set the graphical parameters of the plots
library(factoextra) # visualization for PCs
library(Metrics) # evaluation metrics for machine learning models
FinCrisis = read_xlsx("FinCrisis.xlsx")

Convert the FinCrisis dataset into a time series object

# converts from data.frame to .ts
time_ser=ts(FinCrisis,frequency=12,start=c(1976,6), end=c(2018,12))
time_ser.df = as.data.frame(time_ser) 

# Making a train and test split for the estimation
train_fin = slice(time_ser.df, 1:409)
test_fin = slice(time_ser.df, -(1:409))

Correlation matrix

cor_mat = cor(FinCrisis[,unlist(lapply(FinCrisis, is.numeric))])
# ranks variable pairs from most positively to most negatively correlated
cor_mat %>%
  as.data.frame() %>%
  mutate(var1 = rownames(.)) %>%
  gather(var2, value, -var1) %>%
  arrange(desc(value)) %>%
  group_by(value) %>%
  dplyr::filter(row_number() == 1) 
## # A tibble: 56 x 3
## # Groups:   value [56]
##    var1           var2       value
##    <chr>          <chr>      <dbl>
##  1 hous_st        hous_st    1    
##  2 sec_conL       income     1.000
##  3 CPI            income     0.999
##  4 CPI            sec_conL   0.999
##  5 real_estL      CPI        0.997
##  6 real_estL      sec_conL   0.997
##  7 real_estL      income     0.996
##  8 mortgR         fed_fundsR 0.989
##  9 pvt_house_comp hous_st    0.981
## 10 real_estL      yield_sp   0.794
## # … with 46 more rows

Cross-validated MSE

crossval_fold_inds <- caret::createFolds(
  y = train_fin$hous_st,
  k = 10
)
calc_mse <- function(observed, predicted) {
  mean((observed - predicted)^2)
}
get_val_mse <- function(formula, train_fin, cross_fold_inds, val_fold){
  val_data <- train_fin %>% slice(cross_fold_inds[[val_fold]])
  train2_data <- train_fin %>% slice(-cross_fold_inds[[val_fold]])
  
  fitmod <- lm(formula, data = train2_data)
  
  val_preds <- predict(fitmod, val_data)
  val_obs <- val_data$hous_st
  
  calc_mse(observed = val_obs, predicted = val_preds)
}
mse_results <- expand.grid(
  val_fold = 1:10
)
get_crossval_mse <- function(formula, train_fin, crossval_fold_inds){
  xval_mse_results <- mse_results %>%
    mutate(
      val_mse = pmap_dbl(
        mse_results,
        get_val_mse,
        formula = formula,
        cross_fold_inds = crossval_fold_inds,
        train_fin = train_fin
      )
    )
  
  mean(xval_mse_results$val_mse)
}

As the above models had high test set MSEs, we suspected this may be due to multicollinearity among the independent variables in the dataset, so we chose to analyze the data using PCA.

Principal Component Analysis

In PCA, every variable is centered at zero so that we can easily compare each principal component to the mean. Centering also removes problems arising from the differing scales of the variables. After obtaining the PCs, we sort the components from largest to smallest based on their standard deviations (the square roots of the eigenvalues).

The proportion of variance shows how much of the total variance is explained by each principal component. The components are always sorted by importance, so the most important components come first. In this dataset, PC1 accounts for more than 60% of the total variance. The cumulative proportion shows how much variance is accumulated across components.

FinCrisis.pca <- 
  prcomp(time_ser.df[,-c(1,2)], # numerical explanatory variables only (drop the date and the response hous_st)
         center = TRUE,         # shift each variable to have mean zero
         scale. = TRUE)         # scale each variable to unit variance before the analysis
summary(FinCrisis.pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6    PC7
## Standard deviation     2.4645 1.3724 1.0866 0.65482 0.49411 0.36791 0.1731
## Proportion of Variance 0.6074 0.1883 0.1181 0.04288 0.02441 0.01354 0.0030
## Cumulative Proportion  0.6074 0.7957 0.9138 0.95669 0.98111 0.99464 0.9976
##                            PC8     PC9    PC10
## Standard deviation     0.13519 0.06179 0.03869
## Proportion of Variance 0.00183 0.00038 0.00015
## Cumulative Proportion  0.99947 0.99985 1.00000
# Eigenvalues
eig.val <- get_eigenvalue(FinCrisis.pca)
eig.val %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
         eigenvalue  variance.percent  cumulative.variance.percent
Dim.1     6.0739086        60.7390863                     60.73909
Dim.2     1.8835272        18.8352721                     79.57436
Dim.3     1.1807085        11.8070849                     91.38144
Dim.4     0.4287866         4.2878663                     95.66931
Dim.5     0.2441449         2.4414486                     98.11076
Dim.6     0.1353575         1.3535751                     99.46433
Dim.7     0.0299749         0.2997494                     99.76408
Dim.8     0.0182776         0.1827762                     99.94686
Dim.9     0.0038175         0.0381750                     99.98503
Dim.10    0.0014966         0.0149660                    100.00000
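
The variance percentages in the table above can be recovered directly from the component standard deviations returned by prcomp; a minimal sketch:

# eigenvalue_k = sdev_k^2; proportion of variance explained (PVE) = eigenvalue_k / sum of eigenvalues
pve <- FinCrisis.pca$sdev^2 / sum(FinCrisis.pca$sdev^2)
round(100 * pve, 4)         # matches the variance.percent column
round(100 * cumsum(pve), 4) # matches the cumulative.variance.percent column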

A data matrix X of dimension n x p can have up to min(n-1, p) principal components. The goal of PCA is to significantly reduce the number of variables used, so that the smallest number of principal components explains the maximum amount of variability.

A common technique for choosing the number of PCs is to look for the "elbow point" in the scree plot below, where the proportion of variance explained (PVE) drops off sharply.

screeplot(FinCrisis.pca, type = "l", npcs = 10, main = "Screeplot of the first 10 PCs")
abline(h = 1, col="red", lty=5)
legend("topright", legend=c("Eigenvalue = 1"),
       col=c("red"), lty=5, cex=0.6)

An eigenvalue < 1 means that the component explains less variance than a single explanatory variable. We notice that the first 3 components have eigenvalues > 1 and together explain almost 91% of the variance. Alternatively, the first 6 components explain 99.46% of the variance. Thus, we have reduced the dimensionality from 10 variables to 6 components while only "losing" about 0.54% of the variance.

# Visualize eigenvalues (scree plot). Show the percentage of variances explained by each principal component.
fviz_eig(FinCrisis.pca)

#selecting the first 6 principal components to use in the model
x_pca = FinCrisis.pca$x[, 1:6]
x_pca %>% head() %>%
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
PC1 PC2 PC3 PC4 PC5 PC6
-2.532253 0.9589985 -0.6856539 -0.4194933 0.6971565 -0.9574271
-2.417471 1.1089015 -0.9856733 -0.2229814 0.6730305 -0.8642424
-2.397437 1.0653462 -1.1664993 -0.3159122 0.5733262 -0.7703319
-2.369972 0.9977712 -1.1642373 -0.3433385 0.6511079 -0.7066144
-2.267717 1.0412444 -1.4965310 -0.3725687 0.5711167 -0.6006079
-2.265047 1.0854381 -1.4703869 -0.5285890 0.5150894 -0.6393254
# Biplot for Principal Components using ggplot2
ggbiplot(FinCrisis.pca)

The variable loadings are drawn as arrows originating from the center of the plot. The variables yield spread, real estate loans and CPI contribute strongly to PC1, with higher values of those variables moving observations to the right of this plot.

# combine PC1-PC6 as explanatory variables with hous_st as response variable in a new dataset:
new_data = cbind(time_ser.df, as.data.frame(x_pca))
new_data = new_data[, -c(1, 3:12)]

# Making a train and test split for the estimation
train_data <- slice(new_data, 1:409)
test_data <- slice(new_data, -(1:409))

Create a stacked ensemble using the following methods:

  1. K-Nearest Neighbors
  2. Ridge/LASSO
  3. Artificial Neural Networks
  4. Support Vector Machine

# Creating time slices of the training set (used in place of random CV folds)
timeSlices = createTimeSlices(train_data$hous_st, # outcomes in chronological order
    initialWindow = 180, # initial number of consecutive values in each training window: 15 years x 12 months
    horizon = 12, # number of consecutive values in each validation window: 12 months
    fixedWindow = FALSE) # logical; if FALSE, all training windows start at the first observation

train_fold_inds <- timeSlices$train[seq(from = 1, to = 211, by = 12)]
val_fold_inds <- timeSlices$test[seq(from = 1, to = 211, by = 12)]

K-Nearest Neighbors

knn_fit <- caret::train(
  form = hous_st ~ .,
  data = train_data,
  method = "knn",
  preProcess = c("scale", "center"),
  trControl = trainControl(method = "cv",
    number = 10,
    index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
    indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
    returnResamp = "all",
    savePredictions = TRUE),
  tuneGrid = data.frame(k = 1:10)
)

mean(abs(test_data$hous_st - predict(knn_fit, newdata = test_data))) # MAE in test set
## [1] 287.4804
#Output of kNN fit
knn_fit
## k-Nearest Neighbors 
## 
## 409 samples
##   6 predictor
## 
## Pre-processing: scaled (6), centered (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  160.8808  0.1875936  137.2130
##    2  151.9892  0.1520998  131.1968
##    3  154.6955  0.1541260  132.6327
##    4  160.9234  0.1306035  139.3553
##    5  161.3336  0.2314432  139.9667
##    6  160.6328  0.2051346  139.4336
##    7  161.7521  0.2503193  141.7870
##    8  163.1922  0.2509895  144.2205
##    9  164.3352  0.2324320  146.3585
##   10  167.7617  0.2027130  149.1916
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 2.
# Plot of RMSE versus the number of neighbours k, based on cross-validation
plot(knn_fit)

Ridge

When predictors in a regression are strongly correlated, the regression coefficient of each variable depends on which other predictors are in the model. A given coefficient therefore does not reflect the full effect of that predictor on the response; it captures only the partial (marginal) effect, conditional on the other predictors. Ridge regression introduces a small amount of bias (shrinkage) to alleviate this multicollinearity.
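
As a minimal illustration of this shrinkage (a sketch, not part of the original model fits), glmnet can be called directly on the PCA-based training data to see the ridge coefficients move toward zero as lambda grows:

# Sketch: ridge coefficient paths with glmnet (alpha = 0 gives the ridge penalty)
x_train <- as.matrix(dplyr::select(train_data, -hous_st))
y_train <- train_data$hous_st
ridge_path <- glmnet(x_train, y_train, alpha = 0, lambda = 2^seq(10, -4))
plot(ridge_path, xvar = "lambda", label = TRUE) # coefficients shrink toward 0 as lambda increases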

# glmnet() runs the model many times for different values of lambda. 
ridge_fit <- caret::train(
  form = hous_st ~ .,
  data = train_data,
  method = "glmnet", # method for fit
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = 2^seq(from = -4, to = 10) # plug in different values of lambda to see how test RMSE changes
  ),
  trControl = trainControl(method = "cv", # evaluate method performance via cross-validation
   number = 10, # number of folds for cross-validation
    index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
    indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
    returnResamp = "all", # return information from cross-validation
    savePredictions = TRUE) # return validation set predictions from cross-validation
)
mean(abs(test_data$hous_st - predict(ridge_fit, newdata = test_data))) # MAE in test set
## [1] 99.16544
ridge_fit
## glmnet 
## 
## 409 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ... 
## Resampling results across tuning parameters:
## 
##   lambda     RMSE      Rsquared   MAE     
##      0.0625  125.1038  0.2564421  105.6468
##      0.1250  125.1038  0.2564421  105.6468
##      0.2500  125.1038  0.2564421  105.6468
##      0.5000  125.1038  0.2564421  105.6468
##      1.0000  125.1038  0.2564421  105.6468
##      2.0000  125.1038  0.2564421  105.6468
##      4.0000  125.1038  0.2564421  105.6468
##      8.0000  125.1038  0.2564421  105.6468
##     16.0000  124.6471  0.2566134  105.2336
##     32.0000  123.0606  0.2547423  104.0415
##     64.0000  130.8146  0.2522117  113.9690
##    128.0000  150.6686  0.2493908  135.2232
##    256.0000  179.9229  0.2451270  163.4515
##    512.0000  210.8423  0.2400159  193.8696
##   1024.0000  235.9032  0.2353817  220.1529
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 32.
plot(ridge_fit)

Support Vector Regression

set.seed(123)
svr_fit <- caret::train(
  form = hous_st ~ .,
  data = train_data,
  method = "svmLinear3", # method for fit
  tuneGrid = expand.grid(
    cost = 2^(2:5), # inverse of the regularization constant: a small cost gives wide margins and many support vectors
    Loss = c("L1", "L2") # compare L1 and L2 loss
  ),
  trControl = trainControl(method = "cv", # evaluate method performance via cross-validation
   number = 10, # number of folds for cross-validation
    index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
    indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
    returnResamp = "all", # return information from cross-validation
    savePredictions = TRUE) # return validation set predictions from cross-validation
)
mean(abs(test_data$hous_st - predict(svr_fit, newdata = test_data))) # MAE in test set
## [1] 96.70616
svr_fit
## L2 Regularized Support Vector Machine (dual) with Linear Kernel 
## 
## 409 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ... 
## Resampling results across tuning parameters:
## 
##   cost  Loss  RMSE      Rsquared   MAE     
##    4    L2    138.6183  0.2601965  121.0898
##    4    L1    805.6740  0.2492696  800.2012
##    8    L2    137.9572  0.2609627  120.4017
##    8    L1    511.9761  0.2510944  503.2552
##   16    L2    137.1520  0.2619170  119.4300
##   16    L1    319.7882  0.2456201  305.6865
##   32    L2    141.2740  0.2605979  122.7559
##   32    L1    209.9045  0.2560926  193.3992
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were cost = 16 and Loss = L2.
plot(svr_fit)

Artificial Neural Networks

Prior to building the regression ANN we must split the new_data dataset into training and test sets, and scale only the explanatory variables. Among the possible activation functions, we use the logistic function in our model, so each scaled feature of new_data falls in the [0, 1] interval.

# Scale the data:
# scale01() maps each value of a column onto the [0, 1] interval; here it is applied
# to selected columns via dplyr::mutate_at()
scale01 = function(x){
  (x - min(x)) / (max(x) - min(x))
}
# note: only PC1-PC4 are rescaled below; PC5 and PC6 keep their original scale
new_data_preds_01 = new_data %>% mutate_at(c("PC1", "PC2", "PC3", "PC4"), scale01)
head(new_data_preds_01)
##   hous_st       PC1       PC2        PC3       PC4       PC5        PC6
## 1    1495 0.2853374 0.5896654 0.24103450 0.4723815 0.6971565 -0.9574271
## 2    1401 0.2982730 0.6150966 0.18637345 0.5256437 0.6730305 -0.8642424
## 3    1550 0.3005307 0.6077074 0.15342846 0.5004559 0.5733262 -0.7703319
## 4    1720 0.3036260 0.5962433 0.15384057 0.4930223 0.6511079 -0.7066144
## 5    1629 0.3151497 0.6036185 0.09329942 0.4850999 0.5711167 -0.6006079
## 6    1641 0.3154506 0.6111160 0.09806266 0.4428124 0.5150894 -0.6393254
# Split new_data_preds_01 into training and test sets
train_set_preds_01 = slice(new_data_preds_01, 1:407)
test_set_preds_01 = slice(new_data_preds_01, -(1:407))

Cross-validation folds for the [0, 1]-scaled data, prior to fitting the ANN

# caret::createDataPartition does stratified sampling so that the training and
# test sets have similar distributions of the outcome variable
train_inds = caret::createDataPartition(new_data_preds_01$hous_st, p = 0.8)
# Creating time slices of the training set after scaling
timeSlices = createTimeSlices(train_set_preds_01$hous_st, # outcomes in chronological order; used in place of random CV folds
    initialWindow = 180, # initial number of consecutive values in each training window: 15 years x 12 months
    horizon = 12, # number of consecutive values in each validation window: 12 months
    fixedWindow = FALSE) # logical; if FALSE, all training windows start at the first observation
train_fold_inds_01 <- timeSlices$train[seq(from = 1, to = 211, by = 12)]
val_fold_inds_01 <- timeSlices$test[seq(from = 1, to = 211, by = 12)]

The createTimeSlices call creates 211 folds. Fitting a model on 211 training sets is computationally expensive, so we reduce that number: instead of advancing the training window by one month at a time, we keep only every 12th slice. This leaves 211/12 ≈ 17.6, i.e. roughly 18 cross-validation folds (18 time slices).
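
A quick check of the fold counts described above (a small sketch using the objects created in the previous chunk):

# total number of time slices created vs. the number of folds actually kept
length(timeSlices$train)   # all slices (one per month the window can advance)
length(train_fold_inds_01) # every 12th slice, i.e. the ~18 folds used below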

nnet function with size and decay as tuning parameters
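
The ann_fit model referenced below was tuned over size and decay; here is a minimal sketch of how it could be fit with caret's nnet method and the time-slice folds above. The tuning grid values are assumptions, not the original settings (the coefficients printed later suggest the selected network had 8 hidden nodes).

# Sketch of the ann_fit model used below; the tuning grid values are assumptions
ann_fit <- caret::train(
  form = hous_st ~ .,
  data = train_set_preds_01,
  method = "nnet",  # single-hidden-layer network from the nnet package
  linout = TRUE,    # linear output unit, since this is a regression problem
  trace = FALSE,
  tuneGrid = expand.grid(
    size = c(4, 8, 12),     # number of hidden nodes (assumed grid)
    decay = c(0, 0.01, 0.1) # weight decay (assumed grid)
  ),
  trControl = trainControl(method = "cv",
    number = 10,
    index = train_fold_inds_01,  # same time-slice folds as the other methods
    indexOut = val_fold_inds_01,
    returnResamp = "all",
    savePredictions = TRUE)
)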

Metrics to measure Forecast Accuracy in test sets

Mean Absolute Percentage Error (MAPE): computes the average absolute percent difference between the observed and predicted values, i.e. the average of abs((actual - predicted) / actual).

Percent Bias: computes the average amount by which actual exceeds predicted, as a percentage of the absolute value of actual; it is the average of (actual - predicted) / abs(actual). Its value is close to 0 if the model is unbiased.
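
A minimal sketch of these two definitions (equivalent to Metrics::mape and Metrics::percent_bias as used below):

# hand-rolled versions of the metrics described above
mape_manual <- function(actual, predicted) {
  mean(abs((actual - predicted) / actual))
}
percent_bias_manual <- function(actual, predicted) {
  mean((actual - predicted) / abs(actual))
}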

#Prediction from ridge model
ridge_preds <- predict(ridge_fit, newdata = test_data)
ridge_mape = mape(test_data$hous_st, ridge_preds)
ridge_percent_bias = percent_bias(test_data$hous_st, ridge_preds)

#Prediction from KNN model
knn_preds <- predict(knn_fit, newdata = test_data)
knn_mape = mape(test_data$hous_st, knn_preds)
knn_percent_bias = percent_bias(test_data$hous_st, knn_preds)

#Prediction from ANN model
ann_preds <- predict(ann_fit, newdata = test_data)
ann_mape = mape(test_data$hous_st, ann_preds)
ann_percent_bias = percent_bias(test_data$hous_st, ann_preds)

#Prediction from SVR model
svr_preds <- predict(svr_fit, newdata = test_data)
svr_mape = mape(test_data$hous_st, svr_preds)
svr_percent_bias = percent_bias(test_data$hous_st, svr_preds)

(accuracy = data.frame(
  Models = c("Ridge", "KNN", "ANN", "SVR"),
  MAPE = c(ridge_mape, knn_mape, ann_mape, svr_mape),
  Percent_Bias = c(ridge_percent_bias, knn_percent_bias, ann_percent_bias, svr_percent_bias)
  
))
##   Models       MAPE Percent_Bias
## 1  Ridge 0.11440275  -0.05069752
## 2    KNN 0.27903330  -0.18149670
## 3    ANN 0.10310932  -0.06638015
## 4    SVR 0.09533544   0.03336799

Neural interpretation diagram (NID)

The plotnet function plots a neural interpretation diagram (NID). The black lines indicate positive weights between layers and grey lines indicate negative weights.

The thickness of each line increases with the magnitude of the corresponding weight. The first layer consists of the input variables, whose nodes are labelled I1 through I6 for the six explanatory variables (the principal components).

The plot below has one hidden layer, labelled H1 onward, and an output layer with a single node labelled O1; the coefficients printed below show that the fitted network has 8 hidden nodes (H1 through H8). Bias nodes connected to the hidden and output layers are also shown.

# To view a diagram of the NN1 use the plotnet() function.
plotnet(ann_fit, x_names = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), y_names = "hous_st")

coef(ann_fit$finalModel)
##       b->h1      i1->h1      i2->h1      i3->h1      i4->h1      i5->h1 
##   6.9973094   2.7541821   1.9130187   0.1593454   2.6374881  -0.7964275 
##      i6->h1       b->h2      i1->h2      i2->h2      i3->h2      i4->h2 
##  -1.0175347   6.6981651   3.0004355   2.4396672   0.8289042   2.6990313 
##      i5->h2      i6->h2       b->h3      i1->h3      i2->h3      i3->h3 
##  -0.2761348  -1.7851033  60.6841860 -30.0739451 -30.8723839 -67.0854827 
##      i4->h3      i5->h3      i6->h3       b->h4      i1->h4      i2->h4 
## -41.9800897 -28.9049180  -8.6273998  35.2036697   9.3985052  10.3136393 
##      i3->h4      i4->h4      i5->h4      i6->h4       b->h5      i1->h5 
##   6.6504071  14.3804396   2.4946905 -20.5704051  17.0209935  -8.9758188 
##      i2->h5      i3->h5      i4->h5      i5->h5      i6->h5       b->h6 
## -13.2607918   0.8004982  -6.0931299  -1.1598040  -2.6605575  25.1301763 
##      i1->h6      i2->h6      i3->h6      i4->h6      i5->h6      i6->h6 
##  -1.8421162 -12.1820600 -21.3783462 -21.6272392   4.7187724  -2.5071266 
##       b->h7      i1->h7      i2->h7      i3->h7      i4->h7      i5->h7 
##  29.6180954  -0.3854124 -21.5384474 -64.6874797  29.0406152  -4.1728956 
##      i6->h7       b->h8      i1->h8      i2->h8      i3->h8      i4->h8 
##  16.9318749   5.3045046  -7.3714461 -54.5435554  18.5750584  -0.3864044 
##      i5->h8      i6->h8        b->o       h1->o       h2->o       h3->o 
##  -5.6130632 -28.4729414 180.5661546  87.3308392 168.1559459 298.7879217 
##       h4->o       h5->o       h6->o       h7->o       h8->o 
## 169.1409014 503.7799004 162.6350091 257.4076078 253.5629605

Stacking via Ridge Regression

# Step 1: Validation-fold predictions from component models

#Ridge
ridge_val_pred <- ridge_fit$pred %>%
  dplyr::filter(lambda == ridge_fit$bestTune$lambda) %>%
  arrange(rowIndex) %>%
  pull(pred)

# KNN
knn_val_pred <- knn_fit$pred %>%
  dplyr::filter(k == knn_fit$bestTune$k) %>%
  arrange(rowIndex) %>%
  pull(pred)

# ANN
ann_val_pred = ann_fit$pred %>%
  dplyr::filter(decay == ann_fit$bestTune$decay, size == ann_fit$bestTune$size) %>%
  arrange(rowIndex) %>%
  pull(pred)

# SVR
svr_val_pred = svr_fit$pred %>%
  dplyr::filter(cost == svr_fit$bestTune$cost, Loss == svr_fit$bestTune$Loss) %>%
  arrange(rowIndex) %>%
  pull(pred)

# Step 2: data set with validation-set component model predictions as explanatory variables
val_data <- train_data %>%
  slice(val_fold_inds %>% unlist()) %>%
  mutate(
   ridge_pred = ridge_val_pred,
    knn_pred = knn_val_pred,
    ann_pred = ann_val_pred,
    svr_pred = svr_val_pred
   
  )
# Step 3: fit model using component model predictions as explanatory variables
stacking_fit <- caret::train(
  form = hous_st ~  knn_pred +  ridge_pred + ann_pred + svr_pred,
  data = val_data,
  method = "glmnet", 
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = 2^seq(from = -4, to = 10)
    
  )
)
coef(stacking_fit$finalModel, stacking_fit$bestTune$lambda) %>% t() # transpose the result so it occupies less space
## 1 x 5 sparse Matrix of class "dgCMatrix"
##   (Intercept) knn_pred ridge_pred  ann_pred  svr_pred
## 1   -130.0761 0.216563  0.3378726 0.2389439 0.2921875
# Step 4 (both cross-validation and refitting to the full training set were already done
# to get ridge_fit, knn_fit, ann_fit and svr_fit above)
knn_test_pred <- predict(knn_fit, newdata = test_data)
ridge_test_pred <- predict(ridge_fit, newdata = test_data)
ann_test_pred = predict(ann_fit, newdata = test_data)
svr_test_pred = predict(svr_fit, newdata = test_data)


# Step 5: Assemble data frame of test set predictions from each component model
stacking_test_x = data.frame(
  ridge_pred = ridge_test_pred,
  knn_pred = knn_test_pred,
  ann_pred = ann_test_pred,
  svr_pred = svr_test_pred
)
stacking_fit
## glmnet 
## 
## 216 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 216, 216, 216, 216, 216, 216, ... 
## Resampling results across tuning parameters:
## 
##   lambda     RMSE      Rsquared   MAE      
##      0.0625  129.0097  0.8568090   96.07441
##      0.1250  129.0097  0.8568090   96.07441
##      0.2500  129.0097  0.8568090   96.07441
##      0.5000  129.0097  0.8568090   96.07441
##      1.0000  129.0097  0.8568090   96.07441
##      2.0000  129.0097  0.8568090   96.07441
##      4.0000  129.0097  0.8568090   96.07441
##      8.0000  129.0097  0.8568090   96.07441
##     16.0000  129.0097  0.8568090   96.07441
##     32.0000  129.0217  0.8568382   96.07210
##     64.0000  129.3976  0.8573605   95.97524
##    128.0000  131.4187  0.8576933   96.95202
##    256.0000  138.2664  0.8578426  101.44871
##    512.0000  155.8921  0.8578873  112.36394
##   1024.0000  188.4798  0.8578940  135.74148
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 16.
plot(stacking_fit)

# Step 6: Stacked model predictions
stacking_preds = predict(stacking_fit, stacking_test_x)
head(as.data.frame(stacking_preds))
##   stacking_preds
## 1       487.3690
## 2       506.4065
## 3       579.3585
## 4       534.8373
## 5       510.2306
## 6       576.6878
# Calculate error rates
ensemble_mape = mape(test_data$hous_st, stacking_preds)
ensemble_percent_bias = percent_bias(test_data$hous_st, stacking_preds)

accuracy = data.frame(
  Models = c("Ridge", "KNN", "ANN", "SVR", "Ensemble"),
  MAPE = c(ridge_mape, knn_mape, ann_mape, svr_mape, ensemble_mape),
  Percent_Bias = c(ridge_percent_bias, knn_percent_bias, ann_percent_bias, svr_percent_bias, ensemble_percent_bias)
  
)
accuracy %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
Models MAPE Percent_Bias
Ridge 0.1144027 -0.0506975
KNN 0.2790333 -0.1814967
ANN 0.1031093 -0.0663802
SVR 0.0953354 0.0333680
Ensemble 0.0651865 -0.0057394

The stacked ensemble produces a MAPE of about 0.065, lower than that of every individual component model.

Comparing predictions from the ensemble model with the actual hous_st values:

#Plots comparing Predicted hous_st from ensemble model vs actual hous_st from test set
df_ensemble = data.frame(hous_st = test_data$hous_st, pred_hous_st_stack_ridge = stacking_preds)
 
# convert the last 102 rows (the test set) of the FinCrisis dataset into a dataframe
# with 3 columns: date, actual hous_st and predicted hous_st
series_ensemble = cbind(tail(FinCrisis[,1], 102) , df_ensemble) 
tail(series_ensemble, 12) %>% select(3)
##     pred_hous_st_stack_ridge
## 91                  1267.462
## 92                  1397.599
## 93                  1351.461
## 94                  1283.334
## 95                  1291.228
## 96                  1227.204
## 97                  1206.512
## 98                  1216.285
## 99                  1160.713
## 100                 1022.141
## 101                 1032.861
## 102                 1818.448
# Melt the "series" data to a long form so that we can pass both variables (actual hous_st and the ensemble prediction) to one aesthetic:
series_ensemble_melt = reshape2::melt(series_ensemble, id.var = 'Date')
ggplot(series_ensemble_melt, aes(x = Date, y = value, colour = variable)) + geom_point() + xlab('Date') 

Comparing predictions from the individual component models with the actual hous_st values:

#Plots comparing Predicted hous_st from individual component model vs actual hous_st from test set
df_comp = data.frame(hous_st = test_data$hous_st, knn_preds, ridge_preds,  ann_preds, svr_preds, series_ensemble$pred_hous_st_stack_ridge)
 
# convert the last 102 rows (the test set) of the FinCrisis dataset into a dataframe
# with the date, the actual hous_st, and the predictions from each model
series_comp = cbind(tail(FinCrisis[,1], 102) , df_comp) 

series_forecast = series_comp %>% tail(12) %>% 
  transmute(knn_forecast = knn_preds, ridge_forecast = ridge_preds, ann_forecast = ann_preds, svr_forecast = svr_preds, ensemble_forecast = series_ensemble.pred_hous_st_stack_ridge )

series_forecast_ts = ts(series_forecast, start=c(2019,1), end=c(2019,12), frequency = 12)

autoplot(series_forecast_ts) + xlab("Year") + ylab("Housing Starts (in 1000s of units)") + ggtitle("Forecasts of US Housing Starts in 2019")

# Melt the "series" data to a long form so that we can pass all six value columns (predictions from the 4 component models, the ensemble prediction, and the actual hous_st) to one aesthetic:
series_comp_melt = reshape2::melt(series_comp, id.var = 'Date')
ggplot(series_comp_melt, aes(x = Date, y = value, colour = variable)) + geom_point() + xlab('Date')