knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

The dataset “FinCrisis” consists of economic variables that affect the number of housing starts in the US.

The following are the variables in the dataset:

hous_st

  1. Housing Starts: Total: New Privately Owned Housing Units Started, Thousands of Units, Monthly, Seasonally Adjusted Annual Rate

CPI

  1. Consumer Price Index for All Urban Consumers: All Items, Index 1982-1984=100, Monthly, Seasonally Adjusted

unempR

  1. Civilian Unemployment Rate, Percent, Monthly, Seasonally Adjusted

sec_conL

  1. Total Consumer Credit Owned and Securitized, Outstanding, Billions of Dollars, Monthly, Seasonally Adjusted

yield_sp

  1. 10-Year Treasury Constant Maturity Minus 2-Year Treasury Constant Maturity, Percent, Monthly, Not Seasonally Adjusted

real_estL

  1. Real Estate Loans, All Commercial Banks, Billions of U.S. Dollars, Monthly, Seasonally Adjusted

mortgR

  1. 30-Year Conventional Mortgage Rate, Percent, Monthly, Not Seasonally Adjusted

fed_fundsR

  1. Effective Federal Funds Rate, Percent, Monthly, Not Seasonally Adjusted

income

  1. Real Disposable Personal Income, Billions of Chained 2012 Dollars, Monthly, Seasonally Adjusted Annual Rate

pvt_house_comp

  1. New Privately-Owned Housing Units Completed: Total, Thousands of Units, Monthly, Seasonally Adjusted Annual Rate

house_supply

  1. Monthly Supply of Houses in the United States, Months’ Supply, Monthly, Seasonally Adjusted

Loading R packages

library(readxl)
library(readr)
library(dplyr)
library(ggplot2)
library(GGally)
library(gridExtra)
library(purrr)
library(glmnet) # computes penalized linear regression models such as lasso and ridge
library(tidyverse)
library(caret)
library(data.table)
library("NeuralNetTools") # plots neural nets using nnet package
library(nnet)
library(forecast)
library(LiblineaR) # estimates predictive linear models for classification and regression
library(kableExtra) # generates customized tables
library(ggbiplot) # plots PCs and sets the graphical parameters of the plots
library(factoextra) # visualization for PCs
library(Metrics) # evaluation metric functions for machine learning models
FinCrisis = read_xlsx("FinCrisis.xlsx")

Convert the FinCrisis dataset into a time series

# converts from data.frame to .ts
time_ser = ts(FinCrisis, frequency = 12, start = c(1976, 6), end = c(2018, 12))
time_ser.df = as.data.frame(time_ser) 

# Making a train and test split for the estimation
train_fin = slice(time_ser.df, 1:409)
test_fin = slice(time_ser.df, -(1:409))

Correlation matrix

cor_mat = cor(FinCrisis[, unlist(lapply(FinCrisis, is.numeric))])
# rank variable pairs from most positively to most negatively correlated
cor_mat %>%
  as.data.frame() %>%
  mutate(var1 = rownames(.)) %>%
  gather(var2, value, -var1) %>%
  arrange(desc(value)) %>%
  group_by(value) %>%
  dplyr::filter(row_number() == 1) 
## # A tibble: 56 x 3
## # Groups:   value [56]
##    var1           var2       value
##    <chr>          <chr>      <dbl>
##  1 hous_st        hous_st    1    
##  2 sec_conL       income     1.000
##  3 CPI            income     0.999
##  4 CPI            sec_conL   0.999
##  5 real_estL      CPI        0.997
##  6 real_estL      sec_conL   0.997
##  7 real_estL      income     0.996
##  8 mortgR         fed_fundsR 0.989
##  9 pvt_house_comp hous_st    0.981
## 10 real_estL      yield_sp   0.794
## # … with 46 more rows

Cross-validated MSE

crossval_fold_inds <- caret::createFolds(
  y = train_fin$hous_st,
  k = 10
)
# mean squared error between observed and predicted values
calc_mse <- function(observed, predicted) {
  mean((observed - predicted)^2)
}
get_val_mse <- function(formula, train_fin, cross_fold_inds, val_fold){
  # hold out one fold for validation and fit on the remaining folds
  val_train <- train_fin %>% slice(cross_fold_inds[[val_fold]])
  train2_data <- train_fin %>% slice(-cross_fold_inds[[val_fold]])
  
  fitmod <- lm(formula, data = train2_data)
  
  val_preds <- predict(fitmod, val_train)
  
  calc_mse(observed = val_train$hous_st, predicted = val_preds)
}
mse_results <- expand.grid(
  val_fold = 1:10
)
get_crossval_mse <- function(formula, train_fin, crossval_fold_inds){
  xval_mse_results <- mse_results %>%
    mutate(
      val_mse = mse_results %>%
        pmap_dbl(
          get_val_mse,
          formula = formula,
          cross_fold_inds = crossval_fold_inds,
          train_fin = train_fin
        ))
  
  mean(xval_mse_results$val_mse)
}
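To use these helpers, pass any lm() formula on train_fin; a hypothetical call (this particular specification is illustrative, not one of the fitted models):

# Example: 10-fold cross-validated MSE for a simple linear specification
get_crossval_mse(hous_st ~ CPI + mortgR + income, train_fin, crossval_fold_inds)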

As the models evaluated above had high test-set MSEs, we suspected this may be due to multicollinearity among the independent variables in the dataset, so we chose to analyze the data using PCA.

Principal Component Analysis

In PCA, every variable is centered at zero so that we can easily compare each principal component to the mean. Centering also removes problems arising from the different scales of the variables. After obtaining the PCs, we sort the components from largest to smallest based on their standard deviations (the square roots of the eigenvalues).

Proportion of variance shows how much of the total variance is explained by each principal component. The components are always sorted by importance, so the most important components come first. In this dataset, PC1 accounts for more than 60% of the total variance in the data. The cumulative proportion shows how much variance is accumulated as components are added.

FinCrisis.pca <- 
  prcomp(time_ser.df[, -c(1, 2)], # numeric explanatory variables only; drop date and response
         center = TRUE, # shift variables to be zero centered
         scale. = TRUE) # scale variables to unit variance before the analysis
summary(FinCrisis.pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6    PC7
## Standard deviation     2.4645 1.3724 1.0866 0.65482 0.49411 0.36791 0.1731
## Proportion of Variance 0.6074 0.1883 0.1181 0.04288 0.02441 0.01354 0.0030
## Cumulative Proportion  0.6074 0.7957 0.9138 0.95669 0.98111 0.99464 0.9976
##                            PC8     PC9    PC10
## Standard deviation     0.13519 0.06179 0.03869
## Proportion of Variance 0.00183 0.00038 0.00015
## Cumulative Proportion  0.99947 0.99985 1.00000
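As a sanity check, the proportions above can be recomputed by hand from the sdev slot of the prcomp object:

# PVE of component k = its eigenvalue (sdev^2) divided by the sum of all eigenvalues
pve <- FinCrisis.pca$sdev^2 / sum(FinCrisis.pca$sdev^2)
round(pve, 4)          # matches the "Proportion of Variance" row above
round(cumsum(pve), 4)  # matches the "Cumulative Proportion" row above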
# Eigenvalues
eig.val <- get_eigenvalue(FinCrisis.pca)
eig.val %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
          eigenvalue  variance.percent  cumulative.variance.percent
Dim.1      6.0739086        60.7390863                     60.73909
Dim.2      1.8835272        18.8352721                     79.57436
Dim.3      1.1807085        11.8070849                     91.38144
Dim.4      0.4287866         4.2878663                     95.66931
Dim.5      0.2441449         2.4414486                     98.11076
Dim.6      0.1353575         1.3535751                     99.46433
Dim.7      0.0299749         0.2997494                     99.76408
Dim.8      0.0182776         0.1827762                     99.94686
Dim.9      0.0038175         0.0381750                     99.98503
Dim.10     0.0014966         0.0149660                    100.00000

A data matrix X of dimension n x p can have up to min(n-1, p) principal components. The goal of PCA is to significantly reduce the number of variables used, such that the fewest principal components explain the maximum variability.

A common technique for choosing the number of PCs to use is to eyeball the scree plot below and look for the "elbow point", where the proportion of variance explained (PVE) plummets.

screeplot(FinCrisis.pca, type = "l", npcs = 10, main = "Screeplot of the first 10 PCs")
abline(h = 1, col="red", lty=5)
legend("topright", legend=c("Eigenvalue = 1"),
       col=c("red"), lty=5, cex=0.6)

An eigenvalues <1 would mean that the component actually explains less than a single explanatory variable. We notice is that the first 3 components has an Eigenvalue >1 and explains almost 91% of variance. Alternatively, the first 6 components explain 99.46 percent variance. Thus, I have reduced dimensionality from 12 to 6 while only “loosing” about 0.53567 % of variance.

# Visualize eigenvalues (scree plot). Show the percentage of variances explained by each principal component.
fviz_eig(FinCrisis.pca)

#selecting the first 6 principal components to use in the model
x_pca = FinCrisis.pca$x[, 1:6]
x_pca %>% head() %>%
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
       PC1       PC2        PC3        PC4       PC5        PC6
-2.532253 0.9589985 -0.6856539 -0.4194933 0.6971565 -0.9574271
-2.417471 1.1089015 -0.9856733 -0.2229814 0.6730305 -0.8642424
-2.397437 1.0653462 -1.1664993 -0.3159122 0.5733262 -0.7703319
-2.369972 0.9977712 -1.1642373 -0.3433385 0.6511079 -0.7066144
-2.267717 1.0412444 -1.4965310 -0.3725687 0.5711167 -0.6006079
-2.265047 1.0854381 -1.4703869 -0.5285890 0.5150894 -0.6393254
# Biplot for Principal Components using ggplot2
ggbiplot(FinCrisis.pca)

The variables are drawn as arrows originating from the center point. The yield spread (yield_sp), real estate loans (real_estL) and CPI contribute strongly to PC1, with higher values of those variables moving observations to the right on this plot.
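The loadings behind the biplot can be inspected directly from the rotation matrix:

# each column shows how strongly (and in which direction) a variable loads on that PC
round(FinCrisis.pca$rotation[, 1:3], 3)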

# combine PC1-PC6 as explanatory variables with hous_st as response variable in a new dataset:
new_data = cbind(time_ser.df, as.data.frame(x_pca))
new_data = new_data[, -c(1, 3:12)] # keep hous_st and the PCs; drop the date column and original predictors

# Making a train and test split for the estimation
train_data <- slice(new_data, 1:409)
test_data <- slice(new_data, -(1:409))

Create a stacked ensemble using the following methods:

  1. K-Nearest Neighbors
  2. Ridge/LASSO
  3. Artificial Neural Networks
  4. Support Vector Machine

# Creating time slices of the training set (used in place of standard cv folds)
timeSlices = createTimeSlices(train_data$hous_st, # vector of outcomes in chronological order
    initialWindow = 180, # initial training window: 15 years x 12 months
    horizon = 12, # number of consecutive values in each validation sample: 12 months
    fixedWindow = FALSE) # logical; if FALSE, all training samples start at 1

train_fold_inds <- timeSlices$train[seq(from = 1, to = 211, by = 12)]
val_fold_inds <- timeSlices$test[seq(from = 1, to = 211, by = 12)]

K-Nearest Neighbors

knn_fit <- caret::train(
  form = hous_st ~ .,
  data = train_data,
  method = "knn",
  preProcess = c("scale", "center"),
  trControl = trainControl(method = "cv",
    number = 10,
    index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
    indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
    returnResamp = "all",
    savePredictions = TRUE),
  tuneGrid = data.frame(k = 1:10)
)

mean(abs(test_data$hous_st - predict(knn_fit, newdata = test_data))) # MAE in test set
## [1] 287.4804
#Output of kNN fit
knn_fit
## k-Nearest Neighbors 
## 
## 409 samples
##   6 predictor
## 
## Pre-processing: scaled (6), centered (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  160.8808  0.1875936  137.2130
##    2  151.9892  0.1520998  131.1968
##    3  154.6955  0.1541260  132.6327
##    4  160.9234  0.1306035  139.3553
##    5  161.3336  0.2314432  139.9667
##    6  160.6328  0.2051346  139.4336
##    7  161.7521  0.2503193  141.7870
##    8  163.1922  0.2509895  144.2205
##    9  164.3352  0.2324320  146.3585
##   10  167.7617  0.2027130  149.1916
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 2.
# Plot cross-validated RMSE against the number of neighbors
plot(knn_fit)

Ridge

When predictors in a regression are strongly correlated, each regression coefficient depends on which other predictors are in the model. A coefficient therefore no longer reflects the isolated effect of that predictor on the response; it captures only a partial or marginal effect, conditional on the other predictors in the model. Ridge regression adds a small amount of bias to alleviate multicollinearity.
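For reference, ridge minimizes RSS + lambda * sum(beta_j^2), so coefficients shrink toward zero as lambda grows. A minimal sketch of the shrinkage path using glmnet directly (this x/y construction is for illustration only and sits outside the caret pipeline below):

x_train <- as.matrix(train_data[, paste0("PC", 1:6)]) # explanatory variables PC1-PC6
y_train <- train_data$hous_st
ridge_path <- glmnet(x_train, y_train, alpha = 0, lambda = 2^seq(from = -4, to = 10))
plot(ridge_path, xvar = "lambda", label = TRUE) # each coefficient path shrinks toward 0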

# glmnet() runs the model many times for different values of lambda. 
ridge_fit <- caret::train(
  form = hous_st ~ .,
  data = train_data,
  method = "glmnet", # method for fit
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = 2^seq(from = -4, to = 10) # plug in different values of lambda to see how test RMSE changes
  ),
  trControl = trainControl(method = "cv", # evaluate method performance via cross-validation
   number = 10, # number of folds for cross-validation
    index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
    indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
    returnResamp = "all", # return information from cross-validation
    savePredictions = TRUE) # return validation set predictions from cross-validation
)
mean(abs(test_data$hous_st - predict(ridge_fit, newdata = test_data))) # MAE in test set
## [1] 99.16544
ridge_fit
## glmnet 
## 
## 409 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ... 
## Resampling results across tuning parameters:
## 
##   lambda     RMSE      Rsquared   MAE     
##      0.0625  125.1038  0.2564421  105.6468
##      0.1250  125.1038  0.2564421  105.6468
##      0.2500  125.1038  0.2564421  105.6468
##      0.5000  125.1038  0.2564421  105.6468
##      1.0000  125.1038  0.2564421  105.6468
##      2.0000  125.1038  0.2564421  105.6468
##      4.0000  125.1038  0.2564421  105.6468
##      8.0000  125.1038  0.2564421  105.6468
##     16.0000  124.6471  0.2566134  105.2336
##     32.0000  123.0606  0.2547423  104.0415
##     64.0000  130.8146  0.2522117  113.9690
##    128.0000  150.6686  0.2493908  135.2232
##    256.0000  179.9229  0.2451270  163.4515
##    512.0000  210.8423  0.2400159  193.8696
##   1024.0000  235.9032  0.2353817  220.1529
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 32.
plot(ridge_fit)

Support Vector Regression

set.seed(123)
svr_fit <- caret::train(
  form = hous_st ~ .,
  data = train_data,
  method = "svmLinear3", # method for fit
  tuneGrid = expand.grid(
    cost = 2^(2:5), # inverse of the regularization constant: the cost of a margin violation; a small cost gives wide margins and many support vectors
    Loss = c("L1", "L2") # evaluate both loss functions
  ),
  trControl = trainControl(method = "cv", # evaluate method performance via cross-validation
   number = 10, # number of folds for cross-validation
    index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
    indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
    returnResamp = "all", # return information from cross-validation
    savePredictions = TRUE) # return validation set predictions from cross-validation
)
mean(abs(test_data$hous_st - predict(svr_fit, newdata = test_data))) # MAE in test set
## [1] 96.70616
svr_fit
## L2 Regularized Support Vector Machine (dual) with Linear Kernel 
## 
## 409 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ... 
## Resampling results across tuning parameters:
## 
##   cost  Loss  RMSE      Rsquared   MAE     
##    4    L2    138.6183  0.2601965  121.0898
##    4    L1    805.6740  0.2492696  800.2012
##    8    L2    137.9572  0.2609627  120.4017
##    8    L1    511.9761  0.2510944  503.2552
##   16    L2    137.1520  0.2619170  119.4300
##   16    L1    319.7882  0.2456201  305.6865
##   32    L2    141.2740  0.2605979  122.7559
##   32    L1    209.9045  0.2560926  193.3992
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were cost = 16 and Loss = L2.
plot(svr_fit)

Artificial Neural Networks

Prior to constructing the regression ANN, we first split the new_data dataset into training and test sets. We then scale the explanatory variables: since we use the logistic activation function in our model, each input feature should fall in the [0, 1] interval (here we rescale PC1 through PC4).

# Scale the data:
# scale01() maps each observation onto the [0, 1] interval; applied via dplyr::mutate_at()
scale01 = function(x){
  (x - min(x)) / (max(x) - min(x))
}
new_data_preds_01 = new_data %>% mutate_at(c("PC1", "PC2", "PC3", "PC4"), scale01)
head(new_data_preds_01)
##   hous_st       PC1       PC2        PC3       PC4       PC5        PC6
## 1    1495 0.2853374 0.5896654 0.24103450 0.4723815 0.6971565 -0.9574271
## 2    1401 0.2982730 0.6150966 0.18637345 0.5256437 0.6730305 -0.8642424
## 3    1550 0.3005307 0.6077074 0.15342846 0.5004559 0.5733262 -0.7703319
## 4    1720 0.3036260 0.5962433 0.15384057 0.4930223 0.6511079 -0.7066144
## 5    1629 0.3151497 0.6036185 0.09329942 0.4850999 0.5711167 -0.6006079
## 6    1641 0.3154506 0.6111160 0.09806266 0.4428124 0.5150894 -0.6393254
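A quick check that the rescaled columns now span the [0, 1] interval:

# min and max of each rescaled principal component
sapply(new_data_preds_01[, c("PC1", "PC2", "PC3", "PC4")], range)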
# Split new_data_preds_01 into train and test sets
train_set_preds_01 = slice(new_data_preds_01, 1:407)
test_set_preds_01 = slice(new_data_preds_01, -(1:407))

Cross-validation folds for the [0, 1]-scaled data, before fitting the ANN

# Creating time slices of the training set after scaling
timeSlices = createTimeSlices(train_set_preds_01$hous_st, # vector of outcomes in chronological order
    initialWindow = 180, # initial training window: 15 years x 12 months
    horizon = 12, # number of consecutive values in each validation sample: 12 months
    fixedWindow = FALSE) # logical; if FALSE, all training samples start at 1
train_fold_inds_01 <- timeSlices$train[seq(from = 1, to = 211, by = 12)]
val_fold_inds_01 <- timeSlices$test[seq(from = 1, to = 211, by = 12)]

The time-slices method has created 211 folds. Modeling on 211 datasets is computationally expensive, so we reduce that number: instead of advancing the window by one month, we advance it by 12 months at a time. This leaves 211/12 = 17.58 ~ 18 cross-validated sets, or 18-fold time slices.
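A quick check of the resulting fold count:

length(train_fold_inds_01) # 18 folds after keeping every 12th slice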

nnet function with size and decay as tuning parameters
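The chunk that fits ann_fit is not shown above, but the later output (coefficients for hidden nodes h1 through h8, and filtering on size and decay in the stacking step) pins down its shape. A sketch of what it likely looked like, assuming caret's "nnet" method with a linear output unit for regression; the size and decay grids here are assumptions, except that the selected model evidently had 8 hidden units:

set.seed(123)
ann_fit <- caret::train(
  form = hous_st ~ .,
  data = train_set_preds_01,
  method = "nnet", # single-hidden-layer network from the nnet package
  linout = TRUE,   # linear output unit, since this is a regression problem
  trace = FALSE,   # suppress per-iteration optimization output
  maxit = 500,     # assumed iteration cap
  tuneGrid = expand.grid(
    size = c(4, 8, 12),       # hidden units (assumed grid; the chosen model used 8)
    decay = c(0.01, 0.1, 0.5) # weight decay (assumed grid)
  ),
  trControl = trainControl(method = "cv",
    index = train_fold_inds_01,   # same time-slice folds as the other component models
    indexOut = val_fold_inds_01,
    returnResamp = "all",
    savePredictions = TRUE)
)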

Metrics to measure Forecast Accuracy in test sets

Mean Absolute Percentage Error (MAPE): computes the average absolute percent difference between the observed and predicted values; MAPE = mean(abs((actual - predicted) / actual)).

Percent bias: computes the average amount by which actual exceeds predicted, as a percentage of the absolute value of actual; percent bias = mean((actual - predicted) / abs(actual)). Its value is close to 0 if the model is unbiased.
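A toy illustration with made-up numbers, one over-prediction and one under-prediction of equal relative size:

Metrics::mape(actual = c(100, 200), predicted = c(110, 180))         # 0.1
Metrics::percent_bias(actual = c(100, 200), predicted = c(110, 180)) # 0: the errors cancel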

#Prediction from ridge model
ridge_preds <- predict(ridge_fit, newdata = test_data)
ridge_mape = mape(test_data$hous_st, ridge_preds)
ridge_percent_bias = percent_bias(test_data$hous_st, ridge_preds)

#Prediction from KNN model
knn_preds <- predict(knn_fit, newdata = test_data)
knn_mape = mape(test_data$hous_st, knn_preds)
knn_percent_bias = percent_bias(test_data$hous_st, knn_preds)

#Prediction from ANN model
ann_preds <- predict(ann_fit, newdata = test_data)
ann_mape = mape(test_data$hous_st, ann_preds)
ann_percent_bias = percent_bias(test_data$hous_st, ann_preds)

#Prediction from SVR model
svr_preds <- predict(svr_fit, newdata = test_data)
svr_mape = mape(test_data$hous_st, svr_preds)
svr_percent_bias = percent_bias(test_data$hous_st, svr_preds)

(accuracy = data.frame(
  Models = c("Ridge", "KNN", "ANN", "SVR"),
  MAPE = c(ridge_mape, knn_mape, ann_mape, svr_mape),
  Percent_Bias = c(ridge_percent_bias, knn_percent_bias, ann_percent_bias, svr_percent_bias)
  
))
##   Models       MAPE Percent_Bias
## 1  Ridge 0.11440275  -0.05069752
## 2    KNN 0.27903330  -0.18149670
## 3    ANN 0.10310932  -0.06638015
## 4    SVR 0.09533544   0.03336799

Neural interpretation diagram (NID)

The plotnet function plots a neural interpretation diagram (NID). The black lines indicate positive weights between layers and grey lines indicate negative weights.

The thickness of each line increases with the magnitude of its weight. The first layer consists of the input variables, whose nodes are labelled I1 through I6 for the six explanatory variables.

The plot below consists of one hidden layer with 8 hidden nodes, labelled H1 through H8 (as confirmed by the coefficients below). The last layer is the output layer, labelled O1. The diagram also shows the bias nodes, which are connected to the hidden and output layers.

# To view a diagram of the fitted network, use the plotnet() function.
plotnet(ann_fit, x_names = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), y_names = "hous_st")

coef(ann_fit$finalModel)
##       b->h1      i1->h1      i2->h1      i3->h1      i4->h1      i5->h1 
##   6.9973094   2.7541821   1.9130187   0.1593454   2.6374881  -0.7964275 
##      i6->h1       b->h2      i1->h2      i2->h2      i3->h2      i4->h2 
##  -1.0175347   6.6981651   3.0004355   2.4396672   0.8289042   2.6990313 
##      i5->h2      i6->h2       b->h3      i1->h3      i2->h3      i3->h3 
##  -0.2761348  -1.7851033  60.6841860 -30.0739451 -30.8723839 -67.0854827 
##      i4->h3      i5->h3      i6->h3       b->h4      i1->h4      i2->h4 
## -41.9800897 -28.9049180  -8.6273998  35.2036697   9.3985052  10.3136393 
##      i3->h4      i4->h4      i5->h4      i6->h4       b->h5      i1->h5 
##   6.6504071  14.3804396   2.4946905 -20.5704051  17.0209935  -8.9758188 
##      i2->h5      i3->h5      i4->h5      i5->h5      i6->h5       b->h6 
## -13.2607918   0.8004982  -6.0931299  -1.1598040  -2.6605575  25.1301763 
##      i1->h6      i2->h6      i3->h6      i4->h6      i5->h6      i6->h6 
##  -1.8421162 -12.1820600 -21.3783462 -21.6272392   4.7187724  -2.5071266 
##       b->h7      i1->h7      i2->h7      i3->h7      i4->h7      i5->h7 
##  29.6180954  -0.3854124 -21.5384474 -64.6874797  29.0406152  -4.1728956 
##      i6->h7       b->h8      i1->h8      i2->h8      i3->h8      i4->h8 
##  16.9318749   5.3045046  -7.3714461 -54.5435554  18.5750584  -0.3864044 
##      i5->h8      i6->h8        b->o       h1->o       h2->o       h3->o 
##  -5.6130632 -28.4729414 180.5661546  87.3308392 168.1559459 298.7879217 
##       h4->o       h5->o       h6->o       h7->o       h8->o 
## 169.1409014 503.7799004 162.6350091 257.4076078 253.5629605

Stacking via Ridge Regression

# Step 1: Validation-fold predictions from component models

#Ridge
ridge_val_pred <- ridge_fit$pred %>%
  dplyr::filter(lambda == ridge_fit$bestTune$lambda) %>%
  arrange(rowIndex) %>%
  pull(pred)

# KNN
knn_val_pred <- knn_fit$pred %>%
  dplyr::filter(k == knn_fit$bestTune$k) %>%
  arrange(rowIndex) %>%
  pull(pred)

# ANN
ann_val_pred = ann_fit$pred %>%
  dplyr::filter(decay == ann_fit$bestTune$decay, size == ann_fit$bestTune$size) %>%
  arrange(rowIndex) %>%
  pull(pred)

# SVR
svr_val_pred = svr_fit$pred %>%
  dplyr::filter(cost == svr_fit$bestTune$cost, Loss == svr_fit$bestTune$Loss) %>%
  arrange(rowIndex) %>%
  pull(pred)

# Step 2: data set with validation-set component model predictions as explanatory variables
val_data <- train_data %>%
  slice(val_fold_inds %>% unlist()) %>%
  mutate(
   ridge_pred = ridge_val_pred,
    knn_pred = knn_val_pred,
    ann_pred = ann_val_pred,
    svr_pred = svr_val_pred
   
  )
# Step 3: fit model using component model predictions as explanatory variables
stacking_fit <- caret::train(
  form = hous_st ~  knn_pred +  ridge_pred + ann_pred + svr_pred,
  data = val_data,
  method = "glmnet", 
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = 2^seq(from = -4, to = 10)
    
  )
)
coef(stacking_fit$finalModel, stacking_fit$bestTune$lambda) %>% t() # transpose the result so it occupies less space
## 1 x 5 sparse Matrix of class "dgCMatrix"
##   (Intercept) knn_pred ridge_pred  ann_pred  svr_pred
## 1   -130.0761 0.216563  0.3378726 0.2389439 0.2921875
# Step 4 (both cross-validation and refitting to the full training set were already done
# to get ridge_fit, knn_fit, ann_fit and svr_fit above)
knn_test_pred <- predict(knn_fit, newdata = test_data)
ridge_test_pred <- predict(ridge_fit, newdata = test_data)
ann_test_pred = predict(ann_fit, newdata = test_data)
svr_test_pred = predict(svr_fit, newdata = test_data)


# Step 5: Assemble data frame of test set predictions from each component model
stacking_test_x = data.frame(
  ridge_pred = ridge_test_pred,
  knn_pred = knn_test_pred,
  ann_pred = ann_test_pred,
  svr_pred = svr_test_pred
)
stacking_fit
## glmnet 
## 
## 216 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 216, 216, 216, 216, 216, 216, ... 
## Resampling results across tuning parameters:
## 
##   lambda     RMSE      Rsquared   MAE      
##      0.0625  129.0097  0.8568090   96.07441
##      0.1250  129.0097  0.8568090   96.07441
##      0.2500  129.0097  0.8568090   96.07441
##      0.5000  129.0097  0.8568090   96.07441
##      1.0000  129.0097  0.8568090   96.07441
##      2.0000  129.0097  0.8568090   96.07441
##      4.0000  129.0097  0.8568090   96.07441
##      8.0000  129.0097  0.8568090   96.07441
##     16.0000  129.0097  0.8568090   96.07441
##     32.0000  129.0217  0.8568382   96.07210
##     64.0000  129.3976  0.8573605   95.97524
##    128.0000  131.4187  0.8576933   96.95202
##    256.0000  138.2664  0.8578426  101.44871
##    512.0000  155.8921  0.8578873  112.36394
##   1024.0000  188.4798  0.8578940  135.74148
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 16.
plot(stacking_fit)

# Step 6: Stacked model predictions
stacking_preds = predict(stacking_fit, stacking_test_x)
tail(as.data.frame(stacking_preds))
##     stacking_preds
## 97        1206.512
## 98        1216.285
## 99        1160.713
## 100       1022.141
## 101       1032.861
## 102       1818.448
# Calculate error rates
ensemble_mape = mape(test_data$hous_st, stacking_preds)
ensemble_percent_bias = percent_bias(test_data$hous_st, stacking_preds)

accuracy = data.frame(
  Models = c("Ridge", "KNN", "ANN", "SVR", "Ensemble"),
  MAPE = c(ridge_mape, knn_mape, ann_mape, svr_mape, ensemble_mape),
  Percent_Bias = c(ridge_percent_bias, knn_percent_bias, ann_percent_bias, svr_percent_bias, ensemble_percent_bias)
  
)
accuracy %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
Models        MAPE  Percent_Bias
Ridge    0.1144027    -0.0506975
KNN      0.2790333    -0.1814967
ANN      0.1031093    -0.0663802
SVR      0.0953354     0.0333680
Ensemble 0.0651865    -0.0057394

The stacked ensemble produces a MAPE of about 0.065, lower than that of every individual component model.

Comparing predictions from the ensemble model with the actual hous_st values:

#Plots comparing Predicted hous_st from ensemble model vs actual hous_st from test set
df_ensemble = data.frame(hous_st = test_data$hous_st, pred_hous_st_stack_ridge = stacking_preds)
 
# attach the dates from the last 102 rows of FinCrisis to the test-set results
# test set with 3 columns: Date, actual hous_st, and the ensemble predictions
series_ensemble = cbind(tail(FinCrisis[,1], 102) , df_ensemble) 
head(series_ensemble)
##         Date hous_st pred_hous_st_stack_ridge
## 1 2010-07-01     546                 487.3690
## 2 2010-08-01     599                 506.4065
## 3 2010-09-01     594                 579.3585
## 4 2010-10-01     543                 534.8373
## 5 2010-11-01     545                 510.2306
## 6 2010-12-01     539                 576.6878
# Melt the "series" data to long form so that both series (hous_st and pred_hous_st_stack_ridge) can be passed to one aesthetic:
series_ensemble_melt = reshape2::melt(series_ensemble, id.var = 'Date')
ggplot(series_ensemble_melt, aes(x = Date, y = value, colour = variable)) + geom_point() + xlab('Date') 

Predictions from individual component models with the actual hous_st values:

#Plots comparing Predicted hous_st from individual component model vs actual hous_st from test set
df_comp = data.frame(hous_st = test_data$hous_st, knn_preds, ridge_preds,  ann_preds, svr_preds)
 
# attach the dates from the last 102 rows of FinCrisis to the test-set results
# test set columns: Date, actual hous_st, and predictions from each component model plus the ensemble
series_comp = cbind(tail(FinCrisis[,1], 102) , df_comp, stacking_preds)
series_comp %>%
kable("html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
Date hous_st knn_preds ridge_preds ann_preds svr_preds stacking_preds
2010-07-01 546 593.5 592.0258 605.1938 493.7866 487.3690
2010-08-01 599 584.0 626.0560 605.1938 526.6318 506.4065
2010-09-01 594 559.5 748.1920 605.1938 653.2332 579.3585
2010-10-01 543 559.5 678.9041 605.1938 580.9826 534.8373
2010-11-01 545 559.5 642.1370 605.1938 539.2831 510.2306
2010-12-01 539 661.5 708.9837 605.1938 613.8317 576.6878
2011-01-01 630 620.0 638.8115 605.1938 540.7761 522.6453
2011-02-01 517 609.0 650.4377 605.1939 563.0942 530.7123
2011-03-01 600 625.0 700.3906 605.1940 609.1985 564.5262
2011-04-01 554 661.5 704.6946 605.2362 609.0198 573.8428
2011-05-01 561 661.5 713.7050 605.5362 616.6482 579.1878
2011-06-01 608 661.5 742.7555 605.3604 648.3127 598.2131
2011-07-01 623 661.5 776.0342 605.2009 687.6789 620.9213
2011-08-01 585 585.0 792.4190 605.1944 694.4731 611.8738
2011-09-01 650 585.0 808.6609 605.1995 705.2766 620.5194
2011-10-01 610 585.0 791.3653 857.8555 686.0553 669.4300
2011-11-01 711 661.5 826.4117 862.6014 722.1650 709.5231
2011-12-01 694 661.5 874.1928 862.6014 772.3487 740.3301
2012-01-01 723 661.5 813.7806 862.6014 705.4386 700.3682
2012-02-01 704 661.5 864.6327 862.6014 757.3326 732.7125
2012-03-01 695 661.5 865.4521 862.6014 762.5044 734.5005
2012-04-01 753 661.5 939.9862 862.6028 845.5302 783.9430
2012-05-01 708 661.5 926.9428 862.6014 824.0429 773.2573
2012-06-01 757 610.5 925.3699 862.6014 819.7723 760.4333
2012-07-01 740 610.5 984.8099 862.6014 884.1718 799.3332
2012-08-01 754 610.5 982.6803 862.6016 884.9667 798.8460
2012-09-01 847 661.5 952.3541 862.6018 852.6442 790.2001
2012-10-01 915 661.5 985.9349 862.6082 894.5085 813.7799
2012-11-01 833 661.5 957.0790 862.6016 857.7902 793.3001
2012-12-01 976 661.5 974.0883 862.6017 875.9364 804.3492
2013-01-01 888 661.5 1041.3182 862.8398 950.2901 848.8464
2013-02-01 962 661.5 1032.3162 866.0809 944.6333 844.9265
2013-03-01 1010 661.5 1090.2145 993.3530 1012.4336 914.7100
2013-04-01 835 661.5 1000.0634 862.6702 908.2104 822.5719
2013-05-01 930 661.5 973.0228 862.9436 883.0676 806.1545
2013-06-01 839 661.5 1018.6964 936.2836 935.7860 854.5142
2013-07-01 880 661.5 936.4299 1006.4158 862.7309 822.1305
2013-08-01 917 661.5 911.7910 1019.3600 838.2116 809.7343
2013-09-01 850 661.5 926.5522 1023.1875 854.4846 820.3911
2013-10-01 925 661.5 990.4888 1024.8873 919.4286 861.3755
2013-11-01 1100 1622.0 999.1392 1025.2343 934.9733 1076.9319
2013-12-01 1002 1582.0 921.1486 1025.2265 852.7169 1017.8823
2014-01-01 890 1622.0 975.0005 1025.2356 911.7179 1061.9815
2014-02-01 941 1622.0 1001.0735 1025.2351 939.5957 1078.9362
2014-03-01 970 1622.0 986.8962 1025.2285 926.1726 1070.2225
2014-04-01 1049 1582.0 915.7223 1025.2001 849.5095 1015.1054
2014-05-01 1008 1602.0 1012.6392 1025.2344 951.3262 1081.9401
2014-06-01 903 1583.5 896.0709 1024.4633 825.0677 1001.4730
2014-07-01 1082 1602.0 906.6354 1021.2084 837.8084 1011.9938
2014-08-01 983 1604.5 1009.9188 1025.2095 948.1572 1080.6304
2014-09-01 1020 1602.0 1064.8912 1025.2363 1011.0579 1117.0478
2014-10-01 1077 1564.5 1017.8838 1025.2299 956.1328 1076.9943
2014-11-01 1001 1564.5 948.4641 1023.3827 879.5823 1030.7308
2014-12-01 1068 1578.5 1053.1219 1025.1859 990.2144 1101.8799
2015-01-01 1094 1472.5 1105.6809 1025.0466 1040.4889 1111.3387
2015-02-01 888 1595.0 1049.8279 1021.0396 975.3056 1098.9933
2015-03-01 963 1595.0 934.6451 885.1060 852.9855 991.8553
2015-04-01 1203 1607.5 1123.8195 1056.6987 1065.1123 1161.4610
2015-05-01 1079 1604.5 1128.1672 1025.2353 1073.1220 1157.1026
2015-06-01 1185 1618.5 1028.0778 1025.2151 968.2269 1095.6632
2015-07-01 1133 1618.5 1073.5894 1025.2348 1017.2117 1125.3578
2015-08-01 1134 1619.5 1071.3946 1027.9051 1010.9413 1123.6387
2015-09-01 1212 1415.5 1051.5663 1152.4016 996.5563 1098.3049
2015-10-01 1064 1618.5 1031.9647 1081.0902 971.7602 1111.3598
2015-11-01 1171 1578.5 1056.7912 1183.8574 996.3089 1142.8139
2015-12-01 1155 1607.5 1096.1189 1278.6830 1036.5720 1196.8044
2016-01-01 1114 1540.0 1113.6473 1278.8215 1058.4558 1194.5360
2016-02-01 1202 1540.0 1106.2529 1277.9572 1046.2162 1188.2549
2016-03-01 1115 1540.0 1098.5662 1270.1216 1035.9612 1180.7891
2016-04-01 1173 1595.0 1069.0911 1232.5201 1000.1365 1163.2891
2016-05-01 1133 1595.0 1093.3973 1271.9213 1029.0504 1189.3644
2016-06-01 1183 1452.5 1178.8287 1279.6810 1123.4822 1216.8150
2016-07-01 1225 1565.5 1216.2870 1280.0957 1158.2109 1264.1892
2016-08-01 1161 1515.5 1152.6975 1266.2317 1089.9508 1208.6184
2016-09-01 1064 1515.5 1115.3258 1193.2265 1049.4746 1166.7207
2016-10-01 1327 1452.5 1149.1826 1274.9623 1089.5196 1195.7475
2016-11-01 1151 1452.5 1241.5062 1289.3787 1200.9242 1262.9368
2016-12-01 1280 1618.5 1116.6259 1278.8077 1065.0452 1214.4647
2017-01-01 1225 1608.0 1129.7097 1278.4982 1075.5985 1219.6210
2017-02-01 1289 1608.0 1194.8417 1279.3930 1147.3295 1262.8000
2017-03-01 1179 1608.0 1227.9956 1292.1060 1184.8864 1288.0131
2017-04-01 1165 1619.5 1124.1391 1277.8390 1068.9230 1218.1213
2017-05-01 1122 1528.5 1187.0681 1297.5728 1138.8926 1244.8355
2017-06-01 1225 1452.5 1249.0476 1594.1367 1205.8426 1339.7420
2017-07-01 1185 1377.0 1162.7122 1114.0834 1116.4570 1153.3980
2017-08-01 1172 1469.0 1080.7993 881.0112 1021.3955 1062.1787
2017-09-01 1158 1595.0 1123.9557 1227.6680 1063.9528 1199.3132
2017-10-01 1265 1452.5 1181.4216 1375.1890 1132.3831 1243.1129
2017-11-01 1303 1588.0 1203.6580 1336.2336 1147.1213 1274.9684
2017-12-01 1210 1452.5 1200.8528 1438.8483 1148.5592 1269.6156
2018-01-01 1334 1452.5 1208.4962 1407.7139 1157.8125 1267.4624
2018-02-01 1290 1452.5 1274.4365 1767.0213 1233.1146 1397.5985
2018-03-01 1327 1452.5 1231.0717 1698.3062 1181.5488 1351.4608
2018-04-01 1276 1452.5 1222.1804 1434.3382 1174.5363 1283.3342
2018-05-01 1329 1452.5 1227.4562 1454.4434 1179.0103 1291.2279
2018-06-01 1177 1452.5 1169.9136 1345.7183 1115.3431 1227.2039
2018-07-01 1184 1452.5 1137.0422 1350.3594 1078.7430 1206.5124
2018-08-01 1280 1452.5 1147.1613 1360.4075 1092.2717 1216.2853
2018-09-01 1237 1452.5 1070.4007 1341.2537 1006.5054 1160.7135
2018-10-01 1217 1356.0 956.1160 1157.6658 886.0567 1022.1407
2018-11-01 1256 1356.0 942.6102 1237.9392 872.7185 1032.8610
2018-12-01 1260 1770.5 1775.1186 1823.2856 1812.7832 1818.4482
# Melt the "series" data to long form so that all six series (actual hous_st plus the four component-model predictions and the ensemble predictions) can be passed to one aesthetic:
series_comp_melt = reshape2::melt(series_comp, id.var = 'Date')
ggplot(series_comp_melt, aes(x = Date, y = value, colour = variable)) + geom_point() + xlab('Date')