knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

The dataset “FinCrisis” consists of economic variables that affect the number of housing starts in the US.

The following are the variables in the dataset:

hous_st

  1. Housing Starts: Total: New Privately Owned Housing Units Started, Thousands of Units, Monthly, Seasonally Adjusted Annual Rate

CPI

  1. Consumer Price Index for All Urban Consumers: All Items, Index 1982-1984=100, Monthly, Seasonally Adjusted

unempR

  1. Civilian Unemployment Rate, Percent, Monthly, Seasonally Adjusted

sec_conL

  1. Total Consumer Credit Owned and Securitized, Outstanding, Billions of Dollars, Monthly, Seasonally Adjusted

yield_sp

  1. 10-Year Treasury Constant Maturity Minus 2-Year Treasury Constant Maturity, Percent, Monthly, Not Seasonally Adjusted

real_estL

  1. Real Estate Loans, All Commercial Banks, Billions of U.S. Dollars, Monthly, Seasonally Adjusted

mortgR

  1. 30-Year Conventional Mortgage Rate, Percent, Monthly, Not Seasonally Adjusted

fed_fundsR

  1. Effective Federal Funds Rate, Percent, Monthly, Not Seasonally Adjusted

income

  1. Real Disposable Personal Income, Billions of Chained 2012 Dollars, Monthly, Seasonally Adjusted Annual Rate

pvt_house_comp

  1. New Privately-Owned Housing Units Completed: Total, Thousands of Units, Monthly, Seasonally Adjusted Annual Rate

house_supply

  1. Monthly Supply of Houses in the United States, Months’ Supply, Monthly, Seasonally Adjusted

Loading R packages

library(readxl)
library(readr)
library(dplyr)
library(ggplot2)
library(GGally)
library(gridExtra)
library(purrr)
library(glmnet) # computes penalized linear regression models such as lasso and ridge
library(tidyverse)
library(caret)
library(data.table)
library("NeuralNetTools") # plots neural nets using nnet package
library(nnet)
library(forecast)
library(LiblineaR) # estimates predictive linear models for classification and regression
library(kableExtra) # generates customized tables
library(ggbiplot) # plots PCs and sets the graphical parameters of the plots
library(factoextra) # visualization for PCs
library(Metrics) # evaluation metric functions for machine learning models
FinCrisis = read_xlsx("FinCrisis.xlsx")

Convert the FinCrisis dataset into a time series

# converts from data.frame to .ts
time_ser = ts(FinCrisis, frequency = 12, start = c(1976, 6), end = c(2018, 12))
time_ser.df = as.data.frame(time_ser) 

# Making a train and test split for the estimation
train_fin = slice(time_ser.df, 1:409)
test_fin = slice(time_ser.df, -(1:409))

Correlation matrix

cor_mat = cor(FinCrisis[, unlist(lapply(FinCrisis, is.numeric))])
# rank variable pairs from most positively to most negatively correlated
cor_mat %>%
  as.data.frame() %>%
  mutate(var1 = rownames(.)) %>%
  gather(var2, value, -var1) %>%
  arrange(desc(value)) %>%
  group_by(value) %>%
  dplyr::filter(row_number() == 1) 
## # A tibble: 56 x 3
## # Groups:   value [56]
##    var1           var2       value
##    <chr>          <chr>      <dbl>
##  1 hous_st        hous_st    1    
##  2 sec_conL       income     1.000
##  3 CPI            income     0.999
##  4 CPI            sec_conL   0.999
##  5 real_estL      CPI        0.997
##  6 real_estL      sec_conL   0.997
##  7 real_estL      income     0.996
##  8 mortgR         fed_fundsR 0.989
##  9 pvt_house_comp hous_st    0.981
## 10 real_estL      yield_sp   0.794
## # … with 46 more rows

Cross-validated MSE

crossval_fold_inds <- caret::createFolds(
  y = train_fin$hous_st,
  k = 10
)
# mean squared error between observed and predicted values
calc_mse <- function(observed, predicted) {
  mean((observed - predicted)^2)
}
get_val_mse <- function(formula, train_fin, cross_fold_inds, val_fold){
  # hold out one fold for validation and fit on the remaining folds
  val_train <- train_fin %>% slice(cross_fold_inds[[val_fold]])
  train2_data <- train_fin %>% slice(-cross_fold_inds[[val_fold]])
  
  fitmod <- lm(formula, data = train2_data)
  
  val_preds <- predict(fitmod, val_train)
  
  calc_mse(observed = val_train$hous_st, predicted = val_preds)
}
mse_results <- expand.grid(
  val_fold = 1:10
)
get_crossval_mse <- function(formula, train_fin, crossval_fold_inds){
  xval_mse_results <- mse_results %>%
    mutate(
      val_mse = mse_results %>%
        pmap_dbl(
          get_val_mse,
          formula = formula,
          cross_fold_inds = crossval_fold_inds,
          train_fin = train_fin
        ))
  
  mean(xval_mse_results$val_mse)
}
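To use these helpers, pass any lm() formula on train_fin; a hypothetical call (this particular specification is illustrative, not one of the fitted models):

# Example: 10-fold cross-validated MSE for a simple linear specification
get_crossval_mse(hous_st ~ CPI + mortgR + income, train_fin, crossval_fold_inds)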

As the models evaluated above had high test-set MSEs, we suspected this may be due to multicollinearity among the independent variables in the dataset, so we chose to analyze the data using PCA.

Principal Component Analysis

In PCA, every variable is centered at zero so that we can easily compare each principal component to the mean. Centering also removes problems arising from the different scales of the variables. After obtaining the PCs, we sort the components from largest to smallest based on their standard deviations (the square roots of the eigenvalues).

Proportion of variance shows how much of the total variance is explained by each principal component. The components are always sorted by importance, so the most important components come first. In this dataset, PC1 accounts for more than 60% of the total variance in the data. The cumulative proportion shows how much variance is accumulated as components are added.

FinCrisis.pca <- 
  prcomp(time_ser.df[, -c(1, 2)], # numeric explanatory variables only; drop date and response
         center = TRUE, # shift variables to be zero centered
         scale. = TRUE) # scale variables to unit variance before the analysis
summary(FinCrisis.pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6    PC7
## Standard deviation     2.4645 1.3724 1.0866 0.65482 0.49411 0.36791 0.1731
## Proportion of Variance 0.6074 0.1883 0.1181 0.04288 0.02441 0.01354 0.0030
## Cumulative Proportion  0.6074 0.7957 0.9138 0.95669 0.98111 0.99464 0.9976
##                            PC8     PC9    PC10
## Standard deviation     0.13519 0.06179 0.03869
## Proportion of Variance 0.00183 0.00038 0.00015
## Cumulative Proportion  0.99947 0.99985 1.00000
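As a sanity check, the proportions above can be recomputed by hand from the sdev slot of the prcomp object:

# PVE of component k = its eigenvalue (sdev^2) divided by the sum of all eigenvalues
pve <- FinCrisis.pca$sdev^2 / sum(FinCrisis.pca$sdev^2)
round(pve, 4)          # matches the "Proportion of Variance" row above
round(cumsum(pve), 4)  # matches the "Cumulative Proportion" row above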
# Eigenvalues
eig.val <- get_eigenvalue(FinCrisis.pca)
eig.val %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
          eigenvalue  variance.percent  cumulative.variance.percent
Dim.1      6.0739086        60.7390863                     60.73909
Dim.2      1.8835272        18.8352721                     79.57436
Dim.3      1.1807085        11.8070849                     91.38144
Dim.4      0.4287866         4.2878663                     95.66931
Dim.5      0.2441449         2.4414486                     98.11076
Dim.6      0.1353575         1.3535751                     99.46433
Dim.7      0.0299749         0.2997494                     99.76408
Dim.8      0.0182776         0.1827762                     99.94686
Dim.9      0.0038175         0.0381750                     99.98503
Dim.10     0.0014966         0.0149660                    100.00000

A data matrix X of dimension n x p can have up to min(n-1, p) principal components. The goal of PCA is to significantly reduce the number of variables used, such that the fewest principal components explain the maximum variability.

A common technique for choosing the number of PCs to use is to eyeball the scree plot below and look for the "elbow point", where the proportion of variance explained (PVE) plummets.

screeplot(FinCrisis.pca, type = "l", npcs = 10, main = "Screeplot of the first 10 PCs")
abline(h = 1, col="red", lty=5)
legend("topright", legend=c("Eigenvalue = 1"),
       col=c("red"), lty=5, cex=0.6)

An eigenvalues <1 would mean that the component actually explains less than a single explanatory variable. We notice is that the first 3 components has an Eigenvalue >1 and explains almost 91% of variance. Alternatively, the first 6 components explain 99.46 percent variance. Thus, I have reduced dimensionality from 12 to 6 while only “loosing” about 0.53567 % of variance.

# Visualize eigenvalues (scree plot). Show the percentage of variances explained by each principal component.
fviz_eig(FinCrisis.pca)

#selecting the first 6 principal components to use in the model
x_pca = FinCrisis.pca$x[, 1:6]
x_pca %>% head() %>%
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
       PC1       PC2        PC3        PC4       PC5        PC6
-2.532253 0.9589985 -0.6856539 -0.4194933 0.6971565 -0.9574271
-2.417471 1.1089015 -0.9856733 -0.2229814 0.6730305 -0.8642424
-2.397437 1.0653462 -1.1664993 -0.3159122 0.5733262 -0.7703319
-2.369972 0.9977712 -1.1642373 -0.3433385 0.6511079 -0.7066144
-2.267717 1.0412444 -1.4965310 -0.3725687 0.5711167 -0.6006079
-2.265047 1.0854381 -1.4703869 -0.5285890 0.5150894 -0.6393254
# Biplot for Principal Components using ggplot2
ggbiplot(FinCrisis.pca)

The variables are drawn as arrows originating from the center point. The yield spread (yield_sp), real estate loans (real_estL) and CPI contribute strongly to PC1, with higher values of those variables moving observations to the right on this plot.
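The loadings behind the biplot can be inspected directly from the rotation matrix:

# each column shows how strongly (and in which direction) a variable loads on that PC
round(FinCrisis.pca$rotation[, 1:3], 3)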

# combine PC1-PC6 as explanatory variables with hous_st as response variable in a new dataset:
new_data = cbind(time_ser.df, as.data.frame(x_pca))
new_data = new_data[, -c(1, 3:12)] # keep hous_st and the PCs; drop the date column and original predictors

# Making a train and test split for the estimation
train_data <- slice(new_data, 1:409)
test_data <- slice(new_data, -(1:409))

Create a stacked ensemble using the following methods:

  1. K-Nearest Neighbors
  2. Ridge/LASSO
  3. Artificial Neural Networks
  4. Support Vector Machine

# Creating time slices of the training set (used in place of standard cv folds)
timeSlices = createTimeSlices(train_data$hous_st, # vector of outcomes in chronological order
    initialWindow = 180, # initial training window: 15 years x 12 months
    horizon = 12, # number of consecutive values in each validation sample: 12 months
    fixedWindow = FALSE) # logical; if FALSE, all training samples start at 1

train_fold_inds <- timeSlices$train[seq(from = 1, to = 211, by = 12)]
val_fold_inds <- timeSlices$test[seq(from = 1, to = 211, by = 12)]

K-Nearest Neighbors

knn_fit <- caret::train(
  form = hous_st ~ .,
  data = train_data,
  method = "knn",
  preProcess = c("scale", "center"),
  trControl = trainControl(method = "cv",
    number = 10,
    index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
    indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
    returnResamp = "all",
    savePredictions = TRUE),
  tuneGrid = data.frame(k = 1:10)
)

mean(abs(test_data$hous_st - predict(knn_fit, newdata = test_data))) # MAE in test set
## [1] 287.4804
#Output of kNN fit
knn_fit
## k-Nearest Neighbors 
## 
## 409 samples
##   6 predictor
## 
## Pre-processing: scaled (6), centered (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  160.8808  0.1875936  137.2130
##    2  151.9892  0.1520998  131.1968
##    3  154.6955  0.1541260  132.6327
##    4  160.9234  0.1306035  139.3553
##    5  161.3336  0.2314432  139.9667
##    6  160.6328  0.2051346  139.4336
##    7  161.7521  0.2503193  141.7870
##    8  163.1922  0.2509895  144.2205
##    9  164.3352  0.2324320  146.3585
##   10  167.7617  0.2027130  149.1916
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 2.
# Plot cross-validated RMSE against the number of neighbors
plot(knn_fit)

Ridge

When predictors in a regression are strongly correlated, each regression coefficient depends on which other predictors are in the model. A coefficient therefore no longer reflects the isolated effect of that predictor on the response; it captures only a partial or marginal effect, conditional on the other predictors in the model. Ridge regression adds a small amount of bias to alleviate multicollinearity.
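For reference, ridge minimizes RSS + lambda * sum(beta_j^2), so coefficients shrink toward zero as lambda grows. A minimal sketch of the shrinkage path using glmnet directly (this x/y construction is for illustration only and sits outside the caret pipeline below):

x_train <- as.matrix(train_data[, paste0("PC", 1:6)]) # explanatory variables PC1-PC6
y_train <- train_data$hous_st
ridge_path <- glmnet(x_train, y_train, alpha = 0, lambda = 2^seq(from = -4, to = 10))
plot(ridge_path, xvar = "lambda", label = TRUE) # each coefficient path shrinks toward 0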

# glmnet() runs the model many times for different values of lambda. 
ridge_fit <- caret::train(
  form = hous_st ~ .,
  data = train_data,
  method = "glmnet", # method for fit
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = 2^seq(from = -4, to = 10) # plug in different values of lambda to see how test RMSE changes
  ),
  trControl = trainControl(method = "cv", # evaluate method performance via cross-validation
   number = 10, # number of folds for cross-validation
    index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
    indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
    returnResamp = "all", # return information from cross-validation
    savePredictions = TRUE) # return validation set predictions from cross-validation
)
mean(abs(test_data$hous_st - predict(ridge_fit, newdata = test_data))) # MAE in test set
## [1] 99.16544
ridge_fit
## glmnet 
## 
## 409 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ... 
## Resampling results across tuning parameters:
## 
##   lambda     RMSE      Rsquared   MAE     
##      0.0625  125.1038  0.2564421  105.6468
##      0.1250  125.1038  0.2564421  105.6468
##      0.2500  125.1038  0.2564421  105.6468
##      0.5000  125.1038  0.2564421  105.6468
##      1.0000  125.1038  0.2564421  105.6468
##      2.0000  125.1038  0.2564421  105.6468
##      4.0000  125.1038  0.2564421  105.6468
##      8.0000  125.1038  0.2564421  105.6468
##     16.0000  124.6471  0.2566134  105.2336
##     32.0000  123.0606  0.2547423  104.0415
##     64.0000  130.8146  0.2522117  113.9690
##    128.0000  150.6686  0.2493908  135.2232
##    256.0000  179.9229  0.2451270  163.4515
##    512.0000  210.8423  0.2400159  193.8696
##   1024.0000  235.9032  0.2353817  220.1529
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 32.
plot(ridge_fit)

Support Vector Regression

set.seed(123)
svr_fit <- caret::train(
  form = hous_st ~ .,
  data = train_data,
  method = "svmLinear3", # method for fit
  tuneGrid = expand.grid(
    cost = 2^(2:5), # inverse of the regularization constant: the cost of a margin violation; a small cost gives wide margins and many support vectors
    Loss = c("L1", "L2") # evaluate both loss functions
  ),
  trControl = trainControl(method = "cv", # evaluate method performance via cross-validation
   number = 10, # number of folds for cross-validation
    index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
    indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
    returnResamp = "all", # return information from cross-validation
    savePredictions = TRUE) # return validation set predictions from cross-validation
)
mean(abs(test_data$hous_st - predict(svr_fit, newdata = test_data))) # MAE in test set
## [1] 96.70616
svr_fit
## L2 Regularized Support Vector Machine (dual) with Linear Kernel 
## 
## 409 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ... 
## Resampling results across tuning parameters:
## 
##   cost  Loss  RMSE      Rsquared   MAE     
##    4    L2    138.6183  0.2601965  121.0898
##    4    L1    805.6740  0.2492696  800.2012
##    8    L2    137.9572  0.2609627  120.4017
##    8    L1    511.9761  0.2510944  503.2552
##   16    L2    137.1520  0.2619170  119.4300
##   16    L1    319.7882  0.2456201  305.6865
##   32    L2    141.2740  0.2605979  122.7559
##   32    L1    209.9045  0.2560926  193.3992
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were cost = 16 and Loss = L2.
plot(svr_fit)

Artificial Neural Networks

Prior to constructing the regression ANN, we first split the new_data dataset into training and test sets. We then scale the explanatory variables: since we use the logistic activation function in our model, each input feature should fall in the [0, 1] interval (here we rescale PC1 through PC4).

# Scale the data:
# scale01() maps each observation onto the [0, 1] interval; applied via dplyr::mutate_at()
scale01 = function(x){
  (x - min(x)) / (max(x) - min(x))
}
new_data_preds_01 = new_data %>% mutate_at(c("PC1", "PC2", "PC3", "PC4"), scale01)
head(new_data_preds_01)
##   hous_st       PC1       PC2        PC3       PC4       PC5        PC6
## 1    1495 0.2853374 0.5896654 0.24103450 0.4723815 0.6971565 -0.9574271
## 2    1401 0.2982730 0.6150966 0.18637345 0.5256437 0.6730305 -0.8642424
## 3    1550 0.3005307 0.6077074 0.15342846 0.5004559 0.5733262 -0.7703319
## 4    1720 0.3036260 0.5962433 0.15384057 0.4930223 0.6511079 -0.7066144
## 5    1629 0.3151497 0.6036185 0.09329942 0.4850999 0.5711167 -0.6006079
## 6    1641 0.3154506 0.6111160 0.09806266 0.4428124 0.5150894 -0.6393254
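A quick check that the rescaled columns now span the [0, 1] interval:

# min and max of each rescaled principal component
sapply(new_data_preds_01[, c("PC1", "PC2", "PC3", "PC4")], range)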
# Split new_data_preds_01 into train and test sets
train_set_preds_01 = slice(new_data_preds_01, 1:407)
test_set_preds_01 = slice(new_data_preds_01, -(1:407))

Cross-validation folds for the [0, 1]-scaled data, before fitting the ANN

# Creating time slices of the training set after scaling
timeSlices = createTimeSlices(train_set_preds_01$hous_st, # vector of outcomes in chronological order
    initialWindow = 180, # initial training window: 15 years x 12 months
    horizon = 12, # number of consecutive values in each validation sample: 12 months
    fixedWindow = FALSE) # logical; if FALSE, all training samples start at 1
train_fold_inds_01 <- timeSlices$train[seq(from = 1, to = 211, by = 12)]
val_fold_inds_01 <- timeSlices$test[seq(from = 1, to = 211, by = 12)]

The time-slices method has created 211 folds. Modeling on 211 datasets is computationally expensive, so we reduce that number: instead of advancing the window by one month, we advance it by 12 months at a time. This leaves 211/12 = 17.58 ~ 18 cross-validated sets, or 18-fold time slices.
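A quick check of the resulting fold count:

length(train_fold_inds_01) # 18 folds after keeping every 12th slice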

nnet function with size and decay as tuning parameters
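The chunk that fits ann_fit is not shown above, but the later output (coefficients for hidden nodes h1 through h8, and filtering on size and decay in the stacking step) pins down its shape. A sketch of what it likely looked like, assuming caret's "nnet" method with a linear output unit for regression; the size and decay grids here are assumptions, except that the selected model evidently had 8 hidden units:

set.seed(123)
ann_fit <- caret::train(
  form = hous_st ~ .,
  data = train_set_preds_01,
  method = "nnet", # single-hidden-layer network from the nnet package
  linout = TRUE,   # linear output unit, since this is a regression problem
  trace = FALSE,   # suppress per-iteration optimization output
  maxit = 500,     # assumed iteration cap
  tuneGrid = expand.grid(
    size = c(4, 8, 12),       # hidden units (assumed grid; the chosen model used 8)
    decay = c(0.01, 0.1, 0.5) # weight decay (assumed grid)
  ),
  trControl = trainControl(method = "cv",
    index = train_fold_inds_01,   # same time-slice folds as the other component models
    indexOut = val_fold_inds_01,
    returnResamp = "all",
    savePredictions = TRUE)
)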

Metrics to measure Forecast Accuracy in test sets

Mean Absolute Percentage Error (MAPE): computes the average absolute percent difference between the observed and predicted values; MAPE = mean(abs((actual - predicted) / actual)).

Percent bias: computes the average amount by which actual exceeds predicted, as a percentage of the absolute value of actual; percent bias = mean((actual - predicted) / abs(actual)). Its value is close to 0 if the model is unbiased.
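A toy illustration with made-up numbers, one over-prediction and one under-prediction of equal relative size:

Metrics::mape(actual = c(100, 200), predicted = c(110, 180))         # 0.1
Metrics::percent_bias(actual = c(100, 200), predicted = c(110, 180)) # 0: the errors cancel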

#Prediction from ridge model
ridge_preds <- predict(ridge_fit, newdata = test_data)
ridge_mape = mape(test_data$hous_st, ridge_preds)
ridge_percent_bias = percent_bias(test_data$hous_st, ridge_preds)

#Prediction from KNN model
knn_preds <- predict(knn_fit, newdata = test_data)
knn_mape = mape(test_data$hous_st, knn_preds)
knn_percent_bias = percent_bias(test_data$hous_st, knn_preds)

#Prediction from ANN model
ann_preds <- predict(ann_fit, newdata = test_data)
ann_mape = mape(test_data$hous_st, ann_preds)
ann_percent_bias = percent_bias(test_data$hous_st, ann_preds)

#Prediction from SVR model
svr_preds <- predict(svr_fit, newdata = test_data)
svr_mape = mape(test_data$hous_st, svr_preds)
svr_percent_bias = percent_bias(test_data$hous_st, svr_preds)

(accuracy = data.frame(
  Models = c("Ridge", "KNN", "ANN", "SVR"),
  MAPE = c(ridge_mape, knn_mape, ann_mape, svr_mape),
  Percent_Bias = c(ridge_percent_bias, knn_percent_bias, ann_percent_bias, svr_percent_bias)
  
))
##   Models       MAPE Percent_Bias
## 1  Ridge 0.11440275  -0.05069752
## 2    KNN 0.27903330  -0.18149670
## 3    ANN 0.10310932  -0.06638015
## 4    SVR 0.09533544   0.03336799

Neural interpretation diagram (NID)

The plotnet function plots a neural interpretation diagram (NID). The black lines indicate positive weights between layers and grey lines indicate negative weights.

The thickness of each line increases with the magnitude of its weight. The first layer consists of the input variables, whose nodes are labelled I1 through I6 for the six explanatory variables.

The plot below consists of one hidden layer with 8 hidden nodes, labelled H1 through H8 (as confirmed by the coefficients below). The last layer is the output layer, labelled O1. The diagram also shows the bias nodes, which are connected to the hidden and output layers.

# To view a diagram of the fitted network, use the plotnet() function.
plotnet(ann_fit, x_names = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), y_names = "hous_st")

coef(ann_fit$finalModel)
##       b->h1      i1->h1      i2->h1      i3->h1      i4->h1      i5->h1 
##   6.9973094   2.7541821   1.9130187   0.1593454   2.6374881  -0.7964275 
##      i6->h1       b->h2      i1->h2      i2->h2      i3->h2      i4->h2 
##  -1.0175347   6.6981651   3.0004355   2.4396672   0.8289042   2.6990313 
##      i5->h2      i6->h2       b->h3      i1->h3      i2->h3      i3->h3 
##  -0.2761348  -1.7851033  60.6841860 -30.0739451 -30.8723839 -67.0854827 
##      i4->h3      i5->h3      i6->h3       b->h4      i1->h4      i2->h4 
## -41.9800897 -28.9049180  -8.6273998  35.2036697   9.3985052  10.3136393 
##      i3->h4      i4->h4      i5->h4      i6->h4       b->h5      i1->h5 
##   6.6504071  14.3804396   2.4946905 -20.5704051  17.0209935  -8.9758188 
##      i2->h5      i3->h5      i4->h5      i5->h5      i6->h5       b->h6 
## -13.2607918   0.8004982  -6.0931299  -1.1598040  -2.6605575  25.1301763 
##      i1->h6      i2->h6      i3->h6      i4->h6      i5->h6      i6->h6 
##  -1.8421162 -12.1820600 -21.3783462 -21.6272392   4.7187724  -2.5071266 
##       b->h7      i1->h7      i2->h7      i3->h7      i4->h7      i5->h7 
##  29.6180954  -0.3854124 -21.5384474 -64.6874797  29.0406152  -4.1728956 
##      i6->h7       b->h8      i1->h8      i2->h8      i3->h8      i4->h8 
##  16.9318749   5.3045046  -7.3714461 -54.5435554  18.5750584  -0.3864044 
##      i5->h8      i6->h8        b->o       h1->o       h2->o       h3->o 
##  -5.6130632 -28.4729414 180.5661546  87.3308392 168.1559459 298.7879217 
##       h4->o       h5->o       h6->o       h7->o       h8->o 
## 169.1409014 503.7799004 162.6350091 257.4076078 253.5629605

Stacking via Ridge Regression

# Step 1: Validation-fold predictions from component models

#Ridge
ridge_val_pred <- ridge_fit$pred %>%
  dplyr::filter(lambda == ridge_fit$bestTune$lambda) %>%
  arrange(rowIndex) %>%
  pull(pred)

# KNN
knn_val_pred <- knn_fit$pred %>%
  dplyr::filter(k == knn_fit$bestTune$k) %>%
  arrange(rowIndex) %>%
  pull(pred)

# ANN
ann_val_pred = ann_fit$pred %>%
  dplyr::filter(decay == ann_fit$bestTune$decay, size == ann_fit$bestTune$size) %>%
  arrange(rowIndex) %>%
  pull(pred)

# SVR
svr_val_pred = svr_fit$pred %>%
  dplyr::filter(cost == svr_fit$bestTune$cost, Loss == svr_fit$bestTune$Loss) %>%
  arrange(rowIndex) %>%
  pull(pred)

# Step 2: data set with validation-set component model predictions as explanatory variables
val_data <- train_data %>%
  slice(val_fold_inds %>% unlist()) %>%
  mutate(
   ridge_pred = ridge_val_pred,
    knn_pred = knn_val_pred,
    ann_pred = ann_val_pred,
    svr_pred = svr_val_pred
   
  )
# Step 3: fit model using component model predictions as explanatory variables
stacking_fit <- caret::train(
  form = hous_st ~  knn_pred +  ridge_pred + ann_pred + svr_pred,
  data = val_data,
  method = "glmnet", 
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = 2^seq(from = -4, to = 10)
    
  )
)
coef(stacking_fit$finalModel, stacking_fit$bestTune$lambda) %>% t() # transpose the result so it occupies less space
## 1 x 5 sparse Matrix of class "dgCMatrix"
##   (Intercept) knn_pred ridge_pred  ann_pred  svr_pred
## 1   -130.0761 0.216563  0.3378726 0.2389439 0.2921875
# Step 4 (both cross-validation and refitting to the full training set were already done
# to get ridge_fit, knn_fit, ann_fit and svr_fit above)
knn_test_pred <- predict(knn_fit, newdata = test_data)
ridge_test_pred <- predict(ridge_fit, newdata = test_data)
ann_test_pred = predict(ann_fit, newdata = test_data)
svr_test_pred = predict(svr_fit, newdata = test_data)


# Step 5: Assemble data frame of test set predictions from each component model
stacking_test_x = data.frame(
  ridge_pred = ridge_test_pred,
  knn_pred = knn_test_pred,
  ann_pred = ann_test_pred,
  svr_pred = svr_test_pred
)
stacking_fit
## glmnet 
## 
## 216 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 216, 216, 216, 216, 216, 216, ... 
## Resampling results across tuning parameters:
## 
##   lambda     RMSE      Rsquared   MAE      
##      0.0625  129.0097  0.8568090   96.07441
##      0.1250  129.0097  0.8568090   96.07441
##      0.2500  129.0097  0.8568090   96.07441
##      0.5000  129.0097  0.8568090   96.07441
##      1.0000  129.0097  0.8568090   96.07441
##      2.0000  129.0097  0.8568090   96.07441
##      4.0000  129.0097  0.8568090   96.07441
##      8.0000  129.0097  0.8568090   96.07441
##     16.0000  129.0097  0.8568090   96.07441
##     32.0000  129.0217  0.8568382   96.07210
##     64.0000  129.3976  0.8573605   95.97524
##    128.0000  131.4187  0.8576933   96.95202
##    256.0000  138.2664  0.8578426  101.44871
##    512.0000  155.8921  0.8578873  112.36394
##   1024.0000  188.4798  0.8578940  135.74148
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 16.
plot(stacking_fit)

# Step 6: Stacked model predictions
stacking_preds = predict(stacking_fit, stacking_test_x)
tail(as.data.frame(stacking_preds))
##     stacking_preds
## 97        1206.512
## 98        1216.285
## 99        1160.713
## 100       1022.141
## 101       1032.861
## 102       1818.448
# Calculate error rates
ensemble_mape = mape(test_data$hous_st, stacking_preds)
ensemble_percent_bias = percent_bias(test_data$hous_st, stacking_preds)

accuracy = data.frame(
  Models = c("Ridge", "KNN", "ANN", "SVR", "Ensemble"),
  MAPE = c(ridge_mape, knn_mape, ann_mape, svr_mape, ensemble_mape),
  Percent_Bias = c(ridge_percent_bias, knn_percent_bias, ann_percent_bias, svr_percent_bias, ensemble_percent_bias)
  
)
accuracy %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
Models        MAPE  Percent_Bias
Ridge    0.1144027    -0.0506975
KNN      0.2790333    -0.1814967
ANN      0.1031093    -0.0663802
SVR      0.0953354     0.0333680
Ensemble 0.0651865    -0.0057394

The stacked ensemble produces a MAPE of about 0.065, lower than that of every individual component model.

Comparing predictions from the ensemble model with the actual hous_st values:

#Plots comparing Predicted hous_st from ensemble model vs actual hous_st from test set
df_ensemble = data.frame(hous_st = test_data$hous_st, pred_hous_st_stack_ridge = stacking_preds)
 
# attach the dates from the last 102 rows of FinCrisis to the test-set results
# test set with 3 columns: Date, actual hous_st, and the ensemble predictions
series_ensemble = cbind(tail(FinCrisis[,1], 102) , df_ensemble) 
head(series_ensemble)
##         Date hous_st pred_hous_st_stack_ridge
## 1 2010-07-01     546                 487.3690
## 2 2010-08-01     599                 506.4065
## 3 2010-09-01     594                 579.3585
## 4 2010-10-01     543                 534.8373
## 5 2010-11-01     545                 510.2306
## 6 2010-12-01     539                 576.6878
# Melt the "series" data to long form so that both series (hous_st and pred_hous_st_stack_ridge) can be passed to one aesthetic:
series_ensemble_melt = reshape2::melt(series_ensemble, id.var = 'Date')
ggplot(series_ensemble_melt, aes(x = Date, y = value, colour = variable)) + geom_point() + xlab('Date') 

Predictions from individual component models with the actual hous_st values:

#Plots comparing Predicted hous_st from individual component model vs actual hous_st from test set
df_comp = data.frame(hous_st = test_data$hous_st, knn_preds, ridge_preds,  ann_preds, svr_preds)
 
# attach the dates from the last 102 rows of FinCrisis to the test-set results
# test set columns: Date, actual hous_st, and predictions from each component model plus the ensemble
series_comp = cbind(tail(FinCrisis[,1], 102) , df_comp, stacking_preds)
series_comp %>%
kable("html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
Date hous_st knn_preds ridge_preds ann_preds svr_preds stacking_preds
2010-07-01 546 593.5 592.0258 605.1938 493.7866 487.3690
2010-08-01 599 584.0 626.0560 605.1938 526.6318 506.4065
2010-09-01 594 559.5 748.1920 605.1938 653.2332 579.3585
2010-10-01 543 559.5 678.9041 605.1938 580.9826 534.8373
2010-11-01 545 559.5 642.1370 605.1938 539.2831 510.2306
2010-12-01 539 661.5 708.9837 605.1938 613.8317 576.6878
2011-01-01 630 620.0 638.8115 605.1938 540.7761 522.6453
2011-02-01 517 609.0 650.4377 605.1939 563.0942 530.7123
2011-03-01 600 625.0 700.3906 605.1940 609.1985 564.5262
2011-04-01 554 661.5 704.6946 605.2362 609.0198 573.8428
2011-05-01 561 661.5 713.7050 605.5362 616.6482 579.1878
2011-06-01 608 661.5 742.7555 605.3604 648.3127 598.2131
2011-07-01 623 661.5 776.0342 605.2009 687.6789 620.9213
2011-08-01 585 585.0 792.4190 605.1944 694.4731 611.8738
2011-09-01 650 585.0 808.6609 605.1995 705.2766 620.5194
2011-10-01 610 585.0 791.3653 857.8555 686.0553 669.4300
2011-11-01 711 661.5 826.4117 862.6014 722.1650 709.5231
2011-12-01 694 661.5 874.1928 862.6014 772.3487 740.3301
2012-01-01 723 661.5 813.7806 862.6014 705.4386 700.3682
2012-02-01 704 661.5 864.6327 862.6014 757.3326 732.7125
2012-03-01 695 661.5 865.4521 862.6014 762.5044 734.5005
2012-04-01 753 661.5 939.9862 862.6028 845.5302 783.9430
2012-05-01 708 661.5 926.9428 862.6014 824.0429 773.2573
2012-06-01 757 610.5 925.3699 862.6014 819.7723 760.4333
2012-07-01 740 610.5 984.8099 862.6014 884.1718 799.3332
2012-08-01 754 610.5 982.6803 862.6016 884.9667 798.8460
2012-09-01 847 661.5 952.3541 862.6018 852.6442 790.2001
2012-10-01 915 661.5 985.9349 862.6082 894.5085 813.7799
2012-11-01 833 661.5 957.0790 862.6016 857.7902 793.3001
2012-12-01 976 661.5 974.0883 862.6017 875.9364 804.3492
2013-01-01 888 661.5 1041.3182 862.8398 950.2901 848.8464
2013-02-01 962 661.5 1032.3162 866.0809 944.6333 844.9265
2013-03-01 1010 661.5 1090.2145 993.3530 1012.4336 914.7100
2013-04-01 835 661.5 1000.0634 862.6702 908.2104 822.5719
2013-05-01 930 661.5 973.0228 862.9436 883.0676 806.1545
2013-06-01 839 661.5 1018.6964 936.2836 935.7860 854.5142
2013-07-01 880 661.5 936.4299 1006.4158 862.7309 822.1305
2013-08-01 917 661.5 911.7910 1019.3600 838.2116 809.7343
2013-09-01 850 661.5 926.5522 1023.1875 854.4846 820.3911
2013-10-01 925 661.5 990.4888 1024.8873 919.4286 861.3755
2013-11-01 1100 1622.0 999.1392 1025.2343 934.9733 1076.9319
2013-12-01 1002 1582.0 921.1486 1025.2265 852.7169 1017.8823
2014-01-01 890 1622.0 975.0005 1025.2356 911.7179 1061.9815
2014-02-01 941 1622.0 1001.0735 1025.2351 939.5957 1078.9362
2014-03-01 970 1622.0 986.8962 1025.2285 926.1726 1070.2225
2014-04-01 1049 1582.0 915.7223 1025.2001 849.5095 1015.1054
2014-05-01 1008 1602.0 1012.6392 1025.2344 951.3262 1081.9401
2014-06-01 903 1583.5 896.0709 1024.4633 825.0677 1001.4730
2014-07-01 1082 1602.0 906.6354 1021.2084 837.8084 1011.9938
2014-08-01 983 1604.5 1009.9188 1025.2095 948.1572 1080.6304
2014-09-01 1020 1602.0 1064.8912 1025.2363 1011.0579 1117.0478
2014-10-01 1077 1564.5 1017.8838 1025.2299 956.1328 1076.9943
2014-11-01 1001 1564.5 948.4641 1023.3827 879.5823 1030.7308
2014-12-01 1068 1578.5 1053.1219 1025.1859 990.2144 1101.8799
2015-01-01 1094 1472.5 1105.6809 1025.0466 1040.4889 1111.3387
2015-02-01 888 1595.0 1049.8279 1021.0396 975.3056 1098.9933
2015-03-01 963 1595.0 934.6451 885.1060 852.9855 991.8553
2015-04-01 1203 1607.5 1123.8195 1056.6987 1065.1123 1161.4610
2015-05-01 1079 1604.5 1128.1672 1025.2353 1073.1220 1157.1026
2015-06-01 1185 1618.5 1028.0778 1025.2151 968.2269 1095.6632
2015-07-01 1133 1618.5 1073.5894 1025.2348 1017.2117 1125.3578
2015-08-01 1134 1619.5 1071.3946 1027.9051 1010.9413 1123.6387
2015-09-01 1212 1415.5 1051.5663 1152.4016 996.5563 1098.3049
2015-10-01 1064 1618.5 1031.9647 1081.0902 971.7602 1111.3598
2015-11-01 1171 1578.5 1056.7912 1183.8574 996.3089 1142.8139
2015-12-01 1155 1607.5 1096.1189 1278.6830 1036.5720 1196.8044
2016-01-01 1114 1540.0 1113.6473 1278.8215 1058.4558 1194.5360
2016-02-01 1202 1540.0 1106.2529 1277.9572 1046.2162 1188.2549
2016-03-01 1115 1540.0 1098.5662 1270.1216 1035.9612 1180.7891
2016-04-01 1173 1595.0 1069.0911 1232.5201 1000.1365 1163.2891
2016-05-01 1133 1595.0 1093.3973 1271.9213 1029.0504 1189.3644
2016-06-01 1183 1452.5 1178.8287 1279.6810 1123.4822 1216.8150
2016-07-01 1225 1565.5 1216.2870 1280.0957 1158.2109 1264.1892
2016-08-01 1161 1515.5 1152.6975 1266.2317 1089.9508 1208.6184
2016-09-01 1064 1515.5 1115.3258 1193.2265 1049.4746 1166.7207
2016-10-01 1327 1452.5 1149.1826 1274.9623 1089.5196 1195.7475
2016-11-01 1151 1452.5 1241.5062 1289.3787 1200.9242 1262.9368
2016-12-01 1280 1618.5 1116.6259 1278.8077 1065.0452 1214.4647
2017-01-01 1225 1608.0 1129.7097 1278.4982 1075.5985 1219.6210
2017-02-01 1289 1608.0 1194.8417 1279.3930 1147.3295 1262.8000
2017-03-01 1179 1608.0 1227.9956 1292.1060 1184.8864 1288.0131
2017-04-01 1165 1619.5 1124.1391 1277.8390 1068.9230 1218.1213
2017-05-01 1122 1528.5 1187.0681 1297.5728 1138.8926 1244.8355
2017-06-01 1225 1452.5 1249.0476 1594.1367 1205.8426 1339.7420
2017-07-01 1185 1377.0 1162.7122 1114.0834 1116.4570 1153.3980
2017-08-01 1172 1469.0 1080.7993 881.0112 1021.3955 1062.1787
2017-09-01 1158 1595.0 1123.9557 1227.6680 1063.9528 1199.3132
2017-10-01 1265 1452.5 1181.4216 1375.1890 1132.3831 1243.1129
2017-11-01 1303 1588.0 1203.6580 1336.2336 1147.1213 1274.9684
2017-12-01 1210 1452.5 1200.8528 1438.8483 1148.5592 1269.6156
2018-01-01 1334 1452.5 1208.4962 1407.7139 1157.8125 1267.4624
2018-02-01 1290 1452.5 1274.4365 1767.0213 1233.1146 1397.5985
2018-03-01 1327 1452.5 1231.0717 1698.3062 1181.5488 1351.4608
2018-04-01 1276 1452.5 1222.1804 1434.3382 1174.5363 1283.3342
2018-05-01 1329 1452.5 1227.4562 1454.4434 1179.0103 1291.2279
2018-06-01 1177 1452.5 1169.9136 1345.7183 1115.3431 1227.2039
2018-07-01 1184 1452.5 1137.0422 1350.3594 1078.7430 1206.5124
2018-08-01 1280 1452.5 1147.1613 1360.4075 1092.2717 1216.2853
2018-09-01 1237 1452.5 1070.4007 1341.2537 1006.5054 1160.7135
2018-10-01 1217 1356.0 956.1160 1157.6658 886.0567 1022.1407
2018-11-01 1256 1356.0 942.6102 1237.9392 872.7185 1032.8610
2018-12-01 1260 1770.5 1775.1186 1823.2856 1812.7832 1818.4482
# Melt the "series" data to long form so that all six series (actual hous_st plus the four component-model predictions and the ensemble predictions) can be passed to one aesthetic:
series_comp_melt = reshape2::melt(series_comp, id.var = 'Date')
ggplot(series_comp_melt, aes(x = Date, y = value, colour = variable)) + geom_point() + xlab('Date')