knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
The dataset “FinCrisis” consists of economic variables that affect the number of housing starts in the US.
library(readxl)
library(readr)
library(dplyr)
library(ggplot2)
library(GGally)
library(gridExtra)
library(purrr)
library(glmnet) # computes penalized linear regression models such as lasso and ridge
library(tidyverse)
library(caret)
library(data.table)
library("NeuralNetTools") # plots neural nets using nnet package
library(nnet)
library(forecast)
library(LiblineaR) # estimates predictive linear models for classification and regression
library(kableExtra) # generates customized tables
library(ggbiplot) # plots principal components and sets the graphical parameters of the plots
library(factoextra) # visualization for PCs
library(Metrics) # contains evaluation metric functions for machine learning models
FinCrisis = read_xlsx("FinCrisis.xlsx")
# converts the data frame to a monthly ts object
time_ser=ts(FinCrisis,frequency=12,start=c(1976,6), end=c(2018,12))
time_ser.df = as.data.frame(time_ser)
# Making a train and test split for the estimation
train_fin = slice(time_ser.df, 1:409)
test_fin = slice(time_ser.df, -(1:409))
cor_matrix = cor(FinCrisis[,unlist(lapply(FinCrisis, is.numeric))])
# ranks variable pairs from most positively to most negatively correlated
cor_matrix %>%
as.data.frame() %>%
mutate(var1 = rownames(.)) %>%
gather(var2, value, -var1) %>%
arrange(desc(value)) %>%
group_by(value) %>%
dplyr::filter(row_number() == 1)
## # A tibble: 56 x 3
## # Groups: value [56]
## var1 var2 value
## <chr> <chr> <dbl>
## 1 hous_st hous_st 1
## 2 sec_conL income 1.000
## 3 CPI income 0.999
## 4 CPI sec_conL 0.999
## 5 real_estL CPI 0.997
## 6 real_estL sec_conL 0.997
## 7 real_estL income 0.996
## 8 mortgR fed_fundsR 0.989
## 9 pvt_house_comp hous_st 0.981
## 10 real_estL yield_sp 0.794
## # … with 46 more rows
crossval_fold_inds <- caret::createFolds(
y = train_fin$hous_st,
k = 10
)
calc_mse <- function(observed, predicted) {
return(mean((observed - predicted)^2))
}
get_val_mse <- function(formula, train_fin, cross_fold_inds, val_fold){
val_train <- train_fin %>% slice(cross_fold_inds[[val_fold]])
train2_data <- train_fin %>% slice(-cross_fold_inds[[val_fold]])
fitmod <- lm(formula, data = train2_data)
val_preds <- predict(fitmod, val_train)
val_resids <- val_train$hous_st - val_preds
val_obs <- val_train$hous_st
calc_mse(observed = val_obs, predicted = val_preds)
}
mse_results <- expand.grid(
val_fold = 1:10
)
get_crossval_mse <- function(formula, train_fin, crossval_fold_inds){
xval_mse_results = mse_results %>%
mutate(
val_mse = mse_results%>%
pmap_dbl(
get_val_mse,
formula = formula,
cross_fold_inds = crossval_fold_inds,
train_fin = train_fin
))
mean(xval_mse_results$val_mse)
}
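For reference, the helper above is called with a candidate model formula, the training data, and the fold indices created earlier. A minimal usage sketch (the formula below is purely illustrative, not one of the candidate models from the report):

# illustrative call; CPI and income are example predictors only
get_crossval_mse(
  formula = hous_st ~ CPI + income,
  train_fin = train_fin,
  crossval_fold_inds = crossval_fold_inds
)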
As the above models had high test set MSEs, we suspected that this may be due to multicollinearity among the independent variables in the dataset, so we chose to analyze the data using PCA.
In PCA, every variable is centered at zero so that we can easily compare each principal component to the mean. Scaling to unit variance also removes problems arising from the differing scales of the variables. After obtaining the PCs, we sort the components from largest to smallest based on their standard deviations (the square roots of the eigenvalues).
The proportion of variance shows how much of the total variance is explained by each principal component. The components are always sorted by importance, so the most important components come first. In this dataset, PC1 accounts for more than 60% of the total variance in the data. The cumulative proportion shows how much variance is explained by the leading components taken together.
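In symbols, if $\lambda_1 \ge \dots \ge \lambda_p$ are the eigenvalues of the correlation matrix of the centered and scaled predictors, these quantities are (standard definitions, stated here for reference):

$$\mathrm{PVE}_k = \frac{\lambda_k}{\sum_{j=1}^{p}\lambda_j}, \qquad \text{cumulative PVE of the first } m \text{ components} = \frac{\sum_{k=1}^{m}\lambda_k}{\sum_{j=1}^{p}\lambda_j}.$$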
FinCrisis.pca <-
prcomp(time_ser.df[,-c(1,2)], # numeric explanatory variables only; the response is excluded
center = TRUE, # shift each variable to have mean zero
scale. = TRUE) # scale each variable to unit variance before the analysis
summary(FinCrisis.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4645 1.3724 1.0866 0.65482 0.49411 0.36791 0.1731
## Proportion of Variance 0.6074 0.1883 0.1181 0.04288 0.02441 0.01354 0.0030
## Cumulative Proportion 0.6074 0.7957 0.9138 0.95669 0.98111 0.99464 0.9976
## PC8 PC9 PC10
## Standard deviation 0.13519 0.06179 0.03869
## Proportion of Variance 0.00183 0.00038 0.00015
## Cumulative Proportion 0.99947 0.99985 1.00000
# Eigenvalues
eig.val <- get_eigenvalue(FinCrisis.pca)
eig.val %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
| | eigenvalue | variance.percent | cumulative.variance.percent |
|---|---|---|---|
| Dim.1 | 6.0739086 | 60.7390863 | 60.73909 |
| Dim.2 | 1.8835272 | 18.8352721 | 79.57436 |
| Dim.3 | 1.1807085 | 11.8070849 | 91.38144 |
| Dim.4 | 0.4287866 | 4.2878663 | 95.66931 |
| Dim.5 | 0.2441449 | 2.4414486 | 98.11076 |
| Dim.6 | 0.1353575 | 1.3535751 | 99.46433 |
| Dim.7 | 0.0299749 | 0.2997494 | 99.76408 |
| Dim.8 | 0.0182776 | 0.1827762 | 99.94686 |
| Dim.9 | 0.0038175 | 0.0381750 | 99.98503 |
| Dim.10 | 0.0014966 | 0.0149660 | 100.00000 |
A data matrix X of dimension n x p can have up to min(n-1, p) principal components. The goal of PCA is to significantly reduce the number of variables used, so that a small number of principal components explains as much of the variability as possible.
A common technique for determining the number of PCs to retain is to examine the scree plot below and look for the "elbow point", beyond which the proportion of variance explained (PVE) drops off sharply.
screeplot(FinCrisis.pca, type = "l", npcs = 10, main = "Screeplot of the first 10 PCs")
abline(h = 1, col="red", lty=5)
legend("topright", legend=c("Eigenvalue = 1"),
col=c("red"), lty=5, cex=0.6)
An eigenvalue < 1 would mean that the component explains less variance than a single standardized explanatory variable. We notice that the first 3 components each have an eigenvalue > 1 and together explain almost 91% of the variance. Alternatively, the first 6 components explain 99.46 percent of the variance. Thus, we have reduced the dimensionality from 10 explanatory variables to 6 components while only "losing" about 0.54% of the variance.
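As a quick check, the cumulative proportion of variance can be recomputed directly from the component standard deviations returned by prcomp() (a small verification sketch using the FinCrisis.pca object above):

# squared standard deviations are the eigenvalues; their running share is the cumulative PVE
cumsum(FinCrisis.pca$sdev^2) / sum(FinCrisis.pca$sdev^2)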
# Visualize eigenvalues (scree plot). Show the percentage of variances explained by each principal component.
fviz_eig(FinCrisis.pca)
#selecting the first 6 principal components to use in the model
x_pca = FinCrisis.pca$x[, 1:6]
x_pca %>% head() %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 |
|---|---|---|---|---|---|
| -2.532253 | 0.9589985 | -0.6856539 | -0.4194933 | 0.6971565 | -0.9574271 |
| -2.417471 | 1.1089015 | -0.9856733 | -0.2229814 | 0.6730305 | -0.8642424 |
| -2.397437 | 1.0653462 | -1.1664993 | -0.3159122 | 0.5733262 | -0.7703319 |
| -2.369972 | 0.9977712 | -1.1642373 | -0.3433385 | 0.6511079 | -0.7066144 |
| -2.267717 | 1.0412444 | -1.4965310 | -0.3725687 | 0.5711167 | -0.6006079 |
| -2.265047 | 1.0854381 | -1.4703869 | -0.5285890 | 0.5150894 | -0.6393254 |
# Biplot for Principal Components using ggplot2
ggbiplot(FinCrisis.pca)
The original variables appear as arrows originating from the center of the plot. Yield spread, real estate loans, and CPI contribute most to PC1, with higher values of those variables moving observations to the right on this plot.
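One way to verify which variables load most heavily on PC1 is to inspect the loadings (rotation) matrix of the prcomp object directly (a quick sketch):

# loadings of each original variable on PC1, ordered by absolute magnitude
sort(abs(FinCrisis.pca$rotation[, "PC1"]), decreasing = TRUE)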
# combine PC1-PC6 as explanatory variables with hous_st as response variable in a new dataset:
new_data = cbind(time_ser.df, as.data.frame(x_pca))
new_data = new_data[, -c(1, 3:12)]
# Making a train and test split for the estimation
train_data <- slice(new_data, 1:409)
test_data <- slice(new_data, -(1:409))
We fit four component models on the principal components: K-Nearest Neighbors, Ridge/LASSO, Artificial Neural Networks, and Support Vector Machines.
# Creating time slices of the training set
timeSlices = createTimeSlices(train_data$hous_st, # vector of outcomes in chronological order (used in place of plain cv)
initialWindow = 180, # initial number of consecutive values in each training sample: 15 years times 12 months
horizon = 12, # number of consecutive values in each test sample: 12 months
fixedWindow = FALSE) # logical; if FALSE, all training samples start at 1
train_fold_inds <- timeSlices$train[seq(from = 1, to = 211, by = 12)]
val_fold_inds <- timeSlices$test[seq(from = 1, to = 211, by = 12)]
knn_fit <- caret::train(
form = hous_st ~ .,
data = train_data,
method = "knn",
preProcess = c("scale", "center"),
trControl = trainControl(method = "cv",
number = 10,
index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
returnResamp = "all",
savePredictions = TRUE),
tuneGrid = data.frame(k = 1:10)
)
mean(abs(test_data$hous_st - predict(knn_fit, newdata = test_data))) # MAE in test set
## [1] 287.4804
#Output of kNN fit
knn_fit
## k-Nearest Neighbors
##
## 409 samples
## 6 predictor
##
## Pre-processing: scaled (6), centered (6)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 160.8808 0.1875936 137.2130
## 2 151.9892 0.1520998 131.1968
## 3 154.6955 0.1541260 132.6327
## 4 160.9234 0.1306035 139.3553
## 5 161.3336 0.2314432 139.9667
## 6 160.6328 0.2051346 139.4336
## 7 161.7521 0.2503193 141.7870
## 8 163.1922 0.2509895 144.2205
## 9 164.3352 0.2324320 146.3585
## 10 167.7617 0.2027130 149.1916
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 2.
#Plotting yields Number of Neighbours Vs accuracy (based on repeated cross validation)
plot(knn_fit)
When predictors in a regression are strongly correlated, the estimated coefficient of each variable depends on which other predictors are included in the model. A coefficient therefore does not reflect the isolated effect of that predictor on the response; it only captures a partial effect conditional on the other predictors. Ridge regression introduces a small amount of bias in order to alleviate multicollinearity.
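For reference, the ridge estimator (standard textbook form, stated here rather than taken from the report) minimizes the penalized residual sum of squares

$$\hat{\beta}^{\text{ridge}} = \arg\min_{\beta}\; \sum_{i=1}^{n}\Big(y_i - \beta_0 - \sum_{j=1}^{p}\beta_j x_{ij}\Big)^2 + \lambda\sum_{j=1}^{p}\beta_j^2,$$

where larger values of the tuning parameter $\lambda$ shrink the coefficients more strongly toward zero; $\lambda$ is the quantity tuned over the grid in the chunk below.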
# glmnet() runs the model many times for different values of lambda.
ridge_fit <- caret::train(
form = hous_st ~ .,
data = train_data,
method = "glmnet", # method for fit
tuneGrid = expand.grid(
alpha = 0,
lambda = 2^seq(from = -4, to = 10) # plug in different values of lambda to see how test RMSE changes
),
trControl = trainControl(method = "cv", # evaluate method performance via cross-validation
number = 10, # number of folds for cross-validation
index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
returnResamp = "all", # return information from cross-validation
savePredictions = TRUE) # return validation set predictions from cross-validation
)
mean(abs(test_data$hous_st - predict(ridge_fit, newdata = test_data))) # MAE in test set
## [1] 99.16544
ridge_fit
## glmnet
##
## 409 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.0625 125.1038 0.2564421 105.6468
## 0.1250 125.1038 0.2564421 105.6468
## 0.2500 125.1038 0.2564421 105.6468
## 0.5000 125.1038 0.2564421 105.6468
## 1.0000 125.1038 0.2564421 105.6468
## 2.0000 125.1038 0.2564421 105.6468
## 4.0000 125.1038 0.2564421 105.6468
## 8.0000 125.1038 0.2564421 105.6468
## 16.0000 124.6471 0.2566134 105.2336
## 32.0000 123.0606 0.2547423 104.0415
## 64.0000 130.8146 0.2522117 113.9690
## 128.0000 150.6686 0.2493908 135.2232
## 256.0000 179.9229 0.2451270 163.4515
## 512.0000 210.8423 0.2400159 193.8696
## 1024.0000 235.9032 0.2353817 220.1529
##
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 32.
plot(ridge_fit)
set.seed(123)
svr_fit <- caret::train(
form = hous_st ~ .,
data = train_data,
method = "svmLinear3", # method for fit
tuneGrid = expand.grid(
cost = 2^(2:5), # inverse of the regularization constant: the cost of a margin violation; a small cost gives wide margins and many support vectors
Loss = c("L1", "L2") # tune over both the L1 and L2 loss functions
),
trControl = trainControl(method = "cv", # evaluate method performance via cross-validation
number = 10, # number of folds for cross-validation
index = train_fold_inds, # I'm specifying which folds to use, for consistency across methods
indexOut = val_fold_inds, # I'm specifying which folds to use, for consistency across methods
returnResamp = "all", # return information from cross-validation
savePredictions = TRUE) # return validation set predictions from cross-validation
)
mean(abs(test_data$hous_st - predict(svr_fit, newdata = test_data))) # MAE in test set
## [1] 96.70616
svr_fit
## L2 Regularized Support Vector Machine (dual) with Linear Kernel
##
## 409 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 192, 204, 216, 228, 240, ...
## Resampling results across tuning parameters:
##
## cost Loss RMSE Rsquared MAE
## 4 L2 138.6183 0.2601965 121.0898
## 4 L1 805.6740 0.2492696 800.2012
## 8 L2 137.9572 0.2609627 120.4017
## 8 L1 511.9761 0.2510944 503.2552
## 16 L2 137.1520 0.2619170 119.4300
## 16 L1 319.7882 0.2456201 305.6865
## 32 L2 141.2740 0.2605979 122.7559
## 32 L1 209.9045 0.2560926 193.3992
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were cost = 16 and Loss = L2.
plot(svr_fit)
Prior to constructing the regression ANN, we must first split the new_data dataset into training and test sets. Then we scale only the explanatory variables. Among the possible activation functions, we use the logistic function in our model. Accordingly, each feature of new_data is mapped onto the [0, 1] interval.
# Scale the Data:
# The scale01() function maps each observation onto the [0, 1] interval; it is applied to selected columns via dplyr's mutate_at()
scale01 = function(x){
(x - min(x)) / (max(x) - min(x))
}
new_data_preds_01 = new_data %>% mutate_at(c("PC1", "PC2", "PC3", "PC4"), scale01)
head(new_data_preds_01)
## hous_st PC1 PC2 PC3 PC4 PC5 PC6
## 1 1495 0.2853374 0.5896654 0.24103450 0.4723815 0.6971565 -0.9574271
## 2 1401 0.2982730 0.6150966 0.18637345 0.5256437 0.6730305 -0.8642424
## 3 1550 0.3005307 0.6077074 0.15342846 0.5004559 0.5733262 -0.7703319
## 4 1720 0.3036260 0.5962433 0.15384057 0.4930223 0.6511079 -0.7066144
## 5 1629 0.3151497 0.6036185 0.09329942 0.4850999 0.5711167 -0.6006079
## 6 1641 0.3154506 0.6111160 0.09806266 0.4428124 0.5150894 -0.6393254
# Split new_data_ann into test and train sets
train_set_preds_01 = slice(new_data_preds_01, 1:407)
test_set_preds_01 = slice(new_data_preds_01, -(1:407))
# caret::createDataPartition() does stratified random sampling so that the training and test sets
# have similar distributions of the response; because this is a time series we keep the
# chronological split made above, so train_inds is not used in what follows.
train_inds = caret::createDataPartition(new_data_preds_01$hous_st, p = 0.8)
# Creating time slices of the training set after scaling
timeSlices = createTimeSlices(train_set_preds_01$hous_st, # vector of outcomes in chronological order (used in place of plain cv)
initialWindow = 180, # initial number of consecutive values in each training sample: 15 years times 12 months
horizon = 12, # number of consecutive values in each test sample: 12 months
fixedWindow = FALSE) # logical; if FALSE, all training samples start at 1
train_fold_inds_01 <- timeSlices$train[seq(from = 1, to = 211, by = 12)]
val_fold_inds_01 <- timeSlices$test[seq(from = 1, to = 211, by = 12)]
The time-slices method creates 211 folds. Fitting models on 211 resampled datasets is computationally expensive, so we reduce that number: instead of advancing the window by one month at a time, we keep only every 12th slice. This leaves 211/12 ≈ 17.6, i.e. 18 cross-validation folds (18 time slices).
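The chunk that fits the neural network itself (the ann_fit object used below) is not shown in this section. The following is a minimal sketch of how it could be fit with caret's "nnet" method: the training data and fold indices are assumed to match those used for the other component models (so that the validation-fold predictions line up in the stacking step later), caret's "range" pre-processing stands in for the manual [0, 1] scaling discussed above, and the tuning grid and iteration limit are illustrative rather than the values used in the original analysis.

ann_fit <- caret::train(
  form = hous_st ~ .,
  data = train_data,
  method = "nnet",                 # single-hidden-layer network from the nnet package
  linout = TRUE,                   # linear output unit, since this is a regression problem
  trace = FALSE,                   # suppress nnet's iteration log
  maxit = 500,                     # illustrative iteration limit
  preProcess = c("range"),         # map predictors onto [0, 1], as with scale01() above
  tuneGrid = expand.grid(size = c(4, 6, 8),        # number of hidden nodes (illustrative grid)
                         decay = c(0.01, 0.1, 1)), # weight decay (illustrative grid)
  trControl = trainControl(method = "cv",
                           number = 10,
                           index = train_fold_inds,   # same time-slice folds as the other models
                           indexOut = val_fold_inds,
                           returnResamp = "all",
                           savePredictions = TRUE)
)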
Mean Absolute Percentage Error (MAPE): computes the average absolute percent difference between the observed and predicted values, i.e. the average of abs(actual - predicted) / abs(actual).
Percent Bias: computes the average amount that actual is greater than predicted as a percentage of the absolute value of actual. Its value is close to 0 if the model is unbiased. Percent bias is the average of (actual - predicted)/abs(actual).
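Writing the two metrics out by hand makes the definitions concrete; the helpers below are hypothetical equivalents of Metrics::mape() and Metrics::percent_bias() for the strictly positive housing-starts series (they are not used elsewhere in the report):

# hypothetical hand-rolled versions of the two evaluation metrics
mape_manual <- function(actual, predicted) {
  mean(abs((actual - predicted) / actual))
}
percent_bias_manual <- function(actual, predicted) {
  mean((actual - predicted) / abs(actual))
}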
#Prediction from ridge model
ridge_preds <- predict(ridge_fit, newdata = test_data)
ridge_mape = mape(test_data$hous_st, ridge_preds)
ridge_percent_bias = percent_bias(test_data$hous_st, ridge_preds)
#Prediction from KNN model
knn_preds <- predict(knn_fit, newdata = test_data)
knn_mape = mape(test_data$hous_st, knn_preds)
knn_percent_bias = percent_bias(test_data$hous_st, knn_preds)
#Prediction from ANN model
ann_preds <- predict(ann_fit, newdata = test_data)
ann_mape = mape(test_data$hous_st, ann_preds)
ann_percent_bias = percent_bias(test_data$hous_st, ann_preds)
#Prediction from SVR model
svr_preds <- predict(svr_fit, newdata = test_data)
svr_mape = mape(test_data$hous_st, svr_preds)
svr_percent_bias = percent_bias(test_data$hous_st, svr_preds)
(accuracy = data.frame(
Models = c("Ridge", "KNN", "ANN", "SVR"),
MAPE = c(ridge_mape, knn_mape, ann_mape, svr_mape),
Percent_Bias = c(ridge_percent_bias, knn_percent_bias, ann_percent_bias, svr_percent_bias)
))
## Models MAPE Percent_Bias
## 1 Ridge 0.11440275 -0.05069752
## 2 KNN 0.27903330 -0.18149670
## 3 ANN 0.10310932 -0.06638015
## 4 SVR 0.09533544 0.03336799
The plotnet function plots a neural interpretation diagram (NID). The black lines indicate positive weights between layers and grey lines indicate negative weights.
The thickness of each line increases with the magnitude of the corresponding weight. The first layer consists of the input variables, whose nodes are labelled I1 through I6 for the six explanatory variables.
The plot below contains one hidden layer with eight hidden nodes, labelled H1 through H8. The last layer is the output layer, labelled O1. The plot also shows the bias nodes, which are connected to the hidden and output layers.
# To view a diagram of the NN1 use the plotnet() function.
plotnet(ann_fit, x_names = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), y_names = "hous_st")
coef(ann_fit$finalModel)
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1
## 6.9973094 2.7541821 1.9130187 0.1593454 2.6374881 -0.7964275
## i6->h1 b->h2 i1->h2 i2->h2 i3->h2 i4->h2
## -1.0175347 6.6981651 3.0004355 2.4396672 0.8289042 2.6990313
## i5->h2 i6->h2 b->h3 i1->h3 i2->h3 i3->h3
## -0.2761348 -1.7851033 60.6841860 -30.0739451 -30.8723839 -67.0854827
## i4->h3 i5->h3 i6->h3 b->h4 i1->h4 i2->h4
## -41.9800897 -28.9049180 -8.6273998 35.2036697 9.3985052 10.3136393
## i3->h4 i4->h4 i5->h4 i6->h4 b->h5 i1->h5
## 6.6504071 14.3804396 2.4946905 -20.5704051 17.0209935 -8.9758188
## i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 b->h6
## -13.2607918 0.8004982 -6.0931299 -1.1598040 -2.6605575 25.1301763
## i1->h6 i2->h6 i3->h6 i4->h6 i5->h6 i6->h6
## -1.8421162 -12.1820600 -21.3783462 -21.6272392 4.7187724 -2.5071266
## b->h7 i1->h7 i2->h7 i3->h7 i4->h7 i5->h7
## 29.6180954 -0.3854124 -21.5384474 -64.6874797 29.0406152 -4.1728956
## i6->h7 b->h8 i1->h8 i2->h8 i3->h8 i4->h8
## 16.9318749 5.3045046 -7.3714461 -54.5435554 18.5750584 -0.3864044
## i5->h8 i6->h8 b->o h1->o h2->o h3->o
## -5.6130632 -28.4729414 180.5661546 87.3308392 168.1559459 298.7879217
## h4->o h5->o h6->o h7->o h8->o
## 169.1409014 503.7799004 162.6350091 257.4076078 253.5629605
# Step 1: Validation-fold predictions from component models
#Ridge
ridge_val_pred <- ridge_fit$pred %>%
dplyr::filter(lambda == ridge_fit$bestTune$lambda) %>%
arrange(rowIndex) %>%
pull(pred)
# KNN
knn_val_pred <- knn_fit$pred %>%
dplyr::filter(k == knn_fit$bestTune$k) %>%
arrange(rowIndex) %>%
pull(pred)
# ANN
ann_val_pred = ann_fit$pred %>%
dplyr::filter(decay == ann_fit$bestTune$decay, size == ann_fit$bestTune$size) %>%
arrange(rowIndex) %>%
pull(pred)
# SVR
svr_val_pred = svr_fit$pred %>%
dplyr::filter(cost == svr_fit$bestTune$cost, Loss == svr_fit$bestTune$Loss) %>%
arrange(rowIndex) %>%
pull(pred)
# Step 2: data set with validation-set component model predictions as explanatory variables
val_data <- train_data %>%
slice(val_fold_inds %>% unlist()) %>%
mutate(
ridge_pred = ridge_val_pred,
knn_pred = knn_val_pred,
ann_pred = ann_val_pred,
svr_pred = svr_val_pred
)
# Step 3: fit model using component model predictions as explanatory variables
stacking_fit <- caret::train(
form = hous_st ~ knn_pred + ridge_pred + ann_pred + svr_pred,
data = val_data,
method = "glmnet",
tuneGrid = expand.grid(
alpha = 0,
lambda = 2^seq(from = -4, to = 10)
)
)
coef(stacking_fit$finalModel, stacking_fit$bestTune$lambda) %>% t() # Transposes the result to occupy less space
## 1 x 5 sparse Matrix of class "dgCMatrix"
## (Intercept) knn_pred ridge_pred ann_pred svr_pred
## 1 -130.0761 0.216563 0.3378726 0.2389439 0.2921875
# Step 4 (both cross-validation and refitting to the full training set were already done
# to get ridge_fit, knn_fit, ann_fit and svr_fit above)
knn_test_pred <- predict(knn_fit, newdata = test_data)
ridge_test_pred <- predict(ridge_fit, newdata = test_data)
ann_test_pred = predict(ann_fit, newdata = test_data)
svr_test_pred = predict(svr_fit, newdata = test_data)
# Step 5: Assemble data frame of test set predictions from each component model
stacking_test_x = data.frame(
ridge_pred = ridge_test_pred,
knn_pred = knn_test_pred,
ann_pred = ann_test_pred,
svr_pred = svr_test_pred
)
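Because the meta-model is linear, the stacked prediction for each test-set month is just a weighted combination of the component-model predictions. As a hand-computed check (using the coefficient values printed in Step 3 above), the following should closely match the predict() call in Step 6:

# illustrative verification using the ridge coefficients shown above
manual_stack_pred <- -130.0761 +
  0.2165630 * stacking_test_x$knn_pred +
  0.3378726 * stacking_test_x$ridge_pred +
  0.2389439 * stacking_test_x$ann_pred +
  0.2921875 * stacking_test_x$svr_pred
head(manual_stack_pred)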
stacking_fit
## glmnet
##
## 216 samples
## 4 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 216, 216, 216, 216, 216, 216, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.0625 129.0097 0.8568090 96.07441
## 0.1250 129.0097 0.8568090 96.07441
## 0.2500 129.0097 0.8568090 96.07441
## 0.5000 129.0097 0.8568090 96.07441
## 1.0000 129.0097 0.8568090 96.07441
## 2.0000 129.0097 0.8568090 96.07441
## 4.0000 129.0097 0.8568090 96.07441
## 8.0000 129.0097 0.8568090 96.07441
## 16.0000 129.0097 0.8568090 96.07441
## 32.0000 129.0217 0.8568382 96.07210
## 64.0000 129.3976 0.8573605 95.97524
## 128.0000 131.4187 0.8576933 96.95202
## 256.0000 138.2664 0.8578426 101.44871
## 512.0000 155.8921 0.8578873 112.36394
## 1024.0000 188.4798 0.8578940 135.74148
##
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 16.
plot(stacking_fit)
# Step 6: Stacked model predictions
stacking_preds = predict(stacking_fit, stacking_test_x)
head(as.data.frame(stacking_preds))
## stacking_preds
## 1 487.3690
## 2 506.4065
## 3 579.3585
## 4 534.8373
## 5 510.2306
## 6 576.6878
# Calculate error rates
ensemble_mape = mape(test_data$hous_st, stacking_preds)
ensemble_percent_bias = percent_bias(test_data$hous_st, stacking_preds)
accuracy = data.frame(
Models = c("Ridge", "KNN", "ANN", "SVR", "Ensemble"),
MAPE = c(ridge_mape, knn_mape, ann_mape, svr_mape, ensemble_mape),
Percent_Bias = c(ridge_percent_bias, knn_percent_bias, ann_percent_bias, svr_percent_bias, ensemble_percent_bias)
)
accuracy %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
| Models | MAPE | Percent_Bias |
|---|---|---|
| Ridge | 0.1144027 | -0.0506975 |
| KNN | 0.2790333 | -0.1814967 |
| ANN | 0.1031093 | -0.0663802 |
| SVR | 0.0953354 | 0.0333680 |
| Ensemble | 0.0651865 | -0.0057394 |
The stacked ensemble produces a MAPE of 0.0651, lower than that of all other models.
#Plots comparing Predicted hous_st from ensemble model vs actual hous_st from test set
df_ensemble = data.frame(hous_st = test_data$hous_st, pred_hous_st_stack_ridge = stacking_preds)
# attach the dates from the last 102 rows of the FinCrisis dataset (the test period)
# test-set data frame with 3 columns: date, actual hous_st and predicted hous_st
series_ensemble = cbind(tail(FinCrisis[,1], 102) , df_ensemble)
tail(series_ensemble, 12) %>% select(3)
## pred_hous_st_stack_ridge
## 91 1267.462
## 92 1397.599
## 93 1351.461
## 94 1283.334
## 95 1291.228
## 96 1227.204
## 97 1206.512
## 98 1216.285
## 99 1160.713
## 100 1022.141
## 101 1032.861
## 102 1818.448
# Melt the series data to long form so that both series (actual hous_st and pred_hous_st_stack_ridge) can be mapped to one aesthetic:
series_ensemble_melt = reshape2::melt(series_ensemble, id.var = 'Date')
ggplot(series_ensemble_melt, aes(x = Date, y = value, colour = variable)) + geom_point() + xlab('Date')
#Plots comparing Predicted hous_st from individual component model vs actual hous_st from test set
df_comp = data.frame(hous_st = test_data$hous_st, knn_preds, ridge_preds, ann_preds, svr_preds, series_ensemble$pred_hous_st_stack_ridge)
# attach the dates from the last 102 rows of the FinCrisis dataset (the test period)
# test-set data frame with the date, actual hous_st, and predictions from each component model and the ensemble
series_comp = cbind(tail(FinCrisis[,1], 102) , df_comp)
series_forecast = series_comp %>% tail(12) %>%
transmute(knn_forecast = knn_preds, ridge_forecast = ridge_preds, ann_forecast = ann_preds, svr_forecast = svr_preds, ensemble_forecast = series_ensemble.pred_hous_st_stack_ridge )
series_forecast_ts = ts(series_forecast, start=c(2019,1), end=c(2019,12), frequency = 12)
autoplot(series_forecast_ts) + xlab("Year") + ylab("Housing Starts (in 1000s of units)") + ggtitle("Forecasts of US Housing Starts in 2019")
# Melt the series data to long form so that all 6 series (actual hous_st plus predictions from the 4 component models and the ensemble) can be mapped to one aesthetic:
series_comp_melt = reshape2::melt(series_comp, id.var = 'Date')
ggplot(series_comp_melt, aes(x = Date, y = value, colour = variable)) + geom_point() + xlab('Date')