Data Preprocessing

Load Libraries

library(keras)
library(tensorflow)
library(ggplot2)
library(stats)
library(readr)
library(dplyr)
library(forecast)
library(Metrics)

Import Data

#Read Mortgage Data (source: https://fred.stlouisfed.org/graph/?g=NUh)
MORTGAGE30US_3_ <- read_csv("~/Downloads/MORTGAGE30US (3).csv")

Inspect Data

MORTGAGE30US_3_$DATE <- as.Date(MORTGAGE30US_3_$DATE, format = "%Y-%m-%d")

ggplot(MORTGAGE30US_3_, aes(x=DATE, y = MORTGAGE30US)) + geom_line()

Feature Engineering

Transform to Stationarity

We take the difference between consecutive values in the series. This transformation, commonly known as differencing, removes time-dependent components such as trend. The differenced series is generally easier to model than the raw values, and the resulting model often has higher predictive power.
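As a quick toy illustration (made-up values, not from the mortgage series), diff() returns the change between consecutive observations:

#Toy illustration: diff() of (3.50, 3.60, 3.45) is (0.10, -0.15)
diff(c(3.50, 3.60, 3.45))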

#Drop the first date, since differencing loses the first observation
dateID <- MORTGAGE30US_3_$DATE[2:nrow(MORTGAGE30US_3_)]

#First-order difference (named diffed to avoid masking base::diff)
diffed <- diff(MORTGAGE30US_3_$MORTGAGE30US, differences = 1)

Create Lagged Dataset

An LSTM expects the data in a supervised learning format, that is, with a target variable Y and a predictor X. To achieve this, we transform the series by lagging it, so that the value at time (t−k) is the input and the value at time t is the output, giving a k-step lagged dataset.
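A toy example (made-up values) shows how dplyr's lag() produces this pairing:

#Toy illustration of the lag pairing used below
d <- c(0.10, -0.15, 0.00)
cbind(x = lag(d, 1), y = d)
#Row i pairs x = d[i - 1] with y = d[i]; the first x is NA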

supervised <- as.data.frame(cbind(lag(diffed, 1), diffed))
supervised[is.na(supervised)] <- 0

Split into training and test

Unlike most analyses, where training and test sets are randomly sampled, with time series data the order of the observations matters. The following code assigns the first 85% of the series to the training set and the remaining 15% to the test set.

n_ <- round(nrow(supervised) * .85, digits = 0)
train <- supervised[1:n_, ]
test <- supervised[(n_ + 1):nrow(supervised), ]
train_id <- dateID[1:n_]
test_id <- dateID[(n_ + 1):length(dateID)]

Normalization

Just like in any other neural network model, we rescale the input data X to the range of the activation function. The default activation function for the LSTM layer is the hyperbolic tangent (tanh), whose range is [-1, 1]. The code below performs this transformation. Note that the min and max values of the training set are the scaling coefficients used to scale the training set, the test set, and the predicted values. This ensures that the range of the test data does not influence the model.

Let’s create a function to help.

scale_data <- function(train, test, feature_range = c(0, 1)) {
  x = train
  fr_min = feature_range[1]
  fr_max = feature_range[2]
  #Min-max standardize both sets using the training min/max only
  std_train = (x - min(x)) / (max(x) - min(x))
  std_test = (test - min(x)) / (max(x) - min(x))
  
  #Map the standardized values onto the requested feature range
  scaled_train = std_train * (fr_max - fr_min) + fr_min
  scaled_test = std_test * (fr_max - fr_min) + fr_min

  #Keep the input structure (columns are extracted downstream) and
  #return the scaling coefficients needed to invert the transformation
  return(list(scaled_train = scaled_train,
              scaled_test = scaled_test,
              scaler = c(min = min(x), max = max(x))))
}


#Function to reverse scale data for prediction
reverse_scaling <- function(scaled, scaler, feature_range = c(0, 1)) {
  min = scaler[1]
  max = scaler[2]
  t = length(scaled)
  mins = feature_range[1]
  maxs = feature_range[2]
  inverted_dfs = numeric(t)
  
  for (i in 1:t) {
    #Undo the feature-range mapping, then the min-max standardization
    X = (scaled[i] - mins) / (maxs - mins)
    rawValues = X * (max - min) + min
    inverted_dfs[i] <- rawValues
  }
  return(inverted_dfs)
}
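A quick round-trip sanity check (toy values, purely illustrative) confirms that reverse_scaling() inverts scale_data():

#Scaling then reverse-scaling should recover the original test values
s <- scale_data(c(1, 2, 3, 4), c(2.5, 5), feature_range = c(-1, 1))
reverse_scaling(s$scaled_test, s$scaler, c(-1, 1))
#Returns 2.5 and 5; note that 5 lies outside the training range, so its
#scaled value falls outside [-1, 1] before inversion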

Scale the data and retrieve the train/test sets from the returned list.

Scaled <- scale_data(train, test, c(-1,1))

x_train <- Scaled$scaled_train[,1]
y_train <- Scaled$scaled_train[,2]

x_test <- Scaled$scaled_test[,1]
y_test <- Scaled$scaled_test[,2]

Modeling

LSTM

For the ARIMA and ETS models, parameterization is straightforward: all we need to do is convert the series to a ts() object, and auto.arima() will find the optimal (p, d, q) order while ets() selects the error, trend, and seasonal components automatically.

The LSTM requires more extensive parameterization. Define the model parameters:

#Reshape input to the 3D array keras expects: (samples, timesteps, features)
dim(x_train) <- c(length(x_train), 1, 1)
X_shape2 <- dim(x_train)[2]
X_shape3 <- dim(x_train)[3]
batch_size <- 1
units <- 100
n_timesteps <- 12
n_predictions <- n_timesteps

#Helper: slide a window of length overall_timesteps over the series,
#one window per row
build_matrix <- function(tseries, overall_timesteps) {
  t(sapply(1:(length(tseries) - overall_timesteps + 1), function(x) 
    tseries[x:(x + overall_timesteps - 1)]))
}

#Helper: add a trailing feature dimension for keras
reshape_X_3d <- function(X) {
  dim(X) <- c(dim(X)[1], dim(X)[2], 1)
  X
}
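To make the helper concrete, here is build_matrix() on a toy series (purely illustrative; these helpers are not exercised in the single-timestep setup used below):

#Three overlapping windows of length 3, one per row
build_matrix(1:5, 3)
#     [,1] [,2] [,3]
#[1,]    1    2    3
#[2,]    2    3    4
#[3,]    3    4    5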

#Stateful LSTM: cell state carries over between consecutive batches,
#which suits one-step-ahead forecasting with batch_size = 1
model <- keras_model_sequential()
model %>% 
  layer_lstm(units, batch_input_shape = c(batch_size, X_shape2, X_shape3), stateful = TRUE) %>%
  layer_dense(units = 1)

Compile Model

model %>% 
  compile(loss = 'mean_squared_error',
          optimizer = optimizer_adam(lr = 0.03, decay = 1e-6),
          #Accuracy is not meaningful for regression; track MAE instead
          metrics = c('mean_absolute_error'))

Perform Modeling

We set the argument shuffle = FALSE to avoid shuffling the training set and to preserve the dependency between x(i) and x(i+t). A stateful LSTM also requires resetting the network state after each epoch. To achieve this, we run a loop over epochs, fitting the model for a single epoch at a time and then resetting the states with reset_states(), as sketched below.

library(plotly)
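Below is a minimal sketch of the fitting and evaluation loop described above. The epoch count (epochs <- 50), the prediction vector name lstm_pred, and the undifferencing index arithmetic are assumptions; the resulting lstmMAE is the value compared in the Post-modeling section.

#Fit one epoch at a time, resetting the cell state in between
epochs <- 50  #assumed epoch count
for (i in 1:epochs) {
  model %>% fit(x_train, y_train,
                epochs = 1,
                batch_size = batch_size,
                verbose = 1,
                shuffle = FALSE)
  model %>% reset_states()
}

#One-step-ahead predictions on the test set
scaler <- Scaled$scaler
L <- length(x_test)
lstm_pred <- numeric(L)
for (i in 1:L) {
  X <- x_test[i]
  dim(X) <- c(1, 1, 1)
  yhat <- model %>% predict(X, batch_size = batch_size)
  #Undo the [-1, 1] scaling, then the differencing by adding back the
  #previous observed level (index arithmetic assumed from the split above)
  yhat <- reverse_scaling(yhat, scaler, c(-1, 1))
  lstm_pred[i] <- yhat + MORTGAGE30US_3_$MORTGAGE30US[n_ + i]
}

lstmMAE <- mae(MORTGAGE30US_3_$MORTGAGE30US[(n_ + 2):(n_ + 1 + L)], lstm_pred)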

ARIMA

int_rate_ts <- MORTGAGE30US_3_ %>% select(MORTGAGE30US) %>% ts()
arimaFit <- auto.arima(int_rate_ts)
arimaFitted <- arimaFit$fitted
arimaFitted <- as.data.frame(cbind(arimaFitted, MORTGAGE30US_3_$DATE))
arimaFitted <- arimaFitted[-1, ]
arimaFitted <- cbind(arimaFitted, MORTGAGE30US_3_[-1, ])
#Flag observations after the train/test cutoff date (the date column is
#numeric here because cbind() coerced it)
arimaFitted$set <- "train"
arimaFitted$set[arimaFitted$`MORTGAGE30US_3_$DATE` > as.numeric(max(train_id))] <- "test"
#Only display fitted values on the test portion
arimaFitted$arimaFitted[arimaFitted$set == "train"] <- NA
arimaPlot <- plot_ly(arimaFitted, x = ~DATE) %>%
  add_trace(y = ~MORTGAGE30US, mode = "lines", name = "Interest Rate") %>%
  add_trace(y = ~arimaFitted, mode = "lines", name = "ARIMA Fitted") %>%
  layout(title = "ARIMA")

arimaMAE <- arimaFitted %>% filter(set == "test") %>% select(MORTGAGE30US, arimaFitted)
arimaMAE <- mae(arimaMAE$MORTGAGE30US, arimaMAE$arimaFitted)

ETS

int_rate_ts <- MORTGAGE30US_3_ %>% select(MORTGAGE30US) %>% ts()
ETSFit <- ets(int_rate_ts)
ETSFitted <- ETSFit$fitted
ETSFitted <- as.data.frame(cbind(ETSFitted, MORTGAGE30US_3_$DATE))
ETSFitted <- ETSFitted[-1, ]
ETSFitted <- cbind(ETSFitted, MORTGAGE30US_3_[-1, ])
#Flag observations after the train/test cutoff date
ETSFitted$set <- "train"
ETSFitted$set[ETSFitted$`MORTGAGE30US_3_$DATE` > as.numeric(max(train_id))] <- "test"
#Only display fitted values on the test portion
ETSFitted$ETSFitted[ETSFitted$set == "train"] <- NA
etsPlot <- plot_ly(ETSFitted, x = ~DATE) %>%
  add_trace(y = ~MORTGAGE30US, mode = "lines", name = "Interest Rate") %>%
  add_trace(y = ~ETSFitted, mode = "lines", name = "ETS Fitted") %>%
  layout(title = "ETS")

ETSMAE <- ETSFitted %>% filter(set == "test") %>% select(MORTGAGE30US, ETSFitted)
ETSMAE <- mae(ETSMAE$MORTGAGE30US, ETSMAE$ETSFitted)

Post-modeling

MAE <- data.frame("mae" = c(arimaMAE, ETSMAE, lstmMAE), "model" = c("ARIMA", "ETS", "LSTM"))

ggplot(MAE, aes(model, mae, color = model)) + geom_bar(stat = "identity", fill = "white")

The LSTM attains the lowest test MAE of the three models. This is not surprising: the LSTM can capture nonlinear temporal dependencies that the linear ARIMA and exponential-smoothing ETS models cannot.

Plot
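The plotly objects built above are displayed by printing them:

#Render the interactive comparison charts
arimaPlot
etsPlot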