Load Libraries
library(keras)
library(tensorflow)
library(ggplot2)
library(stats)
library(readr)
library(dplyr)
library(forecast)
library(Metrics)
#Read Mortgage Data (source: https://fred.stlouisfed.org/graph/?g=NUh)
MORTGAGE30US_3_ <- read_csv("~/Downloads/MORTGAGE30US (3).csv")
MORTGAGE30US_3_$DATE <- as.Date(MORTGAGE30US_3_$DATE, "%Y-%m-%d")
ggplot(MORTGAGE30US_3_, aes(x=DATE, y = MORTGAGE30US)) + geom_line()
We first make the series stationary by taking the difference between consecutive values. This transformation, commonly known as differencing, removes the time-dependent components of the data. It is typically easier to model the differences than the raw values, and the resulting model tends to have higher predictive power.
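For intuition, differencing a short toy series yields the period-over-period changes (illustrative values only):
diff(c(4.00, 4.10, 3.95))  #returns 0.10 -0.15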
#Remove first row as we cannot difference
dateID <- MORTGAGE30US_3_$DATE[2:nrow(MORTGAGE30US_3_)]
diff <- diff(MORTGAGE30US_3_$MORTGAGE30US, differences = 1)
The LSTM expects the data in a supervised learning format, that is, with a target variable Y and a predictor X. To achieve this, we lag the series so that the value at time (t − k) becomes the input and the value at time t becomes the output, giving a k-step lagged dataset (here k = 1).
supervised <- as.data.frame(cbind(lag(diff,1), diff))
supervised[is.na(supervised)] <- 0
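A quick look at the first rows confirms the structure (V1 is the lagged input X, diff is the target Y):
head(supervised)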
Unlike most analyses, where training and testing data sets are randomly sampled, with time series data the order of the observations matters. The following code splits the first 85% of the series into the training set and keeps the remaining 15% as the test set.
n_ <- round(nrow(supervised) * .85, digits = 0)
train <- supervised[1:n_, ]
test <- supervised[(n_ + 1):nrow(supervised), ]
train_id <- dateID[1:n_]
test_id <- dateID[(n_ + 1):length(dateID)]
Just like in any other neural network model, we rescale the input data X to the range of the activation function. As shown earlier, the default output activation for an LSTM is the hyperbolic tangent (tanh), whose range is [-1, 1]. The code below performs this transformation. Note that the min and max values of the training set are the scaling coefficients used to scale both the training and testing sets, as well as the predicted values. This ensures that the min and max values of the test data do not influence the model.
Let’s create a function to help.
scale_data <- function(train, test, feature_range = c(0, 1)) {
  x = train
  fr_min = feature_range[1]
  fr_max = feature_range[2]
  #Min-max standardize both sets using the training min/max only
  std_train = (x - min(x)) / (max(x) - min(x))
  std_test = (test - min(x)) / (max(x) - min(x))
  #Map onto the requested feature range
  scaled_train = std_train * (fr_max - fr_min) + fr_min
  scaled_test = std_test * (fr_max - fr_min) + fr_min
  return(list(scaled_train = as.vector(scaled_train),
              scaled_test = as.vector(scaled_test),
              scaler = c(min = min(x), max = max(x))))
}
#Function to reverse scale data for prediction
reverse_scaling <- function(scaled, scaler, feature_range = c(0, 1)) {
  min = scaler[1]
  max = scaler[2]
  t = length(scaled)
  mins = feature_range[1]
  maxs = feature_range[2]
  inverted_dfs = numeric(t)
  #Undo the feature-range mapping, then the min-max standardization
  for (i in 1:t) {
    X = (scaled[i] - mins) / (maxs - mins)
    inverted_dfs[i] <- X * (max - min) + min
  }
  return(inverted_dfs)
}
Scale the data and retrieve the train/test sets from the returned list.
Scaled <- scale_data(train, test, c(-1,1))
x_train <- Scaled$scaled_train[,1]
y_train <- Scaled$scaled_train[,2]
x_test <- Scaled$scaled_test[,1]
y_test <- Scaled$scaled_test[,2]
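A quick sanity check that the scaled inputs lie in the activation range:
range(x_train)  #should fall within [-1, 1]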
For our ARIMA and ETS models, all we need to do is convert the series to a ts() object; auto.arima() will then search for the optimal (p, d, q) order, and ets() will automatically select its error, trend, and seasonal components.
The LSTM requires more explicit parameterization. Define the model parameters:
#Reshape the input to (samples, timesteps, features) as Keras expects
dim(x_train) <- c(length(x_train), 1, 1)
X_shape2 <- dim(x_train)[2]  #timesteps
X_shape3 <- dim(x_train)[3]  #features
batch_size <- 1              #a stateful LSTM needs a fixed batch size
units <- 100                 #number of LSTM units
n_timesteps <- 12
n_predictions <- n_timesteps
#Build a matrix of overlapping windows from a series
build_matrix <- function(tseries, overall_timesteps) {
  t(sapply(1:(length(tseries) - overall_timesteps + 1),
           function(x) tseries[x:(x + overall_timesteps - 1)]))
}
#Add a trailing features dimension for Keras
reshape_X_3d <- function(X) {
  dim(X) <- c(dim(X)[1], dim(X)[2], 1)
  X
}
model <- keras_model_sequential()
model %>%
  layer_lstm(units, batch_input_shape = c(batch_size, X_shape2, X_shape3), stateful = TRUE) %>%
  layer_dense(units = 1)
Compile Model
model %>%
  compile(loss = 'mean_squared_error',
          optimizer = optimizer_adam(lr = 0.03, decay = 1e-6),
          metrics = c('mean_absolute_error'))
Perform Modeling
We set the argument shuffle = FALSE to avoid shuffling the training set and maintain the dependencies between x_i and x_{i+t}. The LSTM also requires resetting the network state after each epoch. To achieve this, we run a loop over epochs; within each epoch we fit the model for a single pass and then reset the states via reset_states().
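Below is a minimal sketch of that loop, together with one-step-ahead prediction on the test set and the test-set MAE (lstmMAE) used in the model comparison further down. The epoch count and the index bookkeeping for undoing the differencing are assumptions of this sketch.
#Sketch of the training loop: one pass per epoch, no shuffling,
#states reset manually after each epoch (epoch count is an assumption)
Epochs <- 50
for (i in 1:Epochs) {
  model %>% fit(x_train, y_train, epochs = 1, batch_size = batch_size,
                verbose = 1, shuffle = FALSE)
  model %>% reset_states()
}
#One-step-ahead prediction on the test inputs
dim(x_test) <- c(length(x_test), 1, 1)
pred_scaled <- model %>% predict(x_test, batch_size = batch_size)
#Undo the scaling, then the differencing: each predicted difference is
#added back to the previous observed rate (index alignment assumed)
pred_diff <- reverse_scaling(pred_scaled, Scaled$scaler, c(-1, 1))
prev_actual <- MORTGAGE30US_3_$MORTGAGE30US[(n_ + 1):(n_ + length(pred_diff))]
lstm_pred <- pred_diff + prev_actual
actual_test <- MORTGAGE30US_3_$MORTGAGE30US[(n_ + 2):(n_ + 1 + length(pred_diff))]
lstmMAE <- mae(actual_test, lstm_pred)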
#Attach plotly for interactive charts (masks last_plot from ggplot2, filter from stats, and layout from graphics)
library(plotly)
int_rate_ts <- MORTGAGE30US_3_ %>% select(MORTGAGE30US) %>% ts()
arimaFit <- auto.arima(int_rate_ts)
arimaFitted <- arimaFit$fitted
arimaFitted <- as.data.frame(cbind(arimaFitted, MORTGAGE30US_3_$DATE))
#Drop the first observation to match the differenced series used elsewhere
arimaFitted <- arimaFitted[-1,]
arimaFitted <- cbind(arimaFitted, MORTGAGE30US_3_[-1,])
arimaFitted$set <- "train"
#15393 is the train/test cutoff date in days since 1970-01-01 (cbind coerced DATE to numeric)
arimaFitted$set[arimaFitted$`MORTGAGE30US_3_$DATE` > 15393] <- "test"
#Keep fitted values only for the test period so the plot and MAE reflect test data
arimaFitted$arimaFitted[arimaFitted$set == "train"] <- NA
arimaPlot <- plot_ly(arimaFitted, x = ~DATE) %>%
  add_trace(y = ~MORTGAGE30US, mode = "lines", name = "Interest Rate") %>%
  add_trace(y = ~arimaFitted, mode = "lines", name = "ARIMA Fitted") %>%
  layout(title = "ARIMA")
arimaMAE <- arimaFitted %>% filter(set == "test") %>% select(MORTGAGE30US, arimaFitted)
arimaMAE <- mae(arimaMAE$MORTGAGE30US, arimaMAE$arimaFitted)
ETSFit <- ets(int_rate_ts)
ETSFitted <- ETSFit$fitted
ETSFitted <- as.data.frame(cbind(ETSFitted, MORTGAGE30US_3_$DATE))
ETSFitted <- ETSFitted[-1,]
ETSFitted <- cbind(ETSFitted, MORTGAGE30US_3_[-1,])
ETSFitted$set <- "train"
ETSFitted$set[ETSFitted$`MORTGAGE30US_3_$DATE` > 15393] <- "test"
#Keep fitted values only for the test period
ETSFitted$ETSFitted[ETSFitted$set == "train"] <- NA
etsPlot <- plot_ly(ETSFitted, x = ~DATE) %>%
  add_trace(y = ~MORTGAGE30US, mode = "lines", name = "Interest Rate") %>%
  add_trace(y = ~ETSFitted, mode = "lines", name = "ETS Fitted") %>%
  layout(title = "ETS")
ETSMAE <- ETSFitted %>% filter(set == "test") %>% select(MORTGAGE30US, ETSFitted)
ETSMAE <- mae(ETSMAE$MORTGAGE30US, ETSMAE$ETSFitted)
MAE <- data.frame("mae" = c(arimaMAE, ETSMAE, lstmMAE), "model" = c("ARIMA", "ETS", "LSTM"))
ggplot(MAE, aes(model, mae, color = model)) + geom_bar(stat = "identity", fill = "white")
The LSTM attains the lowest test-period MAE of the three models, which is not surprising given its capacity to capture nonlinear temporal dependencies. Keep in mind that the ARIMA and ETS values compared here are in-sample one-step-ahead fitted values from models estimated on the full series.
Plot
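The interactive fit charts built above can be rendered here:
arimaPlot
etsPlot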