Predicting Oil Prices: Time Series & Generalized Additive Models for Location, Scale and Shape
library(gamlss)
library(gamlss.data)
library(glmnet)
library(xts)
library(zoo)
library(dplyr)
library(forecast)
library(tseries)
library(corrplot)
library(ggplot2)
library(caret)Dataset
#setwd
# Load dataset
oil <- gamlss.data::oil
attach(oil)
# help(oil) shows information about the data; data dictionary
# Take a look
# head(oil) # all the data seems to be at same scale
# Check size
dim(oil) # 25 variables and 1000 observations## [1] 1000 25
# Check NA values
sum(is.na(oil)) # No NA values## [1] 0
# Var types
str(oil) # all vars are numeric## 'data.frame': 1000 obs. of 25 variables:
## $ OILPRICE : num 4.64 4.63 4.63 4.65 4.63 ...
## $ CL2_log : num 4.64 4.65 4.64 4.64 4.65 ...
## $ CL3_log : num 4.64 4.65 4.64 4.64 4.65 ...
## $ CL4_log : num 4.64 4.65 4.65 4.65 4.66 ...
## $ CL5_log : num 4.65 4.66 4.65 4.65 4.66 ...
## $ CL6_log : num 4.65 4.66 4.65 4.65 4.66 ...
## $ CL7_log : num 4.65 4.66 4.65 4.65 4.66 ...
## $ CL8_log : num 4.65 4.66 4.65 4.65 4.66 ...
## $ CL9_log : num 4.65 4.66 4.66 4.65 4.66 ...
## $ CL10_log : num 4.65 4.66 4.66 4.65 4.66 ...
## $ CL11_log : num 4.65 4.66 4.65 4.65 4.66 ...
## $ CL12_log : num 4.65 4.66 4.65 4.65 4.66 ...
## $ CL13_log : num 4.65 4.65 4.65 4.64 4.65 ...
## $ CL14_log : num 4.64 4.65 4.65 4.64 4.65 ...
## $ CL15_log : num 4.64 4.65 4.64 4.64 4.65 ...
## $ BDIY_log : num 6.85 6.85 6.88 6.88 6.9 ...
## $ SPX_log : num 7.22 7.24 7.22 7.22 7.24 ...
## $ DX1_log : num 4.39 4.38 4.39 4.38 4.38 ...
## $ GC1_log : num 7.41 7.42 7.42 7.41 7.4 ...
## $ HO1_log : num 1.14 1.15 1.16 1.14 1.14 ...
## $ USCI_log : num 4.11 4.12 4.12 4.1 4.11 ...
## $ GNR_log : num 3.92 3.94 3.92 3.93 3.94 ...
## $ SHCOMP_log: num 7.74 7.76 7.77 7.77 7.76 ...
## $ FTSE_log : num 8.64 8.65 8.64 8.64 8.66 ...
## $ respLAG : num 4.63 4.64 4.63 4.63 4.65 ...
# Take a look on variables distribution
summary(oil) # vars have median and mean close, probably a simetric distribution## OILPRICE CL2_log CL3_log CL4_log
## Min. :3.266 Min. :3.345 Min. :3.391 Min. :3.428
## 1st Qu.:3.966 1st Qu.:3.982 1st Qu.:4.001 1st Qu.:4.017
## Median :4.517 Median :4.519 Median :4.519 Median :4.518
## Mean :4.309 Mean :4.317 Mean :4.322 Mean :4.326
## 3rd Qu.:4.580 3rd Qu.:4.581 3rd Qu.:4.581 3rd Qu.:4.579
## Max. :4.705 Max. :4.696 Max. :4.682 Max. :4.672
## CL5_log CL6_log CL7_log CL8_log
## Min. :3.482 Min. :3.501 Min. :3.516 Min. :3.529
## 1st Qu.:4.032 1st Qu.:4.048 1st Qu.:4.058 1st Qu.:4.068
## Median :4.519 Median :4.518 Median :4.517 Median :4.514
## Mean :4.329 Mean :4.331 Mean :4.332 Mean :4.333
## 3rd Qu.:4.575 3rd Qu.:4.570 3rd Qu.:4.563 3rd Qu.:4.556
## Max. :4.672 Max. :4.673 Max. :4.673 Max. :4.673
## CL9_log CL10_log CL11_log CL12_log
## Min. :3.542 Min. :3.555 Min. :3.566 Min. :3.576
## 1st Qu.:4.075 1st Qu.:4.082 1st Qu.:4.090 1st Qu.:4.097
## Median :4.512 Median :4.509 Median :4.506 Median :4.503
## Mean :4.333 Mean :4.337 Mean :4.333 Mean :4.332
## 3rd Qu.:4.550 3rd Qu.:4.546 3rd Qu.:4.542 3rd Qu.:4.537
## Max. :4.672 Max. :4.670 Max. :4.667 Max. :4.663
## CL13_log CL14_log CL15_log BDIY_log
## Min. :3.585 Min. :3.594 Min. :3.603 Min. :5.670
## 1st Qu.:4.103 1st Qu.:4.110 1st Qu.:4.117 1st Qu.:6.596
## Median :4.500 Median :4.497 Median :4.493 Median :6.806
## Mean :4.332 Mean :4.331 Mean :4.331 Mean :6.787
## 3rd Qu.:4.532 3rd Qu.:4.527 3rd Qu.:4.522 3rd Qu.:7.011
## Max. :4.658 Max. :4.654 Max. :4.649 Max. :7.757
## SPX_log DX1_log GC1_log HO1_log
## Min. :7.153 Min. :4.369 Min. :6.956 Min. :-0.1442
## 1st Qu.:7.354 1st Qu.:4.391 1st Qu.:7.089 1st Qu.: 0.6220
## Median :7.531 Median :4.417 Median :7.159 Median : 1.0547
## Mean :7.481 Mean :4.459 Mean :7.192 Mean : 0.8600
## 3rd Qu.:7.611 3rd Qu.:4.557 3rd Qu.:7.345 3rd Qu.: 1.1013
## Max. :7.664 Max. :4.613 Max. :7.491 Max. : 1.1877
## USCI_log GNR_log SHCOMP_log FTSE_log
## Min. :3.650 Min. :3.317 Min. :7.576 Min. :8.568
## 1st Qu.:3.838 1st Qu.:3.787 1st Qu.:7.652 1st Qu.:8.716
## Median :4.021 Median :3.868 Median :7.734 Median :8.778
## Mean :3.962 Mean :3.818 Mean :7.840 Mean :8.760
## 3rd Qu.:4.070 3rd Qu.:3.920 3rd Qu.:8.032 3rd Qu.:8.813
## Max. :4.148 Max. :3.985 Max. :8.550 Max. :8.868
## respLAG
## Min. :3.266
## 1st Qu.:3.966
## Median :4.517
## Mean :4.310
## 3rd Qu.:4.580
## Max. :4.705
Data Transformation Framework
Transform Functions
First, it is defined some functions to perform transformations on the data.
# Transformation functions with tag and timestamp
## a. Rolling standard deviation (of arbitrary window)
### input: dataset, variable name (it can be a vector of names of several vars) and window size
roll_sd <- function(data, varnames, window){
fun <- "roll_sd" ## function tag
x <- data.frame(1:nrow(data)) ## create an empty df
for(i in 1:length(varnames)){ ## calculate for one var at once
a <- data[,varnames[i]] ## collect data for the var
calc <- rollapply(a, window, sd) ## apply function
calc <- as.data.frame(c((rep(NA, window-1)), ## NA fill - keep vec size
calc))
colnames(calc) <- paste0(varnames[i], "_rollsd") ## create new var transformed name
x <- cbind(x, calc) ## attach it
timestamp <- format(Sys.time(), ## create a timestamp
"%m %d, %Y %H:%M")
track <- t(c(varnames[i], fun, window, timestamp)) ## create a track vec with metadata info
write.table(track, file = "trackmetadata.csv", sep = "\t",
append = TRUE, col.names = FALSE, row.names = FALSE) ## write metainfo in a file
}
x[,1] <- NULL ## take off x index
return(x)
}
## b. Rolling mean (of arbitrary window)
### input: dataset, variable name (it can be a vector of names of several vars) and window size
roll_mean <- function(data, varnames, window){
fun <- "roll_mean" ## tag
x <- data.frame(1:nrow(data)) ## empty df
for(i in 1:length(varnames)){ ## for one var at once
a <- data[,varnames[i]] ## collect data
calc <- rollapply(a, window, mean) ## apply transformation
calc <- as.data.frame(c((rep(NA, window-1)), calc)) ## set vec size
colnames(calc) <- paste0(varnames[i], "_rollmean") ## create name of new transf var
x <- cbind(x, calc) ## attach it
timestamp <- format(Sys.time(), "%m %d, %Y %H:%M") ## get timestamp
track <- t(c(varnames[i], fun, window, timestamp)) ## get track info
write.table(track, file = "trackmetadata.csv", sep = "\t",
append = TRUE, col.names = FALSE, row.names = FALSE) ## add it in the file
}
x[,1] <- NULL
return(x)
}
## c. Lagging (of arbitrary order)
### input: dataset name (dataframe), varnames (string names) (it can be a vector), lag and dif values (pattern is 1)
lagging <- function(data, varnames, lag=1, dif=1){
fun <- "lag" ## function tag
x <- data.frame(1:nrow(data)) ## empty df
for(i in 1:length(varnames)){ ## for each variable
a <- data[,varnames[i]] ## collect var data
calc <- as.data.frame(lag(a, lag=lag, dif=dif)) ## apply transformation
colnames(calc) <- paste0(varnames[i], "_lag") ## create new var name
x <- cbind(x, calc) ## attach it
timestamp <- format(Sys.time(), "%m %d, %Y %H:%M") ## get timestamp
track <- t(c(varnames[i], fun, paste0(lag,"_","dif"), timestamp)) ## get track info
write.table(track, file = "trackmetadata.csv", sep = "\t",
append = TRUE, col.names = FALSE, row.names = FALSE) ## add it in the file
}
x[,1] <- NULL ## remove x indexes
return(x)
}
## d. Leading (of arbitrary order)
leading <- function(data, varnames, k=1, dif=1){
fun <- "lead" ## function tag
x <- data.frame(1:nrow(data)) ## empty df
for(i in 1:length(varnames)){ ## for var
a <- data[,varnames[i]] ## collect var data
calc <- dplyr::lead(a, k=k, dif=dif) ## apply transform
calc <- as.data.frame(calc)
colnames(calc) <- paste0(varnames[i], "_lead") ## put new var name
x <- cbind(x, calc) ## attach it
timestamp <- format(Sys.time(), "%m %d, %Y %H:%M") ## get timestamp
track <- t(c(varnames[i], fun, paste0(k,"_",dif), timestamp)) ## get tracker
write.table(track, file = "trackmetadata.csv", sep = "\t",
append = TRUE, col.names = FALSE, row.names = FALSE) ## add it in the file
}
x[,1] <- NULL
return(x)
}
## e. Differencing
diffts <- function(data, varnames, lag=1, dif=1){
fun <- "diff" ## function tag
x <- data.frame(1:nrow(data)) ## empty df
for(i in 1:length(varnames)){ ## for each var
a <- data[,varnames[i]] ## het var data
calc <- diff(a, lag=lag, dif=dif) ## apply transformation
calc <- as.data.frame(c((rep(NA, lag)), calc))
calc <- apply(calc, 2, as.numeric)
colnames(calc) <- paste0(varnames[i], "_diff") # create new varname
x <- cbind(x, calc) ## attach it
timestamp <- format(Sys.time(), "%m %d, %Y %H:%M") ## get timestamp
track <- t(c(varnames[i], fun, window, timestamp)) ## get tracker
write.table(track, file = "trackmetadata.csv", sep = "\t",
append = TRUE, col.names = FALSE, row.names = FALSE) ## add it in the file
}
x[,1] <- NULL
return(x)
}
## f. Spread (between two input drivers)
spread <- function(data, var1, var2){
fun <- "spread" ## function tag
x <- as.data.frame(data[,var1]-data[,var2]) ## calculate spread
colnames(x) <- paste0(var1, ".", var2, "_spr") ## new var name
timestamp <- format(Sys.time(), "%m %d, %Y %H:%M") ## get timestamp
track <- t(c(paste0(var1, "_", var2), fun, 0, timestamp)) ## get tracker
write.table(track, file = "trackmetadata.csv", sep = "\t",
append = TRUE, col.names = FALSE, row.names = FALSE) ## add tracker in the file
return(x)
}
## g. Ratio (between two input drivers)
ratio <- function(data, var1, var2){
fun <- "ratio" ## function tag
x <- as.data.frame(data[,var1]/data[,var2]) ## get ratio values
colnames(x) <- paste0(var1, ".", var2, "_rt") ## new var name
timestamp <- format(Sys.time(), "%m %d, %Y %H:%M") ## timestamp
track <- t(c(paste0(var1, "_", var2), fun, 0, timestamp)) ## get tracker
write.table(track, file = "trackmetadata.csv", sep = "\t",
append = TRUE, col.names = FALSE, row.names = FALSE) ## add tracker in the file
return(x)
}
## h. Product (between two input drivers)
prod <- function(data, var1, var2){
fun <- "prod" ## function tag
x <- as.data.frame(data[,var1]*data[,var2]) ## get the product values
colnames(x) <- paste0(var1, ".", var2, "_prod") ## new var name
timestamp <- format(Sys.time(), "%m %d, %Y %H:%M") ## timestamp
track <- t(c(paste0(var1, "_", var2), fun, 0, timestamp)) ## get tracker
write.table(track, file = "trackmetadata.csv", sep = "\t",
append = TRUE, col.names = FALSE, row.names = FALSE) ## add tracker in the file
return(x)
}
## Between diff
betdiff <- function(data, var1, var2){
fun <- "betdiff" ## fun tag
x <- as.data.frame(data[,var1]-data[,var2]) ## get diff between 2 vars
x <- x*1 ## just a transformation to format var type
colnames(x) <- paste0(var1, ".", var2, "_betdiff") ## new var name
timestamp <- format(Sys.time(), "%m %d, %Y %H:%M") ## time stamp
track <- t(c(paste0(var1, "_", var2), fun, 0, timestamp)) ## get tracker
write.table(track, file = "trackmetadata.csv", sep = "\t",
append = TRUE, col.names = FALSE, row.names = FALSE) ## add tracker to the file
return(x)
}User can specify the transformation by choosing the functions and inputing the name of the variables. To bind transformation to the dataset use:
oil[,ncol(oil)+1] <- betdiff(oil, "OILPRICE", "CL2_log")The variables are created following a pattern in their names: VarName_TagFunctionApplied or if the function is a measure between 2 variables the pattern is VarName1.Varname2_TagFunctionApplied. User can have a composition of different functions by using “%>%” like in dplyr package. Example:
head(betdiff(oil, "OILPRICE", "CL2_log") %>%
roll_sd("OILPRICE.CL2_log_betdiff", 2)) # Note that here it is used the name resulted by previous transformation, following the pattern.## OILPRICE.CL2_log_betdiff_rollsd
## 1 NA
## 2 0.011825057
## 3 0.006005468
## 4 0.008329129
## 5 0.019094196
## 6 0.007586508
It is also possible to apply several transformations on the original dataset at once using the function composition below. It receives a list containing the function that must be applied and their arguments. It can be applied only in the variables of the original dataset, so it is not possible concatenate transformations this way at once. Only sequential.
# define function composition
composition <- function(data, list_args){
x <- data.frame(1:nrow(data))
for(i in seq(1,length(list_args),2)){
result <- do.call(list_args[[i]], list_args[[i+1]]) # apply functions chosen
x <- cbind(x, result)
}
x[,1] <- NULL
data <- cbind(data, x) # attach it to the data
return(data)
}Define a list of transformations and their parameters - infinite transformations can be set here. Example: Difference between OILPRICE and respLAG AND lagging of OILPRICE window 2, AND roll mean with window 2 for CL3_log, CL2_log, DX1_log AND leading window 5 to CL15_log etc… etc..
lista <- list(betdiff, list(oil, "OILPRICE", "respLAG"),
lagging, list(oil, "OILPRICE", 2),
roll_mean, list(oil, c("CL3_log", "CL2_log", "DX1_log"), 2),
leading, list(oil, "CL15_log", 5))
# apply a composition
head(composition(oil, lista))## OILPRICE CL2_log CL3_log CL4_log CL5_log CL6_log CL7_log CL8_log
## 1 4.640923 4.636475 4.641116 4.644968 4.648038 4.649761 4.650908 4.651863
## 2 4.633077 4.645352 4.649857 4.653484 4.656338 4.657858 4.658711 4.659564
## 3 4.634049 4.637831 4.642466 4.646312 4.649665 4.651672 4.653103 4.654341
## 4 4.646312 4.638315 4.642562 4.646120 4.648708 4.649952 4.650621 4.651099
## 5 4.631520 4.650526 4.654722 4.658047 4.660321 4.661267 4.661740 4.662117
## 6 4.627616 4.635893 4.640344 4.644199 4.646984 4.648421 4.649378 4.650335
## CL9_log CL10_log CL11_log CL12_log CL13_log CL14_log CL15_log BDIY_log
## 1 4.652340 4.651672 4.650621 4.648613 4.646120 4.643236 4.639765 6.850126
## 2 4.660037 4.659374 4.657952 4.655578 4.652531 4.649187 4.645256 6.850126
## 3 4.655293 4.655007 4.653865 4.651672 4.648804 4.645736 4.642081 6.879356
## 4 4.651577 4.651004 4.649570 4.647080 4.644102 4.640923 4.636960 6.882437
## 5 4.662401 4.661645 4.659942 4.656908 4.653484 4.649761 4.645544 6.896694
## 6 4.651099 4.650813 4.649570 4.646984 4.643910 4.640537 4.636475 6.913737
## SPX_log DX1_log GC1_log HO1_log USCI_log GNR_log SHCOMP_log FTSE_log
## 1 7.221624 4.386554 7.413367 1.136197 4.108412 3.917806 7.744539 8.636699
## 2 7.235309 4.379762 7.419680 1.152564 4.120986 3.942552 7.762536 8.650062
## 3 7.222756 4.387449 7.418481 1.155182 4.115127 3.923952 7.766061 8.639729
## 4 7.222252 4.383675 7.410347 1.136743 4.103965 3.925531 7.765158 8.642292
## 5 7.237620 4.382364 7.399704 1.139946 4.107096 3.941970 7.755763 8.659907
## 6 7.233556 4.382951 7.404888 1.137256 4.097672 3.936325 7.775213 8.656137
## respLAG OILPRICE.CL2_log_betdiff OILPRICE.respLAG_betdiff OILPRICE_lag
## 1 4.631812 0.004448320 0.0091112388 NA
## 2 4.640923 -0.012274836 -0.0078462165 4.640923
## 3 4.633077 -0.003781823 0.0009720063 4.633077
## 4 4.634049 0.007997345 0.0122629838 4.634049
## 5 4.646312 -0.019005926 -0.0147921680 4.646312
## 6 4.631520 -0.008276984 -0.0039035865 4.631520
## CL3_log_rollmean CL2_log_rollmean DX1_log_rollmean CL15_log_lead
## 1 NA NA NA 4.645256
## 2 4.645487 4.640914 4.383158 4.642081
## 3 4.646161 4.641591 4.383605 4.636960
## 4 4.642514 4.638073 4.385562 4.645544
## 5 4.648642 4.644420 4.383020 4.636475
## 6 4.647533 4.643210 4.382658 4.634341
All functions writes a track information in a metadata file, including: Name of the variable, function applied to get that result and a timestamp. I preferred to write it in a file instead of an object because this information will not be accessed often, so putting it in a file, you save computer memory. Also you can access it in the future. It can be loaded in R and filtered as needed.
Getting Stats
This function get statistics of the variables and returns it here. The information can be stored in a object, but it also writes them in a file.
# Getting stats values of the tests: Stationarity (KPSS & DF); Normality (Shapiro-Wilk);
# Pearson Correlation with target variable
# input: dataset, name of the variables to be tested, name of the target variable
getStats <- function(data, varnames, target){
statsTest <- structure(list(variable = character(),
test = character(),
value = numeric()),
class = "data.frame")
for(i in 1:length(varnames)){
dat <- replace(x = data[,varnames[i]], list = is.na(data[,varnames[i]]),
values = 0)
# Kwiatkowski-Phillips-Schmidt-Shin - Stacionarity
kpss <- suppressWarnings(kpss.test(dat)[[3]])
statsTest <- rbind(statsTest, c(varnames[i], "kpss", kpss))
# Dickey Fuller - Stacionarity
dickeyfuller <- suppressWarnings(adf.test(dat))[[4]]
statsTest <- rbind(statsTest, c(varnames[i], "dickeyfuller",
dickeyfuller))
# Shapiro-Wilk Test - Normality
shapiro <- suppressWarnings(shapiro.test(dat))[[2]]
statsTest <- rbind(statsTest, c(varnames[i], "shapiro", shapiro))
# Pearson Correlation with target variable
targetcor <- cor(data[,target], dat)
statsTest <- rbind(statsTest, c(varnames[i], "targetcor",
targetcor)) ## bind to the object
}
colnames(statsTest) <- c("var", "test", "value")
write.table(statsTest, "statsmetadata.txt", sep = "\t",
col.names = FALSE, append = TRUE) ## write stats in a file
return(statsTest)
}
# apply a transformation on data
#oil[,ncol(oil)+1] <- betdiff(oil, "OILPRICE", "CL2_log")
# get stats for variables
getStats(oil, c("OILPRICE.CL2_log_betdiff", "CL2_log", "CL3_log"), "OILPRICE")## var test value
## 1 OILPRICE.CL2_log_betdiff kpss 0.01
## 2 OILPRICE.CL2_log_betdiff dickeyfuller 0.01
## 3 OILPRICE.CL2_log_betdiff shapiro 4.84124736744713e-23
## 4 OILPRICE.CL2_log_betdiff targetcor 0.52789534800614
## 5 CL2_log kpss 0.01
## 6 CL2_log dickeyfuller 0.769579302872897
## 7 CL2_log shapiro 2.56446659912172e-32
## 8 CL2_log targetcor 0.998080113823744
## 9 CL3_log kpss 0.01
## 10 CL3_log dickeyfuller 0.779965219302587
## 11 CL3_log shapiro 1.30062861931017e-32
## 12 CL3_log targetcor 0.997696727558502
Model
Lagging Variables in 1 day
Considering that predictions will be of one day ahead, all the indexes (“BDIY_log”, “SPX_log”, “DX1_log”, “GC1_log”, “HO1_log”, “USCI_log”, “GNR_log”, “SHCOMP_log”, “FTSE_log”) will be lagged in 1 day.
laglist <- list(lagging, list(oil, c("BDIY_log",
"SPX_log", "DX1_log", "GC1_log",
"HO1_log", "USCI_log", "GNR_log",
"SHCOMP_log", "FTSE_log")))
oillag <- composition(oil, laglist)
oillag <- na.omit(oillag) # exclude NA values generated by the lag
oillag[,3:24] <- NULL # excluding CL log columns and indexes not lagged
rm(oil)
invisible(gc())Exploratory Data Analysis - Time Series
Set oil as a time serie.
oilts <- ts(oillag, frequency = 365, start = c(2018, 1, 1)) # random date just to help
autoplot(oilts[,c("OILPRICE", "CL2_log", "BDIY_log_lag", "SPX_log_lag",
"DX1_log_lag", "GC1_log_lag", "HO1_log_lag",
"USCI_log_lag", "GNR_log_lag", "SHCOMP_log_lag",
"FTSE_log_lag", "respLAG")]) +
ggtitle("Plot of the Oil Time-Series") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.y=element_blank())Oil price follows a trend similar to USCI_log, but with higher values and with BDIY_log but with lower values. It also show a trend like the opposite of SHCOMP_log. GNR_log and HO1_log also shows a light drop in the end, similar to Oil Price.
Correlation with Target
#dev.off()
cormat <- cor(oillag)
corrplot(cormat, method = "color",
type = "lower",
tl.col = "#424242",
tl.srt = 45,
addCoef.col = "#ffffff",
col = colorRampPalette(c("#4D4D4D", "white", "#5288DB"))(200),
tl.cex = 0.5,
number.cex = 0.5,
number.font = 1,
cl.cex = 0.5)This heatmap shows that really there is a correlation between Oil Price and USCI_log of 95%. And a negative correlation of 79% with SHCOMP_log. Also shows a correlation with HO1_log of 98% and with GNR_log of 91%. CL2_log is the lag with maximum correlation, 99.8% and for this reason, I choose it to continue in the model and I will exclude the other lags.
par(mfrow=c(3,3))
for(i in 4:length(oillag)) {
plot(oillag$OILPRICE, oillag[,i], main = paste0("Oil Price x ", colnames(oillag)[i]),
ylab=colnames(oillag)[i], xlab = "Oil Price")
}The dispersion graphs shows a positive trend between Oil Price and BDIY_log, H01_log, USCI_log, GNR_log and negative between Oil Price and DX1_log and SHCOMP_log.
Cross Correlation
## CCF of Oil Price and Indexes
par(mfrow=c(3,3))
for(i in 4:length(oillag)) {
y = oillag[,i]
ccf(oillag[,"OILPRICE"], y, main = paste0("Cross Correlation Oil Price x ",
colnames(oillag)[i]),
ylab=colnames(oillag)[i], xlab = "Oil Price")
}Considering all these graphs, index FTSE_log does not show influence on Oil Price. It will not be used anymore.
Seasonality
ggseasonplot(oilts[, 'OILPRICE'])Not possible to identify a clear seasonality with this graph.
Decompose
#autoplot(decompose(oilts[,"OILPRICE"], type = "additive")) + xlab("") + ylab("") +
# ggtitle("Additive Decomposition of Oil Price") +
# theme(plot.title = element_text(hjust = 0.5))
autoplot(decompose(oilts[,"OILPRICE"], type = "multiplicative")) + xlab("") + ylab("") +
ggtitle("Multiplicative Decomposition of Oil Price") +
theme(plot.title = element_text(hjust = 0.5))#stlRes <- stl(oilts[, 'OILPRICE'], s.window = "periodic")Oil Price shows a trend of drop and now it is possible identify that there is a seasonal component on it.
Stationarity
Getting the Dickey-Fuller test values to identify which of these series are stationary.
stats <- getStats(oillag, c("OILPRICE", "CL2_log", "BDIY_log_lag", "SPX_log_lag",
"DX1_log_lag", "GC1_log_lag", "HO1_log_lag",
"USCI_log_lag", "GNR_log_lag", "SHCOMP_log_lag", "respLAG"), "OILPRICE")
stats %>%
filter(test == "dickeyfuller")## var test value
## 1 OILPRICE dickeyfuller 0.805707301519076
## 2 CL2_log dickeyfuller 0.773007700419777
## 3 BDIY_log_lag dickeyfuller 0.290132480678399
## 4 SPX_log_lag dickeyfuller 0.740638833971042
## 5 DX1_log_lag dickeyfuller 0.685923645955033
## 6 GC1_log_lag dickeyfuller 0.477682133093941
## 7 HO1_log_lag dickeyfuller 0.771040951988363
## 8 USCI_log_lag dickeyfuller 0.689118870548244
## 9 GNR_log_lag dickeyfuller 0.623559284946592
## 10 SHCOMP_log_lag dickeyfuller 0.567639326687163
## 11 respLAG dickeyfuller 0.788519456385602
None of them is stationary. Then, the diffts function will be used to transform them.
difflist <- list(diffts, list(oillag, c("OILPRICE", "CL2_log", "BDIY_log_lag",
"SPX_log_lag", "DX1_log_lag", "GC1_log_lag",
"HO1_log_lag", "USCI_log_lag", "GNR_log_lag",
"SHCOMP_log_lag", "respLAG")))
oildiff <- composition(oillag, difflist)
rm(difflist)
invisible(gc())Testing differentiated series to check if it was enough to transform them in stationary.
stats <- getStats(oildiff, c("OILPRICE_diff", "CL2_log_diff", "BDIY_log_lag_diff",
"SPX_log_lag_diff", "DX1_log_lag_diff", "GC1_log_lag_diff",
"HO1_log_lag_diff", "USCI_log_lag_diff", "GNR_log_lag_diff",
"SHCOMP_log_lag_diff", "respLAG_diff"), "OILPRICE")
stats %>%
filter(test == "dickeyfuller")## var test value
## 1 OILPRICE_diff dickeyfuller 0.01
## 2 CL2_log_diff dickeyfuller 0.01
## 3 BDIY_log_lag_diff dickeyfuller 0.01
## 4 SPX_log_lag_diff dickeyfuller 0.01
## 5 DX1_log_lag_diff dickeyfuller 0.01
## 6 GC1_log_lag_diff dickeyfuller 0.01
## 7 HO1_log_lag_diff dickeyfuller 0.01
## 8 USCI_log_lag_diff dickeyfuller 0.01
## 9 GNR_log_lag_diff dickeyfuller 0.01
## 10 SHCOMP_log_lag_diff dickeyfuller 0.01
## 11 respLAG_diff dickeyfuller 0.01
Now they are all stationary and ready to be imputed in the model.
oilstationary <- oildiff[,c("OILPRICE_diff", "CL2_log_diff", "BDIY_log_lag_diff",
"SPX_log_lag_diff", "DX1_log_lag_diff", "GC1_log_lag_diff",
"HO1_log_lag_diff", "USCI_log_lag_diff", "GNR_log_lag_diff",
"SHCOMP_log_lag_diff", "respLAG_diff")]
rm(oildiff, stats)
invisible(gc())
oilstationary <- ts(oilstationary, frequency = 365, start = c(2018, 1, 1)) # random date just to help
autoplot(oilstationary[,c("OILPRICE_diff", "CL2_log_diff", "BDIY_log_lag_diff",
"SPX_log_lag_diff", "DX1_log_lag_diff", "GC1_log_lag_diff",
"HO1_log_lag_diff", "USCI_log_lag_diff", "GNR_log_lag_diff",
"SHCOMP_log_lag_diff", "respLAG_diff")]) +
ggtitle("Plot of the Oil Time-Series") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.y=element_blank())Split Train & Test
# create time slices for the data
oilfinal <- na.omit(oilstationary)
timeSlices <- createTimeSlices(1:nrow(oilfinal),
initialWindow = 700, horizon = 200, fixedWindow = TRUE)
trainSlices <- timeSlices[[1]]
#testSlices <- timeSlices[[2]]
traintransf = as.data.frame(oilfinal[trainSlices[[1]],])
testtransf = as.data.frame(oilfinal[-trainSlices[[1]],])
# original dataset
oilts1 <- oilts[,c("OILPRICE", "CL2_log", "BDIY_log_lag",
"SPX_log_lag", "DX1_log_lag", "GC1_log_lag",
"HO1_log_lag", "USCI_log_lag", "GNR_log_lag",
"SHCOMP_log_lag", "respLAG")]
trainori = as.data.frame(oilts1[trainSlices[[1]],])
testori = as.data.frame(oilts1[-trainSlices[[1]],])
rm(timeSlices, oilts, oilstationary, trainSlices)
invisible(gc())Using LASSO Regularization for Feature Selection
# Find best lambda
x_tr <- traintransf
x_ts <- testtransf
cv_model <- cv.glmnet(as.matrix(x_tr[,-1]),
as.matrix(x_tr[,'OILPRICE_diff']), alpha = 0, nlambda = 1000,
nfolds=20, standardize = TRUE, type.measure = "mae")
best_lambda <- cv_model$lambda.min
# Fit the model with best lambda
best_model <- glmnet(as.matrix(x_tr[,-1]),
as.matrix(x_tr[,'OILPRICE_diff']),
alpha = 0, lambda = best_lambda, standardize = TRUE)
# Check predictions and adjustment
#predict(best_model, s = best_lambda, newx = as.matrix(x_ts[,-ncol(x_ts)]))
best_model$dev.ratio## [1] 0.02747424
# Create a dataset with selected vars
# Select features
df_coef <- round(as.matrix(coef(cv_model, s=cv_model$lambda.min)), 2)
# See all contributing variables
selected <- df_coef[df_coef[, 1] != 0, ]
data_sel <- x_tr[,c(which(colnames(x_tr) %in% names(selected)))]Fitting the Model
Models will be fit considering original data and transformed data. First, I will consider the models with all variables and both datasets (transformed and original).
Base Model - original data
# Base model - AIC: -3967.455
model_base <- gamlss(OILPRICE ~ ., data = trainori,
trace = FALSE, family = BCT)
shapiro.test(model_base$residuals)##
## Shapiro-Wilk normality test
##
## data: model_base$residuals
## W = 0.99791, p-value = 0.5469
checkresiduals(model_base$residuals, DF=14)summary(model_base)## ******************************************************************
## Family: c("BCT", "Box-Cox t")
##
## Call: gamlss(formula = OILPRICE ~ ., family = BCT, data = trainori,
## trace = FALSE)
##
## Fitting method: RS()
##
## ------------------------------------------------------------------
## Mu link function: identity
## Mu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.050979 0.289920 3.625 0.000310 ***
## CL2_log 0.106439 0.153404 0.694 0.488015
## BDIY_log_lag -0.005750 0.002367 -2.429 0.015395 *
## SPX_log_lag -0.045602 0.014465 -3.153 0.001688 **
## DX1_log_lag -0.001218 0.037210 -0.033 0.973905
## GC1_log_lag -0.058847 0.016499 -3.567 0.000386 ***
## HO1_log_lag 0.023642 0.017051 1.387 0.166024
## USCI_log_lag 0.004198 0.020477 0.205 0.837624
## GNR_log_lag 0.029555 0.023574 1.254 0.210378
## SHCOMP_log_lag -0.035153 0.009985 -3.521 0.000459 ***
## respLAG 0.865374 0.146639 5.901 5.65e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Sigma link function: log
## Sigma Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.09474 0.03668 -166.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Nu link function: identity
## Nu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.61 12.67 1.232 0.218
##
## ------------------------------------------------------------------
## Tau link function: log
## Tau Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.2217 0.1049 11.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## No. of observations in the fit: 700
## Degrees of Freedom for the fit: 14
## Residual Deg. of Freedom: 686
## at cycle: 10
##
## Global Deviance: -3995.455
## AIC: -3967.455
## SBC: -3903.74
## ******************************************************************
Residuals are normally distributed, with mean close to zero and uncorrelated.
Base Model - transformed data
# Base model transf data - -3965.401
model_t <- gamlss(OILPRICE_diff ~ ., data = traintransf,
trace = FALSE, family = TF) # can not use boxcox because of negative values
shapiro.test(model_t$residuals)##
## Shapiro-Wilk normality test
##
## data: model_t$residuals
## W = 0.99562, p-value = 0.04592
checkresiduals(model_t$residuals, DF=13)summary(model_t)## ******************************************************************
## Family: c("TF", "t Family")
##
## Call: gamlss(formula = OILPRICE_diff ~ ., family = TF, data = traintransf,
## trace = FALSE)
##
## Fitting method: RS()
##
## ------------------------------------------------------------------
## Mu link function: identity
## Mu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0003739 0.0004970 -0.752 0.4521
## CL2_log_diff -0.3340934 0.2847331 -1.173 0.2411
## BDIY_log_lag_diff 0.0030997 0.0202904 0.153 0.8786
## SPX_log_lag_diff -0.0375782 0.1216444 -0.309 0.7575
## DX1_log_lag_diff -0.0944434 0.1479443 -0.638 0.5234
## GC1_log_lag_diff 0.0439346 0.0551174 0.797 0.4257
## HO1_log_lag_diff 0.0895552 0.0510620 1.754 0.0799 .
## USCI_log_lag_diff 0.0359061 0.1118945 0.321 0.7484
## GNR_log_lag_diff -0.0575744 0.1117780 -0.515 0.6067
## SHCOMP_log_lag_diff 0.0014158 0.0462602 0.031 0.9756
## respLAG_diff 0.2150485 0.2770158 0.776 0.4378
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Sigma link function: log
## Sigma Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.5399 0.0481 -94.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Nu link function: log
## Nu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3462 0.1541 8.734 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## No. of observations in the fit: 700
## Degrees of Freedom for the fit: 13
## Residual Deg. of Freedom: 687
## at cycle: 8
##
## Global Deviance: -3986.104
## AIC: -3960.104
## SBC: -3900.94
## ******************************************************************
Residuals are normally distributed, with mean close to zero and uncorrelated.
Model with Original Data and Cubic Splines
# Base model Cubic Splines - AIC: -3896.047
model_base_cs <- gamlss(OILPRICE ~ cs(CL2_log) + cs(BDIY_log_lag) +
cs(SPX_log_lag) + cs(DX1_log_lag) + cs(GC1_log_lag) +
+ cs(HO1_log_lag) + cs(USCI_log_lag) + cs(GNR_log_lag) +
cs(SHCOMP_log_lag) + cs(respLAG),
data = trainori,
trace = FALSE, family = BCCG)
shapiro.test(model_base_cs$residuals)
checkresiduals(model_base_cs$residuals, DF = 43)summary(model_base_cs)
## Important vars: SPX, DX1, GC1, SHCOMP, resplagResiduals are not normally distributed, the mean is close to zero but the residuals seems to be slightly correlated.
Model with Transformed Data and Cubic Splines on Covariables
# Base model transf data cubic splines - -3974.824
model_t_cs <- gamlss(OILPRICE_diff ~ cs(CL2_log_diff) + BDIY_log_lag_diff +
SPX_log_lag_diff + DX1_log_lag_diff + GC1_log_lag_diff +
+ cs(HO1_log_lag_diff) + USCI_log_lag_diff + GNR_log_lag_diff +
SHCOMP_log_lag_diff + cs(respLAG_diff), data = traintransf,
trace = FALSE, family = TF) # can not use boxcox because of negative values
shapiro.test(model_t_cs$residuals)
checkresiduals(model_t_cs$residuals, DF = 42)summary(model_t_cs)
## Important vars: SPX, GNR, SHCOMPResiduals are normally distributed by Shapiro Test, with mean close to zero and uncorrelated in most of the cases.
Variable Selection
# Var Selection - base model
stepGAIC(model_base)
#Step: AIC= -3973.58
#formula = OILPRICE ~ BDIY_log_lag + SPX_log_lag +
# GC1_log_lag + HO1_log_lag + GNR_log_lag + SHCOMP_log_lag +
# respLAG, family = BCT, data = trainori, trace = FALSE
# Var Selection - transf data model
stepGAIC(model_t)
#Step: AIC= -3974.05
#formula = OILPRICE_diff ~ CL2_log_diff + HO1_log_lag_diff,
# family = TF, data = traintransf, trace = FALSEModels with Selected Variables
model_sel <- gamlss(OILPRICE ~ BDIY_log_lag +
SPX_log_lag + GC1_log_lag +
HO1_log_lag + GNR_log_lag +
SHCOMP_log_lag + respLAG,
data = trainori,
trace = FALSE, family = BCCG)
model_sel_t <- gamlss(OILPRICE_diff ~ CL2_log_diff +
HO1_log_lag_diff, data = traintransf,
trace = FALSE, family = TF)Models with Selected Variables - original data and cs covariables
# Base with selected vars and cs - -3844.406
model_sel_cs <- gamlss(OILPRICE ~ BDIY_log_lag +
SPX_log_lag + GC1_log_lag +
HO1_log_lag + GNR_log_lag +
SHCOMP_log_lag + cs(respLAG),
data = trainori,
trace = FALSE, family = BCCG)
shapiro.test(model_sel_cs$residuals)
checkresiduals(model_sel_cs$residuals, DF = 42)summary(model_sel_cs)Models with Selected Variables - transformed data and cs covariables
# Model transformed with sel vars and cs AIC -3987.72
model_sel_t_cs <- gamlss(OILPRICE_diff ~ cs(CL2_log_diff) +
cs(HO1_log_lag_diff), data = traintransf,
trace = FALSE, family = TF)
shapiro.test(model_sel_t_cs$residuals)
checkresiduals(model_sel_t_cs$residuals, DF = 42)summary(model_sel_t_cs)Residuals are not normally distributed by Shapiro Test, with mean close to zero and uncorrelated in most of the cases.
Model with Selected Variables Residuals
plot(model_sel)## ******************************************************************
## Summary of the Quantile Residuals
## mean = -5.203918e-06
## variance = 1.001431
## coef. of skewness = 0.0577843
## coef. of kurtosis = 9.062829
## Filliben correlation coefficient = 0.9640789
## ******************************************************************
wp(model_sel)## Warning in wp(model_sel): Some points are missed out
## increase the y limits using ylim.all
Validation
base_pred <- predict(model_base, newdata = testori, type = "response")
base_cs_pred <- predict(model_base_cs, newdata = testori, type = "response")## new prediction
sel_pred <- predict(model_sel, newdata = testori, type = "response")
sel_cs_pred <- predict(model_sel_cs, newdata = testori, type = "response")## new prediction
sel_t_pred <- predict(model_sel_t, newdata = testtransf, type = "response")
sel_t_cs_pred <- predict(model_sel_t_cs, newdata = testtransf, type = "response")## new prediction
t_pred <- predict(model_t, newdata = testtransf, type = "response")
t_cs_pred <- predict(model_t_cs, newdata = testtransf, type = "response")## new prediction
accuracy_values <- rbind(
accuracy(base_pred, testori[,"OILPRICE"]),
accuracy(base_cs_pred, testori[,"OILPRICE"]),
accuracy(sel_pred, testori[,"OILPRICE"]),
accuracy(sel_cs_pred, testori[,"OILPRICE"]),
accuracy(sel_t_pred, testtransf[,"OILPRICE_diff"]),
accuracy(sel_t_cs_pred, testtransf[,"OILPRICE_diff"]),
accuracy(t_pred, testtransf[,"OILPRICE_diff"]),
accuracy(t_cs_pred, testtransf[,"OILPRICE_diff"]))
row.names(accuracy_values) <- c("base_pred", "base_cs_pred", "sel_pred",
"sel_cs_pred", "sel_t_pred", "sel_t_cs_pred",
"t_pred", "t_cs_pred")
accuracy_values## ME RMSE MAE MPE MAPE
## base_pred 0.0120680335 0.03528933 0.02771349 0.3062927 0.7352158
## base_cs_pred 0.0377049649 0.08824671 0.06854488 0.8922033 1.7822138
## sel_pred 0.0112645887 0.03538602 0.02802574 0.2827468 0.7426300
## sel_cs_pred -0.0066787269 0.04265551 0.03333156 -0.2213765 0.8948030
## sel_t_pred -0.0001800693 0.03291055 0.02513054 97.1018507 114.6079746
## sel_t_cs_pred 0.0017576171 0.03344413 0.02546063 94.4076601 124.0153016
## t_pred -0.0002693446 0.03300739 0.02517636 99.2155333 113.4286559
## t_cs_pred 0.0020826050 0.03349348 0.02549447 97.6230640 124.6598126
Considering these measures of errors, the transformation of the data had not good results. Three models shows good performance, with less than 1% of errors. The Base Model, Model with only the Selected Variables and the Model with Selected Variables & CS. The Model with Selected Variables is chosen to make new predictions because it has a smaller number of terms and it is more generalisable and simple.
Confidence Interval
# Limits for 99% of confindence
critval <- 2.80
upr <- sel_pred + (critval * (sel_pred-testori$OILPRICE))
lwr <- sel_pred - (critval * (sel_pred-testori$OILPRICE))Probabilistic Forecast
sel_predts <- ts(sel_pred, frequency = 365, start = c(2019,336))
trainorits <- ts(trainori, frequency = 365, start = c(2018,1,1))
testorits <- ts(testori, frequency = 365, start = c(2019,336))
upper <- ts(upr, frequency = 365, start = c(2019,336))
lower <- ts(lwr, frequency = 365, start = c(2019,336))
upr <- c(trainori$OILPRICE, upr)
lwr <- c(trainori$OILPRICE, lwr)
g <- autoplot(oilts1[,"OILPRICE"]) +
autolayer(sel_predts, color="red") +
# autolayer(upper) +
# autolayer(lower) +
xlab("Year") + ylab("Oil Price") +
ggtitle("Probabilistic Forecast with Confidence of 99%") +
guides(colour=guide_legend(title="Probabilistic Forecast for Oil Price"))
g + geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "gray", alpha = 0.6)# OILPRICE ~ BDIY_log_lag + SPX_log_lag + GC1_log_lag +
# HO1_log_lag + GNR_log_lag +
# SHCOMP_log_lag + respLAG,
testing <- data.frame("BDIY_log_lag"=6.579251,
"SPX_log_lag"=7.626482,
"GC1_log_lag"=7.166073,
"HO1_log_lag"=0.4987733,
"GNR_log_lag"=3.771841,
"SHCOMP_log_lag"=8.117241,
"respLAG"=3.809990)
#calculate the number of model parameters - 1
k <- length(model_sel$coefficients)-1
#calculate sum of squared residuals
SSE <- sum(model_sel$residuals**2)
#calculate total observations in dataset
n <- length(model_sel$residuals)
#calculate residual standard error
stderror <- sqrt(SSE/(n-(1+k)))
conf <- round(1.96*stderror, 2)
print(paste0("The Oil Price tomorrow will be ", round(predict(model_sel, newdata = testing, type = "response"),2), "+/-", conf, " with 95% of confindence. Considering the Mean Absolute Percentage Error, the prediction are different from the real value less than 1%. Then, it will probably be between ", round(predict(model_sel, newdata = testing, type = "response"),2)-round(predict(model_sel, newdata = testing, type = "response"),2)*0.01, " and ", round(predict(model_sel, newdata = testing, type = "response"),2)+round(predict(model_sel, newdata = testing, type = "response"),2)*0.01))## [1] "The Oil Price tomorrow will be 3.8+/-1.96 with 95% of confindence. Considering the Mean Absolute Percentage Error, the prediction are different from the real value less than 1%. Then, it will probably be between 3.762 and 3.838"