TS analysis and forecast for European oil consumption

#Libraries
library(tidyverse)
library(broom)
library(dplyr) 
library(ggplot2)
library(psych)

#TS graphs
library(tseries)
library(TSstudio)
library(forecast)
library(plotly)

library(hrbrthemes)
library(viridis)
library(forcats)

#forecasts errors
library(MLmetrics) 

#LaTeX
library(kableExtra)

Downloading data

dat_strange_or = read_csv('C:/Users/Kolya/Documents/R/R files/TS_pr_1/BP-OIL_CONSUM_EUR.csv')

## Rows: 56 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl  (1): Value
## date (1): Date
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

dat_strange_show <- dat_strange_or[1:51, ]
full_ts <- dat_strange_or[2:51,]
dat_strange <- dat_strange_or[2:51, ]

Splitting the data set into samples: training (1970-2009), cross-validation (2010-2014) and test (2015-2019).

#Ts from 1970 till 2019
full_ts_new <- rev(full_ts[2]$Value)
full_ts_new <- ts(full_ts_new, start = 1970, end = 2019)

#Base Ts for log-dif + % change
Ts_testing_extra <- full_ts_new[1:39]
Ts_testing_extra <- ts(Ts_testing_extra, start = 1970, end = 2009)

#First Difference (for validation and prediction)
full_ts_dif <- full_ts_new - stats::lag(full_ts_new, -1)

#Second Difference
full_ts_dif_2 <- diff(full_ts_dif, differences = 2)
full_ts_dif_2 <- full_ts_dif_2[1:39]
full_ts_dif_2 <- ts(full_ts_dif_2, start = 1970, end = 2009)

dif_oil_TS <- full_ts_dif[1:39] #40-1
dat_strange_val <- full_ts_dif[1:45] #40 - 1 + 6
dat_strange_pred <- full_ts_dif[46:49] # 40 - 1 + 6 + 4 -all

#Training TS
dif_oil_TS <- ts(dif_oil_TS, start = 1970, end = 2009)

#Cross-validation TS
oil_TS_val <- ts(dat_strange_val, start = 1970, end = 2014)

#Forecast TS
oil_TS_pred <- ts(dat_strange_pred, start = 2015, end = 2019)

Data analysis - Oil consumption in Europe (1970-2020)

#TS with 2020
oil_TS_vec_show <- rev(dat_strange_show[2]$Value)
oil_TS_show <- ts(oil_TS_vec_show, start = 1970, end = 2020)

# oil_TS_show %>% summary()
show_stat <- oil_TS_show %>% describe()
show_stat <- show_stat[c(2:5, 8:9)]
show_stat

ts_plot(oil_TS_show,
        title = "Oil consumption in Europe, 2020",
        Xtitle = "Time",
        Ytitle = "",
        color =  "green",
        Xgrid = TRUE,
        Ygrid = TRUE) %>%
  layout(paper_bgcolor = "white",
         plot_bgcolor = "white",
         font = list(color = "black"),
         yaxis = list(linecolor = "#6b6b6b",
                      zerolinecolor = "#6b6b6b",
                      gridcolor= "#444444"),
         xaxis = list(linecolor = "#6b6b6b",
                      zerolinecolor = "#6b6b6b",
                      gridcolor= "#444444"))

Data analysis - Oil consumption in Europe (1970-2019). Eliminating year 2020 from the data, leaving values 1970-2019. The number of values changed from 51 to 50.

#TS without 2020
oil_TS_vec <- rev(dat_strange[2]$Value)
oil_TS <- ts(oil_TS_vec, start = 1970, end = 2019)

# oil_TS %>% summary()
oil_stat <- oil_TS %>% describe()
oil_stat <- oil_stat[c(2:5, 8:9)]
oil_stat #min is higher, because there is no 2020 shift

variance_oil = round(var(oil_TS),3)
sd_oil = round(sd(oil_TS),3)

ts_plot(oil_TS,
        title = "Oil consumption in Europe (1970-2019)",
        Xtitle = "Time",
        Ytitle = "",
        color =  "green",
        Xgrid = TRUE,
        Ygrid = TRUE) %>%
  layout(paper_bgcolor = "white",
         plot_bgcolor = "white",
         font = list(color = "black"),
         yaxis = list(linecolor = "#6b6b6b",
                      zerolinecolor = "#6b6b6b",
                      gridcolor= "#444444"),
         xaxis = list(linecolor = "#6b6b6b",
                      zerolinecolor = "#6b6b6b",
                      gridcolor= "#444444"))

ts_plot(full_ts_dif,
        title = "1st difference oil consumption in Europe (1970-2019)",
        Xtitle = "Time",
        Ytitle = "",
        color =  "green",
        Xgrid = TRUE,
        Ygrid = TRUE) %>%
  layout(paper_bgcolor = "white",
         plot_bgcolor = "white",
         font = list(color = "black"),
         yaxis = list(linecolor = "#6b6b6b",
                      zerolinecolor = "#6b6b6b",
                      gridcolor= "#444444"),
         xaxis = list(linecolor = "#6b6b6b",
                      zerolinecolor = "#6b6b6b",
                      gridcolor= "#444444"))

dif_stat <- dif_oil_TS %>% describe()
dif_stat <- dif_stat[c(2:5, 8:9)]

variance_dif_oil = round(var(dif_oil_TS),3)
sd_dif_oil = round(sd(dif_oil_TS),3)
dif_stat

ts_plot(dif_oil_TS,
        title = "Training sample - Oil consumption in Europe, 1st difference",
        Xtitle = "Time",
        Ytitle = "",
        color =  "green",
        Xgrid = TRUE,
        Ygrid = TRUE) %>%
  layout(paper_bgcolor = "white",
         plot_bgcolor = "white",
         font = list(color = "black"),
         yaxis = list(linecolor = "#6b6b6b",
                      zerolinecolor = "#6b6b6b",
                      gridcolor= "#444444"),
         xaxis = list(linecolor = "#6b6b6b",
                      zerolinecolor = "#6b6b6b",
                      gridcolor= "#444444"))

ts_plot(full_ts_dif_2,
        title = "Training sample - Oil consumption in Europe, 2nd difference",
        Xtitle = "Time",
        Ytitle = "",
        color =  "green",
        Xgrid = TRUE,
        Ygrid = TRUE) %>%
  layout(paper_bgcolor = "white",
         plot_bgcolor = "white",
         font = list(color = "black"),
         yaxis = list(linecolor = "#6b6b6b",
                      zerolinecolor = "#6b6b6b",
                      gridcolor= "#444444"),
         xaxis = list(linecolor = "#6b6b6b",
                      zerolinecolor = "#6b6b6b",
                      gridcolor= "#444444"))

Checking the stationarity based on Dickey-Fuller test, Phillips-Perron Unit Root Test and KPSS test. Conducting autocorrelation function and partial autocorrelation function.

#Augmented Dickey-Fuller test
#H0: non-stationary
#H1: stationary
adf.test(oil_TS, alternative = 'stationary', k=1) #st

## 
##  Augmented Dickey-Fuller Test
## 
## data:  oil_TS
## Dickey-Fuller = -3.5126, Lag order = 1, p-value = 0.04929
## alternative hypothesis: stationary

adf.test(dif_oil_TS, alternative = 'stationary', k=1) #st

## Warning in adf.test(dif_oil_TS, alternative = "stationary", k = 1): p-value
## smaller than printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  dif_oil_TS
## Dickey-Fuller = -4.6489, Lag order = 1, p-value = 0.01
## alternative hypothesis: stationary

#Phillips-Perron Unit Root Test
#H0: non-stationary
#H1: stationary
pp.test(oil_TS, alternative = 'stationary') #nonst

## 
##  Phillips-Perron Unit Root Test
## 
## data:  oil_TS
## Dickey-Fuller Z(alpha) = -14.945, Truncation lag parameter = 3, p-value
## = 0.2108
## alternative hypothesis: stationary

pp.test(dif_oil_TS, alternative = 'stationary') #st

## Warning in pp.test(dif_oil_TS, alternative = "stationary"): p-value smaller than
## printed p-value

## 
##  Phillips-Perron Unit Root Test
## 
## data:  dif_oil_TS
## Dickey-Fuller Z(alpha) = -26.046, Truncation lag parameter = 3, p-value
## = 0.01
## alternative hypothesis: stationary

#KPSS test
#H0: stationary
#H1: non-stationary
kpss.test(oil_TS) #st

## Warning in kpss.test(oil_TS): p-value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  oil_TS
## KPSS Level = 0.30114, Truncation lag parameter = 3, p-value = 0.1

kpss.test(dif_oil_TS) #st

## Warning in kpss.test(dif_oil_TS): p-value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  dif_oil_TS
## KPSS Level = 0.091596, Truncation lag parameter = 3, p-value = 0.1

#Autocorrelation function
Acf(oil_TS)

Acf(dif_oil_TS)

# Only last percentage change for all observations.
Pacf(oil_TS)

Pacf(dif_oil_TS)

#summary - the original TS is non-stationary

After enumerating models, we choose top five models.

# p, q <= 5 (macro data)

bic_name_list <- c()
bic_value_list <- c()

for(p in 0:5){
  for (q in 0:5){
    #Model
    arma_model <- Arima(dif_oil_TS, order = c(p,0,q))
    arma_name <- paste("arma", p, q, sep='')
    assign(arma_name, arma_model)

    #BIC
    bic_name <- paste("bic", p, q, sep='')
    bic_value = arma_model$bic
    # assign(arma_bic, bic)
    bic_name_list <- append(bic_name_list, bic_name)
    bic_value_list <- append(bic_value_list, bic_value)
  }
}

arma_df <- as.data.frame(bic_name_list)
arma_df$inf_cr <- bic_value_list
names(arma_df) <- c('bic_names', 'bic_values')

min_bics <- arma_df[order(arma_df$bic_values, decreasing = FALSE)[1:5], ]
min_bics #best 5 models

#for naive predictor ARMA(0,0)
arma00$bic

## [1] 390.6967

Conducting residual tests, using Ljung-Box and Box-Pierce tests.

arma_list <- list(arma04, arma21, arma12, arma05, arma03, arma00)

#"Ljung-Box" - more about autocorrelation
# H0 - no autocorrelation - WWN
# H1 - autocorrelation (bad)

print("Ljung-Box test")

## [1] "Ljung-Box test"

 for (ar_ma in arma_list){
   print(paste('p-value:',ar_ma, Box.test(ar_ma$residuals, type = "Ljung-Box")$p.value))
 }

## [1] "p-value: ARIMA(0,0,4) with non-zero mean 0.732160889073166"
## [1] "p-value: ARIMA(2,0,1) with non-zero mean 0.859572663797253"
## [1] "p-value: ARIMA(1,0,2) with non-zero mean 0.919217612337316"
## [1] "p-value: ARIMA(0,0,5) with non-zero mean 0.767953832828051"
## [1] "p-value: ARIMA(0,0,3) with non-zero mean 0.674196310146899"
## [1] "p-value: ARIMA(0,0,0) with non-zero mean 0.112461534982094"

#arma04 - good
#arma21 - good
#arma12 - good
#arma05 - good
#arma03- good
#arma00- naive

#"Box-Pierce" - more about WWN - Q-test
# H0 - no autocorrelation - WWN
# H1 - autororrelation (bad)

print("Box-Pierce test")

## [1] "Box-Pierce test"

 for (ar_ma in arma_list){
   print(paste('p-value:',ar_ma, Box.test(ar_ma$residuals, type = "Box-Pierce")$p.value))
 }

## [1] "p-value: ARIMA(0,0,4) with non-zero mean 0.741549010719818"
## [1] "p-value: ARIMA(2,0,1) with non-zero mean 0.864630603638103"
## [1] "p-value: ARIMA(1,0,2) with non-zero mean 0.922146623418492"
## [1] "p-value: ARIMA(0,0,5) with non-zero mean 0.776165444169754"
## [1] "p-value: ARIMA(0,0,3) with non-zero mean 0.685401387451779"
## [1] "p-value: ARIMA(0,0,0) with non-zero mean 0.126143273901751"

#arma04 - good
#arma21 - good
#arma12 - good
#arma05 - good
#arma03 - good
#arma00 - naive

Using the testing sample, we made a forecast on the validation sample and estimated different loss functions from the differences between the forecast and the actual values.

arma_order = list('04', '21', '12', '05', '03', '00')

for (i in 1:length(arma_list)){
  f <- function(y, h) forecast(arma_list[[i]], h = 1)
  f_name <- arma_order[i]
  assign(paste("f", f_name, sep = '_'), f)
  f_temp <- assign(paste("f", f_name, sep = '_'), f)
  e <- tsCV(dat_strange_val, f_temp, h = 1)
  assign(paste("e", f_name, sep = '_'), e)
 }

rbind(accuracy(ts(dat_strange_val- e_04), ts(dat_strange_val)),
      accuracy(ts(dat_strange_val - e_21), ts(dat_strange_val)),
      accuracy(ts(dat_strange_val - e_12), ts(dat_strange_val)),
      accuracy(ts(dat_strange_val - e_05), ts(dat_strange_val)),
      accuracy(ts(dat_strange_val - e_03), ts(dat_strange_val)),
      accuracy(ts(dat_strange_val - e_00), ts(dat_strange_val))) %>%
  as.data.frame() %>%
  round(3) %>% 
  mutate(model = c("ARMA(0,4)",
                   "ARMA(2,1)",
                   "ARMA(1,2)",
                   "ARMA(0,5)",
                   "ARMA(0,3)",
                   "ARMA(0,0)")) %>%
  dplyr::select(model, everything()) %>%
  knitr::kable()

	model	ME	RMSE	MAE	MPE	MAPE	ACF1	Theil’s U
Test.set	ARMA(0,4)	-26.689	38.445	33.226	25.109	329.668	0.271	0.936
Test.set.1	ARMA(2,1)	-26.449	38.278	33.040	25.358	327.558	0.271	0.928
Test.set.2	ARMA(1,2)	-47.045	54.580	49.389	4.053	514.367	0.271	1.587
Test.set.3	ARMA(0,5)	-33.108	43.149	38.186	18.470	385.980	0.271	1.134
Test.set.4	ARMA(0,3)	-49.477	56.689	51.379	1.538	537.254	0.271	1.668
Test.set.5	ARMA(0,0)	-3.122	27.847	20.703	49.487	197.383	0.271	0.465

#ME - mean error
#RMSE - root mean squared error
#MAE - mean absolute error
#MPE - mean percentage error
#MAPE - mean absolute percentage error
#ACF1 - autocorrelation 1
#Theil' U - another measure of accuracy

#Due to all errors measures except MPE, naive model ARMA(0,0) is the best!
# checkresiduals(arma00)
#top2 - ARMA(2,1)

We have tested our forecast model on the testing sample. It is a simple forecast, but it is inside of the confidence interval, even for 0.8 one.

arma00_graph <- Arima(oil_TS_val, c(0, 0, 0))
fc00 <- forecast(arma00_graph, h = 5)

dif_oil_TS_graph <- ts(full_ts_dif, start = 1970, end = 2019)

autoplot(fc00) + autolayer(dif_oil_TS_graph)

fc00

##      Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## 2015     -0.1108012 -36.15996 35.93835 -55.24322 55.02162
## 2016     -0.1108012 -36.15996 35.93835 -55.24322 55.02162
## 2017     -0.1108012 -36.15996 35.93835 -55.24322 55.02162
## 2018     -0.1108012 -36.15996 35.93835 -55.24322 55.02162
## 2019     -0.1108012 -36.15996 35.93835 -55.24322 55.02162

accuracy(fc00)

##                         ME   RMSE      MAE      MPE     MAPE      MASE
## Training set -3.644076e-16 27.815 20.87459 100.1262 100.1262 0.9407392
##                   ACF1
## Training set 0.3119246

# Fc00 <- Arima(oil_TS_pred, model = arma00)
# Fc00$fitted

arma00_graph <- Arima(oil_TS_val, c(2, 0, 1))
fc00 <- forecast(arma00_graph, h = 5)

dif_oil_TS_graph <- ts(full_ts_dif, start = 1970, end = 2019)

autoplot(fc00) + autolayer(dif_oil_TS_graph)

fc00

##      Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## 2015     2.07710546 -30.27139 34.42560 -47.39565 51.54986
## 2016     1.08733337 -35.60173 37.77639 -55.02374 57.19841
## 2017     0.06190523 -36.91529 37.03910 -56.48983 56.61364
## 2018     0.42329814 -36.73825 37.58484 -56.41038 57.25697
## 2019     0.16500030 -37.05481 37.38481 -56.75779 57.08779

accuracy(fc00)

##                      ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -0.2321904 23.85044 17.59298 70.03307 125.1106 0.7928494
##                      ACF1
## Training set -0.006110017

Therefore, it was decided to build an actual out-sample forecast. It is assumed that a short-term forecast is better, and the forecast is done for only three years, given our small sample.

# Fc00 <- Arima(oil_TS_pred, model = arma00)
# Fc00$fitted
arma00_graph <- Arima(full_ts_dif, c(0, 0, 0))
fc00 <- forecast(arma00_graph, h = 3)

dif_oil_TS_graph <- ts(full_ts_dif, start = 1971, end = 2019)

autoplot(fc00) + autolayer(dif_oil_TS_graph)

fc00

##      Point Forecast     Lo 80    Hi 80    Lo 95    Hi 95
## 2020      0.3666951 -34.35241 35.08581 -52.7316 53.46499
## 2021      0.3666951 -34.35241 35.08581 -52.7316 53.46499
## 2022      0.3666951 -34.35241 35.08581 -52.7316 53.46499

accuracy(fc00)

##                       ME    RMSE     MAE      MPE     MAPE      MASE      ACF1
## Training set 1.88414e-15 26.8136 19.8588 100.2262 100.2262 0.9556164 0.3217259

#in confidece interval- not bad forecast

# Fc00 <- Arima(oil_TS_pred, model = arma00)
# Fc00$fitted

arma00_graph <- Arima(full_ts_dif, c(2, 0, 1))
fc00 <- forecast(arma00_graph, h = 3)

dif_oil_TS_graph <- ts(full_ts_dif, start = 1971, end = 2019)

autoplot(fc00, xlab = "Time", ylab = "Survival",
    title = "Marks show times with censoring", 
    legendLabs = '1st difference of oil consumption') + autolayer(dif_oil_TS_graph)

fc00

##      Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## 2020       5.460705 -26.05668 36.97809 -42.74098 53.66239
## 2021       8.065588 -24.26436 40.39554 -41.37880 57.50998
## 2022       7.219484 -25.48967 39.92864 -42.80485 57.24382

accuracy(fc00)

##                    ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set 3.159206 23.34879 17.19034 98.02276 126.6793 0.8272086 0.01281378

Actually, it is hard to say something precise about our forecast because of its simplicity and limitation of the data. Moreover, this forecasting model has some limitations due to the fact that oil consumption is influenced by many factors, which are not taken into account. However, we can say that the predicted values are 0.37. Therefore, the European consumption of oil is predicted to grow on 0.37 million tonnes. As we mentioned before, this forecast is mostly applicable for the post-covid reality.