#Libraries
library(tidyverse)
library(broom)
library(dplyr)
library(ggplot2)
library(psych)
#TS graphs
library(tseries)
library(TSstudio)
library(forecast)
library(plotly)
library(hrbrthemes)
library(viridis)
library(forcats)
#forecasts errors
library(MLmetrics)
#LaTeX
library(kableExtra)
Downloading data
dat_strange_or = read_csv('C:/Users/Kolya/Documents/R/R files/TS_pr_1/BP-OIL_CONSUM_EUR.csv')
## Rows: 56 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (1): Value
## date (1): Date
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
dat_strange_show <- dat_strange_or[1:51, ]
full_ts <- dat_strange_or[2:51,]
dat_strange <- dat_strange_or[2:51, ]
Splitting the data set into samples: training (1970-2009), cross-validation (2010-2014) and test (2015-2019).
#Ts from 1970 till 2019
full_ts_new <- rev(full_ts[2]$Value)
full_ts_new <- ts(full_ts_new, start = 1970, end = 2019)
#Base Ts for log-dif + % change
Ts_testing_extra <- full_ts_new[1:39]
Ts_testing_extra <- ts(Ts_testing_extra, start = 1970, end = 2009)
#First Difference (for validation and prediction)
full_ts_dif <- full_ts_new - stats::lag(full_ts_new, -1)
#Second Difference
full_ts_dif_2 <- diff(full_ts_dif, differences = 2)
full_ts_dif_2 <- full_ts_dif_2[1:39]
full_ts_dif_2 <- ts(full_ts_dif_2, start = 1970, end = 2009)
dif_oil_TS <- full_ts_dif[1:39] #40-1
dat_strange_val <- full_ts_dif[1:45] #40 - 1 + 6
dat_strange_pred <- full_ts_dif[46:49] # 40 - 1 + 6 + 4 -all
#Training TS
dif_oil_TS <- ts(dif_oil_TS, start = 1970, end = 2009)
#Cross-validation TS
oil_TS_val <- ts(dat_strange_val, start = 1970, end = 2014)
#Forecast TS
oil_TS_pred <- ts(dat_strange_pred, start = 2015, end = 2019)
Data analysis - Oil consumption in Europe (1970-2020)
#TS with 2020
oil_TS_vec_show <- rev(dat_strange_show[2]$Value)
oil_TS_show <- ts(oil_TS_vec_show, start = 1970, end = 2020)
# oil_TS_show %>% summary()
show_stat <- oil_TS_show %>% describe()
show_stat <- show_stat[c(2:5, 8:9)]
show_stat
ts_plot(oil_TS_show,
title = "Oil consumption in Europe, 2020",
Xtitle = "Time",
Ytitle = "",
color = "green",
Xgrid = TRUE,
Ygrid = TRUE) %>%
layout(paper_bgcolor = "white",
plot_bgcolor = "white",
font = list(color = "black"),
yaxis = list(linecolor = "#6b6b6b",
zerolinecolor = "#6b6b6b",
gridcolor= "#444444"),
xaxis = list(linecolor = "#6b6b6b",
zerolinecolor = "#6b6b6b",
gridcolor= "#444444"))
Data analysis - Oil consumption in Europe (1970-2019). Eliminating year 2020 from the data, leaving values 1970-2019. The number of values changed from 51 to 50.
#TS without 2020
oil_TS_vec <- rev(dat_strange[2]$Value)
oil_TS <- ts(oil_TS_vec, start = 1970, end = 2019)
# oil_TS %>% summary()
oil_stat <- oil_TS %>% describe()
oil_stat <- oil_stat[c(2:5, 8:9)]
oil_stat #min is higher, because there is no 2020 shift
variance_oil = round(var(oil_TS),3)
sd_oil = round(sd(oil_TS),3)
ts_plot(oil_TS,
title = "Oil consumption in Europe (1970-2019)",
Xtitle = "Time",
Ytitle = "",
color = "green",
Xgrid = TRUE,
Ygrid = TRUE) %>%
layout(paper_bgcolor = "white",
plot_bgcolor = "white",
font = list(color = "black"),
yaxis = list(linecolor = "#6b6b6b",
zerolinecolor = "#6b6b6b",
gridcolor= "#444444"),
xaxis = list(linecolor = "#6b6b6b",
zerolinecolor = "#6b6b6b",
gridcolor= "#444444"))
ts_plot(full_ts_dif,
title = "1st difference oil consumption in Europe (1970-2019)",
Xtitle = "Time",
Ytitle = "",
color = "green",
Xgrid = TRUE,
Ygrid = TRUE) %>%
layout(paper_bgcolor = "white",
plot_bgcolor = "white",
font = list(color = "black"),
yaxis = list(linecolor = "#6b6b6b",
zerolinecolor = "#6b6b6b",
gridcolor= "#444444"),
xaxis = list(linecolor = "#6b6b6b",
zerolinecolor = "#6b6b6b",
gridcolor= "#444444"))
dif_stat <- dif_oil_TS %>% describe()
dif_stat <- dif_stat[c(2:5, 8:9)]
variance_dif_oil = round(var(dif_oil_TS),3)
sd_dif_oil = round(sd(dif_oil_TS),3)
dif_stat
ts_plot(dif_oil_TS,
title = "Training sample - Oil consumption in Europe, 1st difference",
Xtitle = "Time",
Ytitle = "",
color = "green",
Xgrid = TRUE,
Ygrid = TRUE) %>%
layout(paper_bgcolor = "white",
plot_bgcolor = "white",
font = list(color = "black"),
yaxis = list(linecolor = "#6b6b6b",
zerolinecolor = "#6b6b6b",
gridcolor= "#444444"),
xaxis = list(linecolor = "#6b6b6b",
zerolinecolor = "#6b6b6b",
gridcolor= "#444444"))
ts_plot(full_ts_dif_2,
title = "Training sample - Oil consumption in Europe, 2nd difference",
Xtitle = "Time",
Ytitle = "",
color = "green",
Xgrid = TRUE,
Ygrid = TRUE) %>%
layout(paper_bgcolor = "white",
plot_bgcolor = "white",
font = list(color = "black"),
yaxis = list(linecolor = "#6b6b6b",
zerolinecolor = "#6b6b6b",
gridcolor= "#444444"),
xaxis = list(linecolor = "#6b6b6b",
zerolinecolor = "#6b6b6b",
gridcolor= "#444444"))
Checking the stationarity based on Dickey-Fuller test, Phillips-Perron Unit Root Test and KPSS test. Conducting autocorrelation function and partial autocorrelation function.
#Augmented Dickey-Fuller test
#H0: non-stationary
#H1: stationary
adf.test(oil_TS, alternative = 'stationary', k=1) #st
##
## Augmented Dickey-Fuller Test
##
## data: oil_TS
## Dickey-Fuller = -3.5126, Lag order = 1, p-value = 0.04929
## alternative hypothesis: stationary
adf.test(dif_oil_TS, alternative = 'stationary', k=1) #st
## Warning in adf.test(dif_oil_TS, alternative = "stationary", k = 1): p-value
## smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: dif_oil_TS
## Dickey-Fuller = -4.6489, Lag order = 1, p-value = 0.01
## alternative hypothesis: stationary
#Phillips-Perron Unit Root Test
#H0: non-stationary
#H1: stationary
pp.test(oil_TS, alternative = 'stationary') #nonst
##
## Phillips-Perron Unit Root Test
##
## data: oil_TS
## Dickey-Fuller Z(alpha) = -14.945, Truncation lag parameter = 3, p-value
## = 0.2108
## alternative hypothesis: stationary
pp.test(dif_oil_TS, alternative = 'stationary') #st
## Warning in pp.test(dif_oil_TS, alternative = "stationary"): p-value smaller than
## printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: dif_oil_TS
## Dickey-Fuller Z(alpha) = -26.046, Truncation lag parameter = 3, p-value
## = 0.01
## alternative hypothesis: stationary
#KPSS test
#H0: stationary
#H1: non-stationary
kpss.test(oil_TS) #st
## Warning in kpss.test(oil_TS): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: oil_TS
## KPSS Level = 0.30114, Truncation lag parameter = 3, p-value = 0.1
kpss.test(dif_oil_TS) #st
## Warning in kpss.test(dif_oil_TS): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: dif_oil_TS
## KPSS Level = 0.091596, Truncation lag parameter = 3, p-value = 0.1
#Autocorrelation function
Acf(oil_TS)
Acf(dif_oil_TS)
# Only last percentage change for all observations.
Pacf(oil_TS)
Pacf(dif_oil_TS)
#summary - the original TS is non-stationary
After enumerating models, we choose top five models.
# p, q <= 5 (macro data)
bic_name_list <- c()
bic_value_list <- c()
for(p in 0:5){
for (q in 0:5){
#Model
arma_model <- Arima(dif_oil_TS, order = c(p,0,q))
arma_name <- paste("arma", p, q, sep='')
assign(arma_name, arma_model)
#BIC
bic_name <- paste("bic", p, q, sep='')
bic_value = arma_model$bic
# assign(arma_bic, bic)
bic_name_list <- append(bic_name_list, bic_name)
bic_value_list <- append(bic_value_list, bic_value)
}
}
arma_df <- as.data.frame(bic_name_list)
arma_df$inf_cr <- bic_value_list
names(arma_df) <- c('bic_names', 'bic_values')
min_bics <- arma_df[order(arma_df$bic_values, decreasing = FALSE)[1:5], ]
min_bics #best 5 models
#for naive predictor ARMA(0,0)
arma00$bic
## [1] 390.6967
Conducting residual tests, using Ljung-Box and Box-Pierce tests.
arma_list <- list(arma04, arma21, arma12, arma05, arma03, arma00)
#"Ljung-Box" - more about autocorrelation
# H0 - no autocorrelation - WWN
# H1 - autocorrelation (bad)
print("Ljung-Box test")
## [1] "Ljung-Box test"
for (ar_ma in arma_list){
print(paste('p-value:',ar_ma, Box.test(ar_ma$residuals, type = "Ljung-Box")$p.value))
}
## [1] "p-value: ARIMA(0,0,4) with non-zero mean 0.732160889073166"
## [1] "p-value: ARIMA(2,0,1) with non-zero mean 0.859572663797253"
## [1] "p-value: ARIMA(1,0,2) with non-zero mean 0.919217612337316"
## [1] "p-value: ARIMA(0,0,5) with non-zero mean 0.767953832828051"
## [1] "p-value: ARIMA(0,0,3) with non-zero mean 0.674196310146899"
## [1] "p-value: ARIMA(0,0,0) with non-zero mean 0.112461534982094"
#arma04 - good
#arma21 - good
#arma12 - good
#arma05 - good
#arma03- good
#arma00- naive
#"Box-Pierce" - more about WWN - Q-test
# H0 - no autocorrelation - WWN
# H1 - autororrelation (bad)
print("Box-Pierce test")
## [1] "Box-Pierce test"
for (ar_ma in arma_list){
print(paste('p-value:',ar_ma, Box.test(ar_ma$residuals, type = "Box-Pierce")$p.value))
}
## [1] "p-value: ARIMA(0,0,4) with non-zero mean 0.741549010719818"
## [1] "p-value: ARIMA(2,0,1) with non-zero mean 0.864630603638103"
## [1] "p-value: ARIMA(1,0,2) with non-zero mean 0.922146623418492"
## [1] "p-value: ARIMA(0,0,5) with non-zero mean 0.776165444169754"
## [1] "p-value: ARIMA(0,0,3) with non-zero mean 0.685401387451779"
## [1] "p-value: ARIMA(0,0,0) with non-zero mean 0.126143273901751"
#arma04 - good
#arma21 - good
#arma12 - good
#arma05 - good
#arma03 - good
#arma00 - naive
Using the testing sample, we made a forecast on the validation sample and estimated different loss functions from the differences between the forecast and the actual values.
arma_order = list('04', '21', '12', '05', '03', '00')
for (i in 1:length(arma_list)){
f <- function(y, h) forecast(arma_list[[i]], h = 1)
f_name <- arma_order[i]
assign(paste("f", f_name, sep = '_'), f)
f_temp <- assign(paste("f", f_name, sep = '_'), f)
e <- tsCV(dat_strange_val, f_temp, h = 1)
assign(paste("e", f_name, sep = '_'), e)
}
rbind(accuracy(ts(dat_strange_val- e_04), ts(dat_strange_val)),
accuracy(ts(dat_strange_val - e_21), ts(dat_strange_val)),
accuracy(ts(dat_strange_val - e_12), ts(dat_strange_val)),
accuracy(ts(dat_strange_val - e_05), ts(dat_strange_val)),
accuracy(ts(dat_strange_val - e_03), ts(dat_strange_val)),
accuracy(ts(dat_strange_val - e_00), ts(dat_strange_val))) %>%
as.data.frame() %>%
round(3) %>%
mutate(model = c("ARMA(0,4)",
"ARMA(2,1)",
"ARMA(1,2)",
"ARMA(0,5)",
"ARMA(0,3)",
"ARMA(0,0)")) %>%
dplyr::select(model, everything()) %>%
knitr::kable()
model | ME | RMSE | MAE | MPE | MAPE | ACF1 | Theil’s U | |
---|---|---|---|---|---|---|---|---|
Test.set | ARMA(0,4) | -26.689 | 38.445 | 33.226 | 25.109 | 329.668 | 0.271 | 0.936 |
Test.set.1 | ARMA(2,1) | -26.449 | 38.278 | 33.040 | 25.358 | 327.558 | 0.271 | 0.928 |
Test.set.2 | ARMA(1,2) | -47.045 | 54.580 | 49.389 | 4.053 | 514.367 | 0.271 | 1.587 |
Test.set.3 | ARMA(0,5) | -33.108 | 43.149 | 38.186 | 18.470 | 385.980 | 0.271 | 1.134 |
Test.set.4 | ARMA(0,3) | -49.477 | 56.689 | 51.379 | 1.538 | 537.254 | 0.271 | 1.668 |
Test.set.5 | ARMA(0,0) | -3.122 | 27.847 | 20.703 | 49.487 | 197.383 | 0.271 | 0.465 |
#ME - mean error
#RMSE - root mean squared error
#MAE - mean absolute error
#MPE - mean percentage error
#MAPE - mean absolute percentage error
#ACF1 - autocorrelation 1
#Theil' U - another measure of accuracy
#Due to all errors measures except MPE, naive model ARMA(0,0) is the best!
# checkresiduals(arma00)
#top2 - ARMA(2,1)
We have tested our forecast model on the testing sample. It is a simple forecast, but it is inside of the confidence interval, even for 0.8 one.
arma00_graph <- Arima(oil_TS_val, c(0, 0, 0))
fc00 <- forecast(arma00_graph, h = 5)
dif_oil_TS_graph <- ts(full_ts_dif, start = 1970, end = 2019)
autoplot(fc00) + autolayer(dif_oil_TS_graph)
fc00
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2015 -0.1108012 -36.15996 35.93835 -55.24322 55.02162
## 2016 -0.1108012 -36.15996 35.93835 -55.24322 55.02162
## 2017 -0.1108012 -36.15996 35.93835 -55.24322 55.02162
## 2018 -0.1108012 -36.15996 35.93835 -55.24322 55.02162
## 2019 -0.1108012 -36.15996 35.93835 -55.24322 55.02162
accuracy(fc00)
## ME RMSE MAE MPE MAPE MASE
## Training set -3.644076e-16 27.815 20.87459 100.1262 100.1262 0.9407392
## ACF1
## Training set 0.3119246
# Fc00 <- Arima(oil_TS_pred, model = arma00)
# Fc00$fitted
arma00_graph <- Arima(oil_TS_val, c(2, 0, 1))
fc00 <- forecast(arma00_graph, h = 5)
dif_oil_TS_graph <- ts(full_ts_dif, start = 1970, end = 2019)
autoplot(fc00) + autolayer(dif_oil_TS_graph)
fc00
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2015 2.07710546 -30.27139 34.42560 -47.39565 51.54986
## 2016 1.08733337 -35.60173 37.77639 -55.02374 57.19841
## 2017 0.06190523 -36.91529 37.03910 -56.48983 56.61364
## 2018 0.42329814 -36.73825 37.58484 -56.41038 57.25697
## 2019 0.16500030 -37.05481 37.38481 -56.75779 57.08779
accuracy(fc00)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.2321904 23.85044 17.59298 70.03307 125.1106 0.7928494
## ACF1
## Training set -0.006110017
Therefore, it was decided to build an actual out-sample forecast. It is assumed that a short-term forecast is better, and the forecast is done for only three years, given our small sample.
# Fc00 <- Arima(oil_TS_pred, model = arma00)
# Fc00$fitted
arma00_graph <- Arima(full_ts_dif, c(0, 0, 0))
fc00 <- forecast(arma00_graph, h = 3)
dif_oil_TS_graph <- ts(full_ts_dif, start = 1971, end = 2019)
autoplot(fc00) + autolayer(dif_oil_TS_graph)
fc00
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2020 0.3666951 -34.35241 35.08581 -52.7316 53.46499
## 2021 0.3666951 -34.35241 35.08581 -52.7316 53.46499
## 2022 0.3666951 -34.35241 35.08581 -52.7316 53.46499
accuracy(fc00)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1.88414e-15 26.8136 19.8588 100.2262 100.2262 0.9556164 0.3217259
#in confidece interval- not bad forecast
# Fc00 <- Arima(oil_TS_pred, model = arma00)
# Fc00$fitted
arma00_graph <- Arima(full_ts_dif, c(2, 0, 1))
fc00 <- forecast(arma00_graph, h = 3)
dif_oil_TS_graph <- ts(full_ts_dif, start = 1971, end = 2019)
autoplot(fc00, xlab = "Time", ylab = "Survival",
title = "Marks show times with censoring",
legendLabs = '1st difference of oil consumption') + autolayer(dif_oil_TS_graph)
fc00
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2020 5.460705 -26.05668 36.97809 -42.74098 53.66239
## 2021 8.065588 -24.26436 40.39554 -41.37880 57.50998
## 2022 7.219484 -25.48967 39.92864 -42.80485 57.24382
accuracy(fc00)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 3.159206 23.34879 17.19034 98.02276 126.6793 0.8272086 0.01281378
Actually, it is hard to say something precise about our forecast because of its simplicity and limitation of the data. Moreover, this forecasting model has some limitations due to the fact that oil consumption is influenced by many factors, which are not taken into account. However, we can say that the predicted values are 0.37. Therefore, the European consumption of oil is predicted to grow on 0.37 million tonnes. As we mentioned before, this forecast is mostly applicable for the post-covid reality.