By Month Analysis

f.1mo <- read_excel("C:/Users/yujun/Downloads/futures_series.xlsx",  sheet = "Sheet1")
f.2mo <- read_excel("C:/Users/yujun/Downloads/futures_series.xlsx",  sheet = "2mon")
f.3mo <- read_excel("C:/Users/yujun/Downloads/futures_series.xlsx",  sheet = "3mon")
f.4mo <- read_excel("C:/Users/yujun/Downloads/futures_series.xlsx",  sheet = "4mon")
f.5mo <- read_excel("C:/Users/yujun/Downloads/futures_series.xlsx",  sheet = "5mon")
f.6mo <- read_excel("C:/Users/yujun/Downloads/futures_series.xlsx",  sheet = "6mon")
urlfile = "https://raw.githubusercontent.com/razin11/IAQF-case-competition/main/Data%20processing/Formatted%20data%20files/crude_gas_data.csv?token=ARWWGURNJSS23UVZ2ZMW5BTAF76NK"
g.data  <- read.csv(url(urlfile))

Ratio: \[\text{Ratio} = \frac{Gas - Forward \ Price}{Forward\ (1 \ lag)}\]

One-Month Analysis

With Covid time

Explanatory Analysis

#turn data to weekly
f.1mo$date = as.Date(f.1mo$date)
f.1mo.xts = xts(f.1mo$price,order.by = f.1mo$date)
startpoints <- function (x, on = "weeks", k = 1) {
  head(endpoints(x, on, k) + 1, -1)
}
index  = startpoints(f.1mo.xts,on="weeks",k=1)
#weekly crude price
f.1mo.wts =period.apply(f.1mo.xts,index,mean)
f.1mo.wts = f.1mo.wts["1993-04-12::2021-01-08"]
f.1mo.w = fortify(f.1mo.wts)
f.1mo.w$index = rep(1:nrow(f.1mo.w))
colnames(f.1mo.w) = c("Date","Weekly_Price","Index")
g.data$Index = rep(1:nrow(g.data))
#to merge data change the date
#f.1mo.w$Date = as.Date(f.1mo.w$Date)-1
#$Date = as.Date(g.data$Date)
#merge data
merge.1mo = left_join(g.data,f.1mo.w,by="Index",copy=T)
merge.1mo = merge.1mo[complete.cases(merge.1mo),]
merge.1mo = merge.1mo %>% select(Date.y,gasoline_retail_price,Weekly_Price)
colnames(merge.1mo) = c("Date","retail_price","weekly_future_price")
lf.1mo = back(merge.1mo$weekly_future_price)
change.1mo = merge.1mo$retail_price-merge.1mo$weekly_future_price
ratio.1mo = change.1mo/lf.1mo
ratio.1mo = ratio.1mo[-1]
adf.test(ratio.1mo)
## Warning in adf.test(ratio.1mo): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ratio.1mo
## Dickey-Fuller = -10.026, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
acf(ratio.1mo,lag=12)

pacf(ratio.1mo,lag=12)

By ADF test, the ratio is stationary for 1month. * ACF plot: considering MA(4) * PACF plot : considering AR(4) Now, select order for ratio

eacf(ratio.1mo)
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x o o o x o o o  o  x  o 
## 1 x o o x o o o x o o o  o  x  o 
## 2 x x o x o o o x o o o  o  x  o 
## 3 x x x x o o o x o o o  o  o  o 
## 4 o x x x o o x o o o o  o  o  o 
## 5 x o x x x o x o o o o  o  o  o 
## 6 x x x x o o o o o o o  o  o  o 
## 7 x x x x x o x o o o o  o  o  o
  • EACF table: vertex possibly at ARMA(1,4)
bic = armasubsets(ratio.1mo,nar = 8,nma=12)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 4 linear dependencies found
## Reordering variables and trying again:
plot(bic)

  • BIC plot: Here is the plot of the possible combination of the ARMA model: AR(1),AR(4)

Now we try to do

ar(ratio.1mo,method = "mle")
## 
## Call:
## ar(x = ratio.1mo, method = "mle")
## 
## Coefficients:
##       1        2        3        4        5        6        7        8  
##  0.1467   0.0641   0.0515   0.0955  -0.0009  -0.0210  -0.0480   0.0595  
## 
## Order selected 8  sigma^2 estimated as  0.002267
  • AIC also sugget AR(4) to fit.

Let’s try fit a AR(4) model

arima(ratio.1mo,order=c(4,0,0),include.mean = T)
## 
## Call:
## arima(x = ratio.1mo, order = c(4, 0, 0), include.mean = T)
## 
## Coefficients:
##          ar1     ar2     ar3     ar4  intercept
##       0.1450  0.0606  0.0463  0.0978    -0.9523
## s.e.  0.0261  0.0264  0.0264  0.0261     0.0019
## 
## sigma^2 estimated as 0.002279:  log likelihood = 2348.42,  aic = -4686.84

ar3 not significant. Drop! limit coef3 = 0

ar4=arima(ratio.1mo,order=c(4,0,0),fixed = c(NA,NA,0,NA,NA),include.mean = T)
## Warning in stats::arima(x = x, order = order, seasonal = seasonal, xreg =
## xreg, : some AR parameters were fixed: setting transform.pars = FALSE

AIC good! Take this!

arma1.1 = arima(ratio.1mo,order=c(1,0,1),include.mean = T)
arma1.1
## 
## Call:
## arima(x = ratio.1mo, order = c(1, 0, 1), include.mean = T)
## 
## Coefficients:
##          ar1      ma1  intercept
##       0.7574  -0.6215    -0.9523
## s.e.  0.0699   0.0842     0.0020
## 
## sigma^2 estimated as 0.002292:  log likelihood = 2344.3,  aic = -4682.6

Let’s test unit root

abs(polyroot(c(1, -coefficients(arma1.1))))
## [1] 0.8156992 0.8156992 1.5781416

Unit roos are all outside the unit-circle.

tsdiag(arma1.1, gof=10)

Second graph, white noice checked. Third graph all are significant.

As we see in ACF, try ARMA(4,4)

auto = auto.arima(ratio.1mo)
auto
## Series: ratio.1mo 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.1354  -0.9885
## s.e.  0.0264   0.0040
## 
## sigma^2 estimated as 0.002287:  log likelihood=2343.67
## AIC=-4681.35   AICc=-4681.33   BIC=-4665.52
tsdiag(auto,gof=20)

arima.tstat <- function(mod) coef(mod)[coef(mod) != 0] / sqrt(diag(mod$var.coef))
round(sort(abs(arima.tstat(auto))), 2)
##    ar1    ma1 
##   5.13 248.28

By t.stat, all significant

Out-of-Sample Forecast

nmin = ceiling(.7*length(ratio.1mo))+1
nmax = length(ratio.1mo)
fcst <- matrix(NA_real_, nmax, 3)
fixed <- rep(NA_real_, 10)
fixed[c(4,6,7)] <- 0

for (t1 in nmin : nmax) {
   n= t1-1
   m1 = arima(ratio.1mo[1:n],order = c(1,0,1),include.mean = T,transform.pars=FALSE)
   m2 = arima(ratio.1mo[1:n],order= c(1,1,1),include.mean = T,transform.pars=FALSE)
   m3 = arima(ratio.1mo[1:n],order= c(4,0,0),include.mean = T,fixed = c(NA,NA,0,NA,NA),transform.pars = FALSE)
   fcst[t1,1] = c(predict(m1, n.ahead=1, newxreg=NULL, se.fit=FALSE))
   fcst[t1,2] = c(predict(m2,n.ahead = 1, newxreg=NULL, se.fit=FALSE))
   fcst[t1,3] = c(predict(m3,n.ahead = 1, newxreg=NULL, se.fit=FALSE))
}

rmsfe = rep(0.0,3)
mafe  = rep(0.0,3)
for (j in 1:3) {
  rmsfe[j]= sqrt(mean((fcst[nmin:nmax,j]-ratio.1mo[nmin:nmax])^2))
  mafe[j] = mean(abs(fcst[nmin:nmax,j]-ratio.1mo[nmin:nmax]))
}
rmsfe
## [1] 0.06682364 0.06585047 0.06531633
mafe
## [1] 0.03251506 0.03235961 0.03186955
tab1 = data.frame("model" = c("ARMA(1,1)","ARIMA(1,1)","AR(4)"),
                  RMSFE = round(rmsfe,5),
                  MAFE = round(mafe,5))
knitr::kable(tab1)
model RMSFE MAFE
ARMA(1,1) 0.06682 0.03252
ARIMA(1,1) 0.06585 0.03236
AR(4) 0.06532 0.03187
#xts.1mo = xts(cbind(ratio.1mo[nmin:nmax],fcst[nmin:nmax,1]),
              #index(f.1mo.xts)[-(1:2)][nmin:nmax])
#plot(xts.1mo,type="l",col=c("grey","red"))
par(mfrow = c(2,2))
plot(ratio.1mo[nmin:nmax],type="l",col="black")
lines(fcst[nmin:nmax,1],col="red")
plot(ratio.1mo[nmin:nmax],type="l",col="black")
lines(fcst[nmin:nmax,2],col="blue")
plot(ratio.1mo[nmin:nmax],type="l",col="black")
lines(fcst[nmin:nmax,3],col="green")

Maybe remove COVID time? ### Without Covid time {.tabset .tabset-fade .tabset-pills}

Explanatory Analysis

merge.remove.covid = merge.1mo[1:1403,]
lf.1mof = back(merge.remove.covid$weekly_future_price)
change.1mof = merge.remove.covid$retail_price-merge.remove.covid$weekly_future_price
ratiof.1mo = change.1mof/lf.1mof
ratiof.1mo = ratiof.1mo[-1]
adf.test(ratiof.1mo)
## Warning in adf.test(ratiof.1mo): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ratiof.1mo
## Dickey-Fuller = -8.5615, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
acf(ratiof.1mo,lag=12)

pacf(ratiof.1mo,lag=12)

* PACF:AR(3) or AR(8)

ar(ratiof.1mo,method = "mle")
## 
## Call:
## ar(x = ratiof.1mo, method = "mle")
## 
## Coefficients:
##       1        2        3        4        5        6        7        8  
##  0.2796  -0.0615   0.1227   0.0108   0.0549  -0.0204  -0.0275   0.1552  
## 
## Order selected 8  sigma^2 estimated as  0.001401
auto.f = auto.arima(ratiof.1mo)
auto.f
## Series: ratiof.1mo 
## ARIMA(1,1,2) 
## 
## Coefficients:
##           ar1      ma1      ma2
##       -0.2348  -0.5005  -0.4715
## s.e.   0.0981   0.0894   0.0855
## 
## sigma^2 estimated as 0.001428:  log likelihood=2601.51
## AIC=-5195.02   AICc=-5194.99   BIC=-5174.04
eacf(ratiof.1mo)
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x o o x x x o  x  x  o 
## 1 x x x o x o o x o o o  o  x  o 
## 2 x x o o o o o x o o o  o  x  o 
## 3 x x x x o o o x o o o  o  o  o 
## 4 x x x x o o o x x x o  o  o  o 
## 5 x x x x o x o x x o o  o  o  o 
## 6 x o x o x o o x o o o  o  o  o 
## 7 x x x x x x x x o o o  o  o  o
bicf = armasubsets(ratiof.1mo,nar=8,nma=13)
plot(bicf)

* auto-arima suggest ARIMA(0,1,2) * eacf suggest ARMA(2,8) * BIC: AR(1),AR(3),AR(8) * AIC: AR(8) Let’s do it!

ar1 = arima(ratiof.1mo,order = c(1,0,0))
ar1
## 
## Call:
## arima(x = ratiof.1mo, order = c(1, 0, 0))
## 
## Coefficients:
##          ar1  intercept
##       0.2791    -0.9524
## s.e.  0.0256     0.0014
## 
## sigma^2 estimated as 0.001475:  log likelihood = 2580.72,  aic = -5157.43
ar3 = arima(ratiof.1mo,order = c(3,0,0),fixed = c(NA,0,NA,NA),transform.pars = FALSE)
ar3
## 
## Call:
## arima(x = ratiof.1mo, order = c(3, 0, 0), transform.pars = FALSE, fixed = c(NA, 
##     0, NA, NA))
## 
## Coefficients:
##          ar1  ar2     ar3  intercept
##       0.2717    0  0.1224    -0.9524
## s.e.  0.0255    0  0.0255     0.0017
## 
## sigma^2 estimated as 0.001451:  log likelihood = 2592.16,  aic = -5178.32
#ar8 = arima(ratiof.1mo,order = c(8,0,0))
#ar8
#arma2.8 = arima(ratiof.1mo,order = c(2,0,8))
#arma2.8

ar(8),ARMA(2,8) both has too many insignificant coef, bad. Take ar3!

Out-the-sample Forecast

nmin = ceiling(.7*length(ratiof.1mo))+1
nmax = length(ratiof.1mo)
fcst <- matrix(NA_real_, nmax, 3)
fixed <- rep(NA_real_, 10)
fixed[c(4,6,7)] <- 0

for (t1 in nmin : nmax) {
   n= t1-1
   m1 = arima(ratio.1mo[1:n],order = c(1,0,0),include.mean = T,transform.pars=FALSE)
   m2 = arima(ratio.1mo[1:n],order = c(3,0,0),fixed = c(NA,0,NA,NA),transform.pars = FALSE)
   m3 = arima(ratio.1mo[1:n],order= c(1,1,2),include.mean = T,transform.pars=FALSE)
   fcst[t1,1] = c(predict(m1, n.ahead=1, newxreg=NULL, se.fit=FALSE))
   fcst[t1,2] = c(predict(m2,n.ahead = 1, newxreg=NULL, se.fit=FALSE))
   fcst[t1,3] = c(predict(m3,n.ahead = 1, newxreg=NULL, se.fit=FALSE))
}

rmsfe = rep(0.0,3)
mafe  = rep(0.0,3)
for (j in 1:3) {
  rmsfe[j]= sqrt(mean((fcst[nmin:nmax,j]-ratio.1mo[nmin:nmax])^2))
  mafe[j] = mean(abs(fcst[nmin:nmax,j]-ratio.1mo[nmin:nmax]))
}
rmsfe
## [1] 0.03357622 0.03365368 0.03377164
mafe
## [1] 0.02559810 0.02557187 0.02601874
tab2 = data.frame("model" = c("AR(1)","AR(3)","ARIMA(1,1,2)"),
                  RMSFE = round(rmsfe,5),
                  MAFE = round(mafe,5))
knitr::kable(tab2)
model RMSFE MAFE
AR(1) 0.03358 0.02560
AR(3) 0.03365 0.02557
ARIMA(1,1,2) 0.03377 0.02602
#xts.1mo = xts(cbind(ratio.1mo[nmin:nmax],fcst[nmin:nmax,1]),
              #index(f.1mo.xts)[-(1:2)][nmin:nmax])
#plot(xts.1mo,type="l",col=c("grey","red"))
par(mfrow = c(2,2))
plot(ratio.1mo[nmin:nmax],type="l",col="black")
lines(fcst[nmin:nmax,1],col="red")
plot(ratio.1mo[nmin:nmax],type="l",col="black")
lines(fcst[nmin:nmax,2],col="blue")
plot(ratio.1mo[nmin:nmax],type="l",col="black")
lines(fcst[nmin:nmax,3],col="yellow")

good! Error drops but not a good estimation method Let’s test Other Month

Two Month