Data Preparation

# install.packages("Quandl")
# install.packages("dygraphs")
library(Quandl)
library(tidyverse)
library(zoo)
library(xts)
# library(dygraphs)
library(psych)

Quandl.api_key("TWp9zrWTMziHW_hbugiN")

# Load necessary EIA data from Quandl 
f1 <- Quandl("EIA/PET_RCLC1_M", type="xts")
f2 <- Quandl("EIA/PET_RCLC2_M", type="xts")
f3 <- Quandl("EIA/PET_RCLC3_M", type="xts")
f4 <- Quandl("EIA/PET_RCLC4_M", type="xts")
wti_stock <- Quandl("EIA/STEO_COSXPUS_M", type="xts")
gas_stock <- Quandl("EIA/STEO_MGTSPUS_M", type="xts")

# Calculate monthly, 3-month, 6-month, and annual changes in both oil and gasoline inventory data
wstkd <- diff(log(wti_stock))
wstkd3 <- diff(log(wti_stock),lag=3)
wstkd6 <- diff(log(wti_stock),lag=6)
wstkd12 <- diff(log(wti_stock),lag=12)

gstkd <- diff(log(gas_stock))
gstkd3 <- diff(log(gas_stock),lag=3)
gstkd6 <- diff(log(gas_stock),lag=6)
gstkd12 <- diff(log(gas_stock),lag=12)

# Lag the independent variables by 1 month since we want to use them as leading indicators
wstkdL1 <- lag(wstkd,1)
wstkd3L1 <- lag(wstkd3,1)
wstkd6L1 <- lag(wstkd6,1)
wstkd12L1 <- lag(wstkd12,1)

gstkdL1 <- lag(gstkd,1)
gstkd3L1 <- lag(gstkd3,1)
gstkd6L1 <- lag(gstkd6,1)
gstkd12L1 <- lag(gstkd12,1)

# Create calendar spreads and return sign. Positive means curve is in contango, and negative means curve is in backwardation. Since all the variables are of XTS data type, the dates are automatically taken into account.

f1_2 <- sign(f2-f1)
f1_3 <- sign(f3-f1)
f1_4 <- sign(f4-f1)
f2_3 <- sign(f3-f2)
f2_4 <- sign(f4-f2)
f3_4 <- sign(f4-f3)

# Combine all the data:

df <- na.omit(cbind(f1_2,f1_3,f1_4,f2_3,f2_4,f3_4,wstkdL1,wstkd3L1,wstkd6L1,wstkd12L1,gstkdL1,gstkd3L1,gstkd6L1,gstkd12L1))

colnames(df) <- c('f1_2','f1_3','f1_4','f2_3','f2_4','f3_4','wstkdL1','wstkd3L1','wstkd6L1','wstkd12L1','gstkdL1','gstkd3L1','gstkd6L1','gstkd12L1')

Research question

Are changes in WTI and gasoline inventory predictive of an upward sloping (contango) or downward sloping (backwardation) oil curve 2-,3-,4-months out?

Cases

What are the cases, and how many are there?

Each case represents a monthly observation of a futures contract and inventory data. There are 320 observations in the given data set.

Data collection

Describe the method of data collection.

Data is reported by the U.S. Energy Information Administration (EIA) and available via Quandl.

Type of study

What type of study is this (observational/experiment)?

This is an observational study.

Data Source

If you collected the data, state self-collected. If not, provide a citation/link.

Data is collected by the EIA and is available online here: http://quandl.com/eia/ For this project, data was extracted using the Quandl R package.

Response

What is the response variable, and what type is it (numerical/categorical)?

The response variable is the shape of the oil futures curve and is binary: +1 for upward sloping (contango), or -1 for downward sloping (backwardation).

Explanatory

What is the explanatory variable, and what type is it (numerical/categorival)?

The explanatory variables are numerical - percent change in oil and gasoline inventory data.

Relevant summary statistics

Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.

describe(df[,1:6])
##      vars   n mean   sd median trimmed mad min max range  skew kurtosis
## f1_2    1 320 0.23 0.97      1    0.29   0  -1   1     2 -0.48    -1.77
## f1_3    2 320 0.19 0.98      1    0.24   0  -1   1     2 -0.39    -1.84
## f1_4    3 320 0.13 0.99      1    0.16   0  -1   1     2 -0.26    -1.94
## f2_3    4 320 0.12 0.99      1    0.15   0  -1   1     2 -0.24    -1.94
## f2_4    5 320 0.08 1.00      1    0.09   0  -1   1     2 -0.15    -1.98
## f3_4    6 320 0.07 1.00      1    0.08   0  -1   1     2 -0.13    -1.99
##        se
## f1_2 0.05
## f1_3 0.05
## f1_4 0.06
## f2_3 0.06
## f2_4 0.06
## f3_4 0.06
describe(df[,7:14]*100)
##           vars   n mean   sd median trimmed  mad    min   max range skew
## wstkdL1      1 320 0.11 3.14   0.17    0.09 3.35  -9.03  8.83 17.86 0.04
## wstkd3L1     2 320 0.34 5.91   0.08    0.13 6.43 -15.98 20.55 36.53 0.30
## wstkd6L1     3 320 0.68 7.59   0.21    0.45 6.63 -22.75 28.84 51.59 0.37
## wstkd12L1    4 320 1.25 8.43   0.59    0.97 7.75 -19.14 25.79 44.93 0.32
## gstkdL1      5 320 0.01 3.61   0.03   -0.03 3.43  -7.90 10.54 18.44 0.14
## gstkd3L1     6 320 0.06 6.22  -0.40   -0.26 6.21 -14.40 18.55 32.95 0.45
## gstkd6L1     7 320 0.15 7.06   0.42    0.12 7.71 -19.16 19.67 38.83 0.04
## gstkd12L1    8 320 0.25 4.83  -0.41    0.16 5.05 -12.94 12.67 25.61 0.15
##           kurtosis   se
## wstkdL1      -0.32 0.18
## wstkd3L1     -0.08 0.33
## wstkd6L1      0.89 0.42
## wstkd12L1     0.28 0.47
## gstkdL1      -0.23 0.20
## gstkd3L1     -0.07 0.35
## gstkd6L1     -0.21 0.39
## gstkd12L1    -0.47 0.27
table(df$f1_2)
## 
##  -1   0   1 
## 122   1 197
prop.table(table(df$f1_2)) * 100
## 
##      -1       0       1 
## 38.1250  0.3125 61.5625
table(df$f1_3)
## 
##  -1   0   1 
## 127   4 189
prop.table(table(df$f1_3)) * 100
## 
##      -1       0       1 
## 39.6875  1.2500 59.0625
table(df$f1_4)
## 
##  -1   0   1 
## 139   1 180
prop.table(table(df$f1_4)) * 100
## 
##      -1       0       1 
## 43.4375  0.3125 56.2500
table(df$f2_3)
## 
##  -1   0   1 
## 140   1 179
prop.table(table(df$f2_3)) * 100
## 
##      -1       0       1 
## 43.7500  0.3125 55.9375
table(df$f2_4)
## 
##  -1   1 
## 148 172
prop.table(table(df$f2_4)) * 100
## 
##    -1     1 
## 46.25 53.75
boxplot(coredata(df[,7:14]*100),col=c('powderblue'),main="Boxplots of the Explanatory Variables",ylab="%",las=2)