Loading in the data

require(gdata)
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
df=read.xls("PET_PRI_SPT_S1_M.xls", sheet = 2, header = TRUE,pattern="Date")
df$X<-NULL
for (i in 3:8){
  print (i)
  sheet1 <- read.xls("PET_PRI_SPT_S1_M.xls", sheet = i, header = TRUE,pattern="Date")
  sheet1$X<-NULL
  df<-merge(df,sheet1,by="Date",all=TRUE)}
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8

Data cleaning and formatting

#structure the data and create date column
n<-dim(df)[1]
df<-df[1:(n-1),]
df$Date=as.character(df$Date)
df$date=paste('01',df$Date, sep = "-" )
df$date=as.Date((df$date), format = "%d-%b-%Y")
## Warning in strptime(x, format, tz = "GMT"): unknown timezone 'default/
## America/Los_Angeles'
df1=df[,2:10]

Standardizing the unit variable and create new variables by taking average of prices if the petroleum product has more than 1 production place

#convert CrudeOil to per gallon price
df$CrudeOil=rowMeans(cbind(df$Cushing..OK.WTI.Spot.Price.FOB..Dollars.per.Barrel./42,df$Europe.Brent.Spot.Price.FOB..Dollars.per.Barrel./42),na.rm=TRUE)

df$ConvGasoline=rowMeans(cbind(df$New.York.Harbor.Conventional.Gasoline.Regular.Spot.Price.FOB..Dollars.per.Gallon.,df$U.S..Gulf.Coast.Conventional.Gasoline.Regular.Spot.Price.FOB..Dollars.per.Gallon.),na.rm=TRUE)

df$Diesel=rowMeans(cbind(df$New.York.Harbor.Ultra.Low.Sulfur.No.2.Diesel.Spot.Price..Dollars.per.Gallon.,df$U.S..Gulf.Coast.Ultra.Low.Sulfur.No.2.Diesel.Spot.Price..Dollars.per.Gallon.,df$Los.Angeles..CA.Ultra.Low.Sulfur.CARB.Diesel.Spot.Price..Dollars.per.Gallon.),na.rm=TRUE)
df$RegGasoline=df$Los.Angeles.Reformulated.RBOB.Regular.Gasoline.Spot.Price..Dollars.per.Gallon.
df$HeatingOil=df$New.York.Harbor.No..2.Heating.Oil.Spot.Price.FOB..Dollars.per.Gallon.
df$KeroseneJetFuel=df$U.S..Gulf.Coast.Kerosene.Type.Jet.Fuel.Spot.Price.FOB..Dollars.per.Gallon.

df$propane=df$Mont.Belvieu..TX.Propane.Spot.Price.FOB..Dollars.per.Gallon.

Create percentage change in price for each petroluem product

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:gdata':
## 
##     combine, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
pct <- function(x) ((x-lag(x))/lag(x))
df<-df %>%mutate_each(funs(pct_change=pct),c(RegGasoline,ConvGasoline,Diesel,CrudeOil,HeatingOil,KeroseneJetFuel,propane))

Summary statistics

#Summary statistics for monthly price movement of petroleum products
dd_sub = df[,c(13,14,15,16,17,18,19,20)]
summary(dd_sub)
##       date               CrudeOil       ConvGasoline        Diesel     
##  Min.   :1986-01-01   Min.   :0.2520   Min.   :0.3005   Min.   :0.391  
##  1st Qu.:1993-11-01   1st Qu.:0.4470   1st Qu.:0.5467   1st Qu.:0.841  
##  Median :2001-09-01   Median :0.6601   Median :0.8235   Median :1.514  
##  Mean   :2001-08-31   Mean   :1.0322   Mean   :1.2241   Mean   :1.676  
##  3rd Qu.:2009-07-01   3rd Qu.:1.4851   3rd Qu.:1.8547   3rd Qu.:2.296  
##  Max.   :2017-05-01   Max.   :3.1690   Max.   :3.2880   Max.   :3.877  
##                                        NA's   :5        NA's   :123    
##   RegGasoline      HeatingOil     KeroseneJetFuel     propane      
##  Min.   :0.949   Min.   :0.3040   Min.   :0.3040   Min.   :0.2090  
##  1st Qu.:1.631   1st Qu.:0.5357   1st Qu.:0.5573   1st Qu.:0.3590  
##  Median :2.125   Median :0.7820   Median :0.9730   Median :0.5830  
##  Mean   :2.179   Mean   :1.2453   Mean   :1.3680   Mean   :0.7042  
##  3rd Qu.:2.760   3rd Qu.:1.8110   3rd Qu.:2.0420   3rd Qu.:0.9915  
##  Max.   :3.694   Max.   :3.8010   Max.   :3.8860   Max.   :1.8620  
##  NA's   :210     NA's   :5        NA's   :51       NA's   :77

Visualization for monthly price movement of petroleum products

#visualize monthly price movement of petroleum product
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following objects are masked from 'package:gdata':
## 
##     first, last
library(ggplot2)
dd = melt(dd_sub, id=c("date"))
names(dd)<-c("Date","Petroleum_Products","Price")
ggplot(dd) + geom_line(aes(x=Date, y=Price, colour=Petroleum_Products)) +scale_colour_manual(values=c("grey","purple","blue","green","yellow","red","black"))+ ggtitle("Price Movement of Petroleum Products since 1986") 
## Warning: Removed 470 rows containing missing values (geom_path).

#Summary statistic for monthly percentange change in price movement of petroleum products
dd_sub = df[,c(13,27,26,25,24,23,22,21)]
summary(dd_sub)
##       date            propane_pct_change KeroseneJetFuel_pct_change
##  Min.   :1986-01-01   Min.   :-0.65735   Min.   :-0.60193          
##  1st Qu.:1993-11-01   1st Qu.:-0.17681   1st Qu.:-0.13873          
##  Median :2001-09-01   Median : 0.09476   Median : 0.02183          
##  Mean   :2001-08-31   Mean   : 0.10346   Mean   : 0.08373          
##  3rd Qu.:2009-07-01   3rd Qu.: 0.35954   3rd Qu.: 0.31394          
##  Max.   :2017-05-01   Max.   : 1.66071   Max.   : 1.52427          
##                       NA's   :89         NA's   :63                
##  HeatingOil_pct_change CrudeOil_pct_change Diesel_pct_change 
##  Min.   :-0.78465      Min.   :-0.78353    Min.   :-0.60042  
##  1st Qu.:-0.14034      1st Qu.:-0.15513    1st Qu.:-0.17268  
##  Median : 0.03132      Median : 0.04000    Median : 0.06524  
##  Mean   : 0.07973      Mean   : 0.07072    Mean   : 0.09444  
##  3rd Qu.: 0.30830      3rd Qu.: 0.28102    3rd Qu.: 0.38335  
##  Max.   : 2.08224      Max.   : 1.56508    Max.   : 1.26854  
##  NA's   :10            NA's   :1           NA's   :135       
##  ConvGasoline_pct_change RegGasoline_pct_change
##  Min.   :-0.79023        Min.   :-0.54694      
##  1st Qu.:-0.13431        1st Qu.:-0.09957      
##  Median : 0.03656        Median : 0.07295      
##  Mean   : 0.07166        Mean   : 0.06639      
##  3rd Qu.: 0.25243        3rd Qu.: 0.28275      
##  Max.   : 1.64136        Max.   : 0.81441      
##  NA's   :10              NA's   :222
#visualize the monthly percentange change in price movement of petroleum product
dd = melt(dd_sub, id=c("date"))
names(dd)<-c("Date","Petroleum_Products","Percentage_Change_in_Price")
ggplot(dd) + geom_line(aes(x=Date, y=Percentage_Change_in_Price, colour=Petroleum_Products)) +scale_colour_manual(values=c("grey","purple","blue","black","green","yellow","red"))+ ggtitle("Petroleum Products' Percentage Change in Price since 1986") 
## Warning: Removed 518 rows containing missing values (geom_path).

Aggregate the data to annual level

#Aggregate by year by getting the mean of the monthly price
df$year=format(as.Date(df$date,"%Y-%m-%d"),"%Y")
df_year<-df %>%group_by(year)%>%summarise_each (funs(mean=mean) , c(RegGasoline,ConvGasoline,Diesel,CrudeOil,HeatingOil,KeroseneJetFuel,propane))
df_year<-df_year %>%mutate_each(funs(pct_change=pct),c(RegGasoline_mean,ConvGasoline_mean,Diesel_mean,CrudeOil_mean,HeatingOil_mean,KeroseneJetFuel_mean,propane_mean))
df_year$date=as.Date(paste0(df_year$year,"-01-01"),stargaformat="%Y-%d-%m")

Summary statistic

#Summary statistic petroleum product prices at yearly level
dd_sub1 = df_year[,c(16, 2,3,4,5,6,7,8)]
summary(dd_sub1)
##       date            RegGasoline_mean ConvGasoline_mean  Diesel_mean    
##  Min.   :1986-01-01   Min.   :1.454    Min.   :0.4205    Min.   :0.4859  
##  1st Qu.:1993-10-01   1st Qu.:1.841    1st Qu.:0.5731    1st Qu.:0.9147  
##  Median :2001-07-02   Median :2.210    Median :0.8468    Median :1.6343  
##  Mean   :2001-07-02   Mean   :2.264    Mean   :1.2459    Mean   :1.7058  
##  3rd Qu.:2009-04-02   3rd Qu.:2.668    3rd Qu.:1.7336    3rd Qu.:2.1847  
##  Max.   :2017-01-01   Max.   :3.031    Max.   :2.8767    Max.   :3.0907  
##                       NA's   :19       NA's   :1         NA's   :11      
##  CrudeOil_mean    HeatingOil_mean  KeroseneJetFuel_mean  propane_mean   
##  Min.   :0.3241   Min.   :0.3930   Min.   :0.4035       Min.   :0.2612  
##  1st Qu.:0.4503   1st Qu.:0.5606   1st Qu.:0.5891       1st Qu.:0.4093  
##  Median :0.6542   Median :0.8502   Median :1.1499       Median :0.5816  
##  Mean   :1.0358   Mean   :1.2663   Mean   :1.3874       Mean   :0.7121  
##  3rd Qu.:1.4899   3rd Qu.:1.7224   3rd Qu.:2.0253       3rd Qu.:1.0039  
##  Max.   :2.4588   Max.   :3.0238   Max.   :3.0565       Max.   :1.4633  
##                   NA's   :1        NA's   :5            NA's   :7

Visualization for annual price movement of petroleum product

#Visualize the price movement  by year
dd1 = melt(dd_sub1, id=c("date")) #transform the data
names(dd1)<-c("Date","Petroleum_Products","Price") # rename the column

#plot the graph
ggplot(dd1) + geom_line(aes(x=year(Date), y=Price, colour=Petroleum_Products)) +scale_colour_manual(values=c("grey","purple","blue","black","green","yellow","red"))+ggtitle("Petroleum Products' Percentage Change in Price by year") 
## Warning: Removed 44 rows containing missing values (geom_path).

Prediction: Predict the crude oil price for next 6 months

Subset crude price and make it into a time series object

library(fpp)
## Loading required package: forecast
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: tseries
crudeoil=df[,13:14]
crudeoil=crudeoil[order(as.Date(crudeoil$date)),]
myts <- ts(crudeoil$CrudeOil, start=c(1986, 1), end=c(2017, 5),frequency=12) 

#First, visualize the data
tsdisplay(myts) #There is a trend

###Use auto-arima model to find the optimal P,Q,D and p,q,d

auto.arima(myts)
## Series: myts 
## ARIMA(2,1,2)(0,0,2)[12]                    
## 
## Coefficients:
##          ar1      ar2      ma1     ma2    sma1     sma2
##       1.5201  -0.6217  -1.1418  0.2359  0.1013  -0.1599
## s.e.  0.0913   0.0861   0.1123  0.1049  0.0566   0.0535
## 
## sigma^2 estimated as 0.008407:  log likelihood=367.33
## AIC=-720.66   AICc=-720.36   BIC=-693.15
#best model is below with lowest AICC
fit<- Arima(myts, order=c(2,1,2), seasonal=c(0,0,2))
summary(fit)
## Series: myts 
## ARIMA(2,1,2)(0,0,2)[12]                    
## 
## Coefficients:
##          ar1      ar2      ma1     ma2    sma1     sma2
##       1.5201  -0.6217  -1.1418  0.2359  0.1013  -0.1599
## s.e.  0.0913   0.0861   0.1123  0.1049  0.0566   0.0535
## 
## sigma^2 estimated as 0.008407:  log likelihood=367.33
## AIC=-720.66   AICc=-720.36   BIC=-693.15
## 
## Training set error measures:
##                       ME       RMSE        MAE        MPE     MAPE
## Training set 0.002082911 0.09083504 0.06372521 -0.0826539 6.690122
##                   MASE         ACF1
## Training set 0.2383274 -0.004120475
#test the result
Acf(residuals(fit)) #From the residual, it looks stable

Box.test(residuals(fit), lag=24, fitdf=3, type="Ljung") #passed!
## 
##  Box-Ljung test
## 
## data:  residuals(fit)
## X-squared = 16.388, df = 21, p-value = 0.7475

Forecast and Visualize

fcast<-forecast(fit,h=6)
plot(fcast)

Correlation between Conventional Gasoline prices between New York Harbor and U.S. Gulf Coast

conv_gas=df[,4:5]
conv_gas=na.omit(conv_gas)
names(conv_gas)<-c("New York Habor","U.S. Gulf Coast")
cor(conv_gas)
##                 New York Habor U.S. Gulf Coast
## New York Habor       1.0000000       0.9980088
## U.S. Gulf Coast      0.9980088       1.0000000