require(gdata)
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
df=read.xls("PET_PRI_SPT_S1_M.xls", sheet = 2, header = TRUE,pattern="Date")
df$X<-NULL
for (i in 3:8){
print (i)
sheet1 <- read.xls("PET_PRI_SPT_S1_M.xls", sheet = i, header = TRUE,pattern="Date")
sheet1$X<-NULL
df<-merge(df,sheet1,by="Date",all=TRUE)}
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
#structure the data and create date column
n<-dim(df)[1]
df<-df[1:(n-1),]
df$Date=as.character(df$Date)
df$date=paste('01',df$Date, sep = "-" )
df$date=as.Date((df$date), format = "%d-%b-%Y")
## Warning in strptime(x, format, tz = "GMT"): unknown timezone 'default/
## America/Los_Angeles'
df1=df[,2:10]
#convert CrudeOil to per gallon price
df$CrudeOil=rowMeans(cbind(df$Cushing..OK.WTI.Spot.Price.FOB..Dollars.per.Barrel./42,df$Europe.Brent.Spot.Price.FOB..Dollars.per.Barrel./42),na.rm=TRUE)
df$ConvGasoline=rowMeans(cbind(df$New.York.Harbor.Conventional.Gasoline.Regular.Spot.Price.FOB..Dollars.per.Gallon.,df$U.S..Gulf.Coast.Conventional.Gasoline.Regular.Spot.Price.FOB..Dollars.per.Gallon.),na.rm=TRUE)
df$Diesel=rowMeans(cbind(df$New.York.Harbor.Ultra.Low.Sulfur.No.2.Diesel.Spot.Price..Dollars.per.Gallon.,df$U.S..Gulf.Coast.Ultra.Low.Sulfur.No.2.Diesel.Spot.Price..Dollars.per.Gallon.,df$Los.Angeles..CA.Ultra.Low.Sulfur.CARB.Diesel.Spot.Price..Dollars.per.Gallon.),na.rm=TRUE)
df$RegGasoline=df$Los.Angeles.Reformulated.RBOB.Regular.Gasoline.Spot.Price..Dollars.per.Gallon.
df$HeatingOil=df$New.York.Harbor.No..2.Heating.Oil.Spot.Price.FOB..Dollars.per.Gallon.
df$KeroseneJetFuel=df$U.S..Gulf.Coast.Kerosene.Type.Jet.Fuel.Spot.Price.FOB..Dollars.per.Gallon.
df$propane=df$Mont.Belvieu..TX.Propane.Spot.Price.FOB..Dollars.per.Gallon.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:gdata':
##
## combine, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
pct <- function(x) ((x-lag(x))/lag(x))
df<-df %>%mutate_each(funs(pct_change=pct),c(RegGasoline,ConvGasoline,Diesel,CrudeOil,HeatingOil,KeroseneJetFuel,propane))
#Summary statistics for monthly price movement of petroleum products
dd_sub = df[,c(13,14,15,16,17,18,19,20)]
summary(dd_sub)
## date CrudeOil ConvGasoline Diesel
## Min. :1986-01-01 Min. :0.2520 Min. :0.3005 Min. :0.391
## 1st Qu.:1993-11-01 1st Qu.:0.4470 1st Qu.:0.5467 1st Qu.:0.841
## Median :2001-09-01 Median :0.6601 Median :0.8235 Median :1.514
## Mean :2001-08-31 Mean :1.0322 Mean :1.2241 Mean :1.676
## 3rd Qu.:2009-07-01 3rd Qu.:1.4851 3rd Qu.:1.8547 3rd Qu.:2.296
## Max. :2017-05-01 Max. :3.1690 Max. :3.2880 Max. :3.877
## NA's :5 NA's :123
## RegGasoline HeatingOil KeroseneJetFuel propane
## Min. :0.949 Min. :0.3040 Min. :0.3040 Min. :0.2090
## 1st Qu.:1.631 1st Qu.:0.5357 1st Qu.:0.5573 1st Qu.:0.3590
## Median :2.125 Median :0.7820 Median :0.9730 Median :0.5830
## Mean :2.179 Mean :1.2453 Mean :1.3680 Mean :0.7042
## 3rd Qu.:2.760 3rd Qu.:1.8110 3rd Qu.:2.0420 3rd Qu.:0.9915
## Max. :3.694 Max. :3.8010 Max. :3.8860 Max. :1.8620
## NA's :210 NA's :5 NA's :51 NA's :77
#visualize monthly price movement of petroleum product
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following objects are masked from 'package:gdata':
##
## first, last
library(ggplot2)
dd = melt(dd_sub, id=c("date"))
names(dd)<-c("Date","Petroleum_Products","Price")
ggplot(dd) + geom_line(aes(x=Date, y=Price, colour=Petroleum_Products)) +scale_colour_manual(values=c("grey","purple","blue","green","yellow","red","black"))+ ggtitle("Price Movement of Petroleum Products since 1986")
## Warning: Removed 470 rows containing missing values (geom_path).
#Summary statistic for monthly percentange change in price movement of petroleum products
dd_sub = df[,c(13,27,26,25,24,23,22,21)]
summary(dd_sub)
## date propane_pct_change KeroseneJetFuel_pct_change
## Min. :1986-01-01 Min. :-0.65735 Min. :-0.60193
## 1st Qu.:1993-11-01 1st Qu.:-0.17681 1st Qu.:-0.13873
## Median :2001-09-01 Median : 0.09476 Median : 0.02183
## Mean :2001-08-31 Mean : 0.10346 Mean : 0.08373
## 3rd Qu.:2009-07-01 3rd Qu.: 0.35954 3rd Qu.: 0.31394
## Max. :2017-05-01 Max. : 1.66071 Max. : 1.52427
## NA's :89 NA's :63
## HeatingOil_pct_change CrudeOil_pct_change Diesel_pct_change
## Min. :-0.78465 Min. :-0.78353 Min. :-0.60042
## 1st Qu.:-0.14034 1st Qu.:-0.15513 1st Qu.:-0.17268
## Median : 0.03132 Median : 0.04000 Median : 0.06524
## Mean : 0.07973 Mean : 0.07072 Mean : 0.09444
## 3rd Qu.: 0.30830 3rd Qu.: 0.28102 3rd Qu.: 0.38335
## Max. : 2.08224 Max. : 1.56508 Max. : 1.26854
## NA's :10 NA's :1 NA's :135
## ConvGasoline_pct_change RegGasoline_pct_change
## Min. :-0.79023 Min. :-0.54694
## 1st Qu.:-0.13431 1st Qu.:-0.09957
## Median : 0.03656 Median : 0.07295
## Mean : 0.07166 Mean : 0.06639
## 3rd Qu.: 0.25243 3rd Qu.: 0.28275
## Max. : 1.64136 Max. : 0.81441
## NA's :10 NA's :222
#visualize the monthly percentange change in price movement of petroleum product
dd = melt(dd_sub, id=c("date"))
names(dd)<-c("Date","Petroleum_Products","Percentage_Change_in_Price")
ggplot(dd) + geom_line(aes(x=Date, y=Percentage_Change_in_Price, colour=Petroleum_Products)) +scale_colour_manual(values=c("grey","purple","blue","black","green","yellow","red"))+ ggtitle("Petroleum Products' Percentage Change in Price since 1986")
## Warning: Removed 518 rows containing missing values (geom_path).
#Aggregate by year by getting the mean of the monthly price
df$year=format(as.Date(df$date,"%Y-%m-%d"),"%Y")
df_year<-df %>%group_by(year)%>%summarise_each (funs(mean=mean) , c(RegGasoline,ConvGasoline,Diesel,CrudeOil,HeatingOil,KeroseneJetFuel,propane))
df_year<-df_year %>%mutate_each(funs(pct_change=pct),c(RegGasoline_mean,ConvGasoline_mean,Diesel_mean,CrudeOil_mean,HeatingOil_mean,KeroseneJetFuel_mean,propane_mean))
df_year$date=as.Date(paste0(df_year$year,"-01-01"),stargaformat="%Y-%d-%m")
#Summary statistic petroleum product prices at yearly level
dd_sub1 = df_year[,c(16, 2,3,4,5,6,7,8)]
summary(dd_sub1)
## date RegGasoline_mean ConvGasoline_mean Diesel_mean
## Min. :1986-01-01 Min. :1.454 Min. :0.4205 Min. :0.4859
## 1st Qu.:1993-10-01 1st Qu.:1.841 1st Qu.:0.5731 1st Qu.:0.9147
## Median :2001-07-02 Median :2.210 Median :0.8468 Median :1.6343
## Mean :2001-07-02 Mean :2.264 Mean :1.2459 Mean :1.7058
## 3rd Qu.:2009-04-02 3rd Qu.:2.668 3rd Qu.:1.7336 3rd Qu.:2.1847
## Max. :2017-01-01 Max. :3.031 Max. :2.8767 Max. :3.0907
## NA's :19 NA's :1 NA's :11
## CrudeOil_mean HeatingOil_mean KeroseneJetFuel_mean propane_mean
## Min. :0.3241 Min. :0.3930 Min. :0.4035 Min. :0.2612
## 1st Qu.:0.4503 1st Qu.:0.5606 1st Qu.:0.5891 1st Qu.:0.4093
## Median :0.6542 Median :0.8502 Median :1.1499 Median :0.5816
## Mean :1.0358 Mean :1.2663 Mean :1.3874 Mean :0.7121
## 3rd Qu.:1.4899 3rd Qu.:1.7224 3rd Qu.:2.0253 3rd Qu.:1.0039
## Max. :2.4588 Max. :3.0238 Max. :3.0565 Max. :1.4633
## NA's :1 NA's :5 NA's :7
#Visualize the price movement by year
dd1 = melt(dd_sub1, id=c("date")) #transform the data
names(dd1)<-c("Date","Petroleum_Products","Price") # rename the column
#plot the graph
ggplot(dd1) + geom_line(aes(x=year(Date), y=Price, colour=Petroleum_Products)) +scale_colour_manual(values=c("grey","purple","blue","black","green","yellow","red"))+ggtitle("Petroleum Products' Percentage Change in Price by year")
## Warning: Removed 44 rows containing missing values (geom_path).
library(fpp)
## Loading required package: forecast
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: tseries
crudeoil=df[,13:14]
crudeoil=crudeoil[order(as.Date(crudeoil$date)),]
myts <- ts(crudeoil$CrudeOil, start=c(1986, 1), end=c(2017, 5),frequency=12)
#First, visualize the data
tsdisplay(myts) #There is a trend
###Use auto-arima model to find the optimal P,Q,D and p,q,d
auto.arima(myts)
## Series: myts
## ARIMA(2,1,2)(0,0,2)[12]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1 sma2
## 1.5201 -0.6217 -1.1418 0.2359 0.1013 -0.1599
## s.e. 0.0913 0.0861 0.1123 0.1049 0.0566 0.0535
##
## sigma^2 estimated as 0.008407: log likelihood=367.33
## AIC=-720.66 AICc=-720.36 BIC=-693.15
#best model is below with lowest AICC
fit<- Arima(myts, order=c(2,1,2), seasonal=c(0,0,2))
summary(fit)
## Series: myts
## ARIMA(2,1,2)(0,0,2)[12]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1 sma2
## 1.5201 -0.6217 -1.1418 0.2359 0.1013 -0.1599
## s.e. 0.0913 0.0861 0.1123 0.1049 0.0566 0.0535
##
## sigma^2 estimated as 0.008407: log likelihood=367.33
## AIC=-720.66 AICc=-720.36 BIC=-693.15
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set 0.002082911 0.09083504 0.06372521 -0.0826539 6.690122
## MASE ACF1
## Training set 0.2383274 -0.004120475
#test the result
Acf(residuals(fit)) #From the residual, it looks stable
Box.test(residuals(fit), lag=24, fitdf=3, type="Ljung") #passed!
##
## Box-Ljung test
##
## data: residuals(fit)
## X-squared = 16.388, df = 21, p-value = 0.7475
fcast<-forecast(fit,h=6)
plot(fcast)
conv_gas=df[,4:5]
conv_gas=na.omit(conv_gas)
names(conv_gas)<-c("New York Habor","U.S. Gulf Coast")
cor(conv_gas)
## New York Habor U.S. Gulf Coast
## New York Habor 1.0000000 0.9980088
## U.S. Gulf Coast 0.9980088 1.0000000