ARIMA Modeling of Fuel Prices

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.5     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tseries)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(ggplot2)
library(quantmod)

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## Loading required package: TTR

library(mltools)

## 
## Attaching package: 'mltools'

## The following object is masked from 'package:tidyr':
## 
##     replace_na

library(tidyr)
library(forecast)

## Warning: package 'forecast' was built under R version 4.1.2

library(tseries)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(zoo)
library(sjPlot)

## Learn more about sjPlot with 'browseVignettes("sjPlot")'.

library(sjmisc)

## 
## Attaching package: 'sjmisc'

## The following object is masked from 'package:mltools':
## 
##     replace_na

## The following object is masked from 'package:purrr':
## 
##     is_empty

## The following object is masked from 'package:tidyr':
## 
##     replace_na

## The following object is masked from 'package:tibble':
## 
##     add_case

library(sjlabelled)

## 
## Attaching package: 'sjlabelled'

## The following object is masked from 'package:forcats':
## 
##     as_factor

## The following object is masked from 'package:dplyr':
## 
##     as_label

## The following object is masked from 'package:ggplot2':
## 
##     as_label

Examining the time series and performing the appropriate modifications

library(readxl)
Fuel_prices_data <- read_excel("/Users/nishanthreddy/Downloads/Fuel_prices_data.xlsx")
head(Fuel_prices_data)

## # A tibble: 6 × 2
##   Week                Price
##   <dttm>              <dbl>
## 1 2022-01-17 00:00:00  3.40
## 2 2022-01-10 00:00:00  3.39
## 3 2022-01-03 00:00:00  3.38
## 4 2021-12-27 00:00:00  3.38
## 5 2021-12-20 00:00:00  3.40
## 6 2021-12-13 00:00:00  3.41

str(Fuel_prices_data)

## tibble [1,503 × 2] (S3: tbl_df/tbl/data.frame)
##  $ Week : POSIXct[1:1503], format: "2022-01-17" "2022-01-10" ...
##  $ Price: num [1:1503] 3.4 3.39 3.38 3.38 3.4 ...

Fuel_data <- Fuel_prices_data[which(Fuel_prices_data$`Week` >= "2011-01-01" & Fuel_prices_data$`Week` <= "2021-12-31" ),]

#line chart
ggplot(data = Fuel_data, aes(x = Fuel_data$Week, y = Fuel_data$Price  )) + theme_bw()+
  geom_line() + labs(title = "Line Chart")

## Warning: Use of `Fuel_data$Week` is discouraged. Use `Week` instead.

## Warning: Use of `Fuel_data$Price` is discouraged. Use `Price` instead.

## Time series appears to be variance stationary through out the time frame. However, we’ll test the transformations to assure.

Fuel_Data_roll <- Fuel_data %>%
  mutate(
   Price_mean = zoo::rollmean(
      Price, 
      k = 180, 
      fill = NA),
    Price_sd = zoo::rollapply(
      Price, 
      FUN = sd, 
      width = 180, 
      fill = NA)
  )
Price_roll_mean <- Fuel_Data_roll %>%
  ggplot() +
  geom_line(aes(Week, Price_mean)) +
  theme_bw() +
  ggtitle("Fuel Prices Mean over Time (6 month rolling window)") +
  ylab("Adjusted Fuel Price") +
  xlab("Years")
Price_roll_sd <-Fuel_Data_roll %>%
  ggplot() +
  geom_line(aes(Week, Price_sd)) +
  theme_bw() +
  ggtitle("Differenced Fuel Prices SD over Time (6 month rolling window)") +
  ylab("Adjusted Fuel Price") +
  xlab("Years")
Price_roll_mean

## Warning: Removed 179 row(s) containing missing values (geom_path).

Price_roll_sd

## Warning: Removed 179 row(s) containing missing values (geom_path).

## ADF Test

print(adf.test(Fuel_data$Price))

## 
##  Augmented Dickey-Fuller Test
## 
## data:  Fuel_data$Price
## Dickey-Fuller = -3.4053, Lag order = 8, p-value = 0.05253
## alternative hypothesis: stationary

From the above ADF Test output, we observe the p-value is greater than 0.05. So, we fail to reject the Null Hypothesis and infer that the time series is non-stationary.

kpss.test(Fuel_data$Price, null="Trend")

## Warning in kpss.test(Fuel_data$Price, null = "Trend"): p-value smaller than
## printed p-value

## 
##  KPSS Test for Trend Stationarity
## 
## data:  Fuel_data$Price
## KPSS Trend = 0.95139, Truncation lag parameter = 6, p-value = 0.01

From the above KPSS Test output, we observe the p-value is less than 0.05. So, we reject the Null Hypothesis and infer that the time series is non-stationary.

Log Transformation for Variance

Fuel_data_log = Fuel_data %>%
  mutate(Price_log = log1p(Fuel_data$Price))
Fuel_data_log %>%
  ggplot()+
      geom_line(aes(Week,Price_log ))+
      theme_bw()+
      ggtitle("Fuel Prices Over Time (Log)")+
      ylab("Fuel Price (Log)")+
      xlab("Years")

## Box-Cox Transformation

Fuel_Data_boxcox <- Fuel_data %>%
  mutate(Price_boxcox = forecast::BoxCox(Fuel_data$Price,lambda='auto'))
Fuel_Data_boxcox %>%
  ggplot()+
      geom_line(aes(Week,Price_boxcox))+
      theme_bw()+
      ggtitle("Fuel Prices over Time (Box-Cox Transformation)")+
      ylab(" Price (Box-Cox Tranformation)")+
      xlab("Years")

## The time series after being log-transformed and Box-Cox transformed are extremely identical.

Handling Mean Stationarity

Fuel_diff <- Fuel_data_log %>%
  mutate(Price_diff = Fuel_data$Price - lag(Fuel_data$Price))
Fuel_diff %>% select(Week, Price, Price_diff) %>%
head()

## # A tibble: 6 × 3
##   Week                Price Price_diff
##   <dttm>              <dbl>      <dbl>
## 1 2021-12-27 00:00:00  3.38    NA     
## 2 2021-12-20 00:00:00  3.40     0.0200
## 3 2021-12-13 00:00:00  3.41     0.0190
## 4 2021-12-06 00:00:00  3.44     0.0260
## 5 2021-11-29 00:00:00  3.48     0.0380
## 6 2021-11-22 00:00:00  3.49     0.0150

ggplot() + geom_line(aes(Fuel_diff$Week,Fuel_diff$Price_diff)) + theme_bw() + ggtitle("Fuel Price (First Difference)") + ylab("Price(Difference))") + xlab("Year")

## Warning: Removed 1 row(s) containing missing values (geom_path).

ACF

acf(Fuel_diff$Price)

## We can see that the time series has no seasonality in the ACF plot above. ## In the ACF plot, there is a Dampening effect. As a result, the time series is entirely an AR process.

PACF

pacf(Fuel_diff$Price)

## We can see from the PACF that there are two statistically significant autocorrelations. As a result, the order of the AR process is 2. ARIMA is given as (2,1,0).

Log Transformation and Differencing

Fuel_data_log_diff <- Fuel_data %>%
 arrange(Fuel_data$Week) %>%
  mutate(Price_log = log1p(Fuel_data$Price), Price_diff = Price - lag(Fuel_data$Price),
         Price_log_diff = Price_log - lag(Price_log) ) %>%
  drop_na() 
Fuel_data_log_diff %>%
  ggplot() + geom_line(aes(Fuel_data_log_diff$Week,Fuel_data_log_diff$Price_diff)) + theme_bw() + ggtitle("Fuel Price (First Difference)") +    ylab("Price(Difference))") + xlab("Year")

## Warning: Use of `Fuel_data_log_diff$Week` is discouraged. Use `Week` instead.

## Warning: Use of `Fuel_data_log_diff$Price_diff` is discouraged. Use `Price_diff`
## instead.

## Re-verifying using the ADF Test

adf.test(Fuel_data_log_diff$Price)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  Fuel_data_log_diff$Price
## Dickey-Fuller = -2.2758, Lag order = 8, p-value = 0.4615
## alternative hypothesis: stationary

adf.test(Fuel_data_log_diff$Price_diff)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  Fuel_data_log_diff$Price_diff
## Dickey-Fuller = -3.175, Lag order = 8, p-value = 0.09224
## alternative hypothesis: stationary

adf.test(Fuel_data_log_diff$Price_log)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  Fuel_data_log_diff$Price_log
## Dickey-Fuller = -3.409, Lag order = 8, p-value = 0.0519
## alternative hypothesis: stationary

adf.test(Fuel_data_log_diff$Price_log_diff)

## Warning in adf.test(Fuel_data_log_diff$Price_log_diff): p-value smaller than
## printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  Fuel_data_log_diff$Price_log_diff
## Dickey-Fuller = -6.9354, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary

Re-verifying using the KPSS Test

kpss.test(Fuel_data_log_diff$Price)

## Warning in kpss.test(Fuel_data_log_diff$Price): p-value smaller than printed p-
## value

## 
##  KPSS Test for Level Stationarity
## 
## data:  Fuel_data_log_diff$Price
## KPSS Level = 3.9025, Truncation lag parameter = 6, p-value = 0.01

.test(Fuel_data_log_diff$Price_diff)

kpss.test(Fuel_data_log_diff$Price_diff)

## Warning in kpss.test(Fuel_data_log_diff$Price_diff): p-value smaller than
## printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  Fuel_data_log_diff$Price_diff
## KPSS Level = 6.0787, Truncation lag parameter = 6, p-value = 0.01

kpss.test(Fuel_data_log_diff$Price_log)

## Warning in kpss.test(Fuel_data_log_diff$Price_log): p-value smaller than printed
## p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  Fuel_data_log_diff$Price_log
## KPSS Level = 3.7162, Truncation lag parameter = 6, p-value = 0.01

kpss.test(Fuel_data_log_diff$Price_log_diff)

## Warning in kpss.test(Fuel_data_log_diff$Price_log_diff): p-value greater than
## printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  Fuel_data_log_diff$Price_log_diff
## KPSS Level = 0.10982, Truncation lag parameter = 6, p-value = 0.1

The results of the ADF and KPSS tests show that time series is stationary.

##ARIMA

par(mfrow = c(1,2))
acf(Fuel_data_log_diff$Price_log_diff,lag.max=20)
pacf(Fuel_data_log_diff$Price_log_diff,lag.max=20)

## AIC of ARIMA Models

AIC(
  arima(Fuel_data_log_diff$Price_log,order=c(0,1,1)),
  arima(Fuel_data_log_diff$Price_log,order=c(0,1,2)),
  arima(Fuel_data_log_diff$Price_log,order=c(0,1,3)),
  arima(Fuel_data_log_diff$Price_log,order=c(1,1,0)),
  arima(Fuel_data_log_diff$Price_log,order=c(1,1,1)),
  arima(Fuel_data_log_diff$Price_log,order=c(2,1,0)),
  arima(Fuel_data_log_diff$Price_log,order=c(3,1,0))
)

##                                                         df       AIC
## arima(Fuel_data_log_diff$Price_log, order = c(0, 1, 1))  2 -3568.583
## arima(Fuel_data_log_diff$Price_log, order = c(0, 1, 2))  3 -3612.266
## arima(Fuel_data_log_diff$Price_log, order = c(0, 1, 3))  4 -3622.255
## arima(Fuel_data_log_diff$Price_log, order = c(1, 1, 0))  2 -3636.575
## arima(Fuel_data_log_diff$Price_log, order = c(1, 1, 1))  3 -3634.718
## arima(Fuel_data_log_diff$Price_log, order = c(2, 1, 0))  3 -3634.702
## arima(Fuel_data_log_diff$Price_log, order = c(3, 1, 0))  4 -3633.285

According to the above AIC values Models, ARIMA (1,1,0) is the best model with an AIC = -3636.578.

BIC of ARIMA Models

BIC(
  arima(Fuel_data_log_diff$Price_log,order=c(0,1,1)),
  arima(Fuel_data_log_diff$Price_log,order=c(0,1,2)),
  arima(Fuel_data_log_diff$Price_log,order=c(0,1,3)),
  arima(Fuel_data_log_diff$Price_log,order=c(1,1,0)),
  arima(Fuel_data_log_diff$Price_log,order=c(1,1,1)),
  arima(Fuel_data_log_diff$Price_log,order=c(2,1,0)),
  arima(Fuel_data_log_diff$Price_log,order=c(3,1,0))
)

##                                                         df       BIC
## arima(Fuel_data_log_diff$Price_log, order = c(0, 1, 1))  2 -3559.884
## arima(Fuel_data_log_diff$Price_log, order = c(0, 1, 2))  3 -3599.219
## arima(Fuel_data_log_diff$Price_log, order = c(0, 1, 3))  4 -3604.859
## arima(Fuel_data_log_diff$Price_log, order = c(1, 1, 0))  2 -3627.877
## arima(Fuel_data_log_diff$Price_log, order = c(1, 1, 1))  3 -3621.670
## arima(Fuel_data_log_diff$Price_log, order = c(2, 1, 0))  3 -3621.655
## arima(Fuel_data_log_diff$Price_log, order = c(3, 1, 0))  4 -3615.888

According to the above BIC values Models, ARIMA (1,1,1) is the best model with an BIC = -3621.676.

Auto ARIMA

auto.arima(Fuel_data_log_diff$Price_log,stationary=FALSE,allowdrift=FALSE,
seasonal=FALSE,stepwise=FALSE,approximation=FALSE)

## Series: Fuel_data_log_diff$Price_log 
## ARIMA(1,1,0) 
## 
## Coefficients:
##          ar1
##       0.6031
## s.e.  0.0332
## 
## sigma^2 = 0.0001009:  log likelihood = 1820.29
## AIC=-3636.58   AICc=-3636.55   BIC=-3627.88

As per the output of the model, we can conclude that ARIMA(1,1,0) is the best model which can be used for further analysis.

Ljung - Box Test

Model_1 = arima(Fuel_data_log_diff$Price_log,order=c(1,1,0))
forecast::checkresiduals(Model_1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 7.5908, df = 9, p-value = 0.5758
## 
## Model df: 1.   Total lags used: 10

From the viz, we can say that the residuals seem to be white noise .

par(mfrow=c(1,2))
res = Model_1$resid
acf(res,lag.max=20)
pacf(res,lag.max=20)

## Predicting Model Performance using Forecast

plot(forecast(Model_1,h=150),main="ARIMA(1,1,0) Forecast",ylab="Fuel Price per Gallon ($)",xlab="Date")

prediction <- Arima(Fuel_data_log_diff$Price, model = Model_1)
accuracy(prediction)

##                        ME      RMSE        MAE         MPE      MAPE      MASE
## Training set 0.0001401753 0.0390067 0.02871679 0.009395186 0.9943248 0.7846193
##                      ACF1
## Training set -0.006829676