# Loading required packages

library(tidyverse)
library(dbplyr)
library(lubridate)
library(ggplot2)
library(janitor)
library(kableExtra)
library(broom)
library(vtable)

The Data

The dataset used for this time series analysis is the U.S city avaerage gasoline price dataset sourced from the Federal Reserve Economic Data (FRED) ,St. Louis, and the United States Bureau of Labor Statistics. It contains monthly average regular unleaded gasoline prices from January 1976 to December 2021.The two fields in the data are date of observation and the average price.

When I first came across this dataset on the FRED website, I was impressed with the amount of historical data recorded. I moved here from India, where I’ve driven for 10 years, and I plan to start driving in the US as soon as possible, so analyzing the trend of gasoline prices seemed like an interesting choice.

Fuel price fluctuations are common across the world.The concern is when the rates increase considerably and consistently over a short time period. The most common reasons listed for price fluctuations are market competitors, tax and transportation fees, inflation, supply and demand, etc.

The goal of this analysis, as of now, is to understand how gasoline prices have fluctuated over the years and to determine whether: (i) prices follow patterns based on time of the year and, (ii) the fluctuations can be attributed to any obvious external socio-economic factors.

The dataset is continuous and does not have any gaps in monthly datapoints. it may not be easy to forecast the prices for very far into the future as there could be potential unforeseen changes in economic or social factors that affect the prices. However, predicting fluctuations for the near future should be closer to actuals.

For the purpose of this study, we will be considering data from January 2000 to December 2021.

#Loading the dataset

fueldata_raw <- read_csv("/Users/samishav/Documents/Spring 2022/Forecasting & Time Series/Forecasting/APU000074714.csv")
str(fueldata_raw)
## spec_tbl_df [552 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ DATE        : Date[1:552], format: "1976-01-01" "1976-02-01" ...
##  $ APU000074714: num [1:552] 0.605 0.6 0.594 0.592 0.6 0.616 0.623 0.628 0.63 0.629 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   DATE = col_date(format = ""),
##   ..   APU000074714 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(fueldata_raw)
##       DATE             APU000074714  
##  Min.   :1976-01-01   Min.   :0.592  
##  1st Qu.:1987-06-23   1st Qu.:1.119  
##  Median :1998-12-16   Median :1.323  
##  Mean   :1998-12-16   Mean   :1.767  
##  3rd Qu.:2010-06-08   3rd Qu.:2.470  
##  Max.   :2021-12-01   Max.   :4.090
#Filter study period (2000-2021) , rename columns and remove any duplicate data entries per month.

fueldata <- fueldata_raw %>% filter(DATE >= ymd("2000=01-01")) %>% rename(obs_date = "DATE",avg_price ='APU000074714') %>% mutate(month_year = format(obs_date, "%Y-%m")) %>% distinct(month_year,.keep_all=TRUE) 

as_tibble(fueldata)
## # A tibble: 264 × 3
##    obs_date   avg_price month_year
##    <date>         <dbl> <chr>     
##  1 2000-01-01      1.30 2000-01   
##  2 2000-02-01      1.37 2000-02   
##  3 2000-03-01      1.54 2000-03   
##  4 2000-04-01      1.51 2000-04   
##  5 2000-05-01      1.50 2000-05   
##  6 2000-06-01      1.62 2000-06   
##  7 2000-07-01      1.59 2000-07   
##  8 2000-08-01      1.51 2000-08   
##  9 2000-09-01      1.58 2000-09   
## 10 2000-10-01      1.56 2000-10   
## # … with 254 more rows
fueldata
## # A tibble: 264 × 3
##    obs_date   avg_price month_year
##    <date>         <dbl> <chr>     
##  1 2000-01-01      1.30 2000-01   
##  2 2000-02-01      1.37 2000-02   
##  3 2000-03-01      1.54 2000-03   
##  4 2000-04-01      1.51 2000-04   
##  5 2000-05-01      1.50 2000-05   
##  6 2000-06-01      1.62 2000-06   
##  7 2000-07-01      1.59 2000-07   
##  8 2000-08-01      1.51 2000-08   
##  9 2000-09-01      1.58 2000-09   
## 10 2000-10-01      1.56 2000-10   
## # … with 254 more rows

The dataset is loaded and filtered for the period of January 2000 till December 2021. The columns have been renamed in the final table for readability.

Exploratory Data Analysis

Structure of the Dataset

str(fueldata)
## tibble [264 × 3] (S3: tbl_df/tbl/data.frame)
##  $ obs_date  : Date[1:264], format: "2000-01-01" "2000-02-01" ...
##  $ avg_price : num [1:264] 1.3 1.37 1.54 1.51 1.5 ...
##  $ month_year: chr [1:264] "2000-01" "2000-02" "2000-03" "2000-04" ...

The dataset considered for the study from the period of January, 2000 to December, 2021, contains 264 records. The average price is numeric and the original observed date field is of date type. The derived column month_year is a string column.

Summary of the Dataset

summary <- sumtable(fueldata,add.median = TRUE,out = 'kable')

summary %>% kable_styling()
Summary Statistics
Variable N Mean Std. Dev. Min Pctl. 25 Pctl. 50 Pctl. 75 Max
avg_price 264 2.525 0.73 1.13 1.965 2.541 3.023 4.09

The mean average price is 2.525 which is almost comparable with the median of 2.541. This indicates that there is a possibility for the price to follow a normal distribution. This needs to be further examined through boxplots and histograms. No significant outliers are evident at this stage.

Trend of Price over Time

# Trend chart
monthly_price_plot = ggplot(fueldata)+
  geom_line(aes(obs_date,avg_price))+
  theme_bw()+
  xlab("Date")+
  ylab("Average Price")+
  labs(
    title = 'Average Price of Unleaded Regular Gasoline, FRED',
    subtitle = 'January 2000 - December 2021'
  )

#monthly_price_plot

monthly_price_plot + 
  geom_smooth(aes(obs_date,avg_price),method='lm',color='orange')

The trend chart of average price over time reveals a gradual positive change in fuel price over the years.The peak in average fuel price of $4.09, during the period of study, is observed in 2008 which maybe attributed to the impact of the global recession. There was a sharp fall in prices immediately after, in 2009, and then again in 2015 and 2016. The next notable decrease in prices was in early 2020 followed again by a steep rise.

#Average Price by Month of Year
fueldata %>%mutate(month = month(obs_date, label = TRUE)) %>% group_by(month) %>% summarize(avg_monthly_price = mean(avg_price)) %>% ungroup -> monthdata

#monthdata 


avg_month_price_plot = ggplot(monthdata)+
  geom_col(aes(month,avg_monthly_price,fill=month))+
  theme_bw()+
  xlab("Month")+
  ylab("Average Price")+
  labs(
    title = 'Average Price of Unleaded Regular Gasoline by Month of the Year',
    subtitle = 'January 2000 - December 2021'
  )

avg_month_price_plot

The average fuel prices seem to be highest in the second and third quarters. The dip in prices during the winter months may be due to demand and supply fluctuations associated with extreme weather.

Histogram of Average Price

#Histogram of Average Fuel Price
p <- fueldata  %>%
  ggplot( aes(x=avg_price)) +
    geom_histogram(binwidth = 0.5, fill="#69b3a2", color="#e9ecef", alpha=0.9) +
    labs(
    title = 'Histogram: Average Price of Unleaded Regular Gasoline, FRED',
    subtitle = 'January 2000 - December 2021'
  )+
  xlab("Average Price")+
  ylab ("Frequency")
p

The distribution does not seem to follow a proper normal distribution. as there is a slight dip in frequency when the average price is between 1.75 and 2.25 before it peaks at around 2.5. Next step is to attempt a logarithmic transformation on the fuel price to see if that reveals a clearer distribution.

Histogram of Transformed Average Price

#Logarithmic Conversion and histogram plot 
p2 <- fueldata  %>%
  ggplot( aes(x=log(avg_price))) +
    geom_histogram(binwidth = 0.5, fill="#69b3a2", color="#e9ecef", alpha=0.9) +
    labs(
    title = 'Histogram: Logarithmic Average Price of Unleaded Regular Gasoline, FRED',
    subtitle = 'January 2000 - December 2021'
  )+
  xlab("Log(Average Price)")+
  ylab ("Frequency")
    
p2

The histogram for the logarithmic avergae fuel price reveals a slightly left skewed distribution.

Boxplot of Average Price

ggplot(fueldata, aes(y= avg_price)) +
  geom_boxplot(fill='#e9ecef', color="#69b3a2")+
  ylab("Average Price")+
  labs(
    title = 'Boxplot: Average Price of Unleaded Regular Gasoline, FRED',
    subtitle = 'January 2000 - December 2021'
  )+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

The boxplot further corroborates the earlier observation that there are no significant outliers in the dataset.

Model Fitting and Results

#Fitting a linear model on the time series.
mod1 = lm(avg_price~obs_date,data=fueldata)
summary(mod1)
## 
## Call:
## lm(formula = avg_price ~ obs_date, data = fueldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1688 -0.5073 -0.1509  0.5443  1.7024 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.399e-01  2.566e-01   0.935    0.351    
## obs_date    1.527e-04  1.695e-05   9.011   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6389 on 262 degrees of freedom
## Multiple R-squared:  0.2366, Adjusted R-squared:  0.2337 
## F-statistic: 81.19 on 1 and 262 DF,  p-value: < 2.2e-16
# Model Results
mod1 %>%
  tidy() %>%
  mutate(
    p.value = scales::pvalue(p.value),
    term = c("Intercept", 'Time')
  ) %>%
  kable(    
    caption = "Estimates for Impact of Linear Time Trend on Average Gasoline Prices",
    col.names = c("Predictor", "B", "SE", "T Statistic", "p-value"),
    digits = c(0,5, 3, 2, 3)
  )  %>% kable_styling()
Estimates for Impact of Linear Time Trend on Average Gasoline Prices
Predictor B SE T Statistic p-value
Intercept 0.23990 0.257 0.93 0.351
Time 0.00015 0.000 9.01 <0.001

The model results show a p value of less than 0.001 for Time, which indicates that there is a statistically significant positive linear relationship between time and average fuel price. However, intercept value is not statistically significant as per high p value in the model results.The coefficient value for Time indicates that the change in price because of a unit change in time is very low (0.00015) in magnitude.