Time series analysis using Prophet Model

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.5     ✓ stringr 1.4.0
## ✓ tidyr   1.1.4     ✓ forcats 0.5.1
## ✓ readr   2.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(ggplot2)
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
library(mltools)
## 
## Attaching package: 'mltools'
## The following object is masked from 'package:tidyr':
## 
##     replace_na
library(tidyr)
library(forecast)
## Warning: package 'forecast' was built under R version 4.1.2
library(tseries)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(zoo)
library(sjPlot)
library(sjmisc)
## 
## Attaching package: 'sjmisc'
## The following object is masked from 'package:mltools':
## 
##     replace_na
## The following object is masked from 'package:purrr':
## 
##     is_empty
## The following object is masked from 'package:tidyr':
## 
##     replace_na
## The following object is masked from 'package:tibble':
## 
##     add_case
library(sjlabelled)
## 
## Attaching package: 'sjlabelled'
## The following object is masked from 'package:forcats':
## 
##     as_factor
## The following object is masked from 'package:ggplot2':
## 
##     as_label
## The following object is masked from 'package:dplyr':
## 
##     as_label
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:sjlabelled':
## 
##     as_character, as_label
## The following object is masked from 'package:sjmisc':
## 
##     is_empty
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice
Fuel_prices_data <- read_excel("/Users/nishanthreddy/Downloads/Fuel_prices_data.xlsx")
head(Fuel_prices_data)
## # A tibble: 6 × 2
##   Week                Price
##   <dttm>              <dbl>
## 1 2022-01-17 00:00:00  3.40
## 2 2022-01-10 00:00:00  3.39
## 3 2022-01-03 00:00:00  3.38
## 4 2021-12-27 00:00:00  3.38
## 5 2021-12-20 00:00:00  3.40
## 6 2021-12-13 00:00:00  3.41
summary(Fuel_prices_data)
##       Week                         Price      
##  Min.   :1993-04-05 00:00:00   Min.   :0.949  
##  1st Qu.:2000-06-15 12:00:00   1st Qu.:1.329  
##  Median :2007-08-27 00:00:00   Median :2.284  
##  Mean   :2007-08-27 00:00:00   Mean   :2.242  
##  3rd Qu.:2014-11-06 12:00:00   3rd Qu.:2.913  
##  Max.   :2022-01-17 00:00:00   Max.   :4.165
Fuel_data <- Fuel_prices_data %>%
  filter(Week >= ymd("2000=01-01")) %>% rename(Year = "Week",Avg_price ='Price') %>% 
  mutate(Month_yr = format(Year, "%Y-%m")) %>% distinct(Month_yr,.keep_all=TRUE) 
as_tibble(Fuel_data)
## # A tibble: 265 × 3
##    Year                Avg_price Month_yr
##    <dttm>                  <dbl> <chr>   
##  1 2022-01-17 00:00:00      3.40 2022-01 
##  2 2021-12-27 00:00:00      3.38 2021-12 
##  3 2021-11-29 00:00:00      3.48 2021-11 
##  4 2021-10-25 00:00:00      3.48 2021-10 
##  5 2021-09-27 00:00:00      3.27 2021-09 
##  6 2021-08-30 00:00:00      3.24 2021-08 
##  7 2021-07-26 00:00:00      3.23 2021-07 
##  8 2021-06-28 00:00:00      3.18 2021-06 
##  9 2021-05-31 00:00:00      3.12 2021-05 
## 10 2021-04-26 00:00:00      2.96 2021-04 
## # … with 255 more rows

##Prophet Model

The time series data is divided into two parts: train data (from January 2000 to December 2014) and test data (from January 2015 to January 2021). The prophet model is then built based on the train data and forecast. The future forecast period units have been adjusted to reflect the fact that this series is collected at the month level.

prophet_fueldata = Fuel_data %>% 
    rename(ds = Year,
    y = Avg_price)  

train = prophet_fueldata %>% 
  filter(ds<ymd("2015-01-01"))

test = prophet_fueldata %>%
  filter(ds>=ymd("2015-01-01"))

model = prophet(train)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future = make_future_dataframe(model,periods = 85, freq = 'month')
forecast = predict(model,future)

plot(model,forecast)+
ylab("Average Fuel Price")+xlab("Date")+labs(title = 'Forecast: Average Fuel Price',subtitle = 'January 2000 - December 2021')+theme_bw()

Time series Components plots

prophet_plot_components(model,forecast)

We can see taht there’s an upward trend in the Jan-Mid of March period and then there’s a fall in the April followed by few fluctuations till July then during Fall it dips and rises afterwards.

##Finding Saturation points ### After finding the summaries of both the test and train data, we need to find the saturation points which means the threshold of the price which is reasonable as seen in the last 20 years of the data.

summary(train)
##        ds                            y           Month_yr        
##  Min.   :2000-01-31 00:00:00   Min.   :1.137   Length:180        
##  1st Qu.:2003-10-20 00:00:00   1st Qu.:1.686   Class :character  
##  Median :2007-07-12 12:00:00   Median :2.654   Mode  :character  
##  Mean   :2007-07-13 07:36:00   Mean   :2.560                     
##  3rd Qu.:2011-04-04 00:00:00   3rd Qu.:3.373                     
##  Max.   :2014-12-29 00:00:00   Max.   :4.146
summary(test)
##        ds                            y           Month_yr        
##  Min.   :2015-01-26 00:00:00   Min.   :1.870   Length:85         
##  1st Qu.:2016-10-31 00:00:00   1st Qu.:2.341   Class :character  
##  Median :2018-07-30 00:00:00   Median :2.589   Mode  :character  
##  Mean   :2018-07-28 02:32:28   Mean   :2.604                     
##  3rd Qu.:2020-04-27 00:00:00   3rd Qu.:2.857                     
##  Max.   :2022-01-17 00:00:00   Max.   :3.478
#setting floor for train data
train$floor = 1.13
train$cap = 4.14

#setting floor for forecast data

future$floor = 1.8
future$cap = 3.4
sat_model = prophet(train,growth='logistic')
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
sat_forecast = predict(sat_model,future)

plot(sat_model,sat_forecast)+ylim(0,4.5)+xlab("Date")+ylab("Avg Price")+labs(title = 'Forecast: Average Price of Fuel ',subtitle = 'Saturation Points')+theme_bw()

### We set these saturation thresholds for the prophet model, and the resulting figure shows that the predicted values are substantially different from the previous forecast, and that the increasing trend is not as sharp as it was before the limitations were established.

##Examination for Change points

model = prophet(train, n.changepoints = 10)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future = make_future_dataframe(model,periods = 85, freq = 'month')
plot(model,forecast)+add_changepoints_to_plot(model)+xlab("Date")+ylab("Avg Price")+labs(title = 'Forecast: Average Price of Prices',subtitle = 'January 2000 - December 2021')+theme_bw()

These are the two change points observed in the Time series.

##Chekcing the Type of Seasonilty

additive = prophet(train)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
add_fcst = predict(additive,future)

plot(additive,add_fcst)+
ylim(0,5)+xlab("Date")+ylab("Avg Price")+labs(title = 'Forecast: Average Price of Fuel',subtitle = 'Additive Seasonality Model')+theme_bw()

prophet_plot_components(additive,add_fcst)

The additive model does not appear to accurately fit the increased seasonal changes in average price values towards the end of the training period, although it performs well in the beginning.

multi = prophet(train,seasonality.mode = 'multiplicative')
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
multi_fcst = predict(multi,future)

plot(multi,multi_fcst)+ylim(0,5) +theme_bw()+xlab("Date")+ylab("Avg Price")+labs(title = 'Forecast: Average Price of Fuel',subtitle = 'Multiplicative Seasonality Model')

prophet_plot_components(multi,multi_fcst)

Both the models seem to have the same trends.

Model Assessment

forecast_metric_data = forecast %>% 
  as_tibble() %>% 
  filter(ds>=ymd("2015-01-01"))
## Warning in base::check_tzones(e1, e2): 'tzone' attributes are inconsistent
RMSE = sqrt(mean((test$y - forecast_metric_data$yhat)^2))
MAE = mean(abs(test$y - forecast_metric_data$yhat))
MAPE = mean(abs((test$y - forecast_metric_data$yhat)/test$y))
print(paste("RMSE:",round(RMSE,2)))
## [1] "RMSE: 1.85"
print(paste("MAE:",round(MAE,2)))
## [1] "MAE: 1.76"
print(paste("MAPE:",round(MAPE,2)))
## [1] "MAPE: 0.71"

RMSE for the above model is ‘1.85’, whereas MAE for the same model is slightly lower which is ‘1.76’. The Mean Absolute percentage Error is 71%

Cross Validation

Model cross-validation will be performed by making the appropriate adjustments to incorporate monthly data. Setting the initial training time to 5 years and the rolling horizon to 1 year appears to produce relatively low error metrics after a variety of combinations.

df.cv <- cross_validation(model ,horizon=365,period = 365,initial = 5*365,units='days')
## Making 9 forecasts with cutoffs between 2005-12-31 and 2013-12-29
head(df.cv)
## # A tibble: 6 × 6
##       y ds                   yhat yhat_lower yhat_upper cutoff             
##   <dbl> <dttm>              <dbl>      <dbl>      <dbl> <dttm>             
## 1  2.40 2006-01-30 00:00:00  2.34       2.21       2.47 2005-12-31 00:00:00
## 2  2.30 2006-02-27 00:00:00  2.40       2.27       2.52 2005-12-31 00:00:00
## 3  2.54 2006-03-27 00:00:00  2.53       2.40       2.66 2005-12-31 00:00:00
## 4  2.96 2006-04-24 00:00:00  2.52       2.39       2.66 2005-12-31 00:00:00
## 5  2.91 2006-05-29 00:00:00  2.66       2.53       2.79 2005-12-31 00:00:00
## 6  2.91 2006-06-26 00:00:00  2.68       2.55       2.82 2005-12-31 00:00:00
df.cv %>% 
  ggplot()+
  geom_point(aes(ds,y)) +
  geom_point(aes(ds,yhat,color=factor(cutoff)))+
  theme_bw()+
  xlab("Date")+
  ylab("Avg Price")+
  scale_color_discrete(name = 'Cutoff')+
  theme(legend.position = "None")+
  labs(
    title = 'Forecast: Average Price of Fuel',
    subtitle = 'Cross Validation'
  )

plot_cross_validation_metric(df.cv, metric = 'rmse')

plot_cross_validation_metric(df.cv, metric = 'mape')

The rolling window predictions are compared to actuals in the cross validation result plot. For a one-year forecast horizon, the RMSE is between 0.25 and 0.75. The average absolute percent inaccuracy varies from 10 to 25%.