Libraries

library(readxl)
library(ggplot2)
library(fpp3)
library(dplyr)
library(fst)
library(tsibbledata)
library(tsibble)
library(feasts)
library(lubridate)
library(seasonal)
library(zoo)
library(fable)
library(readxl)
library(ggplot2)
library(forecast)
library(tidyverse)
library(kableExtra)

 
 
 

Data

# Reading data and converting date
data <- read_excel("data/DRO Difference 2021 -- formatted.xlsx")
data$Date = as.Date(data$Date, formate = "%Y/%U")

# data
head(data)
# A tibble: 6 x 5
  Date       Day       DRO   Actual Difference
  <date>     <chr>     <chr>  <dbl>      <dbl>
1 2021-01-04 Monday    2759    1873       -886
2 2021-01-05 Tuesday   3387    3100       -287
3 2021-01-06 Wednesday 3689    3378       -311
4 2021-01-07 Thursday  3796    3286       -510
5 2021-01-08 Friday    3492    3215       -277
6 2021-01-09 Saturday  2535    2098       -437
dim(data) # not very large
[1] 196   5

 
 

- Data Exploration

# converting data to tsibble
data <- data %>%
  group_by(Date) %>%
  as_tsibble(index = Date)

data %>% gg_tsdisplay()

ndiffs(data$Actual)
[1] 0
data %>% unitroot_nsdiffs()
nsdiffs 
      0 
# checking NA's
colSums(is.na(data)) 
      Date        Day        DRO     Actual Difference 
         0          0          8          3         11 

 
 

- Data Preparation

# Finding which days have NA's
data$Day[which(is.na(data$Actual))]
[1] "Tuesday" "Monday"  "Sunday" 
# Calculating average value for each missing day
avgSun <- (sum(data[which(data$Day == "Sunday"), 4], na.rm = TRUE)) / (sum(data$Day == "Sunday"))
avgMon <- (sum(data[which(data$Day == "Monday"), 4], na.rm = TRUE)) / (sum(data$Day == "Monday"))
avgTue <- (sum(data[which(data$Day == "Tuesday"), 4], na.rm = TRUE)) / (sum(data$Day == "Tuesday"))

# finding locations
which(is.na(data$Actual))
[1]  44 148 182
data$Day[44]
[1] "Tuesday"
data$Day[148]
[1] "Monday"
data$Day[182]
[1] "Sunday"
# replacing with average - definitely not the best way but works for now
data$Actual[44] = round(avgTue)
data$Actual[148] = round(avgMon)
data$Actual[182] = round(avgSun)




# creating training data
train <- data %>% filter_index(.~ "2021-07-01")

 
 
 
 
 

Models

# Model 1
fit <- train %>%
  model(NNETAR(log(Actual)))

report(fit) # NNAR(1,1,2)[7]
Series: Actual 
Model: NNAR(1,1,2)[7] 
Transformation: log(Actual) 

Average of 20 networks, each of which is
a 2-2-1 network with 9 weights
options were - linear output units 

sigma^2 estimated as 0.02488

 
 

- Model 1 Evaluation

fit %>% gg_tsresiduals()

# There is one significant outlier in the residual histogram. 
# Will find and replace.

test = fit %>% augment()
find.outlier <- boxplot(test$.resid, id=list(method="y"))

find.outlier = find.outlier$out # 4 significant outliers

list = list()
list[[1]] = test[which(test$.resid == find.outlier[1]),c(2,3)]
list[[2]] = test[which(test$.resid == find.outlier[2]),c(2,3)]
list[[3]] = test[which(test$.resid == find.outlier[3]),c(2,3)]
list[[4]] = test[which(test$.resid == find.outlier[4]),c(2,3)]

outliers = do.call(bind_rows, list)
outliers
# A tsibble: 4 x 2 [1D]
  Date       Actual
  <date>      <dbl>
1 2021-01-31    358
2 2021-02-02   2898
3 2021-02-09   1991
4 2021-04-17   1629
# The observation on 2021-01-31 is likely causing the extreme residual. 
# Going to replace it with the average value for it's day. 

 
 

- Replacing Data

loc = which(data == 358, arr.ind = TRUE) # finding location
data[loc[1],2] # finding day
# A tibble: 1 x 1
  Day   
  <chr> 
1 Sunday
data$Actual[loc[1]] = round(avgSun) # replacing

train <- data %>% filter_index(.~ "2021-07-01")

 
 

- Model 2

# Model 2
fit2 <- train %>%
  model(NNETAR(log(Actual)))

report(fit2) # NNAR(6,1,4)[7]
Series: Actual 
Model: NNAR(6,1,4)[7] 
Transformation: log(Actual) 

Average of 20 networks, each of which is
a 7-4-1 network with 37 weights
options were - linear output units 

sigma^2 estimated as 0.004846

 
 

- Model 2 Evaluation

fit2 %>% gg_tsresiduals()

# Much better
test2 = fit2 %>% augment() # just testing
boxplot(test2$.resid)

# forecasted values
fc = fit2 %>%
  forecast(h = 17)

fc %>% autoplot(data, level=35)

fc %>% accuracy(data)
# A tibble: 1 x 10
  .model              .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
  <chr>               <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
1 NNETAR(log(Actual)) Test  -115.  211.  176. -7.49  9.77 0.742 0.706 -0.299

Final Comparison

compare = list()
compare <- data %>% 
  filter_index("2021-07-02" ~.) %>%
  select(Date, Actual, DRO)

compare[,4] = round(fc$.mean)
names(compare)[4] = "Preds"

compare$DRO = as.numeric(compare$DRO)
compare$DRO[3] = mean(compare$DRO, na.rm=TRUE)

ggplot(data = compare, mapping = aes(x = Date)) + 
  geom_line(aes(y = Actual), color = "red") + 
  geom_point(aes(y = Actual), color = "red") + 
  geom_line(aes(y = DRO), color = "orange") + 
  geom_point(aes(y = DRO), color = "orange") + 
  geom_line(aes(y = Preds), color = "darkgreen") + 
  geom_point(aes(y = Preds), color = "darkgreen") + 
  ggtitle("Actual (red)   v   DRO (orange)   v   NNAR (green)")