Â
Â
Â
library(readxl)
library(ggplot2)
library(fpp3)
library(dplyr)
library(fst)
library(tsibbledata)
library(tsibble)
library(feasts)
library(lubridate)
library(seasonal)
library(zoo)
library(fable)
library(readxl)
library(ggplot2)
library(forecast)
library(tidyverse)
library(kableExtra)
Â
Â
Â
# Reading data and converting date
data <- read_excel("data/DRO Difference 2021 -- formatted.xlsx")
data$Date = as.Date(data$Date, formate = "%Y/%U")
# data
head(data)
# A tibble: 6 x 5
Date Day DRO Actual Difference
<date> <chr> <chr> <dbl> <dbl>
1 2021-01-04 Monday 2759 1873 -886
2 2021-01-05 Tuesday 3387 3100 -287
3 2021-01-06 Wednesday 3689 3378 -311
4 2021-01-07 Thursday 3796 3286 -510
5 2021-01-08 Friday 3492 3215 -277
6 2021-01-09 Saturday 2535 2098 -437
dim(data) # not very large
[1] 196 5
Â
Â
# converting data to tsibble
data <- data %>%
group_by(Date) %>%
as_tsibble(index = Date)
data %>% gg_tsdisplay()
ndiffs(data$Actual)
[1] 0
data %>% unitroot_nsdiffs()
nsdiffs
0
# checking NA's
colSums(is.na(data))
Date Day DRO Actual Difference
0 0 8 3 11
Â
Â
# Finding which days have NA's
data$Day[which(is.na(data$Actual))]
[1] "Tuesday" "Monday" "Sunday"
# Calculating average value for each missing day
avgSun <- (sum(data[which(data$Day == "Sunday"), 4], na.rm = TRUE)) / (sum(data$Day == "Sunday"))
avgMon <- (sum(data[which(data$Day == "Monday"), 4], na.rm = TRUE)) / (sum(data$Day == "Monday"))
avgTue <- (sum(data[which(data$Day == "Tuesday"), 4], na.rm = TRUE)) / (sum(data$Day == "Tuesday"))
# finding locations
which(is.na(data$Actual))
[1] 44 148 182
data$Day[44]
[1] "Tuesday"
data$Day[148]
[1] "Monday"
data$Day[182]
[1] "Sunday"
# replacing with average - definitely not the best way but works for now
data$Actual[44] = round(avgTue)
data$Actual[148] = round(avgMon)
data$Actual[182] = round(avgSun)
# creating training data
train <- data %>% filter_index(.~ "2021-07-01")
Â
Â
Â
Â
Â
# Model 1
fit <- train %>%
model(NNETAR(log(Actual)))
report(fit) # NNAR(1,1,2)[7]
Series: Actual
Model: NNAR(1,1,2)[7]
Transformation: log(Actual)
Average of 20 networks, each of which is
a 2-2-1 network with 9 weights
options were - linear output units
sigma^2 estimated as 0.02488
Â
Â
fit %>% gg_tsresiduals()
# There is one significant outlier in the residual histogram.
# Will find and replace.
test = fit %>% augment()
find.outlier <- boxplot(test$.resid, id=list(method="y"))
find.outlier = find.outlier$out # 4 significant outliers
list = list()
list[[1]] = test[which(test$.resid == find.outlier[1]),c(2,3)]
list[[2]] = test[which(test$.resid == find.outlier[2]),c(2,3)]
list[[3]] = test[which(test$.resid == find.outlier[3]),c(2,3)]
list[[4]] = test[which(test$.resid == find.outlier[4]),c(2,3)]
outliers = do.call(bind_rows, list)
outliers
# A tsibble: 4 x 2 [1D]
Date Actual
<date> <dbl>
1 2021-01-31 358
2 2021-02-02 2898
3 2021-02-09 1991
4 2021-04-17 1629
# The observation on 2021-01-31 is likely causing the extreme residual.
# Going to replace it with the average value for it's day.
Â
Â
loc = which(data == 358, arr.ind = TRUE) # finding location
data[loc[1],2] # finding day
# A tibble: 1 x 1
Day
<chr>
1 Sunday
data$Actual[loc[1]] = round(avgSun) # replacing
train <- data %>% filter_index(.~ "2021-07-01")
Â
Â
# Model 2
fit2 <- train %>%
model(NNETAR(log(Actual)))
report(fit2) # NNAR(6,1,4)[7]
Series: Actual
Model: NNAR(6,1,4)[7]
Transformation: log(Actual)
Average of 20 networks, each of which is
a 7-4-1 network with 37 weights
options were - linear output units
sigma^2 estimated as 0.004846
Â
Â
fit2 %>% gg_tsresiduals()
# Much better
test2 = fit2 %>% augment() # just testing
boxplot(test2$.resid)
# forecasted values
fc = fit2 %>%
forecast(h = 17)
fc %>% autoplot(data, level=35)
fc %>% accuracy(data)
# A tibble: 1 x 10
.model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 NNETAR(log(Actual)) Test -115. 211. 176. -7.49 9.77 0.742 0.706 -0.299
compare = list()
compare <- data %>%
filter_index("2021-07-02" ~.) %>%
select(Date, Actual, DRO)
compare[,4] = round(fc$.mean)
names(compare)[4] = "Preds"
compare$DRO = as.numeric(compare$DRO)
compare$DRO[3] = mean(compare$DRO, na.rm=TRUE)
ggplot(data = compare, mapping = aes(x = Date)) +
geom_line(aes(y = Actual), color = "red") +
geom_point(aes(y = Actual), color = "red") +
geom_line(aes(y = DRO), color = "orange") +
geom_point(aes(y = DRO), color = "orange") +
geom_line(aes(y = Preds), color = "darkgreen") +
geom_point(aes(y = Preds), color = "darkgreen") +
ggtitle("Actual (red) v DRO (orange) v NNAR (green)")