Data 624 Project 1
library(knitr)
library(rmdformats)
## Global options
options(max.print="85")
opts_chunk$set(cache=TRUE,
prompt=FALSE,
tidy=TRUE,
comment=NA,
message=FALSE,
warning=FALSE)
opts_knit$set(width=31)
library(dplyr)
library(lubridate)
library(ggplot2)
library(psych)
library(forecast)
library(urca)       # ur.kpss()
library(xts)
library(tidyverse)
library(tsibble)
library(tsbox)      # ts_df()
library(fpp3)
# package startup and masking messages are suppressed (message = FALSE in the setup chunk)
Part A: Forecasting ATM Withdrawals
We are given a dataset of daily cash withdrawals (in hundreds of dollars) from May 1, 2009 to April 30, 2010. The goal of this part is to forecast the cash withdrawn at each of the four ATMs in May 2010.
# setwd('./Data624/Week 9/Proj1/Data 624 Project 1/')
atm <- readxl::read_excel("ATM624Data.xlsx", col_names = TRUE, col_types = c("date",
"text", "numeric")) %>% transmute(date = as_date(DATE), atm = ATM, cash = Cash) %>%
drop_na() %>% arrange(date, atm)
# atm %>% gather(atm, cash, -date)
atm_spread <- atm %>% spread(atm, cash)
atm_spread
# A tibble: 365 x 5
date ATM1 ATM2 ATM3 ATM4
<date> <dbl> <dbl> <dbl> <dbl>
1 2009-05-01 96 107 0 777.
2 2009-05-02 82 89 0 524.
3 2009-05-03 85 90 0 793.
4 2009-05-04 90 55 0 908.
5 2009-05-05 99 79 0 52.8
6 2009-05-06 88 19 0 52.2
7 2009-05-07 8 2 0 55.5
8 2009-05-08 104 103 0 559.
9 2009-05-09 87 107 0 904.
10 2009-05-10 93 118 0 879.
# … with 355 more rows
Each ATM series is converted by first grouping by atm (giving a grouped tibble) and then applying ts() to the cash column with a frequency of 7 (one week), producing a list of time series objects called atm_ts; a sketch of this conversion is shown below.
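A minimal sketch of that conversion, assuming the raw long-format atm tibble (the same group_map()/ts() call is applied again to the cleaned data in the seasonality section below; the name atm_ts_sketch is mine):

# Sketch: split the long-format withdrawals by ATM and build one weekly ts per ATM.
# group_map() returns a list of ts objects; frequency = 7 encodes the weekly pattern.
atm_ts_sketch <- atm %>%
group_by(atm) %>%
group_map(~ts(.x$cash, frequency = 7))
str(atm_ts_sketch, max.level = 1)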
# 1 row 4 columns
par(mfrow = c(1, 4))
# sprintf -- %s for Character string -- %d for integer value
for (i in 1:4) {
boxplot(atm_spread[i + 1], main = sprintf("%s", names(atm_spread)[i + 1]), col = "violet",
fill = "purple", xlab = sprintf("# of outliers = %d", length(boxplot(atm_spread[i +
1], plot = F)$out)))
}
min <- as.Date("2009-05-01")
max <- as.Date("2010-04-30")
atm %>% ggplot(aes(x = date, y = cash, col = atm)) + geom_line(show.legend = FALSE) +
facet_wrap(~atm, ncol = 1, scales = "free_y") + labs(title = "Cash Withdrawals by ATM",
subtitle = "May 2009 - April 2010", x = "Date(MON YY)") + scale_x_date(date_breaks = "1 month",
limits = c(min, max), date_labels = "%b %y") + scale_y_continuous("Cash in '000s",
labels = scales::dollar_format(scale = 0.1, suffix = "K"))

describe(atm_spread)[-1, ] %>% select(-vars) %>% kable(digits = 2L, caption = "Descriptive Statistics of ATM dataset",
table.attr = "style='width:22%;'") %>% kableExtra::kable_styling(full_width = F)

| | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ATM1 | 362 | 83.89 | 36.66 | 91.00 | 86.65 | 25.20 | 1.00 | 180.00 | 179.0 | -0.71 | 0.19 | 1.93 |
| ATM2 | 363 | 62.58 | 38.90 | 67.00 | 62.24 | 48.93 | 0.00 | 147.00 | 147.0 | -0.03 | -1.09 | 2.04 |
| ATM3 | 365 | 0.72 | 7.94 | 0.00 | 0.00 | 0.00 | 0.00 | 96.00 | 96.0 | 10.93 | 118.38 | 0.42 |
| ATM4 | 365 | 474.04 | 650.94 | 403.84 | 416.68 | 425.80 | 1.56 | 10919.76 | 10918.2 | 11.39 | 178.91 | 34.07 |
At this point we know there are 3 missing values in the ATM1 series and 2 in the ATM2 series, and the boxplots show outliers in ATM1, ATM3, and ATM4. We therefore use tsclean() from the forecast package, which applies supsmu smoothing to non-seasonal series and a robust STL decomposition to seasonal series, to interpolate missing values and replace outliers.
# lambda = 'auto' would not work for outlier replacement
atm.ts = ts(atm_spread %>% select(-date))
atm.ts1 <- atm_spread %>% as_tsibble()
for (i in 1:4) {
atm.ts1[, i + 1] = tsclean(atm.ts[, i])
}
atm.df <- ts_df(atm.ts1)
# atm.ts1 %>% mutate(date = format(date, '%Y-%M-%D'), .drop = TRUE )
atm.df %>% tidyr::spread(id, value) %>% describe() %>% filter(vars != 1) %>% select(-vars) %>%
kable(digits = 2L, caption = "Descriptive Statistics of ATM dataset", table.attr = "style='width:22%;'") %>%
kableExtra::kable_styling(full_width = F)

| | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ATM1 | 365 | 84.15 | 36.64 | 91.00 | 86.93 | 25.20 | 1.00 | 180.00 | 179.00 | -0.72 | 0.20 | 1.92 |
| ATM2 | 365 | 62.59 | 38.81 | 67.00 | 62.26 | 48.93 | 0.00 | 147.00 | 147.00 | -0.03 | -1.09 | 2.03 |
| ATM3 | 365 | 0.72 | 7.94 | 0.00 | 0.00 | 0.00 | 0.00 | 96.00 | 96.00 | 10.93 | 118.38 | 0.42 |
| ATM4 | 365 | 444.76 | 351.08 | 402.77 | 414.35 | 424.21 | 1.56 | 1712.07 | 1710.51 | 0.66 | -0.10 | 18.38 |
'data.frame': 1460 obs. of 3 variables:
$ id : Factor w/ 4 levels "ATM1","ATM2",..: 1 1 1 1 1 1 1 1 1 1 ...
$ date : Date, format: "2009-05-01" "2009-05-02" ...
$ value: Time-Series from 1 to 365: 96 82 85 90 99 88 8 104 87 93 ...
Note that n = 365 for every ATM series now: all missing values have been interpolated and the outliers replaced. Notably, the obvious outlier in the ATM4 series at position 285 (2010-02-09) has been replaced.
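To double-check that claim, one can compare the raw and cleaned matrices element by element; a hedged sketch (the helper objects below are mine, not part of the original workflow):

# Sketch: list the rows where tsclean() changed a value (interpolated NA or replaced outlier).
raw_mat <- as.matrix(atm_spread[, -1])
clean_mat <- sapply(1:4, function(i) as.numeric(tsclean(ts(raw_mat[, i]))))
changed <- which((is.na(raw_mat) & !is.na(clean_mat)) | abs(raw_mat - clean_mat) > 1e-8,
arr.ind = TRUE)
data.frame(date = atm_spread$date[changed[, "row"]], atm = colnames(raw_mat)[changed[, "col"]])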
# ts_df (atm.ts)
atm.df %>% ggplot(aes(x = date, y = cash, col = atm)) + geom_line(show.legend = FALSE) +
facet_wrap(~atm, ncol = 1, scales = "free_y") + labs(title = "Cash Withdrawals by ATM",
subtitle = "May 2009 - April 2010", x = "Date(MON YY)") + scale_x_date(date_breaks = "1 month",
limits = c(min, max), date_labels = "%b %y") + scale_y_continuous("Cash in '000s",
labels = scales::dollar_format(scale = 0.1, suffix = "K"))

tsibble [365 × 5] (S3: tbl_ts/tbl_df/tbl/data.frame)
$ date: Date[1:365], format: "2009-05-01" "2009-05-02" ...
$ ATM1: Time-Series [1:365] from 1 to 365: 96 82 85 90 99 88 8 104 87 93 ...
$ ATM2: Time-Series [1:365] from 1 to 365: 107 89 90 55 79 19 2 103 107 118 ...
$ ATM3: Time-Series [1:365] from 1 to 365: 0 0 0 0 0 0 0 0 0 0 ...
$ ATM4: Time-Series [1:365] from 1 to 365: 777 524.4 792.8 908.2 52.8 ...
- attr(*, "key")= tibble [1 × 1] (S3: tbl_df/tbl/data.frame)
..$ .rows: list<int> [1:1]
.. ..$ : int [1:365] 1 2 3 4 5 6 7 8 9 10 ...
.. ..@ ptype: int(0)
- attr(*, "index")= chr "date"
..- attr(*, "ordered")= logi TRUE
- attr(*, "index2")= chr "date"
- attr(*, "interval")= interval [1:1] 1D
..@ .regular: logi TRUE
Examine seasonality:
# gg_subseries function (Seasonal subseries plots) is still WIP; going back to
# the original ts objects for subseries plotting
atm_ts <- atm.df %>% group_by(atm) %>% group_map(~ts(.x$cash, freq = 7))
str(atm_ts)

List of 4
$ : Time-Series [1:365] from 1 to 53: 96 82 85 90 99 88 8 104 87 93 ...
$ : Time-Series [1:365] from 1 to 53: 107 89 90 55 79 19 2 103 107 118 ...
$ : Time-Series [1:365] from 1 to 53: 0 0 0 0 0 0 0 0 0 0 ...
$ : Time-Series [1:365] from 1 to 53: 777 524.4 792.8 908.2 52.8 ...
Seasonal plots from gg_season may not be the best way to visualize these individual series; for ATM1 and ATM2 they give only a blurred indication that weekly seasonality exists. Next, let's look at the medians shown in the seasonal subseries plots.
Subseries Plots
# 2 row 2 columns
par(mfrow = c(2, 2))
for (i in 1:4) {
print(ggsubseriesplot(atm_ts[[i]], "html") + labs(title = paste("Subseries Plot: ATM",
i), y = "Cash in hundreds", x = "Day"))
}
# Note: par(mfrow) does not apply to ggplot objects, so the subseries plots
# print individually rather than within the 2 x 2 grid.

It's not hard to imagine weekly seasonality in the cash levels at these ATMs. The seasonal subseries plots show clear dips in the median cash withdrawn on Saturdays for ATM1, ATM2, and ATM4. For ATM2 the median level is also relatively low on Fridays, in addition to the Saturday dip seen at the other ATMs.
Time Series Analysis
So far we have examined the time series and made the adjustments needed to minimize forecast error, which we move on to next. Good data preparation not only tells us the frequency of the seasonality, it also puts us in the best position to build unbiased time series models with minimal error, i.e. the highest accuracy.
ATM1
Subseries Plots (click to jump to that section)
The subseries plot for ATM1 shows that cash withdrawals peak on Tuesday and trough on Saturday.
The ATM1 series is apparently non-stationary: the ACF lags do not decay quickly below the blue dashed significance bounds, and there is clear weekly seasonality. The Box-Cox transformation makes a noticeable difference to the ATM1 series, so we keep it.
boxcox_transform <- function(x) {
lambda <- BoxCox.lambda(x)
BoxCox(x, lambda)
}
atm_bc <- lapply(atm_ts, boxcox_transform)
# Saving lambdas from all 4 ATM ts into lambdas
lambdas <- c(0.2615708, 0.7242585, 1.332968, 0.449771)
autoplot(BoxCox(atm_ts[[1]], lambdas[1])) + labs(title = sprintf("Transformed: %s",
"ATM1 Cash Withdrawals"), subtitle = sprintf("lambda = %0.2f", lambdas[1]), y = "Box-Cox Transformed",
x = "Week")atm_ts[[1]] %>% stl(s.window = 7, robust = TRUE) %>% autoplot() + labs(title = "Seasonal & Trend Decomposition for ATM1 Time Series",
x = "Week")The seasonal component in STL (Seasonal Decomposition of Time Series by Loess) decomposition did illustrate that the variation decreases and then increases, therefore we are not going to use a BoxCox transformation lambda in this method. On top of that, the grey rectangular bars did signify that variations was contributed largely in party by the trend component, more so than seasonal and remainder. From the ts itself, I do conclude that we’re going to treat this ts as additive.
For ETS (the exponential smoothing state space model), we do use the Box-Cox transformation, since it reduces the AICc considerably; a sketch of that comparison follows, and the resulting model summary is shown after it.
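The call that produced the summary below appears in its Call: line; a hedged sketch of that fit together with an untransformed fit for the AICc comparison (the object names are mine):

# Sketch: ETS on ATM1 with and without the Box-Cox transformation; compare AICc.
# Note the AICc values are on different scales, but this mirrors the comparison described above.
ets_raw <- ets(atm_ts[[1]])                          # no transformation
ets_bc  <- atm_ts[[1]] %>% ets(lambda = lambdas[1])  # Box-Cox, as in the summary below
c(AICc_raw = ets_raw$aicc, AICc_boxcox = ets_bc$aicc)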
ETS(A,N,A)
Call:
ets(y = ., lambda = lambdas[1])
Box-Cox transformation: lambda= 0.2616
Smoothing parameters:
alpha = 1e-04
gamma = 0.3513
Initial states:
l = 7.9717
s = -4.5094 0.5635 1.0854 0.5711 0.9551 0.5582
0.7761
sigma: 1.343
AIC AICc BIC
2379.653 2380.275 2418.652
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 2.161635 24.92166 15.86675 -92.24919 110.7242 0.8965114 0.1467074
[1] 0
[1] 1
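The two unlabeled lines above ([1] 0 and [1] 1) are presumably the numbers of ordinary and seasonal differences suggested by ndiffs() and nsdiffs(); a hedged sketch of those calls (whether the transformed or raw series was used is an assumption):

# Sketch: how many ordinary / seasonal differences the Box-Cox transformed ATM1 series needs.
ndiffs(BoxCox(atm_ts[[1]], lambdas[1]))    # expected: 0
nsdiffs(BoxCox(atm_ts[[1]], lambdas[1]))   # expected: 1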
atm_ts1.bc <- BoxCox(atm_ts[[1]], lambdas[1])
atm_ts1.bc %>% diff(lag = frequency(atm_ts[[1]])) %>% ur.kpss() %>% summary()
#######################
# KPSS Unit Root Test #
#######################
Test is of type: mu with 5 lags.
Value of test-statistic is: 0.0153
Critical value for a significance level of:
10pct 5pct 2.5pct 1pct
critical values 0.347 0.463 0.574 0.739
We need one seasonal difference and no ordinary difference.
The KPSS test statistic (0.0153) is below all critical values, so we fail to reject the null hypothesis of stationarity: the seasonally differenced ATM1 series is stationary. From the ACF and PACF I propose a manual ARIMA(p,d,q)(P,D,Q)[m] order of p = 2, d = 0, q = 2, P = 1, D = 1, Q = 1 with m = 7.
ggtsdisplay(diff(atm_ts1.bc, lag = frequency(atm_ts[[1]])), main = "ATM1 Cash Withdrawals",
ylab = "Cash in hundreds", xlab = "Week")auto.arima1 <- atm_ts[[1]] %>% auto.arima(lambda = lambdas[1], biasadj = TRUE)
# Arima(atm2, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm2_lambda)
manual.arima1 <- Arima(atm_ts[[1]], order = c(2, 0, 2), seasonal = c(1, 1, 1), lambda = lambdas[1])order of auto.arima:
p d q P D Q Frequency
0 0 2 0 1 1 7
AICc of auto.arima is: 1228.091
AICc of manual arima (2,0,2)(1,1,1)[7] is: 1231.359
auto.arima() does better on the corrected AIC, so we will use the auto.arima model for ATM1; the AICc values above can be read straight off the fitted objects, as sketched below.
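A minimal sketch of that extraction (not shown in the original output):

# Sketch: compare corrected AIC of the automatic and manual ARIMA fits for ATM1.
c(auto = auto.arima1$aicc, manual = manual.arima1$aicc)
arimaorder(auto.arima1)    # (p d q P D Q m) of the selected model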
ATM2
Subseries Plots (click to jump to that section)
The subseries plot for ATM2 shows that cash withdrawals peak on Sunday, with a secondary peak on Thursday, and trough on Saturday. Interestingly, Fridays are the second-lowest day of the week, as if this ATM sits in a neighborhood where the weekend effectively starts on Thursday. The ACF plot shows that this series is not stationary, as the spikes do not decay quickly, and the PACF likewise suggests that seasonal differencing is needed. We will confirm that with nsdiffs() below.
ggtsdisplay(atm_ts[[2]], main = "ATM2 Cash Withdrawals", ylab = "Cash Withdrawal (in hundreds)",
xlab = "Week")autoplot(BoxCox(atm_ts[[2]], lambdas[2])) + labs(title = sprintf("Transformed: %s",
"ATM2 Cash Withdrawals"), subtitle = sprintf("lambda = %0.2f", lambdas[2]), y = "Box-Cox Transformed",
x = "Week")We will take the BoxCox transformed series as it smoothed out the variations.
atm_ts[[2]] %>% stl(s.window = 7, robust = TRUE) %>% autoplot() + labs(title = "Seasonal & Trend Decomposition for ATM2 Time Series",
x = "Week")Note that trend contributed the most to the variations just by looking at the grey bars above.
Exponential Smoothing
Note that we have some very high AICc.
ETS(A,N,A)
Call:
ets(y = ., lambda = lambdas[2])
Box-Cox transformation: lambda= 0.7243
Smoothing parameters:
alpha = 1e-04
gamma = 0.3852
Initial states:
l = 26.7907
s = -17.8373 -13.3183 10.8218 1.8349 4.281 5.7998
8.4181
sigma: 8.5054
AIC AICc BIC
3727.060 3727.682 3766.059
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.7151812 25.35716 17.79237 -Inf Inf 0.8559082 0.02254807
Next, we need to check and see if there are needs for differencing to reduce changes in the level of the time series.
[1] 1
[1] 1
atm_ts2.bc <- BoxCox(atm_ts[[2]], lambdas[2])
atm_ts2.bc %>% diff(lag = frequency(atm_ts[[2]])) %>% ur.kpss() %>% summary()
#######################
# KPSS Unit Root Test #
#######################
Test is of type: mu with 5 lags.
Value of test-statistic is: 0.0162
Critical value for a significance level of:
10pct 5pct 2.5pct 1pct
critical values 0.347 0.463 0.574 0.739
ndiffs() and nsdiffs() suggest one ordinary and one seasonal difference. In practice, however, a single seasonal difference is sufficient to make the series stationary, as the KPSS test statistic of 0.0162 falls well outside the critical region.
From the ACF and PACF I propose a manual ARIMA(p,d,q)(P,D,Q)[m] order of p = 1, d = 0, q = 1, P = 1, D = 1, Q = 0 with m = 7.
ggtsdisplay(diff(atm_ts2.bc, lag = frequency(atm_ts[[2]])), main = "ATM2 Cash Withdrawals",
ylab = "Cash in hundreds", xlab = "Week")auto.arima2 <- atm_ts[[2]] %>% auto.arima(lambda = lambdas[2], biasadj = TRUE)
# Arima(atm2, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm2_lambda)
manual.arima2 <- Arima(atm_ts[[2]], order = c(1, 0, 1), seasonal = c(1, 1, 0), lambda = lambdas[2])order of auto.arima:
p d q P D Q Frequency
3 0 3 0 1 1 7
AICc of auto.arima is: 2539.692
AICc of manual arima (1,0,1)(1,1,0)[7] is: 2578.072
auto.arima() wins out again for ATM2 with a lower corrected AIC.
The selected model is ARIMA(3,0,3)(0,1,1)[7] with drift.
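Before adopting that model, a residual check along these lines would confirm the residuals behave like white noise (a sketch; these diagnostics are not shown in the original output):

# Sketch: residual plots, ACF of residuals, and Ljung-Box test for the chosen ATM2 model.
checkresiduals(auto.arima2)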
ATM3
Given that ATM3 has very few non-zero data points, we should use a naive method to estimate its May 2010 withdrawals. ATM4 is clearly not a good candidate for the averaging, given its much larger scale; ATM1 and ATM2 are a better fit.
Approach: Expected Value
\(E[{ATM3}_{t+1}] = \frac{1}{n}\sum_{i=1}^{n} {ATM}_i\), with \(n = 2\) (i.e., the average of ATM1 and ATM2)
To illustrate the logic here, we essentially can look at the total withdrawal amount on the following dates where ATM3 has data.
| Time Series | 2010-04-28 | 2010-04-29 | 2010-04-30 | Average Amount |
|---|---|---|---|---|
| ATM1 | $96 | $82 | $85 | $87.67 |
| ATM2 | $107 | $89 | $90 | $95.33 |
| ATM3 | $96 | $82 | $85 | $87.67 |
Note that the average amounts for ATM1 and ATM3 are exactly the same. So, to give ATM3 enough reserves, I take the average of ATM1 and ATM2, as sketched below.
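A minimal sketch of that rule, assuming Cash1 and Cash2 hold the 31-day May 2010 point forecasts produced for ATM1 and ATM2 later in this report (Cash3 is my name for the result):

# Sketch: forecast ATM3 as the simple average of the ATM1 and ATM2 point forecasts.
Cash3 <- (Cash1 + Cash2) / 2
# Worked with the three daily averages quoted above: (87.67 + 95.33) / 2 = 91.5 (hundreds)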
ATM4
Subseries Plots (click to jump to that section)
The subseries plot for ATM4 shows that withdrawals peak on Sunday and trough on Saturday. Apart from the seasonal lags, few ACF spikes fall outside the blue dashed bounds, though there are two instances where they do. The seasonality visible in the ACF indicates the series is non-stationary.
ggtsdisplay(atm_ts[[4]], main = "ATM4 Cash Withdrawals", ylab = "Cash Withdrawal (in hundreds)",
xlab = "Week")autoplot(BoxCox(atm_ts[[4]], lambdas[4])) + labs(title = sprintf("Transformed: %s",
"ATM4 Cash Withdrawals"), subtitle = sprintf("lambda = %0.2f", lambdas[4]), y = "Box-Cox Transformed",
x = "Week")Reduced variations make me feel that I should keep the BoxCox transformation and lambda.
atm_ts[[4]] %>% stl(s.window = 7, robust = TRUE) %>% autoplot() + labs(title = "Seasonal & Trend Decomposition for ATM4 Time Series",
x = "Week")A majority of the variation is attributed to the trend. That’s what the grey bars are telling us in this STL decomposition.
Exponential smoothing always remains an option.
ETS(A,N,A)
Call:
ets(y = ., lambda = lambdas[4])
Box-Cox transformation: lambda= 0.4498
Smoothing parameters:
alpha = 1e-04
gamma = 0.1035
Initial states:
l = 28.6369
s = -18.6502 -3.3529 1.6831 4.7437 5.4471 4.9022
5.2271
sigma: 12.9202
AIC AICc BIC
4032.268 4032.890 4071.267
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 66.43723 337.9663 259.7696 -377.1127 420.802 0.7498771 0.09916026
We again need the Box-Cox transformation and its lambda to reach a relatively low AICc with ETS.
[1] 0
[1] 0
#######################
# KPSS Unit Root Test #
#######################
Test is of type: mu with 5 lags.
Value of test-statistic is: 0.0792
Critical value for a significance level of:
10pct 5pct 2.5pct 1pct
critical values 0.347 0.463 0.574 0.739
Good news: ndiffs() and nsdiffs() report that no differencing is strictly required, and the KPSS test statistic above (0.0792) is below all critical values, so we fail to reject the null hypothesis of stationarity. The ACF and PACF nevertheless show a strong weekly pattern, so let's apply one seasonal difference and re-run the test.
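The code for this step is not echoed in the output; a sketch that parallels the ATM1 and ATM2 steps and defines atm_ts4.bc, which the plotting call below relies on:

# Sketch: Box-Cox transform ATM4, apply one seasonal (lag-7) difference, re-run KPSS.
atm_ts4.bc <- BoxCox(atm_ts[[4]], lambdas[4])
atm_ts4.bc %>% diff(lag = frequency(atm_ts[[4]])) %>% ur.kpss() %>% summary()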
#######################
# KPSS Unit Root Test #
#######################
Test is of type: mu with 5 lags.
Value of test-statistic is: 0.0192
Critical value for a significance level of:
10pct 5pct 2.5pct 1pct
critical values 0.347 0.463 0.574 0.739
With one seasonal difference, the ATM4 series remains stationary (test statistic 0.0192).
From the ACF and PACF I propose a manual ARIMA(p,d,q)(P,D,Q)[m] order of p = 2, d = 0, q = 1, P = 1, D = 1, Q = 1 with m = 7.
ggtsdisplay(diff(atm_ts4.bc, lag = frequency(atm_ts[[4]])), main = "ATM4 Cash Withdrawals",
ylab = "Cash in hundreds", xlab = "Week")auto.arima4 <- atm_ts[[4]] %>% auto.arima(lambda = lambdas[4], biasadj = TRUE)
# Arima(atm2, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm2_lambda)
manual.arima4 <- Arima(atm_ts[[4]], order = c(2, 0, 1), seasonal = c(1, 1, 1), lambda = lambdas[4])order of auto.arima:
p d q P D Q Frequency
0 0 1 2 0 0 7
AICc of auto.arima is: 2931.303
AICc of manual arima (2,0,1)(1,1,1)[7] is: 2864.292
The manual model wins out, so for ATM4 we use it rather than auto.arima.
With all of the proposed models in hand, let's compile a summary of how each candidate performs on each ATM series.
ts_accuracy = function(i, week, rmse) {
# Split train and test
train = window(atm_ts[[i]], end = c(week, 3))
test = window(atm_ts[[i]], start = c(week, 4))
# Forecast Models
if (i == 1) {
arima.model = train %>% auto.arima(lambda = lambdas[1], biasadj = TRUE)
}
if (i == 2) {
arima.model = train %>% Arima(order = c(3, 0, 3), seasonal = c(0, 1, 1),
include.drift = TRUE, lambda = lambdas[2], biasadj = TRUE)
}
if (i == 4) {
arima.model = train %>% Arima(order = c(2, 0, 1), seasonal = c(1, 1, 1),
include.mean = FALSE, lambda = lambdas[4], biasadj = TRUE)
}
# Forecast
stl.fc = stlf(train, h = length(test), s.window = 7, robust = TRUE, lambda = lambdas[i])
ets.fc = forecast(ets(seasadj(decompose(train, "additive")), model = "ZZZ", lambda = lambdas[i],
biasadj = T, additive.only = T, allow.multiplicative.trend = F), h = length(test))
arima.fc = forecast(arima.model, h = length(test))
# Performance statistics
accuracy = data.frame(RMSE = cbind(accuracy(stl.fc$mean, test)[, 2], accuracy(ets.fc$mean,
test)[, 2], accuracy(arima.fc$mean, test)[, 2]))
row.names(accuracy) = c(sprintf("ATM #%d", i))
names(accuracy) = c("STL", "ETS", "ARIMA")
# Returns RMSE
if (rmse) {
accuracy
}
}

results = data.frame()
for (i in c(1, 2, 4)) {
results = rbind(results, ts_accuracy(i, 44, rmse = TRUE))
}
results

             STL       ETS     ARIMA
ATM #1 47.91159 37.07566 37.00498
ATM #2 60.93905 39.73803 46.91697
ATM #4 334.01518 291.74128 328.69325
Final Results
ATM #1 : ARIMA(0,0,2)(0,1,1)[7] with Box-Cox Transformation of 0.26.
ATM #2 : ETS with additive trend
ATM #3 : \(E[{ATM3}_{t+1}] = \frac{1}{2}\sum_{i=1}^{2} {ATM}_i\) (the average of the ATM1 and ATM2 forecasts)
ATM #4 : ETS with additive trend
DATE = seq(as.Date("2010-05-01"), as.Date("2010-05-31"), 1)
# periods = seq(as.Date('2009-05-01'), as.Date('2010-04-30'),1)
ATM = c(rep("ATM1", 31), rep("ATM2", 31), rep("ATM3", 31), rep("ATM4", 31))
autoplot_final <- function(ts, atm.no, method, is_auto_arima) {
train <- ts
if (atm.no == 1) {
arima.model = train %>% auto.arima(lambda = lambdas[1], biasadj = TRUE)
}
if (atm.no == 2) {
arima.model = train %>% Arima(order = c(3, 0, 3), seasonal = c(0, 1, 1),
include.drift = TRUE, lambda = lambdas[2], biasadj = TRUE)
}
if (atm.no == 4) {
arima.model = train %>% Arima(order = c(2, 0, 1), seasonal = c(1, 1, 1),
include.mean = FALSE, lambda = lambdas[4], biasadj = TRUE)
}
stl.fc = stlf(train, h = 31, s.window = 7, robust = TRUE, lambda = lambdas[atm.no])
ets.fc = forecast(ets(seasadj(decompose(train, "additive")), model = "ZZZ", lambda = lambdas[atm.no],
biasadj = T, additive.only = T, allow.multiplicative.trend = F), h = 31)
arima.fc = forecast(arima.model, h = 31)
p1 <- autoplot(train, series = "train") + autolayer(stl.fc$mean, series = "STL Forecast") +
autolayer(ets.fc$mean, series = "ETS Forcast") + autolayer(arima.fc$mean,
series = "Seasonal Arima Forecast") + scale_y_continuous("Amount of Cash Withdrawals",
labels = scales::dollar_format(scale = 0.1, suffix = "K")) + guides(colour = guide_legend(title = "Models")) +
theme(legend.position = "top") + labs(title = sprintf("ATM%0.0f Cash Withdrawal Forecast",
atm.no), subtitle = "1 May, 2009 to 31 May, 2010", x = "Days")
{
gridExtra::grid.arrange(p1, ncol = 1)
}
# if is_auto_arima is TRUE, print the order selected by auto.arima (before returning)
if (is_auto_arima) {
auto.fit <- atm_ts[[atm.no]] %>% auto.arima(lambda = lambdas[atm.no], biasadj = TRUE)
cat("order of auto.arima: \n")
print(arimaorder(auto.fit))
}
# selectively return the forecast values of the chosen method
if (method == "arima") {
return(arima.fc$mean)
} else if (method == "ets") {
return(ets.fc$mean)
} else {
return(stl.fc$mean)
}
}
Cash1 <- autoplot_final(atm_ts[[1]], 1, "arima", TRUE)

Part B: Forecasting Residential Power Load
This part uses a simple dataset of residential power usage from January 1998 through December 2013. The task is to model these data and produce a monthly forecast for 2014.
power <- readxl::read_excel("./ResidentialCustomerForecastLoad-624.xlsx", col_names = TRUE) %>%
rename(id = CaseSequence, dt = `YYYY-MMM`, kwh = KWH) %>% mutate(dt = as_date(paste(dt,
"01", sep = "-"), format = "%Y-%b-%d", tz = "UTC"))
power

# A tibble: 192 x 3
id dt kwh
<dbl> <date> <dbl>
1 733 1998-01-01 6862583
2 734 1998-02-01 5838198
3 735 1998-03-01 5420658
4 736 1998-04-01 5010364
5 737 1998-05-01 4665377
6 738 1998-06-01 6467147
7 739 1998-07-01 8914755
8 740 1998-08-01 8607428
9 741 1998-09-01 6989888
10 742 1998-10-01 6345620
# … with 182 more rows
power.ts = ts(power$kwh, start = c(1998, 1), frequency = 12)
autoplot(power.ts, main = "Residential Power Consumption")

Before data cleaning, let's see how many missing records we have.
psych::describe(power.ts) %>% select(-vars) %>% kable(digits = 2L, caption = "Descriptive Statistics of Residential Power Consumption dataset",
table.attr = "style='width:22%;'") %>% kableExtra::kable_styling(full_width = F)| n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| X1 | 191 | 6502475 | 1447571 | 6283324 | 6439475 | 1543074 | 770523 | 10655730 | 9885207 | 0.17 | 0.45 | 104742.6 |
# sprintf
boxplot(power$kwh, main = sprintf("%s", "Residential Power Consumption"), col = "violet",
fill = "purple", xlab = sprintf("# of outliers = %d", length(boxplot(power, plot = F)$out)))# head(power) tail(power)
min <- as.Date("1998-01-01")
max <- as.Date("2013-12-01")
# cex.lab=.8 to decrease the fonts for x axis labels
power %>% ggplot(aes(x = dt, y = kwh)) + geom_line(color = "violet") + labs(subtitle = "Residential Power Consumption Time Plot",
x = "Date(MON YYYY)", y = "Power Used (kWh)", cex.lab = 0.8) + scale_x_date(date_breaks = "1 year",
limits = c(min, max), date_labels = "%b %y")power.ts = tsclean(power.ts)
psych::describe(power.ts)[, -1] %>% kable(digits = 2L, caption = "Descriptive Statistics of Power Usage",
table.attr = "style='width:22%;'") %>% kableExtra::kable_styling(full_width = F)| n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| X1 | 192 | 6529701 | 1369032 | 6351262 | 6455590 | 1587856 | 4313019 | 10655730 | 6342711 | 0.47 | -0.48 | 98801.41 |
The power consumption time series clearly has seasonality: consumption peaks in August and January, with troughs around May and November. This is easy to explain, since electricity use rises naturally when extra heating is needed in the winter months and air conditioning in the summer.
The subseries plot shows the monthly medians even more clearly; a sketch of that plot follows.
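The subseries plot itself is not reproduced here; a hedged sketch of how it can be drawn with the same helper used for the ATM series:

# Sketch: monthly subseries plot of residential power use (medians shown per month).
ggsubseriesplot(power.ts) + labs(title = "Subseries Plot: Residential Power Consumption",
y = "Power Used (kWh)", x = "Month")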
lambda <- BoxCox.lambda(power.ts)
power.bc <- BoxCox(power.ts, lambda = lambda)
ggtsdisplay(power.bc, main = "Residential Power Consumption", ylab = "Power Used(kWh)",
xlab = "Month")The BoxCox transformation did really smoothed out the variance. Now it looks more like a constant variance.
[1] 1
[1] 1
#######################
# KPSS Unit Root Test #
#######################
Test is of type: mu with 4 lags.
Value of test-statistic is: 0.0218
Critical value for a significance level of:
10pct 5pct 2.5pct 1pct
critical values 0.347 0.463 0.574 0.739
The KPSS unit root test shows that an ordinary first difference is sufficient to reach stationarity: the test statistic (0.0218) is not in the critical region, so we fail to reject the null hypothesis of stationarity.
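The call behind the KPSS output above is not echoed; assuming it tests the first-differenced, Box-Cox transformed series as the text states, a sketch:

# Sketch: KPSS test on the first difference of the Box-Cox transformed power series.
power.bc %>% diff() %>% ur.kpss() %>% summary()    # expected statistic ~0.02, below all critical values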
Modeling
Let's examine the ACF and PACF plots of the Box-Cox transformed series after one ordinary and one seasonal difference. Since more than 5% of the ACF spikes fall outside the dashed blue bounds, this transformed and differenced series is not white noise, so there is ARMA structure left to model.
gridExtra::grid.arrange(ncol = 2, ggAcf(diff(diff(power.bc, lag = 12))), ggPacf(diff(diff(power.bc,
lag = 12))))

From the ACF and PACF I propose a manual ARIMA(p,d,q)(P,D,Q)[m] order of p = 2, d = 1, q = 1, P = 1, D = 1, Q = 1 with m = 12.
manual.arima.pwr <- Arima(power.ts, order = c(2, 1, 1), seasonal = c(1, 1, 1), lambda = lambda)
manual.arima.pwr %>% summary

Series: power.ts
ARIMA(2,1,1)(1,1,1)[12]
Box Cox transformation: lambda= -0.1442665
Coefficients:
ar1 ar2 ma1 sar1 sma1
0.2272 0.0006 -0.9502 -0.1052 -0.6824
s.e. 0.0817 0.0789 0.0327 0.1182 0.1010
sigma^2 estimated as 8.912e-05: log likelihood=579.98
AIC=-1147.95 AICc=-1147.47 BIC=-1128.83
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set 32516.63 588772.5 437553.9 0.03413681 6.612284 0.7336461
ACF1
Training set 0.04711048
auto.arima.pwr <- auto.arima(power.ts, d = 1, lambda = lambda, approximation = FALSE)
auto.arima.pwr %>% summary

Series: power.ts
ARIMA(1,1,2)(0,1,1)[12]
Box Cox transformation: lambda= -0.1442665
Coefficients:
ar1 ma1 ma2 sma1
-0.4446 -0.2494 -0.6452 -0.7360
s.e. 0.1807 0.1520 0.1321 0.0629
sigma^2 estimated as 8.777e-05: log likelihood=580.92
AIC=-1151.83 AICc=-1151.48 BIC=-1135.89
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 33549.31 585381.9 436711.6 0.05510506 6.59288 0.7322338 0.07434323
I fit an ARIMA(2,1,1)(1,1,1)[12] model, while the algorithm selected ARIMA(1,1,2)(0,1,1)[12]. That is, the manual model has one more non-seasonal autoregressive term and one fewer MA term, and its seasonal component has one more autoregressive term. The manual model has AICc = -1147.5 versus the automated model's AICc = -1151.5, and RMSE = 588772.5 versus 585381.9.
Judging both criteria together, I keep the auto.arima model, as it has the lower AICc and the lower RMSE; the Ljung-Box result below indicates its residuals are consistent with white noise.
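The Ljung-Box output below most likely comes from a residual check of the retained model; a sketch of that call:

# Sketch: residual diagnostics for the retained ARIMA(1,1,2)(0,1,1)[12] model.
# checkresiduals() plots the residuals and their ACF and prints the Ljung-Box test shown below.
checkresiduals(auto.arima.pwr)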
Ljung-Box test
data: Residuals from ARIMA(1,1,2)(0,1,1)[12]
Q* = 15.697, df = 20, p-value = 0.7352
Model df: 4. Total lags used: 24
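Finally, the 2014 monthly forecast itself is not printed in this section; a hedged sketch of how it would be generated and exported (the output file and column names are hypothetical):

# Sketch: 12-month (2014) forecast from the retained model, exported to CSV.
power.fc <- forecast(auto.arima.pwr, h = 12)
out <- data.frame(Month = format(seq(as.Date("2014-01-01"), by = "month", length.out = 12), "%Y-%b"),
                  KWH = as.numeric(power.fc$mean))
write.csv(out, "ResidentialForecast2014.csv", row.names = FALSE)   # hypothetical file name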
Part C [Bonus]: Forecasting Water Pipe Flow
To be completed Oct 26, 2020. Will send separately.