Data 624 Project 1

library(knitr)
library(rmdformats)

## Global options
options(max.print="85")
opts_chunk$set(cache=TRUE,
               prompt=FALSE,
               tidy=TRUE,
               comment=NA,
               message=FALSE,
               warning=FALSE)
opts_knit$set(width=31)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(tidyr)
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(xts) #extensible time-series
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(tsbox)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.3     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ✓ purrr   0.3.3
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x psych::%+%()             masks ggplot2::%+%()
## x psych::alpha()           masks ggplot2::alpha()
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x xts::first()             masks dplyr::first()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x xts::last()              masks dplyr::last()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
library(tsibble)
## 
## Attaching package: 'tsibble'
## The following object is masked from 'package:zoo':
## 
##     index
## The following objects are masked from 'package:lubridate':
## 
##     interval, new_interval
library(fpp3)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────── fpp3 0.3 ──
## ✓ tsibbledata 0.2.0     ✓ fable       0.2.1
## ✓ feasts      0.1.5
## ── Conflicts ────────────────────────────────────────────────────────────────────────── fpp3_conflicts ──
## x psych::%+%()            masks ggplot2::%+%()
## x psych::alpha()          masks ggplot2::alpha()
## x lubridate::date()       masks base::date()
## x dplyr::filter()         masks stats::filter()
## x xts::first()            masks dplyr::first()
## x fabletools::forecast()  masks forecast::forecast()
## x tsibble::index()        masks zoo::index()
## x tsibble::interval()     masks lubridate::interval()
## x dplyr::lag()            masks stats::lag()
## x xts::last()             masks dplyr::last()
## x tsibble::new_interval() masks lubridate::new_interval()
library(urca)
library(ggseas)
library(zoo)

Part A: Forecasting ATM Withdrawals

We are given a dataset of cash withdrawals (in hundreds of dollars) from May 1, 2009 to April 30, 2010. To provide value with this forecast, I am going to predict the cash withdrawn at each of the 4 ATMs in May 2010.

# setwd('./Data624/Week 9/Proj1/Data 624 Project 1/')
atm <- readxl::read_excel("ATM624Data.xlsx", col_names = TRUE, col_types = c("date", 
    "text", "numeric")) %>% transmute(date = as_date(DATE), atm = ATM, cash = Cash) %>% 
    drop_na() %>% arrange(date, atm)

# atm %>% gather(atm, cash, -date)

atm_spread <- atm %>% spread(atm, cash)
atm_spread
# A tibble: 365 x 5
   date        ATM1  ATM2  ATM3  ATM4
   <date>     <dbl> <dbl> <dbl> <dbl>
 1 2009-05-01    96   107     0 777. 
 2 2009-05-02    82    89     0 524. 
 3 2009-05-03    85    90     0 793. 
 4 2009-05-04    90    55     0 908. 
 5 2009-05-05    99    79     0  52.8
 6 2009-05-06    88    19     0  52.2
 7 2009-05-07     8     2     0  55.5
 8 2009-05-08   104   103     0 559. 
 9 2009-05-09    87   107     0 904. 
10 2009-05-10    93   118     0 879. 
# … with 355 more rows

We convert each ATM series by first grouping by atm (yielding a grouped tibble), then applying ts() to the cash column with a frequency of 7 (daily data with weekly seasonality), producing a list of time series objects called atm_ts (constructed below, after some exploratory plots).

# 1 row 4 columns
par(mfrow = c(1, 4))

# sprintf -- %s for character string -- %d for integer value
for (i in 1:4) {
    n_out <- length(boxplot(atm_spread[i + 1], plot = FALSE)$out)
    boxplot(atm_spread[i + 1], main = sprintf("%s", names(atm_spread)[i + 1]), col = "violet", 
        xlab = sprintf("# of outliers = %d", n_out))
}

min <- as.Date("2009-05-01")
max <- as.Date("2010-04-30")
atm %>% ggplot(aes(x = date, y = cash, col = atm)) + geom_line(show.legend = FALSE) + 
    facet_wrap(~atm, ncol = 1, scales = "free_y") + labs(title = "Cash Withdrawals by ATM", 
    subtitle = "May 2009 - April 2010", x = "Date(MON YY)") + scale_x_date(date_breaks = "1 month", 
    limits = c(min, max), date_labels = "%b %y") + scale_y_continuous("Cash in '000s", 
    labels = scales::dollar_format(scale = 0.1, suffix = "K"))

describe(atm_spread)[-1, ] %>% select(-vars) %>% kable(digits = 2L, caption = "Descriptive Statistics of ATM dataset", 
    table.attr = "style='width:22%;'") %>% kableExtra::kable_styling(full_width = F)
Descriptive Statistics of ATM dataset
n mean sd median trimmed mad min max range skew kurtosis se
ATM1 362 83.89 36.66 91.00 86.65 25.20 1.00 180.00 179.0 -0.71 0.19 1.93
ATM2 363 62.58 38.90 67.00 62.24 48.93 0.00 147.00 147.0 -0.03 -1.09 2.04
ATM3 365 0.72 7.94 0.00 0.00 0.00 0.00 96.00 96.0 10.93 118.38 0.42
ATM4 365 474.04 650.94 403.84 416.68 425.80 1.56 10919.76 10918.2 11.39 178.91 34.07

At this point, we know there are 3 missing values in the ATM1 series and 2 in the ATM2 series. We also see outliers in ATM1, ATM3, and ATM4. So we are going to use tsclean(), a powerful function in the forecast package that uses supsmu() for non-seasonal series and a robust STL decomposition for seasonal series, to estimate missing values and replace outliers.

# lambda = 'auto' would not work for outlier replacement
atm.ts = ts(atm_spread %>% select(-date))
atm.ts1 <- atm_spread %>% as_tsibble()
for (i in 1:4) {
    atm.ts1[, i + 1] = tsclean(atm.ts[, i])
}
atm.df <- ts_df(atm.ts1)
# atm.ts1 %>% mutate(date = format(date, '%Y-%M-%D'), .drop = TRUE )
atm.df %>% tidyr::spread(id, value) %>% describe() %>% filter(vars != 1) %>% select(-vars) %>% 
    kable(digits = 2L, caption = "Descriptive Statistics of ATM dataset", table.attr = "style='width:22%;'") %>% 
    kableExtra::kable_styling(full_width = F)
Descriptive Statistics of ATM dataset
n mean sd median trimmed mad min max range skew kurtosis se
ATM1 365 84.15 36.64 91.00 86.93 25.20 1.00 180.00 179.00 -0.72 0.20 1.92
ATM2 365 62.59 38.81 67.00 62.26 48.93 0.00 147.00 147.00 -0.03 -1.09 2.03
ATM3 365 0.72 7.94 0.00 0.00 0.00 0.00 96.00 96.00 10.93 118.38 0.42
ATM4 365 444.76 351.08 402.77 414.35 424.21 1.56 1712.07 1710.51 0.66 -0.10 18.38
str(atm.df)
'data.frame':   1460 obs. of  3 variables:
 $ id   : Factor w/ 4 levels "ATM1","ATM2",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ date : Date, format: "2009-05-01" "2009-05-02" ...
 $ value: Time-Series  from 1 to 365: 96 82 85 90 99 88 8 104 87 93 ...
names(atm.df) <- c("atm", "date", "cash")

Now, please note that n = 365 for every ATM[i] series, 1 <= i <= 4: all missing values have been interpolated and all outliers replaced. Notably, the obvious outlier at position 285 (2010-02-09) in the ATM4 series has been replaced.
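As a sanity check (a minimal sketch, assuming atm_spread and the cleaned atm.ts1 are still in scope), we can compare the raw and cleaned values at that position:

# Compare the raw ATM4 value at position 285 with its tsclean() replacement
atm_spread$date[285]  # 2010-02-09
atm_spread$ATM4[285]  # raw value: the 10,919.76 outlier
atm.ts1$ATM4[285]     # cleaned value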

# ts_df (atm.ts)

atm.df %>% ggplot(aes(x = date, y = cash, col = atm)) + geom_line(show.legend = FALSE) + 
    facet_wrap(~atm, ncol = 1, scales = "free_y") + labs(title = "Cash Withdrawals by ATM", 
    subtitle = "May 2009 - April 2010", x = "Date(MON YY)") + scale_x_date(date_breaks = "1 month", 
    limits = c(min, max), date_labels = "%b %y") + scale_y_continuous("Cash in '000s", 
    labels = scales::dollar_format(scale = 0.1, suffix = "K"))

str(atm.ts1)
tsibble [365 × 5] (S3: tbl_ts/tbl_df/tbl/data.frame)
 $ date: Date[1:365], format: "2009-05-01" "2009-05-02" ...
 $ ATM1: Time-Series [1:365] from 1 to 365: 96 82 85 90 99 88 8 104 87 93 ...
 $ ATM2: Time-Series [1:365] from 1 to 365: 107 89 90 55 79 19 2 103 107 118 ...
 $ ATM3: Time-Series [1:365] from 1 to 365: 0 0 0 0 0 0 0 0 0 0 ...
 $ ATM4: Time-Series [1:365] from 1 to 365: 777 524.4 792.8 908.2 52.8 ...
 - attr(*, "key")= tibble [1 × 1] (S3: tbl_df/tbl/data.frame)
  ..$ .rows: list<int> [1:1] 
  .. ..$ : int [1:365] 1 2 3 4 5 6 7 8 9 10 ...
  .. ..@ ptype: int(0) 
 - attr(*, "index")= chr "date"
  ..- attr(*, "ordered")= logi TRUE
 - attr(*, "index2")= chr "date"
 - attr(*, "interval")= interval [1:1] 1D
  ..@ .regular: logi TRUE

Examine seasonality:

gg_season(atm.ts1, y = ATM1, period = 7, labels = c("none"))

gg_season(atm.ts1, y = ATM2, period = 7, labels = c("none"))

gg_season(atm.ts1, y = ATM3, period = 7, labels = c("none"))

gg_season(atm.ts1, y = ATM4, period = 7, labels = c("none"))

# gg_subseries function (Seasonal subseries plots) is still WIP; going back to
# the original ts objects for subseries plotting

atm_ts <- atm.df %>% group_by(atm) %>% group_map(~ts(.x$cash, freq = 7))


str(atm_ts)
List of 4
 $ : Time-Series [1:365] from 1 to 53: 96 82 85 90 99 88 8 104 87 93 ...
 $ : Time-Series [1:365] from 1 to 53: 107 89 90 55 79 19 2 103 107 118 ...
 $ : Time-Series [1:365] from 1 to 53: 0 0 0 0 0 0 0 0 0 0 ...
 $ : Time-Series [1:365] from 1 to 53: 777 524.4 792.8 908.2 52.8 ...

Seasonal plots from gg_season may not be the best way to visualize these individual series, although the blurred ATM1 and ATM2 panels do confirm that weekly seasonality exists. Next, let's look at the medians shown in the seasonal subseries plots.

Subseries Plots

# par() has no effect on ggplot objects, so arrange the four subseries plots in a 2 x 2 grid
plots <- lapply(1:4, function(i) {
    ggsubseriesplot(atm_ts[[i]]) + labs(title = paste("Subseries Plot: ATM", i), 
        y = "Cash in hundreds", x = "Day")
})
gridExtra::grid.arrange(grobs = plots, ncol = 2)

It's not hard to imagine weekly seasonality in the cash levels at these ATMs. As evidenced by the seasonal subseries plots, the median cash withdrawn dips on Saturdays for ATM1, ATM2, and ATM4. For ATM2, the median cash level is also relatively low on Fridays, in addition to the Saturday dip seen at the other ATMs.

Time Series Analysis

Up until now, we have examined the time series and made the adjustments needed to minimize forecast errors, which we move on to next. Good data preparation not only tells us the frequency of the seasonality but also puts us in the best position to build unbiased time series models with minimal error, i.e., the highest accuracy.

ATM1

ggtsdisplay(atm_ts[[1]], main = "ATM1 Cash Withdrawals", ylab = "Cash(in hundreds)", 
    xlab = "Week")

Subseries Plots (click to jump to that section)

The subseries plot for ATM1 shows that cash withdrawals peak on Tuesday and trough on Saturday.

The ATM1 series is apparently non-stationary: the ACF lags do not die out quickly below the blue dotted line, and there is clear seasonality. The Box-Cox transformation made a noticeable difference to the ATM1 series, so we are going to keep it.

boxcox_transform <- function(x) {
    lambda <- BoxCox.lambda(x)
    BoxCox(x, lambda)
}

atm_bc <- lapply(atm_ts, boxcox_transform)

# Saving lambdas from all 4 ATM ts into lambdas
lambdas <- c(0.2615708, 0.7242585, 1.332968, 0.449771)
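# Equivalently, without hard-coding: lambdas <- sapply(atm_ts, BoxCox.lambda)
# should reproduce the values above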

autoplot(BoxCox(atm_ts[[1]], lambdas[1])) + labs(title = sprintf("Transformed: %s", 
    "ATM1 Cash Withdrawals"), subtitle = sprintf("lambda = %0.2f", lambdas[1]), y = "Box-Cox Transformed", 
    x = "Week")

atm_ts[[1]] %>% stl(s.window = 7, robust = TRUE) %>% autoplot() + labs(title = "Seasonal & Trend Decomposition for ATM1 Time Series", 
    x = "Week")

The seasonal component in the STL (Seasonal and Trend decomposition using Loess) decomposition illustrates variation that decreases and then increases, so we are not going to use a Box-Cox transformation lambda in this method. On top of that, the grey rectangular bars signify that the variation is contributed largely in part by the trend component, more so than the seasonal and remainder components. From the series itself, I conclude that we should treat this series as additive.

As for ETS (the exponential smoothing state space model), we are going to use the Box-Cox transformation, as it substantially reduces the AICc.
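To see the effect, a quick side-by-side (a sketch; note that AICc values computed with and without a Box-Cox transformation are on different likelihood scales, so the comparison is indicative only):

# AICc of ETS for ATM1, with and without the Box-Cox transformation
fit_raw <- ets(atm_ts[[1]])
fit_bc <- ets(atm_ts[[1]], lambda = lambdas[1])
c(raw = fit_raw$aicc, boxcox = fit_bc$aicc)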

atm_ts[[1]] %>% ets(lambda = lambdas[1]) %>% summary
ETS(A,N,A) 

Call:
 ets(y = ., lambda = lambdas[1]) 

  Box-Cox transformation: lambda= 0.2616 

  Smoothing parameters:
    alpha = 1e-04 
    gamma = 0.3513 

  Initial states:
    l = 7.9717 
    s = -4.5094 0.5635 1.0854 0.5711 0.9551 0.5582
           0.7761

  sigma:  1.343

     AIC     AICc      BIC 
2379.653 2380.275 2418.652 

Training set error measures:
                   ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
Training set 2.161635 24.92166 15.86675 -92.24919 110.7242 0.8965114 0.1467074
# Number of differences required for a stationary series
ndiffs(atm_ts[[1]])
[1] 0
# Number of differences required for a seasonally stationary series
nsdiffs(atm_ts[[1]])
[1] 1
atm_ts1.bc <- BoxCox(atm_ts[[1]], lambdas[1])
atm_ts1.bc %>% diff(lag = frequency(atm_ts[[1]])) %>% ur.kpss() %>% summary()

####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0153 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

We need one seasonal difference and no ordinary differences.

The KPSS unit root test statistic (0.0153) is well below all critical values, so we fail to reject the null hypothesis of stationarity: the seasonally differenced ATM1 series is stationary. From the ACF and PACF, I propose a manual ARIMA(2,0,2)(1,1,1)[7] model.

ggtsdisplay(diff(atm_ts1.bc, lag = frequency(atm_ts[[1]])), main = "ATM1 Cash Withdrawals", 
    ylab = "Cash in hundreds", xlab = "Week")

auto.arima1 <- atm_ts[[1]] %>% auto.arima(lambda = lambdas[1], biasadj = TRUE)


# Arima(atm2, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm2_lambda)
manual.arima1 <- Arima(atm_ts[[1]], order = c(2, 0, 2), seasonal = c(1, 1, 1), lambda = lambdas[1])
cat("order of auto.arima: \n")
order of auto.arima: 
print(arimaorder(auto.arima1))
        p         d         q         P         D         Q Frequency 
        0         0         2         0         1         1         7 
cat("AICc of auto.arima is: ", auto.arima1$aicc, "\n")
AICc of auto.arima is:  1228.091 
cat("AICc of manual arima (2,0,2)(1,1,1)[7] is: ", manual.arima1$aicc)
AICc of manual arima (2,0,2)(1,1,1)[7] is:  1231.359

auto.arima does better on the corrected AIC (1228.1 vs. 1231.4), so we are going to use it for ATM1.

ATM2

Subseries Plots (click to jump to that section)

The subseries plot for ATM2 shows that cash withdrawals peak on Sunday, with a secondary peak on Thursday, and trough on Saturday. Interestingly, Fridays are the second-lowest day of the week at this ATM, as if it is located in a neighborhood where weekends start early on Thursday. The ACF plot shows that this series is not stationary, as the spikes do not decrease quickly. The PACF plot also suggests the series needs seasonal differencing; we will confirm that with nsdiffs() later.

ggtsdisplay(atm_ts[[2]], main = "ATM2 Cash Withdrawals", ylab = "Cash Withdrawal (in hundreds)", 
    xlab = "Week")

autoplot(BoxCox(atm_ts[[2]], lambdas[2])) + labs(title = sprintf("Transformed: %s", 
    "ATM2 Cash Withdrawals"), subtitle = sprintf("lambda = %0.2f", lambdas[2]), y = "Box-Cox Transformed", 
    x = "Week")

We will keep the Box-Cox transformed series, as it smooths out the variation.

atm_ts[[2]] %>% stl(s.window = 7, robust = TRUE) %>% autoplot() + labs(title = "Seasonal & Trend Decomposition for ATM2 Time Series", 
    x = "Week")

Note that, judging by the grey bars above, the trend component contributes the most to the variation.

Exponential Smoothing

Note that we have some very high AICc.

atm_ts[[2]] %>% ets(lambda = lambdas[2]) %>% summary
ETS(A,N,A) 

Call:
 ets(y = ., lambda = lambdas[2]) 

  Box-Cox transformation: lambda= 0.7243 

  Smoothing parameters:
    alpha = 1e-04 
    gamma = 0.3852 

  Initial states:
    l = 26.7907 
    s = -17.8373 -13.3183 10.8218 1.8349 4.281 5.7998
           8.4181

  sigma:  8.5054

     AIC     AICc      BIC 
3727.060 3727.682 3766.059 

Training set error measures:
                    ME     RMSE      MAE  MPE MAPE      MASE       ACF1
Training set 0.7151812 25.35716 17.79237 -Inf  Inf 0.8559082 0.02254807

Next, we check whether differencing is needed to stabilize the level of the time series.

# Number of differences required for a stationary series
ndiffs(atm_ts[[2]])
[1] 1
# Number of differences required for a seasonally stationary series
nsdiffs(atm_ts[[2]])
[1] 1
atm_ts2.bc <- BoxCox(atm_ts[[2]], lambdas[2])

atm_ts2.bc %>% diff(lag = frequency(atm_ts[[2]])) %>% ur.kpss() %>% summary()

####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0162 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

ndiffs() and nsdiffs() tell us we need one ordinary difference and one seasonal difference. In practice, however, one seasonal difference was sufficient to make the data stationary, as indicated by the test statistic of 0.0162, which is not in any critical region.

From the ACF and PACF, I propose a manual ARIMA(1,0,1)(1,1,0)[7] model.

ggtsdisplay(diff(atm_ts2.bc, lag = frequency(atm_ts[[2]])), main = "ATM2 Cash Withdrawals", 
    ylab = "Cash in hundreds", xlab = "Week")

auto.arima2 <- atm_ts[[2]] %>% auto.arima(lambda = lambdas[2], biasadj = TRUE)


# Arima(atm2, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm2_lambda)
manual.arima2 <- Arima(atm_ts[[2]], order = c(1, 0, 1), seasonal = c(1, 1, 0), lambda = lambdas[2])
cat("order of auto.arima: \n")
order of auto.arima: 
print(arimaorder(auto.arima2))
        p         d         q         P         D         Q Frequency 
        3         0         3         0         1         1         7 
cat("AICc of auto.arima is: ", auto.arima2$aicc, "\n")
AICc of auto.arima is:  2539.692 
cat("AICc of manual arima (1,0,1)(1,1,0)[7] is: ", manual.arima2$aicc)
AICc of manual arima (1,0,1)(1,1,0)[7] is:  2578.072

Auto.arima wins out again for ATM2 with a lower corrected AIC.

The selected model for ATM2: ARIMA(3,0,3)(0,1,1)[7] with drift.

ATM3

Given that ATM3 has very few data points (only three non-zero observations, all at the end of the sample), a naive method is the sensible way to estimate its May 2010 withdrawals. ATM4 is not a good candidate to average against, since its withdrawals are on a much larger scale; ATM1 and ATM2 are better candidates.

Approach: Expected Value

\(E[\mathrm{ATM3}_{t+1}] = \frac{1}{n}\sum_{i=1}^{n}\mathrm{ATM}_{i,\,t+1}, \quad n = 2\)

To illustrate the logic, we can look at the withdrawal amounts on the dates where ATM3 actually has data.

illustration of cash withdrawals for ATM1, ATM2, & ATM3
Time Series 2010-04-28 2010-04-29 2010-04-30 Average Amount
ATM1 $96 $82 $85 $87.67
ATM2 $107 $89 $90 $95.33
ATM3 $96 $82 $85 $87.67

Note that the average amount of ATM1 and ATM3 is exactly the same. So, in order to give ATM3 enough reserves, I'd take the average of ATM1 & ATM2.
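A quick way to pull those dates (a sketch against the raw atm_spread, i.e., before tsclean()):

# ATM3 is zero on every day except its final three
atm_spread %>% filter(ATM3 > 0)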

ATM4

Subseries Plots (click to jump to that section)

The subseries plot for ATM4 shows that withdrawals peak on Sunday and trough on Saturday. While most non-seasonal lags in the ACF stay within the blue dotted bounds, there are two spikes outside the dotted region. The seasonality visible in the ACF indicates the series is non-stationary.

ggtsdisplay(atm_ts[[4]], main = "ATM4 Cash Withdrawals", ylab = "Cash Withdrawal (in hundreds)", 
    xlab = "Week")

autoplot(BoxCox(atm_ts[[4]], lambdas[4])) + labs(title = sprintf("Transformed: %s", 
    "ATM4 Cash Withdrawals"), subtitle = sprintf("lambda = %0.2f", lambdas[4]), y = "Box-Cox Transformed", 
    x = "Week")

The reduced variation suggests we should keep the Box-Cox transformation and its lambda.

atm_ts[[4]] %>% stl(s.window = 7, robust = TRUE) %>% autoplot() + labs(title = "Seasonal & Trend Decomposition for ATM4 Time Series", 
    x = "Week")

A majority of the variation is attributed to the trend. That’s what the grey bars are telling us in this STL decomposition.

Exponential smoothing always remains an option.

atm_ts[[4]] %>% ets(lambda = lambdas[4]) %>% summary
ETS(A,N,A) 

Call:
 ets(y = ., lambda = lambdas[4]) 

  Box-Cox transformation: lambda= 0.4498 

  Smoothing parameters:
    alpha = 1e-04 
    gamma = 0.1035 

  Initial states:
    l = 28.6369 
    s = -18.6502 -3.3529 1.6831 4.7437 5.4471 4.9022
           5.2271

  sigma:  12.9202

     AIC     AICc      BIC 
4032.268 4032.890 4071.267 

Training set error measures:
                   ME     RMSE      MAE       MPE    MAPE      MASE       ACF1
Training set 66.43723 337.9663 259.7696 -377.1127 420.802 0.7498771 0.09916026
# atm_ts[[4]] %>% ets() %>% summary

We keep the lambda and the Box-Cox transformation to obtain a relatively lower AICc with ETS.

# Number of differences required for a stationary series
ndiffs(atm_ts[[4]])
[1] 0
# Number of differences required for a seasonally stationary series
nsdiffs(atm_ts[[4]])
[1] 0
atm_ts4.bc <- BoxCox(atm_ts[[4]], lambdas[4])

atm_ts4.bc %>% ur.kpss() %>% summary()

####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0792 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

Good news: both ndiffs() and nsdiffs() return 0, suggesting no differencing is required. Let's confirm that with the KPSS unit root test. The test statistic (0.0792) is below all critical values, so we fail to reject the null hypothesis of stationarity. Nevertheless, the ACF and PACF plots show strong weekly seasonality, so let's apply one seasonal difference and re-check.

atm_ts4.bc %>% diff(lag = frequency(atm_ts[[4]])) %>% ur.kpss() %>% summary()

####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0192 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

Okay. With one seasonal difference, the ATM4 series remains stationary (test statistic 0.0192, below all critical values).

From the ACF and PACF, I propose a manual ARIMA(2,0,1)(1,1,1)[7] model.

ggtsdisplay(diff(atm_ts4.bc, lag = frequency(atm_ts[[4]])), main = "ATM4 Cash Withdrawals", 
    ylab = "Cash in hundreds", xlab = "Week")

auto.arima4 <- atm_ts[[4]] %>% auto.arima(lambda = lambdas[4], biasadj = TRUE)


# Arima(atm2, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm2_lambda)
manual.arima4 <- Arima(atm_ts[[4]], order = c(2, 0, 1), seasonal = c(1, 1, 1), lambda = lambdas[4])
cat("order of auto.arima: \n")
order of auto.arima: 
print(arimaorder(auto.arima4))
        p         d         q         P         D         Q Frequency 
        0         0         1         2         0         0         7 
cat("AICc of auto.arima is: ", auto.arima4$aicc, "\n")
AICc of auto.arima is:  2931.303 
cat("AICc of manual arima (2,0,1)(1,1,1)[7] is: ", manual.arima4$aicc)
AICc of manual arima (2,0,1)(1,1,1)[7] is:  2864.292

The manual model wins out (AICc 2864.3 vs. 2931.3), so for ATM4 we use it instead of auto.arima.

With all the proposed models in hand, let's compile a summary of how each model compares for every ATM series, using a train/test split and out-of-sample RMSE.

ts_accuracy = function(i, week, rmse) {
    # Split train and test
    train = window(atm_ts[[i]], end = c(week, 3))
    test = window(atm_ts[[i]], start = c(week, 4))
    # Forecast Models
    if (i == 1) {
        arima.model = train %>% auto.arima(lambda = lambdas[1], biasadj = TRUE)
    }
    if (i == 2) {
        arima.model = train %>% Arima(order = c(3, 0, 3), seasonal = c(0, 1, 1), 
            include.drift = TRUE, lambda = lambdas[2], biasadj = TRUE)
    }
    if (i == 4) {
        arima.model = train %>% Arima(order = c(2, 0, 1), seasonal = c(1, 1, 1), 
            include.mean = FALSE, lambda = lambdas[4], biasadj = TRUE)
    }
    
    # Forecast
    stl.fc = stlf(train, h = length(test), s.window = 7, robust = TRUE, lambda = lambdas[i])
    ets.fc = forecast(ets(seasadj(decompose(train, "additive")), model = "ZZZ", lambda = lambdas[i], 
        biasadj = T, additive.only = T, allow.multiplicative.trend = F), h = length(test))
    arima.fc = forecast(arima.model, h = length(test))
    # Performance statistics
    accuracy = data.frame(RMSE = cbind(accuracy(stl.fc$mean, test)[, 2], accuracy(ets.fc$mean, 
        test)[, 2], accuracy(arima.fc$mean, test)[, 2]))
    row.names(accuracy) = c(sprintf("ATM #%d", i))
    names(accuracy) = c("STL", "ETS", "ARIMA")
    # Returns RMSE
    if (rmse) {
        accuracy
    }
}
results = data.frame()
for (i in c(1, 2, 4)) {
    results = rbind(results, ts_accuracy(i, 44, rmse = TRUE))
}
results
             STL       ETS     ARIMA
ATM #1  47.91159  37.07566  37.00498
ATM #2  60.93905  39.73803  46.91697
ATM #4 334.01518 291.74128 328.69325

Final Results

  • ATM #1 : ARIMA(0,0,2)(0,1,1)[7] with Box-Cox Transformation of 0.26.

  • ATM #2 : ETS with additive trend

  • ATM #3 : \(E[\mathrm{ATM3}_{t+1}] = \frac{1}{n}\sum_{i=1}^{n}\mathrm{ATM}_{i,\,t+1}, \quad n = 2\) (the average of the ATM1 and ATM2 forecasts)

  • ATM #4 : ETS with additive trend

DATE = seq(as.Date("2010-05-01"), as.Date("2010-05-31"), 1)
# periods = seq(as.Date('2009-05-01'), as.Date('2010-04-30'),1)
ATM = c(rep("ATM1", 31), rep("ATM2", 31), rep("ATM3", 31), rep("ATM4", 31))


autoplot_final <- function(ts, atm.no, method, is_auto_arima) {
    train <- ts
    # Use atm.no throughout (not the leftover global loop variable i) to pick the model spec
    if (atm.no == 1) {
        arima.model = train %>% auto.arima(lambda = lambdas[1], biasadj = TRUE)
    }
    if (atm.no == 2) {
        arima.model = train %>% Arima(order = c(3, 0, 3), seasonal = c(0, 1, 1), 
            include.drift = TRUE, lambda = lambdas[2], biasadj = TRUE)
    }
    if (atm.no == 4) {
        arima.model = train %>% Arima(order = c(2, 0, 1), seasonal = c(1, 1, 1), 
            include.mean = FALSE, lambda = lambdas[4], biasadj = TRUE)
    }
    
    stl.fc = stlf(train, h = 31, s.window = 7, robust = TRUE, lambda = lambdas[atm.no])
    ets.fc = forecast(ets(seasadj(decompose(train, "additive")), model = "ZZZ", lambda = lambdas[atm.no], 
        biasadj = T, additive.only = T, allow.multiplicative.trend = F), h = 31)
    arima.fc = forecast(arima.model, h = 31)
    
    p1 <- autoplot(train, series = "train") + autolayer(stl.fc$mean, series = "STL Forecast") + 
        autolayer(ets.fc$mean, series = "ETS Forecast") + autolayer(arima.fc$mean, 
        series = "Seasonal Arima Forecast") + scale_y_continuous("Amount of Cash Withdrawals", 
        labels = scales::dollar_format(scale = 0.1, suffix = "K")) + guides(colour = guide_legend(title = "Models")) + 
        theme(legend.position = "top") + labs(title = sprintf("ATM%0.0f Cash Withdrawal Forecast", 
        atm.no), subtitle = "1 May, 2009 to 31 May, 2010", x = "Days")
    
    gridExtra::grid.arrange(p1, ncol = 1)
    
    # If is_auto_arima is TRUE, print the order of auto.arima (must run before returning)
    if (is_auto_arima) {
        auto.arima <- atm_ts[[atm.no]] %>% auto.arima(lambda = lambdas[atm.no], biasadj = TRUE)
        cat("order of auto.arima: \n")
        print(arimaorder(auto.arima))
    }
    
    # Selectively return the forecast values of the chosen method
    if (method == "arima") {
        return(arima.fc$mean)
    } else if (method == "ets") {
        return(ets.fc$mean)
    } else {
        return(stl.fc$mean)
    }
}

Cash1 <- autoplot_final(atm_ts[[1]], 1, "arima", TRUE)

Cash2 <- autoplot_final(atm_ts[[2]], 2, "ets", FALSE)

Cash3 <- (Cash1 + Cash2)/2
Cash4 <- autoplot_final(atm_ts[[4]], 4, "ets", FALSE)

Cash = c(Cash1, Cash2, Cash3, Cash4)
# xlsx::write.xlsx(data.frame(DATE, ATM, Cash), file = 'ATM624Data.xlsx',
# sheetName='ATM_Forecast(May2010)', append=TRUE)

Part B: Forecasting Residential Power Load

A simple dataset of residential power usage from January 1998 through December 2013. The assignment is to model these data and produce a monthly forecast for 2014.

power <- readxl::read_excel("./ResidentialCustomerForecastLoad-624.xlsx", col_names = TRUE) %>% 
    rename(id = CaseSequence, dt = `YYYY-MMM`, kwh = KWH) %>% mutate(dt = as_date(paste(dt, 
    "01", sep = "-"), format = "%Y-%b-%d", tz = "UTC"))
power
# A tibble: 192 x 3
      id dt             kwh
   <dbl> <date>       <dbl>
 1   733 1998-01-01 6862583
 2   734 1998-02-01 5838198
 3   735 1998-03-01 5420658
 4   736 1998-04-01 5010364
 5   737 1998-05-01 4665377
 6   738 1998-06-01 6467147
 7   739 1998-07-01 8914755
 8   740 1998-08-01 8607428
 9   741 1998-09-01 6989888
10   742 1998-10-01 6345620
# … with 182 more rows
power.ts = ts(power$kwh, start = c(1998, 1), frequency = 12)
autoplot(power.ts, main = "Residential Power Consumption")

Before cleaning the data, let's see how many missing records we have.
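A direct count (a one-line check on the power tibble read above):

# 192 monthly rows; describe() below reports n = 191 for kwh, i.e., one missing value
sum(is.na(power$kwh))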

psych::describe(power.ts) %>% select(-vars) %>% kable(digits = 2L, caption = "Descriptive Statistics of Residential Power Consumption dataset", 
    table.attr = "style='width:22%;'") %>% kableExtra::kable_styling(full_width = F)
Descriptive Statistics of Residential Power Consumption dataset
n mean sd median trimmed mad min max range skew kurtosis se
X1 191 6502475 1447571 6283324 6439475 1543074 770523 10655730 9885207 0.17 0.45 104742.6
# sprintf
boxplot(power$kwh, main = "Residential Power Consumption", col = "violet", 
    xlab = sprintf("# of outliers = %d", length(boxplot(power$kwh, plot = FALSE)$out)))

# head(power) tail(power)
min <- as.Date("1998-01-01")
max <- as.Date("2013-12-01")

# cex.lab=.8 to decrease the fonts for x axis labels
power %>% ggplot(aes(x = dt, y = kwh)) + geom_line(color = "violet") + labs(subtitle = "Residential Power Consumption Time Plot", 
    x = "Date(MON YYYY)", y = "Power Used (kWh)", cex.lab = 0.8) + scale_x_date(date_breaks = "1 year", 
    limits = c(min, max), date_labels = "%b %y")

power.ts = tsclean(power.ts)
psych::describe(power.ts)[, -1] %>% kable(digits = 2L, caption = "Descriptive Statistics of Power Usage", 
    table.attr = "style='width:22%;'") %>% kableExtra::kable_styling(full_width = F)
Descriptive Statistics of Power Usage
n mean sd median trimmed mad min max range skew kurtosis se
X1 192 6529701 1369032 6351262 6455590 1587856 4313019 10655730 6342711 0.47 -0.48 98801.41
gridExtra::grid.arrange(ncol = 2, ggseasonplot(power.ts), ggsubseriesplot(power.ts))

The power consumption time series clearly has seasonality: consumption peaks in August and January and troughs around May and November. This is not hard to explain, as electricity consumption naturally rises when extra heating is needed during the winter months and air conditioning in the summer.

The subseries plot shows the monthly medians even more clearly.

lambda <- BoxCox.lambda(power.ts)
power.bc <- BoxCox(power.ts, lambda = lambda)
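# lambda is ~ -0.1443 here, matching the Box-Cox lambda reported in the ARIMA summaries below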

ggtsdisplay(power.bc, main = "Residential Power Consumption", ylab = "Power Used(kWh)", 
    xlab = "Month")

gridExtra::grid.arrange(ncol = 2, ggseasonplot(power.bc), ggsubseriesplot(power.bc))

The Box-Cox transformation really did smooth out the variance; it now looks roughly constant.

# Number of differences required for a stationary series
ndiffs(power.bc)
[1] 1
# Number of differences required for a seasonally stationary series
nsdiffs(power.bc)
[1] 1
power.bc %>% diff %>% ur.kpss %>% summary

####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 4 lags. 

Value of test-statistic is: 0.0218 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

The KPSS unit root test shows that an ordinary first-order difference was sufficient to make the data stationary: the test statistic (0.0218) is not in the critical region, so we fail to reject the null hypothesis of stationarity.

Modeling

Let's examine the ACF and PACF plots of the first-differenced, Box-Cox transformed series. Since more than 5% of the ACF spikes fall outside the dotted blue line, we suspect this transformed and differenced series is not white noise.

gridExtra::grid.arrange(ncol = 2, ggAcf(diff(power.bc)), ggPacf(diff(power.bc)))

gridExtra::grid.arrange(ncol = 2, ggAcf(diff(diff(power.bc, lag = 12))), ggPacf(diff(diff(power.bc, 
    lag = 12))))
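As a formal complement to the visual check (a sketch using Box.test() from base stats; output not shown), a small p-value here would support the reading that the differenced series is not white noise:

# Ljung-Box test on the first-differenced, Box-Cox transformed series
Box.test(diff(power.bc), lag = 24, type = "Ljung-Box")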

From the ACF and PACF, I propose a manual ARIMA(2,1,1)(1,1,1)[12] model (m = 12 for monthly data).

manual.arima.pwr <- Arima(power.ts, order = c(2, 1, 1), seasonal = c(1, 1, 1), lambda = lambda)

manual.arima.pwr %>% summary
Series: power.ts 
ARIMA(2,1,1)(1,1,1)[12] 
Box Cox transformation: lambda= -0.1442665 

Coefficients:
         ar1     ar2      ma1     sar1     sma1
      0.2272  0.0006  -0.9502  -0.1052  -0.6824
s.e.  0.0817  0.0789   0.0327   0.1182   0.1010

sigma^2 estimated as 8.912e-05:  log likelihood=579.98
AIC=-1147.95   AICc=-1147.47   BIC=-1128.83

Training set error measures:
                   ME     RMSE      MAE        MPE     MAPE      MASE
Training set 32516.63 588772.5 437553.9 0.03413681 6.612284 0.7336461
                   ACF1
Training set 0.04711048
auto.arima.pwr <- auto.arima(power.ts, d = 1, lambda = lambda, approximation = FALSE)

auto.arima.pwr %>% summary
Series: power.ts 
ARIMA(1,1,2)(0,1,1)[12] 
Box Cox transformation: lambda= -0.1442665 

Coefficients:
          ar1      ma1      ma2     sma1
      -0.4446  -0.2494  -0.6452  -0.7360
s.e.   0.1807   0.1520   0.1321   0.0629

sigma^2 estimated as 8.777e-05:  log likelihood=580.92
AIC=-1151.83   AICc=-1151.48   BIC=-1135.89

Training set error measures:
                   ME     RMSE      MAE        MPE    MAPE      MASE       ACF1
Training set 33549.31 585381.9 436711.6 0.05510506 6.59288 0.7322338 0.07434323

I fit an ARIMA(2,1,1)(1,1,1)[12] model, while the algorithm selected an ARIMA(1,1,2)(0,1,1)[12] model. That is, the manual model has one more autoregressive term and one fewer MA term; on the seasonal component, it has one more seasonal AR term. The manual model has AICc = -1147.5 versus the automated model's AICc = -1151.5, and RMSE = 588772.5 versus the automated model's RMSE = 585381.9, so the automated model is slightly better on both.

Judging comprehensively, I will keep the auto.arima model, as it has both the lower AICc and the lower RMSE.
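For reference, a compact side-by-side of the two fitted candidates (a sketch using the model objects above; accuracy() on a fitted model returns its training-set error measures):

# AICc and in-sample RMSE for the manual and automated models
data.frame(model = c("ARIMA(2,1,1)(1,1,1)[12]", "ARIMA(1,1,2)(0,1,1)[12]"), 
    AICc = c(manual.arima.pwr$aicc, auto.arima.pwr$aicc), 
    RMSE = c(accuracy(manual.arima.pwr)[, "RMSE"], accuracy(auto.arima.pwr)[, "RMSE"]))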

checkresiduals(auto.arima.pwr)


    Ljung-Box test

data:  Residuals from ARIMA(1,1,2)(0,1,1)[12]
Q* = 15.697, df = 20, p-value = 0.7352

Model df: 4.   Total lags used: 24

The Ljung-Box p-value of 0.7352 means we cannot reject that the residuals are white noise, so the chosen model's residuals look adequate.

Forecast with the chosen model

power.fc <- forecast(auto.arima.pwr, h = 12)
autoplot(power.fc, main = "Residential Power Consumption", ylab = "Power Used(kWh)", 
    xlab = "Month")

date = seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month") %>% format("%b %Y")
# openxlsx::write.xlsx(data.frame(Date = date, kWh = power.fc$mean), file =
# 'Residential-624.xlsx', sheetName='Residential_Power_2014', append=TRUE)

Part C [Bonus]: Forecasting Water Pipe Flow

To be completed Oct 26, 2020. Will send separately.