DATA 624 Project 1

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(fpp3)

## Registered S3 methods overwritten by 'ggtime':
##   method           from      
##   autolayer.fbl_ts fabletools
##   autolayer.tbl_ts fabletools
##   autoplot.dcmp_ts fabletools
##   autoplot.fbl_ts  fabletools
##   autoplot.tbl_ts  fabletools
##   fortify.fbl_ts   fabletools
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tsibble     1.2.0     ✔ feasts      0.5.0
## ✔ tsibbledata 0.4.1     ✔ fable       0.5.0
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

library(readxl)
library(writexl)

PART A

atm_raw <- read_excel("ATM624Data.xlsx",
                      col_types = c("date", "text", "numeric")) |>
  rename(date = DATE, atm = ATM, cash = Cash)

glimpse(atm_raw)

## Rows: 1,474
## Columns: 3
## $ date <dttm> 2009-05-01, 2009-05-01, 2009-05-02, 2009-05-02, 2009-05-03, 2009…
## $ atm  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …

summary(atm_raw)

##       date                         atm                 cash        
##  Min.   :2009-05-01 00:00:00   Length:1474        Min.   :    0.0  
##  1st Qu.:2009-08-01 00:00:00   Class :character   1st Qu.:    0.5  
##  Median :2009-11-01 00:00:00   Mode  :character   Median :   73.0  
##  Mean   :2009-10-31 19:11:48                      Mean   :  155.6  
##  3rd Qu.:2010-02-01 00:00:00                      3rd Qu.:  114.0  
##  Max.   :2010-05-14 00:00:00                      Max.   :10919.8  
##                                                   NA's   :19

head(atm_raw)

## # A tibble: 6 × 3
##   date                atm    cash
##   <dttm>              <chr> <dbl>
## 1 2009-05-01 00:00:00 ATM1     96
## 2 2009-05-01 00:00:00 ATM2    107
## 3 2009-05-02 00:00:00 ATM1     82
## 4 2009-05-02 00:00:00 ATM2     89
## 5 2009-05-03 00:00:00 ATM1     85
## 6 2009-05-03 00:00:00 ATM2     90

#How many unique ATMs, and what is the date range?
atm_raw |>
  filter(!is.na(atm)) |>
  group_by(atm) |>
  summarise(
    n         = n(),
    start     = min(date),
    end       = max(date),
    n_missing = sum(is.na(cash)),
    n_zero    = sum(cash == 0, na.rm = TRUE),
    max_cash  = max(cash, na.rm = TRUE)
  )

## # A tibble: 4 × 7
##   atm       n start               end                 n_missing n_zero max_cash
##   <chr> <int> <dttm>              <dttm>                  <int>  <int>    <dbl>
## 1 ATM1    365 2009-05-01 00:00:00 2010-04-30 00:00:00         3      0     180 
## 2 ATM2    365 2009-05-01 00:00:00 2010-04-30 00:00:00         2      2     147 
## 3 ATM3    365 2009-05-01 00:00:00 2010-04-30 00:00:00         0    362      96 
## 4 ATM4    365 2009-05-01 00:00:00 2010-04-30 00:00:00         0      0   10920.

#What do the NA rows look like?
atm_raw |> filter(is.na(atm))

## # A tibble: 14 × 3
##    date                atm    cash
##    <dttm>              <chr> <dbl>
##  1 2010-05-01 00:00:00 <NA>     NA
##  2 2010-05-02 00:00:00 <NA>     NA
##  3 2010-05-03 00:00:00 <NA>     NA
##  4 2010-05-04 00:00:00 <NA>     NA
##  5 2010-05-05 00:00:00 <NA>     NA
##  6 2010-05-06 00:00:00 <NA>     NA
##  7 2010-05-07 00:00:00 <NA>     NA
##  8 2010-05-08 00:00:00 <NA>     NA
##  9 2010-05-09 00:00:00 <NA>     NA
## 10 2010-05-10 00:00:00 <NA>     NA
## 11 2010-05-11 00:00:00 <NA>     NA
## 12 2010-05-12 00:00:00 <NA>     NA
## 13 2010-05-13 00:00:00 <NA>     NA
## 14 2010-05-14 00:00:00 <NA>     NA

#Convert to tsibble
atm_raw <- atm_raw |>
  mutate(date = as.Date(date)) |>
  as_tsibble(index = date, key = atm)

There are 14 rows at the bottom with no ATM or Cash. These are the May 2010 forecast placeholder rows embedded in the file. I also see some missing Cash values and one extreme max value in ATM4. I will investigate these after visualising the raw series.

atm_raw |>
  filter(!is.na(atm)) |>
  mutate(date = as.Date(date)) |>
  ggplot(aes(x = date, y = cash)) +
  geom_line() +
  facet_wrap(~atm, scales = "free_y", ncol = 1) +
  labs(
    title    = "Daily ATM Cash Withdrawals - Raw Data", y = "Cash ($100s)"
  )

Since these initial plots indicated there might be some weekly seasonality, I decided to check for that. But before that, I removed the outlier in ATM4. I replaced it with the previous week’s value from the same day because I suspected said weekly seasonality. The plots below ended up confirming just that.

#Detecting the outlier
atm_raw |>
  filter(atm == "ATM4") |>
  arrange(desc(cash)) |>
  head(5)

## # A tsibble: 5 x 3 [1D]
## # Key:       atm [1]
##   date       atm     cash
##   <date>     <chr>  <dbl>
## 1 2010-02-09 ATM4  10920.
## 2 2009-09-22 ATM4   1712.
## 3 2010-01-29 ATM4   1575.
## 4 2009-09-04 ATM4   1495.
## 5 2009-06-09 ATM4   1484.

#Removing ATM4 outlier
atm_fixed <- atm_raw |>
  filter(!is.na(atm)) |>
  mutate(cash = if_else(atm == "ATM4" & date == as.Date("2010-02-09"),
                        153.24, cash))

#Season plot to visually check for weekly patterns
atm_fixed |>
  filter(atm %in% c("ATM1", "ATM2", "ATM4")) |>
  gg_season(cash, period = "week") +
  facet_wrap(~atm, scales = "free_y", ncol = 1) +
  labs(title = "Weekly Season Plot - ATM Cash Withdrawals", y = "Cash ($100s)")

#ACF, spikes at lags 7, 14, 21 indicate weekly seasonality
atm_fixed |>
  filter(atm %in% c("ATM1", "ATM2", "ATM4")) |>
  ACF(cash, lag_max = 28) |>
  autoplot() +
  facet_wrap(~atm, scales = "free_y") +
  labs(title = "ACF - ATM Cash Withdrawals")

There is a lot of useful information in the plots. The season plot shows consistent day-of-week patterns across all weeks for ATM1 and ATM2, most notably a pronounced dip on Thursday visible in nearly every week. ATM4 shows a similar but noisier pattern. This is strong visual evidence of weekly seasonality. The ACF confirms what the season plot showed. ATM1 and ATM2 have large, significant spikes at lags 7, 14, and 21, which is a sign of weekly seasonality. ATM4’s ACF is weaker but still shows spikes at the same lags, consistent with a noisier version of the same pattern. Both plots together give us sufficient evidence to proceed with weekly seasonal models.

The summarized findings: ATM1: Reasonably consistent withdrawals throughout the series with visible weekly seasonality. ATM2: Similar behavior to ATM1, maybe even a trend too. ATM3: Almost entirely zero for the whole series, then a sudden jump at the very end. This is abnormal and needs investigation. ATM4: One extreme spike that completely dominates the y-axis, making the rest of the series impossible to read. This is almost certainly an error because that’s way too much money for an ATM to hold, which I ended up removing, as explained.

#Investigate ATM3
atm_fixed |>
  filter(atm == "ATM3") |>
  filter(cash > 0)

## # A tsibble: 3 x 3 [1D]
## # Key:       atm [1]
##   date       atm    cash
##   <date>     <chr> <dbl>
## 1 2010-04-28 ATM3     96
## 2 2010-04-29 ATM3     82
## 3 2010-04-30 ATM3     85

#Compare ATM3 non-zero dates against ATM1 on same dates
atm_fixed |>
  filter(atm %in% c("ATM1", "ATM3")) |>
  filter(date >= as.Date("2010-04-28")) |>
  pivot_wider(names_from = atm, values_from = cash)

## # A tsibble: 3 x 3 [1D]
##   date        ATM1  ATM3
##   <date>     <dbl> <dbl>
## 1 2010-04-28    96    96
## 2 2010-04-29    82    82
## 3 2010-04-30    85    85

#Check remaining missing values
atm_fixed |>
  filter(is.na(cash))

## # A tsibble: 5 x 3 [1D]
## # Key:       atm [2]
##   date       atm    cash
##   <date>     <chr> <dbl>
## 1 2009-06-13 ATM1     NA
## 2 2009-06-16 ATM1     NA
## 3 2009-06-22 ATM1     NA
## 4 2009-06-18 ATM2     NA
## 5 2009-06-24 ATM2     NA

ATM3 has only 3 non-zero observations and they are identical to ATM1 on those same dates. This strongly suggests ATM3 was not operational during this period and its final 3 entries are just duplicates of ATM1. With only 3 real data points there is no meaningful basis for an independent model, so I will forecast ATM3 using ATM1’s model. ATM1 and ATM2 each have a small number of missing Cash values in June 2009. Now that weekly seasonality is confirmed, lag-7 imputation replacing each missing value with the observation from the same day the prior week can be done.

#Impute missing values using same day previous week, now we have no NA values at all
atm_clean <- atm_fixed |>
  group_by_key() |>
  mutate(cash = if_else(is.na(cash), lag(cash, 7), cash)) |>
  ungroup()

#Cleaned series
atm_clean |>
  filter(atm %in% c("ATM1", "ATM2", "ATM4")) |>
  autoplot(cash) +
  facet_wrap(~atm, scales = "free_y", ncol = 1) +
  labs(
    title    = "Daily ATM Cash Withdrawals- Cleaned", y = "Cash ($100s)"
  )

ATM 1 and 2 look great, but ATM4 still looks rather noisy, so I will attempt to see if it requires a BoxCox transformation by finding the optimal lambda.

#Check if variance is changing over time for ATM4
atm_clean |>
  filter(atm == "ATM4") |>
  autoplot(cash) +
  labs(title = "ATM4 - Check for changing variance", y = "Cash ($100s)")

#Guerrero method to find optimal lambda
lambda <- atm_clean |>
  filter(atm == "ATM4") |>
  features(cash, features = guerrero) |>
  pull(lambda_guerrero)

cat("Guerrero lambda for ATM4:", round(lambda, 4))

## Guerrero lambda for ATM4: 0.4489

The Guerrero method returns a lambda of 0.45, which is close to a square root transformation. This is sufficiently far from 1 to justify applying a Box-Cox transformation to ATM4.

atm_clean |> features(cash, unitroot_kpss)

## # A tibble: 4 × 3
##   atm   kpss_stat kpss_pvalue
##   <chr>     <dbl>       <dbl>
## 1 ATM1      0.506      0.0403
## 2 ATM2      2.02       0.01  
## 3 ATM3      0.389      0.0818
## 4 ATM4      0.123      0.1

atm_clean |> features(cash, unitroot_ndiffs)  #number of differences needed

## # A tibble: 4 × 2
##   atm   ndiffs
##   <chr>  <int>
## 1 ATM1       1
## 2 ATM2       1
## 3 ATM3       0
## 4 ATM4       0

This essentially confirms that ATM1 and 2 are non-stationary, while the other 2 are.

#Pivot wide and convert to tsibble for modelling
atm_ts <- atm_clean |>
  pivot_wider(names_from = atm, values_from = cash) |>
  as_tsibble(index = date)

#STL decomposition to examine trend and seasonal components before model selection
atm_clean |>
  filter(atm %in% "ATM1") |>
  model(STL(cash)) |>
  components() |>
  autoplot() +
  labs(title = "STL Decomposition - ATM1")

atm_clean |>
  filter(atm %in% "ATM2") |>
  model(STL(cash)) |>
  components() |>
  autoplot() +
  labs(title = "STL Decomposition - ATM2")

atm_clean |>
  filter(atm %in% "ATM4") |>
  model(STL(cash)) |>
  components() |>
  autoplot() +
  labs(title = "STL Decomposition - ATM4")

The STL decomposition confirms what I saw in the season plots. ATM1 and ATM2 show no meaningful trend, as the trend component is essentially flat across the full year. Although I initially thought there might be a trend in ATM2, and it might look like there is a downwards trend, it recovers towards the end of the series, so I will simply consider it as not having a trend. The seasonal component is strong and stable for all 3, consistent with the weekly pattern I saw earlier. The remainder is reasonably well-behaved with no obvious structure, which is a good sign for modelling. Though ATM4’s remainder is a bit noisier with more variance, but this is just more justification for the BoxCox transformation later on.

atm_ts |> gg_tsdisplay(ATM1, plot_type = "partial", lag_max = 28) +
  labs(title = "ATM1 - ACF and PACF")

atm_ts |> gg_tsdisplay(ATM2, plot_type = "partial", lag_max = 28) +
  labs(title = "ATM2 - ACF and PACF")

atm_ts |> gg_tsdisplay(ATM4, plot_type = "partial", lag_max = 28) +
  labs(title = "ATM4 - ACF and PACF")

The ACF and PACF plots inform my model selection. ATM1 and ATM2 both show large, significant spikes at lags 7, 14, 21, and 28 in the ACF with a slow decay, and the PACF cuts off sharply after lag 7. This is the signature of seasonal autocorrelation at a weekly period, confirming that seasonal differencing at period 7 is indeed needed. ATM4 shows much weaker autocorrelation overall, as most spikes barely exceed the significance bounds, which is consistent with the high variance I observed. The BoxCox transformation should help expose the underlying structure more clearly. It already looks stationary, while the other 2 don’t. I will fit three models for each ATM: a seasonal NAIVE model as a benchmark, an automatically selected ARIMA constrained to weekly seasonality, and an ETS model. If ARIMA and ETS cannot beat the SNAIVE benchmark, then obviously added complexity is not justified.

fit_atm1 <- atm_ts |>
  model(
    snaive = SNAIVE(ATM1 ~ lag("week")),
    arima  = ARIMA(ATM1 ~ PDQ(period = 7)),
    ets    = ETS(ATM1)
  )

fit_atm2 <- atm_ts |>
  model(
    snaive = SNAIVE(ATM2 ~ lag("week")),
    arima  = ARIMA(ATM2 ~ PDQ(period = 7)),
    ets    = ETS(ATM2)
  )

#Fitting atm4 with the BoxCox transformation applied
fit_atm4 <- atm_ts |>
  model(
    snaive = SNAIVE(ATM4 ~ lag("week")),
    arima  = ARIMA(box_cox(ATM4, lambda) ~ PDQ(period = 7)),
    ets    = ETS(box_cox(ATM4, lambda))
  )

#Reports
fit_atm1 |> select(arima) |> report()

## Series: ATM1 
## Model: ARIMA(0,0,1)(0,1,2)[7] 
## 
## Coefficients:
##          ma1     sma1     sma2
##       0.1868  -0.5762  -0.1096
## s.e.  0.0548   0.0507   0.0496
## 
## sigma^2 estimated as 556.4:  log likelihood=-1640.02
## AIC=3288.04   AICc=3288.16   BIC=3303.57

fit_atm1 |> select(ets)   |> report()

## Series: ATM1 
## Model: ETS(A,N,A) 
##   Smoothing parameters:
##     alpha = 0.0206385 
##     gamma = 0.3301984 
## 
##   Initial states:
##      l[0]      s[0]    s[-1]    s[-2]    s[-3]    s[-4]    s[-5]    s[-6]
##  79.39605 -62.73693 6.754848 13.60636 10.95823 12.94413 10.40877 8.064604
## 
##   sigma^2:  578.8824
## 
##      AIC     AICc      BIC 
## 4486.151 4486.772 4525.150

fit_atm2 |> select(arima) |> report()

## Series: ATM2 
## Model: ARIMA(2,0,2)(0,1,1)[7] 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2     sma1
##       -0.4236  -0.9122  0.4590  0.7992  -0.7487
## s.e.   0.0548   0.0423  0.0848  0.0576   0.0388
## 
## sigma^2 estimated as 586:  log likelihood=-1648.63
## AIC=3309.25   AICc=3309.49   BIC=3332.54

fit_atm2 |> select(ets)   |> report()

## Series: ATM2 
## Model: ETS(A,N,A) 
##   Smoothing parameters:
##     alpha = 0.003771524 
##     gamma = 0.3621959 
## 
##   Initial states:
##      l[0]      s[0]     s[-1]   s[-2]    s[-3]    s[-4]    s[-5]   s[-6]
##  72.74555 -53.45142 -42.43341 16.7643 6.024302 26.48906 17.09206 29.5151
## 
##   sigma^2:  624.0025
## 
##      AIC     AICc      BIC 
## 4513.546 4514.168 4552.545

fit_atm4 |> select(arima) |> report()

## Series: ATM4 
## Model: ARIMA(0,0,1)(2,0,0)[7] w/ mean 
## Transformation: box_cox(ATM4, lambda) 
## 
## Coefficients:
##          ma1    sar1    sar2  constant
##       0.0801  0.2070  0.2037   16.8086
## s.e.  0.0527  0.0516  0.0524    0.7292
## 
## sigma^2 estimated as 174.9:  log likelihood=-1458.97
## AIC=2927.93   AICc=2928.1   BIC=2947.43

fit_atm4 |> select(ets)   |> report()

## Series: ATM4 
## Model: ETS(M,N,A) 
## Transformation: box_cox(ATM4, lambda) 
##   Smoothing parameters:
##     alpha = 0.01221965 
##     gamma = 0.1804067 
## 
##   Initial states:
##      l[0]      s[0]     s[-1]    s[-2]    s[-3]    s[-4]    s[-5]    s[-6]
##  26.99257 -19.19674 -4.763839 1.615992 2.809201 7.310503 8.141445 4.083436
## 
##   sigma^2:  0.2214
## 
##      AIC     AICc      BIC 
## 4018.952 4019.574 4057.951

#Information criteria comparison (ARIMA and ETS only becauseSNAIVE has no AIC)
glance(fit_atm1)

## # A tibble: 3 × 11
##   .model sigma2 log_lik   AIC  AICc   BIC ar_roots  ma_roots     MSE  AMSE   MAE
##   <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <list>    <list>     <dbl> <dbl> <dbl>
## 1 snaive   767.     NA    NA    NA    NA  <NULL>    <NULL>       NA    NA   NA  
## 2 arima    556.  -1640. 3288. 3288. 3304. <cpl [0]> <cpl [15]>   NA    NA   NA  
## 3 ets      579.  -2233. 4486. 4487. 4525. <NULL>    <NULL>      565.  568.  15.1

glance(fit_atm2)

## # A tibble: 3 × 11
##   .model sigma2 log_lik   AIC  AICc   BIC ar_roots  ma_roots    MSE  AMSE   MAE
##   <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <list>    <list>    <dbl> <dbl> <dbl>
## 1 snaive   882.     NA    NA    NA    NA  <NULL>    <NULL>      NA    NA   NA  
## 2 arima    586.  -1649. 3309. 3309. 3333. <cpl [2]> <cpl [9]>   NA    NA   NA  
## 3 ets      624.  -2247. 4514. 4514. 4553. <NULL>    <NULL>     609.  610.  17.2

glance(fit_atm4)

## # A tibble: 3 × 11
##   .model   sigma2 log_lik   AIC  AICc   BIC ar_roots ma_roots   MSE  AMSE    MAE
##   <chr>     <dbl>   <dbl> <dbl> <dbl> <dbl> <list>   <list>   <dbl> <dbl>  <dbl>
## 1 snaive  2.06e+5     NA    NA    NA    NA  <NULL>   <NULL>     NA    NA  NA    
## 2 arima   1.75e+2  -1459. 2928. 2928. 2947. <cpl>    <cpl>      NA    NA  NA    
## 3 ets     2.21e-1  -1999. 4019. 4020. 4058. <NULL>   <NULL>    164.  165.  0.369

#In-sample accuracy for all three models including SNAIVE
accuracy(fit_atm1)

## # A tibble: 3 × 10
##   .model .type         ME  RMSE   MAE    MPE  MAPE  MASE RMSSE     ACF1
##   <chr>  <chr>      <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 snaive Training -0.0363  27.7  17.5  -96.5  116. 1     1      0.180  
## 2 arima  Training -0.0736  23.3  14.6 -102.   117. 0.830 0.841 -0.00824
## 3 ets    Training -0.144   23.8  15.1 -106.   121. 0.858 0.859  0.138

accuracy(fit_atm2)

## # A tibble: 3 × 10
##   .model .type         ME  RMSE   MAE   MPE  MAPE  MASE RMSSE     ACF1
##   <chr>  <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 snaive Training  0.0223  29.6  20.3  -Inf   Inf 1     1      0.0483 
## 2 arima  Training -0.853   23.8  16.7  -Inf   Inf 0.824 0.803 -0.00207
## 3 ets    Training -0.626   24.7  17.2  -Inf   Inf 0.848 0.832  0.0106

accuracy(fit_atm4)

## # A tibble: 3 × 10
##   .model .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>  <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 snaive Training -3.70  453.  346. -392.  447. 1     1     0.0628
## 2 arima  Training 85.0   352.  275. -367.  412. 0.793 0.777 0.0195
## 3 ets    Training 64.5   345.  263. -360.  405. 0.759 0.760 0.0894

#Residual diagnostics, checking that model residuals resemble white noise
fit_atm1 |> select(arima) |> gg_tsresiduals() +
  labs(title = "ATM1 - ARIMA residuals")

fit_atm1 |> select(ets) |> gg_tsresiduals() +
  labs(title = "ATM1 - ETS residuals")

fit_atm2 |> select(arima) |> gg_tsresiduals() +
  labs(title = "ATM2 - ARIMA residuals")

fit_atm2 |> select(ets) |> gg_tsresiduals() +
  labs(title = "ATM2 - ETS residuals")

fit_atm4 |> select(arima) |> gg_tsresiduals() +
  labs(title = "ATM4 - ARIMA residuals")

fit_atm4 |> select(ets) |> gg_tsresiduals() +
  labs(title = "ATM4 - ETS residuals")

#Ljung-Box test, the formal test for residual autocorrelation
#H0: residuals are white noise. p > 0.05 means fail to reject.
#lag = 14 covers two seasonal periods
augment(fit_atm1) |> features(.innov, ljung_box, lag = 14) |> select(.model, lb_stat, lb_pvalue)

## # A tibble: 3 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 arima     15.8  3.26e- 1
## 2 ets       29.9  7.98e- 3
## 3 snaive    82.6  9.33e-12

augment(fit_atm2) |> features(.innov, ljung_box, lag = 14) |> select(.model, lb_stat, lb_pvalue)

## # A tibble: 3 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 arima     9.00  8.31e- 1
## 2 ets      36.8   7.84e- 4
## 3 snaive  104.    8.88e-16

augment(fit_atm4) |> features(.innov, ljung_box, lag = 14) |> select(.model, lb_stat, lb_pvalue)

## # A tibble: 3 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 arima     17.1     0.252
## 2 ets       16.4     0.290
## 3 snaive   117.      0

Model Results and Diagnostics A good model should have residuals that resemble white noise - no patterns in the time plot, no significant spikes in the ACF, and a roughly symmetric histogram centred at zero Three models were fitted for each ATM: a SNAIVE benchmark, ARIMA with weekly seasonality, and ETS. For ATM1 and ATM2, ARIMA selected seasonal differencing models - ARIMA(0,0,1)(0,1,2)[7] and ARIMA(2,0,2)(0,1,1)[7] respectively. Both ETS models selected ETS(A,N,A), showing the flat trend and stable seasonality from the STL decomposition. ATM4’s models differ slightly. ARIMA used seasonal AR terms without differencing given its weaker autocorrelation structure, and ETS selected ETS(M,N,A) with multiplicative errors to account for the higher variance.

Residual diagnostics show that, for ATM1 and ATM2, ARIMA passes the Ljung-Box test comfortably (p = 0.33 and p = 0.83), while ETS fails for both (p = 0.008 and p = 0.0008) and SNAIVE fails completely. For ATM4 both ARIMA and ETS pass (p = 0.25 and p = 0.29), so diagnostics alone cannot separate them. In-sample RMSE follows the same pattern: ARIMA outperforms on ATM1 (23.3 vs 23.8 for ETS) and ATM2 (23.8 vs 24.7), while ATM4 is too close to call (ETS at 345 versus ARIMA at 352).

Since in-sample accuracy is evaluated on data the model already saw, I will use time series cross-validation with a rolling forecasting origin to get a true picture of out-of-sample forecast performance.

#CV

#Would have used .step = 1, but my computer exploded
atm1_stretch <- atm_ts |>
  slice(1:(n() - 31)) |>
  select(ATM1) |>
  stretch_tsibble(.init = 182, .step = 7)

cv_atm1 <- atm1_stretch |>
  model(
    snaive = SNAIVE(ATM1 ~ lag("week")),
    arima  = ARIMA(ATM1 ~ PDQ(period = 7)),
    ets    = ETS(ATM1)
  ) |>
  forecast(h = 30)

cv_atm1 |>
  group_by(.id, .model) |>
  mutate(h = row_number()) |>
  ungroup() |>
  as_fable(response = "ATM1", distribution = ATM1) |>
  accuracy(atm_ts, by = c("h", ".model")) |>
  select(h, .model, RMSE, MAE) |>
  ggplot(aes(x = h, y = RMSE, col = .model)) +
  geom_line() +
  labs(title = "ATM1 - Cross-validated RMSE by forecast horizon",
       x = "Horizon (days)", y = "RMSE")

atm2_stretch <- atm_ts |>
  slice(1:(n() - 31)) |>
  select(ATM2) |>
  stretch_tsibble(.init = 182, .step = 7)

cv_atm2 <- atm2_stretch |>
  model(
    snaive = SNAIVE(ATM2 ~ lag("week")),
    arima  = ARIMA(ATM2 ~ PDQ(period = 7)),
    ets    = ETS(ATM2)
  ) |>
  forecast(h = 30)

cv_atm2 |>
  group_by(.id, .model) |>
  mutate(h = row_number()) |>
  ungroup() |>
  as_fable(response = "ATM2", distribution = ATM2) |>
  accuracy(atm_ts, by = c("h", ".model")) |>
  select(h, .model, RMSE, MAE) |>
  ggplot(aes(x = h, y = RMSE, col = .model)) +
  geom_line() +
  labs(title = "ATM2 - Cross-validated RMSE by forecast horizon",
       x = "Horizon (days)", y = "RMSE")

atm4_stretch <- atm_ts |>
  slice(1:(n() - 31)) |>
  select(ATM4) |>
  stretch_tsibble(.init = 182, .step = 7)

cv_atm4 <- atm4_stretch |>
  model(
    snaive = SNAIVE(ATM4 ~ lag("week")),
    arima  = ARIMA(box_cox(ATM4, lambda) ~ PDQ(period = 7)),
    ets    = ETS(box_cox(ATM4, lambda))
  ) |>
  forecast(h = 30)

cv_atm4 |>
  group_by(.id, .model) |>
  mutate(h = row_number()) |>
  ungroup() |>
  as_fable(response = "ATM4", distribution = ATM4) |>
  accuracy(atm_ts, by = c("h", ".model")) |>
  select(h, .model, RMSE, MAE) |>
  ggplot(aes(x = h, y = RMSE, col = .model)) +
  geom_line() +
  labs(title = "ATM4 - Cross-validated RMSE by forecast horizon",
       x = "Horizon (days)", y = "RMSE")

Cross-validation confirms the residual diagnostics for ATM1 and ATM2 but tells a different story for ATM4. For ATM1, ARIMA consistently produces the lowest RMSE across nearly all forecast horizons, with ETS slightly worse and SNAIVE the weakest. The periodic dips in RMSE at multiples of 7 are expected, predicting 7, 14, 21 days ahead again aligns with the weekly seasonal cycle and makes those horizons naturally easier to forecast. ARIMA is the clear choice for ATM1, from my understanding. For ATM2, the picture is less clear because ARIMA and ETS track closely together across all horizons and both are comparable to SNAIVE for much of the range, with ARIMA pulling slightly ahead at longer horizons. However, ARIMA already passed the Ljung-Box test while ETS failed, so combined with the small CV advantage ARIMA is also my preferred model for ATM2. For ATM4, ARIMA and ETS are very close across all horizons and both substantially outperform SNAIVE. Given that both models pass the Ljung-Box test and CV does not clearly separate them, ETS is selected as the slightly more parsimonious model, because simplicity is preferred.

#Final forecasts - 31 days for May 2010
fc_atm1 <- fit_atm1 |>
  select(arima) |>
  forecast(h = 31)

fc_atm2 <- fit_atm2 |>
  select(arima) |>
  forecast(h = 31)

fc_atm4 <- fit_atm4 |>
  select(ets) |>
  forecast(h = 31)

fc_atm1 |>
  autoplot(atm_ts |> filter_index("2010-02-01" ~ .), level = 80) +
  labs(title = "ATM1 - May 2010 Forecast", y = "Cash ($100s)")

fc_atm2 |>
  autoplot(atm_ts |> filter_index("2010-02-01" ~ .), level = 80) +
  labs(title = "ATM2 - May 2010 Forecast", y = "Cash ($100s)")

fc_atm4 |>
  autoplot(atm_ts |> filter_index("2010-02-01" ~ .), level = 80) +
  labs(title = "ATM4 - May 2010 Forecast", y = "Cash ($100s)")

The ATM4 forecast has intervals so wide that it doesn’t really make sense. Since the ARIMA model also passed the tests here, I will just switch to that instead and see if it is any better.

#Checking the means to confirm whether the model is wrong, or the data simply has high variance and the model is accounting for it
fc_atm4 |>
  as_tibble() |>
  select(date, .mean)

## # A tibble: 31 × 2
##    date       .mean
##    <date>     <dbl>
##  1 2010-05-01 337. 
##  2 2010-05-02 460. 
##  3 2010-05-03 473. 
##  4 2010-05-04  43.2
##  5 2010-05-05 589. 
##  6 2010-05-06 385. 
##  7 2010-05-07 623. 
##  8 2010-05-08 342. 
##  9 2010-05-09 465. 
## 10 2010-05-10 474. 
## # ℹ 21 more rows

The point forecasts for all three ATMs are plausible and consistent with historical withdrawal patterns. ATM1 and ATM2 forecasts show the expected weekly cycle continuing into May 2010 with relatively tight prediction intervals. ATM4’s prediction intervals are notably wider, which was surprising to me. I considered using the ARIMA model, which also passed the tests, but ultimately decide against it. Because I believe they are simply reflecting the genuine high variance observed throughout that series, and the model is correctly acknowledging uncertainty rather than overstating precision. The point forecasts for ATM4 range between approximately $24000 and $51000 per day, which is consistent with the historical average of around $44500. ATM3 is just assigned ATM1’s forecast directly, given the evidence that the two machines recorded identical values on their only overlapping dates.

#ATM3 uses ATM1's forecast because no independent model possible
atm1_out <- fc_atm1 |>
  as_tibble() |>
  mutate(ATM = "ATM1") |>
  select(DATE = date, ATM, Cash = .mean)

atm3_out <- atm1_out |>
  mutate(ATM = "ATM3")

atm2_out <- fc_atm2 |>
  as_tibble() |>
  mutate(ATM = "ATM2") |>
  select(DATE = date, ATM, Cash = .mean)

atm4_out <- fc_atm4 |>
  as_tibble() |>
  mutate(ATM = "ATM4") |>
  select(DATE = date, ATM, Cash = .mean)

#Showing the forecast cash values
atm_forecast <- bind_rows(atm1_out, atm2_out, atm3_out, atm4_out) |>
  arrange(ATM, DATE)

write_xlsx(atm_forecast, "PartA_ATM_May2010_Forecast.xlsx")
print(atm_forecast)

## # A tibble: 124 × 3
##    DATE       ATM     Cash
##    <date>     <chr>  <dbl>
##  1 2010-05-01 ATM1   87.2 
##  2 2010-05-02 ATM1  104.  
##  3 2010-05-03 ATM1   73.1 
##  4 2010-05-04 ATM1    8.16
##  5 2010-05-05 ATM1  100.0 
##  6 2010-05-06 ATM1   80.9 
##  7 2010-05-07 ATM1   86.7 
##  8 2010-05-08 ATM1   88.0 
##  9 2010-05-09 ATM1  103.  
## 10 2010-05-10 ATM1   73.4 
## # ℹ 114 more rows

PART B

power_raw <- read_excel("ResidentialCustomerForecastLoad-624.xlsx",
                        col_types = c("numeric", "text", "numeric")) |>
  rename(seq_id = CaseSequence, period = `YYYY-MMM`, kwh = KWH)

glimpse(power_raw)

## Rows: 192
## Columns: 3
## $ seq_id <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745…
## $ period <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May", "19…
## $ kwh    <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 8914755, …

summary(power_raw)

##      seq_id         period               kwh          
##  Min.   :733.0   Length:192         Min.   :  770523  
##  1st Qu.:780.8   Class :character   1st Qu.: 5429912  
##  Median :828.5   Mode  :character   Median : 6283324  
##  Mean   :828.5                      Mean   : 6502475  
##  3rd Qu.:876.2                      3rd Qu.: 7620524  
##  Max.   :924.0                      Max.   :10655730  
##                                     NA's   :1

#Convert to tsibble
power_ts <- power_raw |>
  mutate(month = yearmonth(period)) |>
  select(month, kwh) |>
  as_tsibble(index = month)

head(power_ts)

## # A tsibble: 6 x 2 [1M]
##      month     kwh
##      <mth>   <dbl>
## 1 1998 Jan 6862583
## 2 1998 Feb 5838198
## 3 1998 Mar 5420658
## 4 1998 Apr 5010364
## 5 1998 May 4665377
## 6 1998 Jun 6467147

#Initial plot
power_ts |>
  autoplot(kwh) +
  labs(title = "Monthly Residential Power Consumption",
       subtitle = "January 1998 - December 2013", y = "KWH")

We have a clear annual seasonality with summer and winter peaks, a slight upward trend from 1998 through roughly 2010, then a slightly leveling off. And there’s an obvious outlier around 2010 because that extreme dip is clearly wrong, normal values are in the 4-9 million range. There is also 1 missing value, just from observing the dataset, in Sep 2009. I’ll follow the same approach and impute both strange values, just with values from 12 months prior.

power_ts |> filter(is.na(kwh))

## # A tsibble: 1 x 2 [1M]
##      month   kwh
##      <mth> <dbl>
## 1 2008 Sep    NA

power_ts |> filter(kwh < 3000000)

## # A tsibble: 1 x 2 [1M]
##      month    kwh
##      <mth>  <dbl>
## 1 2010 Jul 770523

#Check prior year values for imputation
power_ts |> filter(month == yearmonth("2007 Sep") | month == yearmonth("2009 Jul"))

## # A tsibble: 2 x 2 [1M]
##      month     kwh
##      <mth>   <dbl>
## 1 2007 Sep 7666970
## 2 2009 Jul 7713260

power_clean <- power_ts |>
  mutate(kwh = case_when(
    month == yearmonth("2008 Sep") ~ power_ts$kwh[power_ts$month == yearmonth("2007 Sep")],
    month == yearmonth("2010 Jul") ~ power_ts$kwh[power_ts$month == yearmonth("2009 Jul")],
    TRUE ~ kwh
  ))

#Confirm no remaining NAs and outlier is fixed
power_clean |> filter(is.na(kwh))

## # A tsibble: 0 x 2 [?]
## # ℹ 2 variables: month <mth>, kwh <dbl>

power_clean |> filter(kwh < 3000000)

## # A tsibble: 0 x 2 [?]
## # ℹ 2 variables: month <mth>, kwh <dbl>

#Replot to see if it looks better
power_clean |>
  autoplot(kwh) +
  labs(title = "Monthly Residential Power Consumption - Cleaned",
       y = "KWH")

It now looks significantly better, and the upwards trend became more obvious.

#Decompose to inspect trends and seasonality
power_clean |> 
  model(STL(kwh ~ trend(window=7) + season(window="periodic"))) |>
  components() |>
  autoplot()

I already knew there was a monthly seasonal pattern, this is just additional confirmation. This confirms the upwards trend as well. Summer and winter peaks can be seen, which can be attributed to cooling and heating needs. I will try and fit candidate models, ETS, ARIMA and SNAIVE.

models <- power_ts |>
  model(
    ETS = ETS(kwh),
    ARIMA = ARIMA(kwh),
    SNAIVE = SNAIVE(kwh ~ lag("year"))
  )

#Compares in-sample accuracy
accuracy(models)

## # A tibble: 3 × 10
##   .model .type        ME     RMSE      MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>  <chr>     <dbl>    <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 ETS    Training 89189. 1419998. 1184278. -6.28  21.8 1.71  1.20  0.380  
## 2 ARIMA  Training -9734.  845161.  506277. -5.06  11.6 0.731 0.717 0.00853
## 3 SNAIVE Training 94772. 1176118.  689275. -3.94  14.5 0.995 0.997 0.231

The chosen models were (M,N,N) for ETS, and ARIMA(0,0,2)(2,1,0)[12].Based on the in-sample accuracy, ARIMA clearly outperforms both ETS and SNAIVE, with the lowest RMSE (845160.5) and MAE, and near-zero autocorrelation in residuals. ETS struggles with the variability (RMSE 1419997.8), while SNAIVE captures seasonality partially, but misses the trend. Overall, ARIMA provides the most reliable fit for forecasting 2014 power consumption.

#ARIMA has the lowest RMSE and MAE, so I chose that
best_model <- models |> select(ARIMA)

#Forecast 12 months ahead (Jan-Dec 2014)
forecast_2014 <- best_model |> 
  forecast(h = "12 months")

#Plot forecast against historical data
autoplot(forecast_2014, power_clean) +
  labs(title = "Residential Power Consumption Forecast - 2014",
       y = "kwh")

#Check forecast values
forecast_2014 |> as_tibble()

## # A tibble: 12 × 4
##    .model    month
##    <chr>     <mth>
##  1 ARIMA  2014 Jan
##  2 ARIMA  2014 Feb
##  3 ARIMA  2014 Mar
##  4 ARIMA  2014 Apr
##  5 ARIMA  2014 May
##  6 ARIMA  2014 Jun
##  7 ARIMA  2014 Jul
##  8 ARIMA  2014 Aug
##  9 ARIMA  2014 Sep
## 10 ARIMA  2014 Oct
## 11 ARIMA  2014 Nov
## 12 ARIMA  2014 Dec
## # ℹ 2 more variables: kwh <dist>, .mean <dbl>

Based on in-sample accuracy, ARIMA was selected as the best model for forecasting residential power consumption in 2014. It achieved the lowest RMSE and MAE, meaning that it captures both the upward trend and the seasonal patterns more effectively than ETS or SNAIVE. The 12-month forecast for 2014 shows the expected seasonal peaks in summer and winter, with monthly kwh values ranging from roughly 5.9 million in October to 10 million in August. The forecast aligns closely with historical patterns, and the prediction intervals appropriately widen to reflect the uncertainty further into the year, thus providing a reliable estimate of monthly power usage for 2014.

#Export
forecast_2014 |> 
  as_tibble() |>
  select(month, .mean) |>
  rename(Forecast_KWH = .mean) |>
  write_excel_csv("PartB_ResidentialPowerForecast_2014.csv")

PART C

#Read Pipe 1 and Pipe 2
pipe1 <- read_excel("Waterflow_Pipe1.xlsx")
pipe2 <- read_excel("Waterflow_Pipe2.xlsx")

#Name columns consistently
colnames(pipe1) <- c("datetime", "flow")
colnames(pipe2) <- c("datetime", "flow")

Excel stores dates as serial numbers, so they must be converted into proper datetime objects in R for accurate time-based operations.

#Excel serial number to POSIXct (adjusting for Excel's 1900 origin)
pipe1 <- pipe1 |>
  mutate(datetime = as_datetime((datetime - 25569) * 86400))

pipe2 <- pipe2 |>
  mutate(datetime = as_datetime((datetime - 25569) * 86400))

#Aggregate based on hour and then take the mean, as suggested
pipe1_hourly <- pipe1 |>
  mutate(hour = floor_date(datetime, "hour")) |>
  group_by(hour) |>
  summarise(flow = mean(flow, na.rm = TRUE))

pipe2_hourly <- pipe2 |>
  mutate(hour = floor_date(datetime, "hour")) |>
  group_by(hour) |>
  summarise(flow = mean(flow, na.rm = TRUE))

#Convert to tsibble
pipe1_ts <- pipe1_hourly |> as_tsibble(index = hour)
pipe2_ts <- pipe2_hourly |> as_tsibble(index = hour)

pipe1_ts |>
  autoplot(flow) +
  labs(title = "Pipe 1 flow")

pipe2_ts |>
  autoplot(flow) +
  labs(title = "Pipe 2 flow")

No seasonality or trend to be obsevred, variance is normal.

pipe1_ts |> features(flow, unitroot_kpss)

## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.247         0.1

pipe1_ts |> features(flow, unitroot_ndiffs)  #number of differences needed

## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0

pipe2_ts |> features(flow, unitroot_kpss)

## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.420      0.0683

pipe2_ts |> features(flow, unitroot_ndiffs)

## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0

Stationarity is assessed using the KPSS test. Both series appear stationary (KPSS pvalues > 0.05, ndiffs = 0), so no differencing is required before forecasting.

#There are gaps in the series, so they need to be filled with NA
pipe1_ts <- pipe1_ts |> fill_gaps(flow = NA)
pipe2_ts <- pipe2_ts |> fill_gaps(flow = NA)

#Fit model
pipe1_fit <- pipe1_ts |> model(ARIMA(flow))
pipe2_fit <- pipe2_ts |> model(ARIMA(flow))

#Forecast a week ahead (168 hours) with intervals
pipe1_fc <- pipe1_fit |> forecast(h = "168 hours", level = c(80, 95))
pipe2_fc <- pipe2_fit |> forecast(h = "168 hours", level = c(80, 95))

#Plot with intervals
pipe1_fc |> autoplot(pipe1_ts) + labs(title = "Pipe 1 Forecast")

pipe2_fc |> autoplot(pipe2_ts) + labs(title = "Pipe 2 Forecast")

Visual inspection showed no clear trend or seasonality in either series. As a result, the forecasts reflect the underlying stability of the data. Pipe 1 produces a nearly flat wide forecast around its mean, while Pipe 2 shows wider intervals due to small fluctuations in the timestamps. The two datasets were analyzed separately.

#export
pipe1_fc |> as_tibble() |> write_xlsx("PartC_Pipe1_Forecast.xlsx")
pipe2_fc |> as_tibble() |> write_xlsx("PartC_Pipe2_Forecast.xlsx")

DATA 624 Project 1

Hristiyana Yaneva

2026-03-24

DATA 624 Project 1

PART A

PART B

PART C