Project 1 - data 624

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'ggplot2' was built under R version 4.5.2

## Warning: package 'tibble' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(fpp3)

## Warning: package 'fpp3' was built under R version 4.5.2

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tsibble     1.1.6     ✔ feasts      0.4.2
## ✔ tsibbledata 0.4.1     ✔ fable       0.5.0

## Warning: package 'tsibble' was built under R version 4.5.2

## Warning: package 'tsibbledata' was built under R version 4.5.2

## Warning: package 'feasts' was built under R version 4.5.2

## Warning: package 'fabletools' was built under R version 4.5.2

## Warning: package 'fable' was built under R version 4.5.2

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

Project 1 - Part A, ATM data

Loading the dataframe

I converted the file to a .csv and uploaded it to github.

atm_data <- read.table("https://raw.githubusercontent.com/samanthabarbaro/data624/refs/heads/main/ATM624Data.csv", sep = ",", header = TRUE)

#how did the variables import?
glimpse(atm_data)

## Rows: 1,474
## Columns: 3
## $ DATE <chr> "5/1/2009 12:00:00 AM", "5/1/2009 12:00:00 AM", "5/2/2009 12:00:0…
## $ ATM  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ Cash <int> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …

Cleaning up formatting and looking for null values

The dates imported formatted as characters. There’s a 12:00:00 a.m. timestamp for every instance, which we can remove to simplify things.

atm_data |> distinct(DATE)

##                       DATE
## 1     5/1/2009 12:00:00 AM
## 2     5/2/2009 12:00:00 AM
## 3     5/3/2009 12:00:00 AM
## 4     5/4/2009 12:00:00 AM
## 5     5/5/2009 12:00:00 AM
## 6     5/6/2009 12:00:00 AM
## 7     5/7/2009 12:00:00 AM
## 8     5/8/2009 12:00:00 AM
## 9     5/9/2009 12:00:00 AM
## 10   5/10/2009 12:00:00 AM
## 11   5/11/2009 12:00:00 AM
## 12   5/12/2009 12:00:00 AM
## 13   5/13/2009 12:00:00 AM
## 14   5/14/2009 12:00:00 AM
## 15   5/15/2009 12:00:00 AM
## 16   5/16/2009 12:00:00 AM
## 17   5/17/2009 12:00:00 AM
## 18   5/18/2009 12:00:00 AM
## 19   5/19/2009 12:00:00 AM
## 20   5/20/2009 12:00:00 AM
## 21   5/21/2009 12:00:00 AM
## 22   5/22/2009 12:00:00 AM
## 23   5/23/2009 12:00:00 AM
## 24   5/24/2009 12:00:00 AM
## 25   5/25/2009 12:00:00 AM
## 26   5/26/2009 12:00:00 AM
## 27   5/27/2009 12:00:00 AM
## 28   5/28/2009 12:00:00 AM
## 29   5/29/2009 12:00:00 AM
## 30   5/30/2009 12:00:00 AM
## 31   5/31/2009 12:00:00 AM
## 32    6/1/2009 12:00:00 AM
## 33    6/2/2009 12:00:00 AM
## 34    6/3/2009 12:00:00 AM
## 35    6/4/2009 12:00:00 AM
## 36    6/5/2009 12:00:00 AM
## 37    6/6/2009 12:00:00 AM
## 38    6/7/2009 12:00:00 AM
## 39    6/8/2009 12:00:00 AM
## 40    6/9/2009 12:00:00 AM
## 41   6/10/2009 12:00:00 AM
## 42   6/11/2009 12:00:00 AM
## 43   6/12/2009 12:00:00 AM
## 44   6/13/2009 12:00:00 AM
## 45   6/14/2009 12:00:00 AM
## 46   6/15/2009 12:00:00 AM
## 47   6/16/2009 12:00:00 AM
## 48   6/17/2009 12:00:00 AM
## 49   6/18/2009 12:00:00 AM
## 50   6/19/2009 12:00:00 AM
## 51   6/20/2009 12:00:00 AM
## 52   6/21/2009 12:00:00 AM
## 53   6/22/2009 12:00:00 AM
## 54   6/23/2009 12:00:00 AM
## 55   6/24/2009 12:00:00 AM
## 56   6/25/2009 12:00:00 AM
## 57   6/26/2009 12:00:00 AM
## 58   6/27/2009 12:00:00 AM
## 59   6/28/2009 12:00:00 AM
## 60   6/29/2009 12:00:00 AM
## 61   6/30/2009 12:00:00 AM
## 62    7/1/2009 12:00:00 AM
## 63    7/2/2009 12:00:00 AM
## 64    7/3/2009 12:00:00 AM
## 65    7/4/2009 12:00:00 AM
## 66    7/5/2009 12:00:00 AM
## 67    7/6/2009 12:00:00 AM
## 68    7/7/2009 12:00:00 AM
## 69    7/8/2009 12:00:00 AM
## 70    7/9/2009 12:00:00 AM
## 71   7/10/2009 12:00:00 AM
## 72   7/11/2009 12:00:00 AM
## 73   7/12/2009 12:00:00 AM
## 74   7/13/2009 12:00:00 AM
## 75   7/14/2009 12:00:00 AM
## 76   7/15/2009 12:00:00 AM
## 77   7/16/2009 12:00:00 AM
## 78   7/17/2009 12:00:00 AM
## 79   7/18/2009 12:00:00 AM
## 80   7/19/2009 12:00:00 AM
## 81   7/20/2009 12:00:00 AM
## 82   7/21/2009 12:00:00 AM
## 83   7/22/2009 12:00:00 AM
## 84   7/23/2009 12:00:00 AM
## 85   7/24/2009 12:00:00 AM
## 86   7/25/2009 12:00:00 AM
## 87   7/26/2009 12:00:00 AM
## 88   7/27/2009 12:00:00 AM
## 89   7/28/2009 12:00:00 AM
## 90   7/29/2009 12:00:00 AM
## 91   7/30/2009 12:00:00 AM
## 92   7/31/2009 12:00:00 AM
## 93    8/1/2009 12:00:00 AM
## 94    8/2/2009 12:00:00 AM
## 95    8/3/2009 12:00:00 AM
## 96    8/4/2009 12:00:00 AM
## 97    8/5/2009 12:00:00 AM
## 98    8/6/2009 12:00:00 AM
## 99    8/7/2009 12:00:00 AM
## 100   8/8/2009 12:00:00 AM
## 101   8/9/2009 12:00:00 AM
## 102  8/10/2009 12:00:00 AM
## 103  8/11/2009 12:00:00 AM
## 104  8/12/2009 12:00:00 AM
## 105  8/13/2009 12:00:00 AM
## 106  8/14/2009 12:00:00 AM
## 107  8/15/2009 12:00:00 AM
## 108  8/16/2009 12:00:00 AM
## 109  8/17/2009 12:00:00 AM
## 110  8/18/2009 12:00:00 AM
## 111  8/19/2009 12:00:00 AM
## 112  8/20/2009 12:00:00 AM
## 113  8/21/2009 12:00:00 AM
## 114  8/22/2009 12:00:00 AM
## 115  8/23/2009 12:00:00 AM
## 116  8/24/2009 12:00:00 AM
## 117  8/25/2009 12:00:00 AM
## 118  8/26/2009 12:00:00 AM
## 119  8/27/2009 12:00:00 AM
## 120  8/28/2009 12:00:00 AM
## 121  8/29/2009 12:00:00 AM
## 122  8/30/2009 12:00:00 AM
## 123  8/31/2009 12:00:00 AM
## 124   9/1/2009 12:00:00 AM
## 125   9/2/2009 12:00:00 AM
## 126   9/3/2009 12:00:00 AM
## 127   9/4/2009 12:00:00 AM
## 128   9/5/2009 12:00:00 AM
## 129   9/6/2009 12:00:00 AM
## 130   9/7/2009 12:00:00 AM
## 131   9/8/2009 12:00:00 AM
## 132   9/9/2009 12:00:00 AM
## 133  9/10/2009 12:00:00 AM
## 134  9/11/2009 12:00:00 AM
## 135  9/12/2009 12:00:00 AM
## 136  9/13/2009 12:00:00 AM
## 137  9/14/2009 12:00:00 AM
## 138  9/15/2009 12:00:00 AM
## 139  9/16/2009 12:00:00 AM
## 140  9/17/2009 12:00:00 AM
## 141  9/18/2009 12:00:00 AM
## 142  9/19/2009 12:00:00 AM
## 143  9/20/2009 12:00:00 AM
## 144  9/21/2009 12:00:00 AM
## 145  9/22/2009 12:00:00 AM
## 146  9/23/2009 12:00:00 AM
## 147  9/24/2009 12:00:00 AM
## 148  9/25/2009 12:00:00 AM
## 149  9/26/2009 12:00:00 AM
## 150  9/27/2009 12:00:00 AM
## 151  9/28/2009 12:00:00 AM
## 152  9/29/2009 12:00:00 AM
## 153  9/30/2009 12:00:00 AM
## 154  10/1/2009 12:00:00 AM
## 155  10/2/2009 12:00:00 AM
## 156  10/3/2009 12:00:00 AM
## 157  10/4/2009 12:00:00 AM
## 158  10/5/2009 12:00:00 AM
## 159  10/6/2009 12:00:00 AM
## 160  10/7/2009 12:00:00 AM
## 161  10/8/2009 12:00:00 AM
## 162  10/9/2009 12:00:00 AM
## 163 10/10/2009 12:00:00 AM
## 164 10/11/2009 12:00:00 AM
## 165 10/12/2009 12:00:00 AM
## 166 10/13/2009 12:00:00 AM
## 167 10/14/2009 12:00:00 AM
## 168 10/15/2009 12:00:00 AM
## 169 10/16/2009 12:00:00 AM
## 170 10/17/2009 12:00:00 AM
## 171 10/18/2009 12:00:00 AM
## 172 10/19/2009 12:00:00 AM
## 173 10/20/2009 12:00:00 AM
## 174 10/21/2009 12:00:00 AM
## 175 10/22/2009 12:00:00 AM
## 176 10/23/2009 12:00:00 AM
## 177 10/24/2009 12:00:00 AM
## 178 10/25/2009 12:00:00 AM
## 179 10/26/2009 12:00:00 AM
## 180 10/27/2009 12:00:00 AM
## 181 10/28/2009 12:00:00 AM
## 182 10/29/2009 12:00:00 AM
## 183 10/30/2009 12:00:00 AM
## 184 10/31/2009 12:00:00 AM
## 185  11/1/2009 12:00:00 AM
## 186  11/2/2009 12:00:00 AM
## 187  11/3/2009 12:00:00 AM
## 188  11/4/2009 12:00:00 AM
## 189  11/5/2009 12:00:00 AM
## 190  11/6/2009 12:00:00 AM
## 191  11/7/2009 12:00:00 AM
## 192  11/8/2009 12:00:00 AM
## 193  11/9/2009 12:00:00 AM
## 194 11/10/2009 12:00:00 AM
## 195 11/11/2009 12:00:00 AM
## 196 11/12/2009 12:00:00 AM
## 197 11/13/2009 12:00:00 AM
## 198 11/14/2009 12:00:00 AM
## 199 11/15/2009 12:00:00 AM
## 200 11/16/2009 12:00:00 AM
## 201 11/17/2009 12:00:00 AM
## 202 11/18/2009 12:00:00 AM
## 203 11/19/2009 12:00:00 AM
## 204 11/20/2009 12:00:00 AM
## 205 11/21/2009 12:00:00 AM
## 206 11/22/2009 12:00:00 AM
## 207 11/23/2009 12:00:00 AM
## 208 11/24/2009 12:00:00 AM
## 209 11/25/2009 12:00:00 AM
## 210 11/26/2009 12:00:00 AM
## 211 11/27/2009 12:00:00 AM
## 212 11/28/2009 12:00:00 AM
## 213 11/29/2009 12:00:00 AM
## 214 11/30/2009 12:00:00 AM
## 215  12/1/2009 12:00:00 AM
## 216  12/2/2009 12:00:00 AM
## 217  12/3/2009 12:00:00 AM
## 218  12/4/2009 12:00:00 AM
## 219  12/5/2009 12:00:00 AM
## 220  12/6/2009 12:00:00 AM
## 221  12/7/2009 12:00:00 AM
## 222  12/8/2009 12:00:00 AM
## 223  12/9/2009 12:00:00 AM
## 224 12/10/2009 12:00:00 AM
## 225 12/11/2009 12:00:00 AM
## 226 12/12/2009 12:00:00 AM
## 227 12/13/2009 12:00:00 AM
## 228 12/14/2009 12:00:00 AM
## 229 12/15/2009 12:00:00 AM
## 230 12/16/2009 12:00:00 AM
## 231 12/17/2009 12:00:00 AM
## 232 12/18/2009 12:00:00 AM
## 233 12/19/2009 12:00:00 AM
## 234 12/20/2009 12:00:00 AM
## 235 12/21/2009 12:00:00 AM
## 236 12/22/2009 12:00:00 AM
## 237 12/23/2009 12:00:00 AM
## 238 12/24/2009 12:00:00 AM
## 239 12/25/2009 12:00:00 AM
## 240 12/26/2009 12:00:00 AM
## 241 12/27/2009 12:00:00 AM
## 242 12/28/2009 12:00:00 AM
## 243 12/29/2009 12:00:00 AM
## 244 12/30/2009 12:00:00 AM
## 245 12/31/2009 12:00:00 AM
## 246   1/1/2010 12:00:00 AM
## 247   1/2/2010 12:00:00 AM
## 248   1/3/2010 12:00:00 AM
## 249   1/4/2010 12:00:00 AM
## 250   1/5/2010 12:00:00 AM
## 251   1/6/2010 12:00:00 AM
## 252   1/7/2010 12:00:00 AM
## 253   1/8/2010 12:00:00 AM
## 254   1/9/2010 12:00:00 AM
## 255  1/10/2010 12:00:00 AM
## 256  1/11/2010 12:00:00 AM
## 257  1/12/2010 12:00:00 AM
## 258  1/13/2010 12:00:00 AM
## 259  1/14/2010 12:00:00 AM
## 260  1/15/2010 12:00:00 AM
## 261  1/16/2010 12:00:00 AM
## 262  1/17/2010 12:00:00 AM
## 263  1/18/2010 12:00:00 AM
## 264  1/19/2010 12:00:00 AM
## 265  1/20/2010 12:00:00 AM
## 266  1/21/2010 12:00:00 AM
## 267  1/22/2010 12:00:00 AM
## 268  1/23/2010 12:00:00 AM
## 269  1/24/2010 12:00:00 AM
## 270  1/25/2010 12:00:00 AM
## 271  1/26/2010 12:00:00 AM
## 272  1/27/2010 12:00:00 AM
## 273  1/28/2010 12:00:00 AM
## 274  1/29/2010 12:00:00 AM
## 275  1/30/2010 12:00:00 AM
## 276  1/31/2010 12:00:00 AM
## 277   2/1/2010 12:00:00 AM
## 278   2/2/2010 12:00:00 AM
## 279   2/3/2010 12:00:00 AM
## 280   2/4/2010 12:00:00 AM
## 281   2/5/2010 12:00:00 AM
## 282   2/6/2010 12:00:00 AM
## 283   2/7/2010 12:00:00 AM
## 284   2/8/2010 12:00:00 AM
## 285   2/9/2010 12:00:00 AM
## 286  2/10/2010 12:00:00 AM
## 287  2/11/2010 12:00:00 AM
## 288  2/12/2010 12:00:00 AM
## 289  2/13/2010 12:00:00 AM
## 290  2/14/2010 12:00:00 AM
## 291  2/15/2010 12:00:00 AM
## 292  2/16/2010 12:00:00 AM
## 293  2/17/2010 12:00:00 AM
## 294  2/18/2010 12:00:00 AM
## 295  2/19/2010 12:00:00 AM
## 296  2/20/2010 12:00:00 AM
## 297  2/21/2010 12:00:00 AM
## 298  2/22/2010 12:00:00 AM
## 299  2/23/2010 12:00:00 AM
## 300  2/24/2010 12:00:00 AM
## 301  2/25/2010 12:00:00 AM
## 302  2/26/2010 12:00:00 AM
## 303  2/27/2010 12:00:00 AM
## 304  2/28/2010 12:00:00 AM
## 305   3/1/2010 12:00:00 AM
## 306   3/2/2010 12:00:00 AM
## 307   3/3/2010 12:00:00 AM
## 308   3/4/2010 12:00:00 AM
## 309   3/5/2010 12:00:00 AM
## 310   3/6/2010 12:00:00 AM
## 311   3/7/2010 12:00:00 AM
## 312   3/8/2010 12:00:00 AM
## 313   3/9/2010 12:00:00 AM
## 314  3/10/2010 12:00:00 AM
## 315  3/11/2010 12:00:00 AM
## 316  3/12/2010 12:00:00 AM
## 317  3/13/2010 12:00:00 AM
## 318  3/14/2010 12:00:00 AM
## 319  3/15/2010 12:00:00 AM
## 320  3/16/2010 12:00:00 AM
## 321  3/17/2010 12:00:00 AM
## 322  3/18/2010 12:00:00 AM
## 323  3/19/2010 12:00:00 AM
## 324  3/20/2010 12:00:00 AM
## 325  3/21/2010 12:00:00 AM
## 326  3/22/2010 12:00:00 AM
## 327  3/23/2010 12:00:00 AM
## 328  3/24/2010 12:00:00 AM
## 329  3/25/2010 12:00:00 AM
## 330  3/26/2010 12:00:00 AM
## 331  3/27/2010 12:00:00 AM
## 332  3/28/2010 12:00:00 AM
## 333  3/29/2010 12:00:00 AM
## 334  3/30/2010 12:00:00 AM
## 335  3/31/2010 12:00:00 AM
## 336   4/1/2010 12:00:00 AM
## 337   4/2/2010 12:00:00 AM
## 338   4/3/2010 12:00:00 AM
## 339   4/4/2010 12:00:00 AM
## 340   4/5/2010 12:00:00 AM
## 341   4/6/2010 12:00:00 AM
## 342   4/7/2010 12:00:00 AM
## 343   4/8/2010 12:00:00 AM
## 344   4/9/2010 12:00:00 AM
## 345  4/10/2010 12:00:00 AM
## 346  4/11/2010 12:00:00 AM
## 347  4/12/2010 12:00:00 AM
## 348  4/13/2010 12:00:00 AM
## 349  4/14/2010 12:00:00 AM
## 350  4/15/2010 12:00:00 AM
## 351  4/16/2010 12:00:00 AM
## 352  4/17/2010 12:00:00 AM
## 353  4/18/2010 12:00:00 AM
## 354  4/19/2010 12:00:00 AM
## 355  4/20/2010 12:00:00 AM
## 356  4/21/2010 12:00:00 AM
## 357  4/22/2010 12:00:00 AM
## 358  4/23/2010 12:00:00 AM
## 359  4/24/2010 12:00:00 AM
## 360  4/25/2010 12:00:00 AM
## 361  4/26/2010 12:00:00 AM
## 362  4/27/2010 12:00:00 AM
## 363  4/28/2010 12:00:00 AM
## 364  4/29/2010 12:00:00 AM
## 365  4/30/2010 12:00:00 AM
## 366   5/1/2010 12:00:00 AM
## 367   5/2/2010 12:00:00 AM
## 368   5/3/2010 12:00:00 AM
## 369   5/4/2010 12:00:00 AM
## 370   5/5/2010 12:00:00 AM
## 371   5/6/2010 12:00:00 AM
## 372   5/7/2010 12:00:00 AM
## 373   5/8/2010 12:00:00 AM
## 374   5/9/2010 12:00:00 AM
## 375  5/10/2010 12:00:00 AM
## 376  5/11/2010 12:00:00 AM
## 377  5/12/2010 12:00:00 AM
## 378  5/13/2010 12:00:00 AM
## 379  5/14/2010 12:00:00 AM

atm_data |> summary()

##      DATE               ATM                 Cash        
##  Length:1474        Length:1474        Min.   :    0.0  
##  Class :character   Class :character   1st Qu.:    0.5  
##  Mode  :character   Mode  :character   Median :   73.0  
##                                        Mean   :  155.6  
##                                        3rd Qu.:  114.0  
##                                        Max.   :10920.0  
##                                        NA's   :19

#format the dates properly
atm_data <- atm_data |> 
  mutate(DATE = as_date(mdy_hms(DATE)))

#format as a tsibble
atm_data <- atm_data |>
  as_tsibble(key = ATM, index = DATE)

Check the missing values

atm_data |> filter(is.na(Cash))

## # A tsibble: 19 x 3 [1D]
## # Key:       ATM [3]
##    DATE       ATM     Cash
##    <date>     <chr>  <int>
##  1 2010-05-01 ""        NA
##  2 2010-05-02 ""        NA
##  3 2010-05-03 ""        NA
##  4 2010-05-04 ""        NA
##  5 2010-05-05 ""        NA
##  6 2010-05-06 ""        NA
##  7 2010-05-07 ""        NA
##  8 2010-05-08 ""        NA
##  9 2010-05-09 ""        NA
## 10 2010-05-10 ""        NA
## 11 2010-05-11 ""        NA
## 12 2010-05-12 ""        NA
## 13 2010-05-13 ""        NA
## 14 2010-05-14 ""        NA
## 15 2009-06-13 "ATM1"    NA
## 16 2009-06-16 "ATM1"    NA
## 17 2009-06-22 "ATM1"    NA
## 18 2009-06-18 "ATM2"    NA
## 19 2009-06-24 "ATM2"    NA

atm_data |> filter(is.na(ATM))

## # A tsibble: 0 x 3 [?]
## # Key:       ATM [0]
## # ℹ 3 variables: DATE <date>, ATM <chr>, Cash <int>

atm_data |> filter(is.na(DATE))

## # A tsibble: 0 x 3 [?]
## # Key:       ATM [0]
## # ℹ 3 variables: DATE <date>, ATM <chr>, Cash <int>

#none of these are "NA" - just blank

atm_data |> filter(!ATM %in% c("ATM1", "ATM2", "ATM3", "ATM4"))

## # A tsibble: 14 x 3 [1D]
## # Key:       ATM [1]
##    DATE       ATM    Cash
##    <date>     <chr> <int>
##  1 2010-05-01 ""       NA
##  2 2010-05-02 ""       NA
##  3 2010-05-03 ""       NA
##  4 2010-05-04 ""       NA
##  5 2010-05-05 ""       NA
##  6 2010-05-06 ""       NA
##  7 2010-05-07 ""       NA
##  8 2010-05-08 ""       NA
##  9 2010-05-09 ""       NA
## 10 2010-05-10 ""       NA
## 11 2010-05-11 ""       NA
## 12 2010-05-12 ""       NA
## 13 2010-05-13 ""       NA
## 14 2010-05-14 ""       NA

#it seems like data for ATMs 1 and 2 are missing for the last month of recorded data (May 2010), so I will just exclude those values rather than impute them. I will impute the missing June 2009 values, though.

atm_data |> group_by(ATM) |> summarize(mean(Cash, na.rm = TRUE))

## # A tsibble: 1,474 x 3 [1D]
## # Key:       ATM [5]
##    ATM   DATE       `mean(Cash, na.rm = TRUE)`
##    <chr> <date>                          <dbl>
##  1 ""    2010-05-01                        NaN
##  2 ""    2010-05-02                        NaN
##  3 ""    2010-05-03                        NaN
##  4 ""    2010-05-04                        NaN
##  5 ""    2010-05-05                        NaN
##  6 ""    2010-05-06                        NaN
##  7 ""    2010-05-07                        NaN
##  8 ""    2010-05-08                        NaN
##  9 ""    2010-05-09                        NaN
## 10 ""    2010-05-10                        NaN
## # ℹ 1,464 more rows

atm_data |> group_by(ATM) |> summarize(sum(Cash, na.rm = TRUE))

## # A tsibble: 1,474 x 3 [1D]
## # Key:       ATM [5]
##    ATM   DATE       `sum(Cash, na.rm = TRUE)`
##    <chr> <date>                         <int>
##  1 ""    2010-05-01                         0
##  2 ""    2010-05-02                         0
##  3 ""    2010-05-03                         0
##  4 ""    2010-05-04                         0
##  5 ""    2010-05-05                         0
##  6 ""    2010-05-06                         0
##  7 ""    2010-05-07                         0
##  8 ""    2010-05-08                         0
##  9 ""    2010-05-09                         0
## 10 ""    2010-05-10                         0
## # ℹ 1,464 more rows

Forecasting for ATM3

I started with ATM 3 because it’s the simplest ATM to forecast for.

Most values for ATM 3 are 0 - this ATM may have not existed or been accessible until May 2010. Here, it’s best to go with some sort of naive forecast, since we only have three days’ worth of data, and all three days are relatively similar. I won’t check fit here or traint the data, since it isn’t hugely relevant for a data set that only has three usable data points.

#the three days with withdrawals
atm_3 <- atm_data |> 
  filter (ATM == "ATM3") |>
  filter(Cash > 0)
atm_3

## # A tsibble: 3 x 3 [1D]
## # Key:       ATM [1]
##   DATE       ATM    Cash
##   <date>     <chr> <int>
## 1 2010-04-28 ATM3     96
## 2 2010-04-29 ATM3     82
## 3 2010-04-30 ATM3     85

#all data
atm_3_all <- atm_data |> 
  filter (ATM == "ATM3")


autoplot(atm_3_all)

## Plot variable not specified, automatically selected `.vars = Cash`

Naive forecasting for limited data

Let’s go with the mean (but only for the three available days) and a simple naive forecast.

atm_3_fit <- atm_3_all |> model(NAIVE(Cash))

atm_3_fit <- atm_3_all |>
  model(
    Naive = NAIVE(Cash)
  )

#forecasts for all of May
atm_3_fit |> forecast(h = 31) |> autoplot(atm_3_all, level = NULL)

atm_3_fit |> forecast(h = 31) |> autoplot(atm_3_all)

#let's forecast the mean based on 3 days' data. I'll also add the mean so it's easier 
#to see.
atm_3_fit_mean <- atm_3 |> model(MEAN(Cash),
                                 NAIVE(Cash)
                                 )

atm_3_fit_mean |> forecast(h = 31) |> autoplot(atm_3, level = NULL)

#including the confidence interval
atm_3_fit_mean |> forecast(h = 31) |> autoplot(atm_3)

#create a data frame and exporting as CSV
fc_atm3 <- atm_3_fit_mean |> forecast(h = 31)
#atm3_forecast <- as.data.frame(fc_atm3)

#write.csv(atm3_forecast, file="atm3forecast.csv", row.names=TRUE)

More data cleaning

Moving on, let’s get rid of the days in May we have no data for.

atm_data_clean <- atm_data |> 
  filter(ATM %in% c("ATM1", "ATM2", "ATM3", "ATM4"))

#let's plot and look for outliers

#values drop dramatically roughly every 7th day, presumably for a weekend 
atm_data_clean |> 
  filter(ATM == "ATM1") |> 
  autoplot() +
  labs(title = "ATM 1")

## Plot variable not specified, automatically selected `.vars = Cash`

atm1 <- atm_data_clean |> 
  filter(ATM == "ATM1")



#here they tend to drop on days 6 and 7 - there's also a weekly pattern
atm_data_clean |> filter(ATM == "ATM2") |> autoplot() +
  labs(title = "ATM 2")

## Plot variable not specified, automatically selected `.vars = Cash`

#there's clearly an outlier here
atm_data_clean |> 
  filter(ATM == "ATM4") |> 
  autoplot() +
  labs(title = "ATM 4")

## Plot variable not specified, automatically selected `.vars = Cash`

#let's learn more
atm_data_clean |> 
  filter(ATM == "ATM4") |> 
  arrange(desc(Cash))

## Warning: Current temporal ordering may yield unexpected results.
## ℹ Suggest to sort by `ATM`, `DATE` first.

## # A tsibble: 365 x 3 [1D]
## # Key:       ATM [1]
##    DATE       ATM    Cash
##    <date>     <chr> <int>
##  1 2010-02-09 ATM4  10920
##  2 2009-09-22 ATM4   1712
##  3 2010-01-29 ATM4   1575
##  4 2009-09-04 ATM4   1495
##  5 2009-06-09 ATM4   1484
##  6 2009-09-05 ATM4   1301
##  7 2010-02-28 ATM4   1276
##  8 2009-08-23 ATM4   1246
##  9 2009-06-14 ATM4   1221
## 10 2009-09-29 ATM4   1195
## # ℹ 355 more rows

ATM 4 - fixing outliers

My best guess is that, in data entry, someone added a zero by accident. Many of the other large values are in the 1000 + range.

If I were to use some other method, like calculating the mean for a similar day, I don’t think it would be that different from removing a zero. I will go ahead and replace this value, 10920, with 1092.

#10920 with 1092

atm4 <- atm_data_clean |> 
  filter(ATM == "ATM4")

atm4 <- atm4 |> 
  mutate(Cash = case_when(
    Cash == 10920 ~ 1092, 
    TRUE ~ Cash
  ))

autoplot(atm4)

## Plot variable not specified, automatically selected `.vars = Cash`

Now we can more clearly see the ups and downs. All of these plots seem to have some sort of weekly pattern. Let’s see how strong it is.

dcmp <- atm4 |>
  model(stl = STL(Cash ~ season(period = 7))) |>
  components()


autoplot(dcmp)

#if I don't pick 7 days, STL decomp seems to choose a weekly pattern: 
dcmp <- atm4 |>
    model(STL(Cash)) |>
    components()

autoplot(dcmp)

#Using ETS

fit <- atm4 |>
    model(ets = ETS(Cash))

report(fit)

## Series: Cash 
## Model: ETS(A,N,A) 
##   Smoothing parameters:
##     alpha = 0.000100039 
##     gamma = 0.0001001906 
## 
##   Initial states:
##      l[0]      s[0]     s[-1]    s[-2]    s[-3]    s[-4]    s[-5]    s[-6]
##  445.8873 -272.9881 -35.41945 18.60843 39.49149 88.67104 47.84768 113.7889
## 
##   sigma^2:  111656.2
## 
##      AIC     AICc      BIC 
## 6406.810 6407.432 6445.809

#chooses ANA - additive error, no trend (we can see from the plot that 
#there's no trend). The seasonality does look additive rather than multiplicitive.

Forecasting ATM4

What works here? The data has a 7-day seasonality and no apparent overall trend, so we have a few options.

Let’s try ETS (ANA model), which is suggested above. We’ll also try the suggested ARIMA model, and seasonal naive model, which seems appropriate for data that follows a simple weekly trend. We can also try the Holt-Winters additive model (AAA) for seasonal data to see – though this data doesn’t seem to have a strong trend.

#use 9 months to train and 3 months to test
myseries_train <- atm4 |>
  filter(DATE < as_date("2010-02-01"))

fit <- myseries_train |>
  model(ana = ETS(Cash ~ error("A") + trend("N") + season("A")))

fit |> 
  forecast(h = "3 months") |> 
  autoplot(atm4)

#to find RMSE
fit |> accuracy()

## # A tibble: 1 × 11
##   ATM   .model .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr> <chr>  <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM4  ana    Training -9.04  321.  249. -410.  439. 0.715 0.704 0.114

It doesn’t look great – the RMSE is really high at 345. Let’s try Holt-Winters.

fit2 <- myseries_train |>
  model(hw = ETS(Cash ~ error("A") + trend("A") + season("A")))

fit2 |> 
  forecast(h = "3 months") |> 
  autoplot(atm4)

#to find the fit
fit2 |> accuracy()

## # A tibble: 1 × 11
##   ATM   .model .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr> <chr>  <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 ATM4  hw     Training -2.55  324.  256. -384.  431. 0.737 0.711 0.0764

slightly better RMSE - 324 (and that wasn’t even the suggested model). Next, we’ll try seasonal naive.

fit_snaive <- myseries_train |>
  model(snaive = SNAIVE(Cash))

fc_snaive <- fit_snaive |>
  forecast(h = "3 months")

fc_snaive |>
  autoplot(atm4)

fit_snaive |>
  accuracy()

## # A tibble: 1 × 11
##   ATM   .model .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr> <chr>  <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 ATM4  snaive Training  4.72  456.  348. -372.  428.     1     1 0.0566

The RMSE is much higher at 456. Finally, let’s try an ARIMA model.

atm4 |> features(Cash, features = guerrero)

## # A tibble: 1 × 2
##   ATM   lambda_guerrero
##   <chr>           <dbl>
## 1 ATM4            0.464

#data is stationary
atm4 |>
    mutate(diff_data = box_cox(Cash, 0.464) ) |>
    features(diff_data, unitroot_kpss)

## # A tibble: 1 × 3
##   ATM   kpss_stat kpss_pvalue
##   <chr>     <dbl>       <dbl>
## 1 ATM4     0.0737         0.1

atm4 |>
  features(Cash, unitroot_nsdiffs)

## # A tibble: 1 × 2
##   ATM   nsdiffs
##   <chr>   <int>
## 1 ATM4        0

atm4 <- atm4 |> as_tsibble(key = ATM, index = DATE)

atm4 |>
  model(auto_arima = ARIMA(box_cox(Cash, 0.464))) |>
  report()

## Series: Cash 
## Model: ARIMA(0,0,0)(2,0,0)[7] w/ mean 
## Transformation: box_cox(Cash, 0.464) 
## 
## Coefficients:
##         sar1    sar2  constant
##       0.2134  0.1813   18.4929
## s.e.  0.0518  0.0522    0.7413
## 
## sigma^2 estimated as 209.5:  log likelihood=-1492.31
## AIC=2992.62   AICc=2992.73   BIC=3008.22

#it's the same without stwpwise (it just takes longer)
atm4 |>
  model(auto_arima = ARIMA(box_cox(Cash, 0.464), stepwise = FALSE)) |>
  report()

## Series: Cash 
## Model: ARIMA(0,0,0)(2,0,0)[7] w/ mean 
## Transformation: box_cox(Cash, 0.464) 
## 
## Coefficients:
##         sar1    sar2  constant
##       0.2134  0.1813   18.4929
## s.e.  0.0518  0.0522    0.7413
## 
## sigma^2 estimated as 209.5:  log likelihood=-1492.31
## AIC=2992.62   AICc=2992.73   BIC=3008.22

#check the fit 

fit_a <- atm4 |> 
  model(ARIMA(box_cox(Cash, 0.464) ~ pdq(0,0,0) + PDQ(2,0,0)))
gg_tsresiduals(fit_a)

## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#the distribution is a little strange (skewed right), but everything else looks fine

#ACF and PACF reflect the 7-day trend

#ACF 
atm4 |> ACF(
  box_cox(Cash, 0.464)
  ) |> autoplot()

#PACF 
atm4 |> PACF(
  box_cox(Cash, 0.464)
  ) |> autoplot()

fit_sarima <- myseries_train |>
  model(
    arima = ARIMA(box_cox(Cash, 0.464) ~ pdq(0,0,0) + PDQ(2,0,0))
  )

#changed this to 88 days because I was getting an error below for not having data on May 1-2
fc <- fit_sarima |> 
  forecast(h = "88 days")

fc |> autoplot(atm4)

#check fit
fc |> 
  accuracy(atm4)

## # A tibble: 1 × 11
##   .model ATM   .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>  <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 arima  ATM4  Test  -30.8  318.  267. -630.  658. 0.767 0.698 -0.0165

RMSE is the lowest of all models at 318.

Plot all the forecasts together

fit_all <- atm4 |>
  model(
    arima = ARIMA(box_cox(Cash, 0.464) ~ pdq(0,0,0) + PDQ(2,0,0)),
    snaive = SNAIVE(Cash),
    hw_ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
    ana_ets = ETS(Cash ~ error("A") + trend("N") + season("A"))
  )

fc_all <- fit_all |> 
  forecast(h = "1 month")


#with and without confidence intervals
fc_all |> 
  autoplot(atm4)

fc_all |> 
  autoplot(atm4, level = NULL)

#Facet-wrapped
fc_all |> 
  autoplot(atm4) +
  facet_wrap(~ .model, ncol = 1)

#create a data frame and write the CSV
atm4_forecast <- as.data.frame(fc_all)

#write.csv(atm4_forecast, file="atm4forecast.csv", row.names=TRUE)

While ARIMA had the lowest RMSE, all of these models really reflect the last (lower than usual) week, and don’t incorporate the variability between weeks. Overall, I would pick ARIMA.

ATM 1

Let’s start by filling in missing values using KNN. This is a small data set, so it’s a good fit for simple KNN (VIM package). I won’t use mean imputation because there is a weekly seasonality to the data – I would like the numbers to reflect the trend for the day of the week.

library(VIM)

## Warning: package 'VIM' was built under R version 4.5.3

## Loading required package: colorspace

## Warning: package 'colorspace' was built under R version 4.5.3

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

atm1 <- kNN(atm1, variable = "Cash", k = 5)

atm1_imputed <- atm1 |> select(-Cash_imp)

atm1_imputed <- atm1_imputed |> as_tsibble(key = ATM, index = DATE)

dcmp_1 <- atm1_imputed |>
  model(stl = STL(Cash ~ season(period = 7))) |>
  components()


autoplot(atm1_imputed)

## Plot variable not specified, automatically selected `.vars = Cash`

# Plot the seasonal decomp for atm1_imputed
autoplot(dcmp_1)

There’s one week/set of weeks in March that both ATM4 and 1 are showing a squeeze for.

Looking at the overall plot, the data may be somewhat cyclical – or it could be a second seasonal trend. We can’t tell because we only have on year of data, but from May - December, withdrawals go up and then down. The same happens from mid-December to the end of April.

Again, I will try ARIMA, ETS (recommended model along with both Holt-Winters models), and a seasonal naive approach.

Seasonal naive

This is simple weekly seasonal data. When I looked at the numbers, ATM 1 seemed to have a distinct weekly trend with numbers dropping every seventh day.

#use 9 months to train and 3 months to test
myseries_train <- atm1_imputed |>
  filter(DATE < as_date("2010-02-01"))

fit_snaive_1 <- myseries_train |>
  model(snaive = SNAIVE(Cash))

fc_snaive_1 <- fit_snaive_1 |>
  forecast(h = "3 months")

#confidence interval
fc_snaive_1 |>
  autoplot(atm1_imputed)

#no confidence interval
fc_snaive_1 |>
  autoplot(atm1_imputed, level = NULL)

#the plot goes a little higher than the acutal data

fit_snaive_1 |>
  accuracy()

## # A tibble: 1 × 11
##   ATM   .model .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr> <chr>  <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1  snaive Training 0.223  27.3  17.6 -87.5  107.     1     1 0.123

#RMSE is 27.3

ETS

ANA is the suggested model. I’m going to try both Holt-Winters models because it allows for a trend and a pattern (data is sort of trending downward, as discussed above, this may be a seasonal pattern we can’t identify yet). I’m including HW multiplicative because, as we can see in the decomposition plot, big peaks tend to be associated with big drops.

fit_atm1 <- atm1_imputed |>
    model(ets = ETS(Cash))

report(fit_atm1)

## Series: Cash 
## Model: ETS(A,N,A) 
##   Smoothing parameters:
##     alpha = 0.01625932 
##     gamma = 0.3283959 
## 
##   Initial states:
##      l[0]      s[0]     s[-1]    s[-2]    s[-3]   s[-4]    s[-5]    s[-6]
##  76.88695 -59.09764 -6.012421 12.29652 9.634747 18.0129 9.612253 15.55365
## 
##   sigma^2:  579.2506
## 
##      AIC     AICc      BIC 
## 4486.383 4487.005 4525.382

#ANA

fit_1_ets <- myseries_train |>
  model(ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
        hw_ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
        hwm_ets = ETS(Cash ~ error("M") + trend("A") + season("M")),
        
        )

fit_1_ets |> 
  forecast(h = "3 months") |> 
  autoplot(atm1_imputed)

#all plots separately 
fit_1_ets |> 
    forecast(h = "3 months") |> 
    autoplot(atm1_imputed) +
    facet_wrap(~ .model, ncol = 1, scales = "free_y")

#to find RMSE and other fit metrics
fit_1_ets |> accuracy()

## # A tibble: 3 × 11
##   ATM   .model  .type         ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr> <chr>   <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 ATM1  ana     Training  0.0692  20.0  12.9 -75.0  92.1 0.734 0.732 0.0906
## 2 ATM1  hw_ets  Training -0.106   20.0  12.9 -76.1  92.3 0.735 0.732 0.0862
## 3 ATM1  hwm_ets Training -1.12    19.7  12.3 -84.3  95.2 0.700 0.722 0.0404

These are all lower than the seasonal naive model, but multiplicative HW is the lowest.

ARIMA

atm1_imputed |> features(Cash, features = guerrero)

## # A tibble: 1 × 2
##   ATM   lambda_guerrero
##   <chr>           <dbl>
## 1 ATM1            0.244

#.241 is pretty low, so we should use it

#data is stationary (though the value is pretty close to .05)
atm1_imputed |>
  features(box_cox(Cash, 0.241), unitroot_kpss)

## # A tibble: 1 × 3
##   ATM   kpss_stat kpss_pvalue
##   <chr>     <dbl>       <dbl>
## 1 ATM1      0.418      0.0695

#calling for one difference
atm1_imputed |>
  features(box_cox(Cash, 0.241), unitroot_nsdiffs)

## # A tibble: 1 × 2
##   ATM   nsdiffs
##   <chr>   <int>
## 1 ATM1        1

#difference 
atm1_imputed |>
    features(box_cox(Cash, 0.241) |> difference(7), unitroot_nsdiffs)

## # A tibble: 1 × 2
##   ATM   nsdiffs
##   <chr>   <int>
## 1 ATM1        0

atm1_imputed |> autoplot((box_cox(Cash, 0.241)) |>
                           difference(7))

## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).

#auto-ARIMA
atm1_imputed |>
  model(auto_arima = ARIMA(box_cox(Cash, 0.241), stepwise = FALSE)) |>
  report()

## Series: Cash 
## Model: ARIMA(0,0,2)(0,1,1)[7] 
## Transformation: box_cox(Cash, 0.241) 
## 
## Coefficients:
##          ma1      ma2     sma1
##       0.0984  -0.1144  -0.6460
## s.e.  0.0524   0.0527   0.0426
## 
## sigma^2 estimated as 1.539:  log likelihood=-585.51
## AIC=1179.01   AICc=1179.13   BIC=1194.54

atm1_imputed |>
  model(auto_arima = ARIMA(box_cox(Cash, 0.241))) |>
  report()

## Series: Cash 
## Model: ARIMA(0,0,2)(0,1,1)[7] 
## Transformation: box_cox(Cash, 0.241) 
## 
## Coefficients:
##          ma1      ma2     sma1
##       0.0984  -0.1144  -0.6460
## s.e.  0.0524   0.0527   0.0426
## 
## sigma^2 estimated as 1.539:  log likelihood=-585.51
## AIC=1179.01   AICc=1179.13   BIC=1194.54

#(0,0,2)(0,1,1)

#ACF and PACF
atm1_imputed |>
  gg_tsdisplay(difference(box_cox(Cash, 0.241),7), plot_type = 'partial')

## Warning: `gg_tsdisplay()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsdisplay()` instead.
## Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).

#ACF shows one big spike an a quick taper, so we can try an 000 011 model
#and skip the first 2


#residuals look normal 

fit_a_1 <- atm1_imputed |> 
  model(ARIMA(box_cox(Cash, 0.241) ~ pdq(0,0,2) + PDQ(0,1,1)))
gg_tsresiduals(fit_a_1)

#residuals also look normal for this model, though there are a couple of lags outside the lines

fit_a_1_b <- atm1_imputed |> model(ARIMA(box_cox(Cash, .241) ~ pdq(0,0,0) + PDQ(0,1,1)))

gg_tsresiduals(fit_a_1_b)

fit_sarima <- atm1_imputed |>
  model(
    auto_sarima = ARIMA(box_cox(Cash, 0.241) ~ pdq(0,0,2) + PDQ(0,1,1)),
    sarima000011 = ARIMA(box_cox(Cash, 0.241) ~ pdq(0,0,0) + PDQ(0,1,1))
  )

#forecasting for fit
fc <- fit_sarima |> 
  forecast(h = "3 months")

fc |> autoplot(atm1_imputed)

#facet-wrap
fc |> autoplot(atm1_imputed) +
     facet_wrap(~ .model, ncol = 1)

#remove confidence intervals
fc |> autoplot(atm1_imputed, level = NULL) +
     facet_wrap(~ .model, ncol = 1)

#check RMSE
fit_sarima |> accuracy()

## # A tibble: 2 × 11
##   ATM   .model       .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr> <chr>        <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 ATM1  auto_sarima  Training  2.36  25.0  16.1 -87.8  107. 0.906 0.901 0.0116
## 2 ATM1  sarima000011 Training  2.41  25.0  15.8 -91.4  110. 0.890 0.899 0.123

RMSE is a little higher at 25 (for both). With the other residuals, the model I chose, 001011, slightly edges out the auto-ARIMA’ model.

Forecasts

Here are one-month forecasts for all the models. Overall, my pick is Holt-Winters’ multiplicative.

fit_all <- atm1_imputed |>
  model(
    ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
    hw_ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
    hwm_ets = ETS(Cash ~ error("M") + trend("A") + season("M")),
    snaive = SNAIVE(Cash),
    auto_sarima = ARIMA(box_cox(Cash, 0.241) ~ pdq(0,0,2) + PDQ(0,1,1)),
    sarima000011 = ARIMA(box_cox(Cash, 0.241) ~ pdq(0,0,0) + PDQ(0,1,1))
  )

fc_all <- fit_all |> 
  forecast(h = "1 month")

#all forecasts together 
fc_all |> 
  autoplot(atm1_imputed) + 
  labs(title = "All forecasts, ATM 1")

fc_all |> 
  autoplot(atm1_imputed, level = NULL) + 
  labs(title = "All forecasts, ATM 1 (no CI)")

fc_all |> 
  autoplot(atm1_imputed) +
  facet_wrap(~ .model, ncol = 1)

#breaking these up 

fc_all |> 
  filter(.model %in% c("ana", "hw_ets", "hwm_ets")) |> 
  autoplot(atm1_imputed) +
  facet_wrap(~ .model, ncol = 1) + 
  labs(title = "ETS")

fc_all |> 
  filter(.model %in% c("auto_sarima", "sarima001011")) |> 
  autoplot(atm1_imputed) +
  facet_wrap(~ .model, ncol = 1)+ 
  labs(title = "ARIMA")

fc_all |> 
  filter(.model == "snaive") |> 
  autoplot(atm1_imputed) + 
  labs(title = "SNAIVE")

#creating CSV
atm1_forecast <- as.data.frame(fc_all)

#write.csv(atm1_forecast, file="atm1forecast.csv", row.names=TRUE)

ATM 2

On second look, this data has a little bit of trend – it seems to be slowly trending downward as the year goes by. Weekly fluctuations still apply.

atm_data_clean |> filter(ATM == "ATM2") |> autoplot() +
  labs(title = "ATM 2")

## Plot variable not specified, automatically selected `.vars = Cash`

atm2 <- atm_data_clean |> filter(ATM == "ATM2")

#this has 2 NA values. Let's use KNN again. 

atm2 <- kNN(atm2, variable = "Cash", k = 5)

atm2_imputed <- atm2 |> select(-Cash_imp)

atm2_imputed <- atm2_imputed |> as_tsibble(key = ATM, index = DATE)


#decompose the data 

dcmp_2 <- atm2_imputed |>
  model(stl = STL(Cash ~ season(period = 7))) |>
  components()

# Plot the seasonal decomp for atm1_imputed
autoplot(dcmp_2)

We can see the trend a little more clearly. We can also see that February/March dip/contraction here (this may have been due to bad weather or a holiday week).

Approaches

seasonal naive to capture the weekly seasonal trend
ARIMA (suggested model + anything else that looks plausible based on graphs
ETS (recommended model + Holt-Winters and Holt-Winters multiplicative, because it seems like the seasonal effects rise with the mean)

### ETS

fit_atm2 <- atm2_imputed |>
    model(ets = ETS(Cash))

report(fit_atm2)

## Series: Cash 
## Model: ETS(A,N,A) 
##   Smoothing parameters:
##     alpha = 0.0001000463 
##     gamma = 0.3546428 
## 
##   Initial states:
##      l[0]      s[0]     s[-1]   s[-2]     s[-3]    s[-4]   s[-5]    s[-6]
##  72.11628 -38.36075 -25.27067 18.3668 -20.00746 16.45155 23.7113 25.10924
## 
##   sigma^2:  661.4986
## 
##      AIC     AICc      BIC 
## 4534.845 4535.467 4573.844

#ANA 


#use 9 months to train and 3 months to test
myseries_train <- atm2_imputed |>
  filter(DATE < as_date("2010-02-01"))

fit_2_ets <- myseries_train |>
  model(ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
        hw_ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
        hwm_ets = ETS(Cash ~ error("M") + trend("A") + season("M")),
        
        )

fit_2_ets |> 
  forecast(h = "3 months") |> 
  autoplot(atm2_imputed)

#all plots separately 
fit_2_ets |> 
    forecast(h = "3 months") |> 
    autoplot(atm2_imputed) +
    facet_wrap(~ .model, ncol = 1, scales = "free_y")

#to find RMSE
fit_2_ets |> accuracy()

## # A tibble: 3 × 11
##   ATM   .model  .type        ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr> <chr>   <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 ATM2  ana     Training -2.00   22.5  15.7  -Inf   Inf 0.725 0.715 0.0175
## 2 ATM2  hw_ets  Training  0.293  22.2  15.3  -Inf   Inf 0.706 0.708 0.0181
## 3 ATM2  hwm_ets Training  0.612  24.7  17.7  -Inf   Inf 0.816 0.784 0.117

RMSE is lowest for Holt-Winters (non-multiplicative).

Seasonal Naive

fit_snaive_2 <- myseries_train |>
  model(snaive = SNAIVE(Cash))

fc_snaive_2 <- fit_snaive_2 |>
  forecast(h = "3 months")

fc_snaive_2 |>
  autoplot(atm2_imputed)

fit_snaive_2 |>
  accuracy()

## # A tibble: 1 × 11
##   ATM   .model .type        ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr> <chr>  <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 ATM2  snaive Training -0.312  31.4  21.7  -Inf   Inf     1     1 0.0517

RMSE is 31, which is higher than the other models.

ARIMA

atm2_imputed |> features(Cash, features = guerrero)

## # A tibble: 1 × 2
##   ATM   lambda_guerrero
##   <chr>           <dbl>
## 1 ATM2            0.741

#.741 is pretty close to 1 - we may not need it. Let's see if the data is stationary. 

#data is not stationary
atm2_imputed |>
  features(Cash, unitroot_kpss)

## # A tibble: 1 × 3
##   ATM   kpss_stat kpss_pvalue
##   <chr>     <dbl>       <dbl>
## 1 ATM2       2.20        0.01

#calling for one difference
atm2_imputed |>
  features(Cash, unitroot_nsdiffs)

## # A tibble: 1 × 2
##   ATM   nsdiffs
##   <chr>   <int>
## 1 ATM2        1

#let's take a daily difference 
atm2_imputed |>
  features(difference(Cash), unitroot_kpss)

## # A tibble: 1 × 3
##   ATM   kpss_stat kpss_pvalue
##   <chr>     <dbl>       <dbl>
## 1 ATM2     0.0147         0.1

#data is stationary

#still calling for 1 difference
atm2_imputed |>
  features(difference(Cash), unitroot_nsdiffs)

## # A tibble: 1 × 2
##   ATM   nsdiffs
##   <chr>   <int>
## 1 ATM2        1

#it's still calling for another difference after adding the box-cox
atm2_imputed |>
  features(box_cox(Cash, 0.741), unitroot_nsdiffs)

## # A tibble: 1 × 2
##   ATM   nsdiffs
##   <chr>   <int>
## 1 ATM2        1

#weekly difference 
atm2_imputed |>
  features(difference(Cash, 7), unitroot_kpss)

## # A tibble: 1 × 3
##   ATM   kpss_stat kpss_pvalue
##   <chr>     <dbl>       <dbl>
## 1 ATM2     0.0171         0.1

#this gives us zero, so this is the correct difference
atm2_imputed |>
  features(difference(Cash, 7), unitroot_nsdiffs)

## # A tibble: 1 × 2
##   ATM   nsdiffs
##   <chr>   <int>
## 1 ATM2        0

#plot it

atm2_imputed |> autoplot(difference(Cash))

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

atm2_imputed |> autoplot(difference(Cash,7))

## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).

atm2_imputed |>
  model(auto_arima = ARIMA(Cash)) |>
  report()

## Series: Cash 
## Model: ARIMA(2,0,2)(0,1,1)[7] 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2     sma1
##       -0.4322  -0.9172  0.4729  0.8135  -0.7571
## s.e.   0.0550   0.0398  0.0860  0.0546   0.0382
## 
## sigma^2 estimated as 620.1:  log likelihood=-1658.84
## AIC=3329.68   AICc=3329.92   BIC=3352.96

#this is calling for a seasonal model (2,0,2)(0,1,1)


#check the fit 

fit_a_2 <- atm2_imputed |> model(ARIMA(Cash ~ pdq(2,0,2) + PDQ(0,1,1)))
report(fit_a_2)

## Series: Cash 
## Model: ARIMA(2,0,2)(0,1,1)[7] 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2     sma1
##       -0.4322  -0.9172  0.4729  0.8135  -0.7571
## s.e.   0.0550   0.0398  0.0860  0.0546   0.0382
## 
## sigma^2 estimated as 620.1:  log likelihood=-1658.84
## AIC=3329.68   AICc=3329.92   BIC=3352.96

gg_tsresiduals(fit_a_2)

#the distribution is a little strange (skewed right), but everything else looks fine

#ACF and PACF reflect the 7-day trend, but it's also showing correlations in other days

#ACF 
atm2_imputed |> ACF(
  (Cash)
  ) |> autoplot()

#PACF 
atm2_imputed |> PACF(
  (Cash)
  ) |> autoplot()

#one big lag at 7 and quite a few lags outside
atm2_imputed |> 
  gg_tsdisplay(difference(Cash, 7), plot_type = "partial")

## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).

#I will also try a seasonal-only model (0,0,0)(0,1,1), since there's no regular 
#differencing and  the plot seems to tail off gradually (slow ACF decay), as does
#the PACF -- they look similar
#and we'll try with a non-seasonal difference as well 

#looks normal, some lags outside the lines
fit_a_2_b <- atm2_imputed |> model(ARIMA(Cash ~ pdq(0,0,0) + PDQ(0,1,1)))
gg_tsresiduals(fit_a_2_b)

#one big and one small lag outside the lines
fit_a_2_c <- atm2_imputed |> model(ARIMA(Cash ~ pdq(0,1,0) + PDQ(0,1,1)))
gg_tsresiduals(fit_a_2_c)

fit_sarima <- myseries_train |>
  model(
    auto_sarima = ARIMA(Cash ~ pdq(2,0,2) + PDQ(0,1,1)),
    sarima001011 = ARIMA(Cash ~ pdq(0,0,0) + PDQ(0,1,1)),
    sarima010011 = ARIMA(Cash ~ pdq(0,1,0) + PDQ(0,1,1))
  )

#residuals

#looking pretty normal 
fit_sarima |> 
  select(auto_sarima) |> 
  gg_tsresiduals()

#similarly normal
fit_sarima |> 
  select(sarima001011) |> 
  gg_tsresiduals()

#one big spike at lag one (this is also the worst fit of the three, so it's not surprising)
fit_sarima |> 
  select(sarima010011) |> 
  gg_tsresiduals()

#forecasting for fit
fc <- fit_sarima |> 
  forecast(h = "3 months")

fc |> autoplot(atm2_imputed)

#facet-wrap
fc |> autoplot(atm2_imputed) +
     facet_wrap(~ .model, ncol = 1)

#remove confidence intervals
fc |> autoplot(atm2_imputed, level = NULL) +
     facet_wrap(~ .model, ncol = 1)

#check RMSE and other fit metrics
fc |> 
  accuracy(atm2_imputed)

## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 2 observations are missing between 2010-05-01 and 2010-05-02

## # A tibble: 3 × 11
##   .model       ATM   .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE      ACF1
##   <chr>        <chr> <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>     <dbl>
## 1 auto_sarima  ATM2  Test    7.28  60.2  51.7  -Inf   Inf  2.38  1.92  0.000738
## 2 sarima001011 ATM2  Test    7.23  60.2  51.8  -Inf   Inf  2.38  1.92  0.000938
## 3 sarima010011 ATM2  Test  -16.8   62.2  53.2  -Inf   Inf  2.45  1.98 -0.00707

These are all pretty close, with auto-SARIMA as the lowest.

We can also see that the sarima010011 provides a really wide confidence interval, probably due to the way the difference was applied.

Plot the one-month forecasts

fit_all <- atm2_imputed |>
  model(
    ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
    hw_ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
    hwm_ets = ETS(Cash ~ error("M") + trend("A") + season("M")),
    snaive = SNAIVE(Cash),
    auto_sarima = ARIMA(Cash ~ pdq(2,0,2) + PDQ(0,1,1)),
    sarima001011 = ARIMA(Cash ~ pdq(0,0,0) + PDQ(0,1,1)),
    sarima010011 = ARIMA(Cash ~ pdq(0,1,0) + PDQ(0,1,1))
  )

fc_all <- fit_all |> 
  forecast(h = "1 month")

fc_all |> 
  autoplot(atm2_imputed)

fc_all |> 
  autoplot(atm2_imputed, level = NULL)

fc_all |> 
  autoplot(atm2_imputed) +
  facet_wrap(~ .model, ncol = 1)

#breaking these up so they are easier to see

fc_all |> 
  filter(.model %in% c("ana", "hw_ets", "hwm_ets")) |> 
  autoplot(atm2_imputed) +
  facet_wrap(~ .model, ncol = 1) + 
  labs(title = "ETS")

fc_all |> 
  filter(.model %in% c("auto_sarima", "sarima001011", "sarima010011")) |> 
  autoplot(atm2_imputed) +
  facet_wrap(~ .model, ncol = 1)+ 
  labs(title = "ARIMA")

fc_all |> 
  filter(.model == "snaive") |> 
  autoplot(atm2_imputed) + 
  labs(title = "SNAIVE")

#CSV forecasts

atm2_forecast <- as.data.frame(fc_all)

#write.csv(atm2_forecast, file="atm2forecast.csv", row.names=TRUE)

Overall, the RMSE was lowest for Holt-Winters (non-multiplicative). Based on these plots, ANA and Holt_Winters look pretty good. Seasonal naive doesn’t seem like a bad model, given the downward trend.

Project 1 - data 624

Samantha Barbaro

2026-03-22

Project 1 - Part A, ATM data

Loading the dataframe

Cleaning up formatting and looking for null values

Check the missing values

Forecasting for ATM3

Naive forecasting for limited data

More data cleaning

ATM 4 - fixing outliers

Forecasting ATM4

Plot all the forecasts together

ATM 1

Seasonal naive

ETS

ARIMA

Forecasts

ATM 2

Seasonal Naive

ARIMA

Plot the one-month forecasts