library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr 1.1.4 âś” readr 2.1.6
## âś” forcats 1.0.1 âś” stringr 1.5.2
## âś” ggplot2 4.0.1 âś” tibble 3.3.1
## âś” lubridate 1.9.4 âś” tidyr 1.3.1
## âś” purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.5.2
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## âś” tsibble 1.1.6 âś” feasts 0.4.2
## âś” tsibbledata 0.4.1 âś” fable 0.5.0
## Warning: package 'tsibble' was built under R version 4.5.2
## Warning: package 'tsibbledata' was built under R version 4.5.2
## Warning: package 'feasts' was built under R version 4.5.2
## Warning: package 'fabletools' was built under R version 4.5.2
## Warning: package 'fable' was built under R version 4.5.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## âś– lubridate::date() masks base::date()
## âś– dplyr::filter() masks stats::filter()
## âś– tsibble::intersect() masks base::intersect()
## âś– tsibble::interval() masks lubridate::interval()
## âś– dplyr::lag() masks stats::lag()
## âś– tsibble::setdiff() masks base::setdiff()
## âś– tsibble::union() masks base::union()
I converted the file to a .csv and uploaded it to github.
atm_data <- read.table("https://raw.githubusercontent.com/samanthabarbaro/data624/refs/heads/main/ATM624Data.csv", sep = ",", header = TRUE)
#how did the variables import?
glimpse(atm_data)
## Rows: 1,474
## Columns: 3
## $ DATE <chr> "5/1/2009 12:00:00 AM", "5/1/2009 12:00:00 AM", "5/2/2009 12:00:0…
## $ ATM <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ Cash <int> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …
The dates imported formatted as characters. There’s a 12:00:00 a.m. timestamp for every instance, which we can remove to simplify things.
atm_data |> distinct(DATE)
## DATE
## 1 5/1/2009 12:00:00 AM
## 2 5/2/2009 12:00:00 AM
## 3 5/3/2009 12:00:00 AM
## 4 5/4/2009 12:00:00 AM
## 5 5/5/2009 12:00:00 AM
## 6 5/6/2009 12:00:00 AM
## 7 5/7/2009 12:00:00 AM
## 8 5/8/2009 12:00:00 AM
## 9 5/9/2009 12:00:00 AM
## 10 5/10/2009 12:00:00 AM
## 11 5/11/2009 12:00:00 AM
## 12 5/12/2009 12:00:00 AM
## 13 5/13/2009 12:00:00 AM
## 14 5/14/2009 12:00:00 AM
## 15 5/15/2009 12:00:00 AM
## 16 5/16/2009 12:00:00 AM
## 17 5/17/2009 12:00:00 AM
## 18 5/18/2009 12:00:00 AM
## 19 5/19/2009 12:00:00 AM
## 20 5/20/2009 12:00:00 AM
## 21 5/21/2009 12:00:00 AM
## 22 5/22/2009 12:00:00 AM
## 23 5/23/2009 12:00:00 AM
## 24 5/24/2009 12:00:00 AM
## 25 5/25/2009 12:00:00 AM
## 26 5/26/2009 12:00:00 AM
## 27 5/27/2009 12:00:00 AM
## 28 5/28/2009 12:00:00 AM
## 29 5/29/2009 12:00:00 AM
## 30 5/30/2009 12:00:00 AM
## 31 5/31/2009 12:00:00 AM
## 32 6/1/2009 12:00:00 AM
## 33 6/2/2009 12:00:00 AM
## 34 6/3/2009 12:00:00 AM
## 35 6/4/2009 12:00:00 AM
## 36 6/5/2009 12:00:00 AM
## 37 6/6/2009 12:00:00 AM
## 38 6/7/2009 12:00:00 AM
## 39 6/8/2009 12:00:00 AM
## 40 6/9/2009 12:00:00 AM
## 41 6/10/2009 12:00:00 AM
## 42 6/11/2009 12:00:00 AM
## 43 6/12/2009 12:00:00 AM
## 44 6/13/2009 12:00:00 AM
## 45 6/14/2009 12:00:00 AM
## 46 6/15/2009 12:00:00 AM
## 47 6/16/2009 12:00:00 AM
## 48 6/17/2009 12:00:00 AM
## 49 6/18/2009 12:00:00 AM
## 50 6/19/2009 12:00:00 AM
## 51 6/20/2009 12:00:00 AM
## 52 6/21/2009 12:00:00 AM
## 53 6/22/2009 12:00:00 AM
## 54 6/23/2009 12:00:00 AM
## 55 6/24/2009 12:00:00 AM
## 56 6/25/2009 12:00:00 AM
## 57 6/26/2009 12:00:00 AM
## 58 6/27/2009 12:00:00 AM
## 59 6/28/2009 12:00:00 AM
## 60 6/29/2009 12:00:00 AM
## 61 6/30/2009 12:00:00 AM
## 62 7/1/2009 12:00:00 AM
## 63 7/2/2009 12:00:00 AM
## 64 7/3/2009 12:00:00 AM
## 65 7/4/2009 12:00:00 AM
## 66 7/5/2009 12:00:00 AM
## 67 7/6/2009 12:00:00 AM
## 68 7/7/2009 12:00:00 AM
## 69 7/8/2009 12:00:00 AM
## 70 7/9/2009 12:00:00 AM
## 71 7/10/2009 12:00:00 AM
## 72 7/11/2009 12:00:00 AM
## 73 7/12/2009 12:00:00 AM
## 74 7/13/2009 12:00:00 AM
## 75 7/14/2009 12:00:00 AM
## 76 7/15/2009 12:00:00 AM
## 77 7/16/2009 12:00:00 AM
## 78 7/17/2009 12:00:00 AM
## 79 7/18/2009 12:00:00 AM
## 80 7/19/2009 12:00:00 AM
## 81 7/20/2009 12:00:00 AM
## 82 7/21/2009 12:00:00 AM
## 83 7/22/2009 12:00:00 AM
## 84 7/23/2009 12:00:00 AM
## 85 7/24/2009 12:00:00 AM
## 86 7/25/2009 12:00:00 AM
## 87 7/26/2009 12:00:00 AM
## 88 7/27/2009 12:00:00 AM
## 89 7/28/2009 12:00:00 AM
## 90 7/29/2009 12:00:00 AM
## 91 7/30/2009 12:00:00 AM
## 92 7/31/2009 12:00:00 AM
## 93 8/1/2009 12:00:00 AM
## 94 8/2/2009 12:00:00 AM
## 95 8/3/2009 12:00:00 AM
## 96 8/4/2009 12:00:00 AM
## 97 8/5/2009 12:00:00 AM
## 98 8/6/2009 12:00:00 AM
## 99 8/7/2009 12:00:00 AM
## 100 8/8/2009 12:00:00 AM
## 101 8/9/2009 12:00:00 AM
## 102 8/10/2009 12:00:00 AM
## 103 8/11/2009 12:00:00 AM
## 104 8/12/2009 12:00:00 AM
## 105 8/13/2009 12:00:00 AM
## 106 8/14/2009 12:00:00 AM
## 107 8/15/2009 12:00:00 AM
## 108 8/16/2009 12:00:00 AM
## 109 8/17/2009 12:00:00 AM
## 110 8/18/2009 12:00:00 AM
## 111 8/19/2009 12:00:00 AM
## 112 8/20/2009 12:00:00 AM
## 113 8/21/2009 12:00:00 AM
## 114 8/22/2009 12:00:00 AM
## 115 8/23/2009 12:00:00 AM
## 116 8/24/2009 12:00:00 AM
## 117 8/25/2009 12:00:00 AM
## 118 8/26/2009 12:00:00 AM
## 119 8/27/2009 12:00:00 AM
## 120 8/28/2009 12:00:00 AM
## 121 8/29/2009 12:00:00 AM
## 122 8/30/2009 12:00:00 AM
## 123 8/31/2009 12:00:00 AM
## 124 9/1/2009 12:00:00 AM
## 125 9/2/2009 12:00:00 AM
## 126 9/3/2009 12:00:00 AM
## 127 9/4/2009 12:00:00 AM
## 128 9/5/2009 12:00:00 AM
## 129 9/6/2009 12:00:00 AM
## 130 9/7/2009 12:00:00 AM
## 131 9/8/2009 12:00:00 AM
## 132 9/9/2009 12:00:00 AM
## 133 9/10/2009 12:00:00 AM
## 134 9/11/2009 12:00:00 AM
## 135 9/12/2009 12:00:00 AM
## 136 9/13/2009 12:00:00 AM
## 137 9/14/2009 12:00:00 AM
## 138 9/15/2009 12:00:00 AM
## 139 9/16/2009 12:00:00 AM
## 140 9/17/2009 12:00:00 AM
## 141 9/18/2009 12:00:00 AM
## 142 9/19/2009 12:00:00 AM
## 143 9/20/2009 12:00:00 AM
## 144 9/21/2009 12:00:00 AM
## 145 9/22/2009 12:00:00 AM
## 146 9/23/2009 12:00:00 AM
## 147 9/24/2009 12:00:00 AM
## 148 9/25/2009 12:00:00 AM
## 149 9/26/2009 12:00:00 AM
## 150 9/27/2009 12:00:00 AM
## 151 9/28/2009 12:00:00 AM
## 152 9/29/2009 12:00:00 AM
## 153 9/30/2009 12:00:00 AM
## 154 10/1/2009 12:00:00 AM
## 155 10/2/2009 12:00:00 AM
## 156 10/3/2009 12:00:00 AM
## 157 10/4/2009 12:00:00 AM
## 158 10/5/2009 12:00:00 AM
## 159 10/6/2009 12:00:00 AM
## 160 10/7/2009 12:00:00 AM
## 161 10/8/2009 12:00:00 AM
## 162 10/9/2009 12:00:00 AM
## 163 10/10/2009 12:00:00 AM
## 164 10/11/2009 12:00:00 AM
## 165 10/12/2009 12:00:00 AM
## 166 10/13/2009 12:00:00 AM
## 167 10/14/2009 12:00:00 AM
## 168 10/15/2009 12:00:00 AM
## 169 10/16/2009 12:00:00 AM
## 170 10/17/2009 12:00:00 AM
## 171 10/18/2009 12:00:00 AM
## 172 10/19/2009 12:00:00 AM
## 173 10/20/2009 12:00:00 AM
## 174 10/21/2009 12:00:00 AM
## 175 10/22/2009 12:00:00 AM
## 176 10/23/2009 12:00:00 AM
## 177 10/24/2009 12:00:00 AM
## 178 10/25/2009 12:00:00 AM
## 179 10/26/2009 12:00:00 AM
## 180 10/27/2009 12:00:00 AM
## 181 10/28/2009 12:00:00 AM
## 182 10/29/2009 12:00:00 AM
## 183 10/30/2009 12:00:00 AM
## 184 10/31/2009 12:00:00 AM
## 185 11/1/2009 12:00:00 AM
## 186 11/2/2009 12:00:00 AM
## 187 11/3/2009 12:00:00 AM
## 188 11/4/2009 12:00:00 AM
## 189 11/5/2009 12:00:00 AM
## 190 11/6/2009 12:00:00 AM
## 191 11/7/2009 12:00:00 AM
## 192 11/8/2009 12:00:00 AM
## 193 11/9/2009 12:00:00 AM
## 194 11/10/2009 12:00:00 AM
## 195 11/11/2009 12:00:00 AM
## 196 11/12/2009 12:00:00 AM
## 197 11/13/2009 12:00:00 AM
## 198 11/14/2009 12:00:00 AM
## 199 11/15/2009 12:00:00 AM
## 200 11/16/2009 12:00:00 AM
## 201 11/17/2009 12:00:00 AM
## 202 11/18/2009 12:00:00 AM
## 203 11/19/2009 12:00:00 AM
## 204 11/20/2009 12:00:00 AM
## 205 11/21/2009 12:00:00 AM
## 206 11/22/2009 12:00:00 AM
## 207 11/23/2009 12:00:00 AM
## 208 11/24/2009 12:00:00 AM
## 209 11/25/2009 12:00:00 AM
## 210 11/26/2009 12:00:00 AM
## 211 11/27/2009 12:00:00 AM
## 212 11/28/2009 12:00:00 AM
## 213 11/29/2009 12:00:00 AM
## 214 11/30/2009 12:00:00 AM
## 215 12/1/2009 12:00:00 AM
## 216 12/2/2009 12:00:00 AM
## 217 12/3/2009 12:00:00 AM
## 218 12/4/2009 12:00:00 AM
## 219 12/5/2009 12:00:00 AM
## 220 12/6/2009 12:00:00 AM
## 221 12/7/2009 12:00:00 AM
## 222 12/8/2009 12:00:00 AM
## 223 12/9/2009 12:00:00 AM
## 224 12/10/2009 12:00:00 AM
## 225 12/11/2009 12:00:00 AM
## 226 12/12/2009 12:00:00 AM
## 227 12/13/2009 12:00:00 AM
## 228 12/14/2009 12:00:00 AM
## 229 12/15/2009 12:00:00 AM
## 230 12/16/2009 12:00:00 AM
## 231 12/17/2009 12:00:00 AM
## 232 12/18/2009 12:00:00 AM
## 233 12/19/2009 12:00:00 AM
## 234 12/20/2009 12:00:00 AM
## 235 12/21/2009 12:00:00 AM
## 236 12/22/2009 12:00:00 AM
## 237 12/23/2009 12:00:00 AM
## 238 12/24/2009 12:00:00 AM
## 239 12/25/2009 12:00:00 AM
## 240 12/26/2009 12:00:00 AM
## 241 12/27/2009 12:00:00 AM
## 242 12/28/2009 12:00:00 AM
## 243 12/29/2009 12:00:00 AM
## 244 12/30/2009 12:00:00 AM
## 245 12/31/2009 12:00:00 AM
## 246 1/1/2010 12:00:00 AM
## 247 1/2/2010 12:00:00 AM
## 248 1/3/2010 12:00:00 AM
## 249 1/4/2010 12:00:00 AM
## 250 1/5/2010 12:00:00 AM
## 251 1/6/2010 12:00:00 AM
## 252 1/7/2010 12:00:00 AM
## 253 1/8/2010 12:00:00 AM
## 254 1/9/2010 12:00:00 AM
## 255 1/10/2010 12:00:00 AM
## 256 1/11/2010 12:00:00 AM
## 257 1/12/2010 12:00:00 AM
## 258 1/13/2010 12:00:00 AM
## 259 1/14/2010 12:00:00 AM
## 260 1/15/2010 12:00:00 AM
## 261 1/16/2010 12:00:00 AM
## 262 1/17/2010 12:00:00 AM
## 263 1/18/2010 12:00:00 AM
## 264 1/19/2010 12:00:00 AM
## 265 1/20/2010 12:00:00 AM
## 266 1/21/2010 12:00:00 AM
## 267 1/22/2010 12:00:00 AM
## 268 1/23/2010 12:00:00 AM
## 269 1/24/2010 12:00:00 AM
## 270 1/25/2010 12:00:00 AM
## 271 1/26/2010 12:00:00 AM
## 272 1/27/2010 12:00:00 AM
## 273 1/28/2010 12:00:00 AM
## 274 1/29/2010 12:00:00 AM
## 275 1/30/2010 12:00:00 AM
## 276 1/31/2010 12:00:00 AM
## 277 2/1/2010 12:00:00 AM
## 278 2/2/2010 12:00:00 AM
## 279 2/3/2010 12:00:00 AM
## 280 2/4/2010 12:00:00 AM
## 281 2/5/2010 12:00:00 AM
## 282 2/6/2010 12:00:00 AM
## 283 2/7/2010 12:00:00 AM
## 284 2/8/2010 12:00:00 AM
## 285 2/9/2010 12:00:00 AM
## 286 2/10/2010 12:00:00 AM
## 287 2/11/2010 12:00:00 AM
## 288 2/12/2010 12:00:00 AM
## 289 2/13/2010 12:00:00 AM
## 290 2/14/2010 12:00:00 AM
## 291 2/15/2010 12:00:00 AM
## 292 2/16/2010 12:00:00 AM
## 293 2/17/2010 12:00:00 AM
## 294 2/18/2010 12:00:00 AM
## 295 2/19/2010 12:00:00 AM
## 296 2/20/2010 12:00:00 AM
## 297 2/21/2010 12:00:00 AM
## 298 2/22/2010 12:00:00 AM
## 299 2/23/2010 12:00:00 AM
## 300 2/24/2010 12:00:00 AM
## 301 2/25/2010 12:00:00 AM
## 302 2/26/2010 12:00:00 AM
## 303 2/27/2010 12:00:00 AM
## 304 2/28/2010 12:00:00 AM
## 305 3/1/2010 12:00:00 AM
## 306 3/2/2010 12:00:00 AM
## 307 3/3/2010 12:00:00 AM
## 308 3/4/2010 12:00:00 AM
## 309 3/5/2010 12:00:00 AM
## 310 3/6/2010 12:00:00 AM
## 311 3/7/2010 12:00:00 AM
## 312 3/8/2010 12:00:00 AM
## 313 3/9/2010 12:00:00 AM
## 314 3/10/2010 12:00:00 AM
## 315 3/11/2010 12:00:00 AM
## 316 3/12/2010 12:00:00 AM
## 317 3/13/2010 12:00:00 AM
## 318 3/14/2010 12:00:00 AM
## 319 3/15/2010 12:00:00 AM
## 320 3/16/2010 12:00:00 AM
## 321 3/17/2010 12:00:00 AM
## 322 3/18/2010 12:00:00 AM
## 323 3/19/2010 12:00:00 AM
## 324 3/20/2010 12:00:00 AM
## 325 3/21/2010 12:00:00 AM
## 326 3/22/2010 12:00:00 AM
## 327 3/23/2010 12:00:00 AM
## 328 3/24/2010 12:00:00 AM
## 329 3/25/2010 12:00:00 AM
## 330 3/26/2010 12:00:00 AM
## 331 3/27/2010 12:00:00 AM
## 332 3/28/2010 12:00:00 AM
## 333 3/29/2010 12:00:00 AM
## 334 3/30/2010 12:00:00 AM
## 335 3/31/2010 12:00:00 AM
## 336 4/1/2010 12:00:00 AM
## 337 4/2/2010 12:00:00 AM
## 338 4/3/2010 12:00:00 AM
## 339 4/4/2010 12:00:00 AM
## 340 4/5/2010 12:00:00 AM
## 341 4/6/2010 12:00:00 AM
## 342 4/7/2010 12:00:00 AM
## 343 4/8/2010 12:00:00 AM
## 344 4/9/2010 12:00:00 AM
## 345 4/10/2010 12:00:00 AM
## 346 4/11/2010 12:00:00 AM
## 347 4/12/2010 12:00:00 AM
## 348 4/13/2010 12:00:00 AM
## 349 4/14/2010 12:00:00 AM
## 350 4/15/2010 12:00:00 AM
## 351 4/16/2010 12:00:00 AM
## 352 4/17/2010 12:00:00 AM
## 353 4/18/2010 12:00:00 AM
## 354 4/19/2010 12:00:00 AM
## 355 4/20/2010 12:00:00 AM
## 356 4/21/2010 12:00:00 AM
## 357 4/22/2010 12:00:00 AM
## 358 4/23/2010 12:00:00 AM
## 359 4/24/2010 12:00:00 AM
## 360 4/25/2010 12:00:00 AM
## 361 4/26/2010 12:00:00 AM
## 362 4/27/2010 12:00:00 AM
## 363 4/28/2010 12:00:00 AM
## 364 4/29/2010 12:00:00 AM
## 365 4/30/2010 12:00:00 AM
## 366 5/1/2010 12:00:00 AM
## 367 5/2/2010 12:00:00 AM
## 368 5/3/2010 12:00:00 AM
## 369 5/4/2010 12:00:00 AM
## 370 5/5/2010 12:00:00 AM
## 371 5/6/2010 12:00:00 AM
## 372 5/7/2010 12:00:00 AM
## 373 5/8/2010 12:00:00 AM
## 374 5/9/2010 12:00:00 AM
## 375 5/10/2010 12:00:00 AM
## 376 5/11/2010 12:00:00 AM
## 377 5/12/2010 12:00:00 AM
## 378 5/13/2010 12:00:00 AM
## 379 5/14/2010 12:00:00 AM
atm_data |> summary()
## DATE ATM Cash
## Length:1474 Length:1474 Min. : 0.0
## Class :character Class :character 1st Qu.: 0.5
## Mode :character Mode :character Median : 73.0
## Mean : 155.6
## 3rd Qu.: 114.0
## Max. :10920.0
## NA's :19
#format the dates properly
atm_data <- atm_data |>
mutate(DATE = as_date(mdy_hms(DATE)))
#format as a tsibble
atm_data <- atm_data |>
as_tsibble(key = ATM, index = DATE)
atm_data |> filter(is.na(Cash))
## # A tsibble: 19 x 3 [1D]
## # Key: ATM [3]
## DATE ATM Cash
## <date> <chr> <int>
## 1 2010-05-01 "" NA
## 2 2010-05-02 "" NA
## 3 2010-05-03 "" NA
## 4 2010-05-04 "" NA
## 5 2010-05-05 "" NA
## 6 2010-05-06 "" NA
## 7 2010-05-07 "" NA
## 8 2010-05-08 "" NA
## 9 2010-05-09 "" NA
## 10 2010-05-10 "" NA
## 11 2010-05-11 "" NA
## 12 2010-05-12 "" NA
## 13 2010-05-13 "" NA
## 14 2010-05-14 "" NA
## 15 2009-06-13 "ATM1" NA
## 16 2009-06-16 "ATM1" NA
## 17 2009-06-22 "ATM1" NA
## 18 2009-06-18 "ATM2" NA
## 19 2009-06-24 "ATM2" NA
atm_data |> filter(is.na(ATM))
## # A tsibble: 0 x 3 [?]
## # Key: ATM [0]
## # ℹ 3 variables: DATE <date>, ATM <chr>, Cash <int>
atm_data |> filter(is.na(DATE))
## # A tsibble: 0 x 3 [?]
## # Key: ATM [0]
## # ℹ 3 variables: DATE <date>, ATM <chr>, Cash <int>
#none of these are "NA" - just blank
atm_data |> filter(!ATM %in% c("ATM1", "ATM2", "ATM3", "ATM4"))
## # A tsibble: 14 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <int>
## 1 2010-05-01 "" NA
## 2 2010-05-02 "" NA
## 3 2010-05-03 "" NA
## 4 2010-05-04 "" NA
## 5 2010-05-05 "" NA
## 6 2010-05-06 "" NA
## 7 2010-05-07 "" NA
## 8 2010-05-08 "" NA
## 9 2010-05-09 "" NA
## 10 2010-05-10 "" NA
## 11 2010-05-11 "" NA
## 12 2010-05-12 "" NA
## 13 2010-05-13 "" NA
## 14 2010-05-14 "" NA
#it seems like data for ATMs 1 and 2 are missing for the last month of recorded data (May 2010), so I will just exclude those values rather than impute them. I will impute the missing June 2009 values, though.
atm_data |> group_by(ATM) |> summarize(mean(Cash, na.rm = TRUE))
## # A tsibble: 1,474 x 3 [1D]
## # Key: ATM [5]
## ATM DATE `mean(Cash, na.rm = TRUE)`
## <chr> <date> <dbl>
## 1 "" 2010-05-01 NaN
## 2 "" 2010-05-02 NaN
## 3 "" 2010-05-03 NaN
## 4 "" 2010-05-04 NaN
## 5 "" 2010-05-05 NaN
## 6 "" 2010-05-06 NaN
## 7 "" 2010-05-07 NaN
## 8 "" 2010-05-08 NaN
## 9 "" 2010-05-09 NaN
## 10 "" 2010-05-10 NaN
## # ℹ 1,464 more rows
atm_data |> group_by(ATM) |> summarize(sum(Cash, na.rm = TRUE))
## # A tsibble: 1,474 x 3 [1D]
## # Key: ATM [5]
## ATM DATE `sum(Cash, na.rm = TRUE)`
## <chr> <date> <int>
## 1 "" 2010-05-01 0
## 2 "" 2010-05-02 0
## 3 "" 2010-05-03 0
## 4 "" 2010-05-04 0
## 5 "" 2010-05-05 0
## 6 "" 2010-05-06 0
## 7 "" 2010-05-07 0
## 8 "" 2010-05-08 0
## 9 "" 2010-05-09 0
## 10 "" 2010-05-10 0
## # ℹ 1,464 more rows
I started with ATM 3 because it’s the simplest ATM to forecast for.
Most values for ATM 3 are 0 - this ATM may have not existed or been accessible until May 2010. Here, it’s best to go with some sort of naive forecast, since we only have three days’ worth of data, and all three days are relatively similar. I won’t check fit here or traint the data, since it isn’t hugely relevant for a data set that only has three usable data points.
#the three days with withdrawals
atm_3 <- atm_data |>
filter (ATM == "ATM3") |>
filter(Cash > 0)
atm_3
## # A tsibble: 3 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <int>
## 1 2010-04-28 ATM3 96
## 2 2010-04-29 ATM3 82
## 3 2010-04-30 ATM3 85
#all data
atm_3_all <- atm_data |>
filter (ATM == "ATM3")
autoplot(atm_3_all)
## Plot variable not specified, automatically selected `.vars = Cash`
Let’s go with the mean (but only for the three available days) and a simple naive forecast.
atm_3_fit <- atm_3_all |> model(NAIVE(Cash))
atm_3_fit <- atm_3_all |>
model(
Naive = NAIVE(Cash)
)
#forecasts for all of May
atm_3_fit |> forecast(h = 31) |> autoplot(atm_3_all, level = NULL)
atm_3_fit |> forecast(h = 31) |> autoplot(atm_3_all)
#let's forecast the mean based on 3 days' data. I'll also add the mean so it's easier
#to see.
atm_3_fit_mean <- atm_3 |> model(MEAN(Cash),
NAIVE(Cash)
)
atm_3_fit_mean |> forecast(h = 31) |> autoplot(atm_3, level = NULL)
#including the confidence interval
atm_3_fit_mean |> forecast(h = 31) |> autoplot(atm_3)
#create a data frame and exporting as CSV
fc_atm3 <- atm_3_fit_mean |> forecast(h = 31)
#atm3_forecast <- as.data.frame(fc_atm3)
#write.csv(atm3_forecast, file="atm3forecast.csv", row.names=TRUE)
Moving on, let’s get rid of the days in May we have no data for.
atm_data_clean <- atm_data |>
filter(ATM %in% c("ATM1", "ATM2", "ATM3", "ATM4"))
#let's plot and look for outliers
#values drop dramatically roughly every 7th day, presumably for a weekend
atm_data_clean |>
filter(ATM == "ATM1") |>
autoplot() +
labs(title = "ATM 1")
## Plot variable not specified, automatically selected `.vars = Cash`
atm1 <- atm_data_clean |>
filter(ATM == "ATM1")
#here they tend to drop on days 6 and 7 - there's also a weekly pattern
atm_data_clean |> filter(ATM == "ATM2") |> autoplot() +
labs(title = "ATM 2")
## Plot variable not specified, automatically selected `.vars = Cash`
#there's clearly an outlier here
atm_data_clean |>
filter(ATM == "ATM4") |>
autoplot() +
labs(title = "ATM 4")
## Plot variable not specified, automatically selected `.vars = Cash`
#let's learn more
atm_data_clean |>
filter(ATM == "ATM4") |>
arrange(desc(Cash))
## Warning: Current temporal ordering may yield unexpected results.
## ℹ Suggest to sort by `ATM`, `DATE` first.
## # A tsibble: 365 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <int>
## 1 2010-02-09 ATM4 10920
## 2 2009-09-22 ATM4 1712
## 3 2010-01-29 ATM4 1575
## 4 2009-09-04 ATM4 1495
## 5 2009-06-09 ATM4 1484
## 6 2009-09-05 ATM4 1301
## 7 2010-02-28 ATM4 1276
## 8 2009-08-23 ATM4 1246
## 9 2009-06-14 ATM4 1221
## 10 2009-09-29 ATM4 1195
## # ℹ 355 more rows
My best guess is that, in data entry, someone added a zero by accident. Many of the other large values are in the 1000 + range.
If I were to use some other method, like calculating the mean for a similar day, I don’t think it would be that different from removing a zero. I will go ahead and replace this value, 10920, with 1092.
#10920 with 1092
atm4 <- atm_data_clean |>
filter(ATM == "ATM4")
atm4 <- atm4 |>
mutate(Cash = case_when(
Cash == 10920 ~ 1092,
TRUE ~ Cash
))
autoplot(atm4)
## Plot variable not specified, automatically selected `.vars = Cash`
Now we can more clearly see the ups and downs. All of these plots seem to have some sort of weekly pattern. Let’s see how strong it is.
dcmp <- atm4 |>
model(stl = STL(Cash ~ season(period = 7))) |>
components()
autoplot(dcmp)
#if I don't pick 7 days, STL decomp seems to choose a weekly pattern:
dcmp <- atm4 |>
model(STL(Cash)) |>
components()
autoplot(dcmp)
#Using ETS
fit <- atm4 |>
model(ets = ETS(Cash))
report(fit)
## Series: Cash
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.000100039
## gamma = 0.0001001906
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 445.8873 -272.9881 -35.41945 18.60843 39.49149 88.67104 47.84768 113.7889
##
## sigma^2: 111656.2
##
## AIC AICc BIC
## 6406.810 6407.432 6445.809
#chooses ANA - additive error, no trend (we can see from the plot that
#there's no trend). The seasonality does look additive rather than multiplicitive.
What works here? The data has a 7-day seasonality and no apparent overall trend, so we have a few options.
Let’s try ETS (ANA model), which is suggested above. We’ll also try the suggested ARIMA model, and seasonal naive model, which seems appropriate for data that follows a simple weekly trend. We can also try the Holt-Winters additive model (AAA) for seasonal data to see – though this data doesn’t seem to have a strong trend.
#use 9 months to train and 3 months to test
myseries_train <- atm4 |>
filter(DATE < as_date("2010-02-01"))
fit <- myseries_train |>
model(ana = ETS(Cash ~ error("A") + trend("N") + season("A")))
fit |>
forecast(h = "3 months") |>
autoplot(atm4)
#to find RMSE
fit |> accuracy()
## # A tibble: 1 Ă— 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM4 ana Training -9.04 321. 249. -410. 439. 0.715 0.704 0.114
It doesn’t look great – the RMSE is really high at 345. Let’s try Holt-Winters.
fit2 <- myseries_train |>
model(hw = ETS(Cash ~ error("A") + trend("A") + season("A")))
fit2 |>
forecast(h = "3 months") |>
autoplot(atm4)
#to find the fit
fit2 |> accuracy()
## # A tibble: 1 Ă— 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM4 hw Training -2.55 324. 256. -384. 431. 0.737 0.711 0.0764
slightly better RMSE - 324 (and that wasn’t even the suggested model). Next, we’ll try seasonal naive.
fit_snaive <- myseries_train |>
model(snaive = SNAIVE(Cash))
fc_snaive <- fit_snaive |>
forecast(h = "3 months")
fc_snaive |>
autoplot(atm4)
fit_snaive |>
accuracy()
## # A tibble: 1 Ă— 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM4 snaive Training 4.72 456. 348. -372. 428. 1 1 0.0566
The RMSE is much higher at 456. Finally, let’s try an ARIMA model.
atm4 |> features(Cash, features = guerrero)
## # A tibble: 1 Ă— 2
## ATM lambda_guerrero
## <chr> <dbl>
## 1 ATM4 0.464
#data is stationary
atm4 |>
mutate(diff_data = box_cox(Cash, 0.464) ) |>
features(diff_data, unitroot_kpss)
## # A tibble: 1 Ă— 3
## ATM kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 ATM4 0.0737 0.1
atm4 |>
features(Cash, unitroot_nsdiffs)
## # A tibble: 1 Ă— 2
## ATM nsdiffs
## <chr> <int>
## 1 ATM4 0
atm4 <- atm4 |> as_tsibble(key = ATM, index = DATE)
atm4 |>
model(auto_arima = ARIMA(box_cox(Cash, 0.464))) |>
report()
## Series: Cash
## Model: ARIMA(0,0,0)(2,0,0)[7] w/ mean
## Transformation: box_cox(Cash, 0.464)
##
## Coefficients:
## sar1 sar2 constant
## 0.2134 0.1813 18.4929
## s.e. 0.0518 0.0522 0.7413
##
## sigma^2 estimated as 209.5: log likelihood=-1492.31
## AIC=2992.62 AICc=2992.73 BIC=3008.22
#it's the same without stwpwise (it just takes longer)
atm4 |>
model(auto_arima = ARIMA(box_cox(Cash, 0.464), stepwise = FALSE)) |>
report()
## Series: Cash
## Model: ARIMA(0,0,0)(2,0,0)[7] w/ mean
## Transformation: box_cox(Cash, 0.464)
##
## Coefficients:
## sar1 sar2 constant
## 0.2134 0.1813 18.4929
## s.e. 0.0518 0.0522 0.7413
##
## sigma^2 estimated as 209.5: log likelihood=-1492.31
## AIC=2992.62 AICc=2992.73 BIC=3008.22
#check the fit
fit_a <- atm4 |>
model(ARIMA(box_cox(Cash, 0.464) ~ pdq(0,0,0) + PDQ(2,0,0)))
gg_tsresiduals(fit_a)
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#the distribution is a little strange (skewed right), but everything else looks fine
#ACF and PACF reflect the 7-day trend
#ACF
atm4 |> ACF(
box_cox(Cash, 0.464)
) |> autoplot()
#PACF
atm4 |> PACF(
box_cox(Cash, 0.464)
) |> autoplot()
fit_sarima <- myseries_train |>
model(
arima = ARIMA(box_cox(Cash, 0.464) ~ pdq(0,0,0) + PDQ(2,0,0))
)
#changed this to 88 days because I was getting an error below for not having data on May 1-2
fc <- fit_sarima |>
forecast(h = "88 days")
fc |> autoplot(atm4)
#check fit
fc |>
accuracy(atm4)
## # A tibble: 1 Ă— 11
## .model ATM .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima ATM4 Test -30.8 318. 267. -630. 658. 0.767 0.698 -0.0165
RMSE is the lowest of all models at 318.
fit_all <- atm4 |>
model(
arima = ARIMA(box_cox(Cash, 0.464) ~ pdq(0,0,0) + PDQ(2,0,0)),
snaive = SNAIVE(Cash),
hw_ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
ana_ets = ETS(Cash ~ error("A") + trend("N") + season("A"))
)
fc_all <- fit_all |>
forecast(h = "1 month")
#with and without confidence intervals
fc_all |>
autoplot(atm4)
fc_all |>
autoplot(atm4, level = NULL)
#Facet-wrapped
fc_all |>
autoplot(atm4) +
facet_wrap(~ .model, ncol = 1)
#create a data frame and write the CSV
atm4_forecast <- as.data.frame(fc_all)
#write.csv(atm4_forecast, file="atm4forecast.csv", row.names=TRUE)
While ARIMA had the lowest RMSE, all of these models really reflect the last (lower than usual) week, and don’t incorporate the variability between weeks. Overall, I would pick ARIMA.
Let’s start by filling in missing values using KNN. This is a small data set, so it’s a good fit for simple KNN (VIM package). I won’t use mean imputation because there is a weekly seasonality to the data – I would like the numbers to reflect the trend for the day of the week.
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.3
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.3
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
atm1 <- kNN(atm1, variable = "Cash", k = 5)
atm1_imputed <- atm1 |> select(-Cash_imp)
atm1_imputed <- atm1_imputed |> as_tsibble(key = ATM, index = DATE)
dcmp_1 <- atm1_imputed |>
model(stl = STL(Cash ~ season(period = 7))) |>
components()
autoplot(atm1_imputed)
## Plot variable not specified, automatically selected `.vars = Cash`
# Plot the seasonal decomp for atm1_imputed
autoplot(dcmp_1)
There’s one week/set of weeks in March that both ATM4 and 1 are showing a squeeze for.
Looking at the overall plot, the data may be somewhat cyclical – or it could be a second seasonal trend. We can’t tell because we only have on year of data, but from May - December, withdrawals go up and then down. The same happens from mid-December to the end of April.
Again, I will try ARIMA, ETS (recommended model along with both Holt-Winters models), and a seasonal naive approach.
This is simple weekly seasonal data. When I looked at the numbers, ATM 1 seemed to have a distinct weekly trend with numbers dropping every seventh day.
#use 9 months to train and 3 months to test
myseries_train <- atm1_imputed |>
filter(DATE < as_date("2010-02-01"))
fit_snaive_1 <- myseries_train |>
model(snaive = SNAIVE(Cash))
fc_snaive_1 <- fit_snaive_1 |>
forecast(h = "3 months")
#confidence interval
fc_snaive_1 |>
autoplot(atm1_imputed)
#no confidence interval
fc_snaive_1 |>
autoplot(atm1_imputed, level = NULL)
#the plot goes a little higher than the acutal data
fit_snaive_1 |>
accuracy()
## # A tibble: 1 Ă— 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 snaive Training 0.223 27.3 17.6 -87.5 107. 1 1 0.123
#RMSE is 27.3
ANA is the suggested model. I’m going to try both Holt-Winters models because it allows for a trend and a pattern (data is sort of trending downward, as discussed above, this may be a seasonal pattern we can’t identify yet). I’m including HW multiplicative because, as we can see in the decomposition plot, big peaks tend to be associated with big drops.
fit_atm1 <- atm1_imputed |>
model(ets = ETS(Cash))
report(fit_atm1)
## Series: Cash
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.01625932
## gamma = 0.3283959
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 76.88695 -59.09764 -6.012421 12.29652 9.634747 18.0129 9.612253 15.55365
##
## sigma^2: 579.2506
##
## AIC AICc BIC
## 4486.383 4487.005 4525.382
#ANA
fit_1_ets <- myseries_train |>
model(ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
hw_ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
hwm_ets = ETS(Cash ~ error("M") + trend("A") + season("M")),
)
fit_1_ets |>
forecast(h = "3 months") |>
autoplot(atm1_imputed)
#all plots separately
fit_1_ets |>
forecast(h = "3 months") |>
autoplot(atm1_imputed) +
facet_wrap(~ .model, ncol = 1, scales = "free_y")
#to find RMSE and other fit metrics
fit_1_ets |> accuracy()
## # A tibble: 3 Ă— 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 ana Training 0.0692 20.0 12.9 -75.0 92.1 0.734 0.732 0.0906
## 2 ATM1 hw_ets Training -0.106 20.0 12.9 -76.1 92.3 0.735 0.732 0.0862
## 3 ATM1 hwm_ets Training -1.12 19.7 12.3 -84.3 95.2 0.700 0.722 0.0404
These are all lower than the seasonal naive model, but multiplicative HW is the lowest.
atm1_imputed |> features(Cash, features = guerrero)
## # A tibble: 1 Ă— 2
## ATM lambda_guerrero
## <chr> <dbl>
## 1 ATM1 0.244
#.241 is pretty low, so we should use it
#data is stationary (though the value is pretty close to .05)
atm1_imputed |>
features(box_cox(Cash, 0.241), unitroot_kpss)
## # A tibble: 1 Ă— 3
## ATM kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 ATM1 0.418 0.0695
#calling for one difference
atm1_imputed |>
features(box_cox(Cash, 0.241), unitroot_nsdiffs)
## # A tibble: 1 Ă— 2
## ATM nsdiffs
## <chr> <int>
## 1 ATM1 1
#difference
atm1_imputed |>
features(box_cox(Cash, 0.241) |> difference(7), unitroot_nsdiffs)
## # A tibble: 1 Ă— 2
## ATM nsdiffs
## <chr> <int>
## 1 ATM1 0
atm1_imputed |> autoplot((box_cox(Cash, 0.241)) |>
difference(7))
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
#auto-ARIMA
atm1_imputed |>
model(auto_arima = ARIMA(box_cox(Cash, 0.241), stepwise = FALSE)) |>
report()
## Series: Cash
## Model: ARIMA(0,0,2)(0,1,1)[7]
## Transformation: box_cox(Cash, 0.241)
##
## Coefficients:
## ma1 ma2 sma1
## 0.0984 -0.1144 -0.6460
## s.e. 0.0524 0.0527 0.0426
##
## sigma^2 estimated as 1.539: log likelihood=-585.51
## AIC=1179.01 AICc=1179.13 BIC=1194.54
atm1_imputed |>
model(auto_arima = ARIMA(box_cox(Cash, 0.241))) |>
report()
## Series: Cash
## Model: ARIMA(0,0,2)(0,1,1)[7]
## Transformation: box_cox(Cash, 0.241)
##
## Coefficients:
## ma1 ma2 sma1
## 0.0984 -0.1144 -0.6460
## s.e. 0.0524 0.0527 0.0426
##
## sigma^2 estimated as 1.539: log likelihood=-585.51
## AIC=1179.01 AICc=1179.13 BIC=1194.54
#(0,0,2)(0,1,1)
#ACF and PACF
atm1_imputed |>
gg_tsdisplay(difference(box_cox(Cash, 0.241),7), plot_type = 'partial')
## Warning: `gg_tsdisplay()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsdisplay()` instead.
## Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).
#ACF shows one big spike an a quick taper, so we can try an 000 011 model
#and skip the first 2
#residuals look normal
fit_a_1 <- atm1_imputed |>
model(ARIMA(box_cox(Cash, 0.241) ~ pdq(0,0,2) + PDQ(0,1,1)))
gg_tsresiduals(fit_a_1)
#residuals also look normal for this model, though there are a couple of lags outside the lines
fit_a_1_b <- atm1_imputed |> model(ARIMA(box_cox(Cash, .241) ~ pdq(0,0,0) + PDQ(0,1,1)))
gg_tsresiduals(fit_a_1_b)
fit_sarima <- atm1_imputed |>
model(
auto_sarima = ARIMA(box_cox(Cash, 0.241) ~ pdq(0,0,2) + PDQ(0,1,1)),
sarima000011 = ARIMA(box_cox(Cash, 0.241) ~ pdq(0,0,0) + PDQ(0,1,1))
)
#forecasting for fit
fc <- fit_sarima |>
forecast(h = "3 months")
fc |> autoplot(atm1_imputed)
#facet-wrap
fc |> autoplot(atm1_imputed) +
facet_wrap(~ .model, ncol = 1)
#remove confidence intervals
fc |> autoplot(atm1_imputed, level = NULL) +
facet_wrap(~ .model, ncol = 1)
#check RMSE
fit_sarima |> accuracy()
## # A tibble: 2 Ă— 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 auto_sarima Training 2.36 25.0 16.1 -87.8 107. 0.906 0.901 0.0116
## 2 ATM1 sarima000011 Training 2.41 25.0 15.8 -91.4 110. 0.890 0.899 0.123
RMSE is a little higher at 25 (for both). With the other residuals, the model I chose, 001011, slightly edges out the auto-ARIMA’ model.
Here are one-month forecasts for all the models. Overall, my pick is Holt-Winters’ multiplicative.
fit_all <- atm1_imputed |>
model(
ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
hw_ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
hwm_ets = ETS(Cash ~ error("M") + trend("A") + season("M")),
snaive = SNAIVE(Cash),
auto_sarima = ARIMA(box_cox(Cash, 0.241) ~ pdq(0,0,2) + PDQ(0,1,1)),
sarima000011 = ARIMA(box_cox(Cash, 0.241) ~ pdq(0,0,0) + PDQ(0,1,1))
)
fc_all <- fit_all |>
forecast(h = "1 month")
#all forecasts together
fc_all |>
autoplot(atm1_imputed) +
labs(title = "All forecasts, ATM 1")
fc_all |>
autoplot(atm1_imputed, level = NULL) +
labs(title = "All forecasts, ATM 1 (no CI)")
fc_all |>
autoplot(atm1_imputed) +
facet_wrap(~ .model, ncol = 1)
#breaking these up
fc_all |>
filter(.model %in% c("ana", "hw_ets", "hwm_ets")) |>
autoplot(atm1_imputed) +
facet_wrap(~ .model, ncol = 1) +
labs(title = "ETS")
fc_all |>
filter(.model %in% c("auto_sarima", "sarima001011")) |>
autoplot(atm1_imputed) +
facet_wrap(~ .model, ncol = 1)+
labs(title = "ARIMA")
fc_all |>
filter(.model == "snaive") |>
autoplot(atm1_imputed) +
labs(title = "SNAIVE")
#creating CSV
atm1_forecast <- as.data.frame(fc_all)
#write.csv(atm1_forecast, file="atm1forecast.csv", row.names=TRUE)
On second look, this data has a little bit of trend – it seems to be slowly trending downward as the year goes by. Weekly fluctuations still apply.
atm_data_clean |> filter(ATM == "ATM2") |> autoplot() +
labs(title = "ATM 2")
## Plot variable not specified, automatically selected `.vars = Cash`
atm2 <- atm_data_clean |> filter(ATM == "ATM2")
#this has 2 NA values. Let's use KNN again.
atm2 <- kNN(atm2, variable = "Cash", k = 5)
atm2_imputed <- atm2 |> select(-Cash_imp)
atm2_imputed <- atm2_imputed |> as_tsibble(key = ATM, index = DATE)
#decompose the data
dcmp_2 <- atm2_imputed |>
model(stl = STL(Cash ~ season(period = 7))) |>
components()
# Plot the seasonal decomp for atm1_imputed
autoplot(dcmp_2)
We can see the trend a little more clearly. We can also see that February/March dip/contraction here (this may have been due to bad weather or a holiday week).
Approaches
seasonal naive to capture the weekly seasonal trend
ARIMA (suggested model + anything else that looks plausible based on graphs
ETS (recommended model + Holt-Winters and Holt-Winters multiplicative, because it seems like the seasonal effects rise with the mean)
### ETS
fit_atm2 <- atm2_imputed |>
model(ets = ETS(Cash))
report(fit_atm2)
## Series: Cash
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.0001000463
## gamma = 0.3546428
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 72.11628 -38.36075 -25.27067 18.3668 -20.00746 16.45155 23.7113 25.10924
##
## sigma^2: 661.4986
##
## AIC AICc BIC
## 4534.845 4535.467 4573.844
#ANA
#use 9 months to train and 3 months to test
myseries_train <- atm2_imputed |>
filter(DATE < as_date("2010-02-01"))
fit_2_ets <- myseries_train |>
model(ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
hw_ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
hwm_ets = ETS(Cash ~ error("M") + trend("A") + season("M")),
)
fit_2_ets |>
forecast(h = "3 months") |>
autoplot(atm2_imputed)
#all plots separately
fit_2_ets |>
forecast(h = "3 months") |>
autoplot(atm2_imputed) +
facet_wrap(~ .model, ncol = 1, scales = "free_y")
#to find RMSE
fit_2_ets |> accuracy()
## # A tibble: 3 Ă— 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM2 ana Training -2.00 22.5 15.7 -Inf Inf 0.725 0.715 0.0175
## 2 ATM2 hw_ets Training 0.293 22.2 15.3 -Inf Inf 0.706 0.708 0.0181
## 3 ATM2 hwm_ets Training 0.612 24.7 17.7 -Inf Inf 0.816 0.784 0.117
RMSE is lowest for Holt-Winters (non-multiplicative).
fit_snaive_2 <- myseries_train |>
model(snaive = SNAIVE(Cash))
fc_snaive_2 <- fit_snaive_2 |>
forecast(h = "3 months")
fc_snaive_2 |>
autoplot(atm2_imputed)
fit_snaive_2 |>
accuracy()
## # A tibble: 1 Ă— 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM2 snaive Training -0.312 31.4 21.7 -Inf Inf 1 1 0.0517
RMSE is 31, which is higher than the other models.
atm2_imputed |> features(Cash, features = guerrero)
## # A tibble: 1 Ă— 2
## ATM lambda_guerrero
## <chr> <dbl>
## 1 ATM2 0.741
#.741 is pretty close to 1 - we may not need it. Let's see if the data is stationary.
#data is not stationary
atm2_imputed |>
features(Cash, unitroot_kpss)
## # A tibble: 1 Ă— 3
## ATM kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 ATM2 2.20 0.01
#calling for one difference
atm2_imputed |>
features(Cash, unitroot_nsdiffs)
## # A tibble: 1 Ă— 2
## ATM nsdiffs
## <chr> <int>
## 1 ATM2 1
#let's take a daily difference
atm2_imputed |>
features(difference(Cash), unitroot_kpss)
## # A tibble: 1 Ă— 3
## ATM kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 ATM2 0.0147 0.1
#data is stationary
#still calling for 1 difference
atm2_imputed |>
features(difference(Cash), unitroot_nsdiffs)
## # A tibble: 1 Ă— 2
## ATM nsdiffs
## <chr> <int>
## 1 ATM2 1
#it's still calling for another difference after adding the box-cox
atm2_imputed |>
features(box_cox(Cash, 0.741), unitroot_nsdiffs)
## # A tibble: 1 Ă— 2
## ATM nsdiffs
## <chr> <int>
## 1 ATM2 1
#weekly difference
atm2_imputed |>
features(difference(Cash, 7), unitroot_kpss)
## # A tibble: 1 Ă— 3
## ATM kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 ATM2 0.0171 0.1
#this gives us zero, so this is the correct difference
atm2_imputed |>
features(difference(Cash, 7), unitroot_nsdiffs)
## # A tibble: 1 Ă— 2
## ATM nsdiffs
## <chr> <int>
## 1 ATM2 0
#plot it
atm2_imputed |> autoplot(difference(Cash))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
atm2_imputed |> autoplot(difference(Cash,7))
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
atm2_imputed |>
model(auto_arima = ARIMA(Cash)) |>
report()
## Series: Cash
## Model: ARIMA(2,0,2)(0,1,1)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4322 -0.9172 0.4729 0.8135 -0.7571
## s.e. 0.0550 0.0398 0.0860 0.0546 0.0382
##
## sigma^2 estimated as 620.1: log likelihood=-1658.84
## AIC=3329.68 AICc=3329.92 BIC=3352.96
#this is calling for a seasonal model (2,0,2)(0,1,1)
#check the fit
fit_a_2 <- atm2_imputed |> model(ARIMA(Cash ~ pdq(2,0,2) + PDQ(0,1,1)))
report(fit_a_2)
## Series: Cash
## Model: ARIMA(2,0,2)(0,1,1)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4322 -0.9172 0.4729 0.8135 -0.7571
## s.e. 0.0550 0.0398 0.0860 0.0546 0.0382
##
## sigma^2 estimated as 620.1: log likelihood=-1658.84
## AIC=3329.68 AICc=3329.92 BIC=3352.96
gg_tsresiduals(fit_a_2)
#the distribution is a little strange (skewed right), but everything else looks fine
#ACF and PACF reflect the 7-day trend, but it's also showing correlations in other days
#ACF
atm2_imputed |> ACF(
(Cash)
) |> autoplot()
#PACF
atm2_imputed |> PACF(
(Cash)
) |> autoplot()
#one big lag at 7 and quite a few lags outside
atm2_imputed |>
gg_tsdisplay(difference(Cash, 7), plot_type = "partial")
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).
#I will also try a seasonal-only model (0,0,0)(0,1,1), since there's no regular
#differencing and the plot seems to tail off gradually (slow ACF decay), as does
#the PACF -- they look similar
#and we'll try with a non-seasonal difference as well
#looks normal, some lags outside the lines
fit_a_2_b <- atm2_imputed |> model(ARIMA(Cash ~ pdq(0,0,0) + PDQ(0,1,1)))
gg_tsresiduals(fit_a_2_b)
#one big and one small lag outside the lines
fit_a_2_c <- atm2_imputed |> model(ARIMA(Cash ~ pdq(0,1,0) + PDQ(0,1,1)))
gg_tsresiduals(fit_a_2_c)
fit_sarima <- myseries_train |>
model(
auto_sarima = ARIMA(Cash ~ pdq(2,0,2) + PDQ(0,1,1)),
sarima001011 = ARIMA(Cash ~ pdq(0,0,0) + PDQ(0,1,1)),
sarima010011 = ARIMA(Cash ~ pdq(0,1,0) + PDQ(0,1,1))
)
#residuals
#looking pretty normal
fit_sarima |>
select(auto_sarima) |>
gg_tsresiduals()
#similarly normal
fit_sarima |>
select(sarima001011) |>
gg_tsresiduals()
#one big spike at lag one (this is also the worst fit of the three, so it's not surprising)
fit_sarima |>
select(sarima010011) |>
gg_tsresiduals()
#forecasting for fit
fc <- fit_sarima |>
forecast(h = "3 months")
fc |> autoplot(atm2_imputed)
#facet-wrap
fc |> autoplot(atm2_imputed) +
facet_wrap(~ .model, ncol = 1)
#remove confidence intervals
fc |> autoplot(atm2_imputed, level = NULL) +
facet_wrap(~ .model, ncol = 1)
#check RMSE and other fit metrics
fc |>
accuracy(atm2_imputed)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 2 observations are missing between 2010-05-01 and 2010-05-02
## # A tibble: 3 Ă— 11
## .model ATM .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 auto_sarima ATM2 Test 7.28 60.2 51.7 -Inf Inf 2.38 1.92 0.000738
## 2 sarima001011 ATM2 Test 7.23 60.2 51.8 -Inf Inf 2.38 1.92 0.000938
## 3 sarima010011 ATM2 Test -16.8 62.2 53.2 -Inf Inf 2.45 1.98 -0.00707
These are all pretty close, with auto-SARIMA as the lowest.
We can also see that the sarima010011 provides a really wide confidence interval, probably due to the way the difference was applied.
fit_all <- atm2_imputed |>
model(
ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
hw_ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
hwm_ets = ETS(Cash ~ error("M") + trend("A") + season("M")),
snaive = SNAIVE(Cash),
auto_sarima = ARIMA(Cash ~ pdq(2,0,2) + PDQ(0,1,1)),
sarima001011 = ARIMA(Cash ~ pdq(0,0,0) + PDQ(0,1,1)),
sarima010011 = ARIMA(Cash ~ pdq(0,1,0) + PDQ(0,1,1))
)
fc_all <- fit_all |>
forecast(h = "1 month")
fc_all |>
autoplot(atm2_imputed)
fc_all |>
autoplot(atm2_imputed, level = NULL)
fc_all |>
autoplot(atm2_imputed) +
facet_wrap(~ .model, ncol = 1)
#breaking these up so they are easier to see
fc_all |>
filter(.model %in% c("ana", "hw_ets", "hwm_ets")) |>
autoplot(atm2_imputed) +
facet_wrap(~ .model, ncol = 1) +
labs(title = "ETS")
fc_all |>
filter(.model %in% c("auto_sarima", "sarima001011", "sarima010011")) |>
autoplot(atm2_imputed) +
facet_wrap(~ .model, ncol = 1)+
labs(title = "ARIMA")
fc_all |>
filter(.model == "snaive") |>
autoplot(atm2_imputed) +
labs(title = "SNAIVE")
#CSV forecasts
atm2_forecast <- as.data.frame(fc_all)
#write.csv(atm2_forecast, file="atm2forecast.csv", row.names=TRUE)
Overall, the RMSE was lowest for Holt-Winters (non-multiplicative). Based on these plots, ANA and Holt_Winters look pretty good. Seasonal naive doesn’t seem like a bad model, given the downward trend.