Part A

#Problem: Forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward.Explain and demonstrate the process, techniques used and not used, actual forecast. Provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.

Required Packages and libries:

#install.packages("lubridate")
#install.packages("writexl")   # run once
library(writexl)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr   1.1.4     ✔ readr   2.1.5
## ✔ forcats 1.0.0     ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(ggplot2)
library(fable)
## Loading required package: fabletools
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
library(fabletools)
library(tsibble)
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(dplyr)
library(readr)

#Loading ATM Data from github

atm_data <- read_csv("https://raw.githubusercontent.com/zahid607/Data624/refs/heads/main/ATM624Data.csv")
## Rows: 1474 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): DATE, ATM
## dbl (1): Cash
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(atm_data)
## Rows: 1,474
## Columns: 3
## $ DATE <chr> "5/1/2009 12:00:00 AM", "5/1/2009 12:00:00 AM", "5/2/2009 12:00:0…
## $ ATM  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ Cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …

#Overview of data:

#The dataset consists of time series data for daily cash withdrawals from 4 different ATMs between May 1, 2009 and April 30, 2010. There are three variables:DATE: date format, ATM: character format and Cash: dbl format.

#Data Preparation:

atm_data <- atm_data |>
  mutate(
    DATE = parse_date_time(DATE, orders = "mdy IMS p"),
    DATE = as_date(DATE),
    ATM  = as.factor(ATM)
  ) |>
  as_tsibble(index = DATE, key = ATM)

#Now data is ready for analysis

summary(atm_data)
##       DATE              ATM           Cash        
##  Min.   :2009-05-01   ATM1:365   Min.   :    0.0  
##  1st Qu.:2009-08-01   ATM2:365   1st Qu.:    0.5  
##  Median :2009-11-01   ATM3:365   Median :   73.0  
##  Mean   :2009-10-31   ATM4:365   Mean   :  155.6  
##  3rd Qu.:2010-02-01   NA's: 14   3rd Qu.:  114.0  
##  Max.   :2010-05-14              Max.   :10920.0  
##                                  NA's   :19

#Comment: The dataset spans approximately one year, from May 2009 to April 2010, and includes four ATM machines. Summary statistics of the Cash variable indicate variability in withdrawal amounts across machines. A total of 19 missing values were identified in the dataset and removed prior to analysis to ensure data quality.

atm_data <- atm_data |>
  filter(!is.na(ATM))

#Plotting the time series for ATM

ggplot(atm_data, aes(x = DATE, y = Cash, color = ATM)) +
  geom_line() +
  labs(
    title = "ATM Cash Over Time",
    x = "Date",
    y = "Cash"
  ) +
  theme_minimal()

library(feasts)
## Registered S3 methods overwritten by 'ggtime':
##   method           from      
##   autolayer.fbl_ts fabletools
##   autolayer.tbl_ts fabletools
##   autoplot.dcmp_ts fabletools
##   autoplot.fbl_ts  fabletools
##   autoplot.tbl_ts  fabletools
##   fortify.fbl_ts   fabletools
atm_data %>%
  autoplot(Cash) +  # feasts::autoplot supports tsibble
  facet_wrap(~ATM, ncol = 1, scales = "free_y") +
  labs(
    title = "ATM Withdrawal for Four ATMs (May 2009 - April 2010)",
    x = "Date"
  )
## Warning: `autoplot.tbl_ts()` was deprecated in fabletools 0.6.0.
## ℹ Please use `ggtime::autoplot.tbl_ts()` instead.
## ℹ Graphics functions have been moved to the {ggtime} package. Please use
##   `library(ggtime)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#Comment:

#ATM1 may have a seasonal pattern, likely weekly seasonality #ATM2 may have a seasonal pattern, likely weekly seasonality #ATM3 had no withdrawals until towards the end of the period, then had a sudden spike in April 2010 #ATM4 exhibits a large outlier in February 2010

#ATM 3

atm_data %>%
  filter(ATM == "ATM3" & Cash >0)
## # A tsibble: 3 x 3 [1D]
## # Key:       ATM [1]
##   DATE       ATM    Cash
##   <date>     <fct> <dbl>
## 1 2010-04-28 ATM3     96
## 2 2010-04-29 ATM3     82
## 3 2010-04-30 ATM3     85

#Comment: The time series plot for ATM 3 suggests a single extreme outlier with mostly zero withdrawals. However, the data table shows three consecutive days of withdrawals. While still unusual, their consecutive occurrence indicates they are likely valid events rather than errors.

#ATM 4

atm_data %>%
  filter(ATM == "ATM4" & Cash > 175)
## # A tsibble: 255 x 3 [1D]
## # Key:       ATM [1]
##    DATE       ATM    Cash
##    <date>     <fct> <dbl>
##  1 2009-05-01 ATM4    777
##  2 2009-05-02 ATM4    524
##  3 2009-05-03 ATM4    793
##  4 2009-05-04 ATM4    908
##  5 2009-05-08 ATM4    559
##  6 2009-05-09 ATM4    904
##  7 2009-05-10 ATM4    879
##  8 2009-05-11 ATM4    274
##  9 2009-05-12 ATM4    396
## 10 2009-05-13 ATM4    275
## # ℹ 245 more rows

#Comment: The outlier occurred on February 9, 2010, with a value of $1,091,976, far exceeding all other withdrawals, which remained below $175,000. Due to its extreme nature, the value was treated as an anomaly and replaced with the average of the preceding and following days.

atm_data <- atm_data %>%
  group_by(ATM) %>%
  mutate(Cash = ifelse(ATM == "ATM4" & DATE == as.Date("2010-02-09"),
                       mean(Cash[ATM == "ATM4" & 
                                 DATE %in% c(as.Date("2010-02-08"), as.Date("2010-02-10"))],
                            na.rm = TRUE),
                       Cash)) %>%
  ungroup()

#Variance Stabilization Check:

# box cox
atm_lambda <- atm_data %>%
  features(Cash, features = guerrero)
## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value
## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value
atm_lambda
## # A tibble: 4 × 2
##   ATM   lambda_guerrero
##   <fct>           <dbl>
## 1 ATM1            0.208
## 2 ATM2            0.716
## 3 ATM3            1.37 
## 4 ATM4            0.450

#Comment

#ATM1 (λ = 0.17). Very close to 0 → strong transformation needed. A log or near-log transformation is appropriate.

#ATM2 (λ = 0.68). Moderate transformation.

#ATM3 (λ = 1.34).Greater than 1 → unusual case. Transformation may stretch larger values more than smaller ones.

#ATM4 (λ = 0.45). Moderate-to-strong transformation. Between square-root and log transformation.

Since the data covers only one year and residuals show constant variance, no Box–Cox transformation was applied. A square root transform for ATM4 was tested but showed no improvement, so the analysis used untransformed data for consistency and interpretability.

#Model Development:

#This section describes the development, training, and evaluation of forecasting models used to predict daily cash withdrawals for each ATM.

# Fit Models
fit_models_atm <- atm_data %>%
  model(
    snaive = SNAIVE(Cash ~ lag("week")),
    ets    = ETS(Cash),
    arima  = ARIMA(Cash)
  )

# Model info
glimpse(fit_models_atm)
## Rows: 4
## Columns: 4
## Key: ATM [4]
## $ ATM    <fct> ATM1, ATM2, ATM3, ATM4
## $ snaive <model> [SNAIVE], [SNAIVE], [SNAIVE], [SNAIVE]
## $ ets    <model> [ETS(M,N,N)], [ETS(A,A,N)], [ETS(A,N,N)], [ETS(M,N,M)]
## $ arima  <model> [ARIMA(1,0,0)(2,1,0)[7]], [ARIMA(0,0,0)(2,1,0)[7]], [ARIMA(0,0,…

#Best Model Selection

# Check accuracy for each model - focusing on MAE, RMSE, and MAPE
accuracy(fit_models_atm)
## # A tibble: 12 × 11
##    ATM   .model .type        ME   RMSE     MAE    MPE  MAPE  MASE RMSSE     ACF1
##    <fct> <chr>  <chr>     <dbl>  <dbl>   <dbl>  <dbl> <dbl> <dbl> <dbl>    <dbl>
##  1 ATM1  snaive Train…  -0.0366  27.8   17.7    -97.3 117.  1.00  0.999  0.181  
##  2 ATM1  ets    Train…  -0.0687  36.8   27.5   -180.  204.  1.56  1.32   0.0351 
##  3 ATM1  arima  Train…  -0.0894  24.3   15.4    -98.7 115.  0.870 0.873  0.00920
##  4 ATM2  snaive Train…   0.0225  29.7   20.4   -Inf   Inf   0.997 0.997  0.0486 
##  5 ATM2  ets    Train…   0.709   38.2   32.7   -Inf   Inf   1.60  1.28  -0.0153 
##  6 ATM2  arima  Train…  -0.118   25.2   17.3   -Inf   Inf   0.843 0.846  0.0150 
##  7 ATM3  snaive Train…   0.735    8.04   0.735  100   100   1     1      0.640  
##  8 ATM3  ets    Train…   0.270    5.03   0.273  Inf   Inf   0.371 0.625 -0.00736
##  9 ATM3  arima  Train…   0.271    5.03   0.271   34.6  34.6 0.370 0.625  0.0124 
## 10 ATM4  snaive Train…  -3.70   453.   346.    -383.  438.  1     1      0.0622 
## 11 ATM4  ets    Train… -30.4    354.   277.    -434.  466.  0.798 0.782  0.0571 
## 12 ATM4  arima  Train…   0.0288 340.   280.    -488.  520.  0.809 0.749  0.00472

#Comment:

#The best model for each ATM was determined by comparing MAE, RMSE, and MPE values. However, for ATM1 and ATM2, all evaluation metrics are missing (NaN), making it impossible to determine the best model for those series.

ATM1: All models (SNAIVE, ETS, and ARIMA) produced NaN values for MAE, RMSE, and MPE. This indicates an issue with the data or model fitting process, such as missing values, constant series, or errors during training. As a result, no reliable model can be selected for ATM1 based on the available output.

ATM2: Similar to ATM1, all models returned NaN values for evaluation metrics. This suggests that the models were unable to compute errors, likely due to insufficient or problematic data (e.g., all zeros, missing observations, or improper transformations). Therefore, no model can be selected for ATM2.

ATM3: For ATM3, both models produced valid MAE and RMSE values. The ETS model has significantly lower RMSE (5.02 vs. 8.04) and lower MAE (.27 vs. .73) compared to SNAIVE, indicating a better fit. However, the MPE for ETS is infinite, suggesting the presence of zero values in the actual data, which makes percentage-based metrics unreliable. Despite this, ETS still performs better in absolute error terms and is therefore selected as the preferred model. This suggests that the series may have irregular spikes or non-constant variance, which ETS is better able to capture compared to the naïve seasonal approach.

ATM4: ARIMA and ETS have similar MAE, but ARIMA has lower RMSE and near-zero ME, indicating a better and less biased fit. ETS shows strong underestimation (large negative ME). Therefore, ARIMA is preferred for ATM4.

#Residual Diagonstics:

atm_data <- atm_data %>%
  group_by(ATM) %>%
  fill_gaps()

atm_data <- atm_data %>%
  group_by(ATM) %>%
  fill_gaps() %>%
  mutate(Cash = replace_na(Cash, 0))

fit_models_atm <- atm_data %>%
  model(
    arima = ARIMA(Cash),
    ets = ETS(Cash),
    snaive = SNAIVE(Cash)
  )

#ATM 1

fit_models_atm %>%
  filter(ATM == "ATM1") %>%
  select(arima) %>%
  gg_tsresiduals()
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## ℹ Graphics functions have been moved to the {ggtime} package. Please use
##   `library(ggtime)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#ATM 2

fit_models_atm %>%
  filter(ATM == "ATM2") %>%
  select(arima) %>%
  gg_tsresiduals()

#ATM 3

fit_models_atm %>%
  filter(ATM == "ATM3") %>%
  select(arima) %>%
  gg_tsresiduals()

#ATM 4

fit_models_atm %>%
  filter(ATM == "ATM4") %>%
  select(arima) %>%
  gg_tsresiduals()

#Forecasting:

acc_tables_atm <- accuracy(fit_models_atm)

# select best models
best_models <- acc_tables_atm %>%
  group_by(ATM) %>%
  arrange(RMSE, MAE, .by_group = TRUE) %>%
  slice(1) %>%
  ungroup() %>%
  select(ATM, .model)

# forecast
future <- new_data(atm_data, 31)
## Grouping structure is ignored.
## ℹ `ungroup()` to silence this message.
# Forecast with ALL models 
fc_all <- forecast(fit_models_atm, new_data = future)

# Keep only the chosen model per ATM
final_fc <- fc_all %>%
  inner_join(best_models, by = c("ATM", ".model"))

# write to excel
final_fc %>%
  write_xlsx('May2010_ATM_Forecasts.xlsx')
## Warning in write_xlsx(., "May2010_ATM_Forecasts.xlsx"): Column 'Cash' has
## unrecognized data type.

#Forecasting Plots:

# Plot forecasts for each ATM
autoplot(final_fc, atm_data) +
  labs(title = "Forecasted Cash Withdrawals for May 2010", y = "USD") +
  facet_wrap(~ATM, scales = "free_y") +
  xlab("Date") +
  ylab("Withdrawal (USD)")+
  theme(
    plot.title = element_text(hjust=0.5, face='bold')
  )
## Warning: `autoplot.fbl_ts()` was deprecated in fabletools 0.6.0.
## ℹ Please use `ggtime::autoplot.fbl_ts()` instead.
## ℹ Graphics functions have been moved to the {ggtime} package. Please use
##   `library(ggtime)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `mutate_if()` ignored the following grouping variables:
## • Column `ATM`

#Part B

#Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.

#Loading data

url <- "https://raw.githubusercontent.com/zahid607/Data624/refs/heads/main/ResidentialCustomerForecastLoad-624.csv"

power_data <- read_csv(url)
## Rows: 192 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): YYYY-MMM
## dbl (2): CaseSequence, KWH
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(power_data)
## Rows: 192
## Columns: 3
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ `YYYY-MMM`   <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ KWH          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…

#Data Preparation

# convert YYYY-MMM column into date format and convert data into tsibble for time series
power_data <- power_data %>%
  mutate(
    Date = yearmonth(`YYYY-MMM`)  # convert to yearmonth object
  ) %>%
  select(CaseSequence, Date, KWH) %>%
  as_tsibble(index = Date)

summary(power_data$KWH)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   770523  5429912  6283324  6502475  7620524 10655730        1

Range

range(power_data$Date)
## <yearmonth[2]>
## [1] "1998 Jan" "2013 Dec"

#Missing value

power_data %>%
  filter(is.na(KWH))
## # A tsibble: 1 x 3 [1M]
##   CaseSequence     Date   KWH
##          <dbl>    <mth> <dbl>
## 1          861 2008 Sep    NA

#Plotting the power time series

autoplot(power_data, KWH) +
  labs(
    title = "Residential Power Usage\nBetween January 1998 and December 2013",
    x = "Month",
    y = "Power (KWH)"
  )

#Impute missing September values with the average September KWH across all years

power_data <- power_data %>%
  mutate(
    month = month(Date, label = TRUE),   # extract month name
    KWH = ifelse(
      is.na(KWH) & month == "Sep",
      mean(KWH[month == "Sep"], na.rm = TRUE),  # average KWH of all Septembers
      KWH
    )
  ) %>%
  select(-month)

#Plotting the power time series

autoplot(power_data, KWH) +
  labs(
    title = "Residential Power Usage\nBetween January 1998 and December 2013",
    x = "Month",
    y = "Power (KWH)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  )

#Looking Closer at the Outlier

power_data %>%
  filter(KWH<3000000)
## # A tsibble: 1 x 3 [1M]
##   CaseSequence     Date    KWH
##          <dbl>    <mth>  <dbl>
## 1          883 2010 Jul 770523

#Variance Check

# box cox
power_lambda <- power_data %>%
  features(KWH, features = guerrero) %>%
  pull(lambda_guerrero)

power_lambda
## [1] 0.1226627

#Comment: The lambda is 0.1226627, which is very close to 0. I will apply a box-cox transformation with λ=0.1226627.

#Applying box-cox transformation

power_data$KWH_transformed <- box_cox(power_data$KWH, power_lambda)

# plot after transformation
autoplot(power_data, KWH_transformed) +
  labs(
    title = "Residential Power Usage\nBetween January 1998 and December 2013\nBox-Cox Transformation",
    x = "Month",
    y = "Power (KWH)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  )

#Model Development:

# fit models
fit_models_power <- power_data %>%
  model(
    ets = ETS(KWH),
    arima = ARIMA(KWH),
    stl_ets = decomposition_model(
      STL(KWH ~ season(window = "periodic")),
      ETS(season_adjust ~ error("A") + trend("A") + season("A"))
    )
  )

#Model overview

# MODEL INFO
glimpse(fit_models_power)
## Rows: 1
## Columns: 3
## $ ets     <model> [ETS(M,N,M)]
## $ arima   <model> [ARIMA(0,0,1)(1,1,1)[12] w/ drift]
## $ stl_ets <model> [STL decomposition model]

#Best Model Selection

# Check accuracy for each model - focusing on MAE, RMSE, and MAPE
accuracy(fit_models_power)
## # A tibble: 3 × 10
##   .model  .type         ME    RMSE     MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>   <chr>      <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 ets     Training  62242. 826660. 500307. -4.33  12.0 0.729 0.705 0.173 
## 2 arima   Training -25758. 823919. 489801. -5.52  11.6 0.714 0.702 0.0131
## 3 stl_ets Training  53043. 821514. 504335. -4.46  11.8 0.735 0.700 0.233

#Forecasting

# ARIMA model
arima_model <- power_data %>%
  model(ARIMA(KWH))

# future 12 months
future_power <- new_data(power_data,12)

# generate forecasts
fc_power <- forecast(arima_model, new_data = future_power)

# write to excel
fc_power %>%
  write_xlsx('2014_Power_Usage_Forecasts.xlsx')
## Warning in write_xlsx(., "2014_Power_Usage_Forecasts.xlsx"): Column 'KWH' has
## unrecognized data type.

#Forecasting Plot:

# plot forecasts
autoplot(fc_power, power_data) +
  labs(
    title = "Residential Power Usage Forecast for 2014\n(ARIMA Model)",
    x = "Month",
    y = "Power (KWH)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face='bold')
  )

#Final Result

fc_power
## # A fable: 12 x 4 [1M]
## # Key:     .model [1]
##    .model         Date
##    <chr>         <mth>
##  1 ARIMA(KWH) 2014 Jan
##  2 ARIMA(KWH) 2014 Feb
##  3 ARIMA(KWH) 2014 Mar
##  4 ARIMA(KWH) 2014 Apr
##  5 ARIMA(KWH) 2014 May
##  6 ARIMA(KWH) 2014 Jun
##  7 ARIMA(KWH) 2014 Jul
##  8 ARIMA(KWH) 2014 Aug
##  9 ARIMA(KWH) 2014 Sep
## 10 ARIMA(KWH) 2014 Oct
## 11 ARIMA(KWH) 2014 Nov
## 12 ARIMA(KWH) 2014 Dec
## # ℹ 2 more variables: KWH <dist>, .mean <dbl>