library(tidyverse)
library(fpp3)
library(naniar) # Excellent for visualizing missingness
# 2. Read the raw data
url <- "https://raw.githubusercontent.com/uzmabb182/Data_624_Predictive_Analytics/refs/heads/main/ATM624Data.csv"
atm_df <- read_csv(url)
#view(atm_df)
# Summary of missing values by column
cat("--- Count of NAs per Column ---\n")
## --- Count of NAs per Column ---
print(colSums(is.na(atm_df)))
## DATE ATM Cash
## 0 14 19
# --- Step 2.1: Fixing Date Formats and Filtering ---
library(tidyverse)
library(fpp3)
library(lubridate)
# 1. Parse the complex date string and remove trailing empty rows
atm_ts <- atm_df |>
dplyr::mutate(
# mdy_hms handles the "6/13/2009 12:00:00 AM" format
DATE_clean = lubridate::mdy_hms(DATE) |> as_date()
) |>
# Remove the 14 rows where ATM is NA (the trailing May 2010 rows)
dplyr::filter(!is.na(ATM)) |>
dplyr::select(DATE = DATE_clean, ATM, Cash) |>
# Convert to tsibble: DATE is the index, ATM is the key
as_tsibble(index = DATE, key = ATM)
# 2. Verify the count of ATMs and that the date range is correct
atm_ts |>
as_tibble() |>
group_by(ATM) |>
summarise(Start = min(DATE), End = max(DATE), Rows = n())
## # A tibble: 4 × 4
## ATM Start End Rows
## <chr> <date> <date> <int>
## 1 ATM1 2009-05-01 2010-04-30 365
## 2 ATM2 2009-05-01 2010-04-30 365
## 3 ATM3 2009-05-01 2010-04-30 365
## 4 ATM4 2009-05-01 2010-04-30 365
We will use a Time Series Linear Model (TSLM) with trend and seasonality to estimate what those missing values should have been. This is a more robust technique than simple averages because it accounts for the”weekly pattern of ATM withdrawals.
# --- Step 2.2: Imputing Historical Gaps ---
# Apply interpolation using a Time Series Linear Model (TSLM)
# This uses the trend and seasonal patterns to fill the NAs
atm_final <- atm_ts |>
fill_gaps() |>
model(imputer = TSLM(Cash ~ trend() + season())) |>
interpolate(atm_ts)
# Confirm that all 1,460 rows (365 days * 4 ATMs) now have Cash values
atm_final |>
as_tibble() |>
summarise(Total_NAs = sum(is.na(Cash))) |>
print()
## # A tibble: 1 × 1
## Total_NAs
## <int>
## 1 0
# View the 5 rows that were specifically filled for June 2009
atm_final |>
filter(year(DATE) == 2009, month(DATE) == 6, is.na(Cash) == FALSE) |>
# We join with the original to see which ones changed
anti_join(as_tibble(atm_ts), by = c("DATE", "ATM", "Cash")) |>
print()
## # A tsibble: 5 x 3 [1D]
## # Key: ATM [2]
## ATM DATE Cash
## <chr> <date> <dbl>
## 1 ATM1 2009-06-13 99.8
## 2 ATM1 2009-06-16 92.8
## 3 ATM1 2009-06-22 89.0
## 4 ATM2 2009-06-18 34.3
## 5 ATM2 2009-06-24 52.1
# --- Step 3: Exploratory Data Analysis (EDA) ---
# Plot the time series for each ATM to identify trends and cycles
atm_final |>
autoplot(Cash) +
facet_wrap(~ATM, scales = "free_y", ncol = 1) +
labs(title = "Daily ATM Cash Withdrawals",
subtitle = "May 2009 - April 2010",
y = "Cash (Hundreds of dollars)") +
theme_minimal()
Looking at the plot, each ATM has a very different behavior. Now we need to address these unique behaviors before we can model them effectively.
ATM1 & ATM2: Both show a very clear, stable weekly seasonality (a 7-day cycle) with a relatively flat trend. This suggests that a Seasonal Naive (SNAIVE) model or a seasonal ETS or ARIMA model will perform well here.
ATM3: This is a special case. It has zero values for almost the entire year, with data only appearing in the final few days. Because there is no historical pattern to learn from, the most sensible forecast is a Naive approach or a simple mean based on those few active days.
ATM4: There is a massive outlier in early 2010 where cash withdrawals spike to nearly 10,000 (hundreds of dollars). And, such an extreme value will skew your model’s mean and variance.
we must clean the outlier in ATM4. If we leave it, the forecast for May 2010 will be unnaturally high.
We will use the simple replacement with the median/mean to bring that spike down to a reasonable level.
# --- Step 4: Handling the ATM4 Outlier ---
# Identify the extreme outlier in ATM4
# We'll replace any value over 5000 with the median for that ATM
# This is a standard pre-processing step from Applied Predictive Modeling
atm_final <- atm_final |>
dplyr::mutate(Cash = ifelse(ATM == "ATM4" & Cash > 5000, median(Cash, na.rm = TRUE), Cash))
# Re-visualize ATM4 to ensure the scale is now readable
atm_final |>
filter(ATM == "ATM4") |>
autoplot(Cash) +
labs(title = "ATM4: After Outlier Treatment",
y = "Cash (Hundreds of dollars)")
ATM4: Now that the extreme outlier is removed, we can see a much more
unpredictable behavior, high-variance series compared to ATM1 and ATM2.
It has some seasonality but is far more noisy. An ARIMA model is often
better at handling this kind of random behavior.
The next step is to define multiple models for each ATM.
# --- Step 5: Model Training ---
# We fit a variety of models to each ATM:
# ETS: Automatically handles trend and seasonality (Hyndman Ch 8)
# ARIMA: Automatically handles complex correlations (Hyndman Ch 9)
# SNAIVE: A benchmark that simply repeats the last week's values (Hyndman Ch 5)
atm_models <- atm_final |>
model(
ets = ETS(Cash),
arima = ARIMA(Cash),
snaive = SNAIVE(Cash)
)
# Display the trained model table
atm_models
## # A mable: 4 x 4
## # Key: ATM [4]
## ATM ets arima snaive
## <chr> <model> <model> <model>
## 1 ATM1 <ETS(A,N,A)> <ARIMA(0,0,1)(0,1,2)[7]> <SNAIVE>
## 2 ATM2 <ETS(A,N,A)> <ARIMA(2,0,2)(0,1,1)[7]> <SNAIVE>
## 3 ATM3 <ETS(A,N,N)> <ARIMA(0,0,2)> <SNAIVE>
## 4 ATM4 <ETS(M,N,M)> <ARIMA(3,0,2)(1,0,0)[7] w/ mean> <SNAIVE>
# --- Step 7.1: Augmented Data (List View) ---
# Use augment to see the fitted values and residuals for every day
atm_augmented <- atm_models |>
augment()
# View the first 10 rows for ATM1's ARIMA model
atm_augmented |>
dplyr::filter(ATM == "ATM1", .model == "arima") |>
dplyr::select(DATE, ATM, Cash, .fitted, .resid) |>
head(10) |>
print()
## # A tsibble: 10 x 5 [1D]
## # Key: ATM [1]
## DATE ATM Cash .fitted .resid
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2009-05-01 ATM1 96 95.9 0.0960
## 2 2009-05-02 ATM1 82 81.9 0.0820
## 3 2009-05-03 ATM1 85 84.9 0.0850
## 4 2009-05-04 ATM1 90 89.9 0.0900
## 5 2009-05-05 ATM1 99 98.9 0.0990
## 6 2009-05-06 ATM1 88 87.9 0.0880
## 7 2009-05-07 ATM1 8 7.99 0.00800
## 8 2009-05-08 ATM1 104 97.2 6.78
## 9 2009-05-09 ATM1 87 83.9 3.14
## 10 2009-05-10 ATM1 93 86.7 6.33
# Check a specific date (like a Monday) to see if it captures the pattern
# This gives you that "business feeling" of double-checking the model's logic
atm_augmented |>
dplyr::filter(ATM == "ATM1", .model == "arima") |>
dplyr::slice(100:110) |> # Looking at a random slice in the middle
dplyr::select(DATE, ATM, Cash, .fitted)
## # A tsibble: 11 x 4 [1D]
## # Key: ATM [1]
## DATE ATM Cash .fitted
## <date> <chr> <dbl> <dbl>
## 1 2009-08-08 ATM1 105 102.
## 2 2009-08-09 ATM1 123 91.0
## 3 2009-08-10 ATM1 114 99.0
## 4 2009-08-11 ATM1 127 109.
## 5 2009-08-12 ATM1 111 84.9
## 6 2009-08-13 ATM1 34 18.2
## 7 2009-08-14 ATM1 151 107.
## 8 2009-08-15 ATM1 110 110.
## 9 2009-08-16 ATM1 115 108.
## 10 2009-08-17 ATM1 112 105.
## 11 2009-08-18 ATM1 132 118.
the models have successfully captured the underlying rhythms of the data.
In this step, we will use the forecast() function to predict the next 31 days (May 2010) for all four ATMs simultaneously. We will use a “h” (horizon) of 31 days.
# --- Step 8: Generating the Forecast ---
# Generate the forecast for the 31 days of May 2010
# The forecast() function automatically applies the best model
# found in the previous steps for each ATM.
atm_forecast <- atm_models |>
forecast(h = "31 days")
# Visualize the forecast alongside the historical data
# This plot will show the "confidence intervals" (the shaded areas)
atm_forecast |>
autoplot(atm_final) +
facet_wrap(~ATM, scales = "free_y") +
labs(title = "ATM Cash Withdrawal Forecast: May 2010",
subtitle = "Forecasts include 80% and 95% confidence intervals",
y = "Cash (Hundreds)",
x = "Date") +
theme_minimal()
# Interpretation of Final Forecasts
ATM1 & ATM2: Your forecast successfully continues the strong 7-day weekly cycles.
ATM3: You will notice the forecast is essentially a flat line or a very simple trend because there were only a few days of data at the very end of April to learn from.
ATM4: The forecast is less rhythmic than ATM1/ATM2, reflecting the high-variance noisy behavior we saw in the historical plots even after outlier treatment.
The final step is to extract the numerical forecast values for May 2010 and save them into the Excel-readable format.
Now we will filter for the best models and export the mean forecast values.
# --- Step 9: Final Forecast Export ---
# Extract the mean forecast values for May 2010
# We select the '.mean' column which contains the actual forecast numbers
final_forecast_values <- atm_forecast |>
dplyr::as_tibble() |>
dplyr::select(DATE, ATM, .model, .mean) |>
# pivotting this so it's easy to read in Excel: Days as rows, ATMs as columns
tidyr::pivot_wider(names_from = ATM, values_from = .mean)
# View the first few days of the forecast
head(final_forecast_values) |>
print()
## # A tibble: 6 × 6
## DATE .model ATM1 ATM2 ATM3 ATM4
## <date> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2010-05-01 ets 87.0 68.4 84.6 287.
## 2 2010-05-02 ets 101. 74.2 84.6 364.
## 3 2010-05-03 ets 73.2 11.1 84.6 354.
## 4 2010-05-04 ets 5.67 2.13 84.6 50.0
## 5 2010-05-05 ets 100. 102. 84.6 438.
## 6 2010-05-06 ets 79.4 92.4 84.6 306.
# Save to your Downloads folder as an Excel-readable CSV
submission_path <- "C:/Users/Uzma/Downloads/ATM_May2010_Forecast_Submission.csv"
final_forecast_values |>
readr::write_csv(file = submission_path)
cat("Success! May 2010 forecast:", submission_path)
## Success! May 2010 forecast: C:/Users/Uzma/Downloads/ATM_May2010_Forecast_Submission.csv