library(caret)
library(dplyr)
library(e1071)
library(fable)
library(forecast)
library(fpp2)
library(fpp3)
library(feasts)
library(ggplot2)
library(ggfortify)
library(GGally)
library(httr)
library(knitr)
library(lubridate)
library(latex2exp) 
library(readxl)
library(readr) 
library(tsibble)
library(tidyr)
library(writexl)

——————————————————————————–

Part A – ATM Forecast.

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable 'Cash' is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file; please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast, which you will put in an Excel-readable file.

Load data
atm_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS/refs/heads/main/DATA%20624/ATM624Data.csv"
atm_data <- read_csv(atm_url)
head(atm_data)
## # A tibble: 6 × 3
##   DATE                 ATM    Cash
##   <chr>                <chr> <dbl>
## 1 5/1/2009 12:00:00 AM ATM1     96
## 2 5/1/2009 12:00:00 AM ATM2    107
## 3 5/2/2009 12:00:00 AM ATM1     82
## 4 5/2/2009 12:00:00 AM ATM2     89
## 5 5/3/2009 12:00:00 AM ATM1     85
## 6 5/3/2009 12:00:00 AM ATM2     90
The data contains 14 blank rows covering May 1-14, 2010; these are the dates we will later forecast for each ATM. For now, I'll remove them.
# Remove the placeholder rows where ATM is blank (these are the May 1-14, 2010 dates)
atm_data <- atm_data %>%
  filter(!(is.na(ATM) | ATM == ""))
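As a quick sanity check, re-reading the raw file (since atm_data has just been overwritten) confirms the blank rows are the May 2010 placeholders; a minimal sketch:
# Sanity check: list the rows that were just removed
read_csv(atm_url) %>%
  filter(is.na(ATM) | ATM == "") %>%
  select(DATE, ATM, Cash)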
Impute missing values
# Impute missing values by averaging the previous and next observations.
# Note: this assumes missing values are isolated (no consecutive NAs) and
# do not occur at the first or last position of a series.
impute_cash <- function(x) {
  # Loop over the interior values
  for(i in 2:(length(x) - 1)) {
    if(is.na(x[i])) {
      # Replace a missing value with the average of its neighbors
      x[i] <- (x[i - 1] + x[i + 1]) / 2
    }
  }
  return(x)
}

# Apply the imputation function to the 'Cash' column 
atm_data_imputed <- atm_data %>%
  group_by(ATM) %>%
  mutate(Cash = impute_cash(Cash)) %>%
  ungroup()  

# Check for NA values
colSums(is.na(atm_data_imputed))
## DATE  ATM Cash 
##    0    0    0
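For reference, linear interpolation would give the same result for these isolated gaps while also handling runs of consecutive NAs; a minimal sketch, assuming the zoo package is installed:
# Alternative imputation via linear interpolation (not applied here)
library(zoo)
atm_data_interp <- atm_data %>%
  group_by(ATM) %>%
  mutate(Cash = zoo::na.approx(Cash, na.rm = FALSE)) %>%
  ungroup()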
Plot the 4 ATM cash withdrawals
# Convert DATE column to Date format 
atm_data_imputed$DATE <- as.Date(atm_data_imputed$DATE, format = "%m/%d/%Y")

# Convert to tsibble, using DATE as the index and ATM as the key
atm_data_tsibble <- as_tsibble(atm_data_imputed, index = DATE, key = ATM)

# Plot cash for all ATMs
atm_data_tsibble %>%
  autoplot(Cash)  + 
  labs(title = "ATM Cash Withdrawals", subtitle="05/01/2009 to 04/30/2010", x="Date")

# Faceted line plot by ATM
ggplot(atm_data_tsibble, aes(x = DATE, y = Cash, color = ATM, group = ATM)) +
  geom_line() +  
  labs(title = "Faceted ATM Cash Withdrawals",
       subtitle="05/01/2009 to 04/30/2010",
       x = "Date",
       y = "Cash",
       color = "ATM") +  
  theme_minimal() + 
  facet_wrap(~ ATM, scales = "free_y") +  
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Classical decomposition for each ATM: additive, since the magnitude of the seasonal fluctuations does not change with the level of the series
################################################################################
# Filter data for ATM 1
atm_1_data <- atm_data_tsibble %>%
  filter(ATM == "ATM1")  %>%
  as_tsibble(index = DATE)

# Classical decomposition for ATM 1
atm_1_dcmp <- atm_1_data %>%
  model(classical_decomposition(Cash, type = "additive")) %>%  
  components()

atm_1_dcmp %>% autoplot() +
  labs(title = "ATM 1 Cash Withdrawals: Classical Additive Decomposition")

################################################################################
# Filter data for ATM 2
atm_2_data <- atm_data_tsibble %>%
  filter(ATM == "ATM2")  %>%
  as_tsibble(index = DATE)

# Classical decomposition for ATM 2
atm_2_dcmp <- atm_2_data %>%
  model(classical_decomposition(Cash, type = "additive")) %>%  
  components()

atm_2_dcmp %>% autoplot() +
  labs(title = "ATM 2 Cash Withdrawals: Classical Additive Decomposition")

################################################################################
# Filter data for ATM 3
atm_3_data <- atm_data_tsibble %>%
  filter(ATM == "ATM3")  %>%
  as_tsibble(index = DATE)

# Classical decomposition for ATM 3
atm_3_dcmp <- atm_3_data %>%
  model(classical_decomposition(Cash, type = "additive")) %>%  
  components()

atm_3_dcmp %>% autoplot() +
  labs(title = "ATM 3 Cash Withdrawals: Classical Additive Decomposition")

################################################################################
# Filter data for ATM 4
atm_4_data <- atm_data_tsibble %>%
  filter(ATM == "ATM4")  %>%
  as_tsibble(index = DATE)

# Classical decomposition for ATM 4
atm_4_dcmp <- atm_4_data %>%
  model(classical_decomposition(Cash, type = "additive")) %>%  
  components()

atm_4_dcmp %>% autoplot() +
  labs(title = "ATM 4 Cash Withdrawals: Classical Additive Decomposition")

Forecast with the seasonal naive method, as there is seasonality for all 4 ATMs
# Recreate the tsibble from the imputed data and fill any gaps in the index
atm_data_tsibble <- atm_data_imputed %>%
  as_tsibble(index = DATE, key = ATM) %>%
  fill_gaps()

################################################################################
# Fit Seasonal Naive model for a single ATM
#atm1_snaive_model <- atm_data_tsibble %>%
#  filter(ATM == "ATM1") %>%
#  model(SNAIVE(Cash))

# Forecast the next 14 days for ATM 1 (May 1-14)
#atm_snaive_forecast <- atm1_snaive_model %>%
#  forecast(h = 14)

# Visualize the forecast
#atm_snaive_forecast %>%
#  autoplot(atm_data_tsibble) +
#  labs(title = "ATM 1", subtitle = "Seasonal Naive Forecast" , x = "Date", y = "Cash Withdrawals")

################################################################################
# Seasonal Naive Forecast for all 4 ATMs
atm_snaive_forecast_all <- atm_data_tsibble %>%
  model(SNAIVE(Cash)) %>%
  forecast(h = 14)

# Plot forecast
atm_snaive_forecast_all %>%
  autoplot(atm_data_tsibble) +
  facet_wrap(~ATM, scales = "free_y") + 
  labs(title = "14-Day Seasonal Naive Forecast for ATM Cash Withdrawals", subtitle = "May 1 - 14, 2010", x = "Date", y = "Cash Withdrawals")

The forecasts for ATMs 1, 2, and 4 all look reasonable given the seasonality and overall trend of the data. Looking at the actual forecasted values, the seasonal component is a repeating pattern for all 4 ATMs, exactly as it appears in the decomposition plots. The trend and random error components are rather sporadic; I can't say there is a clear overall trend. It is for this reason that I chose the SNAIVE method, staying close to the more recent values, which carry more weight than older trend components.
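As a supporting check on that choice, the in-sample accuracy of a few benchmark methods can be compared per ATM; a minimal sketch:
# Compare training accuracy of simple benchmarks for each ATM
atm_data_tsibble %>%
  model(snaive = SNAIVE(Cash),
        naive  = NAIVE(Cash),
        mean   = MEAN(Cash)) %>%
  accuracy() %>%
  arrange(ATM, RMSE)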

One strange outlier is ATM4, which had rather steady cash withdrawals for almost its entire history, apart from a recorded Cash value of 10,920 (in hundreds of dollars) on February 9th, 2010. I could have ruled this out as an error and replaced it with a more typical value, but I'd only consider that worthwhile if it had occurred within May of 2009. I'll leave it in because it could have been something of note: perhaps someone withdrew a lot of money at once, or it marked an important day in the world of finance.
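Had I chosen to replace it, a sketch of one way to do so (the 5,000 cutoff is an arbitrary illustration, not a value from the data):
# Hypothetical: replace the ATM 4 outlier with the ATM 4 median (not applied)
atm_data_tsibble %>%
  group_by(ATM) %>%
  mutate(Cash = ifelse(ATM == "ATM4" & Cash > 5000, median(Cash), Cash)) %>%
  ungroup()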

The true outlier among the ATMs is ATM3, which had 0 withdrawals up until April 27, 2010, followed by withdrawals of 96, 82, and 85 over the next 3 days. I could have assumed these are just random transactions and that the ATM would likely see no withdrawals, as it did for 99% of the time series. However, I am assuming this ATM only recently opened, which is why transaction data is now appearing for it, so I will stick with the seasonal naive forecast for all 4 ATMs.

Check ACF of the existing actual ATM data

atm_data_tsibble %>%
  ACF(Cash) %>%
  autoplot() +
  labs(title = "ACF of ATM Cash Withdrawals", subtitle="05/01/2009 to 04/30/2010", x = "Lag", y = "ACF")

Since I have no actual data for May, I cannot calculate residuals by subtracting forecasted values from actual May values. Instead, I will plot the autocorrelations (ACF) of the existing data. Looking at the ACF plots for the 4 ATMs, ATM 1 has large spikes at lags 7, 14, and 21. ATM 2 shows the same behavior, whereas ATM 4 has fairly small autocorrelations centered around zero. ATM 3, as expected, has large spikes given that it likely only began operating recently, though the spikes decay with increasing lag. The spikes at lags 7, 14, and 21 suggest a seasonal pattern in cash withdrawals for those two ATMs, most likely a weekly cycle given this is daily data. If customers withdraw more cash from an ATM on a given day, say a payday, it will create this pattern.
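Although out-of-sample residuals aren't available, the in-sample innovation residuals of the fitted SNAIVE model can still be inspected; a sketch for ATM 1:
# In-sample residual diagnostics for the ATM 1 seasonal naive fit
atm_data_tsibble %>%
  filter(ATM == "ATM1") %>%
  model(SNAIVE(Cash)) %>%
  gg_tsresiduals()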

Output data
# Convert atm_data to data frame for compatibility
atm_data_df <- as.data.frame(atm_data)

# Convert DATE in atm_snaive_forecast_all to Date format
atm_snaive_forecast_all <- atm_snaive_forecast_all %>%
  mutate(DATE = as.Date(DATE)) 

# Create a formatted version of DATE in month/day/year format, removing leading zeros
atm_snaive_forecast_all <- atm_snaive_forecast_all %>%
  mutate(DATE_str = format(DATE, "%m/%d/%Y"),
         DATE_str = gsub("^0", "", DATE_str),
         # If the day still has a leading zero (now the 3rd character), remove it
         DATE_str = ifelse(substr(DATE_str, 3, 3) == "0",
                           paste0(substr(DATE_str, 1, 2), substr(DATE_str, 4, nchar(DATE_str))),
                           DATE_str),
         DATE_str = paste(DATE_str, "12:00:00 AM"))  # Append "12:00:00 AM" to match the source format

# Convert to a data frame
atm_snaive_forecast_all_df <- as.data.frame(atm_snaive_forecast_all)

# Select the relevant columns from the forecast
atm_forecast_df <- atm_snaive_forecast_all_df %>%
  select(DATE_str, ATM, .mean) %>%  
  rename(Cash = .mean, DATE = DATE_str) %>% 
  select(DATE, ATM, Cash) 

# Combine original data with forecasted data
atm_data_combined <- bind_rows(atm_data_df, atm_forecast_df)

# Output
write_xlsx(atm_data_combined, "atm_data_combined.xlsx")

——————————————————————————–

Part B – Forecasting Power

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and provide a monthly forecast for 2014. The data is given in a single file. The variable 'KWH' is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.

Load data
power_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS/refs/heads/main/DATA%20624/ResidentialCustomerForecastLoad-624.csv"
power_data <- read_csv(power_url)
Change date format
# Missing values
#colSums(is.na(power_data))

# Rename the 'YYYY-MMM' column to 'Date', make it date format
colnames(power_data)[colnames(power_data) == 'YYYY-MMM'] <- 'Date'

# Convert 'YYYY-MMM' strings to Date by appending a day (e.g., '1998-Jan' -> 1998-01-01)
power_data$Date <- as.Date(paste0(power_data$Date, "-01"), format = "%Y-%b-%d")
Plot the data
# Data overview
ggplot(power_data, aes(x = Date, y = KWH)) +
  geom_line() + 
  labs(title = "KWH Power Usage Over Time", subtitle = "1998-01-01 to 2013-12-01")

# Convert power_data into a tsibble 
power_data_tsibble <- power_data %>%
  as_tsibble(index = Date)
Impute missing value for 2008-Sep
# Method 1: Impute using the average value of September from all years
september_data <- power_data_tsibble %>%
  filter(month(Date) == 9 & year(Date) != 2008) %>%
  pull(KWH)  # Extract KWH values for September from all years

# Calculate the mean of the September values from all years (excluding 2008)
prediction_September <- mean(september_data, na.rm = TRUE)

################################################################################
# Method 2: Impute using the average KWH value for 2008
average_2008 <- power_data_tsibble %>%
  filter(year(Date) == 2008 & month(Date) != 9) %>%
  pull(KWH)

# Predicted value for September 2008 using Method 2 (average for 2008)
prediction_2008 <- mean(average_2008, na.rm = TRUE)

################################################################################
# Comparing the 2 values
cat("Predicted value for September 2008 (September Method): ", prediction_September, "\n")
## Predicted value for September 2008 (September Method):  7702333
cat("Predicted value for September 2008 (2008 Method): ", prediction_2008, "\n")
## Predicted value for September 2008 (2008 Method):  6357963

I considered two approaches for imputing the missing value for September 2008: looking at KWH consumption in September across all other years, or looking at the other months of 2008 alone. I think the 2008-only method is more apt, as it is more conservative and closer to the overall level of the series at that point. That said, since the data exhibit seasonality around a fairly stable level, I will also try an exponential smoothing approach, which assigns more weight to recent observations.

Impute by ETS Method
# Subset the data until 2008-08-01
power_data_tsibble_subset <- power_data_tsibble %>%
  filter(Date <= as.Date("2008-08-01"))

# Set index to a monthly frequency
power_data_tsibble_subset <- power_data_tsibble_subset %>%
  mutate(Date = yearmonth(Date)) %>%
  as_tsibble(index = Date)

# Fit an ETS model 
ets_model <- power_data_tsibble_subset %>%
  model(ETS(KWH ~ error("A") + trend("A") + season("A"))) 

# Forecast 2008-09-01
prediction_ets <- ets_model %>%
  forecast(h = 1) 

cat("Predicted value for September 2008 (ETS Method): ", prediction_ets$.mean, "\n")
## Predicted value for September 2008 (ETS Method):  7678438
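fable can also interpolate missing values directly from a fitted model; a minimal sketch of that alternative, using the monthly index:
# Alternative: model-based interpolation of the missing value (not applied here)
power_monthly <- power_data %>%
  mutate(Date = yearmonth(Date)) %>%
  as_tsibble(index = Date)

power_monthly %>%
  model(ARIMA(KWH)) %>%
  interpolate(power_monthly)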
Replace missing value for September 2008
# Convert to monthly intervals
power_data_tsibble <- power_data %>%
  mutate(Date = yearmonth(Date)) %>%  
  as_tsibble(index = Date)

power_data_tsibble <- power_data_tsibble %>%
  mutate(KWH = ifelse(Date == yearmonth("2008 Sep"), prediction_ets$.mean, KWH))
Seasonal plot: strange drop in July 2010
# Subseries plot: separate plot for each month across all years
gg_subseries(power_data_tsibble, KWH, labels = "both") +
  labs(title = "Monthly KWH Power Usage Across Years", x = "Month", y = "KWH") +
  theme_minimal()
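A quick check confirms that the series minimum does fall in July 2010; a minimal sketch:
# Identify the lowest KWH observation
power_data_tsibble %>%
  filter(KWH == min(KWH, na.rm = TRUE))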

Visualizing the components via STL decomposition
# Decompose using STL
stl_decomp <- power_data_tsibble %>%
  model(STL(KWH ~ trend(window = 7) + season(window = 13), robust = TRUE))  %>%
  components() 


autoplot(stl_decomp) + 
  labs(title = "Decomposition of KWH Consumption")

Check ACF/PACF
# ACF/PACF plot
power_data_tsibble %>%
  gg_tsdisplay(KWH, plot_type='partial') +
  labs(title = "ACF/PACF of KWH Consumption")

The ACF plot shows an alternating, sinusoidal pattern of values, which means I will likely need a seasonal ARIMA to forecast. The fluctuations repeat every 6 months, hence the consistent spikes at lags 6 and 12. This indicates that KWH consumption follows a consistent recurring pattern from year to year: a repeating rise-and-drop cycle on top of a positive trend. The exception is July 2010, which has a severely low outlier of 770,523 KWH. The 5 significant spikes in the partial ACF plot, decaying with lag, tell me there is an autoregressive component.
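As a supporting check on the differencing choices below, feasts' unit-root features can suggest the number of first and seasonal differences; a minimal sketch:
# Suggested first (d) and seasonal (D) differencing orders
power_data_tsibble %>%
  features(KWH, list(unitroot_ndiffs, unitroot_nsdiffs))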

Forecast 2014 values
# Fit Seasonal ARIMA model
sarima_model <- power_data_tsibble %>%
  model(ARIMA(KWH ~ pdq(1, 0, 0) + PDQ(1, 1, 0, period = 6)))


# Forecast the next 12 months 
forecast_sarima <- sarima_model %>%
  forecast(h = 12)

forecast_sarima %>%
  autoplot(power_data_tsibble) +
  labs(title = "KWH Consumption: 2014 Seasonal ARIMA Forecast", x = "Date", y = "KWH")

pdq(1, 0, 0) specifies an AR(1) model for the non-seasonal component, meaning the value of KWH is influenced by its value at t-1, consistent with the spikes in the ACF plot. d = 0 means no non-seasonal differencing was performed, as the data are already fairly stationary, and q = 0 means no moving average component, implying that prior innovation residuals don't have a significant impact on forecasting KWH. PDQ(1, 1, 0, period = 6) adds a seasonal AR(1) component, consistent with the last significant PACF spike occurring at a lag of 12 months. D = 1 applies one seasonal difference, motivated by the sinusoidal fluctuations in the ACF plot. Q = 0 means no seasonal moving average component, so past seasonal residuals again play no role. period = 6 reflects the ACF fluctuations recurring every 6 months.
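As a cross-check on these hand-picked orders, ARIMA() with no specification searches for a model by AICc; a minimal sketch:
# Let fable choose ARIMA orders automatically and report the selected model
power_data_tsibble %>%
  model(auto = ARIMA(KWH)) %>%
  report()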

Output data
# Select the relevant columns from the forecast
forecasted_data <- forecast_sarima %>%
  as_tibble() %>%  
  select(Date, .mean) %>% 
  rename(KWH = .mean)  

# Add CaseSequence to forecasted_data, continuing from 925
forecasted_data <- forecasted_data %>%
  mutate(CaseSequence = 924 + row_number())

# Combine original data with forecasted data
power_data_combined <- bind_rows(
  power_data_tsibble, 
  forecasted_data)

# Create a new column 'FormattedDate' with the 'YYYY-MMM' format as a character
power_data_combined <- power_data_combined %>%
  mutate(FormattedDate = paste0(year(Date), "-", month(Date, label = TRUE, abbr = TRUE)))

# Drop original 'Date' column
power_data_combined <- power_data_combined %>%
  as.data.frame() %>%
  select(-Date) %>%  
  rename(Date = FormattedDate)  


# Output data
write_xlsx(power_data_combined, "power_data_combined.xlsx")