Data 624 project 1

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.1     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forecast)
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
library(lubridate)
library(readxl)
Warning: package 'readxl' was built under R version 4.5.3
library(writexl)
Warning: package 'writexl' was built under R version 4.5.3

Part A – ATM Forecast, ATM624Data.xlsx

First we convert the date to standard date format year, month, day format to make it easy to read.

#Convert date column and remove empty money columns
df <- read_excel("ATM624Data.xlsx") %>%
  mutate(Date = as.Date(DATE, origin = "1899-12-30")) %>%
  filter(!is.na(Cash))

When we analyized the data I noticed that ATM3 is missing almost the entire years worth of data. In a business context I would assume that ATM3 was out of service or the data is missing so it was removed from the final model.

zero_counts <- df %>%
  group_by(ATM) %>%
  summarise(
    Total_Days = n(),
    Zero_Count = sum(Cash == 0, na.rm = TRUE),
    NA_Count = sum(is.na(Cash)),
    Percent_Zero = (Zero_Count / Total_Days) * 100
  )

# View the results
print(zero_counts)
# A tibble: 4 × 5
  ATM   Total_Days Zero_Count NA_Count Percent_Zero
  <chr>      <int>      <int>    <int>        <dbl>
1 ATM1         362          0        0        0    
2 ATM2         363          2        0        0.551
3 ATM3         365        362        0       99.2  
4 ATM4         365          0        0        0    

Here I train the model our model. Converted zero to NA and used to TSclean function to remove them and smooth any extreme outliers, to make sure the model training wan’t corrupted. When it comes to picking a model Auto-ARIMA was choosen and the Seasonality was set to weekly. Linear interpolation was used for missing values to maintain the weekly pattern. SMA wasn’t used since it lags behind trends and fails to capture specific day of the week seasonality and Linear Regression assume the data points are independent which isn’t true in the case of an ATM

train_df <- df %>% filter(Date < as.Date("2010-05-01"))

#Creating Forecasting Function
get_atm_forecast <- function(atm_id){
  #Adding in cleaning step due to atm3 and atm4
  temp_data <- train_df %>%
    filter(ATM == atm_id) %>%
    #Converting 0 as NA for tsclean
    mutate(Cash = ifelse(Cash <= 0, NA, Cash))
    
  #Creating a time series and cleaning it
  atm_ts <- ts(temp_data$Cash, frequency = 7) %>%
    tsclean()
  
  # Fit Model Auto-ARIMA
  fit <- auto.arima(atm_ts, stepwise = FALSE, approximation = FALSE)
  
  # Month Forecast
  fc <- forecast(fit, h = 31)
  return(as.vector(fc$mean))
}

The final forecast shows that ATM 1 has a clear weekly pattern with projections staying mostly within 75 and 100. ATM2 has extremely high volatility with high peaks and low days. After the removing the outliers ATM4 has exetremely high sustained volume, hovering around $450.

atms <- c("ATM1", "ATM2", "ATM4")
forecast_results <- map_dfc(atms, get_atm_forecast)
New names:
• `` -> `...1`
• `` -> `...2`
• `` -> `...3`
colnames(forecast_results) <- atms

may_dates <- seq(as.Date("2010-05-01"), as.Date("2010-05-31"), by="day")
final_forecast <- cbind(Date = may_dates, forecast_results)

write_xlsx(final_forecast, "ATM_May_2010_Forecast.xlsx")

Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

library(readxl)
library(forecast)
library(ggplot2)
library(tsoutliers)
Warning: package 'tsoutliers' was built under R version 4.5.3

There’s a large drop off around 2010. We’ll need to account for that outlier when we’re making the model.

data <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")

#Creating time series object
kwh_ts <- ts(data$KWH, start = c(1998, 1), frequency = 12)

autoplot(kwh_ts) + 
  labs(title="Residential Power Consumption", y="KWH", x="Year")

Cleaning the data and removing the outlier

#TSclean Identify and replaces them outliers with interpolated values)
kwh_ts_clean <- tsclean(kwh_ts)

autoplot(kwh_ts, series="Original") +
  autolayer(kwh_ts_clean, series="Cleaned") +
  labs(title="Original vs. Outlier-Cleaned Data", y="KWH") +
  scale_color_manual(values=c("Original"="red", "Cleaned"="black"))

I used seasonal ARIMA similar the previous problem the only main difference is I used didn’t use stepwise = FALSE to force the model to work harder to find the best AIC score

fit_arima_clean <- auto.arima(kwh_ts_clean)

forecast_2014 <- forecast(fit_arima_clean, h=12)

autoplot(forecast_2014) + 
  theme_minimal() +
  labs(title="2014 KWH Forecast", y="KWH")

print(forecast_2014)
         Point Forecast   Lo 80    Hi 80   Lo 95    Hi 95
Jan 2014        9421697 8676510 10166883 8282032 10561361
Feb 2014        8568792 7784509  9353074 7369335  9768248
Mar 2014        6604824 5819961  7389686 5404480  7805167
Apr 2014        6005782 5196618  6814947 4768272  7243293
May 2014        5915475 5106309  6724641 4677963  7152988
Jun 2014        8185075 7375121  8995029 6946357  9423792
Jul 2014        9528425 8717769 10339081 8288633 10768217
Aug 2014        9990243 9179566 10800921 8750419 11230067
Sep 2014        8547264 7736450  9358078 7307231  9787297
Oct 2014        5856069 5045252  6666886 4616031  7096107
Nov 2014        6152739 5341922  6963556 4912701  7392777
Dec 2014        7720467 6909641  8531293 6480416  8960518
forecast_df <- as.data.frame(forecast_2014)
write.csv(forecast_df, "2014_KWH_Forecast.csv")

#Part C - BONUS, optional (part or all), Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx

library(readxl)
library(dplyr)
library(lubridate)
library(tseries)
library(forecast)
library(openxlsx)
Warning: package 'openxlsx' was built under R version 4.5.3

Function to convert to day time, aggregate by hour, and find mean within the same hour

library(readxl)
library(dplyr)
library(lubridate)
library(tseries)
library(forecast)
library(openxlsx)
process_waterflow_data <- function(file_path) {
  
  df <- read_excel(file_path)
  
  #Using .POSIXct with origin="1899-12-30 to convert to date time"
    df$DateTimeConverted <- as.POSIXct(df[[1]] * 86400, origin = "1899-12-30", tz = "UTC")
  
  # Aggregate by floor hour and take the mean of WaterFlow
  # Floor hour turns "2023-10-27 15:34:21" into "2023-10-27 15:00:00"
  hourly_data <- df %>%
    mutate(
      DateTimeConverted = as.POSIXct(.[[1]] * 86400, origin = "1899-12-30", tz = "UTC"),
      Hour = floor_date(DateTimeConverted, unit = "hour")
    ) %>%
    group_by(Hour) %>%
    # Use the actual column name here:
    summarize(Mean_WaterFlow = mean(WaterFlow, na.rm = TRUE)) %>% 
    ungroup()
  
  return(hourly_data)
}
pipe1_hourly <- process_waterflow_data("Waterflow_Pipe1.xlsx")

start_time <- min(pipe1_hourly$Hour)
end_time <- max(pipe1_hourly$Hour)

waterflow_ts <- ts(pipe1_hourly$Mean_WaterFlow, frequency = 24)

Test for Stationarity

adf_test <- adf.test(waterflow_ts)
Warning in adf.test(waterflow_ts): p-value smaller than printed p-value
print(adf_test)

    Augmented Dickey-Fuller Test

data:  waterflow_ts
Dickey-Fuller = -6.2649, Lag order = 6, p-value = 0.01
alternative hypothesis: stationary
is_stationary <- adf_test$p.value < 0.05

The data is stationary so we can start forecasting

fit <- auto.arima(waterflow_ts, seasonal = TRUE)

pipe1_forecast <- forecast(fit, h = 168)

plot(pipe1_forecast, 
     main="Weekly Water Flow Forecast - Pipe 1", 
     xlab="Time (Hours)", 
     ylab="Flow Rate", 
     col="blue")

forecast_export <- data.frame(
  Hour_Ahead = 1:168,
  Point_Forecast = as.numeric(pipe1_forecast$mean),
  Lower_95 = as.numeric(pipe1_forecast$lower[,2]),
  Upper_95 = as.numeric(pipe1_forecast$upper[,2])
)

write.xlsx(forecast_export, "Waterflow_Pipe1_Forecast.xlsx")
Warning in file.create(to[okay]): cannot create file
'Waterflow_Pipe1_Forecast.xlsx', reason 'Permission denied'
write.xlsx(forecast_export, "Waterflow_Pipe1_Forecast.xlsx")
Warning in file.create(to[okay]): cannot create file
'Waterflow_Pipe1_Forecast.xlsx', reason 'Permission denied'