library(tidyverse)
library(readxl)
library(lubridate)
library(forecast)
library(urca)
library(zoo)
library(openxlsx)
library(xts)
library(tseries)

Create an Excel file

wb <- createWorkbook()

Part A – ATM Forecast

Load ATM data

atm_data <- read_excel("ATM624Data.xlsx")
head(atm_data)
## # A tibble: 6 × 3
##   DATE                ATM    Cash
##   <dttm>              <chr> <dbl>
## 1 2009-05-01 00:00:00 ATM1     96
## 2 2009-05-01 00:00:00 ATM2    107
## 3 2009-05-02 00:00:00 ATM1     82
## 4 2009-05-02 00:00:00 ATM2     89
## 5 2009-05-03 00:00:00 ATM1     85
## 6 2009-05-03 00:00:00 ATM2     90

Convert to time series and convert to actual dollar amounts

atm_ts <- atm_data %>% group_by(ATM) %>%
  mutate(Cash = Cash * 100)

Filter data by ATM ID

atm_data_atm1 <- atm_ts %>% filter(ATM == "ATM1")
atm_data_atm2 <- atm_ts %>% filter(ATM == "ATM2")
atm_data_atm3 <- atm_ts %>% filter(ATM == "ATM3")
atm_data_atm4 <- atm_ts %>% filter(ATM == "ATM4")

ATM3 Due to the presence of an extended period of zeros in the first 362 entries of the ATM3 dataset, I will segment the data and apply an ARIMA model specifically to the non-zero segment of the time series.

# Find the index where non-zero data starts (skip the initial zeros)
non_zero_index <- which(atm_data_atm3$Cash > 0)[1]
# Subset the data to get only the non-zero part of the series
non_zero_data <- atm_data_atm3$Cash[non_zero_index:nrow(atm_data_atm3)]

Create time series objects (assuming daily frequency)

atm_ts1 <- ts(atm_data_atm1$Cash, start = c(2009, 5), frequency = 365)
atm_ts2 <- ts(atm_data_atm2$Cash, start = c(2009, 5), frequency = 365)
atm_ts3 <- ts(non_zero_data, start = c(2009, 5), frequency = 365)
atm_ts4 <- ts(atm_data_atm4$Cash, start = c(2009, 5), frequency = 365)

Visualize the time series for ATM1, ATM2, ATM3, and ATM4

plot(atm_ts1, main = "ATM1 Daily Cash Withdrawals", xlab = "Date", ylab = "Cash Withdrawn ($)", col = "darkblue")

plot(atm_ts2, main = "ATM2 Daily Cash Withdrawals", xlab = "Date", ylab = "Cash Withdrawn ($)", col = "red3")

plot(atm_ts3, main = "ATM3 Daily Cash Withdrawals", xlab = "Date", ylab = "Cash Withdrawn ($)", col = "darkgreen")

plot(atm_ts4, main = "ATM4 Daily Cash Withdrawals", xlab = "Date", ylab = "Cash Withdrawn ($)", col = "orange2")

Impute Missing Values using linear interpolation

atm_ts1 <- na.approx(atm_ts1)
atm_ts2 <- na.approx(atm_ts2)
atm_ts3 <- na.approx(atm_ts3)
atm_ts4 <- na.approx(atm_ts4)

Check for stationarity using ADF test for all ATMs

ATM1

adf_test_atm1 <- ur.df(atm_ts1, type = "drift", selectlags = "AIC")
summary(adf_test_atm1)
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8210.0 -1444.9   534.9  2392.1 10502.2 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9407.98709  621.29306   15.14  < 2e-16 ***
## z.lag.1       -1.11821    0.07035  -15.89  < 2e-16 ***
## z.diff.lag     0.20327    0.05160    3.94  9.8e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3597 on 360 degrees of freedom
## Multiple R-squared:  0.4868, Adjusted R-squared:  0.4839 
## F-statistic: 170.7 on 2 and 360 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -15.8953 126.3302 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1  6.47  4.61  3.79

Since the test statistic is smaller than the critical values, the ATM1 data are stationary and ready for forecasting.

ATM2

adf_test_atm2 <- ur.df(atm_ts2, type = "drift", selectlags = "AIC")
summary(adf_test_atm2)
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7768  -3202    261   2629   9845 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8282.13961  470.31026  17.610  < 2e-16 ***
## z.lag.1       -1.32707    0.06886 -19.271  < 2e-16 ***
## z.diff.lag     0.35554    0.04920   7.227 2.98e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3637 on 360 degrees of freedom
## Multiple R-squared:  0.5542, Adjusted R-squared:  0.5518 
## F-statistic: 223.8 on 2 and 360 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -19.2712 185.6902 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1  6.47  4.61  3.79

Since the test statistic is smaller than the critical values, the *ATM2 data are stationary and ready for forecasting.

ATM3 Due to the limited non-zero data for ATM3, I will not conduct the ADF test on it.

ATM4

adf_test_atm4 <- ur.df(atm_ts4, type = "drift", selectlags = "AIC")
summary(adf_test_atm4)
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -48574  -35367   -7024   22497 1043405 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.935e+04  4.938e+03   9.995   <2e-16 ***
## z.lag.1     -1.043e+00  7.485e-02 -13.936   <2e-16 ***
## z.diff.lag   3.332e-02  5.269e-02   0.632    0.527    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 65390 on 360 degrees of freedom
## Multiple R-squared:  0.5053, Adjusted R-squared:  0.5025 
## F-statistic: 183.8 on 2 and 360 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -13.9355 97.0998 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1  6.47  4.61  3.79

Since the test statistic is smaller than the critical values, the *ATM4 data are stationary and ready for forecasting.

Fit ARIMA Model for each ATM

I am utilizing the ARIMA model for forecasting ATM cash withdrawals due to its capacity to effectively analyze both stationary and non-stationary time series data, which facilitates the incorporation of seasonal trends. The model adeptly balances historical trends with random fluctuations, alleviating the need for manual parameter selection. Moreover, ARIMA is acknowledged as a reliable and extensively employed approach in the field of financial forecasting, further reinforcing its suitability for this analysis.

ATM1:

fit_atm1 <- auto.arima(atm_ts1, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(fit_atm1)
## Series: atm_ts1 
## ARIMA(5,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5       mean
##       0.0608  -0.2201  -0.0613  -0.0645  -0.2186  8417.5960
## s.e.  0.0510   0.0509   0.0521   0.0511   0.0511   121.3353
## 
## sigma^2 = 12264934:  log likelihood = -3493.88
## AIC=7001.76   AICc=7002.07   BIC=7029.06
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE MASE         ACF1
## Training set 2.314618 3473.229 2650.806 -150.0205 172.7121  NaN -0.001133594

ATM2:

fit_atm2 <- auto.arima(atm_ts2, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(fit_atm2)
## Series: atm_ts2 
## ARIMA(0,1,5) with drift 
## 
## Coefficients:
##           ma1      ma2     ma3     ma4      ma5    drift
##       -1.0068  -0.3767  0.0479  0.6433  -0.2967  -5.9819
## s.e.   0.0499   0.0606  0.0689  0.0815   0.0546   2.4169
## 
## sigma^2 = 11783873:  log likelihood = -3479.57
## AIC=6973.15   AICc=6973.46   BIC=7000.43
## 
## Training set error measures:
##                     ME     RMSE      MAE  MPE MAPE MASE         ACF1
## Training set -6.278947 3399.688 2831.799 -Inf  Inf  NaN -0.002954414
fit_atm3 <- auto.arima(atm_ts3, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(fit_atm3)
## Series: atm_ts3 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##            mean
##       8766.6667
## s.e.   347.4864
## 
## sigma^2 = 543333:  log likelihood = -23.46
## AIC=50.91   AICc=Inf   BIC=49.11
## 
## Training set error measures:
##                         ME    RMSE      MAE        MPE     MAPE MASE      ACF1
## Training set -1.212669e-12 601.849 555.5556 -0.4557562 6.242793  NaN -0.295501

ATM4:

fit_atm4 <- auto.arima(atm_ts4, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(fit_atm4)
## Series: atm_ts4 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##            mean
##       47404.334
## s.e.   3402.548
## 
## sigma^2 = 4.237e+09:  log likelihood = -4562.92
## AIC=9129.84   AICc=9129.87   BIC=9137.64
## 
## Training set error measures:
##                         ME     RMSE      MAE       MPE     MAPE MASE
## Training set -2.657196e-12 65004.37 32360.01 -616.5219 646.8627  NaN
##                      ACF1
## Training set -0.009358419

Forecast ATM Withdrawals for May 2010 (31 Days)

forecast_atm1 <- forecast(fit_atm1, h = 31)
forecast_atm2 <- forecast(fit_atm2, h = 31)
forecast_atm3 <- forecast(fit_atm3, h = 31)
forecast_atm4 <- forecast(fit_atm4, h = 31)

Plot Forecasts

ATM1:

autoplot(forecast_atm1) + ggtitle("ATM1 Cash Withdrawal Forecast")

The forecast indicates that the overall trend in ATM1 cash withdrawals remains consistent.

ATM2:

autoplot(forecast_atm2) + ggtitle("ATM2 Cash Withdrawal Forecast")

The forecasting model indicates a slight overall downward trend in the data.

ATM3:

autoplot(forecast_atm3) + ggtitle("ATM3 Cash Withdrawal Forecast")

Due to insufficient non-zero data for forecasting, the model chose to use the mean for predictions.

ATM4:

autoplot(forecast_atm4) + ggtitle("ATM4 Cash Withdrawal Forecast")

Since the data shows no clear trends or seasonality, using the mean for forecasting is a solid choice.

Store ATM Forecasts in DataFrame

atm_forecast <- data.frame(
  Date = seq(as.Date("2010-05-01"), by = "day", length.out = 31),
  ATM1_Forecast = as.numeric(forecast_atm1$mean),
  ATM2_Forecast = as.numeric(forecast_atm2$mean),
  ATM3_Forecast = as.numeric(forecast_atm3$mean),
  ATM4_Forecast = as.numeric(forecast_atm4$mean)
)

Save Forecast Results to Excel

addWorksheet(wb, "ATM Forecasts")
writeData(wb, "ATM Forecasts", atm_forecast)

Part B – Forecasting Power

Load ATM data

power_data <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")

Convert to Time Series (Monthly)

power_ts <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)

Visualize the time series for power

plot(power_ts, main = "Monthly Power Consumption in Kilowatt Hours", xlab = "Date", ylab = "Kilowatt Hours", col = "darkblue")

Impute Missing Values using linear interpolation

power_ts <- na.approx(power_ts)

Check for stationarity using ADF test for power_ts

adf_test_power <- ur.df(power_ts, type = "drift", selectlags = "AIC")
summary(adf_test_power)
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -6550302  -770180    10486   707134  5054107 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.310e+06  4.714e+05  11.266  < 2e-16 ***
## z.lag.1     -8.155e-01  7.142e-02 -11.417  < 2e-16 ***
## z.diff.lag   4.360e-01  6.726e-02   6.483 7.81e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1192000 on 187 degrees of freedom
## Multiple R-squared:  0.4112, Adjusted R-squared:  0.4049 
## F-statistic: 65.29 on 2 and 187 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -11.417 65.2006 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.46 -2.88 -2.57
## phi1  6.52  4.63  3.81

Fit ARIMA Model for Power

I have opted to utilize the ARIMA model for the reasons articulated in Part A. This model effectively balances historical trends with random fluctuations.

fit_power <- auto.arima(power_ts, seasonal = TRUE)

Forecast for 2014 (12 Months)

forecast_power <- forecast(fit_power, h = 12)

Plot Forecasts

autoplot(forecast_power) + ggtitle("Power Consumption in Kilowatt Hours Forecast")

Store Power Forecast in DataFrame

power_forecast <- data.frame(
  Date = seq(as.Date("2014-01-01"), by = "month", length.out = 12),
  Power_Forecast_KWH = as.numeric(forecast_power$mean)
)

Add Power forecast to Excel

addWorksheet(wb, "Power Forecasts")
writeData(wb, "Power Forecasts", power_forecast)

Part C – Waterflow (Bonus Part)

Load datasets

water1 <- read_csv("Waterflow_Pipe1.csv")
water2 <- read_csv("Waterflow_Pipe2.csv")

Convert to datetime format

water1$Datetime <- mdy_hms(water1$`Date Time`)
water2$Datetime <- mdy_hms(water2$`Date Time`)

Drop original Date Time columns

water1 <- water1 %>% select(Datetime, WaterFlow)
water2 <- water2 %>% select(Datetime, WaterFlow)

Aggregate by hour (taking mean for multiple values within the same hour)

water1 <- water1 %>% group_by(Datetime = floor_date(Datetime, "hour")) %>% summarise(WaterFlow_Pipe1 = mean(WaterFlow, na.rm = TRUE))
water2 <- water2 %>% group_by(Datetime = floor_date(Datetime, "hour")) %>% summarise(WaterFlow_Pipe2 = mean(WaterFlow, na.rm = TRUE))

Visualize

plot(water1, main = "WaterFlow in Pipe 1(Hourly)", xlab = "Time", ylab = "Flow", col = "darkblue")

plot(water2, main = "WaterFlow in Pipe 2(Hourly)", xlab = "Time", ylab = "Flow", col = "red3")

Merge datasets on hourly timestamps

merged_water <- full_join(water1, water2, by = "Datetime")

Check for stationarity using Augmented Dickey-Fuller Test

adf_test_pipe1 <- adf.test(na.omit(merged_water$WaterFlow_Pipe1))
print(adf_test_pipe1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  na.omit(merged_water$WaterFlow_Pipe1)
## Dickey-Fuller = -6.2649, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf_test_pipe2 <- adf.test(na.omit(merged_water$WaterFlow_Pipe2))
print(adf_test_pipe2)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  na.omit(merged_water$WaterFlow_Pipe2)
## Dickey-Fuller = -9.5527, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary

Fit ARIMA models

I have opted to utilize the ARIMA model.

model_pipe1 <- auto.arima(na.omit(merged_water$WaterFlow_Pipe1))
summary(model_pipe1)
## Series: na.omit(merged_water$WaterFlow_Pipe1) 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##          mean
##       19.8927
## s.e.   0.2754
## 
## sigma^2 = 17.98:  log likelihood = -675.29
## AIC=1354.59   AICc=1354.64   BIC=1361.51
## 
## Training set error measures:
##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -2.337686e-16 4.231136 3.332455 -5.064092 18.29517 0.7036506
##                    ACF1
## Training set 0.02101267
model_pipe2 <- auto.arima(na.omit(merged_water$WaterFlow_Pipe2))
summary(model_pipe2)
## Series: na.omit(merged_water$WaterFlow_Pipe2) 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##          mean
##       39.5555
## s.e.   0.5073
## 
## sigma^2 = 257.6:  log likelihood = -4194.1
## AIC=8392.2   AICc=8392.22   BIC=8402.02
## 
## Training set error measures:
##                        ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 3.412324e-12 16.04125 13.08058 -34.83598 57.41693 0.7033556
##                     ACF1
## Training set -0.02169994

Forecast for next 7 days (168 hours)

forecast_period <- 168
forecast_pipe1 <- forecast(model_pipe1, h = forecast_period)
forecast_pipe2 <- forecast(model_pipe2, h = forecast_period)

Create a data frame for forecasts

forecast_dates <- seq(from = max(merged_water$Datetime, na.rm = TRUE) + hours(1), 
                      by = "hour", length.out = forecast_period)

forecast_water <- data.frame(
  Datetime = forecast_dates,
  WaterFlow_Pipe1_Forecast = forecast_pipe1$mean,
  WaterFlow_Pipe2_Forecast = forecast_pipe2$mean
)

Plot forecasts

autoplot(forecast_pipe1) + ggtitle("Forecast for WaterFlow - Pipe 1")

autoplot(forecast_pipe2) + ggtitle("Forecast for WaterFlow - Pipe 2")

As there are no obvious trends or seasonality in both data, the ARIMA model automatically selected the mean for forecasting the upcoming week.

Save forecast results to CSV

addWorksheet(wb, "WaterFlow Forecasts")
writeData(wb, "WaterFlow Forecasts", forecast_water)

Save excel file

saveWorkbook(wb, "Project1_Results.xlsx", overwrite = TRUE)