library(tidyverse)
library(readxl)
library(lubridate)
library(forecast)
library(urca)
library(zoo)
library(openxlsx)
library(xts)
library(tseries)
wb <- createWorkbook()
atm_data <- read_excel("ATM624Data.xlsx")
head(atm_data)
## # A tibble: 6 × 3
## DATE ATM Cash
## <dttm> <chr> <dbl>
## 1 2009-05-01 00:00:00 ATM1 96
## 2 2009-05-01 00:00:00 ATM2 107
## 3 2009-05-02 00:00:00 ATM1 82
## 4 2009-05-02 00:00:00 ATM2 89
## 5 2009-05-03 00:00:00 ATM1 85
## 6 2009-05-03 00:00:00 ATM2 90
atm_ts <- atm_data %>% group_by(ATM) %>%
mutate(Cash = Cash * 100)
atm_data_atm1 <- atm_ts %>% filter(ATM == "ATM1")
atm_data_atm2 <- atm_ts %>% filter(ATM == "ATM2")
atm_data_atm3 <- atm_ts %>% filter(ATM == "ATM3")
atm_data_atm4 <- atm_ts %>% filter(ATM == "ATM4")
ATM3 Due to the presence of an extended period of zeros in the first 362 entries of the ATM3 dataset, I will segment the data and apply an ARIMA model specifically to the non-zero segment of the time series.
# Find the index where non-zero data starts (skip the initial zeros)
non_zero_index <- which(atm_data_atm3$Cash > 0)[1]
# Subset the data to get only the non-zero part of the series
non_zero_data <- atm_data_atm3$Cash[non_zero_index:nrow(atm_data_atm3)]
atm_ts1 <- ts(atm_data_atm1$Cash, start = c(2009, 5), frequency = 365)
atm_ts2 <- ts(atm_data_atm2$Cash, start = c(2009, 5), frequency = 365)
atm_ts3 <- ts(non_zero_data, start = c(2009, 5), frequency = 365)
atm_ts4 <- ts(atm_data_atm4$Cash, start = c(2009, 5), frequency = 365)
plot(atm_ts1, main = "ATM1 Daily Cash Withdrawals", xlab = "Date", ylab = "Cash Withdrawn ($)", col = "darkblue")
plot(atm_ts2, main = "ATM2 Daily Cash Withdrawals", xlab = "Date", ylab = "Cash Withdrawn ($)", col = "red3")
plot(atm_ts3, main = "ATM3 Daily Cash Withdrawals", xlab = "Date", ylab = "Cash Withdrawn ($)", col = "darkgreen")
plot(atm_ts4, main = "ATM4 Daily Cash Withdrawals", xlab = "Date", ylab = "Cash Withdrawn ($)", col = "orange2")
atm_ts1 <- na.approx(atm_ts1)
atm_ts2 <- na.approx(atm_ts2)
atm_ts3 <- na.approx(atm_ts3)
atm_ts4 <- na.approx(atm_ts4)
ATM1
adf_test_atm1 <- ur.df(atm_ts1, type = "drift", selectlags = "AIC")
summary(adf_test_atm1)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8210.0 -1444.9 534.9 2392.1 10502.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9407.98709 621.29306 15.14 < 2e-16 ***
## z.lag.1 -1.11821 0.07035 -15.89 < 2e-16 ***
## z.diff.lag 0.20327 0.05160 3.94 9.8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3597 on 360 degrees of freedom
## Multiple R-squared: 0.4868, Adjusted R-squared: 0.4839
## F-statistic: 170.7 on 2 and 360 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -15.8953 126.3302
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1 6.47 4.61 3.79
Since the test statistic is smaller than the critical values, the ATM1 data are stationary and ready for forecasting.
ATM2
adf_test_atm2 <- ur.df(atm_ts2, type = "drift", selectlags = "AIC")
summary(adf_test_atm2)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7768 -3202 261 2629 9845
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8282.13961 470.31026 17.610 < 2e-16 ***
## z.lag.1 -1.32707 0.06886 -19.271 < 2e-16 ***
## z.diff.lag 0.35554 0.04920 7.227 2.98e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3637 on 360 degrees of freedom
## Multiple R-squared: 0.5542, Adjusted R-squared: 0.5518
## F-statistic: 223.8 on 2 and 360 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -19.2712 185.6902
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1 6.47 4.61 3.79
Since the test statistic is smaller than the critical values, the *ATM2 data are stationary and ready for forecasting.
ATM3 Due to the limited non-zero data for ATM3, I will not conduct the ADF test on it.
ATM4
adf_test_atm4 <- ur.df(atm_ts4, type = "drift", selectlags = "AIC")
summary(adf_test_atm4)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48574 -35367 -7024 22497 1043405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.935e+04 4.938e+03 9.995 <2e-16 ***
## z.lag.1 -1.043e+00 7.485e-02 -13.936 <2e-16 ***
## z.diff.lag 3.332e-02 5.269e-02 0.632 0.527
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65390 on 360 degrees of freedom
## Multiple R-squared: 0.5053, Adjusted R-squared: 0.5025
## F-statistic: 183.8 on 2 and 360 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -13.9355 97.0998
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1 6.47 4.61 3.79
Since the test statistic is smaller than the critical values, the *ATM4 data are stationary and ready for forecasting.
I am utilizing the ARIMA model for forecasting ATM cash withdrawals due to its capacity to effectively analyze both stationary and non-stationary time series data, which facilitates the incorporation of seasonal trends. The model adeptly balances historical trends with random fluctuations, alleviating the need for manual parameter selection. Moreover, ARIMA is acknowledged as a reliable and extensively employed approach in the field of financial forecasting, further reinforcing its suitability for this analysis.
ATM1:
fit_atm1 <- auto.arima(atm_ts1, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(fit_atm1)
## Series: atm_ts1
## ARIMA(5,0,0) with non-zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 mean
## 0.0608 -0.2201 -0.0613 -0.0645 -0.2186 8417.5960
## s.e. 0.0510 0.0509 0.0521 0.0511 0.0511 121.3353
##
## sigma^2 = 12264934: log likelihood = -3493.88
## AIC=7001.76 AICc=7002.07 BIC=7029.06
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 2.314618 3473.229 2650.806 -150.0205 172.7121 NaN -0.001133594
ATM2:
fit_atm2 <- auto.arima(atm_ts2, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(fit_atm2)
## Series: atm_ts2
## ARIMA(0,1,5) with drift
##
## Coefficients:
## ma1 ma2 ma3 ma4 ma5 drift
## -1.0068 -0.3767 0.0479 0.6433 -0.2967 -5.9819
## s.e. 0.0499 0.0606 0.0689 0.0815 0.0546 2.4169
##
## sigma^2 = 11783873: log likelihood = -3479.57
## AIC=6973.15 AICc=6973.46 BIC=7000.43
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -6.278947 3399.688 2831.799 -Inf Inf NaN -0.002954414
fit_atm3 <- auto.arima(atm_ts3, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(fit_atm3)
## Series: atm_ts3
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 8766.6667
## s.e. 347.4864
##
## sigma^2 = 543333: log likelihood = -23.46
## AIC=50.91 AICc=Inf BIC=49.11
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -1.212669e-12 601.849 555.5556 -0.4557562 6.242793 NaN -0.295501
ATM4:
fit_atm4 <- auto.arima(atm_ts4, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(fit_atm4)
## Series: atm_ts4
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 47404.334
## s.e. 3402.548
##
## sigma^2 = 4.237e+09: log likelihood = -4562.92
## AIC=9129.84 AICc=9129.87 BIC=9137.64
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -2.657196e-12 65004.37 32360.01 -616.5219 646.8627 NaN
## ACF1
## Training set -0.009358419
forecast_atm1 <- forecast(fit_atm1, h = 31)
forecast_atm2 <- forecast(fit_atm2, h = 31)
forecast_atm3 <- forecast(fit_atm3, h = 31)
forecast_atm4 <- forecast(fit_atm4, h = 31)
ATM1:
autoplot(forecast_atm1) + ggtitle("ATM1 Cash Withdrawal Forecast")
The forecast indicates that the overall trend in ATM1 cash withdrawals remains consistent.
ATM2:
autoplot(forecast_atm2) + ggtitle("ATM2 Cash Withdrawal Forecast")
The forecasting model indicates a slight overall downward trend in the data.
ATM3:
autoplot(forecast_atm3) + ggtitle("ATM3 Cash Withdrawal Forecast")
Due to insufficient non-zero data for forecasting, the model chose to use the mean for predictions.
ATM4:
autoplot(forecast_atm4) + ggtitle("ATM4 Cash Withdrawal Forecast")
Since the data shows no clear trends or seasonality, using the mean for forecasting is a solid choice.
atm_forecast <- data.frame(
Date = seq(as.Date("2010-05-01"), by = "day", length.out = 31),
ATM1_Forecast = as.numeric(forecast_atm1$mean),
ATM2_Forecast = as.numeric(forecast_atm2$mean),
ATM3_Forecast = as.numeric(forecast_atm3$mean),
ATM4_Forecast = as.numeric(forecast_atm4$mean)
)
addWorksheet(wb, "ATM Forecasts")
writeData(wb, "ATM Forecasts", atm_forecast)
power_data <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
power_ts <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)
plot(power_ts, main = "Monthly Power Consumption in Kilowatt Hours", xlab = "Date", ylab = "Kilowatt Hours", col = "darkblue")
power_ts <- na.approx(power_ts)
adf_test_power <- ur.df(power_ts, type = "drift", selectlags = "AIC")
summary(adf_test_power)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6550302 -770180 10486 707134 5054107
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.310e+06 4.714e+05 11.266 < 2e-16 ***
## z.lag.1 -8.155e-01 7.142e-02 -11.417 < 2e-16 ***
## z.diff.lag 4.360e-01 6.726e-02 6.483 7.81e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1192000 on 187 degrees of freedom
## Multiple R-squared: 0.4112, Adjusted R-squared: 0.4049
## F-statistic: 65.29 on 2 and 187 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -11.417 65.2006
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.46 -2.88 -2.57
## phi1 6.52 4.63 3.81
I have opted to utilize the ARIMA model for the reasons articulated in Part A. This model effectively balances historical trends with random fluctuations.
fit_power <- auto.arima(power_ts, seasonal = TRUE)
forecast_power <- forecast(fit_power, h = 12)
autoplot(forecast_power) + ggtitle("Power Consumption in Kilowatt Hours Forecast")
power_forecast <- data.frame(
Date = seq(as.Date("2014-01-01"), by = "month", length.out = 12),
Power_Forecast_KWH = as.numeric(forecast_power$mean)
)
addWorksheet(wb, "Power Forecasts")
writeData(wb, "Power Forecasts", power_forecast)
water1 <- read_csv("Waterflow_Pipe1.csv")
water2 <- read_csv("Waterflow_Pipe2.csv")
water1$Datetime <- mdy_hms(water1$`Date Time`)
water2$Datetime <- mdy_hms(water2$`Date Time`)
water1 <- water1 %>% select(Datetime, WaterFlow)
water2 <- water2 %>% select(Datetime, WaterFlow)
water1 <- water1 %>% group_by(Datetime = floor_date(Datetime, "hour")) %>% summarise(WaterFlow_Pipe1 = mean(WaterFlow, na.rm = TRUE))
water2 <- water2 %>% group_by(Datetime = floor_date(Datetime, "hour")) %>% summarise(WaterFlow_Pipe2 = mean(WaterFlow, na.rm = TRUE))
plot(water1, main = "WaterFlow in Pipe 1(Hourly)", xlab = "Time", ylab = "Flow", col = "darkblue")
plot(water2, main = "WaterFlow in Pipe 2(Hourly)", xlab = "Time", ylab = "Flow", col = "red3")
merged_water <- full_join(water1, water2, by = "Datetime")
adf_test_pipe1 <- adf.test(na.omit(merged_water$WaterFlow_Pipe1))
print(adf_test_pipe1)
##
## Augmented Dickey-Fuller Test
##
## data: na.omit(merged_water$WaterFlow_Pipe1)
## Dickey-Fuller = -6.2649, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf_test_pipe2 <- adf.test(na.omit(merged_water$WaterFlow_Pipe2))
print(adf_test_pipe2)
##
## Augmented Dickey-Fuller Test
##
## data: na.omit(merged_water$WaterFlow_Pipe2)
## Dickey-Fuller = -9.5527, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
I have opted to utilize the ARIMA model.
model_pipe1 <- auto.arima(na.omit(merged_water$WaterFlow_Pipe1))
summary(model_pipe1)
## Series: na.omit(merged_water$WaterFlow_Pipe1)
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 19.8927
## s.e. 0.2754
##
## sigma^2 = 17.98: log likelihood = -675.29
## AIC=1354.59 AICc=1354.64 BIC=1361.51
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -2.337686e-16 4.231136 3.332455 -5.064092 18.29517 0.7036506
## ACF1
## Training set 0.02101267
model_pipe2 <- auto.arima(na.omit(merged_water$WaterFlow_Pipe2))
summary(model_pipe2)
## Series: na.omit(merged_water$WaterFlow_Pipe2)
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 39.5555
## s.e. 0.5073
##
## sigma^2 = 257.6: log likelihood = -4194.1
## AIC=8392.2 AICc=8392.22 BIC=8402.02
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 3.412324e-12 16.04125 13.08058 -34.83598 57.41693 0.7033556
## ACF1
## Training set -0.02169994
forecast_period <- 168
forecast_pipe1 <- forecast(model_pipe1, h = forecast_period)
forecast_pipe2 <- forecast(model_pipe2, h = forecast_period)
forecast_dates <- seq(from = max(merged_water$Datetime, na.rm = TRUE) + hours(1),
by = "hour", length.out = forecast_period)
forecast_water <- data.frame(
Datetime = forecast_dates,
WaterFlow_Pipe1_Forecast = forecast_pipe1$mean,
WaterFlow_Pipe2_Forecast = forecast_pipe2$mean
)
autoplot(forecast_pipe1) + ggtitle("Forecast for WaterFlow - Pipe 1")
autoplot(forecast_pipe2) + ggtitle("Forecast for WaterFlow - Pipe 2")
As there are no obvious trends or seasonality in both data, the ARIMA model automatically selected the mean for forecasting the upcoming week.
addWorksheet(wb, "WaterFlow Forecasts")
writeData(wb, "WaterFlow Forecasts", forecast_water)
saveWorkbook(wb, "Project1_Results.xlsx", overwrite = TRUE)