PART A. ATM DATASET CLEANING AND DATE OBSERVATIONS
#import
atm_main <- read_excel("~/Downloads/ATM624Data.xlsx")
# Clean the data
atm_cleaned <- atm_main |>
filter(!is.na(DATE), !is.na(ATM), !is.na(Cash)) |>
mutate(
DATE = as.Date(DATE, origin = "1899-12-30"),
ATM = as.factor(ATM),
Cash = as.numeric(Cash)
) |>
arrange(ATM, DATE)
#Initial observation of Data
atm_summary <- atm_cleaned |>
group_by(ATM) |>
summarise(
start_date = min(DATE),
end_date = max(DATE),
n_obs = n(),
mean_cash = mean(Cash, na.rm = TRUE),
median_cash = median(Cash, na.rm = TRUE),
sd_cash = sd(Cash, na.rm = TRUE),
min_cash = min(Cash, na.rm = TRUE),
max_cash = max(Cash, na.rm = TRUE)
)
atm_summary
## # A tibble: 4 × 9
## ATM start_date end_date n_obs mean_cash median_cash sd_cash min_cash
## <fct> <date> <date> <int> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 2009-05-01 2010-04-30 362 83.9 91 36.7 1
## 2 ATM2 2009-05-01 2010-04-30 363 62.6 67 38.9 0
## 3 ATM3 2009-05-01 2010-04-30 365 0.721 0 7.94 0
## 4 ATM4 2009-05-01 2010-04-30 365 474. 404. 651. 1.56
## # ℹ 1 more variable: max_cash <dbl>
atm_cleaned <- atm_cleaned |>
as_tsibble(key = ATM, index = DATE)
#There are empty entries, some of which have an ATM declared but no Cash value. The
#values I kept were those with ATM and Cash amount to prevent from having ATM entries
#with no value to be used. There are values with 0 and I kept all of those, specifically
#ATM 3 has many 0 values. Find any gaps in tables, this shows all have.
#Find dates and replace with fill_gaps()
atm_cleaned |>
has_gaps()
## # A tibble: 4 × 2
## ATM .gaps
## <fct> <lgl>
## 1 ATM1 TRUE
## 2 ATM2 TRUE
## 3 ATM3 FALSE
## 4 ATM4 FALSE
scan_gaps(atm_cleaned)
## # A tsibble: 5 x 2 [1D]
## # Key: ATM [2]
## ATM DATE
## <fct> <date>
## 1 ATM1 2009-06-13
## 2 ATM1 2009-06-16
## 3 ATM1 2009-06-22
## 4 ATM2 2009-06-18
## 5 ATM2 2009-06-24
atm_cleaned_f <- atm_cleaned |>
fill_gaps()
Preliminary Observation of Each ATM
-In our initial observations for each ATM, it was clear that an outlier of 10,920 on 2010-02-09 was severely influencing our analysis of the dataset as well as the scaling we can view the data through. ATM3 also has an abnormal dataset filled with zeroes with the exception of the final 3 values. I think this would be the ATM I trust prediction for least. It may have been newly placed or not functioning previously but it is a non-stationary as the set is all zeroes until the final few values. The value was left here for observation purposes but will be removed for our modeling comparisons.
#Mean imputation of outlier
#Removing outlier to calculate replacement so calculation is not biased
atm_cleaned_f <- atm_cleaned |>
as_tibble() |>
group_by(ATM) |>
mutate(
atm_mean = mean(
Cash[!(ATM == "ATM4" & DATE == as.Date("2010-02-09"))],
na.rm = TRUE
),
Cash = case_when(
ATM == "ATM4" & DATE == as.Date("2010-02-09") ~ atm_mean,
TRUE ~ Cash
)
) |>
ungroup() |>
select(-atm_mean) |>
as_tsibble(key = ATM, index = DATE)
#Check Outlier
atm_cleaned |>
filter(ATM == "ATM4", DATE == as.Date("2010-02-09"))
## # A tsibble: 1 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <fct> <dbl>
## 1 2010-02-09 ATM4 10920.
atm_cleaned_f |>
filter(ATM == "ATM4", DATE == as.Date("2010-02-09"))
## # A tsibble: 1 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <fct> <dbl>
## 1 2010-02-09 ATM4 445.
atm_cleaned <- atm_cleaned |>
as_tsibble(key = ATM, index = DATE)
#Compare cleaned and final datasets, ATM4 now appears stationary and fit for model
autoplot(atm_cleaned, Cash) +
facet_wrap(~ ATM, scales = "free_y") +
labs(
title = "Daily Cash Withdrawals by ATM",
x = "Date",
y = "Cash Withdrawals (Hundreds of Dollars)"
)
autoplot(atm_cleaned_f, Cash) +
facet_wrap(~ ATM, scales = "free_y") +
labs(
title = "Daily Cash Withdrawals by ATM",
x = "Date",
y = "Cash Withdrawals (Hundreds of Dollars)"
)
Day of the week Observations and analysis
Our analysis shows some interesting outcomes. ATM1, ATM2, and ATM4 have some extremely similar behavior including a dropoff on Thursday and relatively consistent mean and variance for each atm. Saturday for every ATM also shows that data points fall on the higher end suggesting withdrawls are common with few low values. ATM4 shows by far the greatest variance but it does not seem to be increasing as the set goes on so these are all suitable for our ARIMA and other models. Our Boxplot also interestingly shows us that the ATM1 has the least variance especially relative to the size of the mean. This is confirmed in our atm_summary_f table. ATM3 gives us a very questionable observation about whether this is a dataset that is valuable for prediction. With so few valid datapoints this is very limiting.
#Day of the Week Observations
autoplot(atm_cleaned_f, Cash) +
facet_wrap(~ ATM, scales = "free_y") +
labs(
title = "Daily Cash Withdrawals by ATM",
x = "Date",
y = "Cash Withdrawals (Hundreds of Dollars)"
)
atm_dayofweek <- atm_cleaned_f |>
as_tibble() |>
mutate(day_of_week = wday(DATE, label = TRUE, abbr = FALSE)) |>
group_by(ATM, day_of_week) |>
summarise(
mean_cash = mean(Cash, na.rm = TRUE),
median_cash = median(Cash, na.rm = TRUE),
sd_cash = sd(Cash, na.rm = TRUE),
min_cash = min(Cash, na.rm = TRUE),
max_cash = max(Cash, na.rm = TRUE),
.groups = "drop"
)
atm_dayofweek
## # A tibble: 28 × 7
## ATM day_of_week mean_cash median_cash sd_cash min_cash max_cash
## <fct> <ord> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 Sunday 103. 108 20.6 26 152
## 2 ATM1 Monday 86 85 21.0 6 126
## 3 ATM1 Tuesday 89.6 99 47.9 1 180
## 4 ATM1 Wednesday 82.2 78.5 25.1 2 179
## 5 ATM1 Thursday 31.7 16 32.8 4 125
## 6 ATM1 Friday 98.6 98 30.3 1 152
## 7 ATM1 Saturday 96.6 96 14.0 69 144
## 8 ATM2 Sunday 67.2 64 24.9 0 122
## 9 ATM2 Monday 58.7 63.5 32.2 2 132
## 10 ATM2 Tuesday 73.2 82.5 41.3 0 146
## # ℹ 18 more rows
atm_cleaned_f |>
as_tibble() |>
mutate(day_of_week = wday(DATE, label = TRUE, abbr = FALSE)) |>
ggplot(aes(x = day_of_week, y = Cash, fill = ATM)) +
geom_boxplot() +
facet_wrap(~ ATM, scales = "free_y") +
labs(
title = "Distribution of Cash Withdrawals by Day of Week",
x = "Day of Week",
y = "Cash Withdrawals (Hundreds of Dollars)"
)
atm_summary_f <- atm_cleaned_f |>
as_tibble() |>
group_by(ATM) |>
summarise(
n_obs = n(),
mean_cash = mean(Cash, na.rm = TRUE),
median_cash = median(Cash, na.rm = TRUE),
sd_cash = sd(Cash, na.rm = TRUE),
min_cash = min(Cash, na.rm = TRUE),
max_cash = max(Cash, na.rm = TRUE)
)
atm_summary_f
## # A tibble: 4 × 7
## ATM n_obs mean_cash median_cash sd_cash min_cash max_cash
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 362 83.9 91 36.7 1 180
## 2 ATM2 363 62.6 67 38.9 0 147
## 3 ATM3 365 0.721 0 7.94 0 96
## 4 ATM4 365 445. 404. 351. 1.56 1712.
#Creating training data and models
atm_train <- atm_cleaned_f |>
fill_gaps() |>
as_tibble() |>
group_by(ATM) |>
arrange(DATE, .by_group = TRUE) |>
mutate(Cash = na.approx(Cash, na.rm = FALSE, rule = 2)) |>
ungroup() |>
as_tsibble(key = ATM, index = DATE)
atm_naive <- atm_train |>
model(NAIVE = NAIVE(Cash))
atm_snaive <- atm_train |>
model(SNAIVE = SNAIVE(Cash ~ lag("week")))
atm_drift <- atm_train |>
model(DRIFT = RW(Cash ~ drift()))
atm_ets <- atm_train |>
model(ETS = ETS(Cash))
atm_arima <- atm_train |>
model(ARIMA = ARIMA(Cash))
Based on RMSE, ARIMA performed best for ATM1, ATM2, and ATM3, while ETS performed best for ATM4. Although the ARIMA residual diagnostics were satisfactory for all four ATM series, final model selection was based on forecast accuracy. Therefore, ARIMA was selected for ATM1–ATM3, and ETS was selected for ATM4. ETS was a close competitor in every ATM observed which suggests that there is a seasonal trend in our data that can be valuable in prediction.
The ACF plots indicate strong weekly seasonality for ATM1 and ATM2, as shown by large spikes at lags 7, 14, and 21 which suggests weekly seasonality in these cash withdrawls as we also saw in our boxplot. ATM3 shows stronger short-lag dependence at lags 1 and 2, with less evidence of weekly seasonality. ATM4 has comparatively weak autocorrelation, suggesting a more irregular series. Overall, the plots show that the ATM series have different dependence structures, which helps explain why different forecasting models perform better for different ATMs. ATM3 has heavy autocorrelation that drops off as it is primarily zero values which make context much more important for why the values were mostly zero before the spike.
The amount ATM1 & ATM2 had better success with a model that captures structured dependency well such as ARIMA is not surprising based on the autocorrelation. ATM4 in contrast had more moderate ACF fluctuations and in this model the ETS was able to achieve a better RMSE. Due to how similar the results were I believe the ARIMA model best predicts these but using an ETS model in comparison or replacement on the ATM’S with much higher withdrawls like ATM4. This is why I believe a hybrid method of looking at withdrawls for each ATM to check for the strength of seasonality is the best way to decide which model is best for each ATM when considering for something like cash planning to ensure withdrawl transactions are constant and not stalled by cash planning.
atm_accuracy_train <- bind_rows(
accuracy(atm_naive),
accuracy(atm_snaive),
accuracy(atm_drift),
accuracy(atm_ets),
accuracy(atm_arima)
)
atm_accuracy_train
## # A tibble: 20 × 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 NAIVE Trai… -3.02e- 2 49.9 37.2 -132. 167. 2.10 1.80 -0.356
## 2 ATM2 NAIVE Trai… -4.67e- 2 54.2 42.5 -Inf Inf 2.05 1.79 -0.307
## 3 ATM3 NAIVE Trai… 2.34e- 1 5.09 0.310 28.8 40.2 0.423 0.632 -0.149
## 4 ATM4 NAIVE Trai… -8.10e- 1 477. 386. -488. 547. 1.12 1.05 -0.464
## 5 ATM1 SNAIVE Trai… -3.63e- 2 27.7 17.7 -96.5 116. 1 1 0.188
## 6 ATM2 SNAIVE Trai… 2.23e- 2 30.2 20.8 -Inf Inf 1 1 0.0482
## 7 ATM3 SNAIVE Trai… 7.35e- 1 8.04 0.735 100 100 1.000 1.00 0.640
## 8 ATM4 SNAIVE Trai… -3.70e+ 0 452. 346. -392. 447. 1 1 0.0598
## 9 ATM1 DRIFT Trai… -1.56e-16 49.9 37.2 -132. 167. 2.10 1.80 -0.356
## 10 ATM2 DRIFT Trai… 4.59e-15 54.2 42.5 -Inf Inf 2.05 1.79 -0.307
## 11 ATM3 DRIFT Trai… 1.79e-15 5.08 0.541 -Inf Inf 0.737 0.632 -0.149
## 12 ATM4 DRIFT Trai… -2.81e-15 477. 386. -487. 546. 1.12 1.05 -0.464
## 13 ATM1 ETS Trai… -4.82e- 2 23.8 15.1 -107. 122. 0.852 0.857 0.144
## 14 ATM2 ETS Trai… -6.89e- 1 25.1 17.9 -Inf Inf 0.859 0.833 0.0199
## 15 ATM3 ETS Trai… 2.70e- 1 5.03 0.273 Inf Inf 0.371 0.625 -0.00736
## 16 ATM4 ETS Trai… -7.31e+ 0 329. 264. -520. 550. 0.763 0.726 0.100
## 17 ATM1 ARIMA Trai… -7.42e- 2 23.3 14.5 -102. 117. 0.822 0.838 -0.00872
## 18 ATM2 ARIMA Trai… -8.91e- 1 24.1 17.0 -Inf Inf 0.820 0.799 -0.00428
## 19 ATM3 ARIMA Trai… 2.71e- 1 5.03 0.271 34.6 34.6 0.370 0.625 0.0124
## 20 ATM4 ARIMA Trai… -3.47e- 2 340. 280. -509. 541. 0.809 0.750 0.00464
atm_model_compare <- atm_accuracy_train |>
select(ATM, .model, RMSE, MAE, MAPE)
atm_model_compare
## # A tibble: 20 × 5
## ATM .model RMSE MAE MAPE
## <fct> <chr> <dbl> <dbl> <dbl>
## 1 ATM1 NAIVE 49.9 37.2 167.
## 2 ATM2 NAIVE 54.2 42.5 Inf
## 3 ATM3 NAIVE 5.09 0.310 40.2
## 4 ATM4 NAIVE 477. 386. 547.
## 5 ATM1 SNAIVE 27.7 17.7 116.
## 6 ATM2 SNAIVE 30.2 20.8 Inf
## 7 ATM3 SNAIVE 8.04 0.735 100
## 8 ATM4 SNAIVE 452. 346. 447.
## 9 ATM1 DRIFT 49.9 37.2 167.
## 10 ATM2 DRIFT 54.2 42.5 Inf
## 11 ATM3 DRIFT 5.08 0.541 Inf
## 12 ATM4 DRIFT 477. 386. 546.
## 13 ATM1 ETS 23.8 15.1 122.
## 14 ATM2 ETS 25.1 17.9 Inf
## 15 ATM3 ETS 5.03 0.273 Inf
## 16 ATM4 ETS 329. 264. 550.
## 17 ATM1 ARIMA 23.3 14.5 117.
## 18 ATM2 ARIMA 24.1 17.0 Inf
## 19 ATM3 ARIMA 5.03 0.271 34.6
## 20 ATM4 ARIMA 340. 280. 541.
atm_model_compare2 <- atm_model_compare |>
group_by(ATM) |>
arrange(RMSE, .by_group = TRUE)
atm_model_compare2
## # A tibble: 20 × 5
## # Groups: ATM [4]
## ATM .model RMSE MAE MAPE
## <fct> <chr> <dbl> <dbl> <dbl>
## 1 ATM1 ARIMA 23.3 14.5 117.
## 2 ATM1 ETS 23.8 15.1 122.
## 3 ATM1 SNAIVE 27.7 17.7 116.
## 4 ATM1 DRIFT 49.9 37.2 167.
## 5 ATM1 NAIVE 49.9 37.2 167.
## 6 ATM2 ARIMA 24.1 17.0 Inf
## 7 ATM2 ETS 25.1 17.9 Inf
## 8 ATM2 SNAIVE 30.2 20.8 Inf
## 9 ATM2 DRIFT 54.2 42.5 Inf
## 10 ATM2 NAIVE 54.2 42.5 Inf
## 11 ATM3 ARIMA 5.03 0.271 34.6
## 12 ATM3 ETS 5.03 0.273 Inf
## 13 ATM3 DRIFT 5.08 0.541 Inf
## 14 ATM3 NAIVE 5.09 0.310 40.2
## 15 ATM3 SNAIVE 8.04 0.735 100
## 16 ATM4 ETS 329. 264. 550.
## 17 ATM4 ARIMA 340. 280. 541.
## 18 ATM4 SNAIVE 452. 346. 447.
## 19 ATM4 DRIFT 477. 386. 546.
## 20 ATM4 NAIVE 477. 386. 547.
atm_ets |> filter(ATM == "ATM1") |> gg_tsresiduals()
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
atm_ets |> filter(ATM == "ATM2") |> gg_tsresiduals()
atm_ets |> filter(ATM == "ATM3") |> gg_tsresiduals()
atm_ets |> filter(ATM == "ATM4") |> gg_tsresiduals()
atm_arima |> filter(ATM == "ATM1") |> gg_tsresiduals()
atm_arima |> filter(ATM == "ATM2") |> gg_tsresiduals()
atm_arima |> filter(ATM == "ATM3") |> gg_tsresiduals()
atm_arima |> filter(ATM == "ATM4") |> gg_tsresiduals()
atm_fc_naive <- atm_naive |>
forecast(h = 31)
atm_fc_snaive <- atm_snaive |>
forecast(h = 31)
atm_fc_drift <- atm_drift |>
forecast(h = 31)
atm_fc_ets <- atm_ets |>
forecast(h = 31)
atm_fc_arima <- atm_arima |>
forecast(h = 31)
augment(atm_arima) |>
features(.innov, ljung_box, lag = 14, dof = 0)
## # A tibble: 4 × 4
## ATM .model lb_stat lb_pvalue
## <fct> <chr> <dbl> <dbl>
## 1 ATM1 ARIMA 15.2 0.361
## 2 ATM2 ARIMA 10.2 0.745
## 3 ATM3 ARIMA 0.132 1.000
## 4 ATM4 ARIMA 8.64 0.854
atm_train |>
ACF(Cash) |>
autoplot() +
facet_wrap(~ ATM) +
labs(
title = "Autocorrelation of Daily Cash Withdrawals by ATM",
x = "Lag",
y = "ACF"
)
atm_fc_arima <- atm_arima |>
forecast(h = 31, level = c(80, 95))
autoplot(atm_fc_arima, atm_train) +
facet_wrap(~ ATM, scales = "free_y") +
labs(
title = "ARIMA Forecasts for May 2010",
subtitle = "80% and 95% prediction intervals",
x = "Date",
y = "Cash Withdrawals (Hundreds of Dollars)"
)
atm_fc_ets <- atm_ets |>
forecast(h = 31, level = c(80, 95))
autoplot(atm_fc_ets, atm_train) +
facet_wrap(~ ATM, scales = "free_y") +
labs(
title = "ETS Forecasts for May 2010",
subtitle = "80% and 95% prediction intervals",
x = "Date",
y = "Cash Withdrawals (Hundreds of Dollars)"
)
#Excel file output csv
atm_fc_ets <- atm_ets |>
forecast(h = 31, level = 95)
atm_fc_arima <- atm_arima |>
forecast(h = 31, level = 95)
atm_final_forecast <- bind_rows(
atm_fc_arima |> filter(ATM == "ATM1"),
atm_fc_arima |> filter(ATM == "ATM2"),
atm_fc_arima |> filter(ATM == "ATM3"),
atm_fc_ets |> filter(ATM == "ATM4")
)
atm_forecast_export <- atm_final_forecast |>
hilo(level = 95) |>
unpack_hilo(`95%`) |>
as_tibble() |>
rename(
point_forecast = .mean,
lo_95 = `95%_lower`,
hi_95 = `95%_upper`
)
write.csv(atm_forecast_export, "PartA_ATM_Forecast_Output.csv", row.names = FALSE)
cat("CSV saved to:", file.path(getwd(), "PartA_ATM_Forecast_Output.csv"))
## CSV saved to: /Users/luishernandez/Desktop/MSDS/DATA624/PartA_ATM_Forecast_Output.csv
Part B Forecasting Power
#Import
power_main <- read_excel("~/Downloads/ResidentialCustomerForecastLoad-624.xlsx")
glimpse(power_main)
## Rows: 192
## Columns: 3
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ `YYYY-MMM` <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ KWH <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
# Clean and convert to monthly tsibble
power_cleaned <- power_main |>
mutate(
Month = yearmonth(`YYYY-MMM`),
KWH = as.numeric(KWH)
) |>
select(Month, KWH) |>
arrange(Month)
power_tsibble <- power_cleaned |>
as_tsibble(index = Month)
# Check for gaps
print(power_tsibble)
## # A tsibble: 192 x 2 [1M]
## Month KWH
## <mth> <dbl>
## 1 1998 Jan 6862583
## 2 1998 Feb 5838198
## 3 1998 Mar 5420658
## 4 1998 Apr 5010364
## 5 1998 May 4665377
## 6 1998 Jun 6467147
## 7 1998 Jul 8914755
## 8 1998 Aug 8607428
## 9 1998 Sep 6989888
## 10 1998 Oct 6345620
## # ℹ 182 more rows
has_gaps(power_tsibble)
## # A tibble: 1 × 1
## .gaps
## <lgl>
## 1 FALSE
# Check for missing values
colSums(is.na(power_tsibble))
## Month KWH
## 0 1
# Plot
autoplot(power_tsibble, KWH) +
labs(
title = "Monthly Residential Power Usage",
x = "Year",
y = "KWH"
)
#Seasonal diagnostics
gg_season(power_tsibble, KWH) +
labs(
title = "Seasonal Plot of Monthly Power Usage",
y = "KWH"
)
## Warning: `gg_season()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_season()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
gg_subseries(power_tsibble, KWH) +
labs(
title = "Seasonal Subseries Plot of Monthly Power Usage",
y = "KWH"
)
## Warning: `gg_subseries()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_subseries()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
The seasonal plot confirms a stable yearly pattern. Power usage is normally highest in January and the later summer months like August, while spring months such as April and May tend to be lower. Most yearly curves follow a similar shape, indicating that seasonality is a major feature of the series. There is one extremely low July observation deviates substantially from the seasonal pattern but we leave this outlier for our observation and usefulness as a part of the pattern.
The seasonal subseries plot supports the presence of strong month-of-year effects. January, August, and September have high average, opposed to May and November. July seems to be a high-usage month, but its strongly effected by one extreme low data observation in 2010.
#Find missing value and replace with linear interpolation method
power_tsibble |>
filter(is.na(KWH))
## # A tsibble: 1 x 2 [1M]
## Month KWH
## <mth> <dbl>
## 1 2008 Sep NA
power_tsibble <- power_tsibble |>
mutate(KWH = na.approx(KWH, na.rm = FALSE))
#STL decomp
power_decomp <- power_tsibble |>
model(STL(KWH))
components(power_decomp) |>
autoplot() +
labs(
title = "STL Decomposition of Monthly Power Usage"
)
# You can also inspect the minimum value month
power_tsibble |>
filter(KWH == min(KWH, na.rm = TRUE))
## # A tsibble: 1 x 2 [1M]
## Month KWH
## <mth> <dbl>
## 1 2010 Jul 770523
#Fit benchmark and models
power_models <- power_tsibble |>
model(
mean_model = MEAN(KWH),
naive_model = NAIVE(KWH),
snaive_model = SNAIVE(KWH),
ets_model = ETS(KWH),
arima_model = ARIMA(KWH)
)
accuracy(power_models)
## # A tibble: 5 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 mean_model Training -4.66e-10 1440020. 1.18e6 -7.81 22.1 1.70 1.22 0.419
## 2 naive_model Training 1.44e+ 4 1539355. 1.20e6 -5.67 21.9 1.72 1.31 0.0270
## 3 snaive_model Training 9.42e+ 4 1178104. 6.97e5 -3.93 14.6 1.00 1 0.231
## 4 ets_model Training 6.10e+ 4 835107. 5.04e5 -4.39 12.0 0.723 0.709 0.170
## 5 arima_model Training -2.51e+ 4 827254. 4.93e5 -5.51 11.7 0.708 0.702 0.0128
# Reports for each model
power_models |> select(mean_model) |> report()
## Series: KWH
## Model: MEAN
##
## Mean: 6502823.5208
## sigma^2: 2084513846667.07
power_models |> select(naive_model) |> report()
## Series: KWH
## Model: NAIVE
##
## sigma^2: 2381879261976.21
power_models |> select(snaive_model) |> report()
## Series: KWH
## Model: SNAIVE
##
## sigma^2: 1386751915026.14
power_models |> select(ets_model) |> report()
## Series: KWH
## Model: ETS(M,N,M)
## Smoothing parameters:
## alpha = 0.1137195
## gamma = 0.0001001086
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 6134773 0.9547463 0.7533319 0.8602685 1.191176 1.255577 1.199206 0.9982427
## s[-7] s[-8] s[-9] s[-10] s[-11]
## 0.7683657 0.8160779 0.9174797 1.061846 1.223683
##
## sigma^2: 0.0144
##
## AIC AICc BIC
## 6224.057 6226.784 6272.919
power_models |> select(arima_model) |> report()
## Series: KWH
## Model: ARIMA(0,0,1)(1,1,1)[12] w/ drift
##
## Coefficients:
## ma1 sar1 sma1 constant
## 0.2439 -0.1518 -0.7311 105259.65
## s.e. 0.0723 0.0963 0.0821 27000.74
##
## sigma^2 estimated as 7.466e+11: log likelihood=-2719.89
## AIC=5449.79 AICc=5450.13 BIC=5465.75
# Residual diagnostics for each model
power_models |> select(mean_model) |> gg_tsresiduals()
power_models |> select(naive_model) |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_rug()`).
power_models |> select(snaive_model) |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_rug()`).
power_models |> select(ets_model) |> gg_tsresiduals()
power_models |> select(arima_model) |> gg_tsresiduals()
The mean and naive models performed poorly as they do not account for the strong seasonality in the power series. The seasonal naive model was substantially better and that shows that annual seasonality is an important feature of the data. The ETS and ARIMA provided much better fits than the benchmark models. Among all methods, ARIMA achieved the lowest RMSE, MAE, and MAPE among others indicating the best overall forecasting performance on our training data.
The STL decomposition separates the trend, seasonal, and remainder components. The seasonal component is strong and confirms that annual seasonality is the dominant structure in the data. The trend component changes gradually, but the remainder component contains one very large negative spike around 2010. This indicates that the extreme drop is not explained by the usual seasonal or trend behavior and is better interpreted as an outlier or irregular shock.
The residuals support the choice of ARIMA. The mean shows a large amount of autocorrelation and structure in the residuals but the seasonal naive model performs better. ETS improves the residuals behavior but ARIMA produces the cleanest residual pattern overall, with residual autocorrelation closest to zero. Although all models are affected by the extreme outlier around 2010, ARIMA appears to capture the underlying seasonal dependence. ARIMA achieved the lowest RMSE 827,254, MAE 493,306, and MAPE 11.68, along with the smallest residual autocorrelation 0.01284. For this reason ARIMA was selected as the final model with ETS as a close second.
# Ljung-Box test for residual autocorrelation
augment(power_models) |>
features(.innov, ljung_box, lag = 24, dof = 0)
## # A tibble: 5 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 arima_model 14.2 0.942
## 2 ets_model 28.6 0.235
## 3 mean_model 544. 0
## 4 naive_model 425. 0
## 5 snaive_model 86.5 0.00000000537
# Forecast 12 months ahead for 2014
power_fc <- power_models |>
forecast(h = "12 months")
# Plot forecasts
autoplot(power_tsibble, KWH) +
autolayer(power_fc, level = 95) +
labs(
title = "Monthly Power Usage Forecast for 2014",
x = "Year",
y = "KWH"
)
# Forecast table
power_fc
## # A fable: 60 x 4 [1M]
## # Key: .model [5]
## .model Month
## <chr> <mth>
## 1 mean_model 2014 Jan
## 2 mean_model 2014 Feb
## 3 mean_model 2014 Mar
## 4 mean_model 2014 Apr
## 5 mean_model 2014 May
## 6 mean_model 2014 Jun
## 7 mean_model 2014 Jul
## 8 mean_model 2014 Aug
## 9 mean_model 2014 Sep
## 10 mean_model 2014 Oct
## # ℹ 50 more rows
## # ℹ 2 more variables: KWH <dist>, .mean <dbl>
# More readable forecast table
power_fc_table <- power_fc |>
as_tibble() |>
select(.model, Month, .mean)
print(power_fc_table)
## # A tibble: 60 × 3
## .model Month .mean
## <chr> <mth> <dbl>
## 1 mean_model 2014 Jan 6502824.
## 2 mean_model 2014 Feb 6502824.
## 3 mean_model 2014 Mar 6502824.
## 4 mean_model 2014 Apr 6502824.
## 5 mean_model 2014 May 6502824.
## 6 mean_model 2014 Jun 6502824.
## 7 mean_model 2014 Jul 6502824.
## 8 mean_model 2014 Aug 6502824.
## 9 mean_model 2014 Sep 6502824.
## 10 mean_model 2014 Oct 6502824.
## # ℹ 50 more rows
#Excel csv output
power_fc_final <- power_models |>
select(arima_model) |>
forecast(h = 12, level = 95)
power_forecast_export <- power_fc_final |>
hilo(level = 95) |>
unpack_hilo(`95%`) |>
as_tibble() |>
rename(
point_forecast = .mean,
lo_95 = `95%_lower`,
hi_95 = `95%_upper`
)
write.csv(power_forecast_export, "PartB_Power_Forecast_Output.csv",row.names = FALSE)
Part C
#Read the Excel files
pipe1_raw <- read_excel("~/Downloads/Waterflow_Pipe1.xlsx")
pipe2_raw <- read_excel("~/Downloads/Waterflow_Pipe2.xlsx")
glimpse(pipe1_raw)
## Rows: 1,000
## Columns: 2
## $ `Date Time` <dbl> 42300.02, 42300.03, 42300.04, 42300.04, 42300.06, 42300.06…
## $ WaterFlow <dbl> 23.369599, 28.002881, 23.065895, 29.972809, 5.997953, 15.9…
glimpse(pipe2_raw)
## Rows: 1,000
## Columns: 2
## $ `Date Time` <dbl> 42300.04, 42300.08, 42300.12, 42300.17, 42300.21, 42300.25…
## $ WaterFlow <dbl> 18.810791, 43.087025, 37.987705, 36.120379, 31.851259, 28.…
# Clean data
pipe1_raw <- pipe1_raw |>
rename(Date_Time = `Date Time`) |>
mutate(
Date_Time = as.POSIXct(Date_Time * 86400, origin = "1899-12-30", tz = "UTC"),
WaterFlow = as.numeric(WaterFlow)
)
pipe2_raw <- pipe2_raw |>
rename(Date_Time = `Date Time`) |>
mutate(
Date_Time = as.POSIXct(Date_Time * 86400, origin = "1899-12-30", tz = "UTC"),
WaterFlow = as.numeric(WaterFlow)
)
# Check conversion worked Remove rows with missing datetime or WaterFlow
summary(pipe1_raw$Date_Time)
## Min. 1st Qu. Median
## "2015-10-23 00:24:06" "2015-10-25 11:21:35" "2015-10-27 20:07:30"
## Mean 3rd Qu. Max.
## "2015-10-27 20:49:15" "2015-10-30 08:24:51" "2015-11-01 23:35:43"
summary(pipe2_raw$Date_Time)
## Min. 1st Qu. Median
## "2015-10-23 01:00:00" "2015-11-02 10:45:00" "2015-11-12 20:30:00"
## Mean 3rd Qu. Max.
## "2015-11-12 20:30:00" "2015-11-23 06:15:00" "2015-12-03 16:00:00"
head(pipe1_raw)
## # A tibble: 6 × 2
## Date_Time WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 00:24:06 23.4
## 2 2015-10-23 00:40:02 28.0
## 3 2015-10-23 00:53:51 23.1
## 4 2015-10-23 00:55:40 30.0
## 5 2015-10-23 01:19:17 6.00
## 6 2015-10-23 01:23:58 15.9
head(pipe2_raw)
## # A tibble: 6 × 2
## Date_Time WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 01:00:00 18.8
## 2 2015-10-23 01:59:59 43.1
## 3 2015-10-23 03:00:00 38.0
## 4 2015-10-23 04:00:00 36.1
## 5 2015-10-23 04:59:59 31.9
## 6 2015-10-23 06:00:00 28.2
colSums(is.na(pipe1_raw))
## Date_Time WaterFlow
## 0 0
colSums(is.na(pipe2_raw))
## Date_Time WaterFlow
## 0 0
pipe1_raw <- pipe1_raw |>
filter(!is.na(Date_Time), !is.na(WaterFlow))
pipe2_raw <- pipe2_raw |>
filter(!is.na(Date_Time), !is.na(WaterFlow))
# Full hourly sequences
pipe1_hourly <- pipe1_raw |>
mutate(Hour = floor_date(Date_Time, unit = "hour")) |>
group_by(Hour) |>
summarise(
WaterFlow = mean(WaterFlow, na.rm = TRUE),
.groups = "drop"
) |>
arrange(Hour)
pipe2_hourly <- pipe2_raw |>
mutate(Hour = floor_date(Date_Time, unit = "hour")) |>
group_by(Hour) |>
summarise(
WaterFlow = mean(WaterFlow, na.rm = TRUE),
.groups = "drop"
) |>
arrange(Hour)
pipe1_full <- tibble(
Hour = seq(
from = min(pipe1_hourly$Hour),
to = max(pipe1_hourly$Hour),
by = "hour"
)
) |>
left_join(pipe1_hourly, by = "Hour")
pipe2_full <- tibble(
Hour = seq(
from = min(pipe2_hourly$Hour),
to = max(pipe2_hourly$Hour),
by = "hour"
)
) |>
left_join(pipe2_hourly, by = "Hour")
# Find missing hour values
pipe1_missing_hours <- pipe1_full |>
filter(is.na(WaterFlow))
pipe2_missing_hours <- pipe2_full |>
filter(is.na(WaterFlow))
pipe1_missing_hours
## # A tibble: 4 × 2
## Hour WaterFlow
## <dttm> <dbl>
## 1 2015-10-27 17:00:00 NA
## 2 2015-10-28 00:00:00 NA
## 3 2015-11-01 05:00:00 NA
## 4 2015-11-01 09:00:00 NA
pipe2_missing_hours
## # A tibble: 333 × 2
## Hour WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 02:00:00 NA
## 2 2015-10-23 05:00:00 NA
## 3 2015-10-23 08:00:00 NA
## 4 2015-10-23 11:00:00 NA
## 5 2015-10-23 14:00:00 NA
## 6 2015-10-23 17:00:00 NA
## 7 2015-10-23 20:00:00 NA
## 8 2015-10-23 23:00:00 NA
## 9 2015-10-24 02:00:00 NA
## 10 2015-10-24 05:00:00 NA
## # ℹ 323 more rows
# Convert to tsibble and Combine
pipe1_ts <- pipe1_full |>
as_tsibble(index = Hour)
pipe2_ts <- pipe2_full |>
as_tsibble(index = Hour)
pipe1_combined <- pipe1_ts |>
as_tibble() |>
mutate(Pipe = "Pipe1")
pipe2_combined <- pipe2_ts |>
as_tibble() |>
mutate(Pipe = "Pipe2")
waterflow_combined <- bind_rows(pipe1_combined, pipe2_combined) |>
as_tsibble(key = Pipe, index = Hour)
#Summarized for each pipe
pipe1_summary <- pipe1_ts |>
summarise(
start_time = min(Hour),
end_time = max(Hour),
n_hours = n(),
missing = sum(is.na(WaterFlow)),
mean_flow = mean(WaterFlow, na.rm = TRUE),
sd_flow = sd(WaterFlow, na.rm = TRUE),
min_flow = min(WaterFlow, na.rm = TRUE),
max_flow = max(WaterFlow, na.rm = TRUE)
)
## Warning: There were 8 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `min_flow = min(WaterFlow, na.rm = TRUE)`.
## ℹ In group 114: `Hour = 2015-10-27 17:00:00`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 7 remaining warnings.
pipe2_summary <- pipe2_ts |>
summarise(
start_time = min(Hour),
end_time = max(Hour),
n_hours = n(),
missing = sum(is.na(WaterFlow)),
mean_flow = mean(WaterFlow, na.rm = TRUE),
sd_flow = sd(WaterFlow, na.rm = TRUE),
min_flow = min(WaterFlow, na.rm = TRUE),
max_flow = max(WaterFlow, na.rm = TRUE)
)
## Warning: There were 666 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `min_flow = min(WaterFlow, na.rm = TRUE)`.
## ℹ In group 2: `Hour = 2015-10-23 02:00:00`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 665 remaining warnings.
pipe1_summary
## # A tsibble: 240 x 9 [1h] <UTC>
## Hour start_time end_time n_hours missing
## <dttm> <dttm> <dttm> <int> <int>
## 1 2015-10-23 00:00:00 2015-10-23 00:00:00 2015-10-23 00:00:00 1 0
## 2 2015-10-23 01:00:00 2015-10-23 01:00:00 2015-10-23 01:00:00 1 0
## 3 2015-10-23 02:00:00 2015-10-23 02:00:00 2015-10-23 02:00:00 1 0
## 4 2015-10-23 03:00:00 2015-10-23 03:00:00 2015-10-23 03:00:00 1 0
## 5 2015-10-23 04:00:00 2015-10-23 04:00:00 2015-10-23 04:00:00 1 0
## 6 2015-10-23 05:00:00 2015-10-23 05:00:00 2015-10-23 05:00:00 1 0
## 7 2015-10-23 06:00:00 2015-10-23 06:00:00 2015-10-23 06:00:00 1 0
## 8 2015-10-23 07:00:00 2015-10-23 07:00:00 2015-10-23 07:00:00 1 0
## 9 2015-10-23 08:00:00 2015-10-23 08:00:00 2015-10-23 08:00:00 1 0
## 10 2015-10-23 09:00:00 2015-10-23 09:00:00 2015-10-23 09:00:00 1 0
## # ℹ 230 more rows
## # ℹ 4 more variables: mean_flow <dbl>, sd_flow <dbl>, min_flow <dbl>,
## # max_flow <dbl>
pipe2_summary
## # A tsibble: 1,000 x 9 [1h] <UTC>
## Hour start_time end_time n_hours missing
## <dttm> <dttm> <dttm> <int> <int>
## 1 2015-10-23 01:00:00 2015-10-23 01:00:00 2015-10-23 01:00:00 1 0
## 2 2015-10-23 02:00:00 2015-10-23 02:00:00 2015-10-23 02:00:00 1 1
## 3 2015-10-23 03:00:00 2015-10-23 03:00:00 2015-10-23 03:00:00 1 0
## 4 2015-10-23 04:00:00 2015-10-23 04:00:00 2015-10-23 04:00:00 1 0
## 5 2015-10-23 05:00:00 2015-10-23 05:00:00 2015-10-23 05:00:00 1 1
## 6 2015-10-23 06:00:00 2015-10-23 06:00:00 2015-10-23 06:00:00 1 0
## 7 2015-10-23 07:00:00 2015-10-23 07:00:00 2015-10-23 07:00:00 1 0
## 8 2015-10-23 08:00:00 2015-10-23 08:00:00 2015-10-23 08:00:00 1 1
## 9 2015-10-23 09:00:00 2015-10-23 09:00:00 2015-10-23 09:00:00 1 0
## 10 2015-10-23 10:00:00 2015-10-23 10:00:00 2015-10-23 10:00:00 1 0
## # ℹ 990 more rows
## # ℹ 4 more variables: mean_flow <dbl>, sd_flow <dbl>, min_flow <dbl>,
## # max_flow <dbl>
#autoplots hourly
autoplot(pipe1_ts, WaterFlow) +
labs(
title = "Hourly Water Flow - Pipe 1",
x = "Hour",
y = "Mean Water Flow"
)
autoplot(pipe2_ts, WaterFlow) +
labs(
title = "Hourly Water Flow - Pipe 2",
x = "Hour",
y = "Mean Water Flow"
)
autoplot(waterflow_combined, WaterFlow) +
facet_wrap(~ Pipe, scales = "free_y") +
labs(
title = "Hourly Water Flow for Both Pipes",
x = "Hour",
y = "Mean Water Flow"
)
# Export to csv
write.csv(pipe1_full, "pipe1_hourly_cleaned.csv", row.names = FALSE)
write.csv(pipe2_full, "pipe2_hourly_cleaned.csv", row.names = FALSE)
write.csv(waterflow_combined, "waterflow_hourly_combined.csv", row.names = FALSE)
The time plots do not show a strong trend and both series appear roughly stable in level. Pipe 1 is short and has a lot of noise for the cereation of a 1 week forecast. Pipe 2 has missing hourly timestamps so its not a clean regular hourly series and must be regularized before standard forecasting methods are applied. I would use a method of interpolation here due to the variance for hourly data I think this is the strongest option to completely prepare this data for forecasting.