Part A – ATM Forecast, ATM624Data.xlsx
In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.
library(readxl)
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## Registered S3 methods overwritten by 'ggtime':
## method from
## autolayer.tbl_ts fabletools
## autoplot.dcmp_ts fabletools
## autoplot.tbl_ts fabletools
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tibble 3.3.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.2 ✔ feasts 0.5.0
## ✔ lubridate 1.9.4 ✔ fable 0.5.0
## ✔ ggplot2 4.0.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
ATM624Data <- read_excel("ATM624Data.xlsx") |>
mutate(DATE = as.Date(DATE, origin = "1899-12-30"))
# Clean the data and keep the exact column names
ATM624Data <- ATM624Data |>
filter(!is.na(ATM)) |>
group_by(ATM) |>
mutate(Cash = ifelse(is.na(Cash), (lag(Cash) + lead(Cash)) / 2, Cash)) |>
ungroup()
# Convert to a tsibble using DATE as the index and ATM as the key
ATM624Data <- ATM624Data |>
as_tsibble(index = DATE, key = ATM)
ATM624Data |>
autoplot(Cash) +
facet_wrap(~ ATM, scales = "free_y") +
labs(
title = "ATM Cash Withdrawals",
x = "Date",
y = "Cash"
)
# Split into training and testing sets
ATM_train <- ATM624Data |>
filter(DATE <= as.Date("2010-03-31"))
ATM_test <- ATM624Data |>
filter(DATE >= as.Date("2010-04-01"),
DATE <= as.Date("2010-04-30"))
# Fit several simple forecasting models on the training data
ATM_fit_all <- ATM_train |>
model(
ETS = ETS(Cash),
ARIMA = ARIMA(Cash),
Drift = RW(Cash ~ drift()),
SNAIVE = SNAIVE(Cash ~ lag("week"))
)
# Forecast April 2010 and compare against the test set
ATM_fc_test_all <- ATM_fit_all |>
forecast(new_data = ATM_test)
ATM_fc_test_all |>
autoplot(ATM_train, level = NULL) +
autolayer(ATM_test, Cash) +
facet_grid(ATM ~ .model, scales = "free_y") +
labs(
title = "April 2010 Forecast Check by ATM and Model",
x = "Date",
y = "Cash"
)
# Compute accuracy and choose the best model for each ATM
ATM_accuracy_all <- ATM_fc_test_all |>
accuracy(ATM_test) |>
arrange(ATM, RMSE)
ATM_accuracy_all
## # A tibble: 16 × 11
## .model ATM .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS ATM1 Test -5.75 11.6 9.11 -61.0 64.7 NaN NaN -0.254
## 2 ARIMA ATM1 Test -6.90 12.4 9.71 -83.3 86.4 NaN NaN -0.287
## 3 SNAIVE ATM1 Test -7.07 16.0 13.5 -66.2 73.4 NaN NaN 0.0437
## 4 Drift ATM1 Test -51.8 60.3 51.8 -529. 529. NaN NaN -0.140
## 5 ETS ATM2 Test 9.53 19.4 14.3 -25.4 57.4 NaN NaN -0.0954
## 6 ARIMA ATM2 Test 8.62 21.0 16.3 -50.5 79.5 NaN NaN 0.0999
## 7 SNAIVE ATM2 Test 12.9 26.2 17.8 35.1 45.1 NaN NaN -0.319
## 8 Drift ATM2 Test -40.4 54.8 41.2 -1139. 1140. NaN NaN 0.136
## 9 ARIMA ATM3 Test 8.77 27.8 8.77 100 100 NaN NaN 0.633
## 10 Drift ATM3 Test 8.77 27.8 8.77 100 100 NaN NaN 0.633
## 11 ETS ATM3 Test 8.77 27.8 8.77 100 100 NaN NaN 0.633
## 12 SNAIVE ATM3 Test 8.77 27.8 8.77 100 100 NaN NaN 0.633
## 13 Drift ATM4 Test -84.5 291. 248. -1218. 1240. NaN NaN -0.0168
## 14 ARIMA ATM4 Test -85.8 293. 249. -1219. 1240. NaN NaN -0.00380
## 15 SNAIVE ATM4 Test -130. 301. 227. -360. 375. NaN NaN 0.0298
## 16 ETS ATM4 Test -105. 386. 302. -1759. 1791. NaN NaN 0.0591
ATM_best_models <- ATM_accuracy_all |>
group_by(ATM) |>
slice_min(RMSE, n = 1) |>
ungroup() |>
select(ATM, .model, RMSE, MAE, MAPE)
ATM_best_models
## # A tibble: 7 × 5
## ATM .model RMSE MAE MAPE
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 ATM1 ETS 11.6 9.11 64.7
## 2 ATM2 ETS 19.4 14.3 57.4
## 3 ATM3 ARIMA 27.8 8.77 100
## 4 ATM3 Drift 27.8 8.77 100
## 5 ATM3 ETS 27.8 8.77 100
## 6 ATM3 SNAIVE 27.8 8.77 100
## 7 ATM4 Drift 291. 248. 1240.
# Refit models on the full dataset and forecast May 2010
ATM_fit_full <- ATM624Data |>
model(
ETS = ETS(Cash),
ARIMA = ARIMA(Cash),
Drift = RW(Cash ~ drift()),
SNAIVE = SNAIVE(Cash ~ lag("week"))
)
ATM_may_2010 <- ATM_fit_full |>
forecast(h = "31 days") |>
semi_join(ATM_best_models, by = c("ATM", ".model"))
ATM_may_2010
## # A fable: 217 x 5 [1D]
## # Key: ATM, .model [7]
## ATM .model DATE
## <chr> <chr> <date>
## 1 ATM1 ETS 2010-05-01
## 2 ATM1 ETS 2010-05-02
## 3 ATM1 ETS 2010-05-03
## 4 ATM1 ETS 2010-05-04
## 5 ATM1 ETS 2010-05-05
## 6 ATM1 ETS 2010-05-06
## 7 ATM1 ETS 2010-05-07
## 8 ATM1 ETS 2010-05-08
## 9 ATM1 ETS 2010-05-09
## 10 ATM1 ETS 2010-05-10
## # ℹ 207 more rows
## # ℹ 2 more variables: Cash <dist>, .mean <dbl>
ATM_may_2010 |>
autoplot(ATM624Data, level = NULL) +
facet_wrap(~ ATM, scales = "free_y") +
labs(
title = "May 2010 Forecast for ATM Cash Withdrawals",
x = "Date",
y = "Cash"
)
ATM_may_2010_export <- ATM_may_2010 |>
as_tibble() |>
mutate(Cash = .mean) |>
select(DATE, ATM, .model, Cash)
ATM_may_2010_export
## # A tibble: 217 × 4
## DATE ATM .model Cash
## <date> <chr> <chr> <dbl>
## 1 2010-05-01 ATM1 ETS 86.9
## 2 2010-05-02 ATM1 ETS 101.
## 3 2010-05-03 ATM1 ETS 73.0
## 4 2010-05-04 ATM1 ETS 5.51
## 5 2010-05-05 ATM1 ETS 100.
## 6 2010-05-06 ATM1 ETS 79.3
## 7 2010-05-07 ATM1 ETS 85.5
## 8 2010-05-08 ATM1 ETS 86.9
## 9 2010-05-09 ATM1 ETS 101.
## 10 2010-05-10 ATM1 ETS 73.0
## # ℹ 207 more rows
write.csv(ATM_may_2010_export, "ATM_May_2010_Forecast.csv", row.names = FALSE)
For Part A, I analyzed the daily cash withdrawals for the four ATMs and created forecasts for May 2010. The Cash variable is measured in hundreds of dollars.
I first plotted the full time series for each ATM to understand the behavior of the data. ATM1 and ATM2 both showed clear repeating weekly patterns, which suggests that withdrawals depend on the day of the week. ATM3 behaved very differently from the others because it stayed at zero for most of the series and only increased at the very end. ATM4 had much larger withdrawal values than the other machines and also contained a very large spike, making it the most volatile series.
To evaluate forecasting performance, I split the data into a training period ending on March 31, 2010 and a testing period covering April 2010. I then fit several simple forecasting models and compared them using forecast accuracy on the April test period. The models considered were ETS, ARIMA, Drift, and Seasonal Naive when appropriate.
Based on the test results, ETS gave the best performance for ATM1 and ATM2. This makes sense because both series had strong repeating patterns and ETS was able to capture that behavior well. For ATM3, the series was unusual because it was almost entirely zero and then jumped near the end, so a simple Drift model was used. For ATM4, the Drift model performed best on the test set.
The final model choices were:
ATM1: ETS ATM2: ETS ATM3: Drift ATM4: Drift
Using these best models, I refit each model on the full available dataset and generated forecasts for May 2010.
The final forecasts suggest that ATM1 and ATM2 will continue their weekly withdrawal patterns through May. ATM3 is forecasted to remain above zero after the recent increase at the end of the observed series. ATM4 is forecasted to gradually decline through May, which is reasonable after the large spike seen earlier in the data.
Overall, the results show that the four ATMs do not behave the same way, so choosing separate models for each machine was the most appropriate approach.
Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.
ResidentialCustomerForecastLoad.624 <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
# Convert the date column and create the tsibble
ResidentialCustomerForecastLoad.624 <- ResidentialCustomerForecastLoad.624 |>
mutate(`YYYY-MMM` = yearmonth(`YYYY-MMM`)) |>
as_tsibble(index = `YYYY-MMM`)
autoplot(ResidentialCustomerForecastLoad.624, KWH) +
labs(
title = "Residential Power Usage",
x = "Month",
y = "KWH"
)
gg_season(ResidentialCustomerForecastLoad.624, KWH)
gg_subseries(ResidentialCustomerForecastLoad.624, KWH)
# Split into training and testing sets
ResidentialCustomerForecastLoad_train <- ResidentialCustomerForecastLoad.624 |>
filter(`YYYY-MMM` <= yearmonth("2012 Dec"))
ResidentialCustomerForecastLoad_test <- ResidentialCustomerForecastLoad.624 |>
filter(`YYYY-MMM` >= yearmonth("2013 Jan"),
`YYYY-MMM` <= yearmonth("2013 Dec"))
# Fit several simple forecasting models on the training data
ResidentialCustomerForecastLoad_fit <- ResidentialCustomerForecastLoad_train |>
model(
ETS = ETS(KWH),
ARIMA = ARIMA(KWH),
SNAIVE = SNAIVE(KWH)
)
report(ResidentialCustomerForecastLoad_fit)
## Warning in report.mdl_df(ResidentialCustomerForecastLoad_fit): Model reporting
## is only supported for individual models, so a glance will be shown. To see the
## report for a specific model, use `select()` and `filter()` to identify a single
## model.
## # A tibble: 3 × 11
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE ar_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list>
## 1 ETS 4.83e- 2 -3013. 6033. 6033. 6042. 1.95e12 1.99e12 NA <NULL>
## 2 ARIMA 6.86e+11 -2519. 5050. 5051. 5069. NA NA NA <cpl [25]>
## 3 SNAIVE 1.41e+12 NA NA NA NA NA NA NA <NULL>
## # ℹ 1 more variable: ma_roots <list>
# Forecast 2013 and compare against the test set
ResidentialCustomerForecastLoad_fc_test <- ResidentialCustomerForecastLoad_fit |>
forecast(new_data = ResidentialCustomerForecastLoad_test)
autoplot(ResidentialCustomerForecastLoad_fc_test, ResidentialCustomerForecastLoad_train) +
autolayer(ResidentialCustomerForecastLoad_test, KWH) +
labs(
title = "2013 Forecast Check",
x = "Month",
y = "KWH"
)
# Compute accuracy and choose the best model
ResidentialCustomerForecastLoad_accuracy <- ResidentialCustomerForecastLoad_fc_test |>
accuracy(ResidentialCustomerForecastLoad_test) |>
arrange(RMSE)
ResidentialCustomerForecastLoad_accuracy
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE Test 405195. 1035538. 618606. 4.55 7.06 NaN NaN -0.0313
## 2 ARIMA Test 465643. 1500060. 971577. 4.60 11.8 NaN NaN 0.0218
## 3 ETS Test 674803. 1703522. 1445780. 5.03 18.0 NaN NaN 0.165
ResidentialCustomerForecastLoad_best_model <- ResidentialCustomerForecastLoad_accuracy |>
slice_min(RMSE, n = 1)
ResidentialCustomerForecastLoad_best_model
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE Test 405195. 1035538. 618606. 4.55 7.06 NaN NaN -0.0313
# Refit models on the full dataset and forecast 2014
ResidentialCustomerForecastLoad_fit_full <- ResidentialCustomerForecastLoad.624 |>
model(
ETS = ETS(KWH),
ARIMA = ARIMA(KWH),
SNAIVE = SNAIVE(KWH)
)
ResidentialCustomerForecastLoad_2014 <- ResidentialCustomerForecastLoad_fit_full |>
forecast(h = "12 months") |>
filter(.model == ResidentialCustomerForecastLoad_best_model$.model)
ResidentialCustomerForecastLoad_2014
## # A fable: 12 x 4 [1M]
## # Key: .model [1]
## .model `YYYY-MMM`
## <chr> <mth>
## 1 SNAIVE 2014 Jan
## 2 SNAIVE 2014 Feb
## 3 SNAIVE 2014 Mar
## 4 SNAIVE 2014 Apr
## 5 SNAIVE 2014 May
## 6 SNAIVE 2014 Jun
## 7 SNAIVE 2014 Jul
## 8 SNAIVE 2014 Aug
## 9 SNAIVE 2014 Sep
## 10 SNAIVE 2014 Oct
## 11 SNAIVE 2014 Nov
## 12 SNAIVE 2014 Dec
## # ℹ 2 more variables: KWH <dist>, .mean <dbl>
autoplot(ResidentialCustomerForecastLoad_2014, ResidentialCustomerForecastLoad.624) +
labs(
title = "2014 Residential Power Forecast",
x = "Month",
y = "KWH"
)
ResidentialCustomerForecastLoad_2014_export <- ResidentialCustomerForecastLoad_2014 |>
as_tibble() |>
mutate(KWH = .mean) |>
select(`YYYY-MMM`, .model, KWH)
ResidentialCustomerForecastLoad_2014_export
## # A tibble: 12 × 3
## `YYYY-MMM` .model KWH
## <mth> <chr> <dbl>
## 1 2014 Jan SNAIVE 10655730
## 2 2014 Feb SNAIVE 7681798
## 3 2014 Mar SNAIVE 6517514
## 4 2014 Apr SNAIVE 6105359
## 5 2014 May SNAIVE 5940475
## 6 2014 Jun SNAIVE 7920627
## 7 2014 Jul SNAIVE 8415321
## 8 2014 Aug SNAIVE 9080226
## 9 2014 Sep SNAIVE 7968220
## 10 2014 Oct SNAIVE 5759367
## 11 2014 Nov SNAIVE 5769083
## 12 2014 Dec SNAIVE 9606304
write.csv(
ResidentialCustomerForecastLoad_2014_export,
"ResidentialCustomerForecastLoad_2014_Forecast.csv",
row.names = FALSE
)
For Part B, I analyzed the monthly residential power usage data and created forecasts for 2014. I first plotted the full series and then used seasonal plots to better understand the pattern in the data.
The plots showed a strong seasonal pattern, with usage changing in a similar way from year to year. The series also shows some growth over time, especially in the later years, along with one unusually low observation around 2010.
I used the data through 2012 as training data and held out all of 2013 as a test set. I then fit ETS, ARIMA, and Seasonal Naive models and compared them using forecast accuracy on the 2013 test period.
The best model was:
SNAIVE
This model had the lowest RMSE on the test set, so I used it to generate the 2014 forecast. This result makes sense because the data has a strong yearly seasonal pattern, and the Seasonal Naive model repeats the pattern from the previous year.
The 2014 forecast values are:
January: 10,655,730 February: 7,681,798 March: 6,517,514 April: 6,105,359 May: 5,940,475 June: 7,920,627 July: 8,415,321 August: 9,080,226 September: 7,968,220 October: 5,759,367 November: 5,769,083 December: 9,606,304
Overall, the forecast keeps the same seasonal pattern seen in the historical data, with higher usage in winter and late summer and lower usage in spring and fall.