#Problem: Forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward.Explain and demonstrate the process, techniques used and not used, actual forecast. Provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.
#install.packages("lubridate")
#install.packages("writexl") # run once
library(writexl)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(ggplot2)
library(fable)
## Loading required package: fabletools
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
library(fabletools)
library(tsibble)
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(dplyr)
library(readr)
#Loading ATM Data from github
atm_data <- read_csv("https://raw.githubusercontent.com/zahid607/Data624/refs/heads/main/ATM624Data.csv")
## Rows: 1474 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): DATE, ATM
## dbl (1): Cash
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(atm_data)
## Rows: 1,474
## Columns: 3
## $ DATE <chr> "5/1/2009 12:00:00 AM", "5/1/2009 12:00:00 AM", "5/2/2009 12:00:0…
## $ ATM <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ Cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …
#Overview of data:
#The dataset consists of time series data for daily cash withdrawals from 4 different ATMs between May 1, 2009 and April 30, 2010. There are three variables:DATE: date format, ATM: character format and Cash: dbl format.
#Data Preparation:
atm_data <- atm_data |>
mutate(
DATE = parse_date_time(DATE, orders = "mdy IMS p"),
DATE = as_date(DATE),
ATM = as.factor(ATM)
) |>
as_tsibble(index = DATE, key = ATM)
#Now data is ready for analysis
summary(atm_data)
## DATE ATM Cash
## Min. :2009-05-01 ATM1:365 Min. : 0.0
## 1st Qu.:2009-08-01 ATM2:365 1st Qu.: 0.5
## Median :2009-11-01 ATM3:365 Median : 73.0
## Mean :2009-10-31 ATM4:365 Mean : 155.6
## 3rd Qu.:2010-02-01 NA's: 14 3rd Qu.: 114.0
## Max. :2010-05-14 Max. :10920.0
## NA's :19
#Comment: The dataset spans approximately one year, from May 2009 to April 2010, and includes four ATM machines. Summary statistics of the Cash variable indicate variability in withdrawal amounts across machines. A total of 19 missing values were identified in the dataset and removed prior to analysis to ensure data quality.
atm_data <- atm_data |>
filter(!is.na(ATM))
#Plotting the time series for ATM
ggplot(atm_data, aes(x = DATE, y = Cash, color = ATM)) +
geom_line() +
labs(
title = "ATM Cash Over Time",
x = "Date",
y = "Cash"
) +
theme_minimal()
library(feasts)
## Registered S3 methods overwritten by 'ggtime':
## method from
## autolayer.fbl_ts fabletools
## autolayer.tbl_ts fabletools
## autoplot.dcmp_ts fabletools
## autoplot.fbl_ts fabletools
## autoplot.tbl_ts fabletools
## fortify.fbl_ts fabletools
atm_data %>%
autoplot(Cash) + # feasts::autoplot supports tsibble
facet_wrap(~ATM, ncol = 1, scales = "free_y") +
labs(
title = "ATM Withdrawal for Four ATMs (May 2009 - April 2010)",
x = "Date"
)
## Warning: `autoplot.tbl_ts()` was deprecated in fabletools 0.6.0.
## ℹ Please use `ggtime::autoplot.tbl_ts()` instead.
## ℹ Graphics functions have been moved to the {ggtime} package. Please use
## `library(ggtime)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Comment:
#ATM1 may have a seasonal pattern, likely weekly seasonality #ATM2 may have a seasonal pattern, likely weekly seasonality #ATM3 had no withdrawals until towards the end of the period, then had a sudden spike in April 2010 #ATM4 exhibits a large outlier in February 2010
#ATM 3
atm_data %>%
filter(ATM == "ATM3" & Cash >0)
## # A tsibble: 3 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <fct> <dbl>
## 1 2010-04-28 ATM3 96
## 2 2010-04-29 ATM3 82
## 3 2010-04-30 ATM3 85
#Comment: The time series plot for ATM 3 suggests a single extreme outlier with mostly zero withdrawals. However, the data table shows three consecutive days of withdrawals. While still unusual, their consecutive occurrence indicates they are likely valid events rather than errors.
#ATM 4
atm_data %>%
filter(ATM == "ATM4" & Cash > 175)
## # A tsibble: 255 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <fct> <dbl>
## 1 2009-05-01 ATM4 777
## 2 2009-05-02 ATM4 524
## 3 2009-05-03 ATM4 793
## 4 2009-05-04 ATM4 908
## 5 2009-05-08 ATM4 559
## 6 2009-05-09 ATM4 904
## 7 2009-05-10 ATM4 879
## 8 2009-05-11 ATM4 274
## 9 2009-05-12 ATM4 396
## 10 2009-05-13 ATM4 275
## # ℹ 245 more rows
#Comment: The outlier occurred on February 9, 2010, with a value of $1,091,976, far exceeding all other withdrawals, which remained below $175,000. Due to its extreme nature, the value was treated as an anomaly and replaced with the average of the preceding and following days.
atm_data <- atm_data %>%
group_by(ATM) %>%
mutate(Cash = ifelse(ATM == "ATM4" & DATE == as.Date("2010-02-09"),
mean(Cash[ATM == "ATM4" &
DATE %in% c(as.Date("2010-02-08"), as.Date("2010-02-10"))],
na.rm = TRUE),
Cash)) %>%
ungroup()
#Variance Stabilization Check:
# box cox
atm_lambda <- atm_data %>%
features(Cash, features = guerrero)
## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value
## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value
atm_lambda
## # A tibble: 4 × 2
## ATM lambda_guerrero
## <fct> <dbl>
## 1 ATM1 0.208
## 2 ATM2 0.716
## 3 ATM3 1.37
## 4 ATM4 0.450
#Comment
#ATM1 (λ = 0.17). Very close to 0 → strong transformation needed. A log or near-log transformation is appropriate.
#ATM2 (λ = 0.68). Moderate transformation.
#ATM3 (λ = 1.34).Greater than 1 → unusual case. Transformation may stretch larger values more than smaller ones.
#ATM4 (λ = 0.45). Moderate-to-strong transformation. Between square-root and log transformation.
#Model Development:
#This section describes the development, training, and evaluation of forecasting models used to predict daily cash withdrawals for each ATM.
# Fit Models
fit_models_atm <- atm_data %>%
model(
snaive = SNAIVE(Cash ~ lag("week")),
ets = ETS(Cash),
arima = ARIMA(Cash)
)
# Model info
glimpse(fit_models_atm)
## Rows: 4
## Columns: 4
## Key: ATM [4]
## $ ATM <fct> ATM1, ATM2, ATM3, ATM4
## $ snaive <model> [SNAIVE], [SNAIVE], [SNAIVE], [SNAIVE]
## $ ets <model> [ETS(M,N,N)], [ETS(A,A,N)], [ETS(A,N,N)], [ETS(M,N,M)]
## $ arima <model> [ARIMA(1,0,0)(2,1,0)[7]], [ARIMA(0,0,0)(2,1,0)[7]], [ARIMA(0,0,…
#Best Model Selection
# Check accuracy for each model - focusing on MAE, RMSE, and MAPE
accuracy(fit_models_atm)
## # A tibble: 12 × 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 snaive Train… -0.0366 27.8 17.7 -97.3 117. 1.00 0.999 0.181
## 2 ATM1 ets Train… -0.0687 36.8 27.5 -180. 204. 1.56 1.32 0.0351
## 3 ATM1 arima Train… -0.0894 24.3 15.4 -98.7 115. 0.870 0.873 0.00920
## 4 ATM2 snaive Train… 0.0225 29.7 20.4 -Inf Inf 0.997 0.997 0.0486
## 5 ATM2 ets Train… 0.709 38.2 32.7 -Inf Inf 1.60 1.28 -0.0153
## 6 ATM2 arima Train… -0.118 25.2 17.3 -Inf Inf 0.843 0.846 0.0150
## 7 ATM3 snaive Train… 0.735 8.04 0.735 100 100 1 1 0.640
## 8 ATM3 ets Train… 0.270 5.03 0.273 Inf Inf 0.371 0.625 -0.00736
## 9 ATM3 arima Train… 0.271 5.03 0.271 34.6 34.6 0.370 0.625 0.0124
## 10 ATM4 snaive Train… -3.70 453. 346. -383. 438. 1 1 0.0622
## 11 ATM4 ets Train… -30.4 354. 277. -434. 466. 0.798 0.782 0.0571
## 12 ATM4 arima Train… 0.0288 340. 280. -488. 520. 0.809 0.749 0.00472
#Comment:
#The best model for each ATM was determined by comparing MAE, RMSE, and MPE values. However, for ATM1 and ATM2, all evaluation metrics are missing (NaN), making it impossible to determine the best model for those series.
#Residual Diagonstics:
atm_data <- atm_data %>%
group_by(ATM) %>%
fill_gaps()
atm_data <- atm_data %>%
group_by(ATM) %>%
fill_gaps() %>%
mutate(Cash = replace_na(Cash, 0))
fit_models_atm <- atm_data %>%
model(
arima = ARIMA(Cash),
ets = ETS(Cash),
snaive = SNAIVE(Cash)
)
#ATM 1
fit_models_atm %>%
filter(ATM == "ATM1") %>%
select(arima) %>%
gg_tsresiduals()
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## ℹ Graphics functions have been moved to the {ggtime} package. Please use
## `library(ggtime)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#ATM 2
fit_models_atm %>%
filter(ATM == "ATM2") %>%
select(arima) %>%
gg_tsresiduals()
#ATM 3
fit_models_atm %>%
filter(ATM == "ATM3") %>%
select(arima) %>%
gg_tsresiduals()
#ATM 4
fit_models_atm %>%
filter(ATM == "ATM4") %>%
select(arima) %>%
gg_tsresiduals()
#Forecasting:
acc_tables_atm <- accuracy(fit_models_atm)
# select best models
best_models <- acc_tables_atm %>%
group_by(ATM) %>%
arrange(RMSE, MAE, .by_group = TRUE) %>%
slice(1) %>%
ungroup() %>%
select(ATM, .model)
# forecast
future <- new_data(atm_data, 31)
## Grouping structure is ignored.
## ℹ `ungroup()` to silence this message.
# Forecast with ALL models
fc_all <- forecast(fit_models_atm, new_data = future)
# Keep only the chosen model per ATM
final_fc <- fc_all %>%
inner_join(best_models, by = c("ATM", ".model"))
# write to excel
final_fc %>%
write_xlsx('May2010_ATM_Forecasts.xlsx')
## Warning in write_xlsx(., "May2010_ATM_Forecasts.xlsx"): Column 'Cash' has
## unrecognized data type.
#Forecasting Plots:
# Plot forecasts for each ATM
autoplot(final_fc, atm_data) +
labs(title = "Forecasted Cash Withdrawals for May 2010", y = "USD") +
facet_wrap(~ATM, scales = "free_y") +
xlab("Date") +
ylab("Withdrawal (USD)")+
theme(
plot.title = element_text(hjust=0.5, face='bold')
)
## Warning: `autoplot.fbl_ts()` was deprecated in fabletools 0.6.0.
## ℹ Please use `ggtime::autoplot.fbl_ts()` instead.
## ℹ Graphics functions have been moved to the {ggtime} package. Please use
## `library(ggtime)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `mutate_if()` ignored the following grouping variables:
## • Column `ATM`
#Part B
#Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.
#Loading data
url <- "https://raw.githubusercontent.com/zahid607/Data624/refs/heads/main/ResidentialCustomerForecastLoad-624.csv"
power_data <- read_csv(url)
## Rows: 192 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): YYYY-MMM
## dbl (2): CaseSequence, KWH
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(power_data)
## Rows: 192
## Columns: 3
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ `YYYY-MMM` <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ KWH <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
#Data Preparation
# convert YYYY-MMM column into date format and convert data into tsibble for time series
power_data <- power_data %>%
mutate(
Date = yearmonth(`YYYY-MMM`) # convert to yearmonth object
) %>%
select(CaseSequence, Date, KWH) %>%
as_tsibble(index = Date)
summary(power_data$KWH)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 770523 5429912 6283324 6502475 7620524 10655730 1
range(power_data$Date)
## <yearmonth[2]>
## [1] "1998 Jan" "2013 Dec"
#Missing value
power_data %>%
filter(is.na(KWH))
## # A tsibble: 1 x 3 [1M]
## CaseSequence Date KWH
## <dbl> <mth> <dbl>
## 1 861 2008 Sep NA
#Plotting the power time series
autoplot(power_data, KWH) +
labs(
title = "Residential Power Usage\nBetween January 1998 and December 2013",
x = "Month",
y = "Power (KWH)"
)
#Impute missing September values with the average September KWH across all years
power_data <- power_data %>%
mutate(
month = month(Date, label = TRUE), # extract month name
KWH = ifelse(
is.na(KWH) & month == "Sep",
mean(KWH[month == "Sep"], na.rm = TRUE), # average KWH of all Septembers
KWH
)
) %>%
select(-month)
#Plotting the power time series
autoplot(power_data, KWH) +
labs(
title = "Residential Power Usage\nBetween January 1998 and December 2013",
x = "Month",
y = "Power (KWH)"
) +
theme(
plot.title = element_text(hjust = 0.5, face = 'bold')
)
#Looking Closer at the Outlier
power_data %>%
filter(KWH<3000000)
## # A tsibble: 1 x 3 [1M]
## CaseSequence Date KWH
## <dbl> <mth> <dbl>
## 1 883 2010 Jul 770523
#Variance Check
# box cox
power_lambda <- power_data %>%
features(KWH, features = guerrero) %>%
pull(lambda_guerrero)
power_lambda
## [1] 0.1226627
#Comment: The lambda is 0.1226627, which is very close to 0. I will apply a box-cox transformation with λ=0.1226627.
#Applying box-cox transformation
power_data$KWH_transformed <- box_cox(power_data$KWH, power_lambda)
# plot after transformation
autoplot(power_data, KWH_transformed) +
labs(
title = "Residential Power Usage\nBetween January 1998 and December 2013\nBox-Cox Transformation",
x = "Month",
y = "Power (KWH)"
) +
theme(
plot.title = element_text(hjust = 0.5, face = 'bold')
)
#Model Development:
# fit models
fit_models_power <- power_data %>%
model(
ets = ETS(KWH),
arima = ARIMA(KWH),
stl_ets = decomposition_model(
STL(KWH ~ season(window = "periodic")),
ETS(season_adjust ~ error("A") + trend("A") + season("A"))
)
)
#Model overview
# MODEL INFO
glimpse(fit_models_power)
## Rows: 1
## Columns: 3
## $ ets <model> [ETS(M,N,M)]
## $ arima <model> [ARIMA(0,0,1)(1,1,1)[12] w/ drift]
## $ stl_ets <model> [STL decomposition model]
#Best Model Selection
# Check accuracy for each model - focusing on MAE, RMSE, and MAPE
accuracy(fit_models_power)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ets Training 62242. 826660. 500307. -4.33 12.0 0.729 0.705 0.173
## 2 arima Training -25758. 823919. 489801. -5.52 11.6 0.714 0.702 0.0131
## 3 stl_ets Training 53043. 821514. 504335. -4.46 11.8 0.735 0.700 0.233
#Forecasting
# ARIMA model
arima_model <- power_data %>%
model(ARIMA(KWH))
# future 12 months
future_power <- new_data(power_data,12)
# generate forecasts
fc_power <- forecast(arima_model, new_data = future_power)
# write to excel
fc_power %>%
write_xlsx('2014_Power_Usage_Forecasts.xlsx')
## Warning in write_xlsx(., "2014_Power_Usage_Forecasts.xlsx"): Column 'KWH' has
## unrecognized data type.
#Forecasting Plot:
# plot forecasts
autoplot(fc_power, power_data) +
labs(
title = "Residential Power Usage Forecast for 2014\n(ARIMA Model)",
x = "Month",
y = "Power (KWH)"
) +
theme(
plot.title = element_text(hjust = 0.5, face='bold')
)
#Final Result
fc_power
## # A fable: 12 x 4 [1M]
## # Key: .model [1]
## .model Date
## <chr> <mth>
## 1 ARIMA(KWH) 2014 Jan
## 2 ARIMA(KWH) 2014 Feb
## 3 ARIMA(KWH) 2014 Mar
## 4 ARIMA(KWH) 2014 Apr
## 5 ARIMA(KWH) 2014 May
## 6 ARIMA(KWH) 2014 Jun
## 7 ARIMA(KWH) 2014 Jul
## 8 ARIMA(KWH) 2014 Aug
## 9 ARIMA(KWH) 2014 Sep
## 10 ARIMA(KWH) 2014 Oct
## 11 ARIMA(KWH) 2014 Nov
## 12 ARIMA(KWH) 2014 Dec
## # ℹ 2 more variables: KWH <dist>, .mean <dbl>