In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.3 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(lubridate)
library(seasonal)
##
## Attaching package: 'seasonal'
## The following object is masked from 'package:tibble':
##
## view
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ purrr 1.0.2 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ seasonal::view() masks tibble::view()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
atm_cash_withdrawl <- read_csv("https://raw.githubusercontent.com/gillianmcgovern0/cuny-data-624/refs/heads/main/ATM624Data(ATM%20Data).csv")
## Rows: 1474 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): DATE, ATM
## dbl (1): Cash
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(atm_cash_withdrawl)
## # A tibble: 6 × 3
## DATE ATM Cash
## <chr> <chr> <dbl>
## 1 5/1/2009 12:00:00 AM ATM1 96
## 2 5/1/2009 12:00:00 AM ATM2 107
## 3 5/2/2009 12:00:00 AM ATM1 82
## 4 5/2/2009 12:00:00 AM ATM2 89
## 5 5/3/2009 12:00:00 AM ATM1 85
## 6 5/3/2009 12:00:00 AM ATM2 90
Now we must clean the data to get an accurate time series. Let’s convert the date variable into an actual date data type and group by ATM:
# Sum up the cash by ATM
atm_cash_withdrawl <- atm_cash_withdrawl |>
separate(DATE, c("Date", "Time"), " ") |>
group_by(Date, ATM) |>
summarise(Cash = sum(Cash)) # it looks like there are no duplicates for an ATM/Date combination, but to be sure let's sum by ATM/Date
## Warning: Expected 2 pieces. Additional pieces discarded in 1474 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'Date'. You can override using the
## `.groups` argument.
head(atm_cash_withdrawl)
## # A tibble: 6 × 3
## # Groups: Date [2]
## Date ATM Cash
## <chr> <chr> <dbl>
## 1 1/1/2010 ATM1 132
## 2 1/1/2010 ATM2 117
## 3 1/1/2010 ATM3 0
## 4 1/1/2010 ATM4 986
## 5 1/10/2010 ATM1 115
## 6 1/10/2010 ATM2 47
Now let’s update the formatting and convert to tsibble:
# Update to correct date formatting to prepare for conversion to "Date" type
atm_cash_withdrawl2 <- atm_cash_withdrawl |>
mutate(Date = strptime(Date, "%m/%d/%Y"))
atm_cash_withdrawl_final <- atm_cash_withdrawl2 |>
mutate(Date = ymd(Date)) |>
as_tsibble(key = c(ATM, Cash), index = Date)
atm_cash_withdrawl_final
## # A tsibble: 1,474 x 3 [1D]
## # Key: ATM, Cash [532]
## # Groups: @ Date [379]
## Date ATM Cash
## <date> <chr> <dbl>
## 1 2009-09-11 ATM1 1
## 2 2010-02-23 ATM1 1
## 3 2009-12-30 ATM1 2
## 4 2009-06-05 ATM1 3
## 5 2010-03-23 ATM1 3
## 6 2010-04-13 ATM1 3
## 7 2010-04-20 ATM1 3
## 8 2009-07-09 ATM1 4
## 9 2010-03-02 ATM1 4
## 10 2010-03-09 ATM1 4
## # ℹ 1,464 more rows
We now have the final tsibble.
Let’s plot the original data:
atm_cash_withdrawl_final |>
filter(!is.na(ATM)) |>
ggplot(aes(x = Date, y = Cash, colour = ATM)) +
geom_line() +
facet_grid(ATM ~ ., scales = "free_y")
The time interval for this time series is daily, but all these plots look different. Let’s focus on each ATM individually.
Let’s focus on ATM1
first:
atm_cash_withdrawl_final_atm1 <- atm_cash_withdrawl_final |>
filter(ATM == "ATM1")
# We now don't want `ATM` variable to be part of the key in tsibble, so let's convert
# back to data frame, and remove the `ATM` column
atm_cash_withdrawl_final_atm1_df <- atm_cash_withdrawl_final_atm1 |>
as.data.frame(atm_cash_withdrawl_final_atm1)
# Re-create tsibble without `ATM` variable
atm_cash_withdrawl_final_atm1 <- atm_cash_withdrawl_final_atm1_df |>
as_tsibble(index = Date)
head(atm_cash_withdrawl_final_atm1)
## # A tsibble: 6 x 3 [1D]
## Date ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-02 ATM1 82
## 3 2009-05-03 ATM1 85
## 4 2009-05-04 ATM1 90
## 5 2009-05-05 ATM1 99
## 6 2009-05-06 ATM1 88
autoplot(atm_cash_withdrawl_final_atm1, Cash)
Similar to the previous graph, we can see there’s definitely
seasonality, but doesn’t look like any major trend exists. Additionally,
the change in variation looks pretty constant as time increases. It does
look like the variation becomes smaller and then larger again in between
Oct 2009
and Jan 2010
.
Let’s try box-cox transformation just to confirm it doesn’t make the above graph look better:
# Box-cox transformation
lambda_atm_cash_withdrawl_final_atm1<- atm_cash_withdrawl_final_atm1 |>
features(Cash, features = guerrero) |>
pull(lambda_guerrero)
atm_cash_withdrawl_final_atm1 |>
autoplot(box_cox(Cash, lambda_atm_cash_withdrawl_final_atm1))
This doesn’t look like it improved the variance from the original graph, which was expected, so let’s hold off on using a box-cox transformation.
Additionally I am not seeing any major outliers for this data.
Now let’s investigate seasonality more:
# Check for all seasonality
gg_season(atm_cash_withdrawl_final_atm1, Cash) +
labs(title="ATM1")
atm_cash_withdrawl_final_atm1 |>
gg_season(Cash, period = "week") +
labs(title="ATM1 - Weekly")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
atm_cash_withdrawl_final_atm1 |>
gg_season(Cash, period = "month") +
labs(title="ATM1 - Monthly")
atm_cash_withdrawl_final_atm1 |>
gg_season(Cash, period = "year") +
labs(title="ATM1 - Yearly")
Looking at the first seasonality graph, it appears the seasonality period is a week. We don’t have that much data, so we can’t say if there’s annual seasonality.
The weekly seasonality plot during recent weeks show that Cash withdrawl increases until Tuesday, then drops down with a trough at Thursday, increases Friday, decreases a bit Saturday then increases a bit again Sunday.
From what looks like W10 - W20, Cash withdrawl decreases Tuesday then increases Wednesday, slightly decreases Thursday and it’s hard to tell, but it looks like it increases to Sunday.
The monthly seasonal plot doesn’t give us much insight about the data.
So overall, there seems to be 2 distinct seasonality patterns here. Let’s take a look what these dates could be by looking at a seasonal subseries plot:
gg_subseries(atm_cash_withdrawl_final_atm1, Cash, period = "week") +
labs(title = "Daily ATM1 Cash")
It looks like in March/April, is when Tuesday’s Cash has a major drop. In March/April, Wednesday is above average as well as Thursday. What’s interesting is that around the same time Tuesday experienced a drop, Thursday experienced a large increase. Friday has a few weeks where it experienced a major decrease. Saturday and Sunday are for the most part pretty similar across all months.
Let’s look at ACF now:
atm_cash_withdrawl_final_atm1 |>
ACF(Cash, lag_max = 42) |>
autoplot()
This graph indicates:
Since this has complex weekly seasonality, let’s choose to do an STL
decomposition. Since STL()
requires no missing periods, I
will choose to start this at July 2009:
atm_cash_withdrawl_final_atm1_filtered <- atm_cash_withdrawl_final_atm1 |>
filter(yearmonth(Date) >= yearmonth("2009 Jul"))
dcmp <- atm_cash_withdrawl_final_atm1_filtered |>
model(stl = STL(Cash))
components(dcmp) |> autoplot()
The STL()
decomposition confirms the weekly seasonal
period. It also shows there really isn’t a major trend happening other
then just constant increasing and decreasing.
season_week shows that there is a change in seasonal variance in mid-February where variance becomes a lot smaller, but then starts to increase again. Something definitely occured in March 2010.
gg_lag(atm_cash_withdrawl_final_atm1, Cash, geom = "point", lag=1:7)
## Warning: Removed 3 rows containing missing values (gg_lag).
gg_lag(atm_cash_withdrawl_final_atm1, Cash, geom = "point", lag=1:52)
## Warning: Removed 3 rows containing missing values (gg_lag).
Before we pick a model, let’s determine how we’ll handle the missing data.
# Find dates with NA values
atm_cash_withdrawl_final_atm1 |>
filter(is.na(Cash)) |>
mutate(day_of_the_week = weekdays(Date))
## # A tsibble: 3 x 4 [1D]
## Date ATM Cash day_of_the_week
## <date> <chr> <dbl> <chr>
## 1 2009-06-13 ATM1 NA Saturday
## 2 2009-06-16 ATM1 NA Tuesday
## 3 2009-06-22 ATM1 NA Monday
2009-06-13
, 2009-06-16
,
2009-06-22
are the only dates with missing values. Since we
don’t know why this data is missing, we probably shouldn’t remove the
data entirely, especially since we’re not dealing with a lot of data in
the first place.
Since these are individual days, we could probably safely estimate the missing data:
atm_cash_withdrawl_final_atm1_miss <- atm_cash_withdrawl_final_atm1 |>
# Replace with missing values
fill_gaps()
atm_cash_withdrawl_final_atm1_fill <- atm_cash_withdrawl_final_atm1_miss |>
# Fit ARIMA model to the data containing missing values
model(ARIMA(Cash)) |>
# Estimate Trips for all periods
interpolate(atm_cash_withdrawl_final_atm1_miss)
atm_cash_withdrawl_final_atm1_fill |>
autoplot(Cash) +
autolayer(atm_cash_withdrawl_final_atm1_fill |> filter_index("2009-06-14"~"2009-06-17"),
Cash, colour="#D55E00") +
autolayer(atm_cash_withdrawl_final_atm1_fill |> filter_index("2009-06-21"~"2009-06-23"),
Cash, colour="#D55E00")
This graph looks very reasonable to me. This will now allow us to test some models.
# Confirm range
range(atm_cash_withdrawl_final_atm1$Date, na.rm = TRUE)
## [1] "2009-05-01" "2010-04-30"
# Experiment with all simple forecast models
atm_cash_withdrawl_final_atm1_fit <- atm_cash_withdrawl_final_atm1_fill |>
filter(!is.na(Cash)) |> # just want data that contains Bricks data
model(
Seasonal_naive = SNAIVE(Cash),
Naive = NAIVE(Cash),
Drift = RW(Cash ~ drift()),
Mean = MEAN(Cash)
)
# Forecast May 2010
atm_cash_withdrawl_final_atm1_fc <- atm_cash_withdrawl_final_atm1_fit |>
forecast(h = 31)
atm_cash_withdrawl_final_atm1_fc |>
autoplot(atm_cash_withdrawl_final_atm1, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
The only viable forecast is the seasonal naive, but even that does look great.
Let’s look at
# ETS
fit <- atm_cash_withdrawl_final_atm1_fill |>
model(
mna = ETS(Cash ~ error("M") + trend("N") + season("A")),
mnm = ETS(Cash ~ error("M") + trend("N") + season("M")), # best guess
ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
anm = ETS(Cash ~ error("A") + trend("N") + season("M")), # best guess
)
fc <- fit |> forecast(h = 31) # forecast for May 2010
fc
## # A fable: 124 x 4 [1D]
## # Key: .model [4]
## .model Date
## <chr> <date>
## 1 mna 2010-05-01
## 2 mna 2010-05-02
## 3 mna 2010-05-03
## 4 mna 2010-05-04
## 5 mna 2010-05-05
## 6 mna 2010-05-06
## 7 mna 2010-05-07
## 8 mna 2010-05-08
## 9 mna 2010-05-09
## 10 mna 2010-05-10
## # ℹ 114 more rows
## # ℹ 2 more variables: Cash <dist>, .mean <dbl>
fc |>
autoplot(atm_cash_withdrawl_final_atm1_fill, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
# Confirm which is the best fit:
fit <- atm_cash_withdrawl_final_atm1_fill |>
model(ETS(Cash))
fit
## # A mable: 1 x 1
## `ETS(Cash)`
## <model>
## 1 <ETS(A,N,A)>
Visually, it is very hard to tell the difference for most of them
except for ETS(A, N, M)
. ETS(A,N,A)
is the
chosen best but let’s confirm that:
atm_cash_withdrawl_final_atm1_fill |>
stretch_tsibble(.init = 10) |>
model(
mna = ETS(Cash ~ error("M") + trend("N") + season("A")),
mnm = ETS(Cash ~ error("M") + trend("N") + season("M")), # best guess
ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
snaive = SNAIVE(Cash)
) |>
forecast(h = 31) |>
accuracy(atm_cash_withdrawl_final_atm1_fill)
## Warning: 1 error encountered for mna
## [1] Not enough data to estimate this ETS model.
## Warning: 1 error encountered for mnm
## [1] Not enough data to estimate this ETS model.
## Warning: 1 error encountered for ana
## [1] Not enough data to estimate this ETS model.
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 31 observations are missing between 2010-05-01 and 2010-05-31
## # A tibble: 4 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ana Test -0.152 27.7 18.3 -126. 148. 1.04 1.00 0.207
## 2 mna Test -0.233 26.1 17.2 -119. 136. 0.982 0.945 0.200
## 3 mnm Test -1.02 29.7 18.5 -139. 157. 1.05 1.07 0.304
## 4 snaive Test -0.182 31.5 20.7 -111. 134. 1.18 1.14 0.134
The lowest RMSE
value is 26.09655 for
ETS(M,N,A)
so let’s go with that one.
Now let’s check for white noise:
# ETS M,N,A
fit_atm1 <- atm_cash_withdrawl_final_atm1_fill |>
model(
mna = ETS(Cash ~ error("M") + trend("N") + season("A"))
)
fc_atm1 <- fit_atm1 |> forecast(h = 31) # forecast for May 2010
fc_atm1
## # A fable: 31 x 4 [1D]
## # Key: .model [1]
## .model Date
## <chr> <date>
## 1 mna 2010-05-01
## 2 mna 2010-05-02
## 3 mna 2010-05-03
## 4 mna 2010-05-04
## 5 mna 2010-05-05
## 6 mna 2010-05-06
## 7 mna 2010-05-07
## 8 mna 2010-05-08
## 9 mna 2010-05-09
## 10 mna 2010-05-10
## # ℹ 21 more rows
## # ℹ 2 more variables: Cash <dist>, .mean <dbl>
fc_atm1 |>
autoplot(atm_cash_withdrawl_final_atm1_fill, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
Now let’s check the residuals for white noise. Since this has seasonality, we pick l = 2m. The seasonal period is 1 week, so l = 2 * 7 which is 14:
# Box-Pierce test
augment(fit_atm1) |> features(.innov, box_pierce, lag = 14)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 mna 33.8 0.00220
# Ljung-Box test
augment(fit_atm1) |> features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 mna 34.3 0.00184
For both methods, the p value is less than 0.05, so it’s not likely to occur by chance. These residuals are easily distinguished from white noise. We do not accept the white noise hypothesis.
Let’s double check ETS(A,N,A)
:
# ETS A,N,A
fit_atm1 <- atm_cash_withdrawl_final_atm1_fill |>
model(
ETS(Cash)
)
fc_atm1 <- fit_atm1 |> forecast(h = 31) # forecast for May 2010
fc_atm1
## # A fable: 31 x 4 [1D]
## # Key: .model [1]
## .model Date
## <chr> <date>
## 1 ETS(Cash) 2010-05-01
## 2 ETS(Cash) 2010-05-02
## 3 ETS(Cash) 2010-05-03
## 4 ETS(Cash) 2010-05-04
## 5 ETS(Cash) 2010-05-05
## 6 ETS(Cash) 2010-05-06
## 7 ETS(Cash) 2010-05-07
## 8 ETS(Cash) 2010-05-08
## 9 ETS(Cash) 2010-05-09
## 10 ETS(Cash) 2010-05-10
## # ℹ 21 more rows
## # ℹ 2 more variables: Cash <dist>, .mean <dbl>
fc_atm1 |>
autoplot(atm_cash_withdrawl_final_atm1_fill, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
# Box-Pierce test
augment(fit_atm1) |> features(.innov, box_pierce, lag = 14)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 ETS(Cash) 28.8 0.0110
# Ljung-Box test
augment(fit_atm1) |> features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ETS(Cash) 29.4 0.00912
This p value is also too small to accept the hypotheses so it’s now back to the drawing board.
Now let’s double check ARIMA in case it’s better. Check for differences:
# Check for differences
atm_cash_withdrawl_final_atm1_fill |>
features(Cash, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 1
atm_cash_withdrawl_final_atm1_fill |>
features(Cash, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.500 0.0416
So we take 1 difference with lag = 7 since it is a weekly seasonal period:
atm_cash_withdrawl_final_atm1_fill |> gg_tsdisplay(difference(Cash, 7), plot_type='partial', lag_max = 14)
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).
These graphs produce stationary behavior.
In the plots of the seasonally differenced data, there are spikes in the PACF and ACF at lags 6 and 7. Let’s see what ARIMA produces:
# ARIMA
fit_atm1 <- atm_cash_withdrawl_final_atm1_fill |>
model(
ARIMA(Cash)
)
fit_atm1
## # A mable: 1 x 1
## `ARIMA(Cash)`
## <model>
## 1 <ARIMA(0,0,1)(0,1,2)[7]>
fc_atm1 <- fit_atm1 |> forecast(h = 31) # forecast for May 2010
fc_atm1
## # A fable: 31 x 4 [1D]
## # Key: .model [1]
## .model Date
## <chr> <date>
## 1 ARIMA(Cash) 2010-05-01
## 2 ARIMA(Cash) 2010-05-02
## 3 ARIMA(Cash) 2010-05-03
## 4 ARIMA(Cash) 2010-05-04
## 5 ARIMA(Cash) 2010-05-05
## 6 ARIMA(Cash) 2010-05-06
## 7 ARIMA(Cash) 2010-05-07
## 8 ARIMA(Cash) 2010-05-08
## 9 ARIMA(Cash) 2010-05-09
## 10 ARIMA(Cash) 2010-05-10
## # ℹ 21 more rows
## # ℹ 2 more variables: Cash <dist>, .mean <dbl>
fc_atm1 |>
autoplot(atm_cash_withdrawl_final_atm1_fill, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
# Box-Pierce test
augment(fit_atm1) |> features(.innov, box_pierce, lag = 14)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Cash) 15.1 0.372
# Ljung-Box test
augment(fit_atm1) |> features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Cash) 15.5 0.346
These p values are greater than 0.05 so we can accept the white noise hypotheses.
The winning model is ARIMA(0,0,1)(0,1,2)
.
write_csv(fc_atm1, "~/Downloads/atm1-forecast.csv")
Let’s take a look at the original data:
atm_cash_withdrawl_final_atm2 <- atm_cash_withdrawl_final |>
filter(ATM == "ATM2")
# We now don't want `ATM` variable to be part of the key in tsibble, so let's convert
# back to data frame, and remove the `ATM` column
atm_cash_withdrawl_final_atm2_df <- atm_cash_withdrawl_final_atm2 |>
as.data.frame(atm_cash_withdrawl_final_atm2)
# Re-create tsibble without `ATM` variable
atm_cash_withdrawl_final_atm2 <- atm_cash_withdrawl_final_atm2_df |>
as_tsibble(index = Date)
head(atm_cash_withdrawl_final_atm2)
## # A tsibble: 6 x 3 [1D]
## Date ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM2 107
## 2 2009-05-02 ATM2 89
## 3 2009-05-03 ATM2 90
## 4 2009-05-04 ATM2 55
## 5 2009-05-05 ATM2 79
## 6 2009-05-06 ATM2 19
autoplot(atm_cash_withdrawl_final_atm2, Cash)
The autoplot shows a trend of slight decreasing. There is definitely
seasonality occurring as well. It appears the seasonality for
ATM2
is also weekly. It looks like variance does change a
bit as time goes on, so let’s see if a box cox transformation would
work.
atm_cash_withdrawl_final_atm2_lambda <- atm_cash_withdrawl_final_atm2 |>
features(Cash, features = guerrero) |>
pull(lambda_guerrero)
atm_cash_withdrawl_final_atm2 |>
autoplot(box_cox(Cash, atm_cash_withdrawl_final_atm2_lambda)) +
labs(y = "",
title = latex2exp::TeX(paste0(
"Transformed ATM2 with $\\lambda$ = ",
round(atm_cash_withdrawl_final_atm2_lambda,2))))
This doesn’t look like it did much to improve the interpretability. This makes sense because the lambda is somewhat close to 1 which would mean the original data. To keep things simple, let’s stick with the original data.
Let’s now look into seasonality:
# Check for all seasonality
gg_season(atm_cash_withdrawl_final_atm2, Cash) +
labs(title="ATM2")
atm_cash_withdrawl_final_atm2 |>
gg_season(Cash, period = "week") +
labs(title="ATM2 - Weekly")
atm_cash_withdrawl_final_atm2 |>
gg_season(Cash, period = "week", polar = TRUE) +
labs(title="ATM2 - Weekly")
atm_cash_withdrawl_final_atm2 |>
gg_season(Cash, period = "month") +
labs(title="ATM2 - Monthly")
atm_cash_withdrawl_final_atm2 |>
gg_season(Cash, period = "month", polar = TRUE) +
labs(title="ATM2 - Monthly")
atm_cash_withdrawl_final_atm2 |>
gg_season(Cash, period = "year") +
labs(title="ATM2 - Yearly")
Looking at the first seasonality graph, it appears the seasonality period is a week. We don’t have that much data, so we can’t say if there’s annual seasonality.
The weekly seasonality plot during recent weeks show that Cash withdrawl increases until Tuesday, then drops down with a trough at Thursday, increases Friday, decreases a bit Saturday then increases a bit again Sunday.
From what looks like W10 - W20, Cash withdrawl starts lower on Monday and decreases Tuesday then increases Wednesday, slightly decreases Thursday and it’s hard to tell, but it looks like it increases to Sunday.
The monthly seasonal plot doesn’t give us much insight about the data.
Let’s take a look what these dates could be by looking at a seasonal subseries plot:
gg_subseries(atm_cash_withdrawl_final_atm2, Cash, period = "week") +
labs(title = "Daily ATM2 Cash")
Looks like around January there was a dip for Monday and Tuesday. Wednesday, Thursday and Friday had an increase in cash.
Now let’s look at the ACF:
atm_cash_withdrawl_final_atm2 |>
ACF(Cash, lag_max = 42) |>
autoplot()
Looks like Tuesday and Fridays have strong negative correlation, and Sundays have strong positive correlation.
Before we pick a model, let’s determine how we’ll handle the missing data.
# Find dates with NA values
atm_cash_withdrawl_final_atm2 |>
filter(is.na(Cash)) |>
mutate(day_of_the_week = weekdays(Date))
## # A tsibble: 2 x 4 [1D]
## Date ATM Cash day_of_the_week
## <date> <chr> <dbl> <chr>
## 1 2009-06-18 ATM2 NA Thursday
## 2 2009-06-24 ATM2 NA Wednesday
2009-06-13
, 2009-06-24
are the only dates
with missing values. Since we don’t know why this data is missing, we
probably shouldn’t remove the data entirely, especially since we’re not
dealing with a lot of data in the first place.
Since these are individual days, we could probably safely estimate the missing data:
atm_cash_withdrawl_final_atm2_miss <- atm_cash_withdrawl_final_atm2 |>
# Replace with missing values
fill_gaps()
atm_cash_withdrawl_final_atm2_fill <- atm_cash_withdrawl_final_atm2_miss |>
# Fit ARIMA model to the data containing missing values
model(ARIMA(Cash)) |>
# Estimate Trips for all periods
interpolate(atm_cash_withdrawl_final_atm2_miss)
atm_cash_withdrawl_final_atm2_fill |>
autoplot(Cash) +
autolayer(atm_cash_withdrawl_final_atm2_fill |> filter_index("2009-06-12"~"2009-06-14"),
Cash, colour="#D55E00") +
autolayer(atm_cash_withdrawl_final_atm2_fill |> filter_index("2009-06-23"~"2009-06-25"),
Cash, colour="#D55E00")
This graph looks very reasonable to me. This will now allow us to test some models.
Now let’s use an STL()
:
dcmp <- atm_cash_withdrawl_final_atm2_fill |>
model(stl = STL(Cash))
components(dcmp) |> autoplot()
# Confirm range
range(atm_cash_withdrawl_final_atm2$Date, na.rm = TRUE)
## [1] "2009-05-01" "2010-04-30"
# Experiment with all simple forecast models
atm_cash_withdrawl_final_atm2_fit <- atm_cash_withdrawl_final_atm2_fill |>
filter(!is.na(Cash)) |>
model(
Seasonal_naive = SNAIVE(Cash),
Naive = NAIVE(Cash),
Drift = RW(Cash ~ drift()),
Mean = MEAN(Cash)
)
# Forecast May 2010
atm_cash_withdrawl_final_atm2_fc <- atm_cash_withdrawl_final_atm2_fit |>
forecast(h = 31)
atm_cash_withdrawl_final_atm2_fc |>
autoplot(atm_cash_withdrawl_final_atm2, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
The only viable forecast is the seasonal naive, but even that does look great.
Let’s look at ETS()
:
# ETS
fit <- atm_cash_withdrawl_final_atm2_fill |>
model(
mna = ETS(Cash ~ error("M") + trend("N") + season("A")),
mnm = ETS(Cash ~ error("M") + trend("N") + season("M")), # best guess
ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
anm = ETS(Cash ~ error("A") + trend("N") + season("M")), # best guess
)
fc <- fit |> forecast(h = 31) # forecast for May 2010
fc
## # A fable: 124 x 4 [1D]
## # Key: .model [4]
## .model Date
## <chr> <date>
## 1 mna 2010-05-01
## 2 mna 2010-05-02
## 3 mna 2010-05-03
## 4 mna 2010-05-04
## 5 mna 2010-05-05
## 6 mna 2010-05-06
## 7 mna 2010-05-07
## 8 mna 2010-05-08
## 9 mna 2010-05-09
## 10 mna 2010-05-10
## # ℹ 114 more rows
## # ℹ 2 more variables: Cash <dist>, .mean <dbl>
fc |>
autoplot(atm_cash_withdrawl_final_atm2_fill, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
# Confirm which is the best fit:
fit <- atm_cash_withdrawl_final_atm2_fill |>
model(ETS(Cash))
fit
## # A mable: 1 x 1
## `ETS(Cash)`
## <model>
## 1 <ETS(A,N,A)>
Visually, it is very hard to tell the difference for most of them
except for ETS(M, N, M)
. ETS(A,N,A)
is the
chosen best but let’s confirm that:
atm_cash_withdrawl_final_atm2_fill |>
stretch_tsibble(.init = 10) |>
model(
mna = ETS(Cash ~ error("M") + trend("N") + season("A")),
mnm = ETS(Cash ~ error("M") + trend("N") + season("M")),
ana = ETS(Cash ~ error("A") + trend("N") + season("A")),
snaive = SNAIVE(Cash)
) |>
forecast(h = 31) |>
accuracy(atm_cash_withdrawl_final_atm2_fill)
## Warning: 1 error encountered for mna
## [1] Not enough data to estimate this ETS model.
## Warning: 1 error encountered for mnm
## [1] Not enough data to estimate this ETS model.
## Warning: 1 error encountered for ana
## [1] Not enough data to estimate this ETS model.
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 31 observations are missing between 2010-05-01 and 2010-05-31
## # A tibble: 4 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ana Test -3.03 28.9 20.4 -Inf Inf 1.00 0.976 -0.00288
## 2 mna Test -4.04 30.9 21.9 NaN Inf 1.08 1.04 0.0338
## 3 mnm Test -3.85 36.7 24.6 -Inf Inf 1.21 1.24 0.170
## 4 snaive Test -0.876 32.0 21.9 -Inf Inf 1.08 1.08 -0.0132
The lowest RMSE
value is 28.93770 for
ETS(A,N,A)
which matches the chosen model. Since this has
the lowest RMSE
, let’s go with this model.
Now let’s check for white noise:
# ETS A,N,A
fit_atm2 <- atm_cash_withdrawl_final_atm2_fill |>
model(
mna = ETS(Cash ~ error("A") + trend("N") + season("A"))
)
fc_atm2 <- fit_atm2 |> forecast(h = 31) # forecast for May 2010
fc_atm2
## # A fable: 31 x 4 [1D]
## # Key: .model [1]
## .model Date
## <chr> <date>
## 1 mna 2010-05-01
## 2 mna 2010-05-02
## 3 mna 2010-05-03
## 4 mna 2010-05-04
## 5 mna 2010-05-05
## 6 mna 2010-05-06
## 7 mna 2010-05-07
## 8 mna 2010-05-08
## 9 mna 2010-05-09
## 10 mna 2010-05-10
## # ℹ 21 more rows
## # ℹ 2 more variables: Cash <dist>, .mean <dbl>
fc_atm2 |>
autoplot(atm_cash_withdrawl_final_atm2_fill, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
Now let’s check the residuals for white noise. Since this has seasonality, we pick l = 2m. The seasonal period is 1 week so 7 days, so l = 2 * 7 which is 14:
# Box-Pierce test
augment(fit_atm2) |> features(.innov, box_pierce, lag = 14)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 mna 35.0 0.00147
# Ljung-Box test
augment(fit_atm2) |> features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 mna 35.9 0.00110
For both methods, the p value is smaller than 0.05, so it’s not likely to occur by chance. These residuals are easily distinguished from white noise. We reject the white noise hypothesis.
Now let’s check ARIMA:
# ARIMA
fit_atm2 <- atm_cash_withdrawl_final_atm2_fill |>
model(
ARIMA(Cash)
)
fit_atm2
## # A mable: 1 x 1
## `ARIMA(Cash)`
## <model>
## 1 <ARIMA(2,0,2)(0,1,1)[7]>
fc_atm2 <- fit_atm1 |> forecast(h = 31) # forecast for May 2010
fc_atm2
## # A fable: 31 x 4 [1D]
## # Key: .model [1]
## .model Date
## <chr> <date>
## 1 ARIMA(Cash) 2010-05-01
## 2 ARIMA(Cash) 2010-05-02
## 3 ARIMA(Cash) 2010-05-03
## 4 ARIMA(Cash) 2010-05-04
## 5 ARIMA(Cash) 2010-05-05
## 6 ARIMA(Cash) 2010-05-06
## 7 ARIMA(Cash) 2010-05-07
## 8 ARIMA(Cash) 2010-05-08
## 9 ARIMA(Cash) 2010-05-09
## 10 ARIMA(Cash) 2010-05-10
## # ℹ 21 more rows
## # ℹ 2 more variables: Cash <dist>, .mean <dbl>
fc_atm2 |>
autoplot(atm_cash_withdrawl_final_atm2_fill, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
# Box-Pierce test
augment(fit_atm2) |> features(.innov, box_pierce, lag = 14)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Cash) 8.84 0.841
# Ljung-Box test
augment(fit_atm2) |> features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Cash) 9.06 0.827
These p values are greater than 0.05 so we can accept the white noise hypothesis.
The winning model is ARIMA(2,0,2)(0,1,1)
.
write_csv(fc_atm2, "~/Downloads/atm2-forecast.csv")
Let’s look at the original data:
atm_cash_withdrawl_final_atm3 <- atm_cash_withdrawl_final |>
filter(ATM == "ATM3")
# We now don't want `ATM` variable to be part of the key in tsibble, so let's convert
# back to data frame, and remove the `ATM` column
atm_cash_withdrawl_final_atm3_df <- atm_cash_withdrawl_final_atm3 |>
as.data.frame(atm_cash_withdrawl_final_atm3)
# Re-create tsibble without `ATM` variable
atm_cash_withdrawl_final_atm3 <- atm_cash_withdrawl_final_atm3_df |>
as_tsibble(index = Date)
head(atm_cash_withdrawl_final_atm3)
## # A tsibble: 6 x 3 [1D]
## Date ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM3 0
## 2 2009-05-02 ATM3 0
## 3 2009-05-03 ATM3 0
## 4 2009-05-04 ATM3 0
## 5 2009-05-05 ATM3 0
## 6 2009-05-06 ATM3 0
autoplot(atm_cash_withdrawl_final_atm3, Cash)
The autoplot shows that the most of the data is missing, which I’m assuming is due to the ATM not being active. Let’s take a look at the non-missing data:
# Confirm range
atm_cash_withdrawl_final_atm3 <- atm_cash_withdrawl_final_atm3 |>
filter(Cash > 0)
range(atm_cash_withdrawl_final_atm3$Date, na.rm = TRUE)
## [1] "2010-04-28" "2010-04-30"
autoplot(atm_cash_withdrawl_final_atm3, Cash)
We only have 3 days of data here, so it is a very short time series. We do not have much to go off of, so it is hard to tell anything about trend or seasonality.
Let’s start off with simple models and ARIMA since we do not have much data:
# Experiment with all simple forecast models
atm_cash_withdrawl_final_atm3_fit <- atm_cash_withdrawl_final_atm3 |>
model(
Seasonal_naive = SNAIVE(Cash),
Naive = NAIVE(Cash),
Drift = RW(Cash ~ drift()),
Mean = MEAN(Cash),
ARIMA(Cash),
ETS(Cash)
)
## Warning in sqrt(diag(best$var.coef)): NaNs produced
## Warning: 1 error encountered for ETS(Cash)
## [1] Not enough data to estimate this ETS model.
# Forecast May 2010
atm_cash_withdrawl_final_atm3_fc <- atm_cash_withdrawl_final_atm3_fit |>
forecast(h = 31)
atm_cash_withdrawl_final_atm3_fc |>
autoplot(atm_cash_withdrawl_final_atm3, level = NULL)
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_line()`).
The only one out of the question is the seasonal naive.
Let’s compare the models:
atm_cash_withdrawl_final_atm3 |>
stretch_tsibble(.init = 2) |>
model(
Naive = NAIVE(Cash),
Drift = RW(Cash ~ drift()),
Mean = MEAN(Cash),
ARIMA(Cash),
ETS(Cash)
) |>
forecast(h = 31) |>
accuracy(atm_cash_withdrawl_final_atm3)
## Warning in sqrt(diag(best$var.coef)): NaNs produced
## Warning: 2 errors (1 unique) encountered for ETS(Cash)
## [2] Not enough data to estimate this ETS model.
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 31 observations are missing between 2010-05-01 and 2010-05-31
## # A tibble: 5 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(Cash) Test 17.0 17.0 17.0 20.0 20.0 NaN NaN NA
## 2 Drift Test 17 17 17 20 20 NaN NaN NA
## 3 ETS(Cash) Test NaN NaN NaN NaN NaN NaN NaN NA
## 4 Mean Test -4 4 4 -4.71 4.71 NaN NaN NA
## 5 Naive Test 3 3 3 3.53 3.53 NaN NaN NA
Naive has the lowest RMSE value, so let’s choose that one.
Now let’s check for white noise:
fit_atm3 <- atm_cash_withdrawl_final_atm3 |>
model(
Naive = NAIVE(Cash)
)
fc_atm3 <- fit_atm3 |> forecast(h = 31) # forecast for May 2010
fc_atm3
## # A fable: 31 x 4 [1D]
## # Key: .model [1]
## .model Date
## <chr> <date>
## 1 Naive 2010-05-01
## 2 Naive 2010-05-02
## 3 Naive 2010-05-03
## 4 Naive 2010-05-04
## 5 Naive 2010-05-05
## 6 Naive 2010-05-06
## 7 Naive 2010-05-07
## 8 Naive 2010-05-08
## 9 Naive 2010-05-09
## 10 Naive 2010-05-10
## # ℹ 21 more rows
## # ℹ 2 more variables: Cash <dist>, .mean <dbl>
fc_atm3 |>
autoplot(atm_cash_withdrawl_final_atm3, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
# Box-Pierce test
augment(fit_atm3) |> features(.innov, box_pierce)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 Naive 0.5 0.480
# Ljung-Box test
augment(fit_atm3) |> features(.innov, ljung_box)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Naive 2 0.157
We don’t have enough data for lag=14, so without adding a lag, the p values are greater than 0.05 so we can accept the white noise hypothesis.
The final model is NAIVE(Cash)
.
write_csv(fc_atm3, "~/Downloads/atm3-forecast.csv")
Let’s take a look at the original data:
atm_cash_withdrawl_final_atm4 <- atm_cash_withdrawl_final |>
filter(ATM == "ATM4")
# We now don't want `ATM` variable to be part of the key in tsibble, so let's convert
# back to data frame, and remove the `ATM` column
atm_cash_withdrawl_final_atm4_df <- atm_cash_withdrawl_final_atm4 |>
as.data.frame(atm_cash_withdrawl_final_atm4)
# Re-create tsibble without `ATM` variable
atm_cash_withdrawl_final_atm4 <- atm_cash_withdrawl_final_atm4_df |>
as_tsibble(index = Date)
head(atm_cash_withdrawl_final_atm4)
## # A tsibble: 6 x 3 [1D]
## Date ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM4 777
## 2 2009-05-02 ATM4 524
## 3 2009-05-03 ATM4 793
## 4 2009-05-04 ATM4 908
## 5 2009-05-05 ATM4 53
## 6 2009-05-06 ATM4 52
autoplot(atm_cash_withdrawl_final_atm4, Cash)
The autoplot does not show a major trend of increasing or decreasing. There does seem like variance does change a bit, and there is a major outlier between Jan 2010 and Apr 2010.
To make the data more stable, let’s try a box cox transformation. A box cox transformation should help with the outlier as well:
atm_cash_withdrawl_final_atm4_lambda <- atm_cash_withdrawl_final_atm4 |>
features(Cash, features = guerrero) |>
pull(lambda_guerrero)
atm_cash_withdrawl_final_atm4 |>
autoplot(box_cox(Cash, atm_cash_withdrawl_final_atm4_lambda)) +
labs(y = "",
title = latex2exp::TeX(paste0(
"Transformed ATM4 with $\\lambda$ = ",
round(atm_cash_withdrawl_final_atm4_lambda,2))))
The data is clearer and more stable now, and it greatly helps the outlier. Since lambda is close to 0, let’s choose lambda=0 so we choose a simple log transformation that can easily be explainable.
atm_cash_withdrawl_final_atm4 |>
autoplot(box_cox(Cash, 0)) +
labs(y = "",
title = latex2exp::TeX(paste0(
"Transformed ATM4 with $\\lambda$ = ",
round(0))))
Now let’s check for seasonality on the transformed data:
# Check for all seasonality
gg_season(atm_cash_withdrawl_final_atm4, box_cox(Cash, 0)) +
labs(title="ATM4")
gg_season(atm_cash_withdrawl_final_atm4, box_cox(Cash, 0), period = "week") +
labs(title="ATM4 - Weekly")
gg_season(atm_cash_withdrawl_final_atm4, box_cox(Cash, 0), period = "month") +
labs(title="ATM4 - Monthly")
gg_season(atm_cash_withdrawl_final_atm4, box_cox(Cash, 0), period = "year") +
labs(title="ATM4 - Yearly")
Looking at the first seasonality graph, it appears the seasonality period is a week. The weekly seasonality plot shows a few different patterns indicating again that something happened during a particular month(s). It looks like there is usually either a peak on Tuesday or Wednesay (mostly Wednesday), frops Thursday, increases again Friday then drops Saturday, then increases Sunday.
From what looks like W10 - W20, Cash withdrawl decreases Tuesday then increases Wednesday, slightly decreases Thursday and it’s hard to tell, but it looks like it increases to Sunday.
The monthly seasonal plot doesn’t give us much insight about the data.
Let’s take a look what these dates could be by looking at a seasonal subseries plot:
atm_cash_withdrawl_final_atm4 |> gg_subseries(box_cox(Cash, 0), period = "week") +
labs(title = "Daily ATM4 Cash - Transformed")
So it looks like generally, cash decreases Tuesday, and this is due
to what happens in January. Tuesday normally would stay the same, but
something happened in January to cause the cash to decrease by a lot.
Wednesday is slight below Monday’s numbers. It then drops again
Thursday. What’s interesting is that during the same time Tuesday
decreased, Thursday increased which is similar to ATM1
.
Friday, Saturday, and Sunday stays the same for the most part.
Let’s look at ACF now:
atm_cash_withdrawl_final_atm4 |>
ACF(box_cox(Cash, 0), lag_max = 52) |>
autoplot()
The ACF shows every 7 lags (week), there is strong positive correlation. Most days of the week show negative correlation.
Since this has complex weekly seasonality, let’s choose to do an STL
decomposition. Since STL()
requires no missing periods, I
will choose to start this at July 2009:
dcmp <- atm_cash_withdrawl_final_atm4 |>
model(stl = STL(box_cox(Cash, 0)))
components(dcmp) |> autoplot()
gg_lag(atm_cash_withdrawl_final_atm4, Cash, geom = "point", lag=1:7)
gg_lag(atm_cash_withdrawl_final_atm4, Cash, geom = "point", lag=1:52)
# Confirm range
range(atm_cash_withdrawl_final_atm4$Date, na.rm = TRUE)
## [1] "2009-05-01" "2010-04-30"
# Experiment with all simple forecast models
atm_cash_withdrawl_final_atm4_fit <- atm_cash_withdrawl_final_atm4 |>
model(
Seasonal_naive = SNAIVE(box_cox(Cash, 0)),
Naive = NAIVE(box_cox(Cash, 0)),
Drift = RW(box_cox(Cash, 0) ~ drift()),
Mean = MEAN(box_cox(Cash, 0))
)
# Forecast May 2010
atm_cash_withdrawl_final_atm4_fc <- atm_cash_withdrawl_final_atm4_fit |>
forecast(h = 31)
atm_cash_withdrawl_final_atm4_fc |>
autoplot(atm_cash_withdrawl_final_atm4, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
The only viable forecast is the seasonal naive. Let’s look at
ETS()
:
# ETS
fit <- atm_cash_withdrawl_final_atm4 |>
model(
mna = ETS(box_cox(Cash, 0) ~ error("M") + trend("N") + season("A")),
mnm = ETS(box_cox(Cash, 0) ~ error("M") + trend("N") + season("M")),
ana = ETS(box_cox(Cash, 0) ~ error("A") + trend("N") + season("A")),
anm = ETS(box_cox(Cash, 0) ~ error("A") + trend("N") + season("M")),
)
fc <- fit |> forecast(h = 31)
fc |>
autoplot(atm_cash_withdrawl_final_atm4, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
# Confirm which is the best fit:
fit <- atm_cash_withdrawl_final_atm4 |>
model(ETS(box_cox(Cash, 0)))
fit
## # A mable: 1 x 1
## `ETS(box_cox(Cash, 0))`
## <model>
## 1 <ETS(M,N,A)>
atm_cash_withdrawl_final_atm4 |>
stretch_tsibble(.init = 10) |>
model(
mna = ETS(box_cox(Cash, 0) ~ error("M") + trend("N") + season("A")),
mnm = ETS(box_cox(Cash, 0) ~ error("M") + trend("N") + season("M")), # best guess
ana = ETS(box_cox(Cash, 0) ~ error("A") + trend("N") + season("A")),
snaive = SNAIVE(box_cox(Cash, 0))
) |>
forecast(h = 31) |>
accuracy(atm_cash_withdrawl_final_atm4)
## Warning: 1 error encountered for mna
## [1] Not enough data to estimate this ETS model.
## Warning: 1 error encountered for mnm
## [1] Not enough data to estimate this ETS model.
## Warning: 1 error encountered for ana
## [1] Not enough data to estimate this ETS model.
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 31 observations are missing between 2010-05-01 and 2010-05-31
## # A tibble: 4 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ana Test -46.4 682. 330. -594. 624. 0.822 0.761 0.0192
## 2 mna Test -80.0 702. 342. -632. 661. 0.851 0.783 0.0357
## 3 mnm Test -83.6 762. 349. -640. 669. 0.868 0.850 0.152
## 4 snaive Test -1773. 3932. 1928. -2794. 2811. 4.80 4.39 0.0429
The lowest RMSE value is ETS(M,N,A)
.
Now let’s check for white noise:
# ETS M,N,A
fit_atm4 <- atm_cash_withdrawl_final_atm4 |>
model(
mna = ETS(box_cox(Cash, 0) ~ error("M") + trend("N") + season("A"))
)
fc_atm4 <- fit_atm4 |> forecast(h = 31) # forecast for May 2010
fc_atm4
## # A fable: 31 x 4 [1D]
## # Key: .model [1]
## .model Date Cash .mean
## <chr> <date> <dist> <dbl>
## 1 mna 2010-05-01 t(N(5.2, 1.5)) 318.
## 2 mna 2010-05-02 t(N(5.7, 1.7)) 544.
## 3 mna 2010-05-03 t(N(5.8, 1.8)) 647.
## 4 mna 2010-05-04 t(N(3.5, 0.65)) 43.0
## 5 mna 2010-05-05 t(N(6, 1.9)) 800.
## 6 mna 2010-05-06 t(N(5.5, 1.6)) 427.
## 7 mna 2010-05-07 t(N(6.1, 2)) 904.
## 8 mna 2010-05-08 t(N(5.2, 1.5)) 322.
## 9 mna 2010-05-09 t(N(5.7, 1.8)) 551.
## 10 mna 2010-05-10 t(N(5.8, 1.8)) 650.
## # ℹ 21 more rows
fc_atm4 |>
autoplot(atm_cash_withdrawl_final_atm4, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
Now let’s check the residuals for white noise. Since this has seasonality, we pick l = 2m. The seasonal period is 1 week, so l = 2 * 7 which is 14:
# Box-Pierce test
augment(fit_atm4) |> features(.innov, box_pierce, lag = 14)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 mna 16.2 0.301
# Ljung-Box test
augment(fit_atm4) |> features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 mna 16.6 0.279
Both p values so we can accept the white noise hypothesis!
ETS(M,N,A)
is a model we can use.
Now let’s double check ARIMA in case it’s better:
atm_cash_withdrawl_final_atm4 |>
stretch_tsibble(.init = 10) |>
model(
ETS(box_cox(Cash, 0)),
ARIMA(box_cox(Cash, 0))
) |>
forecast(h = 31) |>
accuracy(atm_cash_withdrawl_final_atm4) |>
select(.model, RMSE:MAPE)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 31 observations are missing between 2010-05-01 and 2010-05-31
## # A tibble: 2 × 5
## .model RMSE MAE MPE MAPE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(box_cox(Cash, 0)) 699. 344. -625. 652.
## 2 ETS(box_cox(Cash, 0)) 1039. 371. -689. 719.
Since ARIMA showed a lower RMSE value, let’s choose that model for the forecast.
fit_atm4 <- atm_cash_withdrawl_final_atm4 |>
model(
ARIMA(box_cox(Cash, 0))
)
fit_atm4
## # A mable: 1 x 1
## `ARIMA(box_cox(Cash, 0))`
## <model>
## 1 <ARIMA(0,0,0)(2,0,0)[7] w/ mean>
fc_atm4 <- fit_atm4 |> forecast(h = 31) # forecast for May 2010
fc_atm4
## # A fable: 31 x 4 [1D]
## # Key: .model [1]
## .model Date Cash .mean
## <chr> <date> <dist> <dbl>
## 1 ARIMA(box_cox(Cash, 0)) 2010-05-01 t(N(4.6, 1.7)) 187.
## 2 ARIMA(box_cox(Cash, 0)) 2010-05-02 t(N(5.7, 1.7)) 530.
## 3 ARIMA(box_cox(Cash, 0)) 2010-05-03 t(N(5.8, 1.7)) 634.
## 4 ARIMA(box_cox(Cash, 0)) 2010-05-04 t(N(4.4, 1.7)) 147.
## 5 ARIMA(box_cox(Cash, 0)) 2010-05-05 t(N(5.8, 1.7)) 582.
## 6 ARIMA(box_cox(Cash, 0)) 2010-05-06 t(N(5.2, 1.7)) 323.
## 7 ARIMA(box_cox(Cash, 0)) 2010-05-07 t(N(5.6, 1.7)) 499.
## 8 ARIMA(box_cox(Cash, 0)) 2010-05-08 t(N(4.5, 1.8)) 176.
## 9 ARIMA(box_cox(Cash, 0)) 2010-05-09 t(N(5.7, 1.8)) 569.
## 10 ARIMA(box_cox(Cash, 0)) 2010-05-10 t(N(5.7, 1.8)) 562.
## # ℹ 21 more rows
fc_atm4 |>
autoplot(atm_cash_withdrawl_final_atm4, level = NULL) +
guides(colour = guide_legend(title = "Forecast"))
# Box-Pierce test
augment(fit_atm4) |> features(.innov, box_pierce, lag = 14)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(box_cox(Cash, 0)) 15.7 0.333
# Ljung-Box test
augment(fit_atm4) |> features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(box_cox(Cash, 0)) 16.1 0.306
Both p values are greater than 0.05, so let’s pick use
ARIMA(0,0,0)(2,0,0)[7] w/ mean
.
write_csv(fc_atm4, "~/Downloads/atm4-forecast.csv")