library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.4 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(AppliedPredictiveModeling)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ purrr 1.0.4 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readxl)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
atmdf <- read_excel("C:/Users/ddebo/Downloads/ATM624Data.xlsx")
summary(atmdf)
## DATE ATM Cash
## Min. :39934 Length:1474 Min. : 0.0
## 1st Qu.:40026 Class :character 1st Qu.: 0.5
## Median :40118 Mode :character Median : 73.0
## Mean :40118 Mean : 155.6
## 3rd Qu.:40210 3rd Qu.: 114.0
## Max. :40312 Max. :10919.8
## NA's :19
The file contains three columns: two numbers and one character. Since the first column is meant to represent date and not number of days since 1900, it should be converted. The 19 NA values in the cash column also need to be investigated.
atmdf <- atmdf |>
mutate(DATE = as.Date(DATE, origin = "1899-12-30")) |>
subset(!is.na(ATM))
In fact, the rows with NA values for Cash may have been a convenient place for me to leave my prediction, so they can be filtered out for now. To analyze this data as a time series in R, it should be in a tsibble format, so we can autoplot
atmts <- atmdf |>
as_tsibble(index = DATE, key = ATM) |>
fill_gaps(.full = TRUE)
autoplot(atmts) +
facet_wrap(~ATM)
## Plot variable not specified, automatically selected `.vars = Cash`
Looking at the autoplots, there may be an outlier or typographical error in the data for ATM4. Looking at the value, it is not obvious which, so another possibility is to replace that value with the median. Either way, I need to detach the scaling of the y axis from ATM4.
autoplot(atmts) +
facet_wrap(~ATM, scales = "free_y") +
labs(y="Hundreds of Dollars")
## Plot variable not specified, automatically selected `.vars = Cash`
This is closer to what I expected to see. For convenience in fitting each atm with its own model, I will pivot the data to make each atm its own column rather than values within one column.
atmpiv<- atmts |>
pivot_wider(names_from = ATM, values_from = Cash)
summary(atmpiv)
## DATE ATM1 ATM2 ATM3
## Min. :2009-05-01 Min. : 1.00 Min. : 0.00 Min. : 0.0000
## 1st Qu.:2009-07-31 1st Qu.: 73.00 1st Qu.: 25.50 1st Qu.: 0.0000
## Median :2009-10-30 Median : 91.00 Median : 67.00 Median : 0.0000
## Mean :2009-10-30 Mean : 83.89 Mean : 62.58 Mean : 0.7206
## 3rd Qu.:2010-01-29 3rd Qu.:108.00 3rd Qu.: 93.00 3rd Qu.: 0.0000
## Max. :2010-04-30 Max. :180.00 Max. :147.00 Max. :96.0000
## NA's :3 NA's :2
## ATM4
## Min. : 1.563
## 1st Qu.: 124.334
## Median : 403.839
## Mean : 474.043
## 3rd Qu.: 704.507
## Max. :10919.762
##
atmpiv$ATM1 <- atmpiv$ATM1 %>% replace(is.na(.), median(atmpiv$ATM1, na.rm = TRUE))
atmpiv$ATM2 <- atmpiv$ATM2 %>% replace(is.na(.), median(atmpiv$ATM2, na.rm = TRUE))
atmpiv$ATM4[atmpiv$ATM4==max(atmpiv$ATM4)] <- median(atmpiv$ATM4, na.rm = TRUE)
We can now split this pivoted data into separate time series.
ts_ATM1 <- atmpiv |>
select(c(DATE, ATM1))
ts_ATM2 <- atmpiv |>
select(c(DATE, ATM2))
ts_ATM3 <- atmpiv |>
select(c(DATE, ATM3))
ts_ATM4 <- atmpiv |>
select(c(DATE, ATM4))
ts_ATM1 |>
gg_tsdisplay()
## Plot variable not specified, automatically selected `y = ATM1`
There seems to be weekly seasonality looking at the strong peaks at 7, 14, and 21 on the ACF for ATM1, so indeed, this is not stationary data. So for the missing data, the best plan seems to be to use the value from seven days prior. But since I have not been able to come up with a way to code this, I’m using the median instead.
fit <- ts_ATM1 |>
model(ETS(ATM1))
report(fit)
## Series: ATM1
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.01651456
## gamma = 0.3270894
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 78.33245 -60.17048 -9.153121 17.72025 6.50369 15.97554 16.44455 12.67958
##
## sigma^2: 579.039
##
## AIC AICc BIC
## 4486.250 4486.871 4525.249
The program recommends an A,N,A model, additive seasonality and error with no trend, which makes sense given the appearance of the data. Now to plot the predictions made by this model for the following month.
fc <- fit |>
forecast(h = 31)
fc |>
autoplot(ts_ATM1) +
geom_line(aes(y = .fitted), col="#D55E00",
data = augment(fit)) +
labs(y="Hundreds of Dollars", title="Predicted ATM Usage") +
guides(colour = "none")
fit5 <- forecast::auto.arima(ts_ATM1)
summary(fit5)
## Series: ts_ATM1
## ARIMA(0,0,1)(0,1,2)[7]
##
## Coefficients:
## ma1 sma1 sma2
## 0.1695 -0.5867 -0.0989
## s.e. 0.0553 0.0508 0.0497
##
## sigma^2 = 559.8: log likelihood = -1641.12
## AIC=3290.23 AICc=3290.34 BIC=3305.75
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.07363941 23.3332 14.60281 -102.7359 117.6027 0.8247052
## ACF1
## Training set -0.008134059
Because the original data was not stationary, a regular ARIMA model would not work and the program returned a Seasonal ARIMA model. This may end up being best, but perhaps a Box-Cox transformation will make it stationary and improve the other models?
lambda1 <- ts_ATM1 |>
features(ATM1, features = guerrero) |>
pull(lambda_guerrero)
ATM1_models <-
ts_ATM1 |>
model("ANA" = ETS(ATM1 ~ error("A") + trend("N") + season("A")),
"Box-Cox" = ETS(box_cox(ATM1,lambda1) ~ error("A") + trend("N") + season("A")),
"ARIMA" = ARIMA(ATM1),
"ARIMA w/ Box-Cox" = ARIMA(ATM1, seasonal = TRUE, lambda = lambda1)
)
## Warning: 1 error encountered for ARIMA w/ Box-Cox
## [1] Could not find an appropriate ARIMA model.
## This is likely because automatic selection does not select models with characteristic roots that may be numerically unstable.
## For more details, refer to https://otexts.com/fpp3/arima-r.html#plotting-the-characteristic-roots
ATM1_forecast <-
ATM1_models |>
forecast(h = 31)
ATM1_forecast|>
autoplot(ts_ATM1, level=NULL) +
labs(y="Hundreds of Dollars")
## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`geom_line()`).
The Box-Cox transformation definitely improved our prediction. Now to compare to ARIMA models once we have accounted for the seasonality.
report(ATM1_models)
## Warning in report.mdl_df(ATM1_models): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 3 × 11
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ANA 579. -2233. 4486. 4487. 4525. 565. 568. 15.0 <NULL> <NULL>
## 2 Box-C… 1.61 -1159. 2339. 2339. 2378. 1.57 1.58 0.684 <NULL> <NULL>
## 3 ARIMA 560. -1641. 3290. 3290. 3306. NA NA NA <cpl> <cpl>
The lowest AICc is found in the Box-Cox transformed data using the ETS (A,N,A) model, so this would generate the strongest forecasts. I wondered why the ARIMA on the transformed data was able to produce graphed data, but was not included in the report.
fitt <- ts_ATM1 |>
model(ARIMA(ATM1, seasonal = TRUE, lambda = lambda1))
## Warning: 1 error encountered for ARIMA(ATM1, seasonal = TRUE, lambda = lambda1)
## [1] Could not find an appropriate ARIMA model.
## This is likely because automatic selection does not select models with characteristic roots that may be numerically unstable.
## For more details, refer to https://otexts.com/fpp3/arima-r.html#plotting-the-characteristic-roots
report(fitt)
## Series: ATM1
## Model: NULL model
## NULL model
The error model that we get in viewing this data says that the data has potentially unstable roots, so it is fair to remove this model from consideration and stick with the Box-Cox transformed ETS.
ts_ATM2 |>
gg_tsdisplay()
## Plot variable not specified, automatically selected `y = ATM2`
ATM2 has a very similar ACF plot to ATM1, showing a weekly cycle, though no sign of a trend. It is likely to be recommended the same ETS model as ATM1. We should account for the seasonality and
fit2 <- ts_ATM2 |>
model(ETS(ATM2))
report(fit2)
## Series: ATM2
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.000100055
## gamma = 0.3627371
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 72.97637 -46.64692 -22.2591 4.814576 0.2230553 20.58732 14.88622 28.39485
##
## sigma^2: 644.2689
##
## AIC AICc BIC
## 4525.212 4525.834 4564.211
As predicted, an A,N,A model was chosen.
fc2 <- fit2 |>
forecast(h = 31)
fc2 |>
autoplot(ts_ATM2) +
geom_line(aes(y = .fitted), col="#D55E00",
data = augment(fit)) +
labs(y="Hundreds of Dollars", title="Predicted ATM Usage") +
guides(colour = "none")
lambda2 <- ts_ATM2 |>
features(ATM2, features = guerrero) |>
pull(lambda_guerrero)
ATM2_models <-
ts_ATM2 |>
model("ANA" = ETS(ATM2 ~ error("A") + trend("N") + season("A")),
"Box-Cox" = ETS(box_cox(ATM2,lambda2) ~ error("A") + trend("N") + season("A")),
"ARIMA" = ARIMA(ATM2),
"ARIMA + BC" = ARIMA(ATM2, seasonal = TRUE, lambda = lambda2)
)
## Warning: 1 error encountered for ARIMA + BC
## [1] Could not find an appropriate ARIMA model.
## This is likely because automatic selection does not select models with characteristic roots that may be numerically unstable.
## For more details, refer to https://otexts.com/fpp3/arima-r.html#plotting-the-characteristic-roots
ATM2_forecast <-
ATM2_models |>
forecast(h = 31)
ATM2_forecast|>
autoplot(ts_ATM2, level=NULL) +
labs(y="Hundreds of Dollars")
## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`geom_line()`).
report(ATM2_models)
## Warning in report.mdl_df(ATM2_models): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 3 × 11
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ANA 644. -2253. 4525. 4526. 4564. 628. 630. 17.8 <NULL> <NULL>
## 2 Box-Cox 74.6 -1859. 3738. 3739. 3777. 72.8 73.0 5.99 <NULL> <NULL>
## 3 ARIMA 602. -1653. 3319. 3319. 3342. NA NA NA <cpl [2]> <cpl [9]>
fit6 <- ts_ATM2 |>
model(ARIMA(ATM2, seasonal = TRUE, lambda = lambda2))
## Warning: 1 error encountered for ARIMA(ATM2, seasonal = TRUE, lambda = lambda2)
## [1] Could not find an appropriate ARIMA model.
## This is likely because automatic selection does not select models with characteristic roots that may be numerically unstable.
## For more details, refer to https://otexts.com/fpp3/arima-r.html#plotting-the-characteristic-roots
Trying the ARIMA with transformed data results in the same error. Looking at these values, the best model would be the ETS model after the Box-Cox transformation. More importantly, it confirmed that using just the ARIMA function will automatically use what auto.arima would have selected.
fit2 <- ts_ATM2 |>
model(ETS(box_cox(ATM2,lambda2) ~ error("A") + trend("N") + season("A")))
There is no need to run any model to predict values for ATM3. I have two competing thoughts. The first is to use the Naive Method. The only non-zero values occur at the very end. Maybe the ATM was broken or not yet functional earlier and now we are just able to see how people use it. The other thought is that the overwhelming mode of that data is zero, so using that to base predictions off of seems sensible. Without more information, I’ll follow the first instinct and say that my prediction for each day in the next month for ATM3 is 8500 dollars.
ts_ATM4 |>
gg_tsdisplay()
## Plot variable not specified, automatically selected `y = ATM4`
After replacing the outlier with the median, ATM4 produces an ACF graph fairly similar to the first two, with likely weekly seasonal cycles.
lambda4 <- ts_ATM4 |>
features(ATM4, features = guerrero) |>
pull(lambda_guerrero)
ATM4_models <-
ts_ATM4 |>
model("ANA" = ETS(ATM4 ~ error("A") + trend("N") + season("A")),
"Box-Cox" = ETS(box_cox(ATM4,lambda4) ~ error("A") + trend("N") + season("A")),
"ARIMA" = ARIMA(ATM4)
)
ATM4_forecast <-
ATM4_models |>
forecast(h = 31)
ATM4_forecast|>
autoplot(ts_ATM4, level=NULL) +
labs(y="Hundreds of Dollars")
report(ATM4_models)
## Warning in report.mdl_df(ATM4_models): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 3 × 11
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE ar_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list>
## 1 ANA 110776. -3192. 6404. 6405. 6443. 108044. 108251. 265. <NULL>
## 2 Box-Cox 173. -2012. 4044. 4045. 4083. 168. 169. 10.3 <NULL>
## 3 ARIMA 117510. -2645. 5306. 5307. 5338. NA NA NA <cpl [10]>
## # ℹ 1 more variable: ma_roots <list>
The Box-Cox transformed data will provide the best predictions of this set, but with these large errors, I question the strength of the prediction.
fit4 <- ts_ATM4 |>
model(ETS(box_cox(ATM4,lambda4) ~ error("A") + trend("N") + season("A")))
31 observations are needed because May has 31 days
atm1.fc <- data.frame(forecast(fit, h=31)) |>
remove_rownames()
write_csv(atm1.fc, "ATM1_May.csv")
atm2.fc <- data.frame(forecast(fit2, h=31)) |>
remove_rownames()
write_csv(atm2.fc, "ATM2_May.csv")
atm4.fc <- data.frame(forecast(fit4, h=31)) |>
remove_rownames()
write_csv(atm4.fc, "ATM4_May.csv")
As far as ATM3 is concerned, it is easiest to generate a data frame where all values are the last recorded value, 85. It would help to appear the same as the others.
start_date <- ymd("2010/05/01")
end_date <- ymd("2010/05/30")
range <- seq(start_date, end_date,"days")
atm3.fc <- data.frame (
model = "Naive",
date = range,
mean = 85
)
write_csv(atm3.fc, "ATM3_May.csv")
elecdf <- read_excel("C:/Users/ddebo/Downloads/ResidentialCustomerForecastLoad-624.xlsx")
summary(elecdf)
## CaseSequence YYYY-MMM KWH
## Min. :733.0 Length:192 Min. : 770523
## 1st Qu.:780.8 Class :character 1st Qu.: 5429912
## Median :828.5 Mode :character Median : 6283324
## Mean :828.5 Mean : 6502475
## 3rd Qu.:876.2 3rd Qu.: 7620524
## Max. :924.0 Max. :10655730
## NA's :1
Again, the first issue is that our variable of time is being represented by a character, so that needs to change when we form our time series.
elects <- ts(elecdf[,"KWH"], start=c(1998,1), frequency=12) |>
tsclean()
summary(elects)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4313019 5443502 6351262 6529723 7608792 10655730
So we have a time series, but it is not yet a tsibble.
elects <- elects |>
as_tsibble()
autoplot(elects)
## Plot variable not specified, automatically selected `.vars = value`
There might be a slight positive trend. It is also uncertain whether there is seasonality present. There are cycles, but I need to investigate further if they are regularly spaced.
elects |>
gg_tsdisplay()
## Plot variable not specified, automatically selected `y = value`
The strongest autocorrelations occur at lag 12, so there is yearly seasonality present. Other significant autocorrelations occur at lags 1, 3, 6, 9…. so this data is not stationary and there are multiple cycles at play.