library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tsibble 1.1.6 ✔ feasts 0.4.2
## ✔ tsibbledata 0.4.1 ✔ fable 0.4.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(readxl)
library(tsibble)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following object is masked from 'package:tsibble':
##
## index
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(readr)
library(writexl)
atm <- read_excel("ATM624Data.xlsx")
atm <- atm |> rename_with(tolower)
glimpse(atm)
## Rows: 1,474
## Columns: 3
## $ date <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
## $ atm <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …
atm <- atm |> mutate(date = as.Date(date, origin = "1900-01-01"), cash = cash * 100)
glimpse(atm)
## Rows: 1,474
## Columns: 3
## $ date <date> 2009-05-03, 2009-05-03, 2009-05-04, 2009-05-04, 2009-05-05, 2009…
## $ atm <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 9600, 10700, 8200, 8900, 8500, 9000, 9000, 5500, 9900, 7900, 8800…
atm_ts <- atm |> as_tsibble(index = date, key = atm)
head(atm_ts)
## # A tsibble: 6 x 3 [1D]
## # Key: atm [1]
## date atm cash
## <date> <chr> <dbl>
## 1 2009-05-03 ATM1 9600
## 2 2009-05-04 ATM1 8200
## 3 2009-05-05 ATM1 8500
## 4 2009-05-06 ATM1 9000
## 5 2009-05-07 ATM1 9900
## 6 2009-05-08 ATM1 8800
atm_ts %>% count(atm)
## # A tibble: 5 × 2
## atm n
## <chr> <int>
## 1 ATM1 365
## 2 ATM2 365
## 3 ATM3 365
## 4 ATM4 365
## 5 <NA> 14
atm_ts %>% group_by(atm) %>% summarise(null_count = sum(is.na(cash)))
## # A tsibble: 1,474 x 3 [1D]
## # Key: atm [5]
## atm date null_count
## <chr> <date> <int>
## 1 ATM1 2009-05-03 0
## 2 ATM1 2009-05-04 0
## 3 ATM1 2009-05-05 0
## 4 ATM1 2009-05-06 0
## 5 ATM1 2009-05-07 0
## 6 ATM1 2009-05-08 0
## 7 ATM1 2009-05-09 0
## 8 ATM1 2009-05-10 0
## 9 ATM1 2009-05-11 0
## 10 ATM1 2009-05-12 0
## # ℹ 1,464 more rows
There appears to be 14 missing atm data. However, All four ATMs look to have 365 data so It may be a good idea to drop them for now.
ggplot(atm_ts, aes(x = date, y = cash, color = atm)) +
geom_line() +
labs(title = "Daily ATM Cash Withdrawals", y = "Cash ($)", x = "Date") +
theme_minimal()
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
atm_ts %>% count(atm)
## # A tibble: 5 × 2
## atm n
## <chr> <int>
## 1 ATM1 365
## 2 ATM2 365
## 3 ATM3 365
## 4 ATM4 365
## 5 <NA> 14
summary(atm_ts$cash)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 50 7300 15558 11400 1091976 19
There is a withdrawal of 1091976 for atm 4. It is unclear whether or not this was a error in the data.
ggplot(atm_ts, aes(x = date, y = cash, color = atm)) +
geom_line() +
labs(title = "Daily ATM Cash Withdrawals", y = "Cash ($)", x = "Date") +
theme_minimal()
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
atm1 <- atm_ts %>%
filter(atm == "ATM1") %>%
mutate(cash = na.approx(cash, na.rm = FALSE))
atm2 <- atm_ts %>%
filter(atm == "ATM2") %>%
mutate(cash = na.approx(cash, na.rm = FALSE))
atm3 <- atm_ts %>% filter(atm == "ATM3")
atm4 <- atm_ts %>%
filter(atm == "ATM4") %>%
mutate(second_highest = sort(cash, decreasing = TRUE)[2],
cash = if_else(cash > 9000, second_highest, cash)) %>%
select(-second_highest) %>%
mutate(cash = na.approx(cash, na.rm = FALSE))
ggplot(atm1, aes(x = date, y = cash, color = atm)) +
geom_line() +
labs(title = "Daily ATM 1 Cash Withdrawals", y = "Cash ($)", x = "Date") +
theme_minimal()
ggplot(atm2, aes(x = date, y = cash, color = atm)) +
geom_line() +
labs(title = "Daily ATM 2 Cash Withdrawals", y = "Cash ($)", x = "Date") +
theme_minimal()
ggplot(atm3, aes(x = date, y = cash, color = atm)) +
geom_line() +
labs(title = "Daily ATM 3 Cash Withdrawals", y = "Cash ($)", x = "Date") +
theme_minimal()
ggplot(atm4, aes(x = date, y = cash, color = atm)) +
geom_line() +
labs(title = "Daily ATM 4 Cash Withdrawals", y = "Cash ($)", x = "Date") +
theme_minimal()
atm_ts |>
group_by(atm) |>
summarise(
null_count = sum(is.na(cash))
)
## # A tsibble: 1,474 x 3 [1D]
## # Key: atm [5]
## atm date null_count
## <chr> <date> <int>
## 1 ATM1 2009-05-03 0
## 2 ATM1 2009-05-04 0
## 3 ATM1 2009-05-05 0
## 4 ATM1 2009-05-06 0
## 5 ATM1 2009-05-07 0
## 6 ATM1 2009-05-08 0
## 7 ATM1 2009-05-09 0
## 8 ATM1 2009-05-10 0
## 9 ATM1 2009-05-11 0
## 10 ATM1 2009-05-12 0
## # ℹ 1,464 more rows
atm1 %>% model(stl = STL(cash)) %>% components() %>% autoplot()
atm1 <- atm1 %>% mutate(sqrt_cash = sqrt(cash))
atm1 |> model(stl = STL(sqrt_cash)) |> components() |> autoplot()
atm1_train <- atm1 |> filter(date < "2010-04-01")
atm1_test <- atm1 |> filter(date >= "2010-04-01")
atm1_fits <- atm1_train |> model(
snaive = SNAIVE(sqrt_cash),
auto_ETS = ETS(sqrt_cash),
holt_winters_add = ETS(sqrt_cash ~ error("A") + trend("N") + season("A")),
arima = ARIMA(sqrt_cash)
)
atm1_fcs <- atm1_fits |> forecast(h = nrow(atm1_test) + 29)
atm1_fcs |> autoplot(atm1_test) +
labs(y = "SQRT of Cash Withdrawn (Dollars)",
title = "Forecast of Cash Withdrawn (April 1st - May 31st 2010)")
atm1_final_fit <- atm1 |> model(arima = ARIMA(sqrt_cash ~ pdq(0,0,2) + PDQ(0,1,1,period=7)))
atm1_final_fc <- atm1_final_fit |> forecast(h = 29)
atm1_final_fit |> select(.model = "arima") |> gg_tsresiduals() +
labs(title = "ARIMA(0,0,2)(0,1,1)[7] Model Residuals")
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
atm1_fc <- atm1_final_fc |>
as_tibble() |>
filter(.model == "arima") |>
mutate(
cash_lower_ci_95 = hilo(sqrt_cash)$lower^2,
cash_prediction = mean(sqrt_cash)^2,
cash_upper_ci_95 = hilo(sqrt_cash)$upper^2
) |>
select(.model, date, cash_prediction, cash_lower_ci_95, cash_upper_ci_95) |>
mutate(atm_id = "atm1")
atm2 |> model(stl = STL(cash)) |> components() |> autoplot()
atm2_train <- atm2 |> filter(date < "2010-04-01")
atm2_test <- atm2 |> filter(date >= "2010-04-01")
atm2_fits <- atm2_train |> model(
snaive = SNAIVE(cash),
auto_ETS = ETS(cash),
holt_winters_add = ETS(cash ~ error("A") + trend("N") + season("A")),
arima = ARIMA(cash)
)
atm2_fcs <- atm2_fits |> forecast(h = nrow(atm2_test) + 29)
atm2_fcs |> autoplot(atm2_test) +
labs(y = "Cash Withdrawn (Dollars)",
title = "Forecast of Cash Withdrawn (April 1st - May 31st 2010)")
atm2_final_fit <- atm2 |> model(arima = ARIMA(cash ~ pdq(2,0,2) + PDQ(0,1,1,period=7)))
atm2_final_fc <- atm2_final_fit |> forecast(h = 29)
atm2_final_fit |> select(.model = "arima") |> gg_tsresiduals() +
labs(title = "ARIMA(2,0,2)(0,1,1)[7] Model Residuals")
atm2_fc <- atm2_final_fc |>
as_tibble() |>
filter(.model == "arima") |>
mutate(
cash_lower_ci_95 = hilo(cash)$lower^2,
cash_prediction = mean(cash)^2,
cash_upper_ci_95 = hilo(cash)$upper^2
) |>
select(.model, date, cash_prediction, cash_lower_ci_95, cash_upper_ci_95) |>
mutate(atm_id = "atm2")
atm3 |> filter(cash > 0) |> autoplot()
## Plot variable not specified, automatically selected `.vars = cash`
atm3_fit <- atm3 |> model(NAIVE(cash))
atm3_final_fc <- atm3_fit |> forecast(h = 29)
atm3_final_fc |> autoplot(atm3)
atm3_fc <- atm3_final_fc |>
as_tibble() |>
filter(.model == "arima") |>
mutate(
cash_lower_ci_95 = hilo(cash)$lower^2,
cash_prediction = mean(cash)^2,
cash_upper_ci_95 = hilo(cash)$upper^2
) |>
select(.model, date, cash_prediction, cash_lower_ci_95, cash_upper_ci_95) |>
mutate(atm_id = "atm3")
atm4 |> model(stl = STL(cash)) |> components() |> autoplot()
atm4_lambda <- atm4 |> features(cash, features = guerrero) |> pull(lambda_guerrero)
atm4_final_fit <- atm4 |> model(arima = ARIMA(cash))
atm4_final_fc <- atm4_final_fit |> forecast(h = 29)
atm4_final_fit |> select(.model = "arima") |> gg_tsresiduals() +
labs(title = "ATM4 ARIMA Model Residuals")
atm4_final_fc |> autoplot(atm4)
atm4_fc <- atm4_final_fc |>
as_tibble() |>
filter(.model == "arima") |>
mutate(
cash_lower_ci_95 = hilo(cash)$lower^2,
cash_prediction = mean(cash)^2,
cash_upper_ci_95 = hilo(cash)$upper^2
) |>
select(.model, date, cash_prediction, cash_lower_ci_95, cash_upper_ci_95) |>
mutate(atm_id = "atm4")
atm_fc_final <- bind_rows(atm1_fc, atm2_fc, atm3_fc, atm4_fc)
atm_fc_final
## # A tibble: 87 × 6
## .model date cash_prediction cash_lower_ci_95 cash_upper_ci_95 atm_id
## <chr> <date> <dbl> <dbl> <dbl> <chr>
## 1 arima 2010-05-03 8658. 3909. 15272. atm1
## 2 arima 2010-05-04 10074. 4832. 17219. atm1
## 3 arima 2010-05-05 7397. 3027. 13687. atm1
## 4 arima 2010-05-06 443. 98.9 2707. atm1
## 5 arima 2010-05-07 10024. 4779. 17189. atm1
## 6 arima 2010-05-08 7946. 3382. 14430. atm1
## 7 arima 2010-05-09 8568. 3792. 15265. atm1
## 8 arima 2010-05-10 8710. 3643. 15953. atm1
## 9 arima 2010-05-11 10058. 4526. 17770. atm1
## 10 arima 2010-05-12 7397. 2806. 14170. atm1
## # ℹ 77 more rows
write_xlsx(atm_fc_final, "ATM624DForecastOutput.xlsx")
power_usage <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
head(power_usage)
## # A tibble: 6 × 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
power_usage <- power_usage %>%
rename(Date = "YYYY-MMM") %>%
mutate(Date = yearmonth(Date)) %>%
as_tsibble(index = Date)
glimpse(power_usage)
## Rows: 192
## Columns: 3
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ Date <mth> 1998 Jan, 1998 Feb, 1998 Mar, 1998 Apr, 1998 May, 1998 Ju…
## $ KWH <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
power_usage %>%
autoplot(KWH) +
labs(
x = "Date",
y = "Power Usage (kWh)",
title = "Power Usage Over Time"
)
power_usage$KWH <- ifelse(is.na(power_usage$KWH), mean(power_usage$KWH, na.rm = TRUE), power_usage$KWH)
power_usage %>% filter(is.na(power_usage$KWH))
## # A tsibble: 0 x 3 [?]
## # ℹ 3 variables: CaseSequence <dbl>, Date <mth>, KWH <dbl>
limit <- 3000000
power_usage$KWH <- ifelse(power_usage$KWH < limit, mean(power_usage$KWH, na.rm = TRUE), power_usage$KWH)
lambda <- power_usage %>% features(KWH, features = guerrero) %>% pull(lambda_guerrero)
power_usage$KWH_transformed <- box_cox(power_usage$KWH, lambda)
power_usage %>%
autoplot(KWH_transformed) +
labs(x = "Month", y = "kWh (Box-Cox)", title = "Box-Cox Transformed Power Usage")
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
adf_result <- adf.test(power_usage$KWH_transformed)
## Warning in adf.test(power_usage$KWH_transformed): p-value smaller than printed
## p-value
adf_result
##
## Augmented Dickey-Fuller Test
##
## data: power_usage$KWH_transformed
## Dickey-Fuller = -4.5238, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
Our p-value is less then 0.05 so we can assume that the series is stationary
power_usage %>%
gg_tsdisplay(
difference(difference(KWH_transformed, 12)),
plot_type = "partial",
lag = 36
) +
labs(title = "Seasonally Differenced Series", y = "")
## Warning: `gg_tsdisplay()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsdisplay()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 13 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 13 rows containing missing values or values outside the scale range
## (`geom_point()`).
power_usage_fit <- power_usage %>%
model(
Mean = MEAN(KWH_transformed),
Naive = NAIVE(KWH_transformed),
SeasonalNaive = SNAIVE(KWH_transformed)
)
power_usage_fit %>% forecast(h = 5) %>% autoplot(power_usage)
power_usage_fit %>% select(SeasonalNaive) %>% gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
power_usage_arima <- power_usage %>%
model(
auto = ARIMA(KWH_transformed),
arima = ARIMA(KWH_transformed ~ pdq(2,1,0) + PDQ(0,1,1))
)
power_usage_fc <- power_usage_arima %>% forecast(h = 12)
power_usage_fc %>% autoplot(power_usage)
power_usage_arima %>% select(auto) %>% gg_tsresiduals()
library(fabletools)
power_usage_fc_back <- power_usage_fc %>%
mutate(.mean = inv_box_cox(.mean, lambda))
power_usage_fc_back
## # A fable: 24 x 4 [1M]
## # Key: .model [2]
## .model Date
## <chr> <mth>
## 1 auto 2014 Jan
## 2 auto 2014 Feb
## 3 auto 2014 Mar
## 4 auto 2014 Apr
## 5 auto 2014 May
## 6 auto 2014 Jun
## 7 auto 2014 Jul
## 8 auto 2014 Aug
## 9 auto 2014 Sep
## 10 auto 2014 Oct
## # ℹ 14 more rows
## # ℹ 2 more variables: KWH_transformed <dist>, .mean <dbl>
power_usage_fc_back %>%
autoplot(power_usage, level = NULL) +
labs(x = "Month", y = "kWh", title = "Forecasted Power Usage (Back-Transformed)")
export_power_usage <- as.data.frame(power_usage_fc_back)
export_power_usage <- export_power_usage %>%
select(-KWH_transformed)
write_xlsx(export_power_usage, "ResidentialCustomerForecastOutput.xlsx")