Project 1

Load Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tsibble     1.1.6     ✔ feasts      0.4.2
## ✔ tsibbledata 0.4.1     ✔ fable       0.4.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(readxl)
library(tsibble)
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following object is masked from 'package:tsibble':
## 
##     index
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(readr)
library(writexl)

Part A

Load The Data Into A Dataframe

atm <- read_excel("ATM624Data.xlsx")
atm <- atm |> rename_with(tolower)
glimpse(atm)
## Rows: 1,474
## Columns: 3
## $ date <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
## $ atm  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …

Converting Dates and Cash Into Proper Calender Date and Cash Value

atm <- atm |> mutate(date = as.Date(date, origin = "1900-01-01"), cash = cash * 100)
glimpse(atm)
## Rows: 1,474
## Columns: 3
## $ date <date> 2009-05-03, 2009-05-03, 2009-05-04, 2009-05-04, 2009-05-05, 2009…
## $ atm  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 9600, 10700, 8200, 8900, 8500, 9000, 9000, 5500, 9900, 7900, 8800…

Convert to Tsibble

atm_ts <- atm |> as_tsibble(index = date, key = atm)
head(atm_ts)
## # A tsibble: 6 x 3 [1D]
## # Key:       atm [1]
##   date       atm    cash
##   <date>     <chr> <dbl>
## 1 2009-05-03 ATM1   9600
## 2 2009-05-04 ATM1   8200
## 3 2009-05-05 ATM1   8500
## 4 2009-05-06 ATM1   9000
## 5 2009-05-07 ATM1   9900
## 6 2009-05-08 ATM1   8800

Exploratory Data Analysis

atm_ts %>% count(atm)
## # A tibble: 5 × 2
##   atm       n
##   <chr> <int>
## 1 ATM1    365
## 2 ATM2    365
## 3 ATM3    365
## 4 ATM4    365
## 5 <NA>     14
atm_ts %>% group_by(atm) %>% summarise(null_count = sum(is.na(cash)))
## # A tsibble: 1,474 x 3 [1D]
## # Key:       atm [5]
##    atm   date       null_count
##    <chr> <date>          <int>
##  1 ATM1  2009-05-03          0
##  2 ATM1  2009-05-04          0
##  3 ATM1  2009-05-05          0
##  4 ATM1  2009-05-06          0
##  5 ATM1  2009-05-07          0
##  6 ATM1  2009-05-08          0
##  7 ATM1  2009-05-09          0
##  8 ATM1  2009-05-10          0
##  9 ATM1  2009-05-11          0
## 10 ATM1  2009-05-12          0
## # ℹ 1,464 more rows

There appears to be 14 missing atm data. However, All four ATMs look to have 365 data so It may be a good idea to drop them for now.

ggplot(atm_ts, aes(x = date, y = cash, color = atm)) + 
  geom_line() + 
  labs(title = "Daily ATM Cash Withdrawals", y = "Cash ($)", x = "Date") + 
  theme_minimal()
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).

atm_ts %>% count(atm)
## # A tibble: 5 × 2
##   atm       n
##   <chr> <int>
## 1 ATM1    365
## 2 ATM2    365
## 3 ATM3    365
## 4 ATM4    365
## 5 <NA>     14
summary(atm_ts$cash)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0      50    7300   15558   11400 1091976      19

There is a withdrawal of 1091976 for atm 4. It is unclear whether or not this was a error in the data.

ggplot(atm_ts, aes(x = date, y = cash, color = atm)) + 
  geom_line() + 
  labs(title = "Daily ATM Cash Withdrawals", y = "Cash ($)", x = "Date") + 
  theme_minimal()
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).

Splitting Data by ATM

atm1 <- atm_ts %>%
filter(atm == "ATM1") %>%
mutate(cash = na.approx(cash, na.rm = FALSE))

atm2 <- atm_ts %>%
filter(atm == "ATM2") %>%
mutate(cash = na.approx(cash, na.rm = FALSE))

atm3 <- atm_ts %>% filter(atm == "ATM3")

atm4 <- atm_ts %>%
filter(atm == "ATM4") %>%
mutate(second_highest = sort(cash, decreasing = TRUE)[2],
cash = if_else(cash > 9000, second_highest, cash)) %>%
select(-second_highest) %>%
mutate(cash = na.approx(cash, na.rm = FALSE))

Visualizing Individual ATMs

ggplot(atm1, aes(x = date, y = cash, color = atm)) + 
  geom_line() + 
  labs(title = "Daily ATM 1 Cash Withdrawals", y = "Cash ($)", x = "Date") + 
  theme_minimal()

ggplot(atm2, aes(x = date, y = cash, color = atm)) + 
  geom_line() + 
  labs(title = "Daily ATM 2 Cash Withdrawals", y = "Cash ($)", x = "Date") + 
  theme_minimal()

ggplot(atm3, aes(x = date, y = cash, color = atm)) + 
  geom_line() + 
  labs(title = "Daily ATM 3 Cash Withdrawals", y = "Cash ($)", x = "Date") + 
  theme_minimal()

ggplot(atm4, aes(x = date, y = cash, color = atm)) + 
  geom_line() + 
  labs(title = "Daily ATM 4 Cash Withdrawals", y = "Cash ($)", x = "Date") + 
  theme_minimal()

atm_ts |>
  group_by(atm) |>
  summarise(
    null_count = sum(is.na(cash))
  )
## # A tsibble: 1,474 x 3 [1D]
## # Key:       atm [5]
##    atm   date       null_count
##    <chr> <date>          <int>
##  1 ATM1  2009-05-03          0
##  2 ATM1  2009-05-04          0
##  3 ATM1  2009-05-05          0
##  4 ATM1  2009-05-06          0
##  5 ATM1  2009-05-07          0
##  6 ATM1  2009-05-08          0
##  7 ATM1  2009-05-09          0
##  8 ATM1  2009-05-10          0
##  9 ATM1  2009-05-11          0
## 10 ATM1  2009-05-12          0
## # ℹ 1,464 more rows

ATM1 Forecast

atm1 %>%  model(stl = STL(cash)) %>%  components() %>%  autoplot()

atm1 <- atm1 %>%  mutate(sqrt_cash = sqrt(cash))

atm1 |> model(stl = STL(sqrt_cash)) |> components() |> autoplot()

atm1_train <- atm1 |> filter(date < "2010-04-01")
atm1_test <- atm1 |> filter(date >= "2010-04-01")

atm1_fits <- atm1_train |> model(
snaive = SNAIVE(sqrt_cash),
auto_ETS = ETS(sqrt_cash),
holt_winters_add = ETS(sqrt_cash ~ error("A") + trend("N") + season("A")),
arima = ARIMA(sqrt_cash)
)

atm1_fcs <- atm1_fits |> forecast(h = nrow(atm1_test) + 29)

atm1_fcs |> autoplot(atm1_test) +
labs(y = "SQRT of Cash Withdrawn (Dollars)",
title = "Forecast of Cash Withdrawn (April 1st - May 31st 2010)")

atm1_final_fit <- atm1 |> model(arima = ARIMA(sqrt_cash ~ pdq(0,0,2) + PDQ(0,1,1,period=7)))
atm1_final_fc <- atm1_final_fit |> forecast(h = 29)

atm1_final_fit |> select(.model = "arima") |> gg_tsresiduals() +
labs(title = "ARIMA(0,0,2)(0,1,1)[7] Model Residuals")
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

atm1_fc <- atm1_final_fc |> 
  as_tibble() |>
  filter(.model == "arima") |>
  mutate(
    cash_lower_ci_95 = hilo(sqrt_cash)$lower^2,
    cash_prediction  = mean(sqrt_cash)^2,
    cash_upper_ci_95 = hilo(sqrt_cash)$upper^2
  ) |> 
  select(.model, date, cash_prediction, cash_lower_ci_95, cash_upper_ci_95) |> 
  mutate(atm_id = "atm1")

ATM2 Forecast

atm2 |> model(stl = STL(cash)) |> components() |> autoplot()

atm2_train <- atm2 |> filter(date < "2010-04-01")
atm2_test <- atm2 |> filter(date >= "2010-04-01")

atm2_fits <- atm2_train |> model(
snaive = SNAIVE(cash),
auto_ETS = ETS(cash),
holt_winters_add = ETS(cash ~ error("A") + trend("N") + season("A")),
arima = ARIMA(cash)
)

atm2_fcs <- atm2_fits |> forecast(h = nrow(atm2_test) + 29)

atm2_fcs |> autoplot(atm2_test) +
labs(y = "Cash Withdrawn (Dollars)",
title = "Forecast of Cash Withdrawn (April 1st - May 31st 2010)")

atm2_final_fit <- atm2 |> model(arima = ARIMA(cash ~ pdq(2,0,2) + PDQ(0,1,1,period=7)))
atm2_final_fc <- atm2_final_fit |> forecast(h = 29)

atm2_final_fit |> select(.model = "arima") |> gg_tsresiduals() +
labs(title = "ARIMA(2,0,2)(0,1,1)[7] Model Residuals")

atm2_fc <- atm2_final_fc |> 
  as_tibble() |>
  filter(.model == "arima") |>
  mutate(
    cash_lower_ci_95 = hilo(cash)$lower^2,
    cash_prediction  = mean(cash)^2,
    cash_upper_ci_95 = hilo(cash)$upper^2
  ) |> 
  select(.model, date, cash_prediction, cash_lower_ci_95, cash_upper_ci_95) |> 
  mutate(atm_id = "atm2")

ATM3 Forecast

atm3 |> filter(cash > 0) |> autoplot()
## Plot variable not specified, automatically selected `.vars = cash`

atm3_fit <- atm3 |> model(NAIVE(cash))
atm3_final_fc <- atm3_fit |> forecast(h = 29)
atm3_final_fc |> autoplot(atm3)

atm3_fc <- atm3_final_fc |> 
  as_tibble() |>
  filter(.model == "arima") |>
  mutate(
    cash_lower_ci_95 = hilo(cash)$lower^2,
    cash_prediction  = mean(cash)^2,
    cash_upper_ci_95 = hilo(cash)$upper^2
  ) |> 
  select(.model, date, cash_prediction, cash_lower_ci_95, cash_upper_ci_95) |> 
  mutate(atm_id = "atm3")

ATM4 Forecast

atm4 |> model(stl = STL(cash)) |> components() |> autoplot()

atm4_lambda <- atm4 |> features(cash, features = guerrero) |> pull(lambda_guerrero)

atm4_final_fit <- atm4 |> model(arima = ARIMA(cash))
atm4_final_fc <- atm4_final_fit |> forecast(h = 29)

atm4_final_fit |> select(.model = "arima") |> gg_tsresiduals() +
labs(title = "ATM4 ARIMA Model Residuals")

atm4_final_fc |> autoplot(atm4)

atm4_fc <- atm4_final_fc |> 
  as_tibble() |>
  filter(.model == "arima") |>
  mutate(
    cash_lower_ci_95 = hilo(cash)$lower^2,
    cash_prediction  = mean(cash)^2,
    cash_upper_ci_95 = hilo(cash)$upper^2
  ) |> 
  select(.model, date, cash_prediction, cash_lower_ci_95, cash_upper_ci_95) |> 
  mutate(atm_id = "atm4")

Combining forecasts

atm_fc_final <- bind_rows(atm1_fc, atm2_fc, atm3_fc, atm4_fc)
atm_fc_final
## # A tibble: 87 × 6
##    .model date       cash_prediction cash_lower_ci_95 cash_upper_ci_95 atm_id
##    <chr>  <date>               <dbl>            <dbl>            <dbl> <chr> 
##  1 arima  2010-05-03           8658.           3909.            15272. atm1  
##  2 arima  2010-05-04          10074.           4832.            17219. atm1  
##  3 arima  2010-05-05           7397.           3027.            13687. atm1  
##  4 arima  2010-05-06            443.             98.9            2707. atm1  
##  5 arima  2010-05-07          10024.           4779.            17189. atm1  
##  6 arima  2010-05-08           7946.           3382.            14430. atm1  
##  7 arima  2010-05-09           8568.           3792.            15265. atm1  
##  8 arima  2010-05-10           8710.           3643.            15953. atm1  
##  9 arima  2010-05-11          10058.           4526.            17770. atm1  
## 10 arima  2010-05-12           7397.           2806.            14170. atm1  
## # ℹ 77 more rows

Exporting to Excel

write_xlsx(atm_fc_final, "ATM624DForecastOutput.xlsx")

Part B

Loading Data

power_usage <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
head(power_usage)
## # A tibble: 6 × 3
##   CaseSequence `YYYY-MMM`     KWH
##          <dbl> <chr>        <dbl>
## 1          733 1998-Jan   6862583
## 2          734 1998-Feb   5838198
## 3          735 1998-Mar   5420658
## 4          736 1998-Apr   5010364
## 5          737 1998-May   4665377
## 6          738 1998-Jun   6467147

Renaming and Preliminary Setup

power_usage <- power_usage %>%
rename(Date = "YYYY-MMM") %>%
mutate(Date = yearmonth(Date)) %>%
as_tsibble(index = Date)

glimpse(power_usage)
## Rows: 192
## Columns: 3
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ Date         <mth> 1998 Jan, 1998 Feb, 1998 Mar, 1998 Apr, 1998 May, 1998 Ju…
## $ KWH          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…

Visualizing Data

power_usage %>%
autoplot(KWH) +
labs(
x = "Date",
y = "Power Usage (kWh)",
title = "Power Usage Over Time"
)

Replace missing values with mean

power_usage$KWH <- ifelse(is.na(power_usage$KWH), mean(power_usage$KWH, na.rm = TRUE), power_usage$KWH)

power_usage %>% filter(is.na(power_usage$KWH))
## # A tsibble: 0 x 3 [?]
## # ℹ 3 variables: CaseSequence <dbl>, Date <mth>, KWH <dbl>

Replace extreme outliers with mean

limit <- 3000000
power_usage$KWH <- ifelse(power_usage$KWH < limit, mean(power_usage$KWH, na.rm = TRUE), power_usage$KWH)

Perform Box-Cox Transformation

lambda <- power_usage %>% features(KWH, features = guerrero) %>% pull(lambda_guerrero)

power_usage$KWH_transformed <- box_cox(power_usage$KWH, lambda)

power_usage %>%
autoplot(KWH_transformed) +
labs(x = "Month", y = "kWh (Box-Cox)", title = "Box-Cox Transformed Power Usage")

Checking Stationary

library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
adf_result <- adf.test(power_usage$KWH_transformed)
## Warning in adf.test(power_usage$KWH_transformed): p-value smaller than printed
## p-value
adf_result
## 
##  Augmented Dickey-Fuller Test
## 
## data:  power_usage$KWH_transformed
## Dickey-Fuller = -4.5238, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary

Our p-value is less then 0.05 so we can assume that the series is stationary

Plotting Autocorrelation and Lag Plots

power_usage %>%
gg_tsdisplay(
difference(difference(KWH_transformed, 12)),
plot_type = "partial",
lag = 36
) +
labs(title = "Seasonally Differenced Series", y = "")
## Warning: `gg_tsdisplay()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsdisplay()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 13 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 13 rows containing missing values or values outside the scale range
## (`geom_point()`).

Fitting Models

power_usage_fit <- power_usage %>%
model(
Mean = MEAN(KWH_transformed),
Naive = NAIVE(KWH_transformed),
SeasonalNaive = SNAIVE(KWH_transformed)
)

power_usage_fit %>% forecast(h = 5) %>% autoplot(power_usage)

power_usage_fit %>% select(SeasonalNaive) %>% gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

Model With Seasonal ARIMA

power_usage_arima <- power_usage %>%
model(
auto = ARIMA(KWH_transformed),
arima = ARIMA(KWH_transformed ~ pdq(2,1,0) + PDQ(0,1,1))
)

power_usage_fc <- power_usage_arima %>% forecast(h = 12)

power_usage_fc %>% autoplot(power_usage)

power_usage_arima %>% select(auto) %>% gg_tsresiduals()

Reverting Forecasts Back

library(fabletools)

power_usage_fc_back <- power_usage_fc %>%
  mutate(.mean = inv_box_cox(.mean, lambda))

power_usage_fc_back
## # A fable: 24 x 4 [1M]
## # Key:     .model [2]
##    .model     Date
##    <chr>     <mth>
##  1 auto   2014 Jan
##  2 auto   2014 Feb
##  3 auto   2014 Mar
##  4 auto   2014 Apr
##  5 auto   2014 May
##  6 auto   2014 Jun
##  7 auto   2014 Jul
##  8 auto   2014 Aug
##  9 auto   2014 Sep
## 10 auto   2014 Oct
## # ℹ 14 more rows
## # ℹ 2 more variables: KWH_transformed <dist>, .mean <dbl>

Plotting Forecasts

power_usage_fc_back %>%
  autoplot(power_usage, level = NULL) +
  labs(x = "Month", y = "kWh", title = "Forecasted Power Usage (Back-Transformed)")

Exporting to Excel

export_power_usage <- as.data.frame(power_usage_fc_back)

export_power_usage <- export_power_usage %>%
  select(-KWH_transformed)

write_xlsx(export_power_usage, "ResidentialCustomerForecastOutput.xlsx")