DATA624-PROJECT1

Project 1

# Load required libraries
library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tsibble)

## Warning: package 'tsibble' was built under R version 4.3.3

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr

## 
## Attaching package: 'tsibble'

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

library(fable)

## Loading required package: fabletools

library(ggplot2)
library(fpp3)

## Warning: package 'fpp3' was built under R version 4.3.3

## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──

## ✔ tibble      3.2.1     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.3.2
## ✔ lubridate   1.9.3

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()     masks base::date()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ tsibble::intersect()  masks base::intersect()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ tsibble::setdiff()    masks base::setdiff()
## ✖ tsibble::union()      masks base::union()

library(readr)
library(feasts)
library(forecast)

## Warning: package 'forecast' was built under R version 4.3.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(lubridate)

Part A – ATM Forecast, ATM624Data.xlsx

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.

We will begin by loading the data and visualizing each ATM idividually. This will give us insight on how to proceed.

# Load and preprocess data
atm_data_raw <- read_excel("/Users/zigcah/Downloads/ATM624Data.xlsx")


# Check structure
glimpse(atm_data_raw)

## Rows: 1,474
## Columns: 3
## $ DATE <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
## $ ATM  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ Cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …

atm_data <- atm_data_raw %>% 
  mutate(DATE = as.Date(DATE)) %>% 
  group_by(ATM, year_month = yearmonth(DATE)) %>% 
  summarize(Cash = sum(Cash, na.rm = TRUE)) %>% 
  as_tsibble(key = ATM, index = year_month)

## `summarise()` has grouped output by 'ATM'. You can override using the `.groups`
## argument.

# Visualization for ATM1
atm_data %>%
  filter(ATM == "ATM1") %>%
  ggplot(aes(x = year_month, y = Cash)) +
  geom_line(color = "blue") +
  labs(
    title = "Monthly Cash Withdrawals - ATM1",
    x = "Month",
    y = "Cash (hundreds)"
  ) +
  theme_minimal()

# Visualization for ATM2
atm_data %>%
  filter(ATM == "ATM2") %>%
  ggplot(aes(x = year_month, y = Cash)) +
  geom_line(color = "green") +
  labs(
    title = "Monthly Cash Withdrawals - ATM2",
    x = "Month",
    y = "Cash (hundreds)"
  ) +
  theme_minimal()

# Visualization for ATM3
atm_data %>%
  filter(ATM == "ATM3") %>%
  ggplot(aes(x = year_month, y = Cash)) +
  geom_line(color = "red") +
  labs(
    title = "Monthly Cash Withdrawals - ATM3",
    x = "Month",
    y = "Cash (hundreds)"
  ) +
  theme_minimal()

# Visualization for ATM4
atm_data %>%
  filter(ATM == "ATM4") %>%
  ggplot(aes(x = year_month, y = Cash)) +
  geom_line(color = "purple") +
  labs(
    title = "Monthly Cash Withdrawals - ATM4",
    x = "Month",
    y = "Cash (hundreds)"
  ) +
  theme_minimal()

# Visualize for all ATM's
atm_data %>% 
  ggplot(aes(x = year_month, y = Cash, color = ATM)) +
  geom_line() +
  labs(title = "Monthly Cash Withdrawals All ATM's", x = "Month", y = "Cash (hundreds)")

# Clean the dataset
atm_data <- atm_data_raw %>%
  mutate(
    DATE = as.Date(DATE, format = "%m/%d/%Y")
  )

# Check the structure
glimpse(atm_data)

## Rows: 1,474
## Columns: 3
## $ DATE <date> 2079-05-03, 2079-05-03, 2079-05-04, 2079-05-04, 2079-05-05, 2079…
## $ ATM  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ Cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …

# Ensure data is a valid tsibble
atm_tsibble <- atm_data %>%
  as_tsibble(index = DATE, key = ATM)

# Check for missing or irregular data
atm_tsibble %>%
  summarize(min_date = min(DATE), max_date = max(DATE), unique_dates = n_distinct(DATE))

## # A tsibble: 379 x 4 [1D]
##    DATE       min_date   max_date   unique_dates
##    <date>     <date>     <date>            <int>
##  1 2079-05-03 2079-05-03 2079-05-03            1
##  2 2079-05-04 2079-05-04 2079-05-04            1
##  3 2079-05-05 2079-05-05 2079-05-05            1
##  4 2079-05-06 2079-05-06 2079-05-06            1
##  5 2079-05-07 2079-05-07 2079-05-07            1
##  6 2079-05-08 2079-05-08 2079-05-08            1
##  7 2079-05-09 2079-05-09 2079-05-09            1
##  8 2079-05-10 2079-05-10 2079-05-10            1
##  9 2079-05-11 2079-05-11 2079-05-11            1
## 10 2079-05-12 2079-05-12 2079-05-12            1
## # ℹ 369 more rows

STL Decomposition on a monthly basis

# Aggregate the data to monthly totals
atm_monthly <- atm_tsibble %>%
  index_by(year_month = yearmonth(DATE)) %>%
  summarize(Cash = sum(Cash, na.rm = TRUE))

# Perform STL decomposition for each ATM
atm_stl <- atm_monthly %>%
  model(
    stl_decomp = STL(Cash ~ trend(window = 7) + season(window = "periodic"))
  ) %>%
  components()

# Check the actual column names in the components output
glimpse(atm_stl)

## Rows: 13
## Columns: 6
## Key: .model [1]
## : Cash = trend + remainder
## $ .model        <chr> "stl_decomp", "stl_decomp", "stl_decomp", "stl_decomp", …
## $ year_month    <mth> 2079 May, 2079 Jun, 2079 Jul, 2079 Aug, 2079 Sep, 2079 O…
## $ Cash          <dbl> 16943.9074, 17692.9987, 19319.2780, 19971.4699, 22397.23…
## $ trend         <dbl> 17283.602, 17779.314, 18275.025, 18634.995, 18635.587, 1…
## $ remainder     <dbl> -339.69500, -86.31512, 1044.25288, 1336.47514, 3761.6501…
## $ season_adjust <dbl> 16943.9074, 17692.9987, 19319.2780, 19971.4699, 22397.23…

 atm_monthly <- atm_data_raw %>%
  group_by(ATM, year_month = yearmonth(DATE)) %>%
  summarize(Cash = sum(Cash, na.rm = TRUE)) %>%
  ungroup() %>%
  as_tsibble(key = ATM, index = year_month)

## `summarise()` has grouped output by 'ATM'. You can override using the `.groups`
## argument.

atm_stl <- atm_monthly %>%
  model(
    stl_decomp = STL(Cash ~ trend(window = 7) + season(window = "periodic"))
  ) %>%
  components()   


# Visualize STL components for each ATM
atm_stl %>%
  ggplot(aes(x = year_month)) +
  geom_line(aes(y = season_adjust), color = "blue") +
  facet_wrap(~ATM, scales = "free_y") +
  labs(
    title = "STL Decomposition for ATMs",
    x = "Month",
    y = "Seasonally Adjusted Cash"
  )

STL decomposition components for each ATM

# Visualize full STL decomposition components for each ATM
atm_stl %>%
  pivot_longer(cols = c(trend, remainder, season_adjust), 
               names_to = "component", values_to = "value") %>%
  ggplot(aes(x = year_month, y = value, color = ATM)) +
  geom_line() +
  facet_wrap(~component, scales = "free_y", ncol = 1) +
  labs(
    title = "STL Decomposition Components for ATMs",
    x = "Month",
    y = "Value"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12)
  )

Forecasting Models

# Fit forecasting models for each ATM
atm_models <- atm_monthly %>%
  model(
    arima = ARIMA(Cash),
    ets = ETS(Cash ~ error("A") + trend("A") + season("M"))
  )

## Warning: 2 errors (1 unique) encountered for ets
## [2] missing value where TRUE/FALSE needed

# Forecast for May 2010 (1 month)
atm_forecasts <- atm_models %>% forecast(h = "1 month")

# Visualize forecasts
atm_forecasts %>%
  autoplot(atm_monthly) +
  labs(
    title = "Forecast for ATM Withdrawals (May 2010)",
    x = "Month",
    y = "Cash (hundreds)"
  ) +
  facet_wrap(~ATM, scales = "free_y")

## Warning in min(x, na.rm = na.rm): no non-missing arguments to min; returning
## Inf

## Warning in max(x, na.rm = na.rm): no non-missing arguments to max; returning
## -Inf

## Warning in min(x, na.rm = na.rm): no non-missing arguments to min; returning
## Inf

## Warning in max(x, na.rm = na.rm): no non-missing arguments to max; returning
## -Inf

## Warning in min(x, na.rm = na.rm): no non-missing arguments to min; returning
## Inf

## Warning in max(x, na.rm = na.rm): no non-missing arguments to max; returning
## -Inf

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Filter for ATM1
atm1_data <- atm_tsibble %>%
  filter(ATM == "ATM1")

# Build models for ATM1
atm1_models <- atm1_data %>%
  model(
    ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
    arima = ARIMA(Cash)
  )

## Warning: 1 error encountered for ets
## [1] ETS does not support missing values.

# Forecast for ATM1 for May 2010
atm1_forecast <- atm1_models %>%
  forecast(h = 31)  # 31 days for May 2010

# Plot the forecast
atm1_forecast %>%
  autoplot(atm1_data, level = NULL) +
  labs(title = "ATM1 Forecast for May 2010", y = "Cash (hundreds of dollars)")

## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Filter for ATM2
atm2_data <- atm_tsibble %>%
  filter(ATM == "ATM2")

# Build model for ATM2
atm2_models <- atm2_data %>%
  model(
    ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
    arima = ARIMA(Cash)
  )

## Warning: 1 error encountered for ets
## [1] ETS does not support missing values.

# Forecast for ATM2 for May 2010
atm2_forecast <- atm2_models %>%
  forecast(h = 31)  # 31 days for May 2010

# Plot the forecast
atm2_forecast %>%
  autoplot(atm2_data, level = NULL) +
  labs(title = "ATM2 Forecast for May 2010", y = "Cash (hundreds of dollars)")

## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Filter for ATM3
atm3_data <- atm_tsibble %>%
  filter(ATM == "ATM3")

# Build model for ATM3
atm3_models <- atm3_data %>%
  model(
    ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
    arima = ARIMA(Cash)
  )

# Forecast for ATM3 for May 2010
atm3_forecast <- atm3_models %>%
  forecast(h = 31)  # 31 days for the of May

# Plot the forecast
atm3_forecast %>%
  autoplot(atm3_data, level = NULL) +
  labs(title = "ATM3 Forecast for May 2010", y = "Cash (hundreds of dollars)")

# Filter for ATM4
atm4_data <- atm_tsibble %>%
  filter(ATM == "ATM4")

# Build model for ATM4
atm4_models <- atm4_data %>%
  model(
    ets = ETS(Cash ~ error("A") + trend("A") + season("A")),
    arima = ARIMA(Cash)
  )

# Forecast for ATM4 for May 2010
atm4_forecast <- atm4_models %>%
  forecast(h = 31)  # 31 days for May 2010

# Plot the forecast
atm4_forecast %>%
  autoplot(atm4_data, level = NULL) +
  labs(title = "ATM4 Forecast for May 2010", y = "Cash (hundreds of dollars)")

# Combine forecasts for all ATMs
all_forecasts <- bind_rows(
  atm1_forecast %>% as_tibble() %>% mutate(ATM = "ATM1"),
  atm2_forecast %>% as_tibble() %>% mutate(ATM = "ATM2"),
  atm3_forecast %>% as_tibble() %>% mutate(ATM = "ATM3"),
  atm4_forecast %>% as_tibble() %>% mutate(ATM = "ATM4")
)

# Plot for combined forecasts
all_forecasts %>%
  ggplot(aes(x = DATE, y = .mean, color = ATM, group = ATM)) +
  geom_line(size = 1) +
  labs(
    title = "Forecasted Cash Withdrawals for May 2010",
    x = "Date",
    y = "Cash (hundreds of dollars)",
    color = "ATM"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12)
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

atm_stl <- atm_monthly %>%
  model(
    stl_decomp = STL(Cash ~ trend(window = 7) + season(window = "periodic"))
  ) %>%
  components()       

atm_stl %>%
  ggplot(aes(x = year_month)) +
  geom_line(aes(y = season_adjust), color = "blue") +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(title="Seasonally Adjusted Cash by ATM")

atm_models <- atm_monthly %>%
  model(
    arima_fit = ARIMA(Cash),
    # or if you like ETS
    ets_fit   = ETS(Cash ~ error("A") + trend("A") + season("M"))
  )

## Warning: 2 errors (1 unique) encountered for ets_fit
## [2] missing value where TRUE/FALSE needed

atm_forecasts <- atm_models %>%
  forecast(h = "1 month")

atm_forecasts %>%
  autoplot(atm_monthly, level = NULL) +
  labs(title = "Forecast for May 2010", x = "Month", y = "Cash (hundreds)") +
  facet_wrap(~ATM, scales="free_y")

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Save combined forecasts to a single CSV
write.csv(as.data.frame(atm_forecasts), "ATM_Forecast_May2010.csv")

Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.

Part B – Forecasting Power

library(readxl)
library(dplyr)
library(tsibble)
library(fable)
library(feasts)
library(lubridate)
library(ggplot2)
library(forecast)   
library(purrr)

# Import and Basic Cleaning 
power_data <- read_excel("/Users/zigcah/Downloads/ResidentialCustomerForecastLoad-624.xlsx")

# Convert 'YYYY-MMM' to a Date format
power_data <- power_data %>%
  mutate(date = as.Date(
    paste0(
      substr(`YYYY-MMM`, 1, 4), "-",
      match(substr(`YYYY-MMM`, 6, 8), month.abb),
      "-01"
    )
  )) %>%
  select(date, KWH) %>%
  drop_na(KWH) %>%
  as_tsibble(index = date)

# Quick check of the data 
head(power_data)

## # A tsibble: 6 x 2 [1D]
##   date           KWH
##   <date>       <dbl>
## 1 1998-01-01 6862583
## 2 1998-02-01 5838198
## 3 1998-03-01 5420658
## 4 1998-04-01 5010364
## 5 1998-05-01 4665377
## 6 1998-06-01 6467147

str(power_data)

## tbl_ts [191 × 2] (S3: tbl_ts/tbl_df/tbl/data.frame)
##  $ date: Date[1:191], format: "1998-01-01" "1998-02-01" ...
##  $ KWH : num [1:191] 6862583 5838198 5420658 5010364 4665377 ...
##  - attr(*, "key")= tibble [1 × 1] (S3: tbl_df/tbl/data.frame)
##   ..$ .rows: list<int> [1:1] 
##   .. ..$ : int [1:191] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..@ ptype: int(0) 
##  - attr(*, "index")= chr "date"
##   ..- attr(*, "ordered")= logi TRUE
##  - attr(*, "index2")= chr "date"
##  - attr(*, "interval")= interval [1:1] 1D
##   ..@ .regular: logi TRUE

# Time series plot
power_data %>%
  ggplot(aes(x = date, y = KWH)) +
  geom_line() +
  labs(title = "Monthly Residential Power Usage", x = "Month", y = "KWH")

We will check the data for missing values which need to be handled to complete our forecast.

# Check for missing data by plotting the time series with NA values highlighted
ggplot(power_data, aes(x = date, y = KWH)) +
  geom_point(aes(color = is.na(KWH))) +
  labs(title = "Missing Data in Power Usage", x = "Month", y = "KWH")

We will now clean the data and remove any nulls or N/A’s so we can properly conduct our analysis.

# Fill any missing time periods (explicit and implicit gaps)
power_data_filled <- power_data %>%
  fill_gaps()

# Check if there are still any missing values after fill_gaps
sum(is.na(power_data_filled$KWH))

## [1] 5623

# If there are still NA values, we can apply an imputation method like linear interpolation
power_data_filled <- power_data_filled %>%
  mutate(KWH = zoo::na.approx(KWH))  # Using linear interpolation for NA values

# Check if there are still any missing values after imputation
sum(is.na(power_data_filled$KWH))

## [1] 0

# Apply STL decomposition
power_data_stl <- power_data_filled %>%
  model(
    STL(KWH ~ trend(window = 7) + season(window = "periodic"))
  ) %>%
  components()

# Visualize the decomposition components
autoplot(power_data_stl) +
  labs(title = "STL Decomposition of Power Usage", x = "Month", y = "KWH")

# Fit ARIMA model to the seasonally adjusted data (from STL)
power_arima_model <- power_data_stl %>%
  model(
    arima = ARIMA(season_adjust)
  )

# Report the model summary
report(power_arima_model)

## Series: season_adjust 
## Model: ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.9389  -0.8612
## s.e.  0.0098   0.0140
## 
## sigma^2 estimated as 1.954e+10:  log likelihood=-77119.19
## AIC=154244.4   AICc=154244.4   BIC=154264.4

# Extract residuals from ARIMA model specifically
residuals_power_arima <- power_arima_model %>%
  select(arima) %>%  # Select the ARIMA model specifically
  residuals()  # Extract residuals

# Plot the residuals
autoplot(residuals_power_arima) +
  labs(title = "Residuals of ARIMA Model", x = "Month", y = "Residuals")

## Plot variable not specified, automatically selected `.vars = .resid`