Project 1 Data 624

PART A. ATM DATASET CLEANING AND DATE OBSERVATIONS

#import
atm_main <- read_excel("~/Downloads/ATM624Data.xlsx")

# Clean the data
atm_cleaned <- atm_main |>
  filter(!is.na(DATE), !is.na(ATM), !is.na(Cash)) |>
  mutate(
    DATE = as.Date(DATE, origin = "1899-12-30"),
    ATM = as.factor(ATM),
    Cash = as.numeric(Cash)
  ) |>
  arrange(ATM, DATE)

#Initial observation of Data
atm_summary <- atm_cleaned |>
  group_by(ATM) |>
  summarise(
    start_date = min(DATE),
    end_date = max(DATE),
    n_obs = n(),
    mean_cash = mean(Cash, na.rm = TRUE),
    median_cash = median(Cash, na.rm = TRUE),
    sd_cash = sd(Cash, na.rm = TRUE),
    min_cash = min(Cash, na.rm = TRUE),
    max_cash = max(Cash, na.rm = TRUE)
  )

atm_summary

## # A tibble: 4 × 9
##   ATM   start_date end_date   n_obs mean_cash median_cash sd_cash min_cash
##   <fct> <date>     <date>     <int>     <dbl>       <dbl>   <dbl>    <dbl>
## 1 ATM1  2009-05-01 2010-04-30   362    83.9           91    36.7      1   
## 2 ATM2  2009-05-01 2010-04-30   363    62.6           67    38.9      0   
## 3 ATM3  2009-05-01 2010-04-30   365     0.721          0     7.94     0   
## 4 ATM4  2009-05-01 2010-04-30   365   474.           404.  651.       1.56
## # ℹ 1 more variable: max_cash <dbl>

atm_cleaned <- atm_cleaned |>
  as_tsibble(key = ATM, index = DATE)
#There are empty entries, some of which have an ATM declared but no Cash value. The  
#values I kept were those with ATM and Cash amount to prevent from having ATM entries 
#with no value to be used. There are values with 0 and I kept all of those, specifically 
#ATM 3 has many 0 values. Find any gaps in tables, this shows all have.
#Find dates and replace with fill_gaps()
atm_cleaned |>
  has_gaps()

## # A tibble: 4 × 2
##   ATM   .gaps
##   <fct> <lgl>
## 1 ATM1  TRUE 
## 2 ATM2  TRUE 
## 3 ATM3  FALSE
## 4 ATM4  FALSE

scan_gaps(atm_cleaned)

## # A tsibble: 5 x 2 [1D]
## # Key:       ATM [2]
##   ATM   DATE      
##   <fct> <date>    
## 1 ATM1  2009-06-13
## 2 ATM1  2009-06-16
## 3 ATM1  2009-06-22
## 4 ATM2  2009-06-18
## 5 ATM2  2009-06-24

atm_cleaned_f <- atm_cleaned |>
  fill_gaps()

Preliminary Observation of Each ATM

-In our initial observations for each ATM, it was clear that an outlier of 10,920 on 2010-02-09 was severely influencing our analysis of the dataset as well as the scaling we can view the data through. ATM3 also has an abnormal dataset filled with zeroes with the exception of the final 3 values. I think this would be the ATM I trust prediction for least. It may have been newly placed or not functioning previously but it is a non-stationary as the set is all zeroes until the final few values. The value was left here for observation purposes but will be removed for our modeling comparisons.

#Mean imputation of outlier
#Removing outlier to calculate replacement so calculation is not biased
atm_cleaned_f <- atm_cleaned |>
  as_tibble() |>
  group_by(ATM) |>
  mutate(
    atm_mean = mean(
      Cash[!(ATM == "ATM4" & DATE == as.Date("2010-02-09"))],
      na.rm = TRUE
    ),
    Cash = case_when(
      ATM == "ATM4" & DATE == as.Date("2010-02-09") ~ atm_mean,
      TRUE ~ Cash
    )
  ) |>
  ungroup() |>
  select(-atm_mean) |>
  as_tsibble(key = ATM, index = DATE)
#Check Outlier
atm_cleaned |>
  filter(ATM == "ATM4", DATE == as.Date("2010-02-09"))

## # A tsibble: 1 x 3 [1D]
## # Key:       ATM [1]
##   DATE       ATM     Cash
##   <date>     <fct>  <dbl>
## 1 2010-02-09 ATM4  10920.

atm_cleaned_f |>
  filter(ATM == "ATM4", DATE == as.Date("2010-02-09"))

## # A tsibble: 1 x 3 [1D]
## # Key:       ATM [1]
##   DATE       ATM    Cash
##   <date>     <fct> <dbl>
## 1 2010-02-09 ATM4   445.

atm_cleaned <- atm_cleaned |>
  as_tsibble(key = ATM, index = DATE)

#Compare cleaned and final datasets, ATM4 now appears stationary and fit for model
autoplot(atm_cleaned, Cash) +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(
    title = "Daily Cash Withdrawals by ATM",
    x = "Date",
    y = "Cash Withdrawals (Hundreds of Dollars)"
  )

autoplot(atm_cleaned_f, Cash) +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(
    title = "Daily Cash Withdrawals by ATM",
    x = "Date",
    y = "Cash Withdrawals (Hundreds of Dollars)"
  )

Day of the week Observations and analysis

Our analysis shows some interesting outcomes. ATM1, ATM2, and ATM4 have some extremely similar behavior including a dropoff on Thursday and relatively consistent mean and variance for each atm. Saturday for every ATM also shows that data points fall on the higher end suggesting withdrawls are common with few low values. ATM4 shows by far the greatest variance but it does not seem to be increasing as the set goes on so these are all suitable for our ARIMA and other models. Our Boxplot also interestingly shows us that the ATM1 has the least variance especially relative to the size of the mean. This is confirmed in our atm_summary_f table. ATM3 gives us a very questionable observation about whether this is a dataset that is valuable for prediction. With so few valid datapoints this is very limiting.

#Day of the Week Observations

autoplot(atm_cleaned_f, Cash) +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(
    title = "Daily Cash Withdrawals by ATM",
    x = "Date",
    y = "Cash Withdrawals (Hundreds of Dollars)"
  )

atm_dayofweek <- atm_cleaned_f |>
  as_tibble() |>
  mutate(day_of_week = wday(DATE, label = TRUE, abbr = FALSE)) |>
  group_by(ATM, day_of_week) |>
  summarise(
    mean_cash = mean(Cash, na.rm = TRUE),
    median_cash = median(Cash, na.rm = TRUE),
    sd_cash = sd(Cash, na.rm = TRUE),
    min_cash = min(Cash, na.rm = TRUE),
    max_cash = max(Cash, na.rm = TRUE),
    .groups = "drop"
  )

atm_dayofweek

## # A tibble: 28 × 7
##    ATM   day_of_week mean_cash median_cash sd_cash min_cash max_cash
##    <fct> <ord>           <dbl>       <dbl>   <dbl>    <dbl>    <dbl>
##  1 ATM1  Sunday          103.        108      20.6       26      152
##  2 ATM1  Monday           86          85      21.0        6      126
##  3 ATM1  Tuesday          89.6        99      47.9        1      180
##  4 ATM1  Wednesday        82.2        78.5    25.1        2      179
##  5 ATM1  Thursday         31.7        16      32.8        4      125
##  6 ATM1  Friday           98.6        98      30.3        1      152
##  7 ATM1  Saturday         96.6        96      14.0       69      144
##  8 ATM2  Sunday           67.2        64      24.9        0      122
##  9 ATM2  Monday           58.7        63.5    32.2        2      132
## 10 ATM2  Tuesday          73.2        82.5    41.3        0      146
## # ℹ 18 more rows

atm_cleaned_f |>
  as_tibble() |>
  mutate(day_of_week = wday(DATE, label = TRUE, abbr = FALSE)) |>
  ggplot(aes(x = day_of_week, y = Cash, fill = ATM)) +
  geom_boxplot() +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(
    title = "Distribution of Cash Withdrawals by Day of Week",
    x = "Day of Week",
    y = "Cash Withdrawals (Hundreds of Dollars)"
  )

atm_summary_f <- atm_cleaned_f |>
  as_tibble() |>
  group_by(ATM) |>
  summarise(
    n_obs = n(),
    mean_cash = mean(Cash, na.rm = TRUE),
    median_cash = median(Cash, na.rm = TRUE),
    sd_cash = sd(Cash, na.rm = TRUE),
    min_cash = min(Cash, na.rm = TRUE),
    max_cash = max(Cash, na.rm = TRUE)
  )

atm_summary_f

## # A tibble: 4 × 7
##   ATM   n_obs mean_cash median_cash sd_cash min_cash max_cash
##   <fct> <int>     <dbl>       <dbl>   <dbl>    <dbl>    <dbl>
## 1 ATM1    362    83.9           91    36.7      1        180 
## 2 ATM2    363    62.6           67    38.9      0        147 
## 3 ATM3    365     0.721          0     7.94     0         96 
## 4 ATM4    365   445.           404.  351.       1.56    1712.

#Creating training data and models
atm_train <- atm_cleaned_f |>
  fill_gaps() |>
  as_tibble() |>
  group_by(ATM) |>
  arrange(DATE, .by_group = TRUE) |>
  mutate(Cash = na.approx(Cash, na.rm = FALSE, rule = 2)) |>
  ungroup() |>
  as_tsibble(key = ATM, index = DATE)

atm_naive <- atm_train |>
  model(NAIVE = NAIVE(Cash))

atm_snaive <- atm_train |>
  model(SNAIVE = SNAIVE(Cash ~ lag("week")))

atm_drift <- atm_train |>
  model(DRIFT = RW(Cash ~ drift()))

atm_ets <- atm_train |>
  model(ETS = ETS(Cash))

atm_arima <- atm_train |>
  model(ARIMA = ARIMA(Cash))

Based on RMSE, ARIMA performed best for ATM1, ATM2, and ATM3, while ETS performed best for ATM4. Although the ARIMA residual diagnostics were satisfactory for all four ATM series, final model selection was based on forecast accuracy. Therefore, ARIMA was selected for ATM1–ATM3, and ETS was selected for ATM4. ETS was a close competitor in every ATM observed which suggests that there is a seasonal trend in our data that can be valuable in prediction.

The ACF plots indicate strong weekly seasonality for ATM1 and ATM2, as shown by large spikes at lags 7, 14, and 21 which suggests weekly seasonality in these cash withdrawls as we also saw in our boxplot. ATM3 shows stronger short-lag dependence at lags 1 and 2, with less evidence of weekly seasonality. ATM4 has comparatively weak autocorrelation, suggesting a more irregular series. Overall, the plots show that the ATM series have different dependence structures, which helps explain why different forecasting models perform better for different ATMs. ATM3 has heavy autocorrelation that drops off as it is primarily zero values which make context much more important for why the values were mostly zero before the spike.

The amount ATM1 & ATM2 had better success with a model that captures structured dependency well such as ARIMA is not surprising based on the autocorrelation. ATM4 in contrast had more moderate ACF fluctuations and in this model the ETS was able to achieve a better RMSE. Due to how similar the results were I believe the ARIMA model best predicts these but using an ETS model in comparison or replacement on the ATM’S with much higher withdrawls like ATM4. This is why I believe a hybrid method of looking at withdrawls for each ATM to check for the strength of seasonality is the best way to decide which model is best for each ATM when considering for something like cash planning to ensure withdrawl transactions are constant and not stalled by cash planning.

atm_accuracy_train <- bind_rows(
  accuracy(atm_naive),
  accuracy(atm_snaive),
  accuracy(atm_drift),
  accuracy(atm_ets),
  accuracy(atm_arima)
)

atm_accuracy_train

## # A tibble: 20 × 11
##    ATM   .model .type        ME   RMSE     MAE    MPE  MAPE  MASE RMSSE     ACF1
##    <fct> <chr>  <chr>     <dbl>  <dbl>   <dbl>  <dbl> <dbl> <dbl> <dbl>    <dbl>
##  1 ATM1  NAIVE  Trai… -3.02e- 2  49.9   37.2   -132.  167.  2.10  1.80  -0.356  
##  2 ATM2  NAIVE  Trai… -4.67e- 2  54.2   42.5   -Inf   Inf   2.05  1.79  -0.307  
##  3 ATM3  NAIVE  Trai…  2.34e- 1   5.09   0.310   28.8  40.2 0.423 0.632 -0.149  
##  4 ATM4  NAIVE  Trai… -8.10e- 1 477.   386.    -488.  547.  1.12  1.05  -0.464  
##  5 ATM1  SNAIVE Trai… -3.63e- 2  27.7   17.7    -96.5 116.  1     1      0.188  
##  6 ATM2  SNAIVE Trai…  2.23e- 2  30.2   20.8   -Inf   Inf   1     1      0.0482 
##  7 ATM3  SNAIVE Trai…  7.35e- 1   8.04   0.735  100   100   1.000 1.00   0.640  
##  8 ATM4  SNAIVE Trai… -3.70e+ 0 452.   346.    -392.  447.  1     1      0.0598 
##  9 ATM1  DRIFT  Trai… -1.56e-16  49.9   37.2   -132.  167.  2.10  1.80  -0.356  
## 10 ATM2  DRIFT  Trai…  4.59e-15  54.2   42.5   -Inf   Inf   2.05  1.79  -0.307  
## 11 ATM3  DRIFT  Trai…  1.79e-15   5.08   0.541 -Inf   Inf   0.737 0.632 -0.149  
## 12 ATM4  DRIFT  Trai… -2.81e-15 477.   386.    -487.  546.  1.12  1.05  -0.464  
## 13 ATM1  ETS    Trai… -4.82e- 2  23.8   15.1   -107.  122.  0.852 0.857  0.144  
## 14 ATM2  ETS    Trai… -6.89e- 1  25.1   17.9   -Inf   Inf   0.859 0.833  0.0199 
## 15 ATM3  ETS    Trai…  2.70e- 1   5.03   0.273  Inf   Inf   0.371 0.625 -0.00736
## 16 ATM4  ETS    Trai… -7.31e+ 0 329.   264.    -520.  550.  0.763 0.726  0.100  
## 17 ATM1  ARIMA  Trai… -7.42e- 2  23.3   14.5   -102.  117.  0.822 0.838 -0.00872
## 18 ATM2  ARIMA  Trai… -8.91e- 1  24.1   17.0   -Inf   Inf   0.820 0.799 -0.00428
## 19 ATM3  ARIMA  Trai…  2.71e- 1   5.03   0.271   34.6  34.6 0.370 0.625  0.0124 
## 20 ATM4  ARIMA  Trai… -3.47e- 2 340.   280.    -509.  541.  0.809 0.750  0.00464

atm_model_compare <- atm_accuracy_train |>
  select(ATM, .model, RMSE, MAE, MAPE)

atm_model_compare

## # A tibble: 20 × 5
##    ATM   .model   RMSE     MAE  MAPE
##    <fct> <chr>   <dbl>   <dbl> <dbl>
##  1 ATM1  NAIVE   49.9   37.2   167. 
##  2 ATM2  NAIVE   54.2   42.5   Inf  
##  3 ATM3  NAIVE    5.09   0.310  40.2
##  4 ATM4  NAIVE  477.   386.    547. 
##  5 ATM1  SNAIVE  27.7   17.7   116. 
##  6 ATM2  SNAIVE  30.2   20.8   Inf  
##  7 ATM3  SNAIVE   8.04   0.735 100  
##  8 ATM4  SNAIVE 452.   346.    447. 
##  9 ATM1  DRIFT   49.9   37.2   167. 
## 10 ATM2  DRIFT   54.2   42.5   Inf  
## 11 ATM3  DRIFT    5.08   0.541 Inf  
## 12 ATM4  DRIFT  477.   386.    546. 
## 13 ATM1  ETS     23.8   15.1   122. 
## 14 ATM2  ETS     25.1   17.9   Inf  
## 15 ATM3  ETS      5.03   0.273 Inf  
## 16 ATM4  ETS    329.   264.    550. 
## 17 ATM1  ARIMA   23.3   14.5   117. 
## 18 ATM2  ARIMA   24.1   17.0   Inf  
## 19 ATM3  ARIMA    5.03   0.271  34.6
## 20 ATM4  ARIMA  340.   280.    541.

atm_model_compare2 <- atm_model_compare |>
  group_by(ATM) |>
  arrange(RMSE, .by_group = TRUE)

atm_model_compare2

## # A tibble: 20 × 5
## # Groups:   ATM [4]
##    ATM   .model   RMSE     MAE  MAPE
##    <fct> <chr>   <dbl>   <dbl> <dbl>
##  1 ATM1  ARIMA   23.3   14.5   117. 
##  2 ATM1  ETS     23.8   15.1   122. 
##  3 ATM1  SNAIVE  27.7   17.7   116. 
##  4 ATM1  DRIFT   49.9   37.2   167. 
##  5 ATM1  NAIVE   49.9   37.2   167. 
##  6 ATM2  ARIMA   24.1   17.0   Inf  
##  7 ATM2  ETS     25.1   17.9   Inf  
##  8 ATM2  SNAIVE  30.2   20.8   Inf  
##  9 ATM2  DRIFT   54.2   42.5   Inf  
## 10 ATM2  NAIVE   54.2   42.5   Inf  
## 11 ATM3  ARIMA    5.03   0.271  34.6
## 12 ATM3  ETS      5.03   0.273 Inf  
## 13 ATM3  DRIFT    5.08   0.541 Inf  
## 14 ATM3  NAIVE    5.09   0.310  40.2
## 15 ATM3  SNAIVE   8.04   0.735 100  
## 16 ATM4  ETS    329.   264.    550. 
## 17 ATM4  ARIMA  340.   280.    541. 
## 18 ATM4  SNAIVE 452.   346.    447. 
## 19 ATM4  DRIFT  477.   386.    546. 
## 20 ATM4  NAIVE  477.   386.    547.

atm_ets |> filter(ATM == "ATM1") |> gg_tsresiduals()

## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

atm_ets |> filter(ATM == "ATM2") |> gg_tsresiduals()

atm_ets |> filter(ATM == "ATM3") |> gg_tsresiduals()

atm_ets |> filter(ATM == "ATM4") |> gg_tsresiduals()

atm_arima |> filter(ATM == "ATM1") |> gg_tsresiduals()

atm_arima |> filter(ATM == "ATM2") |> gg_tsresiduals()

atm_arima |> filter(ATM == "ATM3") |> gg_tsresiduals()

atm_arima |> filter(ATM == "ATM4") |> gg_tsresiduals()

atm_fc_naive <- atm_naive |>
  forecast(h = 31)

atm_fc_snaive <- atm_snaive |>
  forecast(h = 31)

atm_fc_drift <- atm_drift |>
  forecast(h = 31)

atm_fc_ets <- atm_ets |>
  forecast(h = 31)

atm_fc_arima <- atm_arima |>
  forecast(h = 31)

augment(atm_arima) |>
  features(.innov, ljung_box, lag = 14, dof = 0)

## # A tibble: 4 × 4
##   ATM   .model lb_stat lb_pvalue
##   <fct> <chr>    <dbl>     <dbl>
## 1 ATM1  ARIMA   15.2       0.361
## 2 ATM2  ARIMA   10.2       0.745
## 3 ATM3  ARIMA    0.132     1.000
## 4 ATM4  ARIMA    8.64      0.854

atm_train |>
  ACF(Cash) |>
  autoplot() +
  facet_wrap(~ ATM) +
  labs(
    title = "Autocorrelation of Daily Cash Withdrawals by ATM",
    x = "Lag",
    y = "ACF"
  )

atm_fc_arima <- atm_arima |>
  forecast(h = 31, level = c(80, 95))

autoplot(atm_fc_arima, atm_train) +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(
    title = "ARIMA Forecasts for May 2010",
    subtitle = "80% and 95% prediction intervals",
    x = "Date",
    y = "Cash Withdrawals (Hundreds of Dollars)"
  )

atm_fc_ets <- atm_ets |>
  forecast(h = 31, level = c(80, 95))

autoplot(atm_fc_ets, atm_train) +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(
    title = "ETS Forecasts for May 2010",
    subtitle = "80% and 95% prediction intervals",
    x = "Date",
    y = "Cash Withdrawals (Hundreds of Dollars)"
  )

#Excel file output csv
atm_fc_ets <- atm_ets |>
  forecast(h = 31, level = 95)

atm_fc_arima <- atm_arima |>
  forecast(h = 31, level = 95)

atm_final_forecast <- bind_rows(
  atm_fc_arima |> filter(ATM == "ATM1"),
  atm_fc_arima |> filter(ATM == "ATM2"),
  atm_fc_arima |> filter(ATM == "ATM3"),
  atm_fc_ets   |> filter(ATM == "ATM4")
)

atm_forecast_export <- atm_final_forecast |>
  hilo(level = 95) |>
  unpack_hilo(`95%`) |>
  as_tibble() |>
  rename(
    point_forecast = .mean,
    lo_95 = `95%_lower`,
    hi_95 = `95%_upper`
  )


write.csv(atm_forecast_export, "PartA_ATM_Forecast_Output.csv", row.names = FALSE)

cat("CSV saved to:", file.path(getwd(), "PartA_ATM_Forecast_Output.csv"))

## CSV saved to: /Users/luishernandez/Desktop/MSDS/DATA624/PartA_ATM_Forecast_Output.csv

Part B Forecasting Power

#Import 
power_main <- read_excel("~/Downloads/ResidentialCustomerForecastLoad-624.xlsx")

glimpse(power_main)

## Rows: 192
## Columns: 3
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ `YYYY-MMM`   <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ KWH          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…

# Clean and convert to monthly tsibble
power_cleaned <- power_main |>
  mutate(
    Month = yearmonth(`YYYY-MMM`),
    KWH = as.numeric(KWH)
  ) |>
  select(Month, KWH) |>
  arrange(Month)

power_tsibble <- power_cleaned |>
  as_tsibble(index = Month)

# Check for gaps 
print(power_tsibble)

## # A tsibble: 192 x 2 [1M]
##       Month     KWH
##       <mth>   <dbl>
##  1 1998 Jan 6862583
##  2 1998 Feb 5838198
##  3 1998 Mar 5420658
##  4 1998 Apr 5010364
##  5 1998 May 4665377
##  6 1998 Jun 6467147
##  7 1998 Jul 8914755
##  8 1998 Aug 8607428
##  9 1998 Sep 6989888
## 10 1998 Oct 6345620
## # ℹ 182 more rows

has_gaps(power_tsibble)

## # A tibble: 1 × 1
##   .gaps
##   <lgl>
## 1 FALSE

# Check for missing values
colSums(is.na(power_tsibble))

## Month   KWH 
##     0     1

# Plot 
autoplot(power_tsibble, KWH) +
  labs(
    title = "Monthly Residential Power Usage",
    x = "Year",
    y = "KWH"
  )

#Seasonal diagnostics
gg_season(power_tsibble, KWH) +
  labs(
    title = "Seasonal Plot of Monthly Power Usage",
    y = "KWH"
  )

## Warning: `gg_season()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_season()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

gg_subseries(power_tsibble, KWH) +
  labs(
    title = "Seasonal Subseries Plot of Monthly Power Usage",
    y = "KWH"
  )

## Warning: `gg_subseries()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_subseries()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

The seasonal plot confirms a stable yearly pattern. Power usage is normally highest in January and the later summer months like August, while spring months such as April and May tend to be lower. Most yearly curves follow a similar shape, indicating that seasonality is a major feature of the series. There is one extremely low July observation deviates substantially from the seasonal pattern but we leave this outlier for our observation and usefulness as a part of the pattern.

The seasonal subseries plot supports the presence of strong month-of-year effects. January, August, and September have high average, opposed to May and November. July seems to be a high-usage month, but its strongly effected by one extreme low data observation in 2010.

#Find missing value and replace with linear interpolation method
power_tsibble |>
  filter(is.na(KWH))

## # A tsibble: 1 x 2 [1M]
##      Month   KWH
##      <mth> <dbl>
## 1 2008 Sep    NA

power_tsibble <- power_tsibble |>
  mutate(KWH = na.approx(KWH, na.rm = FALSE))


#STL decomp
power_decomp <- power_tsibble |>
  model(STL(KWH))

components(power_decomp) |>
  autoplot() +
  labs(
    title = "STL Decomposition of Monthly Power Usage"
  )

# You can also inspect the minimum value month
power_tsibble |>
  filter(KWH == min(KWH, na.rm = TRUE))

## # A tsibble: 1 x 2 [1M]
##      Month    KWH
##      <mth>  <dbl>
## 1 2010 Jul 770523

#Fit benchmark and models
power_models <- power_tsibble |>
  model(
    mean_model   = MEAN(KWH),
    naive_model  = NAIVE(KWH),
    snaive_model = SNAIVE(KWH),
    ets_model    = ETS(KWH),
    arima_model  = ARIMA(KWH)
  )

accuracy(power_models)

## # A tibble: 5 × 10
##   .model       .type           ME     RMSE    MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>        <chr>        <dbl>    <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 mean_model   Training -4.66e-10 1440020. 1.18e6 -7.81  22.1 1.70  1.22  0.419 
## 2 naive_model  Training  1.44e+ 4 1539355. 1.20e6 -5.67  21.9 1.72  1.31  0.0270
## 3 snaive_model Training  9.42e+ 4 1178104. 6.97e5 -3.93  14.6 1.00  1     0.231 
## 4 ets_model    Training  6.10e+ 4  835107. 5.04e5 -4.39  12.0 0.723 0.709 0.170 
## 5 arima_model  Training -2.51e+ 4  827254. 4.93e5 -5.51  11.7 0.708 0.702 0.0128

# Reports for each model
power_models |> select(mean_model)   |> report()

## Series: KWH 
## Model: MEAN 
## 
## Mean: 6502823.5208 
## sigma^2: 2084513846667.07

power_models |> select(naive_model)  |> report()

## Series: KWH 
## Model: NAIVE 
## 
## sigma^2: 2381879261976.21

power_models |> select(snaive_model) |> report()

## Series: KWH 
## Model: SNAIVE 
## 
## sigma^2: 1386751915026.14

power_models |> select(ets_model)    |> report()

## Series: KWH 
## Model: ETS(M,N,M) 
##   Smoothing parameters:
##     alpha = 0.1137195 
##     gamma = 0.0001001086 
## 
##   Initial states:
##     l[0]      s[0]     s[-1]     s[-2]    s[-3]    s[-4]    s[-5]     s[-6]
##  6134773 0.9547463 0.7533319 0.8602685 1.191176 1.255577 1.199206 0.9982427
##      s[-7]     s[-8]     s[-9]   s[-10]   s[-11]
##  0.7683657 0.8160779 0.9174797 1.061846 1.223683
## 
##   sigma^2:  0.0144
## 
##      AIC     AICc      BIC 
## 6224.057 6226.784 6272.919

power_models |> select(arima_model)  |> report()

## Series: KWH 
## Model: ARIMA(0,0,1)(1,1,1)[12] w/ drift 
## 
## Coefficients:
##          ma1     sar1     sma1   constant
##       0.2439  -0.1518  -0.7311  105259.65
## s.e.  0.0723   0.0963   0.0821   27000.74
## 
## sigma^2 estimated as 7.466e+11:  log likelihood=-2719.89
## AIC=5449.79   AICc=5450.13   BIC=5465.75

# Residual diagnostics for each model
power_models |> select(mean_model)   |> gg_tsresiduals()

power_models |> select(naive_model)  |> gg_tsresiduals()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_rug()`).

power_models |> select(snaive_model) |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_rug()`).

power_models |> select(ets_model)    |> gg_tsresiduals()

power_models |> select(arima_model)  |> gg_tsresiduals()

The mean and naive models performed poorly as they do not account for the strong seasonality in the power series. The seasonal naive model was substantially better and that shows that annual seasonality is an important feature of the data. The ETS and ARIMA provided much better fits than the benchmark models. Among all methods, ARIMA achieved the lowest RMSE, MAE, and MAPE among others indicating the best overall forecasting performance on our training data.

The STL decomposition separates the trend, seasonal, and remainder components. The seasonal component is strong and confirms that annual seasonality is the dominant structure in the data. The trend component changes gradually, but the remainder component contains one very large negative spike around 2010. This indicates that the extreme drop is not explained by the usual seasonal or trend behavior and is better interpreted as an outlier or irregular shock.

The residuals support the choice of ARIMA. The mean shows a large amount of autocorrelation and structure in the residuals but the seasonal naive model performs better. ETS improves the residuals behavior but ARIMA produces the cleanest residual pattern overall, with residual autocorrelation closest to zero. Although all models are affected by the extreme outlier around 2010, ARIMA appears to capture the underlying seasonal dependence. ARIMA achieved the lowest RMSE 827,254, MAE 493,306, and MAPE 11.68, along with the smallest residual autocorrelation 0.01284. For this reason ARIMA was selected as the final model with ETS as a close second.

# Ljung-Box test for residual autocorrelation
augment(power_models) |>
  features(.innov, ljung_box, lag = 24, dof = 0)

## # A tibble: 5 × 3
##   .model       lb_stat     lb_pvalue
##   <chr>          <dbl>         <dbl>
## 1 arima_model     14.2 0.942        
## 2 ets_model       28.6 0.235        
## 3 mean_model     544.  0            
## 4 naive_model    425.  0            
## 5 snaive_model    86.5 0.00000000537

# Forecast 12 months ahead for 2014
power_fc <- power_models |>
  forecast(h = "12 months")

# Plot forecasts
autoplot(power_tsibble, KWH) +
  autolayer(power_fc, level = 95) +
  labs(
    title = "Monthly Power Usage Forecast for 2014",
    x = "Year",
    y = "KWH"
  )

# Forecast table
power_fc

## # A fable: 60 x 4 [1M]
## # Key:     .model [5]
##    .model        Month
##    <chr>         <mth>
##  1 mean_model 2014 Jan
##  2 mean_model 2014 Feb
##  3 mean_model 2014 Mar
##  4 mean_model 2014 Apr
##  5 mean_model 2014 May
##  6 mean_model 2014 Jun
##  7 mean_model 2014 Jul
##  8 mean_model 2014 Aug
##  9 mean_model 2014 Sep
## 10 mean_model 2014 Oct
## # ℹ 50 more rows
## # ℹ 2 more variables: KWH <dist>, .mean <dbl>

# More readable forecast table
power_fc_table <- power_fc |>
  as_tibble() |>
  select(.model, Month, .mean)

print(power_fc_table)

## # A tibble: 60 × 3
##    .model        Month    .mean
##    <chr>         <mth>    <dbl>
##  1 mean_model 2014 Jan 6502824.
##  2 mean_model 2014 Feb 6502824.
##  3 mean_model 2014 Mar 6502824.
##  4 mean_model 2014 Apr 6502824.
##  5 mean_model 2014 May 6502824.
##  6 mean_model 2014 Jun 6502824.
##  7 mean_model 2014 Jul 6502824.
##  8 mean_model 2014 Aug 6502824.
##  9 mean_model 2014 Sep 6502824.
## 10 mean_model 2014 Oct 6502824.
## # ℹ 50 more rows

#Excel csv output

power_fc_final <- power_models |>
  select(arima_model) |>
  forecast(h = 12, level = 95)

power_forecast_export <- power_fc_final |>
  hilo(level = 95) |>
  unpack_hilo(`95%`) |>
  as_tibble() |>
  rename(
    point_forecast = .mean,
    lo_95 = `95%_lower`,
    hi_95 = `95%_upper`
  )

write.csv(power_forecast_export, "PartB_Power_Forecast_Output.csv",row.names = FALSE)

Part C

#Read the Excel files
pipe1_raw <- read_excel("~/Downloads/Waterflow_Pipe1.xlsx")
pipe2_raw <- read_excel("~/Downloads/Waterflow_Pipe2.xlsx")

glimpse(pipe1_raw)

## Rows: 1,000
## Columns: 2
## $ `Date Time` <dbl> 42300.02, 42300.03, 42300.04, 42300.04, 42300.06, 42300.06…
## $ WaterFlow   <dbl> 23.369599, 28.002881, 23.065895, 29.972809, 5.997953, 15.9…

glimpse(pipe2_raw)

## Rows: 1,000
## Columns: 2
## $ `Date Time` <dbl> 42300.04, 42300.08, 42300.12, 42300.17, 42300.21, 42300.25…
## $ WaterFlow   <dbl> 18.810791, 43.087025, 37.987705, 36.120379, 31.851259, 28.…

# Clean data
pipe1_raw <- pipe1_raw |>
  rename(Date_Time = `Date Time`) |>
  mutate(
    Date_Time = as.POSIXct(Date_Time * 86400, origin = "1899-12-30", tz = "UTC"),
    WaterFlow = as.numeric(WaterFlow)
  )

pipe2_raw <- pipe2_raw |>
  rename(Date_Time = `Date Time`) |>
  mutate(
    Date_Time = as.POSIXct(Date_Time * 86400, origin = "1899-12-30", tz = "UTC"),
    WaterFlow = as.numeric(WaterFlow)
  )

# Check conversion worked Remove rows with missing datetime or WaterFlow
summary(pipe1_raw$Date_Time)

##                  Min.               1st Qu.                Median 
## "2015-10-23 00:24:06" "2015-10-25 11:21:35" "2015-10-27 20:07:30" 
##                  Mean               3rd Qu.                  Max. 
## "2015-10-27 20:49:15" "2015-10-30 08:24:51" "2015-11-01 23:35:43"

summary(pipe2_raw$Date_Time)

##                  Min.               1st Qu.                Median 
## "2015-10-23 01:00:00" "2015-11-02 10:45:00" "2015-11-12 20:30:00" 
##                  Mean               3rd Qu.                  Max. 
## "2015-11-12 20:30:00" "2015-11-23 06:15:00" "2015-12-03 16:00:00"

head(pipe1_raw)

## # A tibble: 6 × 2
##   Date_Time           WaterFlow
##   <dttm>                  <dbl>
## 1 2015-10-23 00:24:06     23.4 
## 2 2015-10-23 00:40:02     28.0 
## 3 2015-10-23 00:53:51     23.1 
## 4 2015-10-23 00:55:40     30.0 
## 5 2015-10-23 01:19:17      6.00
## 6 2015-10-23 01:23:58     15.9

head(pipe2_raw)

## # A tibble: 6 × 2
##   Date_Time           WaterFlow
##   <dttm>                  <dbl>
## 1 2015-10-23 01:00:00      18.8
## 2 2015-10-23 01:59:59      43.1
## 3 2015-10-23 03:00:00      38.0
## 4 2015-10-23 04:00:00      36.1
## 5 2015-10-23 04:59:59      31.9
## 6 2015-10-23 06:00:00      28.2

colSums(is.na(pipe1_raw))

## Date_Time WaterFlow 
##         0         0

colSums(is.na(pipe2_raw))

## Date_Time WaterFlow 
##         0         0

pipe1_raw <- pipe1_raw |>
  filter(!is.na(Date_Time), !is.na(WaterFlow))

pipe2_raw <- pipe2_raw |>
  filter(!is.na(Date_Time), !is.na(WaterFlow))


# Full hourly sequences

pipe1_hourly <- pipe1_raw |>
  mutate(Hour = floor_date(Date_Time, unit = "hour")) |>
  group_by(Hour) |>
  summarise(
    WaterFlow = mean(WaterFlow, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(Hour)

pipe2_hourly <- pipe2_raw |>
  mutate(Hour = floor_date(Date_Time, unit = "hour")) |>
  group_by(Hour) |>
  summarise(
    WaterFlow = mean(WaterFlow, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(Hour)


pipe1_full <- tibble(
  Hour = seq(
    from = min(pipe1_hourly$Hour),
    to = max(pipe1_hourly$Hour),
    by = "hour"
  )
) |>
  left_join(pipe1_hourly, by = "Hour")

pipe2_full <- tibble(
  Hour = seq(
    from = min(pipe2_hourly$Hour),
    to = max(pipe2_hourly$Hour),
    by = "hour"
  )
) |>
  left_join(pipe2_hourly, by = "Hour")

# Find missing hour values

pipe1_missing_hours <- pipe1_full |>
  filter(is.na(WaterFlow))

pipe2_missing_hours <- pipe2_full |>
  filter(is.na(WaterFlow))

pipe1_missing_hours

## # A tibble: 4 × 2
##   Hour                WaterFlow
##   <dttm>                  <dbl>
## 1 2015-10-27 17:00:00        NA
## 2 2015-10-28 00:00:00        NA
## 3 2015-11-01 05:00:00        NA
## 4 2015-11-01 09:00:00        NA

pipe2_missing_hours

## # A tibble: 333 × 2
##    Hour                WaterFlow
##    <dttm>                  <dbl>
##  1 2015-10-23 02:00:00        NA
##  2 2015-10-23 05:00:00        NA
##  3 2015-10-23 08:00:00        NA
##  4 2015-10-23 11:00:00        NA
##  5 2015-10-23 14:00:00        NA
##  6 2015-10-23 17:00:00        NA
##  7 2015-10-23 20:00:00        NA
##  8 2015-10-23 23:00:00        NA
##  9 2015-10-24 02:00:00        NA
## 10 2015-10-24 05:00:00        NA
## # ℹ 323 more rows

# Convert to tsibble and Combine

pipe1_ts <- pipe1_full |>
  as_tsibble(index = Hour)

pipe2_ts <- pipe2_full |>
  as_tsibble(index = Hour)

pipe1_combined <- pipe1_ts |>
  as_tibble() |>
  mutate(Pipe = "Pipe1")

pipe2_combined <- pipe2_ts |>
  as_tibble() |>
  mutate(Pipe = "Pipe2")

waterflow_combined <- bind_rows(pipe1_combined, pipe2_combined) |>
  as_tsibble(key = Pipe, index = Hour)

#Summarized for each pipe
pipe1_summary <- pipe1_ts |>
  summarise(
    start_time = min(Hour),
    end_time   = max(Hour),
    n_hours    = n(),
    missing    = sum(is.na(WaterFlow)),
    mean_flow  = mean(WaterFlow, na.rm = TRUE),
    sd_flow    = sd(WaterFlow, na.rm = TRUE),
    min_flow   = min(WaterFlow, na.rm = TRUE),
    max_flow   = max(WaterFlow, na.rm = TRUE)
  )

## Warning: There were 8 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `min_flow = min(WaterFlow, na.rm = TRUE)`.
## ℹ In group 114: `Hour = 2015-10-27 17:00:00`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 7 remaining warnings.

pipe2_summary <- pipe2_ts |>
  summarise(
    start_time = min(Hour),
    end_time   = max(Hour),
    n_hours    = n(),
    missing    = sum(is.na(WaterFlow)),
    mean_flow  = mean(WaterFlow, na.rm = TRUE),
    sd_flow    = sd(WaterFlow, na.rm = TRUE),
    min_flow   = min(WaterFlow, na.rm = TRUE),
    max_flow   = max(WaterFlow, na.rm = TRUE)
  )

## Warning: There were 666 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `min_flow = min(WaterFlow, na.rm = TRUE)`.
## ℹ In group 2: `Hour = 2015-10-23 02:00:00`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 665 remaining warnings.

pipe1_summary

## # A tsibble: 240 x 9 [1h] <UTC>
##    Hour                start_time          end_time            n_hours missing
##    <dttm>              <dttm>              <dttm>                <int>   <int>
##  1 2015-10-23 00:00:00 2015-10-23 00:00:00 2015-10-23 00:00:00       1       0
##  2 2015-10-23 01:00:00 2015-10-23 01:00:00 2015-10-23 01:00:00       1       0
##  3 2015-10-23 02:00:00 2015-10-23 02:00:00 2015-10-23 02:00:00       1       0
##  4 2015-10-23 03:00:00 2015-10-23 03:00:00 2015-10-23 03:00:00       1       0
##  5 2015-10-23 04:00:00 2015-10-23 04:00:00 2015-10-23 04:00:00       1       0
##  6 2015-10-23 05:00:00 2015-10-23 05:00:00 2015-10-23 05:00:00       1       0
##  7 2015-10-23 06:00:00 2015-10-23 06:00:00 2015-10-23 06:00:00       1       0
##  8 2015-10-23 07:00:00 2015-10-23 07:00:00 2015-10-23 07:00:00       1       0
##  9 2015-10-23 08:00:00 2015-10-23 08:00:00 2015-10-23 08:00:00       1       0
## 10 2015-10-23 09:00:00 2015-10-23 09:00:00 2015-10-23 09:00:00       1       0
## # ℹ 230 more rows
## # ℹ 4 more variables: mean_flow <dbl>, sd_flow <dbl>, min_flow <dbl>,
## #   max_flow <dbl>

pipe2_summary

## # A tsibble: 1,000 x 9 [1h] <UTC>
##    Hour                start_time          end_time            n_hours missing
##    <dttm>              <dttm>              <dttm>                <int>   <int>
##  1 2015-10-23 01:00:00 2015-10-23 01:00:00 2015-10-23 01:00:00       1       0
##  2 2015-10-23 02:00:00 2015-10-23 02:00:00 2015-10-23 02:00:00       1       1
##  3 2015-10-23 03:00:00 2015-10-23 03:00:00 2015-10-23 03:00:00       1       0
##  4 2015-10-23 04:00:00 2015-10-23 04:00:00 2015-10-23 04:00:00       1       0
##  5 2015-10-23 05:00:00 2015-10-23 05:00:00 2015-10-23 05:00:00       1       1
##  6 2015-10-23 06:00:00 2015-10-23 06:00:00 2015-10-23 06:00:00       1       0
##  7 2015-10-23 07:00:00 2015-10-23 07:00:00 2015-10-23 07:00:00       1       0
##  8 2015-10-23 08:00:00 2015-10-23 08:00:00 2015-10-23 08:00:00       1       1
##  9 2015-10-23 09:00:00 2015-10-23 09:00:00 2015-10-23 09:00:00       1       0
## 10 2015-10-23 10:00:00 2015-10-23 10:00:00 2015-10-23 10:00:00       1       0
## # ℹ 990 more rows
## # ℹ 4 more variables: mean_flow <dbl>, sd_flow <dbl>, min_flow <dbl>,
## #   max_flow <dbl>

#autoplots hourly
autoplot(pipe1_ts, WaterFlow) +
  labs(
    title = "Hourly Water Flow - Pipe 1",
    x = "Hour",
    y = "Mean Water Flow"
  )

autoplot(pipe2_ts, WaterFlow) +
  labs(
    title = "Hourly Water Flow - Pipe 2",
    x = "Hour",
    y = "Mean Water Flow"
  )

autoplot(waterflow_combined, WaterFlow) +
  facet_wrap(~ Pipe, scales = "free_y") +
  labs(
    title = "Hourly Water Flow for Both Pipes",
    x = "Hour",
    y = "Mean Water Flow"
  )

# Export to csv
write.csv(pipe1_full, "pipe1_hourly_cleaned.csv", row.names = FALSE)
write.csv(pipe2_full, "pipe2_hourly_cleaned.csv", row.names = FALSE)
write.csv(waterflow_combined, "waterflow_hourly_combined.csv", row.names = FALSE)

The time plots do not show a strong trend and both series appear roughly stable in level. Pipe 1 is short and has a lot of noise for the cereation of a 1 week forecast. Pipe 2 has missing hourly timestamps so its not a clean regular hourly series and must be regularized before standard forecasting methods are applied. I would use a method of interpolation here due to the variance for hourly data I think this is the strongest option to completely prepare this data for forecasting.

Project 1 Data 624

2026-03-20