PROJECT1_DATA624

library(fpp3)
library(readxl)
library(patchwork)
library(purrr)
library(fable.prophet)
library(httr)
library(writexl)

Data preparation and EDA of the series

First, I imported the ATM data and converted it into a time series format so I could look at each ATM separately over time. I also created some calendar-based variables such as day of week, month start, month end, and payday window. I did this because ATM withdrawals are usually not fully random. People often take out cash in repeated weekly patterns, and sometimes at the start or end of the month, so these variables could help explain some of the variation in the data.

url <- "https://github.com/farhodibr/CUNY-SPS-MSDS/raw/main/DATA624/PROJECT1/ATM624Data.xlsx"

dest_file <- tempfile(fileext = ".xlsx")
GET(url, write_disk(dest_file, overwrite = TRUE))

## Response [https://raw.githubusercontent.com/farhodibr/CUNY-SPS-MSDS/main/DATA624/PROJECT1/ATM624Data.xlsx]
##   Date: 2026-03-30 04:35
##   Status: 200
##   Content-Type: application/octet-stream
##   Size: 40.8 kB
## <ON DISK>  C:\Users\farho\AppData\Local\Temp\RtmpeI8NfO\file4288383a4a3d.xlsx

atm_total <- read_excel(dest_file, 
                        col_types = c('date', 'text', 'numeric'))

head(atm_total)

## # A tibble: 6 × 3
##   DATE                ATM    Cash
##   <dttm>              <chr> <dbl>
## 1 2009-05-01 00:00:00 ATM1     96
## 2 2009-05-01 00:00:00 ATM2    107
## 3 2009-05-02 00:00:00 ATM1     82
## 4 2009-05-02 00:00:00 ATM2     89
## 5 2009-05-03 00:00:00 ATM1     85
## 6 2009-05-03 00:00:00 ATM2     90

# atm_total <- read_excel("https://github.com/farhodibr/CUNY-SPS-MSDS/blob/main/DATA624/PROJECT1/ATM624Data.xlsx",
#                         col_types = c('date', 'text', 'numeric'))

atm_total

## # A tibble: 1,474 × 3
##    DATE                ATM    Cash
##    <dttm>              <chr> <dbl>
##  1 2009-05-01 00:00:00 ATM1     96
##  2 2009-05-01 00:00:00 ATM2    107
##  3 2009-05-02 00:00:00 ATM1     82
##  4 2009-05-02 00:00:00 ATM2     89
##  5 2009-05-03 00:00:00 ATM1     85
##  6 2009-05-03 00:00:00 ATM2     90
##  7 2009-05-04 00:00:00 ATM1     90
##  8 2009-05-04 00:00:00 ATM2     55
##  9 2009-05-05 00:00:00 ATM1     99
## 10 2009-05-05 00:00:00 ATM2     79
## # ℹ 1,464 more rows

atm_ts <- atm_total |>
  mutate(DATE = as_date(DATE)) |>
  as_tsibble(index = DATE, key = ATM)
atm_ts

## # A tsibble: 1,474 x 3 [1D]
## # Key:       ATM [5]
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-05-01 ATM1     96
##  2 2009-05-02 ATM1     82
##  3 2009-05-03 ATM1     85
##  4 2009-05-04 ATM1     90
##  5 2009-05-05 ATM1     99
##  6 2009-05-06 ATM1     88
##  7 2009-05-07 ATM1      8
##  8 2009-05-08 ATM1    104
##  9 2009-05-09 ATM1     87
## 10 2009-05-10 ATM1     93
## # ℹ 1,464 more rows

atm_ts |> 
  autoplot(Cash)+
  facet_wrap(~ATM, ncol = 1, scale = 'free_y')

After that, I checked the series for missing dates and gaps. This step was important because time series models assume the observations are in correct time order and that the time index is consistent. If there are gaps, the model can misread the structure of the series and give weaker forecasts. In this case, after the imputation step, all four ATM series had no gaps, so the data was ready for modeling.

atm_ts |> 
  filter(is.na(Cash))

## # A tsibble: 19 x 3 [1D]
## # Key:       ATM [3]
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-06-13 ATM1     NA
##  2 2009-06-16 ATM1     NA
##  3 2009-06-22 ATM1     NA
##  4 2009-06-18 ATM2     NA
##  5 2009-06-24 ATM2     NA
##  6 2010-05-01 <NA>     NA
##  7 2010-05-02 <NA>     NA
##  8 2010-05-03 <NA>     NA
##  9 2010-05-04 <NA>     NA
## 10 2010-05-05 <NA>     NA
## 11 2010-05-06 <NA>     NA
## 12 2010-05-07 <NA>     NA
## 13 2010-05-08 <NA>     NA
## 14 2010-05-09 <NA>     NA
## 15 2010-05-10 <NA>     NA
## 16 2010-05-11 <NA>     NA
## 17 2010-05-12 <NA>     NA
## 18 2010-05-13 <NA>     NA
## 19 2010-05-14 <NA>     NA

As we see there are 19 observations with NA values:

ATM1 has 3 missing Cash values
ATM2 has 2 missing Cashvalues
and there are 16 observations with missing ATM and Cash values for May of 2010

Since the goal of this analysis is to forecast ATM withdrawals for May of 2010 I will remove rows for corresponding dates with missing values.

atm_ts <- atm_ts |> 
  filter(DATE < as.Date('2010-05-01'))
atm_ts |> 
  filter(is.na(Cash))

## # A tsibble: 5 x 3 [1D]
## # Key:       ATM [2]
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-06-13 ATM1     NA
## 2 2009-06-16 ATM1     NA
## 3 2009-06-22 ATM1     NA
## 4 2009-06-18 ATM2     NA
## 5 2009-06-24 ATM2     NA

atm_ts |> 
  has_gaps()

## # A tibble: 4 × 2
##   ATM   .gaps
##   <chr> <lgl>
## 1 ATM1  FALSE
## 2 ATM2  FALSE
## 3 ATM3  FALSE
## 4 ATM4  FALSE

As we see there are no missing dates for each of the series.

To impute missing Cash values in ATM1 and ATM2 series i will use ARIMA model to forecast and impute the missing values. The reason i choose ARIMA is to prevent seasonality and structure of the data. I think ARIMA is better approach than simple linear interpolation.

fit_impute_nas_arima <- atm_ts |> 
  model(arima_nas = ARIMA(Cash))

atm_imputed <-  fit_impute_nas_arima |> 
  interpolate(atm_ts)

atm_imputed |> 
  filter(is.na(Cash))

## # A tsibble: 0 x 3 [?]
## # Key:       ATM [0]
## # ℹ 3 variables: ATM <chr>, DATE <date>, Cash <dbl>

atm_imputed |>
  semi_join(
    atm_ts |> filter(is.na(Cash)),
    by = c("DATE", "ATM")
  )

## # A tsibble: 5 x 3 [1D]
## # Key:       ATM [2]
##   ATM   DATE         Cash
##   <chr> <date>      <dbl>
## 1 ATM1  2009-06-13 111.  
## 2 ATM1  2009-06-16 124.  
## 3 ATM1  2009-06-22 100.  
## 4 ATM2  2009-06-18   9.42
## 5 ATM2  2009-06-24  29.4

atm_imputed |> 
  has_gaps()

## # A tibble: 4 × 2
##   ATM   .gaps
##   <chr> <lgl>
## 1 ATM1  FALSE
## 2 ATM2  FALSE
## 3 ATM3  FALSE
## 4 ATM4  FALSE

As we see there are no gaps in the series and missing values are imputed.

Here I created new variables to indicate dates as:

each day as day of the week with day_of_weekvariable
first 3 days of the month as is_month_startvariable
is_month_endindicates last 2-3 days as end of month

atm_ts <- atm_imputed |> 
  mutate(day_of_week = wday(DATE, label = TRUE),
         is_month_start = day(DATE) <= 3,
         is_month_end = day(DATE) >= 28)

Below I will check if unusual spike is an outlier at ATM4 series

atm_ts |>
  filter(ATM == "ATM4") |>
  filter(Cash > 2000)

## # A tsibble: 1 x 6 [1D]
## # Key:       ATM [1]
##   ATM   DATE         Cash day_of_week is_month_start is_month_end
##   <chr> <date>      <dbl> <ord>       <lgl>          <lgl>       
## 1 ATM4  2010-02-09 10920. Tue         FALSE          FALSE

Extreme outlier in ATM4 on 2010-02-09 where the cash amount was about 10,920, while the values around it are much smaller. Since the project states that cash is recorded in hundreds of dollars (and that actual value becomes $1,091,976), and because the surrounding values are around much lower values, this point looked much more like a data entry error than a real-world jump in withdrawals. I think it is sounds unrealistic for single ATM machine to have a supply of 1 millions of dollars on any given day.

Because of that, I adjusted it to 109.19 instead of leaving the inflated value in the series. I did this so one bad value would not distort the trend, seasonal pattern, variance, and model fit for ATM4.

atm_clean <- atm_ts |>
  mutate(
    Cash = if_else(
      ATM == "ATM4" &
      DATE == as.Date("2010-02-09"),
      109.19,
      Cash
    )
  )

atm_clean |> 
  autoplot(Cash)+
  facet_wrap(~ATM, ncol = 1, scale = 'free_y')

Now after imputing missing values and adjusting ATM4 outlier we see that the machines do not all behave the same way

ATM1 has a pattern of seasonality with large amounts of huge dips compared to few spikes
ATM2 has somewhat more regular mean-centered variation with presence of seasonality
ATM3 is mostly zeros with only a few actual transactions
and ATM4 had the largest scale and strongest fluctuations. There is no indications of seasonality for ATM4 series.

ATM1 series

For ATM1, I first plotted the raw series to understand the behavior over time. The graph suggested a repeating weekly pattern and fairly stable behavior, but also some seasonality that needed to be handled before fitting ARIMA-type models.

atm1 <- atm_ts |> 
  filter(ATM == 'ATM1')
p1_atm1_reg <- atm1 |> 
  autoplot(Cash) +
  labs(title = 'ATM1 series')
p1_atm1_reg

The graph suggested a repeating weekly pattern and fairly stable behavior, but also some seasonality that needed to be handled before fitting models. There is an evidence of pattern of repeating days with very low amounts of withdrawals which could be explained by less activities on the weekends and a location of this machine.
Few spikes of withdrawals can be results doesn’t seem to be extreme and could be caused by events or holidays on those days.
There is also a visible pattern of up and down trend in this series. As we see trend appear to increase towards September followed by decline in December and appear to stabilize after.

lambda_atm1 <- atm1 |> 
  features(Cash, guerrero) |> 
  pull(lambda_guerrero)

atm1 <- atm1 |> 
  mutate(Cash_bxcx = box_cox(Cash, lambda_atm1))
p2_atm1_bxcx <- atm1 |> 
  autoplot(Cash_bxcx)+
  labs(title = 'Boxcox transformed series')
p1_atm1_reg/p2_atm1_bxcx

I used a Box-Cox transformation to stabilize the variance and it appears to surpress spikes, but at the same time it created imbalance in variances if dips.
After debating if i should be using regular values or BoxCox transformed ones in modeling, I decided to use both and see how results will vary.

atm1 |> ACF(Cash, lag_max = 60) |> autoplot()

ACF plot proves my previous observation of presence of seasonality. As wee see there are strong significant autocorrelation in multiples of 7 (lags 7, 14, 21, 28 and so on).

There is also pattern of negative aoutocorrelations at lags 2, 9, 16, 23, 30 and 5, 12, 19, 26.

These confirms of weekly seasonality of this series.

Below I split the data into train and test sets.

atm1_train <- atm1 |> 
  filter(DATE < ymd('2010-04-01'))

lambda_atm1_train <- atm1_train |> 
  features(Cash, box_cox, feature =guerrero) |> 
  pull(lambda_guerrero)
lambda_atm1_train

## [1] 0.437283

p1_atm_train <- atm1_train |> 
  autoplot(Cash)
p2_atm_train_bxcx <- atm1_train |> 
  autoplot(box_cox(Cash, lambda_atm1_train))
p1_atm_train/p2_atm_train_bxcx

STL Decomposition

atm1 |> 
  model(
    STL(Cash_bxcx~ trend(window = 31) + season(window = 7))
  ) |> 
  components() |> 
  autoplot()

I decomposed the Box-Cox transformed cash data into trend, weekly seasonality, and remainder to better understand the structure.

Looking at the original series, there are clear repeating drops, so right away it shows strong seasonality. It’s not random at all.

As we see trend is not stable, it goes up a bit in mid-2009, then drops towards the end of 2009 and kind of flattens in 2010. I think more data needed to see if these trend pattern is consistent over the years.

The seasonality is the strongest part. There are very regular sharp dips, most likely weekends. This pattern is very consistent over time, except in February and March of 2010. As we see there is a decrease of variance, and it appear decomposition couldn’t catch full seasonality and it leaked into remainder.

The remainder is mostly around zero, but there are some big drops and few spikes. That tells that not everything is explained and these could be outliers, or unusual events or machine disfunction on those days.

atm1_train |>
  model(stl = STL(box_cox(Cash, lambda_atm1_train) ~ trend(window=30) + season(window = 7))) |>
  components() |>
  ggplot(aes(x = atm1_train$day_of_week, y = season_adjust)) +
  geom_boxplot() +
  labs(title = 'Boxplots of seasonally adjusted (STL) + \nBox-Cox transformed data by day of week for ATM1')

I plotted boxplots of the seasonally adjusted and Box-Cox transformed data by day of week to check on outliers.

These outliers are days where cash usage was much higher or lower than usual.

Likely reasons:

holidays, special events or paydays causing higher withdrawals
ATM or weather issues could cause very low values
possible data errors

So in my opinion outliers are not just noise, but could be real unusual events, especially noticeable low ones on Fridays.

I will leave these outliers as is for further analysis, but in real-world I would investigate what could cause them and fix them if needed.

Modelling

atm1_train |> 
  features(box_cox(Cash, lambda_atm1_train), unitroot_nsdiffs)

## # A tibble: 1 × 2
##   ATM   nsdiffs
##   <chr>   <int>
## 1 ATM1        1

atm1_train |> 
  features(difference(box_cox(Cash, lambda_atm1_train), 7), unitroot_ndiffs)

## # A tibble: 1 × 2
##   ATM   ndiffs
##   <chr>  <int>
## 1 ATM1       0

The unit root results showed that ATM1 needed one seasonal difference, but no additional regular difference after that. That means the main non-stationarity was seasonal rather than trend-based. Basically the pattern was repeating across weeks, so differencing at lag 7 made more sense than taking ordinary first differences.

atm1_train |> 
  gg_tsdisplay(difference(box_cox(Cash, lambda_atm1_train), 7),
               lag_max = 60,
               plot_type = 'partial')

From the plot, the series looks mostly centered around zero, with few spikes and dips related to previous discovered outliers from STL analysis.

In the ACF, there is a strong negative spike at lag 7, and this clearly shows weekly seasonality is present.

In the PACF, also a strong spike at lags of multiples of 7, and then it quickly fades out.

So this tells me:

data is mostly stationary now
strong weekly pattern still exists
not much additional structure beyond that

fit_atm1 <- atm1_train |> 
    model(
    stl_arima_periodic = decomposition_model(
      STL(box_cox(Cash, lambda_atm1_train) ~ trend(window = 21) 
          + season(window = "periodic"), robust = TRUE),
      ARIMA(season_adjust)
    ),
    stl_arima_adaptive = decomposition_model(
      STL(box_cox(Cash, lambda_atm1_train) ~ trend(window = 21) + season(window = 7)),
      ARIMA(season_adjust)
    ),
    arima_auto_bxcx = ARIMA(box_cox(Cash, lambda_atm1_train),
                       stepwise = FALSE,
                       approx = FALSE),
     arima_auto = ARIMA(Cash,
                       stepwise = FALSE,
                       approx = FALSE),
    arima_202_311 = ARIMA(box_cox(Cash, lambda_atm1_train) ~ 0+ pdq(2,0,2) + PDQ(3,1,1)),
    arima_101_411 = ARIMA(box_cox(Cash, lambda_atm1_train) ~ 0+ pdq(2,0,2) + PDQ(4,1,1)),
    arima_002_413 = ARIMA(box_cox(Cash, lambda_atm1_train) ~ 0+ pdq(0,0,2) + PDQ(4,1,3)),
    arima_200_512 = ARIMA(box_cox(Cash, lambda_atm1_train) ~ 0+ pdq(0,0,2) + PDQ(2,1,4)),
    stl_ets_adaptive = decomposition_model(
      STL(box_cox(Cash, lambda_atm1_train) ~ trend(window = 30) + season(window = 7)),
      ETS(season_adjust)),
    ets = ETS(box_cox(Cash, lambda_atm1_train)),
    ets_reg = ETS(Cash~ error('A') + trend('N') + season('A')),
    snaive = SNAIVE(box_cox(Cash, lambda_atm1_train) ~ lag("week"))
    
    
  )

I tried a mix of STL-based, ARIMA, ETS, and baseline models to see what works best for this data, since it has strong weekly seasonality and some noise.

The STL + ARIMA models (stl_arima_periodic, stl_arima_adaptive) were used to separate trend and seasonality first, then model the remainder. I used both periodic and adaptive seasonality to see if the seasonal pattern is stable or changing over time.

The auto ARIMA models (arima_auto, arima_auto_bxcx) let the algorithm find the best parameters, both with and without Box-Cox, just to compare if transformation helps.

The manual ARIMA models were chosen based on ACF/PACF after seasonal differencing, and I mainly focused on seasonal terms since lag 7 was strong. I tried a few variations to see which combination captures the structure better.

The STL + ETS model was included as another decomposition approach, but I used ETS instead of ARIMA for the adjusted data.

The ETS models (ets, ets_reg) are used because they handle trend and seasonality differently and sometimes work better than ARIMA.

Finally, SNAIVE is used as a benchmark model using weekly seasonality. If other models can’t beat this, then they’re probably not good.

Overall, I didn’t rely on just one model, I tested different methods to compare performance and make sure the final choice is actually justified.

train_accuracy <- fit_atm1 |> 
  accuracy() |> 
  select(.model, RMSE, MAE, MAPE) |> 
  arrange(RMSE)
train_accuracy

## # A tibble: 12 × 4
##    .model              RMSE   MAE  MAPE
##    <chr>              <dbl> <dbl> <dbl>
##  1 stl_arima_adaptive  16.1  11.9  36.2
##  2 stl_ets_adaptive    19.3  12.9  72.9
##  3 arima_auto          24.0  15.0 123. 
##  4 arima_200_512       24.0  15.8 106. 
##  5 arima_002_413       24.3  15.9 108. 
##  6 ets_reg             24.6  15.6 129. 
##  7 arima_202_311       24.8  16.1 111. 
##  8 arima_101_411       24.8  16.1 111. 
##  9 arima_auto_bxcx     25.0  16.0 115. 
## 10 stl_arima_periodic  25.0  16.3 118. 
## 11 ets                 25.3  16.3 121. 
## 12 snaive              28.6  18.3 125.

From train accuracy the best RMSE= 16.13 results from stl_arima_adaptivemodel. But training accuracy is not a best way to asses models, so let’s take a look how models behave with test data.
For test data I will use forecast comarision with 2010 April data.

atm1_test <- atm1 |> 
  filter(DATE >= '2010-04-01')
fc_atm1 <- fit_atm1 |> 
  forecast(new_data = atm1_test)

test_accuracy <- accuracy(fc_atm1, atm1_test) |> 
  select(.model, RMSE, MAE, MAPE) |> 
  arrange(RMSE)
test_accuracy

## # A tibble: 12 × 4
##    .model              RMSE   MAE  MAPE
##    <chr>              <dbl> <dbl> <dbl>
##  1 ets_reg             11.6  9.12  64.3
##  2 arima_auto_bxcx     12.1 10.1   61.8
##  3 ets                 12.7 10.5   50.8
##  4 arima_101_411       12.9 10.4   69.9
##  5 arima_202_311       12.9 10.4   71.5
##  6 arima_auto          12.9  9.76 102. 
##  7 arima_200_512       13.0 10.1   67.6
##  8 arima_002_413       13.2 10.0   61.8
##  9 stl_ets_adaptive    15.4 11.8   32.4
## 10 stl_arima_adaptive  18.5 14.7   36.2
## 11 stl_arima_periodic  21.3 16.7  158. 
## 12 snaive              24.6 21.0  117.

On training, stl_arima_adaptive looked best with lowest RMSE. But on test data it is overfitting.

Actually models ets_reg and arima_auto_bxcx did better on test, even if they weren’t best on training.

So STL models were too flexible and fit noise, while simpler models generalized better.

fc_atm1 |> 
  filter(.model %in% c('arima_auto_bxcx', 'ets_reg')) |>
  autoplot(atm1 |> 
             filter(DATE >= ymd('2010-03-01')),
           level = NULL) +
  labs(title = 'Forecasts for April 2010')

I would go with ets_reg , since test performance was better than other models.

fit_final_atm1 <- atm1 |>
  model(
    ets_reg = ETS(Cash ~ error("A") + trend("N") + season("A"))
  )

fc_may_atm1 <- fit_final_atm1 |> 
  forecast(h = 31)

fc_df <- fc_may_atm1 |>
  as_tibble()

#write_xlsx(fc_df, "fc_may_atm1.xlsx")

fc_may_atm1 |> 
  autoplot(atm1, , level = NULL)+
  labs(title = 'Forecast for May 2010', level = NULL)

As we see from the plot the model forecasted close seasonality with the stable trend.
It doesn’t have a capability to forecast unusual events with forecast points, but prediction intervals may include predictions of uncertain withdrawal amounts.

ATM2 series

atm2 <- atm_ts |> 
  filter(ATM == 'ATM2')
p1_atm2_reg <- atm2 |> 
  autoplot(Cash) +
  labs(title = 'ATM2 series')
p1_atm2_reg

The series does not look completely random, there is some kind of repeating behavior. It looks more noisy compared with ATM1 and a somewhat uneven across time. Still, it appears that weekly pattern might be there, but not as strong or stable.

lambda_atm2 <- atm2 |> 
  features(Cash, guerrero) |> 
  pull(lambda_guerrero)

atm2 <- atm2 |> 
  mutate(Cash_bxcx = box_cox(Cash, lambda_atm1)) |> 
  mutate(Cash_log1p = log1p(Cash))
p2_atm2_bxcx <- atm2 |> 
  autoplot(Cash_bxcx)+
  labs(title = 'ATM2 Boxcox transformed series')
p3_atm2_log1p <- atm2 |> 
  autoplot(Cash_log1p)+
  labs(title = 'ATM2 Log1p transformed series')
p1_atm2_reg/p2_atm2_bxcx/p3_atm2_log1p

Because of that, I decided to apply Box-Cox and Log1P transformations first. The reason is same as before in ATM1, variance is not constant, especially when values jump up and down. After transformation, the seasonal pattern appears stronger.

atm2 |> ACF(Cash, lag_max = 60) |> autoplot()

ACF plot confirms weekly seasonality with strong autocorrelations at lags multiples of 7. Also there is a pattern of negative autocorrelations.

Here I split the data into train and test sets, in the same way as for ATM1 series.

atm2_train <- atm2 |> 
  filter(DATE < ymd('2010-04-01'))
#atm2_train

lambda_atm2_train <- atm2_train |> 
  features(Cash, box_cox, feature =guerrero) |> 
  pull(lambda_guerrero)
lambda_atm2_train

## [1] 0.6517278

p1_atm2_train <- atm2_train |> 
  autoplot(Cash)
p2_atm2_train_bxcx <- atm2_train |> 
  autoplot(box_cox(Cash, lambda_atm2_train))
p1_atm2_train/p2_atm2_train_bxcx

atm2 |> 
  model(
    STL(Cash_bxcx~ trend(window = 31) + season(window = 7))
  ) |> 
  components() |> 
  autoplot()

From the STL plot, I can see that there is a weekly seasonality, and the trend appears to decline overall.
Remainder appear to have stable variance, except few extreme lows, probably caused by unusual outliers.

atm2_train |>
  model(stl = STL(box_cox(Cash, lambda_atm2_train) ~ trend(window=30) + season(window = 7))) |>
  components() |>
  ggplot(aes(x = atm2_train$day_of_week, y = season_adjust)) +
  geom_boxplot() +
  labs(title = 'Boxplots of seasonally adjusted (STL) + \nBox-Cox transformed data by day of week for ATM2')

I plotted boxplots of the seasonally adjusted and Box-Cox transformed data by day of week to check on outliers.

These outliers are days where cash usage was much higher or lower than usual.

Same as in ATM1 likely reasons:

holidays, special events or paydays causing higher withdrawals
ATM or weather issues could cause very low values
possible data errors

So in my opinion outliers are not just noise, but could be real unusual events, especially noticeable low ones.

And as I did in ATM1 analysis, I will leave these outliers as is for this data, but in real-world I would investigate what could cause them and fix them if needed.

atm2_train |> 
  features(box_cox(Cash, lambda_atm2_train), unitroot_nsdiffs)

## # A tibble: 1 × 2
##   ATM   nsdiffs
##   <chr>   <int>
## 1 ATM2        1

atm2_train |> 
  features(difference(box_cox(Cash, lambda_atm2_train), 7), unitroot_ndiffs)

## # A tibble: 1 × 2
##   ATM   ndiffs
##   <chr>  <int>
## 1 ATM2       0

Unitroot KPSS test confirms that only seasonal differencing is needed for ARIMA models.

atm2_train |> 
  gg_tsdisplay(difference(box_cox(Cash, lambda_atm2_train), 7),
               lag_max = 60,
               plot_type = 'partial')

Series looks centered around zero, so seasonal differencing worked, but has few spikes and dips caused by ouliers.

In the ACF, strong negative spike at lag 7 indicating weekly seasonality is still there.

After that, correlations are weak and kind of messy → no clear structure.

In the PACF, one strong spike at lag 7 followed by less significant at lags 14, 21, then nothing really significant

I would choose manual parameters things like:

ARIMA(0,0,0)(1,1,0)
ARIMA(1,0,0)(1,1,0)
ARIMA(0,0,1)(1,1,0)
ARIMA(0,0,0)(2,1,0)

Modelling

I will try to use multiple models: seasonal ARIMA (both manual and automatic), ETS (additive and multiplicative), and also SNAIVE as a benchmark.

fit_atm2 <- atm2_train |> 
    model(
    stl_arima_periodic = decomposition_model(
      STL(box_cox(Cash, lambda_atm2_train) ~ trend(window = 21) 
          + season(window = "periodic"), robust = TRUE),
      ARIMA(season_adjust)
    ),
    stl_arima_adaptive = decomposition_model(
      STL(box_cox(Cash, lambda_atm2_train) ~ trend(window = 30) + season(window = 7)),
      ARIMA(season_adjust)
    ),
    arima_auto_bxcx = ARIMA(box_cox(Cash, lambda_atm2_train),
                       stepwise = FALSE,
                       approx = FALSE),
     arima_auto = ARIMA(Cash,
                       stepwise = FALSE,
                       approx = FALSE),
    arima_001_214 = ARIMA(box_cox(Cash, lambda_atm2_train) ~ 0+ pdq(0,0,1) + PDQ(2,1,4)),
    arima_001_214_reg = ARIMA(Cash ~ 0+ pdq(0,0,1) + PDQ(2,1,4)),
    arima_202_011 = ARIMA(box_cox(Cash, lambda_atm2_train) ~ 0+ pdq(2,0,2) + PDQ(0,1,1)),
    arima_002_213 = ARIMA(box_cox(Cash, lambda_atm2_train) ~ 0+ pdq(0,0,2) + PDQ(2,1,3)),
    arima_101_412 = ARIMA(box_cox(Cash, lambda_atm2_train) ~ 0+ pdq(1,0,1) + PDQ(4,1,2)),
    stl_ets_adaptive = decomposition_model(
      STL(box_cox(Cash, lambda_atm2_train) ~ trend(window = 21) + season(window = 7), robust = TRUE),
      ETS(season_adjust)),
    ets = ETS(box_cox(Cash, lambda_atm2_train)),
    ets_additive = ETS(box_cox(Cash, lambda_atm2_train)~ error('A') + trend('N') + season('A')),
    ets_multi = ETS(Cash~ error('A') + trend('N') + season('M')),
    snaive = SNAIVE(box_cox(Cash, lambda_atm2_train) ~ lag("week"))
  )

atm2_test <- atm2 |> 
  filter(DATE >= '2010-04-01')
fc_atm2 <- fit_atm2 |> 
  forecast(new_data = atm2_test)

accuracy(fc_atm2, atm2_test) |> 
  select(.model, RMSE, MAE, MAPE) |> 
  arrange(RMSE)

## # A tibble: 14 × 4
##    .model              RMSE   MAE  MAPE
##    <chr>              <dbl> <dbl> <dbl>
##  1 arima_002_213       18.3  13.5  99.4
##  2 arima_001_214       18.6  13.3 105. 
##  3 ets_multi           19.6  14.4  54.7
##  4 ets                 19.6  14.6 102. 
##  5 ets_additive        19.6  14.6 102. 
##  6 arima_001_214_reg   19.8  14.5  45.1
##  7 stl_arima_adaptive  19.9  14.3  58.5
##  8 arima_101_412       20.3  15.0 123. 
##  9 stl_ets_adaptive    20.6  13.7  77.1
## 10 stl_arima_periodic  21.0  16.0 104. 
## 11 arima_202_011       21.0  16.0 104. 
## 12 arima_auto_bxcx     21.0  16.0 104. 
## 13 snaive              22.6  15.5  43.9
## 14 arima_auto          23.2  17.8  44.8

Looking at the test set accuracy results arima_002_213andarima_001_214has best RMSE results.

Below I will add combination of arima_002_213 + ets + snaive models to see if it will give better RMSE.

fit_comb <- fit_atm2 |> 
  mutate(combination = (arima_002_213 + ets + snaive) / 3)
fc_comb <- fit_comb |> 
  forecast(new_data = atm2_test)
accuracy(fc_comb, atm2_test) |> 
  select(.model, RMSE, MAE, MAPE) |> 
  arrange(RMSE)

## # A tibble: 15 × 4
##    .model              RMSE   MAE  MAPE
##    <chr>              <dbl> <dbl> <dbl>
##  1 arima_002_213       18.3  13.5  99.4
##  2 arima_001_214       18.6  13.3 105. 
##  3 combination         19.0  13.9  71.7
##  4 ets_multi           19.3  14.2  52.4
##  5 ets                 19.6  14.6 102. 
##  6 ets_additive        19.6  14.6 102. 
##  7 arima_001_214_reg   19.8  14.5  45.1
##  8 stl_arima_adaptive  19.9  14.3  58.5
##  9 arima_101_412       20.3  15.0 123. 
## 10 stl_ets_adaptive    20.6  13.7  77.1
## 11 stl_arima_periodic  21.0  16.0 104. 
## 12 arima_202_011       21.0  16.0 104. 
## 13 arima_auto_bxcx     21.0  16.0 104. 
## 14 snaive              22.6  15.5  43.9
## 15 arima_auto          23.2  17.8  44.8

The combinationmodel performs well, and is not so far behind first two models. The only problem with combination models is they are not easy to explain.

fc_atm2 |> 
  filter(.model %in% c('arima_002_213', 'ets_multi')) |>
  autoplot(atm2 |> 
             filter(DATE >= ymd('2010-03-01')),
           level = NULL)

ets_multi = ETS(Cash~ error(‘A’) + trend(‘N’) + season(‘M’))

fit_atm2_final <- atm2 |> 
  model(
    ets_multi = ETS(Cash~ error('A') + trend('N') + season('M'))
  )

fc_atm2_final <- fit_atm2_final |> 
  forecast(h = 31)

fc_atm2_final |> 
  autoplot(atm2, level = NULL)

fit_atm2_final |> 
  report()

## Series: Cash 
## Model: ETS(A,N,M) 
##   Smoothing parameters:
##     alpha = 0.0001000072 
##     gamma = 0.3593879 
## 
##   Initial states:
##      l[0]       s[0]     s[-1]    s[-2]     s[-3]    s[-4]    s[-5]    s[-6]
##  71.22607 0.08306391 0.3331656 1.390791 0.9445368 1.311519 1.382709 1.554215
## 
##   sigma^2:  619.8326
## 
##      AIC     AICc      BIC 
## 4511.099 4511.720 4550.098

The selected model for ATM2 is ETS(A,N,M), which means additive error, no trend, and multiplicative seasonality. The alpha value is almost zero, which basically says that the level is not changing much over time and the model is relying more on the seasonal pattern rather than updating the level aggressively.
On the other hand, gamma is around 0.36, so seasonality is actually being updated. This matches STL decomposition’s weak trend and clear weekly pattern. The initial daily seasonal states also vary , which again shows that different days behave differently. Overall, this model makes sense because ATM2 does not really have a strong trend, but still has seasonal behavior that needs to be captured.

ATM3

atm3 <- atm_ts |> 
  filter(ATM == 'ATM3')
atm3 |>
  autoplot(Cash)

atm3 |> 
  filter(!Cash == 0) |> 
  autoplot(Cash)+
  labs(title = 'Plot of days with actual ATM3 transactions')

The plots show that data for ATM3 series has only last 3 days with non-zero withdrawals.
The reasons could be that it is a new machine at this location, or previous data was lost.

Since there are only 3 days of data is available, it is best to simply apply Naive or Mean methods to forecast future points.

fit_atm3 <- atm3 |> 
  filter(!Cash == 0) |> 
  model(
    naive = NAIVE(Cash),
    mean = MEAN(Cash)
  )

fc_atm3 <- fit_atm3 |> 
  forecast(h = 30)

fc_atm3 |>
  autoplot(atm3 |> filter(!Cash == 0), level = NULL)

From the plot I would use Meanmodel. ATM3 data is kind of different from others. Using simple baseline forecasts is actually the correct decision here, even if it feels too basic.

ATM4

atm4 <- atm_clean|> 
  filter(ATM == 'ATM4')

p1_atm4_reg <- atm4 |> 
  autoplot(Cash)+
  labs(title = 'ATM4 series')
p1_atm4_reg

At first, the raw series looked different compared to others, and there was one extreme outlier. That value was way larger than the rest, so I treated it as a data entry issue and adjusted it. Otherwise, it would completely mislead analysis, modelling and as well forecast.

lambda_atm4 <- atm4 |> 
  features(Cash, guerrero) |> 
  pull(lambda_guerrero)
lambda_atm4

## [1] 0.4487252

Lambda’s value is 0.45 and it looks as square root is optimal for mathematic transformation.

Let’s take a look at plots and compare before and after transformation.

atm4 <- atm4 |> 
  mutate(Cash_bxcx = box_cox(Cash, lambda_atm4),
         Cash_log = log(Cash))
p2_atm4_bxcx <-  atm4 |> 
  autoplot(Cash_bxcx)+
  labs(title = 'Boxcox transformed ATM4 series. Lambda = 0.45')
p1_atm4_reg/p2_atm4_bxcx

After transformation, the series variance looked more stabilized, and spikes from original data seems to surpressed.

atm4 |> 
  ACF(Cash_bxcx, lag_max = 60) |> 
  autoplot()+
  labs(title = )

From ACF plot I can tell there is moderate seasonal pattern since significant correlations values are not large. Also autocorrelations are not consistently decreasing, as we see there is some pattern like wave. This could be caused by inconsistent trend level.

From here I split the data into training and test sets. I estimate the Box-Cox transformation parameter (lambda) only on the training data to avoid leakage.

atm4_train <- atm4 |> 
  filter(DATE < ymd('2010-04-01'))

lambda_atm4_train <- atm4_train |> 
  features(Cash, box_cox, feature =guerrero) |> 
  pull(lambda_guerrero)
lambda_atm4_train

## [1] 0.3946594

p1_atm4_train <- atm4_train |> 
  autoplot(Cash)+
  labs(title = 'ATM4 train set')
p2_atm4_train_bxcx <- atm4_train |> 
  autoplot(box_cox(Cash, lambda_atm4_train))+
  labs(title = 'BoxCox transformed ATM4 train set. Lambda = 0.4')
p1_atm4_train/p2_atm4_train_bxcx

dcmp_atm4 <- atm4 |> 
  model(
    STL(Cash_bxcx ~ trend(window = 31) + season(window = "periodic"))
  ) |> 
  components() 

  dcmp_atm4 |>  autoplot()

dcmp_atm4 |> 
  features(remainder, ljung_box, lag = 14)

## # A tibble: 1 × 4
##   ATM   .model                                                 lb_stat lb_pvalue
##   <chr> <chr>                                                    <dbl>     <dbl>
## 1 ATM4  "STL(Cash_bxcx ~ trend(window = 31) + season(window =…    44.9 0.0000427

STL decomposition for ATM4 shows that trend is not smooth at all. It is kind of wavy and inconsistent, goes up and down without clear direction. So I would not assume it as real trend, it looks more like short-term movement. I think we need data of few more years to see if the trend is consistent or not.
Seasonality is much more clear, I can see repeating weekly pattern even if it is not perfectly stable.
I checked the remainder using Ljung-Box test and the p-value is very small, so it is not white noise. From the plot, most of the seasonal pattern is removed, but there is still some structure and variability left. So STL decomposition captures main components, but not perfectly and some data structure still remains. So using only decomposition is not enough. I need models like ARIMA that can model autocorrelation, or ETS that can smooth remaining structure.

atm4_train |>
  model(stl = STL(box_cox(Cash, lambda_atm4_train) ~ trend(window=21) + season(window = 7))) |>
  components() |>
  ggplot(aes(x = atm4_train$day_of_week, y = season_adjust)) +
  geom_boxplot() +
  labs(title = 'Boxplots of seasonally adjusted (STL) + \nBox-Cox transformed data by day of week for ATM4')

As we see there are not many extreme outliers except Saturday. I will leave them as is, but in real world I would investigate cause of these outliers.

atm4 |> 
  features(Cash_bxcx, feat_stl) |>  
  select(trend_strength:seasonal_strength_week)

## # A tibble: 1 × 2
##   trend_strength seasonal_strength_week
##            <dbl>                  <dbl>
## 1          0.269                  0.442

Feature-based analysis using STL decomposition indicates weak trend (0.27) and moderate weekly seasonality (0.44). This suggests that complex seasonal models are unnecessary, and simpler models such as ETS and ARIMA are more appropriate

atm4_train |> 
  features(box_cox(Cash, lambda_atm4_train), unitroot_nsdiffs)

## # A tibble: 1 × 2
##   ATM   nsdiffs
##   <chr>   <int>
## 1 ATM4        0

atm4_train |> 
  features(box_cox(Cash, lambda_atm4_train), unitroot_ndiffs)

## # A tibble: 1 × 2
##   ATM   ndiffs
##   <chr>  <int>
## 1 ATM4       0

atm4_train |> 
  gg_tsdisplay(box_cox(Cash, lambda_atm4_train),
               lag_max = 90,
               plot_type = 'partial')

The series looks stable with no clear trend, which matches ndiffs = 0, and no seasonal differencing is needed (nsdiffs = 0).

The ACF shows a gradual decay with clear spikes at lags of multiples of 7 (7, 14, 21 and so on) indicating weekly seasonality. The PACF has a strong spikes at lags 7, 14 and very small significance at lags 21, 35 and 56, suggesting a low-order AR component.

Since none of ACF and PACF showing a sharp cutoff, I can’t easily manually choose ARIMA model. AFC plot has several lags of multiples of 7, and it is oscillating. PACF plot also has several aoutocorrelations above significance level and oscillating.

Modeling

I tried multiple models: seasonal ARIMA (both manual and automatic), ETS (additive and multiplicative), and also SNAIVE as a benchmark.

fit_atm4 <- atm4_train |> 
    model(
    stl_arima_periodic = decomposition_model(
      STL(box_cox(Cash, lambda_atm4_train) ~ trend(window = 21) 
          + season(window = "periodic"), robust = TRUE),
      ARIMA(season_adjust)
    ),
    stl_arima_adaptive = decomposition_model(
      STL(box_cox(Cash, lambda_atm4_train) ~ trend(window = 21) + season(window = 7), robust = TRUE),
      ARIMA(season_adjust)
    ),
    arima_dow = ARIMA(box_cox(Cash, lambda_atm4_train) ~ day_of_week,
                      stepwise = FALSE,
                      approx = FALSE),
    arima_auto_bxcx = ARIMA(box_cox(Cash, lambda_atm4_train),
                       stepwise = FALSE,
                       approx = FALSE),
     arima_auto = ARIMA(Cash,
                       stepwise = FALSE,
                       approx = FALSE),
    arima_001_101 = ARIMA(box_cox(Cash, lambda_atm4_train) ~ 0+ pdq(0,0,1) + PDQ(1,0,1)),
    arima_201_001 = ARIMA(box_cox(Cash, lambda_atm4_train) ~ 0+ pdq(2,0,1) + PDQ(0,0,1)),
    arima_100_001 = ARIMA(box_cox(Cash, lambda_atm4_train) ~ 0+ pdq(1,0,0) + PDQ(0,0,1)),
    stl_ets_adaptive = decomposition_model(
      STL(box_cox(Cash, lambda_atm4_train) ~ trend(window = 21) + season(window = 7)),
      ETS(season_adjust)),
    ets_reg = ETS(Cash),
    ets = ETS(box_cox(Cash, lambda_atm4_train)),
    ets_additive = ETS(box_cox(Cash, lambda_atm4_train)~ error('A') + trend('N') + season('A')),
    snaive = SNAIVE(box_cox(Cash, lambda_atm4_train) ~ lag("week")),
    r_walk = RW(Cash~drift())
    
    
  )

atm4_test <- atm4 |> 
  filter(DATE >= '2010-04-01')
fc_atm4 <- fit_atm4 |> 
  forecast(new_data = atm4_test)

accuracy(fc_atm4, atm4_test) |> 
  select(.model, RMSE, MAE, MAPE) |> 
  arrange(RMSE)

## # A tibble: 14 × 4
##    .model              RMSE   MAE  MAPE
##    <chr>              <dbl> <dbl> <dbl>
##  1 arima_001_101       242.  204.  752.
##  2 ets                 247.  208.  644.
##  3 arima_auto_bxcx     257.  222.  839.
##  4 arima_auto          273.  234. 1004.
##  5 stl_ets_adaptive    277.  229.  553.
##  6 stl_arima_adaptive  278.  225.  450.
##  7 arima_201_001       283.  243. 1049.
##  8 r_walk              291.  248. 1240.
##  9 arima_dow           296.  254. 1152.
## 10 stl_arima_periodic  302.  258. 1161.
## 11 ets_additive        305.  262. 1212.
## 12 ets_reg             307.  260. 1246.
## 13 arima_100_001       320.  268.  583.
## 14 snaive              739.  611. 1231.

After testing models with test data simplest ARIMA(0,0,1)(1,0,1)[7] had the best RMSE = 241.52.

Below I will try to use combination model and see if it will produce better RMSE.

fit_comb4 <- fit_atm4 |> 
  mutate(combination = (arima_001_101 + ets) / 2)
fc_comb <- fit_comb4 |> 
  forecast(new_data = atm4_test)
accuracy(fc_comb, atm4_test) |> 
  select(.model, RMSE, MAE, MAPE) |> 
  arrange(RMSE)

## # A tibble: 15 × 4
##    .model              RMSE   MAE  MAPE
##    <chr>              <dbl> <dbl> <dbl>
##  1 combination         241.  203.  697.
##  2 arima_001_101       242.  204.  752.
##  3 ets                 247.  208.  644.
##  4 arima_auto_bxcx     257.  222.  839.
##  5 arima_auto          273.  234. 1004.
##  6 stl_ets_adaptive    277.  229.  553.
##  7 stl_arima_adaptive  278.  225.  450.
##  8 arima_201_001       283.  243. 1049.
##  9 r_walk              291.  248. 1240.
## 10 arima_dow           296.  254. 1152.
## 11 stl_arima_periodic  302.  258. 1161.
## 12 ets_additive        305.  262. 1212.
## 13 ets_reg             307.  260. 1246.
## 14 arima_100_001       320.  268.  583.
## 15 snaive              739.  611. 1231.

Combination model did little better with RMSE = 241.30 (almost same as ARIMA(0,0,1)(1,0,1)[7] RMSE = 241.52)

fc_comb |> 
  filter(.model %in% c('arima_001_101', 'combination')) |>
  autoplot(atm4 |> 
             filter(DATE >= ymd('2010-03-01')),
           level = NULL)

Let’s see how models behave with Cross-Validation

atm4_cv <- atm4_train |>
  select(DATE, Cash) |>
  as_tsibble(index = DATE) |>
  stretch_tsibble(.init = 200, .step = 7)

folds <- atm4_cv |>
  group_by(.id) |>
  group_split()

cv_results_atm4 <- map_dfr(folds, function(fold_data) {
  
  lambda_fold <- fold_data |>
    features(Cash, guerrero) |>
    pull(lambda_guerrero)
  
  fit_fold <- fold_data |>
    model(
      ets = ETS(box_cox(Cash, lambda_fold)),
      arima_auto_bxcx = ARIMA(
        box_cox(Cash, lambda_fold),
        stepwise = FALSE,
        approx = FALSE
      ),
      arima_101_001 = ARIMA(box_cox(Cash, lambda_fold) ~ 0+ pdq(0,0,1) + PDQ(1,0,1)),
       r_walk = RW(Cash~drift())
    )
  
  last_date <- max(fold_data$DATE)
  
  test_fold <- atm4_train |>
    select(DATE, Cash) |>
    as_tsibble(index = DATE) |>
    filter(DATE > last_date, DATE <= last_date + 30) |>
    mutate(.id = unique(fold_data$.id)) |>
    as_tsibble(index = DATE, key = .id)
  
  if (nrow(test_fold) == 0) return(tibble())
  
  fc <- forecast(fit_fold, new_data = test_fold)
  
  accuracy(fc, test_fold) |>
    select(.model, RMSE, MAE, MAPE) |>
    mutate(lambda = lambda_fold)
})

cv_results_atm4 |>
  group_by(.model) |>
  summarise(
    RMSE = mean(RMSE, na.rm = TRUE),
    MAE = mean(MAE, na.rm = TRUE),
    MAPE = mean(MAPE, na.rm = TRUE),
    mean_lambda = mean(lambda, na.rm = TRUE)
  ) |>
  arrange(RMSE)

## # A tibble: 4 × 5
##   .model           RMSE   MAE  MAPE mean_lambda
##   <chr>           <dbl> <dbl> <dbl>       <dbl>
## 1 arima_auto_bxcx  318.  267.  401.       0.438
## 2 arima_101_001    325.  267.  386.       0.438
## 3 ets              339.  278.  413.       0.438
## 4 r_walk           408.  334.  465.       0.438

For ATM4, the cross-validation results showed that the Box-Cox transformed automatic arima_auto_bxcx model performed best overall, with the lowest average RMSE and MAE across folds.

# fit_prophet <- atm4_train |> 
#   model(
#     prophet(Cash~ day_of_week ),
#     nettar = NNETAR(Cash)
#   )
# fc_prophet <- fit_prophet |> 
#   forecast(new_data = atm4_test)
# accuracy(fc_prophet, atm4_test) |> 
#   select(.model, RMSE, MAE, MAPE) |> 
#   arrange(RMSE)
fit_atm4_final <- atm4 |> 
  model(
     arima_001_101 = ARIMA(box_cox(Cash, lambda_atm4_train) ~ 0+ pdq(0,0,1) + PDQ(1,0,1))
  )

fc_atm4_final <- fit_atm4_final |> 
   forecast(h = 31)

fc_atm4_final |> 
  filter(.model == 'arima_001_101') |> 
  autoplot(atm4, level = NULL)

So for ATM4 I will choose arima_001_101as my forecast model. It is not perfect, and cannot predict volatility as we seen in ATM4 series.

Below I tested prophet and nettar models

fit_prophet <- atm4_train |> 
  model(
    prophet(Cash~ day_of_week ),
    nettar = NNETAR(Cash)
  )
fc_prophet <- fit_prophet |>
  forecast(new_data = atm4_test)
accuracy(fc_prophet, atm4_test) |>
  select(.model, RMSE, MAE, MAPE) |>
  arrange(RMSE)

## # A tibble: 2 × 4
##   .model                       RMSE   MAE  MAPE
##   <chr>                       <dbl> <dbl> <dbl>
## 1 nettar                       254.  202.  522.
## 2 prophet(Cash ~ day_of_week)  304.  259. 1174.

fc_prophet |> 
  autoplot(atm4 |> filter(DATE >= yearmonth('Mar 2010')), level = NULL)

## Warning: There was 1 warning in `filter()`.
## ℹ In argument: `DATE >= yearmonth("Mar 2010")`.
## Caused by warning:
## ! Incompatible methods (">=.Date", ">=.vctrs_vctr") for ">="

As we see from the plot nettarmodel is more predicting volatility, but it is also has negative predictions. I don’t think it is appropriate for this series, since we can’t have negative withdrawals.

On the other hand prophet model is not able to predict volatility.

It is my first time experiencing with these models, so I don’t know yet how to optimize them. But I will learn.

Power consumption

url <- "https://raw.githubusercontent.com/farhodibr/CUNY-SPS-MSDS/main/DATA624/PROJECT1/ResidentialCustomerForecastLoad-624.xlsx"

dest_file <- tempfile(fileext = ".xlsx")

GET(url, write_disk(dest_file, overwrite = TRUE))

## Response [https://raw.githubusercontent.com/farhodibr/CUNY-SPS-MSDS/main/DATA624/PROJECT1/ResidentialCustomerForecastLoad-624.xlsx]
##   Date: 2026-03-30 04:47
##   Status: 200
##   Content-Type: application/octet-stream
##   Size: 14.1 kB
## <ON DISK>  C:\Users\farho\AppData\Local\Temp\RtmpeI8NfO\file428852882130.xlsx

power <- read_excel(
  dest_file,
  col_types = c("numeric", "text", "numeric")
) 

power <- power |>
  mutate(DATE = yearmonth(`YYYY-MMM`))

head(power)

## # A tibble: 6 × 4
##   CaseSequence `YYYY-MMM`     KWH     DATE
##          <dbl> <chr>        <dbl>    <mth>
## 1          733 1998-Jan   6862583 1998 Jan
## 2          734 1998-Feb   5838198 1998 Feb
## 3          735 1998-Mar   5420658 1998 Mar
## 4          736 1998-Apr   5010364 1998 Apr
## 5          737 1998-May   4665377 1998 May
## 6          738 1998-Jun   6467147 1998 Jun

power <- power |> 
  select(-`YYYY-MMM`)
power

## # A tibble: 192 × 3
##    CaseSequence     KWH     DATE
##           <dbl>   <dbl>    <mth>
##  1          733 6862583 1998 Jan
##  2          734 5838198 1998 Feb
##  3          735 5420658 1998 Mar
##  4          736 5010364 1998 Apr
##  5          737 4665377 1998 May
##  6          738 6467147 1998 Jun
##  7          739 8914755 1998 Jul
##  8          740 8607428 1998 Aug
##  9          741 6989888 1998 Sep
## 10          742 6345620 1998 Oct
## # ℹ 182 more rows

power_ts <- power |> 
  as_tsibble(index = DATE)
power_ts

## # A tsibble: 192 x 3 [1M]
##    CaseSequence     KWH     DATE
##           <dbl>   <dbl>    <mth>
##  1          733 6862583 1998 Jan
##  2          734 5838198 1998 Feb
##  3          735 5420658 1998 Mar
##  4          736 5010364 1998 Apr
##  5          737 4665377 1998 May
##  6          738 6467147 1998 Jun
##  7          739 8914755 1998 Jul
##  8          740 8607428 1998 Aug
##  9          741 6989888 1998 Sep
## 10          742 6345620 1998 Oct
## # ℹ 182 more rows

power_ts |> 
  autoplot(KWH/1e6)+
  labs(title = 'Residential power usage series', y = 'KWH (millions)')

I can see there is missing observation around 2008-2009, and huge drop in one of the early months in 2010.

Below I will check on these missing observation and drop in power usage.

power_ts_nas <- power_ts |> 
  complete(DATE = seq.Date(from = min(DATE),
                           to = max(DATE),
                           by = 'month'))
missing_periods <- power_ts_nas |> 
  filter(is.na(KWH))
missing_periods

## # A tibble: 1 × 3
##   DATE       CaseSequence   KWH
##   <date>            <dbl> <dbl>
## 1 2008-09-01          861    NA

As we see there is a missing value for September of 2008. Since there is enough data prior this observation, I will ARIMA model to forecast and impute this missing observation. I prefer ARIMA imputation method in this case because it helps to prevent seasonality and structure of the data.

fit_power_nas <- power_ts |> 
  as_tsibble(index = DATE) |> 
  model(
    arima_nas = ARIMA(KWH)
  )

power_ts_imputed <- fit_power_nas |> 
  interpolate(power_ts)

power_ts_imputed |> 
  autoplot(KWH/1e6)

Now let’s take a look at the extreme outlier

power_ts_imputed |> 
  filter(DATE >= yearmonth('2010 Jan'))

## # A tsibble: 48 x 2 [1M]
##        DATE     KWH
##       <mth>   <dbl>
##  1 2010 Jan 9397357
##  2 2010 Feb 8390677
##  3 2010 Mar 7347915
##  4 2010 Apr 5776131
##  5 2010 May 4919289
##  6 2010 Jun 6696292
##  7 2010 Jul  770523
##  8 2010 Aug 7922701
##  9 2010 Sep 7819472
## 10 2010 Oct 5875917
## # ℹ 38 more rows

I see that observation for 2010 July is unusually lower (770523) compared with previous and following months. Espesially since this is summer month it is hard to believe that residential power were that low (unless some power supply disruptions occurred during that period).

Assuming that there were no disruptions in power supply I believe this could be data entry or reading entry. I will multiply it by 10 so it can be in the range of surrounding observations.

power_ts_imputed <- power_ts_imputed |>
  mutate(
    KWH = if_else(
      DATE == yearmonth("2010 Jul"),
      7705230,   
      KWH
    )
  )

power_clean <- power_ts_imputed
power_clean |> 
  autoplot(KWH/1e6) +
  labs(title = "Residential power usage", y = 'KWH (millions)')

When I looked at the raw series plot, the first thing we can see is the strong repeating yearly cycle.
The summer and winter months tend to have higher KWH usage, while spring and fall are lower. This makes sense from a business point of view because residential demand usually increases during extreme weather periods due to heating and cooling.

The trend appears to be stable until around 2009, and after 2009 it appear that it has slight upward increasing tendency.

lambda_power <- power_clean |> 
  features(KWH, box_cox, feature = guerrero) |> 
  pull(lambda_guerrero)
lambda_power

## [1] -0.2046292

power_clean <- power_clean |> 
  mutate(KWH_bxcx = box_cox(KWH, lambda_power),
         KWH_log = log(KWH))

power_clean |>
  select(DATE, KWH, KWH_bxcx, KWH_log) |>
  pivot_longer(-DATE) |>
  as_tsibble(index = DATE, key = name) |>
  autoplot(value) +
  facet_wrap(~name, ncol = 1, scales = "free_y")

Because the fluctuations also seemed to grow with the level of the series, I applied a log / Box-Cox type transformation before fitting ARIMA models. The main reason for this was to stabilize variance so the model focuses on the recurring seasonal structure instead of being dominated by larger peaks.

I think Log transformation improved variance slightly better than Box_Cox, therefore I will use Log transformed data for further analysis and modelling.

Let’s take a look at STL decomposition

dcmp <- power_clean |> 
  model(STL(log(KWH), robust =TRUE)) |> 
  components() 
dcmp |> 
  autoplot()

This decomposition tells me that the power data is driven mainly by:

strong yearly seasonality
smooth long-term growth

Because of that, models like STL + ETS, multiplicative ETS, and seasonal ARIMA are all reasonable choices, since they are built exactly for this type of stable seasonal load behavior.

Below I will have detailed look at remainder component

dcmp |> 
  ACF(remainder, lag_max = 48) |> 
  autoplot()

dcmp |>
  features(remainder, ljung_box, lag = 24)

## # A tibble: 1 × 3
##   .model                       lb_stat lb_pvalue
##   <chr>                          <dbl>     <dbl>
## 1 STL(log(KWH), robust = TRUE)    34.5    0.0756

I checked the remainder with ACF and Ljung-Box. Most spikes except 2 stay inside bounds, and p-value is 0.076 which is above 0.05, so remainder is can be considered as white noise. This results says that STL already captured most of trend and yearly seasonality, so decomposition-based models like STL + ETS likely will work well for this series.

Here I split data into train and test sets

train_power <- power_clean |> 
  filter(DATE < yearmonth('2010 Jan'))

test_power <- power_clean |> 
  filter(DATE >= yearmonth('2010 Jan'))

power_clean |> 
  features(log(KWH), unitroot_nsdiffs)

## # A tibble: 1 × 1
##   nsdiffs
##     <int>
## 1       1

power_clean |> 
  features(difference(log(KWH), 12), unitroot_ndiffs)

## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0

power_clean |> 
  gg_tsdisplay(difference(log(KWH), 12), lag=24, plot_type = 'partial')

After seasonal differencing at lag 12, the series becomes much more stable around zero, so yearly seasonality is mostly removed. ACF still shows strong negative spike at lag 12, while PACF has clear spike at lag 1 and another seasonal effect around 12. From plots my manual model would be ARIMA(1,0,1)(2,1,1)[12] .

fit_power <- train_power |> 
  model(
    STL_ETS = decomposition_model(
      STL(log(KWH), robust = TRUE),
      ETS(season_adjust~ error('A') + trend('Ad', phi = 0.2) + season('A'))
    ),
    ets_multi = ETS(KWH~ error('M') + trend('A') + season('M')),
    arima = ARIMA(log(KWH), stepwise = FALSE, approx = FALSE),
    arima_210_012 = ARIMA(log(KWH)~ 0 + pdq(2,1,0) + PDQ(0,1,2)),
    arima_102_211 = ARIMA(log(KWH)~ 1 + pdq(1,0,1) + PDQ(2,1,1))
  )



fc_power <- fit_power |>
  mutate(combination = (arima_210_012+ STL_ETS)/2) |> 
  forecast(new_data = test_power)

accuracy(fc_power, test_power) |> 
  arrange(RMSE)

## # A tibble: 6 × 10
##   .model        .type       ME     RMSE     MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>         <chr>    <dbl>    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 combination   Test   197073.  780815. 609112.  1.65  7.80   NaN   NaN 0.308
## 2 arima_210_012 Test  -467988.  953913. 803441. -7.42 11.2    NaN   NaN 0.461
## 3 arima_102_211 Test   734443. 1060231. 783684.  9.00  9.77   NaN   NaN 0.272
## 4 arima         Test   731679. 1068234. 796363.  8.95  9.98   NaN   NaN 0.272
## 5 ets_multi     Test   791588. 1158558. 872757.  9.65 10.9    NaN   NaN 0.298
## 6 STL_ETS       Test   862134. 1175848. 886274. 10.7  11.1    NaN   NaN 0.278

I tried the model ARIMA(1,0,1)(2,1,1)[12] I mentioned earlier and additionally arima_210_012 model . I also used STL + ETS because decomposition already captured yearly seasonality very well.

fc_power |> 
  filter(.model == c('arima_210_012', 'STL_ETS')) |> 
  autoplot(power_clean, level = NULL)+
  labs(title = 'Forecasts of arima_210_012 and STL_ETS models using test data')

fc_power |> 
  filter(.model == 'combination') |> 
  autoplot(power_clean, level = NULL)+
  labs(title = 'Forecast of (arima_210_012+ STL_ETS)/2 combination model')

The combination model performed best on the test set with the lowest RMSE (780,815) and MAE (609,112), which makes it my final choice.

The plot also show that combination model almost captured the test data, except few unusual spikes. In my opinion it is almost impossible and hard to build the model to capture those spikes. Maybe in the future learning I will find the methods to predict unusual events as well.

The reason why combination model works better is because it combines strengths of both models. As we saw on previous plot arima_210_012 captures the remaining autocorrelation after seasonal differencing, while STL_ETS keeps the smooth yearly seasonal shape and damped long-term growth. By averaging them, the forecast becomes more stable. This is why the combination model generalized better than each individual model alone.

# fc_power |> 
#   autoplot(power_clean,level = NULL)

fit_full <- power_clean |> 
  model(
    STL_ETS = decomposition_model(
      STL(log(KWH), robust = TRUE),
      ETS(season_adjust ~ error("A") + trend("Ad") + season("A"))
    ),
    arima_010_111_c = ARIMA(log(KWH) ~ 0 + pdq(2,1,0) + PDQ(0,1,2))
  )

My final choice to forecast is combinationmodel

fit_full <- fit_full |>
  mutate(
    combination = (STL_ETS + arima_010_111_c) / 2
  )

fc_full <- fit_full |>
  forecast(h = "1 year")

fc_full |>
  filter(.model == 'combination') |> 
  autoplot(power_clean, level = NULL) +
  labs(title = 'Final power usage forecast for 2014 (in blue color)')

library(writexl)
library(dplyr)

all_fc <- bind_rows(
  fc_may_atm1 |>
    as_tibble() |>
    select(DATE, .mean) |>
    mutate(ATM = "ATM1"),

  fc_atm2_final |>
    as_tibble() |>
    select(DATE, .mean) |>
    mutate(ATM = "ATM2"),

  fc_atm3 |>
    as_tibble() |>
    select(DATE, .mean) |>
    mutate(ATM = "ATM3"),

  fc_atm4_final |>
    as_tibble() |>
    select(DATE, .mean) |>
    mutate(ATM = "ATM4")
) |>
  rename(Forecast = .mean)

write_xlsx(all_fc, "ALL_ATMs_May_2010_Forecasts.xlsx")

PROJECT1_DATA624

Farhod Ibragimov

2026-03-25

Data preparation and EDA of the series

ATM1 series

STL Decomposition

Modelling

ATM2 series

Modelling

ATM3

Modeling

Power consumption