Project 1

Loading Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(readxl)
library(openxlsx)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.4.3
## 
## Attaching package: 'tsibble'
## The following object is masked from 'package:zoo':
## 
##     index
## The following object is masked from 'package:lubridate':
## 
##     interval
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(feasts)
## Warning: package 'feasts' was built under R version 4.4.3
## Loading required package: fabletools
## Warning: package 'fabletools' was built under R version 4.4.3
## Registered S3 methods overwritten by 'ggtime':
##   method           from      
##   autolayer.fbl_ts fabletools
##   autolayer.tbl_ts fabletools
##   autoplot.dcmp_ts fabletools
##   autoplot.fbl_ts  fabletools
##   autoplot.tbl_ts  fabletools
##   fortify.fbl_ts   fabletools
library(ggplot2)
library(fable)
## Warning: package 'fable' was built under R version 4.4.3

Loading Data

atm_data <- read_excel("/Users/tatyanatitova/Documents/SPRING2026/DATA624/ATM624Data.xlsx")
residential_data <- read_excel("/Users/tatyanatitova/Documents/SPRING2026/DATA624/ResidentialCustomerForecastLoad-624.xlsx")

Part A – ATM Forecast, ATM624Data.xlsx

For this project, it is important to examine and explore the data along the way. So first, I am checking the data set structure of the imported excel for the atm data.

Exploring Data

head(atm_data)
## # A tibble: 6 × 3
##    DATE ATM    Cash
##   <dbl> <chr> <dbl>
## 1 39934 ATM1     96
## 2 39934 ATM2    107
## 3 39935 ATM1     82
## 4 39935 ATM2     89
## 5 39936 ATM1     85
## 6 39936 ATM2     90
summary(atm_data)
##       DATE           ATM                 Cash        
##  Min.   :39934   Length:1474        Min.   :    0.0  
##  1st Qu.:40026   Class :character   1st Qu.:    0.5  
##  Median :40118   Mode  :character   Median :   73.0  
##  Mean   :40118                      Mean   :  155.6  
##  3rd Qu.:40210                      3rd Qu.:  114.0  
##  Max.   :40312                      Max.   :10919.8  
##                                     NA's   :19
str(atm_data)
## tibble [1,474 × 3] (S3: tbl_df/tbl/data.frame)
##  $ DATE: num [1:1474] 39934 39934 39935 39935 39936 ...
##  $ ATM : chr [1:1474] "ATM1" "ATM2" "ATM1" "ATM2" ...
##  $ Cash: num [1:1474] 96 107 82 89 85 90 90 55 99 79 ...

This shows that the dataset has 1474 rows and 3 columns that are the DATE, ATM, and CASH. Upon initial review I can see that there are some formatting issues with the date column, missing cash values (the zeros for minumum), and a very high max for cash.

Checking for names of the ATM machines

atm_data |> distinct(ATM)
## # A tibble: 5 × 1
##   ATM  
##   <chr>
## 1 ATM1 
## 2 ATM2 
## 3 <NA> 
## 4 ATM3 
## 5 ATM4

There are total of 4 atm machines and some na values that need to be addressed.

Converting dates

The dates are not in date format so I will use the mutate function to address that and set the format using the standard excel date.

atm_data <- atm_data |> 
  mutate(DATE = as.Date(DATE, origin = "1899-12-30"))

Removing NA Values

Creating a new variable where I will store the filtered data where there is no NA values.

atm_clean <- atm_data |> filter(!is.na(ATM))

Addressing Outliers

The max value is way to high compared to other values so I will filter out and replace with NA values greater than 5000 for now.

atm_clean <- atm_clean |> 
  mutate(Cash = ifelse(ATM == "ATM4" & Cash > 5000, NA, Cash))

Checking

Checking to see what the max value is now and if the filtering solved the issue

atm_clean |> filter(ATM == "ATM4") |> arrange(desc(Cash)) |> head(5)
## # A tibble: 5 × 3
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-09-22 ATM4  1712.
## 2 2010-01-29 ATM4  1575.
## 3 2009-09-04 ATM4  1495.
## 4 2009-06-09 ATM4  1484.
## 5 2009-09-05 ATM4  1301.

Cleaning the rest of NA values

Filtering through the na values again that occurred when resolving the outliar issue.

atm_clean <- atm_clean |> 
  group_by(ATM) |> 
  mutate(Cash = na.approx(Cash, na.rm = FALSE)) |> 
  ungroup() 
atm_clean |> summarise(remaining_NAs = sum(is.na(Cash)))
## # A tibble: 1 × 1
##   remaining_NAs
##           <int>
## 1             0

Visualization

For this plot its best to do a facet wrap to see the graphs side by side and see patterns of seaonality

atm_clean |> 
  ggplot(aes(x = DATE, y = Cash)) + 
  geom_line(color = "steelblue") + 
  facet_wrap(~ ATM, scales = "free_y") + 
  theme_light() + 
  labs(
    title = "Daily ATM Cash Withdrawals", 
    x = "DATE", 
    y = "Cash (in hundreds $)"
  )

ATM1 and ATM2 look the most similar and overtime they appear to be the sot stable with no clear trend. The rapid changes signify the seasonality of the data, most likely weekly.

ATM3 looks like the machine was either down or not working as it has 0 and then a spike. ATM4 is on a different scale as it appears to be more busy than any of the other ATMs.

Modeling and Forecasing

ATM1 - Pattern

Using the mutate function I extracted the day of the week from each dateto show the avg withdrawal of cash from the ATM1 per day. This will show the daily pattern.

atm_clean |> 
  filter(ATM == "ATM1") |> 
  mutate(day_of_week = wday(DATE, label = TRUE)) |> 
  group_by(day_of_week) |> 
  summarise(avg_cash = mean(Cash))
## # A tibble: 7 × 2
##   day_of_week avg_cash
##   <ord>          <dbl>
## 1 Sun            103. 
## 2 Mon             86.5
## 3 Tue             89.9
## 4 Wed             82.2
## 5 Thu             31.7
## 6 Fri             98.6
## 7 Sat             97.3

The weekend has the highest ATM withdrawals during the week.

ATM2 - Pattern

Similar process here to test for daily trends

atm_clean |> 
  filter(ATM == "ATM2") |> 
  mutate(day_of_week = wday(DATE, label = TRUE)) |> 
  group_by(day_of_week) |> 
  summarise(avg_cash = mean(Cash))
## # A tibble: 7 × 2
##   day_of_week avg_cash
##   <ord>          <dbl>
## 1 Sun             67.2
## 2 Mon             58.7
## 3 Tue             73.2
## 4 Wed             43.9
## 5 Thu             26.6
## 6 Fri             92.0
## 7 Sat             76.0

Friday here is the peak while THurdsday has the smallest amount of withdrawals.

ATM3 - Pattern

Just as the graph shows, most of the values are just zeros for this ATM.

atm_clean |> 
  filter(ATM == "ATM3") |> 
  mutate(day_of_week = wday(DATE, label = TRUE)) |> 
  group_by(day_of_week) |> 
  summarise(avg_cash = mean(Cash))
## # A tibble: 7 × 2
##   day_of_week avg_cash
##   <ord>          <dbl>
## 1 Sun             0   
## 2 Mon             0   
## 3 Tue             0   
## 4 Wed             1.85
## 5 Thu             1.58
## 6 Fri             1.60
## 7 Sat             0
atm4 <- atm_clean |> 
  filter(ATM == "ATM4") |> 
  mutate(day_of_week = wday(DATE, label = TRUE))
atm4 |> 
  group_by(day_of_week) |> 
  summarise(avg_cash = mean(Cash))
## # A tibble: 7 × 2
##   day_of_week avg_cash
##   <ord>          <dbl>
## 1 Sun             539.
## 2 Mon             481.
## 3 Tue             442.
## 4 Wed             414.
## 5 Thu             170.
## 6 Fri             574.
## 7 Sat             492.

The pattern appears to be similar where Friday has the highest number.

ATM3

atm_clean |> 
  filter(ATM == "ATM3") |> 
  summarise(
    n_total = n(), 
    n_nonzero = sum(Cash > 0 ), 
    first_nonzero = min(DATE[Cash > 0])
  )
## # A tibble: 1 × 3
##   n_total n_nonzero first_nonzero
##     <int>     <int> <date>       
## 1     365         3 2010-04-28
atm_clean |> 
  filter(ATM == "ATM3", Cash > 0)
## # A tibble: 3 × 3
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2010-04-28 ATM3     96
## 2 2010-04-29 ATM3     82
## 3 2010-04-30 ATM3     85

Visualization of Time Series

atm1 <- atm_clean |> 
  filter(ATM == "ATM1") |>
  arrange(DATE) |> 
  pull(Cash) |> 
  ts(frequency = 7)

Fitting into ARIMA Model

atm1_fit <- auto.arima(atm1)
summary(atm1_fit)
## Series: atm1 
## ARIMA(0,0,1)(0,1,2)[7] 
## 
## Coefficients:
##          ma1     sma1     sma2
##       0.1950  -0.5806  -0.1037
## s.e.  0.0546   0.0506   0.0494
## 
## sigma^2 = 556.4:  log likelihood = -1640.01
## AIC=3288.03   AICc=3288.14   BIC=3303.55
## 
## Training set error measures:
##                       ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.07416536 23.26239 14.53988 -102.2594 117.1482 0.8215398
##                      ACF1
## Training set -0.008720072

The autofit works well here since there is a weekly cycle.

Generating the forecast

atm1_forecast <- forecast(atm1_fit, h = 31)
autoplot(atm1_forecast) + 
  theme_light() + 
  labs(
    title = "ATM 1 Forecast"
  )

ARM2

atm2 <- atm_clean |> 
  filter(ATM == "ATM2") |> 
  arrange(DATE) |> 
  pull(Cash) |> 
  ts(frequency = 7)
atm2_fit <- auto.arima(atm2)
summary(atm2_fit)
## Series: atm2 
## ARIMA(2,0,2)(0,1,1)[7] 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2     sma1
##       -0.4320  -0.9130  0.4773  0.8048  -0.7547
## s.e.   0.0553   0.0407  0.0861  0.0556   0.0381
## 
## sigma^2 = 602.5:  log likelihood = -1653.67
## AIC=3319.33   AICc=3319.57   BIC=3342.61
## 
## Training set error measures:
##                      ME     RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set -0.8907648 24.13932 17.04479 -Inf  Inf 0.8199456 -0.004275338

ATM2 Forecast

atm2_forecast <- forecast(atm2_fit, h = 31)
autoplot(atm2_forecast) + 
  theme_minimal() + 
  labs(title = "ATM2 Forecast")

ATM4

atm4 <- atm_clean |> 
  filter(ATM == "ATM4") |>
  arrange(DATE) |> 
  pull(Cash) |>
  ts(frequency = 7)
atm4_fit <- auto.arima(atm4) 
summary(atm4_fit)
## Series: atm4 
## ARIMA(0,0,3)(1,0,0)[7] with non-zero mean 
## 
## Coefficients:
##          ma1     ma2     ma3    sar1      mean
##       0.0891  0.0185  0.0892  0.1831  444.1423
## s.e.  0.0532  0.0585  0.0516  0.0527   26.1912
## 
## sigma^2 = 119496:  log likelihood = -2649.15
## AIC=5310.29   AICc=5310.53   BIC=5333.69
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE    MAPE      MASE
## Training set -0.2182895 343.3066 287.5528 -518.8467 552.121 0.8300788
##                      ACF1
## Training set -0.008472808

ATM4 Forecast

atm4_forecast <- forecast(atm4_fit, h = 31)
autoplot(atm4_forecast) + 
  theme_minimal() + 
  labs(title = "ATM4 Forecast")

Creating an Excel Readable File

Creating May Dates

dates <- seq(as.Date("2010-05-01"), as.Date("2010-05-31"), by = "day")

Combining all forecasts into a table

forecast_table <- data.frame(
  DATE = dates, 
  ATM1 = as.numeric(atm1_forecast$mean),
  ATM2 = as.numeric(atm2_forecast$mean),
  ATM3 = rep(mean(c(96, 82, 85)), 31),
  ATM4 = as.numeric(atm4_forecast$mean)
)

Printing the forecast table

forecast_table
##          DATE       ATM1      ATM2     ATM3     ATM4
## 1  2010-05-01  87.167893 66.436775 87.66667 361.2625
## 2  2010-05-02 104.055911 72.717930 87.66667 432.1130
## 3  2010-05-03  73.149472 13.674789 87.66667 450.3253
## 4  2010-05-04   8.032036  5.297995 87.66667 365.3366
## 5  2010-05-05 100.006619 97.634046 87.66667 426.5774
## 6  2010-05-06  80.857483 88.290342 87.66667 370.9293
## 7  2010-05-07  86.669742 64.988924 87.66667 451.1258
## 8  2010-05-08  87.958452 66.444443 87.66667 428.9687
## 9  2010-05-09 102.747019 72.727023 87.66667 441.9400
## 10 2010-05-10  73.390751 13.663861 87.66667 445.2743
## 11 2010-05-11   8.588613  5.294414 87.66667 429.7146
## 12 2010-05-12 100.653148 97.645570 87.66667 440.9265
## 13 2010-05-13  80.393212 88.288633 87.66667 430.7385
## 14 2010-05-14  86.927292 64.979141 87.66667 445.4209
## 15 2010-05-15  88.030443 66.450229 87.66667 441.3643
## 16 2010-05-16 102.747019 72.733455 87.66667 443.7391
## 17 2010-05-17  73.390751 13.655799 87.66667 444.3496
## 18 2010-05-18   8.588613  5.292024 87.66667 441.5009
## 19 2010-05-19 100.653148 97.653963 87.66667 443.5536
## 20 2010-05-20  80.393212 88.287189 87.66667 441.6884
## 21 2010-05-21  86.927292 64.972102 87.66667 444.3764
## 22 2010-05-22  88.030443 66.454588 87.66667 443.6337
## 23 2010-05-23 102.747019 72.737999 87.66667 444.0685
## 24 2010-05-24  73.390751 13.649857 87.66667 444.1803
## 25 2010-05-25   8.588613  5.290443 87.66667 443.6587
## 26 2010-05-26 100.653148 97.660071 87.66667 444.0345
## 27 2010-05-27  80.393212 88.285994 87.66667 443.6930
## 28 2010-05-28  86.927292 64.967041 87.66667 444.1852
## 29 2010-05-29  88.030443 66.457865 87.66667 444.0492
## 30 2010-05-30 102.747019 72.741204 87.66667 444.1288
## 31 2010-05-31  73.390751 13.645480 87.66667 444.1493

Excel write up

write.xlsx(forecast_table, "ATM_Forecast_May2010.xlsx")

After fitting the ARIMA models for each of the ATMs, the different types of patterns require different approaches. Since ATM1 and ATM2 follow similar pattenrs they were fitted almost the same way.

Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

Exploring Data

Taking a closer look at the data similar to part A.

head(residential_data)
## # A tibble: 6 × 3
##   CaseSequence `YYYY-MMM`     KWH
##          <dbl> <chr>        <dbl>
## 1          733 1998-Jan   6862583
## 2          734 1998-Feb   5838198
## 3          735 1998-Mar   5420658
## 4          736 1998-Apr   5010364
## 5          737 1998-May   4665377
## 6          738 1998-Jun   6467147
str(residential_data)
## tibble [192 × 3] (S3: tbl_df/tbl/data.frame)
##  $ CaseSequence: num [1:192] 733 734 735 736 737 738 739 740 741 742 ...
##  $ YYYY-MMM    : chr [1:192] "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
##  $ KWH         : num [1:192] 6862583 5838198 5420658 5010364 4665377 ...
summary(residential_data)
##   CaseSequence     YYYY-MMM              KWH          
##  Min.   :733.0   Length:192         Min.   :  770523  
##  1st Qu.:780.8   Class :character   1st Qu.: 5429912  
##  Median :828.5   Mode  :character   Median : 6283324  
##  Mean   :828.5                      Mean   : 6502475  
##  3rd Qu.:876.2                      3rd Qu.: 7620524  
##  Max.   :924.0                      Max.   :10655730  
##                                     NA's   :1

THis shows the shape and the types of values within the data set.

Converting numbers into time series of 12 month cycle

kwh_ts <- ts(residential_data$KWH, start = c(1998, 1), frequency = 12)
sum(is.na(kwh_ts))
## [1] 1

Fixing the na values

kwh_ts <- na.approx(kwh_ts)
sum(is.na(kwh_ts))
## [1] 0

Visualization

autoplot(kwh_ts) + 
  theme_light() + 
  labs(
    title = "Residential Power Usage",
    x = "Year", 
    y = "KWH"
  )

The graph definitly shows a seasonal pattern. There is a dip after 2010, which could be an outlier.

CHecking for source of outliar

which.min(kwh_ts)
## [1] 151
which.max(kwh_ts)
## [1] 181
kwh_ts[151]
## [1] 770523
kwh_ts[181]
## [1] 10655730

The min value is way to small so I will adjust it.

kwh_ts[151] <- NA
kwh_ts <- na.approx(kwh_ts)

Plotting again

autoplot(kwh_ts) + 
  theme_light() + 
  labs(
    title = "Residential Power Usage", 
    x = "Year", 
    y = "KWH"
  )

Modeling and Forecasing

Fitting the ARIMA model

kwh_fit <- auto.arima(kwh_ts)
summary(kwh_fit)
## Series: kwh_ts 
## ARIMA(3,0,2)(2,1,0)[12] with drift 
## 
## Coefficients:
##           ar1      ar2     ar3     ma1     ma2     sar1     sar2     drift
##       -0.6085  -0.2572  0.3404  0.9446  0.5421  -0.7399  -0.4226  9011.908
## s.e.   0.3130   0.2642  0.0863  0.3221  0.3464   0.0799   0.0825  2990.939
## 
## sigma^2 = 3.87e+11:  log likelihood = -2657.12
## AIC=5332.25   AICc=5333.31   BIC=5360.99
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -8408.93 588799.3 428524.6 -0.794707 6.490438 0.6866794
##                      ACF1
## Training set 0.0004608235

The ARIMA model is showing seasonality with a drift with an upward trend.

Forecast

kwh_forecast <- forecast(kwh_fit, h = 12)
autoplot(kwh_forecast) + 
  theme_light() + 
  labs( 
    title = "Residential Power Usage",
    x = "Year", 
    y = "KWH"
    )

## Exporting the table into excel

kwh_forecast_table <- data.frame(
  month = seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month"), 
  KWH_Forecast = as.numeric(kwh_forecast$mean)
)
write.xlsx(kwh_forecast_table, "Residential_Power_Forecast.xlsx")