library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readxl)
library(openxlsx)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tseries)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.4.3
##
## Attaching package: 'tsibble'
## The following object is masked from 'package:zoo':
##
## index
## The following object is masked from 'package:lubridate':
##
## interval
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(feasts)
## Warning: package 'feasts' was built under R version 4.4.3
## Loading required package: fabletools
## Warning: package 'fabletools' was built under R version 4.4.3
## Registered S3 methods overwritten by 'ggtime':
## method from
## autolayer.fbl_ts fabletools
## autolayer.tbl_ts fabletools
## autoplot.dcmp_ts fabletools
## autoplot.fbl_ts fabletools
## autoplot.tbl_ts fabletools
## fortify.fbl_ts fabletools
library(ggplot2)
library(fable)
## Warning: package 'fable' was built under R version 4.4.3
atm_data <- read_excel("/Users/tatyanatitova/Documents/SPRING2026/DATA624/ATM624Data.xlsx")
residential_data <- read_excel("/Users/tatyanatitova/Documents/SPRING2026/DATA624/ResidentialCustomerForecastLoad-624.xlsx")
For this project, it is important to examine and explore the data along the way. So first, I am checking the data set structure of the imported excel for the atm data.
head(atm_data)
## # A tibble: 6 × 3
## DATE ATM Cash
## <dbl> <chr> <dbl>
## 1 39934 ATM1 96
## 2 39934 ATM2 107
## 3 39935 ATM1 82
## 4 39935 ATM2 89
## 5 39936 ATM1 85
## 6 39936 ATM2 90
summary(atm_data)
## DATE ATM Cash
## Min. :39934 Length:1474 Min. : 0.0
## 1st Qu.:40026 Class :character 1st Qu.: 0.5
## Median :40118 Mode :character Median : 73.0
## Mean :40118 Mean : 155.6
## 3rd Qu.:40210 3rd Qu.: 114.0
## Max. :40312 Max. :10919.8
## NA's :19
str(atm_data)
## tibble [1,474 × 3] (S3: tbl_df/tbl/data.frame)
## $ DATE: num [1:1474] 39934 39934 39935 39935 39936 ...
## $ ATM : chr [1:1474] "ATM1" "ATM2" "ATM1" "ATM2" ...
## $ Cash: num [1:1474] 96 107 82 89 85 90 90 55 99 79 ...
This shows that the dataset has 1474 rows and 3 columns that are the DATE, ATM, and CASH. Upon initial review I can see that there are some formatting issues with the date column, missing cash values (the zeros for minumum), and a very high max for cash.
atm_data |> distinct(ATM)
## # A tibble: 5 × 1
## ATM
## <chr>
## 1 ATM1
## 2 ATM2
## 3 <NA>
## 4 ATM3
## 5 ATM4
There are total of 4 atm machines and some na values that need to be addressed.
The dates are not in date format so I will use the mutate function to address that and set the format using the standard excel date.
atm_data <- atm_data |>
mutate(DATE = as.Date(DATE, origin = "1899-12-30"))
Creating a new variable where I will store the filtered data where there is no NA values.
atm_clean <- atm_data |> filter(!is.na(ATM))
The max value is way to high compared to other values so I will filter out and replace with NA values greater than 5000 for now.
atm_clean <- atm_clean |>
mutate(Cash = ifelse(ATM == "ATM4" & Cash > 5000, NA, Cash))
Checking to see what the max value is now and if the filtering solved the issue
atm_clean |> filter(ATM == "ATM4") |> arrange(desc(Cash)) |> head(5)
## # A tibble: 5 × 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-09-22 ATM4 1712.
## 2 2010-01-29 ATM4 1575.
## 3 2009-09-04 ATM4 1495.
## 4 2009-06-09 ATM4 1484.
## 5 2009-09-05 ATM4 1301.
Filtering through the na values again that occurred when resolving the outliar issue.
atm_clean <- atm_clean |>
group_by(ATM) |>
mutate(Cash = na.approx(Cash, na.rm = FALSE)) |>
ungroup()
atm_clean |> summarise(remaining_NAs = sum(is.na(Cash)))
## # A tibble: 1 × 1
## remaining_NAs
## <int>
## 1 0
For this plot its best to do a facet wrap to see the graphs side by side and see patterns of seaonality
atm_clean |>
ggplot(aes(x = DATE, y = Cash)) +
geom_line(color = "steelblue") +
facet_wrap(~ ATM, scales = "free_y") +
theme_light() +
labs(
title = "Daily ATM Cash Withdrawals",
x = "DATE",
y = "Cash (in hundreds $)"
)
ATM1 and ATM2 look the most similar and overtime they appear to be the
sot stable with no clear trend. The rapid changes signify the
seasonality of the data, most likely weekly.
ATM3 looks like the machine was either down or not working as it has 0 and then a spike. ATM4 is on a different scale as it appears to be more busy than any of the other ATMs.
Using the mutate function I extracted the day of the week from each dateto show the avg withdrawal of cash from the ATM1 per day. This will show the daily pattern.
atm_clean |>
filter(ATM == "ATM1") |>
mutate(day_of_week = wday(DATE, label = TRUE)) |>
group_by(day_of_week) |>
summarise(avg_cash = mean(Cash))
## # A tibble: 7 × 2
## day_of_week avg_cash
## <ord> <dbl>
## 1 Sun 103.
## 2 Mon 86.5
## 3 Tue 89.9
## 4 Wed 82.2
## 5 Thu 31.7
## 6 Fri 98.6
## 7 Sat 97.3
The weekend has the highest ATM withdrawals during the week.
Similar process here to test for daily trends
atm_clean |>
filter(ATM == "ATM2") |>
mutate(day_of_week = wday(DATE, label = TRUE)) |>
group_by(day_of_week) |>
summarise(avg_cash = mean(Cash))
## # A tibble: 7 × 2
## day_of_week avg_cash
## <ord> <dbl>
## 1 Sun 67.2
## 2 Mon 58.7
## 3 Tue 73.2
## 4 Wed 43.9
## 5 Thu 26.6
## 6 Fri 92.0
## 7 Sat 76.0
Friday here is the peak while THurdsday has the smallest amount of withdrawals.
Just as the graph shows, most of the values are just zeros for this ATM.
atm_clean |>
filter(ATM == "ATM3") |>
mutate(day_of_week = wday(DATE, label = TRUE)) |>
group_by(day_of_week) |>
summarise(avg_cash = mean(Cash))
## # A tibble: 7 × 2
## day_of_week avg_cash
## <ord> <dbl>
## 1 Sun 0
## 2 Mon 0
## 3 Tue 0
## 4 Wed 1.85
## 5 Thu 1.58
## 6 Fri 1.60
## 7 Sat 0
atm4 <- atm_clean |>
filter(ATM == "ATM4") |>
mutate(day_of_week = wday(DATE, label = TRUE))
atm4 |>
group_by(day_of_week) |>
summarise(avg_cash = mean(Cash))
## # A tibble: 7 × 2
## day_of_week avg_cash
## <ord> <dbl>
## 1 Sun 539.
## 2 Mon 481.
## 3 Tue 442.
## 4 Wed 414.
## 5 Thu 170.
## 6 Fri 574.
## 7 Sat 492.
The pattern appears to be similar where Friday has the highest number.
atm_clean |>
filter(ATM == "ATM3") |>
summarise(
n_total = n(),
n_nonzero = sum(Cash > 0 ),
first_nonzero = min(DATE[Cash > 0])
)
## # A tibble: 1 × 3
## n_total n_nonzero first_nonzero
## <int> <int> <date>
## 1 365 3 2010-04-28
atm_clean |>
filter(ATM == "ATM3", Cash > 0)
## # A tibble: 3 × 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-04-28 ATM3 96
## 2 2010-04-29 ATM3 82
## 3 2010-04-30 ATM3 85
atm1 <- atm_clean |>
filter(ATM == "ATM1") |>
arrange(DATE) |>
pull(Cash) |>
ts(frequency = 7)
atm1_fit <- auto.arima(atm1)
summary(atm1_fit)
## Series: atm1
## ARIMA(0,0,1)(0,1,2)[7]
##
## Coefficients:
## ma1 sma1 sma2
## 0.1950 -0.5806 -0.1037
## s.e. 0.0546 0.0506 0.0494
##
## sigma^2 = 556.4: log likelihood = -1640.01
## AIC=3288.03 AICc=3288.14 BIC=3303.55
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.07416536 23.26239 14.53988 -102.2594 117.1482 0.8215398
## ACF1
## Training set -0.008720072
The autofit works well here since there is a weekly cycle.
atm1_forecast <- forecast(atm1_fit, h = 31)
autoplot(atm1_forecast) +
theme_light() +
labs(
title = "ATM 1 Forecast"
)
atm2 <- atm_clean |>
filter(ATM == "ATM2") |>
arrange(DATE) |>
pull(Cash) |>
ts(frequency = 7)
atm2_fit <- auto.arima(atm2)
summary(atm2_fit)
## Series: atm2
## ARIMA(2,0,2)(0,1,1)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4320 -0.9130 0.4773 0.8048 -0.7547
## s.e. 0.0553 0.0407 0.0861 0.0556 0.0381
##
## sigma^2 = 602.5: log likelihood = -1653.67
## AIC=3319.33 AICc=3319.57 BIC=3342.61
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.8907648 24.13932 17.04479 -Inf Inf 0.8199456 -0.004275338
atm2_forecast <- forecast(atm2_fit, h = 31)
autoplot(atm2_forecast) +
theme_minimal() +
labs(title = "ATM2 Forecast")
atm4 <- atm_clean |>
filter(ATM == "ATM4") |>
arrange(DATE) |>
pull(Cash) |>
ts(frequency = 7)
atm4_fit <- auto.arima(atm4)
summary(atm4_fit)
## Series: atm4
## ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
##
## Coefficients:
## ma1 ma2 ma3 sar1 mean
## 0.0891 0.0185 0.0892 0.1831 444.1423
## s.e. 0.0532 0.0585 0.0516 0.0527 26.1912
##
## sigma^2 = 119496: log likelihood = -2649.15
## AIC=5310.29 AICc=5310.53 BIC=5333.69
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.2182895 343.3066 287.5528 -518.8467 552.121 0.8300788
## ACF1
## Training set -0.008472808
atm4_forecast <- forecast(atm4_fit, h = 31)
autoplot(atm4_forecast) +
theme_minimal() +
labs(title = "ATM4 Forecast")
dates <- seq(as.Date("2010-05-01"), as.Date("2010-05-31"), by = "day")
forecast_table <- data.frame(
DATE = dates,
ATM1 = as.numeric(atm1_forecast$mean),
ATM2 = as.numeric(atm2_forecast$mean),
ATM3 = rep(mean(c(96, 82, 85)), 31),
ATM4 = as.numeric(atm4_forecast$mean)
)
forecast_table
## DATE ATM1 ATM2 ATM3 ATM4
## 1 2010-05-01 87.167893 66.436775 87.66667 361.2625
## 2 2010-05-02 104.055911 72.717930 87.66667 432.1130
## 3 2010-05-03 73.149472 13.674789 87.66667 450.3253
## 4 2010-05-04 8.032036 5.297995 87.66667 365.3366
## 5 2010-05-05 100.006619 97.634046 87.66667 426.5774
## 6 2010-05-06 80.857483 88.290342 87.66667 370.9293
## 7 2010-05-07 86.669742 64.988924 87.66667 451.1258
## 8 2010-05-08 87.958452 66.444443 87.66667 428.9687
## 9 2010-05-09 102.747019 72.727023 87.66667 441.9400
## 10 2010-05-10 73.390751 13.663861 87.66667 445.2743
## 11 2010-05-11 8.588613 5.294414 87.66667 429.7146
## 12 2010-05-12 100.653148 97.645570 87.66667 440.9265
## 13 2010-05-13 80.393212 88.288633 87.66667 430.7385
## 14 2010-05-14 86.927292 64.979141 87.66667 445.4209
## 15 2010-05-15 88.030443 66.450229 87.66667 441.3643
## 16 2010-05-16 102.747019 72.733455 87.66667 443.7391
## 17 2010-05-17 73.390751 13.655799 87.66667 444.3496
## 18 2010-05-18 8.588613 5.292024 87.66667 441.5009
## 19 2010-05-19 100.653148 97.653963 87.66667 443.5536
## 20 2010-05-20 80.393212 88.287189 87.66667 441.6884
## 21 2010-05-21 86.927292 64.972102 87.66667 444.3764
## 22 2010-05-22 88.030443 66.454588 87.66667 443.6337
## 23 2010-05-23 102.747019 72.737999 87.66667 444.0685
## 24 2010-05-24 73.390751 13.649857 87.66667 444.1803
## 25 2010-05-25 8.588613 5.290443 87.66667 443.6587
## 26 2010-05-26 100.653148 97.660071 87.66667 444.0345
## 27 2010-05-27 80.393212 88.285994 87.66667 443.6930
## 28 2010-05-28 86.927292 64.967041 87.66667 444.1852
## 29 2010-05-29 88.030443 66.457865 87.66667 444.0492
## 30 2010-05-30 102.747019 72.741204 87.66667 444.1288
## 31 2010-05-31 73.390751 13.645480 87.66667 444.1493
write.xlsx(forecast_table, "ATM_Forecast_May2010.xlsx")
After fitting the ARIMA models for each of the ATMs, the different types of patterns require different approaches. Since ATM1 and ATM2 follow similar pattenrs they were fitted almost the same way.
Taking a closer look at the data similar to part A.
head(residential_data)
## # A tibble: 6 × 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
str(residential_data)
## tibble [192 × 3] (S3: tbl_df/tbl/data.frame)
## $ CaseSequence: num [1:192] 733 734 735 736 737 738 739 740 741 742 ...
## $ YYYY-MMM : chr [1:192] "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
## $ KWH : num [1:192] 6862583 5838198 5420658 5010364 4665377 ...
summary(residential_data)
## CaseSequence YYYY-MMM KWH
## Min. :733.0 Length:192 Min. : 770523
## 1st Qu.:780.8 Class :character 1st Qu.: 5429912
## Median :828.5 Mode :character Median : 6283324
## Mean :828.5 Mean : 6502475
## 3rd Qu.:876.2 3rd Qu.: 7620524
## Max. :924.0 Max. :10655730
## NA's :1
THis shows the shape and the types of values within the data set.
kwh_ts <- ts(residential_data$KWH, start = c(1998, 1), frequency = 12)
sum(is.na(kwh_ts))
## [1] 1
kwh_ts <- na.approx(kwh_ts)
sum(is.na(kwh_ts))
## [1] 0
autoplot(kwh_ts) +
theme_light() +
labs(
title = "Residential Power Usage",
x = "Year",
y = "KWH"
)
The graph definitly shows a seasonal pattern. There is a dip after 2010,
which could be an outlier.
which.min(kwh_ts)
## [1] 151
which.max(kwh_ts)
## [1] 181
kwh_ts[151]
## [1] 770523
kwh_ts[181]
## [1] 10655730
The min value is way to small so I will adjust it.
kwh_ts[151] <- NA
kwh_ts <- na.approx(kwh_ts)
Plotting again
autoplot(kwh_ts) +
theme_light() +
labs(
title = "Residential Power Usage",
x = "Year",
y = "KWH"
)
kwh_fit <- auto.arima(kwh_ts)
summary(kwh_fit)
## Series: kwh_ts
## ARIMA(3,0,2)(2,1,0)[12] with drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 sar1 sar2 drift
## -0.6085 -0.2572 0.3404 0.9446 0.5421 -0.7399 -0.4226 9011.908
## s.e. 0.3130 0.2642 0.0863 0.3221 0.3464 0.0799 0.0825 2990.939
##
## sigma^2 = 3.87e+11: log likelihood = -2657.12
## AIC=5332.25 AICc=5333.31 BIC=5360.99
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -8408.93 588799.3 428524.6 -0.794707 6.490438 0.6866794
## ACF1
## Training set 0.0004608235
The ARIMA model is showing seasonality with a drift with an upward trend.
kwh_forecast <- forecast(kwh_fit, h = 12)
autoplot(kwh_forecast) +
theme_light() +
labs(
title = "Residential Power Usage",
x = "Year",
y = "KWH"
)
## Exporting the table into excel
kwh_forecast_table <- data.frame(
month = seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month"),
KWH_Forecast = as.numeric(kwh_forecast$mean)
)
write.xlsx(kwh_forecast_table, "Residential_Power_Forecast.xlsx")