library(fpp3)
## ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.4
## ✔ dplyr 1.1.3 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.0 ✔ feasts 0.3.1
## ✔ lubridate 1.9.3 ✔ fable 0.3.3
## ✔ ggplot2 3.4.4 ✔ fabletools 0.3.4
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(dplyr)
library(ggplot2)
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ purrr 1.0.2 ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(fable)
library(tidyr)
library(lubridate)
library(fabletools)
library(distributional)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following object is masked from 'package:tsibble':
##
## key
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(writexl)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following object is masked from 'package:tsibble':
##
## index
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(seasonal)
##
## Attaching package: 'seasonal'
##
## The following object is masked from 'package:tibble':
##
## view
atm_data <- read_excel("/Users/briansingh/Desktop/CUNY/Spring 2024/Data 624/Project1/ATM624Data.xlsx")
head(atm_data)
## # A tibble: 6 × 3
## DATE ATM Cash
## <dbl> <chr> <dbl>
## 1 39934 ATM1 96
## 2 39934 ATM2 107
## 3 39935 ATM1 82
## 4 39935 ATM2 89
## 5 39936 ATM1 85
## 6 39936 ATM2 90
#convert date from excel import
atm_data <- atm_data %>%
mutate(DATE = as.Date("1900-01-01") + lubridate::days(DATE))
#summary and descriptive statistics
summary(atm_data)
## DATE ATM Cash
## Min. :2009-05-03 Length:1474 Min. : 0.0
## 1st Qu.:2009-08-03 Class :character 1st Qu.: 0.5
## Median :2009-11-03 Mode :character Median : 73.0
## Mean :2009-11-02 Mean : 155.6
## 3rd Qu.:2010-02-03 3rd Qu.: 114.0
## Max. :2010-05-16 Max. :10919.8
## NA's :19
We can see there is a minimum value of 0 and 19 total NA values. We’d have to address for forecasting purposes. We can take a look at the withdrawals for each ATM over time.
ggplot(atm_data, aes(x = DATE, y = Cash, color = ATM)) +
geom_line() +
labs(title = "Cash Withdrawn by ATM",
x = "Date",
y = "Cash",
color = "ATM")
## Warning: Removed 14 rows containing missing values (`geom_line()`).
We can see an outlier in ATM 4 around March of 2010 with a date with a withdrawal total over $10K. We may would have to normalize in for a more accurate model.
Let’s break up each ATM and look at them on their own scales.
atm_data %>%
filter(!is.na(ATM)) %>%
ggplot(aes(x = DATE, y = Cash, col = ATM)) +
geom_line(show.legend = FALSE) +
facet_wrap(~ ATM, ncol = 1, scales = "free_y") +
labs(title = "ATM Withdrawals", x = "Date") +
scale_y_continuous("Withdrawals (hundreds)")
ATM 3 appears to have no withdrawals until after April 2010. It could be that it was not put into use until then. This should probably be excluded in the forecasting model as it will not have enough data input to generate a reliable prediction. All in all, each ATM does not appear to have a trend, but definitely shows seasonality.
Let’s convert the data to a tsibble for time series forecasting. We’d need to have each ATM broken out with each of their individual values in order to pass into the model. We can see above that an NA ATM is being generated which we’d have to ignore – it holds no values.
atm_data_2 <- atm_data %>%
filter(!is.na(ATM)) %>%
filter(!is.na(Cash)) #5 missing values, will drop for modeling
#replace outlier with median
atm_data_2 <- atm_data_2 %>%
mutate(Cash = ifelse(ATM == "ATM4" & Cash > 10000, median(Cash), Cash))
atm_tsibble <- as_tsibble(atm_data_2, key = ATM, index = DATE)
atm_tsibble <- fill_gaps(atm_tsibble)
atm_tsibble <- atm_tsibble %>%
fill(Cash)
atm_tsibble_filtered <- filter(atm_tsibble, year(DATE) == 2009 | (year(DATE) == 2010 & month(DATE) < 5))
#atm 3 does not begin until April 28, 2010, need to adjust the start date for modeling
start_date_atm3 <- max(filter(atm_tsibble, ATM == "ATM3")$DATE)
atm_tsibble_filtered <- atm_tsibble_filtered %>%
filter(!(ATM == "ATM3" & DATE < start_date_atm3))
#time series plot for each atm
atm_tsibble_filtered %>%
ggplot(aes(x = DATE, y = Cash, color = ATM)) +
geom_line() +
labs(title = "Cash Withdrawn from ATMs Over Time",
x = "Date",
y = "Cash Withdrawn") +
theme_minimal()
#decomposition for seasonality and trend
atm_decomp <- atm_tsibble_filtered %>%
model(
decomposed = STL(Cash ~ season(window = Inf))
) %>%
components()
atm_decomp %>% autoplot()
#visualize deomp
atm_decomp %>%
ggplot(aes(x = DATE, y = Cash, color = .model)) +
geom_line() +
facet_wrap(~.model, scales = "free_y") +
labs(title = "Decomposition of Cash Withdrawn from ATMs",
x = "Date",
y = "Cash Withdrawn") +
theme_minimal()
#fit the model using ARIMA and Seasonal Naive
atm_forecasts <- atm_tsibble_filtered %>%
model(
arima = ARIMA(Cash),
snaive = SNAIVE(Cash)
) %>%
forecast(h = "1 month")
#plot forecasts for each ATM based on ARIMA and SNAIVE
atm_forecasts %>%
autoplot(atm_tsibble_filtered) +
labs(title = "Forecast of Cash Withdrawn from ATMs",
x = "Date",
y = "Cash Withdrawn",
color = "ATM") +
theme_minimal()
fit <- atm_tsibble_filtered %>%
model(ARIMA(Cash))
report(fit)
## Warning in report.mdl_df(fit): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.
## # A tibble: 3 × 9
## ATM .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ATM1 ARIMA(Cash) 562. -1633. 3273. 3273. 3289. <cpl [0]> <cpl [15]>
## 2 ATM2 ARIMA(Cash) 605. -1645. 3302. 3302. 3325. <cpl [2]> <cpl [9]>
## 3 ATM4 ARIMA(Cash) 118135. -2632. 5279. 5280. 5310. <cpl [10]> <cpl [2]>
fit %>% filter(ATM=="ATM1") %>% report()
## Series: Cash
## Model: ARIMA(0,0,1)(0,1,2)[7]
##
## Coefficients:
## ma1 sma1 sma2
## 0.1996 -0.5808 -0.1042
## s.e. 0.0546 0.0507 0.0495
##
## sigma^2 estimated as 561.6: log likelihood=-1632.53
## AIC=3273.05 AICc=3273.17 BIC=3288.55
fit %>% filter(ATM=="ATM2") %>% report()
## Series: Cash
## Model: ARIMA(2,0,2)(0,1,1)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4282 -0.9223 0.4638 0.8178 -0.7554
## s.e. 0.0501 0.0392 0.0791 0.0537 0.0391
##
## sigma^2 estimated as 604.6: log likelihood=-1645.05
## AIC=3302.1 AICc=3302.34 BIC=3325.35
fit %>% filter(ATM=="ATM4") %>% report()
## Series: Cash
## Model: ARIMA(3,0,2)(1,0,0)[7] w/ mean
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 sar1 constant
## -0.8723 -0.6801 0.1696 0.9590 0.8110 0.2464 797.0464
## s.e. 0.1003 0.1188 0.0567 0.0903 0.1078 0.0559 49.1564
##
## sigma^2 estimated as 118135: log likelihood=-2631.63
## AIC=5279.26 AICc=5279.67 BIC=5310.41
fit %>%
filter(ATM == "ATM1") %>%
gg_tsresiduals()
fit %>%
filter(ATM == "ATM2") %>%
gg_tsresiduals()
fit %>%
filter(ATM == "ATM4") %>%
gg_tsresiduals()
fit_atm1 <- fit %>% filter(ATM=="ATM1")%>%forecast(h=31)
fit_atm2 <- fit %>% filter(ATM=="ATM2")%>%forecast(h=31)
fit_atm4 <- fit %>% filter(ATM=="ATM4")%>%forecast(h=31)
SNAIVE
fit_snaive <- atm_tsibble_filtered %>%
model(SNAIVE(Cash))
fit_snaive %>% filter(ATM == "ATM1") %>% gg_tsresiduals()
## Warning: Removed 7 rows containing missing values (`geom_line()`).
## Warning: Removed 7 rows containing missing values (`geom_point()`).
## Warning: Removed 7 rows containing non-finite values (`stat_bin()`).
fit_snaive %>% filter(ATM == "ATM2") %>% gg_tsresiduals()
## Warning: Removed 7 rows containing missing values (`geom_line()`).
## Warning: Removed 7 rows containing missing values (`geom_point()`).
## Warning: Removed 7 rows containing non-finite values (`stat_bin()`).
fit_snaive %>% filter(ATM == "ATM4") %>% gg_tsresiduals()
## Warning: Removed 7 rows containing missing values (`geom_line()`).
## Warning: Removed 7 rows containing missing values (`geom_point()`).
## Warning: Removed 7 rows containing non-finite values (`stat_bin()`).
A spike in the lag at day 7 in each of the ATMs, which may indicate strong weekly seasonality. Otherwise, appears to be white noise with no systematic change in variance over time, . The residuals appear to be centered around zero and follow a normal distribution. Also, there appears to be constant variance, which holds for a good model.
fit_snaive %>% filter(ATM == "ATM1") %>%
accuracy()
## # A tibble: 1 × 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 SNAIVE(Cash) Training -0.0730 27.9 17.8 -97.1 117. 1 1 0.193
fit_snaive %>% filter(ATM == "ATM2") %>%
accuracy()
## # A tibble: 1 × 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM2 SNAIVE(Cash) Training -0.0674 30.1 20.7 -Inf Inf 1 1 0.0458
fit_snaive %>% filter(ATM == "ATM4") %>%
accuracy()
## # A tibble: 1 × 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM4 SNAIVE(Cash) Training -3.65 454. 347. -393. 448. 1 1 0.0643
atm_tsibble_filtered %>%
filter(ATM == "ATM1") %>%
model(SNAIVE(Cash ~ lag("week"))) %>%
forecast(h=31) %>%
autoplot(atm_tsibble_filtered)
#atm1 snaive forecast
atm1_forecast_snaive <- atm_tsibble_filtered %>%
filter(ATM == "ATM1") %>%
model(SNAIVE(Cash ~ lag("week"))) %>%
forecast(h=31)
#atm2 snaive forecast
atm2_forecast_snaive <- atm_tsibble_filtered %>%
filter(ATM == "ATM2") %>%
model(SNAIVE(Cash ~ lag("week"))) %>%
forecast(h=31)
#atm4 snaive forecast
atm4_forecast_snaive <- atm_tsibble_filtered %>%
filter(ATM == "ATM4") %>%
model(SNAIVE(Cash ~ lag("week"))) %>%
forecast(h=31)
#combine the seasonal naive forecasts into a df and export to excel
forecast_df_snaive <- rbindlist(list(atm1_forecast_snaive,atm2_forecast_snaive,atm4_forecast_snaive))
write_xlsx(forecast_df_snaive, "forecast_atmsnaive_combined.xlsx")
kwh_data <- read_excel("/Users/briansingh/Desktop/CUNY/Spring 2024/Data 624/Project1/ResidentialCustomerForecastLoad-624.xlsx",col_names = TRUE)
names(kwh_data)[names(kwh_data) == "YYYY-MMM"] <- "Date"
summary(kwh_data)
## CaseSequence Date KWH
## Min. :733.0 Length:192 Min. : 770523
## 1st Qu.:780.8 Class :character 1st Qu.: 5429912
## Median :828.5 Mode :character Median : 6283324
## Mean :828.5 Mean : 6502475
## 3rd Qu.:876.2 3rd Qu.: 7620524
## Max. :924.0 Max. :10655730
## NA's :1
#convert from character to monthly for time series
kwh_data$Date <- yearmonth(as.Date(as.yearmon(kwh_data$Date, "%Y-%b")))
#one missing value in KWH -- replace with median so no gaps in time series
kwh_data$KWH[is.na(kwh_data$KWH)] = median(kwh_data$KWH, na.rm = TRUE)
#case sequence has no significance in this
kwh_timeseries <- kwh_data %>%
as_tsibble(index=Date) %>%
select(-CaseSequence)
kwh_timeseries %>%
pivot_longer(-Date) %>%
ggplot(aes(x = Date, y = value, colour = name)) +
geom_line() +
facet_grid(name ~ ., scales = "free_y")
I can see seasonal data with a slight upwards trend after approximately 2011.
kwh_timeseries %>%
gg_season(KWH)
July has a large outlier which we can address and normalize for modeling. Lets look at the decomposition to identify trend and seasonality.
kwh_timeseries %>%
model(
classical_decomposition(KWH, type = "multiplicative")
) %>%
components() %>%
autoplot() +
labs(title = "Classical decomposition with type = multiplicative")
## Warning: Removed 6 rows containing missing values (`geom_line()`).
As seen above, definitely seasonal data with an upward trend beginning after 2010. It is not steadily increasing, so drifting would not be ideal for forecasting. We can use seasonal naive.
fit_kwh_snaive <- kwh_timeseries %>%
model(SNAIVE(KWH))
fit_kwh_snaive %>% gg_tsresiduals()
## Warning: Removed 12 rows containing missing values (`geom_line()`).
## Warning: Removed 12 rows containing missing values (`geom_point()`).
## Warning: Removed 12 rows containing non-finite values (`stat_bin()`).
The residuals appear to be centered around zero and follow a normal distribution. Also, there appears to be constant variance, with the exception of a couple of outliers, which holds for a good model.
lambda <- kwh_timeseries %>%
features(KWH,features = guerrero) %>%
pull(lambda_guerrero)
kwh_timeseries %>%
autoplot(box_cox(KWH, lambda))
Lambda of 0.1041 can be used to apply box-cox to normalize the data. However, did not appear very helpful.
kwh_snaive_forecast <- fit_kwh_snaive %>%
forecast(h=12)
kwh_snaive_forecast
## # A fable: 12 x 4 [1M]
## # Key: .model [1]
## .model Date KWH .mean
## <chr> <mth> <dist> <dbl>
## 1 SNAIVE(KWH) 2014 Jan N(1.1e+07, 1.4e+12) 10655730
## 2 SNAIVE(KWH) 2014 Feb N(7681798, 1.4e+12) 7681798
## 3 SNAIVE(KWH) 2014 Mar N(6517514, 1.4e+12) 6517514
## 4 SNAIVE(KWH) 2014 Apr N(6105359, 1.4e+12) 6105359
## 5 SNAIVE(KWH) 2014 May N(5940475, 1.4e+12) 5940475
## 6 SNAIVE(KWH) 2014 Jun N(7920627, 1.4e+12) 7920627
## 7 SNAIVE(KWH) 2014 Jul N(8415321, 1.4e+12) 8415321
## 8 SNAIVE(KWH) 2014 Aug N(9080226, 1.4e+12) 9080226
## 9 SNAIVE(KWH) 2014 Sep N(8e+06, 1.4e+12) 7968220
## 10 SNAIVE(KWH) 2014 Oct N(5759367, 1.4e+12) 5759367
## 11 SNAIVE(KWH) 2014 Nov N(5769083, 1.4e+12) 5769083
## 12 SNAIVE(KWH) 2014 Dec N(9606304, 1.4e+12) 9606304
kwh_snaive_forecast %>%
autoplot(kwh_timeseries)
kwh_snaive_forecast_df <- data.frame(kwh_snaive_forecast)
write_xlsx(kwh_snaive_forecast_df, "forecast_kwh.xlsx")
It does appear to be a good model and follows the overall trend and seasonality of the original data.