library(fpp3)
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble      3.2.1     ✔ tsibble     1.1.6
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.1
## ✔ lubridate   1.9.3     ✔ fable       0.4.1
## ✔ ggplot2     3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(readxl)
library(ggplot2)
library(openxlsx)

Part A – ATM Forecast

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file; please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast, which you will put in an Excel readable file.

atm_data <- read_excel("ATM624Data.xlsx")
str(atm_data) 
## tibble [1,474 × 3] (S3: tbl_df/tbl/data.frame)
##  $ DATE: num [1:1474] 39934 39934 39935 39935 39936 ...
##  $ ATM : chr [1:1474] "ATM1" "ATM2" "ATM1" "ATM2" ...
##  $ Cash: num [1:1474] 96 107 82 89 85 90 90 55 99 79 ...
atm_clean <- atm_data %>%
  mutate(DATE = as.Date(DATE, origin = "1899-12-30")) %>% # converting from Excel serial dates (days since 1899-12-30)
  as_tsibble(index = DATE, key = ATM)

summary(atm_clean)
##       DATE                ATM                 Cash        
##  Min.   :2009-05-01   Length:1474        Min.   :    0.0  
##  1st Qu.:2009-08-01   Class :character   1st Qu.:    0.5  
##  Median :2009-11-01   Mode  :character   Median :   73.0  
##  Mean   :2009-10-31                      Mean   :  155.6  
##  3rd Qu.:2010-02-01                      3rd Qu.:  114.0  
##  Max.   :2010-05-14                      Max.   :10919.8  
##                                          NA's   :19
atm_clean %>%
  autoplot(Cash) +
  facet_wrap(~ATM, scales = "free", nrow = 5)
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).

In the first visualization, I observed a few key issues: there is missing data (it looks like all of May 2010), most of the values for ATM3 are zeros, and ATM4 contains a significant outlier. I’ll explore each of these issues in more detail separately.

atm_clean %>%
  filter(is.na(ATM) | is.na(Cash)) %>%
  print()
## # A tsibble: 19 x 3 [1D]
## # Key:       ATM [3]
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-06-13 ATM1     NA
##  2 2009-06-16 ATM1     NA
##  3 2009-06-22 ATM1     NA
##  4 2009-06-18 ATM2     NA
##  5 2009-06-24 ATM2     NA
##  6 2010-05-01 <NA>     NA
##  7 2010-05-02 <NA>     NA
##  8 2010-05-03 <NA>     NA
##  9 2010-05-04 <NA>     NA
## 10 2010-05-05 <NA>     NA
## 11 2010-05-06 <NA>     NA
## 12 2010-05-07 <NA>     NA
## 13 2010-05-08 <NA>     NA
## 14 2010-05-09 <NA>     NA
## 15 2010-05-10 <NA>     NA
## 16 2010-05-11 <NA>     NA
## 17 2010-05-12 <NA>     NA
## 18 2010-05-13 <NA>     NA
## 19 2010-05-14 <NA>     NA

We are missing Cash values for May 2010, but that is the month we will be forecasting, so we can drop those placeholder rows. We are also missing values for a few dates in June 2009 for ATM1 and ATM2.

#removing the data for May 
atm_clean <- atm_clean %>%
  filter(!is.na(ATM))

# imputing the missing June 2009 values by linear interpolation
atm_clean <- atm_clean %>%
  group_by(ATM) %>%
  arrange(DATE) %>% 
  mutate(Cash = zoo::na.approx(Cash, na.rm = TRUE)) %>%
  ungroup()
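Linear interpolation ignores the strong day-of-week effect in these series. Here is a sketch of an alternative I considered (it would replace, not follow, the interpolation step above), filling each missing value with the observation from the same weekday one week earlier; atm_clean_alt is a hypothetical name:

# Alternative sketch: fill each NA with the value from 7 days earlier,
# so the imputed value respects the weekly pattern
atm_clean_alt <- atm_clean %>%
  group_by(ATM) %>%
  arrange(DATE) %>%
  mutate(Cash = ifelse(is.na(Cash), dplyr::lag(Cash, 7), Cash)) %>%
  ungroup()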

Since each ATM behaves quite differently, I will split the data and build a forecast for each ATM separately.

ATM 1

atm1 <- atm_clean %>%
  filter(ATM == "ATM1") 

sum(is.na(atm1$Cash)) # no missing data
## [1] 0
atm1 %>%
  gg_tsdisplay(Cash)

atm1 %>% 
  ggplot(aes(x = ATM, y = Cash)) +
  geom_boxplot(outlier.color = 'red') +
  theme_minimal()

atm1 %>%
  model(STL(Cash ~ season(window = "periodic"))) %>%
  components() %>%
  autoplot() +
  labs(title = "Decomposition for ATM1")

# no trend, strong weekly pattern

atm1_fit <- atm1 %>%
  model(
    ARIMA = ARIMA(Cash),
    ETS = ETS(Cash))

report(atm1_fit)
## Warning in report.mdl_df(atm1_fit): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 2 × 12
##   ATM   .model sigma2 log_lik   AIC  AICc   BIC ar_roots  ma_roots     MSE  AMSE
##   <chr> <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <list>    <list>     <dbl> <dbl>
## 1 ATM1  ARIMA    556.  -1640. 3288. 3288. 3304. <cpl [0]> <cpl [15]>   NA    NA 
## 2 ATM1  ETS      580.  -2234. 4487. 4488. 4526. <NULL>    <NULL>      566.  569.
## # ℹ 1 more variable: MAE <dbl>
# Model: ARIMA(0,0,1)(0,1,2)[7] ARIMA AICc = 3288.141
# Model: ETS(A,N,A) ETS AICc = 4487.6

atm1_fc <- forecast(atm1_fit, h = "31 days")

atm1_fc %>% autoplot(atm1)+ 
  labs(title = "ATM1 Forecast")

I used both ARIMA and ETS approaches to forecast the ATM1 data. The ARIMA model demonstrated better performance on every information criterion, with lower AIC (3288.03 vs. 4487.02), AICc (3288.14 vs. 4487.64), and BIC (3303.55 vs. 4526.02) than the ETS model.
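As a quick sanity check on the selected model, the ARIMA residual diagnostics could also be inspected; a minimal sketch (not run above):

# Sketch: residual diagnostics for the ATM1 ARIMA model
atm1_fit %>%
  select(ARIMA) %>%
  gg_tsresiduals()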

ATM 2

atm2 <- atm_clean %>%
  filter(ATM == "ATM2") 

atm2 %>%
  gg_tsdisplay(Cash)

atm2 %>% 
  ggplot(aes(x = ATM, y = Cash)) +
  geom_boxplot(outlier.color = 'red') +
  theme_minimal()

atm2 %>%
  model(STL(Cash ~ season(window = "periodic"))) %>%
  components() %>%
  autoplot() +
  labs(title = "Decomposition for ATM2")

The decomposition highlights a strong weekly seasonal structure and no clear trend.

#model and forecast
atm2_fit <- atm2 %>%
  model(
    ARIMA = ARIMA(Cash), 
    ETS = ETS(Cash))

report(atm2_fit)
## Warning in report.mdl_df(atm2_fit): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 2 × 12
##   ATM   .model sigma2 log_lik   AIC  AICc   BIC ar_roots  ma_roots    MSE  AMSE
##   <chr> <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <list>    <list>    <dbl> <dbl>
## 1 ATM2  ARIMA    603.  -1654. 3319. 3320. 3343. <cpl [2]> <cpl [9]>   NA    NA 
## 2 ATM2  ETS      648.  -2254. 4527. 4528. 4566. <NULL>    <NULL>     632.  634.
## # ℹ 1 more variable: MAE <dbl>
# Model: ARIMA(2,0,2)(0,1,1)[7]  ARIMA AICc = 3319.570
# Model: ETS(A,N,A) ETS AICc = 4527.998

atm2_fc <- forecast(atm2_fit, h = "31 days")

atm2_fc %>% autoplot(atm2)+ 
  labs(title = "ATM 2 Forecast")

The ATM2 cash withdrawal data was modeled using both an ARIMA and an ETS approach. The ARIMA model, specified as ARIMA(2,0,2)(0,1,1)[7], achieved an AICc of 3319.57, substantially outperforming the ETS model ETS(A,N,A), which had an AICc of 4527.998. This large difference in AICc indicates that the ARIMA model provides a much better fit to the data while appropriately accounting for weekly seasonality through its seasonal component with period 7.
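Whether the ARIMA residuals resemble white noise could be verified with a Ljung-Box test; a minimal sketch, where lag = 14 (two weekly cycles) is my choice:

# Sketch: Ljung-Box test on the ATM2 ARIMA innovation residuals
atm2_fit %>%
  select(ARIMA) %>%
  augment() %>%
  features(.innov, ljung_box, lag = 14)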

ATM 3

Since ATM3 has only three days of non-zero observations (the rest of the series is zeros), complex models like ARIMA or ETS would be unreliable and likely overfit. I will use a NAIVE model instead, which assumes that future values equal the most recent observed value.

atm3 <- atm_clean %>%
  filter(ATM == "ATM3") 

atm3 %>%
  autoplot(Cash)

# Will use a NAIVE model since we have only three days of non-zero data.

atm3_model <- atm3 %>%
  model(NAIVE(Cash))

atm3_fc <- atm3_model %>%
  forecast(h=31)

atm3_fc |>
  autoplot(atm3) + 
  labs(title = "ATM 3 Forecast")

ATM 4

atm4 <- atm_clean %>%
  filter(ATM == "ATM4") 

atm4 %>%
  gg_tsdisplay(Cash)

atm4 %>% 
  ggplot(aes(x = ATM, y = Cash)) +
  geom_boxplot(outlier.color = 'red') +
  theme_minimal()

max_row <- atm4 %>%
  slice(which.max(Cash))
print(max_row)
## # A tsibble: 1 x 3 [1D]
## # Key:       ATM [1]
##   DATE       ATM     Cash
##   <date>     <chr>  <dbl>
## 1 2010-02-09 ATM4  10920.
# I will do median imputation to replace the significant outlier for this ATM. 
atm4$Cash[which.max(atm4$Cash)] <- median(atm4$Cash[-which.max(atm4$Cash)], na.rm = TRUE)

# Let's take a look at the data now after replacing the outlier
atm4 %>%
  autoplot(Cash)

atm4 %>%
  model(STL(Cash ~ season(window = "periodic"))) %>%
  components() %>%
  autoplot() +
  labs(title = "Decomposition for ATM4") #strong and consistent weekly seasonal pattern

#model and forecast
atm4_fit <- atm4 %>%
  model(
    ARIMA = ARIMA(Cash), 
    ETS = ETS(Cash))

report(atm4_fit)
## Warning in report.mdl_df(atm4_fit): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 2 × 12
##   ATM   .model  sigma2 log_lik   AIC  AICc   BIC ar_roots   ma_roots      MSE
##   <chr> <chr>    <dbl>   <dbl> <dbl> <dbl> <dbl> <list>     <list>      <dbl>
## 1 ATM4  ARIMA  117510.  -2645. 5306. 5307. 5338. <cpl [10]> <cpl [2]>     NA 
## 2 ATM4  ETS    110776.  -3192. 6404. 6405. 6443. <NULL>     <NULL>    108045.
## # ℹ 2 more variables: AMSE <dbl>, MAE <dbl>
# Model: ARIMA(3,0,2)(1,0,0)[7] w/ mean ARIMA AICc = 5306.75
# Model: ETS(A,N,A) ETS AICc = 6404.54

atm4_fc <- forecast(atm4_fit, h = "31 days")

atm4_fc %>% autoplot(atm4)+ 
  labs(title = "ATM 4 Forecast")

Two models were evaluated: ARIMA(3,0,2)(1,0,0)[7] (AICc = 5306.76) and ETS(A,N,A) (AICc = 6404.54). The ARIMA model clearly outperforms the ETS model across all reported criteria, with lower AIC, AICc, and BIC and a higher log-likelihood (–2645.18 vs. –3191.96). This suggests that ARIMA captures both the autocorrelation and the weekly seasonality more effectively than ETS.
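The assignment asks for the forecasts in an Excel-readable file, which is why openxlsx is loaded at the top; a minimal sketch of the export, keeping the preferred model for each ATM (ATM624_Forecasts.xlsx is an assumed filename):

# Sketch: combine the May 2010 point forecasts and write them to Excel
atm_forecasts <- bind_rows(
  atm1_fc %>% filter(.model == "ARIMA") %>% as_tibble(),
  atm2_fc %>% filter(.model == "ARIMA") %>% as_tibble(),
  atm3_fc %>% as_tibble(),
  atm4_fc %>% filter(.model == "ARIMA") %>% as_tibble()) %>%
  select(ATM, DATE, Cash = .mean)

openxlsx::write.xlsx(atm_forecasts, "ATM624_Forecasts.xlsx")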

Part B – Forecasting Power

ResidentialCustomerForecastLoad-624.xlsx

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and produce a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.

residential_data <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")

residential_data <- residential_data %>%
  rename(Date = "YYYY-MMM") %>%
  mutate(Date = yearmonth(Date), 
         Month = month(Date, label = TRUE)) %>%
  as_tsibble(index=Date)

glimpse(residential_data)
## Rows: 192
## Columns: 4
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ Date         <mth> 1998 Jan, 1998 Feb, 1998 Mar, 1998 Apr, 1998 May, 1998 Ju…
## $ KWH          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
## $ Month        <ord> Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, De…
colSums(is.na(residential_data)) # one missing value for KWH
## CaseSequence         Date          KWH        Month 
##            0            0            1            0
residential_data %>% filter(is.na(KWH)) # 2008 sep date for missing value
## # A tsibble: 1 x 4 [1M]
##   CaseSequence     Date   KWH Month
##          <dbl>    <mth> <dbl> <ord>
## 1          861 2008 Sep    NA Sep
residential_data %>%
  ggplot(aes(x = Date, y = KWH)) +
  geom_line() +
  labs(title = "Residential Power Usage Over Time", x = "Date", y = "KWH") +
  theme_minimal()

 # consistent seasonal pattern, slow upward trend in the later years, one severe low outlier (identified below as 2010 Jul), will need to impute the missing value as well

residential_data %>%
  ggplot(aes(x = KWH)) +
  geom_histogram(bins = 35) +
  labs(title = "Histogram of KWH")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

residential_data %>%
  filter(KWH == min(KWH, na.rm = TRUE)) 
## # A tsibble: 1 x 4 [1M]
##   CaseSequence     Date    KWH Month
##          <dbl>    <mth>  <dbl> <ord>
## 1          883 2010 Jul 770523 Jul
# the outlier is KWH = 770523 in 2010 Jul, roughly a tenth of a typical July value; perhaps a data entry error

residential_data %>%
  gg_season(KWH) +
  ggtitle("Seasonal Plot")

# since the data has a strong seasonal pattern, I will replace the outlier and the missing value with the average September and July values from the other years

september_power <- residential_data %>% 
  as_tibble() %>%
  filter(Month == "Sep") %>% 
  summarise(avg_sep_kwh = mean(KWH, na.rm = TRUE))
september_power
## # A tibble: 1 × 1
##   avg_sep_kwh
##         <dbl>
## 1    7702333.
july_power <- residential_data %>% 
  as_tibble() %>%
  filter(Month == "Jul") %>% 
  summarise(avg_jul_kwh = mean(KWH, na.rm = TRUE))
july_power
## # A tibble: 1 × 1
##   avg_jul_kwh
##         <dbl>
## 1    7409896.
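One caveat: the July average above is computed before the outlier is replaced, so the 770523 reading drags the mean down slightly. A sketch of a cleaner estimate that excludes the suspect value first (the 1e6 cutoff is my assumption):

july_power_clean <- residential_data %>%
  as_tibble() %>%
  filter(Month == "Jul", KWH > 1e6) %>% # drop the suspect 770523 reading
  summarise(avg_jul_kwh = mean(KWH, na.rm = TRUE))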
residential_data <- residential_data %>%
  mutate(KWH = ifelse(Date == yearmonth("2010 Jul"), july_power$avg_jul_kwh, KWH),
         KWH = ifelse(Date == yearmonth("2008 Sep"), september_power$avg_sep_kwh, KWH))

residential_data %>%
  gg_tsdisplay(KWH)

#modeling and forecasting

residential_data %>%
    model(STL(KWH)) %>%
    components() %>%
    autoplot() 

# will compare ARIMA and ETS

power_arima_model <- residential_data %>%
  model(ARIMA(KWH ~ pdq() + PDQ()))

#ARIMA(0,0,1)(2,1,0)[12] w/ drift 
# AICc = 5332.38

report(power_arima_model)
## Series: KWH 
## Model: ARIMA(0,0,1)(2,1,0)[12] w/ drift 
## 
## Coefficients:
##          ma1     sar1     sar2   constant
##       0.3134  -0.7229  -0.4084  225113.41
## s.e.  0.0857   0.0775   0.0785   64071.59
## 
## sigma^2 estimated as 3.966e+11:  log likelihood=-2661.02
## AIC=5332.04   AICc=5332.38   BIC=5348
power_arima_model %>% 
  gg_tsresiduals()

power_ets_model <- residential_data %>% 
  model(ETS(KWH))

report(power_ets_model)
## Series: KWH 
## Model: ETS(M,A,M) 
##   Smoothing parameters:
##     alpha = 0.06456344 
##     beta  = 0.00189127 
##     gamma = 0.0001632348 
## 
##   Initial states:
##     l[0]     b[0]      s[0]     s[-1]     s[-2]    s[-3]    s[-4]    s[-5]
##  6211687 6881.708 0.9563419 0.7451865 0.8693497 1.170657 1.268249 1.198462
##     s[-6]     s[-7]     s[-8]    s[-9]   s[-10]   s[-11]
##  1.000793 0.7665065 0.8047715 0.921006 1.074472 1.224206
## 
##   sigma^2:  0.0091
## 
##      AIC     AICc      BIC 
## 6141.785 6145.302 6197.162
#ETS(M,A,M) 
# AICc = 6145.3

power_ets_model %>% 
  gg_tsresiduals()

Based on the residual plots and the lower AICc and BIC, the ARIMA model should perform better, but I will produce forecasts from both models for comparison.

power_arima_fc <- power_arima_model %>%
  forecast(h="12 months") 

power_ets_fc <- power_ets_model %>%
  forecast(h="12 months") 

combined_fc <- bind_rows(power_arima_fc, power_ets_fc)

# Plot combined forecasts
autoplot(combined_fc, residential_data) +
  labs(title = "Comparison of ARIMA and ETS Forecasts for 2014",
       y = "KWH",
       x = "Date")

To generate forecasts for 2014, I compared ARIMA and ETS models. The ARIMA model selected was ARIMA(0,0,1)(2,1,0)[12] with drift (AICc = 5332.38), while the ETS model selected was ETS(M,A,M) (AICc = 6145.3). Based on the lower AICc and better residual diagnostics, ARIMA is the more appropriate model.
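As a cross-check on the AICc-based choice, a holdout comparison could also be run, training both models through 2012 and scoring them on 2013 (the cutoff year is my choice):

# Sketch: out-of-sample accuracy on a 2013 holdout
power_train <- residential_data %>%
  filter(year(Date) <= 2012)

power_train %>%
  model(ARIMA = ARIMA(KWH), ETS = ETS(KWH)) %>%
  forecast(h = "12 months") %>%
  accuracy(residential_data)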

Part C – BONUS, optional (part or all)

Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx

Part C consists of two data sets. These are simple 2-column sets; however, they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (an example of what this looks like follows). Note that for multiple recordings within an hour, you take the mean. Then determine whether the data is stationary and whether it can be forecast. If so, provide a week-forward forecast and present results via RPubs and .rmd, and the forecast in an Excel readable file.

pipe1_data <- read_excel("Waterflow_Pipe1.xlsx")
pipe2_data <- read_excel("Waterflow_Pipe2.xlsx")

glimpse(pipe1_data)
## Rows: 1,000
## Columns: 2
## $ `Date Time` <dbl> 42300.02, 42300.03, 42300.04, 42300.04, 42300.06, 42300.06…
## $ WaterFlow   <dbl> 23.369599, 28.002881, 23.065895, 29.972809, 5.997953, 15.9…
glimpse(pipe2_data)
## Rows: 1,000
## Columns: 2
## $ `Date Time` <dbl> 42300.04, 42300.08, 42300.12, 42300.17, 42300.21, 42300.25…
## $ WaterFlow   <dbl> 18.810791, 43.087025, 37.987705, 36.120379, 31.851259, 28.…
# Excel stores dates as days since 1899-12-30; multiplying by 86400 converts days to seconds for as_datetime()
pipe1_data <- pipe1_data %>%
  mutate(datetime = as_datetime(`Date Time` * 86400, origin = "1899-12-30"))

pipe2_data <- pipe2_data %>%
  mutate(datetime = as_datetime(`Date Time` * 86400, origin = "1899-12-30"))

head(pipe1_data)
## # A tibble: 6 × 3
##   `Date Time` WaterFlow datetime           
##         <dbl>     <dbl> <dttm>             
## 1      42300.     23.4  2015-10-23 00:24:06
## 2      42300.     28.0  2015-10-23 00:40:02
## 3      42300.     23.1  2015-10-23 00:53:51
## 4      42300.     30.0  2015-10-23 00:55:40
## 5      42300.      6.00 2015-10-23 01:19:17
## 6      42300.     15.9  2015-10-23 01:23:58
#combining the data 
combined_pipedata <- bind_rows(pipe1_data, pipe2_data) %>%
  group_by(datetime) %>%
  summarise(waterflow = mean(WaterFlow, na.rm = TRUE)) %>%
  ungroup()

head(combined_pipedata)
## # A tibble: 6 × 2
##   datetime            waterflow
##   <dttm>                  <dbl>
## 1 2015-10-23 00:24:06     23.4 
## 2 2015-10-23 00:40:02     28.0 
## 3 2015-10-23 00:53:51     23.1 
## 4 2015-10-23 00:55:40     30.0 
## 5 2015-10-23 01:00:00     18.8 
## 6 2015-10-23 01:19:17      6.00
hourly_data <- combined_pipedata %>%
  mutate(hour = floor_date(datetime, unit = "hour")) %>%
  group_by(hour) %>%
  summarise(avg_flow = mean(waterflow, na.rm = TRUE)) %>%
  ungroup()
head(hourly_data)
## # A tibble: 6 × 2
##   hour                avg_flow
##   <dttm>                 <dbl>
## 1 2015-10-23 00:00:00     26.1
## 2 2015-10-23 01:00:00     22.3
## 3 2015-10-23 02:00:00     15.2
## 4 2015-10-23 03:00:00     25.6
## 5 2015-10-23 04:00:00     24.7
## 6 2015-10-23 05:00:00     22.7
pipe_ts <- hourly_data %>%
  as_tsibble(index = hour)

# plotting avg_flow directly raised an error ("data contains implicit gaps in time"), so fill_gaps() is used below to make those gaps explicit

pipe_ts <- pipe_ts %>%
  fill_gaps()

pipe_ts %>% 
  gg_tsdisplay(avg_flow) +
  labs(title = "Hourly Water Flow")
## Warning: Removed 255 rows containing missing values or values outside the scale range
## (`geom_point()`).

pipe_ts %>%
  features(avg_flow, features = unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      4.96        0.01

Structural change occurs around the start of November, transitioning from lower, more stable values to higher, more volatile flow. Based on the KPSS test (kpss_pvalue = 0.01, the smallest p-value the test reports), the hourly water flow data is non-stationary, and we need to difference the series before applying ARIMA.
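The number of differences required can be confirmed directly with a unit-root feature; a quick sketch:

pipe_ts %>%
  features(avg_flow, unitroot_ndiffs)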

pipe_ts_diff <- pipe_ts %>%
  mutate(diff_flow = difference(avg_flow))

pipe_ts_diff %>%
  gg_tsdisplay(diff_flow) +
  labs(title = "Differenced Hourly Water Flow")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 511 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

I’ll leave this question here (since I have a few more assignments to work on). However, if I had more time, I would revisit the missing values and try other imputation methods, then re-check for stationarity, apply differencing if needed, and compare ARIMA and ETS models.
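For completeness, a minimal sketch of what that would look like, assuming linear interpolation of the gaps, an automatically selected ARIMA, and Waterflow_Forecast.xlsx as the output filename:

# Sketch: interpolate gaps, fit ARIMA, forecast one week (168 hours) ahead
pipe_fc <- pipe_ts %>%
  mutate(avg_flow = zoo::na.approx(avg_flow, na.rm = FALSE)) %>%
  model(ARIMA(avg_flow)) %>%
  forecast(h = "7 days")

pipe_fc %>%
  autoplot(pipe_ts)

# write the point forecasts to an Excel-readable file
openxlsx::write.xlsx(
  pipe_fc %>% as_tibble() %>% select(hour, avg_flow = .mean),
  "Waterflow_Forecast.xlsx")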