library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fable)
## Warning: package 'fable' was built under R version 4.3.3
## Loading required package: fabletools
## Warning: package 'fabletools' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.3.3
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tsibble     1.1.5     ✔ feasts      0.4.1
## ✔ tsibbledata 0.4.1
## Warning: package 'tsibble' was built under R version 4.3.3
## Warning: package 'feasts' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(readxl)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Part A – ATM Forecast

Instructions

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010.

Deliverable:

Explain and demonstrate your process, techniques used and not used, and your actual forecast.

I am giving you data via an Excel file; please provide your written report on your findings, visuals, discussion and your R code via an RPubs link, along with the actual .rmd file.

Also please submit the forecast which you will put in an Excel readable file.

Exploratory Data Analysis

# Read the data in and convert columns to data types 
atm_data <- read_excel("ATM624Data.xlsx", col_types = c("date", "text", "numeric"))
atm_data$DATE <- as.Date(atm_data$DATE)

# View first few rows
head(atm_data, 10)
## # A tibble: 10 × 3
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-05-01 ATM1     96
##  2 2009-05-01 ATM2    107
##  3 2009-05-02 ATM1     82
##  4 2009-05-02 ATM2     89
##  5 2009-05-03 ATM1     85
##  6 2009-05-03 ATM2     90
##  7 2009-05-04 ATM1     90
##  8 2009-05-04 ATM2     55
##  9 2009-05-05 ATM1     99
## 10 2009-05-05 ATM2     79
# View last few rows
tail(atm_data, 10)
## # A tibble: 10 × 3
##    DATE       ATM     Cash
##    <date>     <chr>  <dbl>
##  1 2010-04-21 ATM4  557.  
##  2 2010-04-22 ATM4  386.  
##  3 2010-04-23 ATM4  165.  
##  4 2010-04-24 ATM4    5.45
##  5 2010-04-25 ATM4  542.  
##  6 2010-04-26 ATM4  404.  
##  7 2010-04-27 ATM4   13.7 
##  8 2010-04-28 ATM4  348.  
##  9 2010-04-29 ATM4   44.2 
## 10 2010-04-30 ATM4  482.
# Summary Stats 
summary(atm_data)
##       DATE                ATM                 Cash        
##  Min.   :2009-05-01   Length:1474        Min.   :    0.0  
##  1st Qu.:2009-08-01   Class :character   1st Qu.:    0.5  
##  Median :2009-11-01   Mode  :character   Median :   73.0  
##  Mean   :2009-10-31                      Mean   :  155.6  
##  3rd Qu.:2010-02-01                      3rd Qu.:  114.0  
##  Max.   :2010-05-14                      Max.   :10919.8  
##                                          NA's   :19
# View distribution of the cash in each ATM 

atm_data %>% ggplot(aes(x = ATM, y = Cash)) + geom_boxplot() + 
  labs(title = "Distribution of Cash", 
       x = "ATM", 
       y = "Cash (USD)")
## Warning: Removed 19 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

There is a very clear outlier in the fourth ATM that makes it difficult to observe the cash distribution of each ATM. Filtering the data for each ATM and then visualizing the distributions separately might be a better approach.
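As a quick illustration of that idea, the sketch below (using the ggplot2 tools already loaded) draws one boxplot per ATM with free y-scales so the ATM4 outlier does not compress the other distributions.

# Per-ATM boxplots with free y-scales (sketch); drop_na(Cash) avoids the
# warning about non-finite values seen above.
atm_data %>%
  drop_na(Cash) %>%
  ggplot(aes(y = Cash)) +
  geom_boxplot() +
  facet_wrap(~ATM, scales = "free_y") +
  labs(title = "Cash Distribution by ATM", y = "Cash (USD)")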

atm_data %>% group_by(ATM) %>% summarise(Min_Date = min(DATE, na.rm = T),
                                         Max_Date = max(DATE, na.rm = T),
                                         Min_cash = min(Cash, na.rm = T),
                                         Q1_cash = quantile(Cash, 0.25, na.rm = T),
                                         Median_Cash = median(Cash, na.rm = T), 
                                         Mean_Cash = mean(Cash, na.rm = T),
                                         SD_Cash = sd(Cash, na.rm = T),
                                         Q3_Cash = quantile(Cash, 0.75, na.rm = T),
                                         Max_Cash = max(Cash, na.rm = T))
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Min_cash = min(Cash, na.rm = T)`.
## ℹ In group 5: `ATM = NA`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
## # A tibble: 5 × 10
##   ATM   Min_Date   Max_Date   Min_cash Q1_cash Median_Cash Mean_Cash SD_Cash
##   <chr> <date>     <date>        <dbl>   <dbl>       <dbl>     <dbl>   <dbl>
## 1 ATM1  2009-05-01 2010-04-30     1       73           91     83.9     36.7 
## 2 ATM2  2009-05-01 2010-04-30     0       25.5         67     62.6     38.9 
## 3 ATM3  2009-05-01 2010-04-30     0        0            0      0.721    7.94
## 4 ATM4  2009-05-01 2010-04-30     1.56   124.         404.   474.     651.  
## 5 <NA>  2010-05-01 2010-05-14   Inf       NA           NA    NaN       NA   
## # ℹ 2 more variables: Q3_Cash <dbl>, Max_Cash <dbl>
# Check NA values
na_count <-sapply(atm_data, function(y) sum(length(which(is.na(y)))))

na_count
## DATE  ATM Cash 
##    0   14   19
na_rows <- atm_data[rowSums(is.na(atm_data)) > 0,]

na_rows
## # A tibble: 19 × 3
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-06-13 ATM1     NA
##  2 2009-06-16 ATM1     NA
##  3 2009-06-18 ATM2     NA
##  4 2009-06-22 ATM1     NA
##  5 2009-06-24 ATM2     NA
##  6 2010-05-01 <NA>     NA
##  7 2010-05-02 <NA>     NA
##  8 2010-05-03 <NA>     NA
##  9 2010-05-04 <NA>     NA
## 10 2010-05-05 <NA>     NA
## 11 2010-05-06 <NA>     NA
## 12 2010-05-07 <NA>     NA
## 13 2010-05-08 <NA>     NA
## 14 2010-05-09 <NA>     NA
## 15 2010-05-10 <NA>     NA
## 16 2010-05-11 <NA>     NA
## 17 2010-05-12 <NA>     NA
## 18 2010-05-13 <NA>     NA
## 19 2010-05-14 <NA>     NA

Most of the missing values fall on the dates that we are instructed to forecast, so we can drop those observations from the original ATM data. Viewing the remaining rows with NA values, there are five observations (3 in ATM1 and 2 in ATM2) where the missing value is in the target variable, Cash. These missing values will need to be handled, and the initial thought was to separate the data into the four ATMs.

# Re-check missing values
atm_data <- atm_data %>% drop_na(ATM)

na_rows <- atm_data[rowSums(is.na(atm_data)) > 0,]

na_rows
## # A tibble: 5 × 3
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-06-13 ATM1     NA
## 2 2009-06-16 ATM1     NA
## 3 2009-06-18 ATM2     NA
## 4 2009-06-22 ATM1     NA
## 5 2009-06-24 ATM2     NA
# Create separate data frames for each of the ATM machines
atm1 <- atm_data %>% filter(ATM == "ATM1")
atm2 <- atm_data %>% filter(ATM == "ATM2")
atm3 <- atm_data %>% filter(ATM == "ATM3")
atm4 <- atm_data %>% filter(ATM == "ATM4")

Missing values for ATM1 and ATM2 were only found in June. To determine how to deal with them, we can look at the mean and median cash for that month in each ATM. Imputing the mean into the missing data and re-checking how much that affects the spread of the data is a first step to determine whether this simple imputation technique is appropriate.

# ATM1 and ATM2 - Mean and median for the month of June

atm_data %>% filter(ATM %in% c("ATM1", "ATM2")) %>% group_by(ATM) %>% filter(month(DATE) == 6) %>% summarise(mean = mean(Cash, na.rm = T), median = median(Cash, na.rm = T))
## # A tibble: 2 × 3
##   ATM    mean median
##   <chr> <dbl>  <dbl>
## 1 ATM1   89.3    104
## 2 ATM2   76.0     82

Since there aren’t big gaps in the data, an approach that respects the temporal structure of the data, rolling statistic imputation, will be used. This approach replaces missing values with a statistic such as the mean computed over a defined window. This can be done with the imputeTS package. A simple moving average with a window of 4 was used. This strategy did a better job of preserving the underlying distribution of the original data than imputing the mean cash across the entire time series within each ATM.

library(imputeTS)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
atm_data %>% filter(ATM %in% c("ATM1", "ATM2")) %>% 
  group_by(ATM) %>%
  filter(month(DATE) == 6) %>% 
  arrange(ATM) %>% 
  na_ma(k = 4, weighting = "simple") %>% 
  summarise(mean = mean(Cash, na.rm = T), median = median(Cash, na.rm = T))
## # A tibble: 2 × 3
##   ATM    mean median
##   <chr> <dbl>  <dbl>
## 1 ATM1   90.2  103. 
## 2 ATM2   76.1   81.3
# Impute missing values for ATM1 and ATM2 
atm1 <- atm1 %>% na_ma(k = 4, weighting = "simple") 
atm2 <- atm2 %>% na_ma(k = 4, weighting = "simple") 
# Convert each data frame into a tsibble
atm1_ts <- as_tsibble(atm1, index = DATE)
atm2_ts <- as_tsibble(atm2, index = DATE)
atm3_ts <- as_tsibble(atm3, index = DATE)
atm4_ts <- as_tsibble(atm4, index = DATE)

ATM 1

EDA

# Plot entire time series
atm1_ts %>% autoplot(Cash) + labs(title = "ATM_1 Cash Removed", x = "Date", y = "USD")

# Plot the time series for one month 
atm1_ts %>% filter(month(DATE) == 5) %>% autoplot(Cash) + labs(title = "ATM_1 Cash Removed", x = "Date", y = "USD")

Visualizing the time series, we don’t see any linear trend but rather a strong pattern of large spikes with intermittent smaller fluctuations, indicating seasonality within the data. Functions that visualize autocorrelations will help identify this seasonality. There appear to be a few time points where the spikes are higher, but no other outliers are noted.

atm1_ts %>% gg_lag(Cash, lags = 1:14, geom = "point")

atm1_ts %>% ACF(Cash, lag_max = 21) %>% autoplot()

# Decompose data using STL 
stl_decomp <- atm1_ts %>% model(STL(Cash ~ trend() + season(), robust = TRUE)) %>% components()

# Plot the STL decomposition 
stl_decomp %>% autoplot() + labs(title = "STL")

The autocorrelation plot confirms the seasonality: we see strong lags at multiples of 7, indicating a correlation between the same days of previous weeks. Decomposing the time series helps us understand any underlying patterns, trends, and seasonal variations. Visualizing the components, there is no clear linear trend but perhaps some cyclical behavior. There is a strong seasonal component in the season_week plot, where the large down spike is associated with the same day of the week. We also see a lot of variation in the remainder component, which may indicate that the trend and season components do not account for all of the variability in the data.
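To make the day-of-week pattern explicit, a season plot with a weekly period can also be drawn; this is a quick sketch using feasts::gg_season(), which is already attached via fpp3.

# Day-of-week season plot for ATM1 (sketch): each line is one week, so a
# recurring low day shows up as a consistent dip across lines.
atm1_ts %>%
  gg_season(Cash, period = "week") +
  labs(title = "ATM1 Weekly Seasonality", y = "USD")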

PRE-PROCESSING

Determine if transformation is necessary

A Box-Cox transformation might not be helpful for this ATM’s data, as the variation does not consistently increase or decrease throughout the series. Comparing the transformed plot to the original data, the transformation does not appear to have stabilized the variability, so we will keep the original data.

lambda <- atm1_ts |>
  features(Cash, features = guerrero) |>
  pull(lambda_guerrero)

atm1_ts %>%
  autoplot(box_cox(Cash, lambda)) +
  labs(y = "",
       title = paste0(
         "Transformed Cash in ATM1 with lambda = ", round(lambda, 4)))

Check for Stationarity and Determine Differencing: Unit Root Tests

A KPSS p-value > 0.05 means we fail to reject the null hypothesis that the data are stationary. On the original data we obtain a p-value of 0.0472, so we reject the null hypothesis in favor of the alternative that the data are not stationary, and differencing may be needed.

# Conduct KPSS test
atm1_ts %>% features(Cash, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.475      0.0472
# Find the number of seasonal differencing 
atm1_ts %>% features(Cash, unitroot_nsdiffs)
## # A tibble: 1 × 1
##   nsdiffs
##     <int>
## 1       1
# Seasonally difference the data 
atm1_ts <- atm1_ts %>% mutate(Cash_diff = difference(Cash, lag = 7))

atm1_ts %>% features(Cash_diff, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0
# Plot differenced data 
atm1_ts %>% autoplot(Cash_diff)
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Check KPSS test
atm1_ts %>% features(Cash_diff, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1    0.0217         0.1

This step was carried out in anticipation of fitting and forecasting with an ARIMA model. If the ARIMA() function is used to select the best model, we would expect it to include at least one seasonal difference to obtain stationarity.
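Before handing the data to the automatic search, the ACF and PACF of the seasonally differenced series can be viewed together; a sketch using feasts::gg_tsdisplay() is below (a lag_max of 21, covering three weeks, is an arbitrary choice).

# ACF/PACF of the seasonally differenced ATM1 series (sketch); remaining spikes
# at seasonal lags would suggest seasonal AR/MA terms for the ARIMA search.
atm1_ts %>% gg_tsdisplay(Cash_diff, plot_type = "partial", lag_max = 21)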

MODELLING

Exponential Smoothing Modelling

Holt-Winters’ methods are suggested when dealing with seasonal data. The additive method is preferred when the seasonal variations are roughly constant through the series, which is what we are seeing here. Another model suggested for seasonal data is a damped Holt-Winters’ method with multiplicative seasonality. Below is a comparison of four different models:

- MAdM (multiplicative error, damped additive trend, multiplicative seasonality)
- AAA (additive error, trend, and seasonality)
- MAM (multiplicative error, additive trend, multiplicative seasonality)
- Auto-selected ETS

fit <- atm1_ts %>% model(
    hw_madm = ETS(Cash ~ error("M") + trend("Ad") + season("M")),
    additive = ETS(Cash ~ error("A") + trend("A") + season("A")),
    multiplicative = ETS(Cash ~ error("M") + trend("A") + season("M")),
    auto = ETS(Cash)
  )

fit %>% accuracy()
## # A tibble: 4 × 10
##   .model         .type         ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>          <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 hw_madm        Training -0.982   26.2  16.2 -127.  143. 0.919 0.947 0.0868
## 2 additive       Training -0.140   23.8  15.1 -107.  122. 0.856 0.859 0.135 
## 3 multiplicative Training -2.07    26.2  16.2 -129.  144. 0.921 0.946 0.0824
## 4 auto           Training -0.0910  23.7  15.0 -106.  121. 0.852 0.857 0.137
fit %>% report()
## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.
## # A tibble: 4 × 9
##   .model          sigma2 log_lik   AIC  AICc   BIC   MSE  AMSE    MAE
##   <chr>            <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 hw_madm          0.136  -2273. 4573. 4574. 4623.  688.  702.  0.219
## 2 additive       583.     -2233. 4491. 4492. 4537.  565.  569. 15.1  
## 3 multiplicative   0.132  -2274. 4571. 4572. 4618.  687.  701.  0.216
## 4 auto           578.     -2233. 4485. 4486. 4524.  563.  566. 15.0
fit %>% select(auto) %>% report()
## Series: Cash 
## Model: ETS(A,N,A) 
##   Smoothing parameters:
##     alpha = 0.01696495 
##     gamma = 0.3311424 
## 
##   Initial states:
##      l[0]      s[0]    s[-1]    s[-2]    s[-3]    s[-4] s[-5]    s[-6]
##  78.07218 -60.61548 2.148172 8.582304 7.293434 13.07175 15.52 13.99981
## 
##   sigma^2:  577.6921
## 
##      AIC     AICc      BIC 
## 4485.400 4486.021 4524.399

Comparing the four exponential smoothing models, the model with additive error, no trend, and additive seasonality (ETS(A,N,A)) produced the lowest AICc and RMSE. We can further evaluate this model by studying the residuals.

fit %>% select(auto) %>% gg_tsresiduals()

# Conduct a ljung_box test to determine if residuals are white noise
fit %>% select(auto) %>% augment() %>% features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 auto      28.7    0.0115

The Ljung-Box test tells us that the residuals are not independent, so they cannot be considered white noise. There may be a better model that can explain more of the data than an ETS model. The next set of models to evaluate are ARIMA models.

Fitting an ARIMA model

It was previously determined that differencing was needed, so we should expect it to be part of the ARIMA model. I chose to have ARIMA() search for the best model by setting stepwise and approximation to FALSE. The model returned was an \(ARIMA(0,0,1)(1,1,1)_7\).

# Fit ARIMA MODEL with search function 
fit <- atm1_ts %>% model(ARIMA_auto = ARIMA(Cash, stepwise = FALSE, approx = FALSE))

fit %>% report()
## Series: Cash 
## Model: ARIMA(0,0,1)(1,1,1)[7] 
## 
## Coefficients:
##          ma1    sar1     sma1
##       0.1810  0.1800  -0.7521
## s.e.  0.0547  0.0792   0.0552
## 
## sigma^2 estimated as 555.2:  log likelihood=-1639.68
## AIC=3287.36   AICc=3287.47   BIC=3302.88

Model Diagnostics

# Visualize the residuals
fit %>% gg_tsresiduals()

# Conduct a ljung_box test to determine if residuals are white noise
fit %>% augment() %>% features(.innov, ljung_box, lag = 10)
## # A tibble: 1 × 3
##   .model     lb_stat lb_pvalue
##   <chr>        <dbl>     <dbl>
## 1 ARIMA_auto    11.6     0.315

Visualizing the residuals, there are no significant autocorrelations and the variance is mostly constant, although a few larger residuals remain. The Ljung-Box test indicates that the residuals are independent and indistinguishable from white noise. This model can be used to forecast the next month. The selected model includes one order of seasonal differencing, which is what we found earlier when evaluating the stationarity of the original data.

fc <- fit %>% forecast(h = "1 month")

fc %>%
  autoplot(atm1_ts) +
  labs(title="Cash Retrieval ATM 1",
       y="USD")

# Pull out the forecasts for ATM1

atm1_forecasts <- fc[,c(1,2,4)]
atm1_forecasts$DATA <- "ATM1"

# Find the sum of the forecasts for May 2010
sum(atm1_forecasts$.mean)
## [1] 2360.773
# Find sum of all months in the data set 
atm1 %>% group_by(month(DATE)) %>% summarise(Cash_total = sum(Cash))
## # A tibble: 12 × 2
##    `month(DATE)` Cash_total
##            <dbl>      <dbl>
##  1             1      2677 
##  2             2      2531 
##  3             3      2433 
##  4             4      2300 
##  5             5      2480 
##  6             6      2705.
##  7             7      2621 
##  8             8      3129 
##  9             9      2826 
## 10            10      2247 
## 11            11      2289 
## 12            12      2422

ATM 2

EDA

# Plot full time series for ATM2
atm2_ts %>% autoplot(Cash) + labs(title = "ATM_2 Cash Removed", x = "Date", y = "USD")

# Plot the time series for one month 
atm2_ts %>% filter(month(DATE) == 5) %>% autoplot(Cash) + labs(title = "ATM_2 Cash Removed", x = "Date", y = "USD")

Visualizing the data of ATM2, there are large fluctuations similar to the first ATM’s data, suggesting seasonality. One difference that is seen is the presence of a slight downward trend across the time series.

# Visualize the lag plot
atm2_ts %>% gg_lag(Cash, lags = 1:14, geom = "point")

# Visualize the ACF
atm2_ts %>% ACF(Cash, lag_max = 21) %>% autoplot()

There are moderate autocorrelations at lags that are multiples of 7, and the autocorrelation decreases as the lags get further apart, indicating the presence of a trend.

# Decompose data using STL 
stl_decomp <- atm2_ts %>% model(STL(Cash ~ trend() + season(), robust = TRUE)) %>% components()

# Plot the STL decomposition 
stl_decomp %>% autoplot() + labs(title = "STL: ATM2")

Decomposing the time series of ATM2, the trend pattern is clearer. There is an interesting period between February and March where the season_week component looks like it gets pinched. This matches a period in the time series where the amount of cash taken out seems to rapidly increase and then drop down again, which was not easily seen in the time series plot alone.
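To look at that pinched stretch more closely, the raw ATM2 series can be re-plotted over just February and March 2010; this is a small sketch and the window boundaries are approximate.

# Zoom in on the period where the weekly pattern appears to compress (sketch).
atm2_ts %>%
  filter(DATE >= as.Date("2010-02-01"), DATE <= as.Date("2010-03-31")) %>%
  autoplot(Cash) +
  labs(title = "ATM2 Cash, Feb-Mar 2010", x = "Date", y = "USD")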

MODELLING

As this series exhibits similar characteristics to the ATM1 data, an ARIMA model will be fitted and evaluated.

# Conduct KPSS test
atm2_ts %>% features(Cash, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      2.18        0.01

The original data from ATM2 is not stationary according to the KPSS test, so we should expect some differencing in the ARIMA model. A search over candidate models was used to find the ARIMA specification with the lowest AICc. The model chosen was a seasonal \(ARIMA(2,0,2)(0,1,1)_7\).
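As a cross-check on that expectation, the same unit-root features used for ATM1 can be applied here; a short sketch:

# Suggested seasonal and ordinary differences for ATM2 (sketch cross-check).
atm2_ts %>% features(Cash, list(unitroot_nsdiffs, unitroot_ndiffs))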

fit <- atm2_ts %>% model(ARIMA(Cash, stepwise = FALSE, approx = FALSE))

fit %>% report()
## Series: Cash 
## Model: ARIMA(2,0,2)(0,1,1)[7] 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2     sma1
##       -0.4321  -0.9155  0.4748  0.8099  -0.7555
## s.e.   0.0551   0.0401  0.0860  0.0550   0.0382
## 
## sigma^2 estimated as 609.4:  log likelihood=-1655.7
## AIC=3323.41   AICc=3323.65   BIC=3346.69

Model Diagnostics

# Visualize the residuals
fit %>% gg_tsresiduals(lag=21)

# Conduct a ljung_box test to determine if residuals are white noise
fit %>% augment() %>% features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
##   .model                                        lb_stat lb_pvalue
##   <chr>                                           <dbl>     <dbl>
## 1 ARIMA(Cash, stepwise = FALSE, approx = FALSE)    10.7     0.712

The residual plots and the Ljung-Box test both confirm that the residuals are white noise. Forecasting can move forward using this model.

# Forecast the next month 
fc <- fit %>% forecast(h = "1 month")

fc %>% autoplot(atm2_ts) +
  labs(title="Cash Retrieval ATM 2",
       y="USD")

# Pull out the forecasts for ATM2
atm2_forecasts <- fc[,c(1,2,4)]
atm2_forecasts$DATA <- "ATM2"

# Find the sum of the forecasts for May 2010
sum(atm2_forecasts$.mean)
## [1] 1775.202
# Find sum of all months in the data set 
atm2 %>% group_by(month(DATE)) %>% summarise(Cash_total = sum(Cash))
## # A tibble: 12 × 2
##    `month(DATE)` Cash_total
##            <dbl>      <dbl>
##  1             1      1811 
##  2             2      1517 
##  3             3      1602 
##  4             4      1842 
##  5             5      2338 
##  6             6      2282.
##  7             7      2094 
##  8             8      2230 
##  9             9      1867 
## 10            10      1906 
## 11            11      1692 
## 12            12      1688

ATM 3

EDA

atm3_ts %>% autoplot(Cash) + labs(title = "ATM_3 Cash Removed", x = "Date", y = "USD")

# Plot the time series for April to visualize the spike in the data
atm3_ts %>% filter(month(DATE) == 4) %>% autoplot(Cash) + labs(title = "ATM_3 Cash", x = "Date", y = "USD")

tail(atm3_ts)
## # A tsibble: 6 x 3 [1D]
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2010-04-25 ATM3      0
## 2 2010-04-26 ATM3      0
## 3 2010-04-27 ATM3      0
## 4 2010-04-28 ATM3     96
## 5 2010-04-29 ATM3     82
## 6 2010-04-30 ATM3     85
atm3_ts %>% ACF(Cash) %>% autoplot()

Visualizing this data we see that all values except three are zero, and the non-zero values occur at the end of the time series. Since there are so few non-zero values to work with, simpler models are probably best here. A reasonable choice is a Naive model, which uses the last observed value for all forecasts. The ACF plot does not provide much information regarding seasonality, cyclic behavior, or trend as there are only a few non-zero observations. The moderate to strong positive autocorrelations also suggest that previous values might be good predictors of future ones.

fit <- atm3_ts %>% model(NAIVE(Cash))

fit %>% gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

fc <- fit %>% forecast(h = "1 month")

fc %>%
  autoplot(atm3_ts) +
  labs(title="Cash Retrieval ATM 3",
       y="USD")

#Use the hilo() function to get the 95% prediction intervals
hilo_tibble <- fc %>% hilo(level = 95)

hilo_tibble
## # A tsibble: 30 x 5 [1D]
## # Key:       .model [1]
##    .model      DATE      
##    <chr>       <date>    
##  1 NAIVE(Cash) 2010-05-01
##  2 NAIVE(Cash) 2010-05-02
##  3 NAIVE(Cash) 2010-05-03
##  4 NAIVE(Cash) 2010-05-04
##  5 NAIVE(Cash) 2010-05-05
##  6 NAIVE(Cash) 2010-05-06
##  7 NAIVE(Cash) 2010-05-07
##  8 NAIVE(Cash) 2010-05-08
##  9 NAIVE(Cash) 2010-05-09
## 10 NAIVE(Cash) 2010-05-10
## # ℹ 20 more rows
## # ℹ 3 more variables: Cash <dist>, .mean <dbl>, `95%` <hilo>

Inspecting the 95% prediction intervals, we see that the range gets larger as the horizon extends further out, which is typical when forecasting time series. It is not surprising that the intervals are very wide, as there is minimal data to forecast from.
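If the numeric bounds are needed, for example to export them alongside the point forecasts, the interval column can be unpacked into plain lower/upper columns. The sketch below assumes fabletools' unpack_hilo() and its "95%_lower"/"95%_upper" naming convention.

# Unpack the 95% interval into numeric columns (sketch).
hilo_tibble %>%
  unpack_hilo("95%") %>%
  as_tibble() %>%
  select(DATE, .mean, `95%_lower`, `95%_upper`) %>%
  head()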

# Pull out the forecasts for ATM3
atm3_forecasts <- fc[,c(1,2,4)]
atm3_forecasts$DATA <- "ATM3"

# Find the sum of the forecasts for May 2010
sum(atm3_forecasts$.mean)
## [1] 2550

ATM 4

EDA

atm4_ts %>% autoplot(Cash) + labs(title = "ATM_4 Cash Removed", x = "Date", y = "USD")

# Plot the time series for one month 
atm4_ts %>% filter(month(DATE) == 2) %>% autoplot(Cash) + labs(title = "ATM_4 Cash Removed", x = "Date", y = "USD")

While there are no missing values in the data from ATM4, there is a clear outlier at the beginning of February. The remaining data on either side does not seem to follow a specific trend or seasonal pattern. The next step is to determine a strategy to handle this outlier.

# Visualize the lag plot
atm4_ts %>% gg_lag(Cash, lags = 1:14, geom = "point")

# Visualize the ACF
atm4_ts %>% ACF(Cash, lag_max = 21) %>% autoplot()

When plotting the lag values up to 14 lags we can see the influence the outlier has on the rest of the lags, causing the lags to bunch up at the bottom left of the graph. However when plotting the autocorrelations, there are also no consistent patterns indicating trend or seasonality. The autocorrelation plot looks similar to stationary data.

To handle the outlier, we might consider it an error in recording. Boxplots are helpful to confirm whether other outliers are present. The box plot below identifies 3 outliers in total, but one value is far beyond the other two. One thought was to use a rolling average, similar to how NA values were imputed in the original data frame.

atm4_ts %>% ggplot(aes(y =Cash)) + geom_boxplot()

summary(atm4)
##       DATE                ATM                 Cash          
##  Min.   :2009-05-01   Length:365         Min.   :    1.563  
##  1st Qu.:2009-07-31   Class :character   1st Qu.:  124.334  
##  Median :2009-10-30   Mode  :character   Median :  403.839  
##  Mean   :2009-10-30                      Mean   :  474.043  
##  3rd Qu.:2010-01-29                      3rd Qu.:  704.507  
##  Max.   :2010-04-30                      Max.   :10919.762
# Q3 + IQR*1.5
atm4_ts %>% filter(Cash >= 1570)
## # A tsibble: 3 x 3 [1D]
##   DATE       ATM     Cash
##   <date>     <chr>  <dbl>
## 1 2009-09-22 ATM4   1712.
## 2 2010-01-29 ATM4   1575.
## 3 2010-02-09 ATM4  10920.

PRE-PROCESSING

# Impute an NA to that date and recode values using a rolling statistic 
atm4_ts[atm4_ts$DATE == "2010-02-09", 3] = NA

# Fill na with rolling stat
atm4_ts <- atm4_ts %>% na_ma(k = 10, weighting = "simple") 

# Recheck summary stats
summary(atm4_ts)
##       DATE                ATM                 Cash         
##  Min.   :2009-05-01   Length:365         Min.   :   1.563  
##  1st Qu.:2009-07-31   Class :character   1st Qu.: 124.334  
##  Median :2009-10-30   Mode  :character   Median : 403.839  
##  Mean   :2009-10-30                      Mean   : 445.440  
##  3rd Qu.:2010-01-29                      3rd Qu.: 704.192  
##  Max.   :2010-04-30                      Max.   :1712.075
# Visualize the time series plot 
atm4_ts %>% autoplot(Cash) + labs(title = "ATM_4 Cash Removed", x = "Date", y = "USD")

# Visualize the lag plot
atm4_ts %>% gg_lag(Cash, lags = 1:14, geom = "point")

# Visualize the ACF
atm4_ts %>% ACF(Cash, lag_max = 21) %>% autoplot()

Re-coding the outlier revealed an underlying pattern that could not be seen with such a highly influential point. Re-plotting the time series, lag plot, and autocorrelations, we see behavior similar to what was observed in the ATM1 and ATM2 data.

lambda <- atm4_ts |>
  features(Cash, features = guerrero) |>
  pull(lambda_guerrero)

atm4_ts %>%
  autoplot(box_cox(Cash, lambda)) +
  labs(y = "",
       title = paste0(
         "Transformed Cash in ATM4 with lambda = ", round(lambda, 4)))

atm4_ts <- atm4_ts %>% mutate(Cash_trans = box_cox(Cash, lambda))

A Box-Cox transformation was also applied to this ATM’s data in order to make the pattern more consistent across the whole series. The range and spread of values in this ATM were much larger than in the other ATMs, and simpler patterns are usually easier to model and lead to more accurate forecasts.
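A quick side-by-side of the original and transformed series helps check whether the transformation evened out the variation; this sketch uses ggpubr::ggarrange(), the same helper used later in this report.

library(ggpubr)

# Original vs. Box-Cox transformed ATM4 series, stacked for comparison (sketch).
ggarrange(
  atm4_ts %>% autoplot(Cash) + labs(title = "ATM4: Original", y = "USD"),
  atm4_ts %>% autoplot(Cash_trans) + labs(title = "ATM4: Box-Cox Transformed", y = ""),
  ncol = 1
)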

MODELLING

# Conduct KPSS test
atm4_ts %>% features(Cash_trans, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1    0.0769         0.1

Based on the KPSS statistic and p-value, the data is stationary and we should not expect any differencing to be needed.
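That expectation can be double-checked with the same differencing features used earlier; a short sketch:

# Suggested ordinary and seasonal differences for the transformed ATM4 series
# (sketch); given the KPSS result above, both should come back as 0.
atm4_ts %>% features(Cash_trans, list(unitroot_ndiffs, unitroot_nsdiffs))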

fit <- atm4_ts %>% model(ARIMA(Cash_trans, stepwise = FALSE, approx = FALSE)) 

fit %>% report()
## Series: Cash_trans 
## Model: ARIMA(1,0,0)(2,0,0)[7] w/ mean 
## 
## Coefficients:
##          ar1    sar1    sar2  constant
##       0.0762  0.2093  0.1985   15.9681
## s.e.  0.0526  0.0516  0.0525    0.6947
## 
## sigma^2 estimated as 185.1:  log likelihood=-1469.24
## AIC=2948.48   AICc=2948.65   BIC=2967.98

Model Diagnostics

# Visualize the residuals
fit %>% gg_tsresiduals()

# Conduct a ljung_box test to determine if residuals are white noise
fit %>% augment() %>% features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
##   .model                                              lb_stat lb_pvalue
##   <chr>                                                 <dbl>     <dbl>
## 1 ARIMA(Cash_trans, stepwise = FALSE, approx = FALSE)    15.6     0.336
# Forecast the next month 
fc <- fit %>% forecast(h = "1 month")

fc %>%
  autoplot(atm4_ts) +
  labs(title="Cash Retrieval ATM 4",
       y="USD")

fc$mean_trans <- inv_box_cox(fc$.mean, lambda = lambda)

# Save forecasts in new object
atm4_forecasts <- fc[,c(1,2,5)]
atm4_forecasts$DATA <- "ATM4"

# Sum up the forecasted Cash
sum(atm4_forecasts$mean_trans)
## [1] 9623.493
# Find sum of all months in the data set 
atm4 %>% group_by(month(DATE)) %>% summarise(Cash_total = sum(Cash))
## # A tibble: 12 × 2
##    `month(DATE)` Cash_total
##            <dbl>      <dbl>
##  1             1     14903.
##  2             2     23084.
##  3             3     13315.
##  4             4     11858.
##  5             5     12622.
##  6             6     13744.
##  7             7     14477.
##  8             8     15903.
##  9             9     17733.
## 10            10      9514.
## 11            11     12908.
## 12            12     12966.

Part B – Forecasting Power

Instructions

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013.

Your assignment is to model these data and a monthly forecast for 2014.

The data is given in a single file - ResidentialCustomerForecastLoad-624.xlsx The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward.

Add this to your existing files above.

EDA

power_df <- read_excel("/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_624/Project1/ResidentialCustomerForecastLoad-624.xlsx")

head(power_df)
## # A tibble: 6 × 3
##   CaseSequence `YYYY-MMM`     KWH
##          <dbl> <chr>        <dbl>
## 1          733 1998-Jan   6862583
## 2          734 1998-Feb   5838198
## 3          735 1998-Mar   5420658
## 4          736 1998-Apr   5010364
## 5          737 1998-May   4665377
## 6          738 1998-Jun   6467147
# Re-name columns 
colnames(power_df) <- c("CaseSeq", "YearMonth", "KWH")

head(power_df)
## # A tibble: 6 × 3
##   CaseSeq YearMonth     KWH
##     <dbl> <chr>       <dbl>
## 1     733 1998-Jan  6862583
## 2     734 1998-Feb  5838198
## 3     735 1998-Mar  5420658
## 4     736 1998-Apr  5010364
## 5     737 1998-May  4665377
## 6     738 1998-Jun  6467147
summary(power_df)
##     CaseSeq       YearMonth              KWH          
##  Min.   :733.0   Length:192         Min.   :  770523  
##  1st Qu.:780.8   Class :character   1st Qu.: 5429912  
##  Median :828.5   Mode  :character   Median : 6283324  
##  Mean   :828.5                      Mean   : 6502475  
##  3rd Qu.:876.2                      3rd Qu.: 7620524  
##  Max.   :924.0                      Max.   :10655730  
##                                     NA's   :1
power_df$Date <- paste0(power_df$YearMonth, "-01")
power_df$Date <- as.Date(power_df$Date, format = "%Y-%b-%d")

head(power_df, 20)
## # A tibble: 20 × 4
##    CaseSeq YearMonth     KWH Date      
##      <dbl> <chr>       <dbl> <date>    
##  1     733 1998-Jan  6862583 1998-01-01
##  2     734 1998-Feb  5838198 1998-02-01
##  3     735 1998-Mar  5420658 1998-03-01
##  4     736 1998-Apr  5010364 1998-04-01
##  5     737 1998-May  4665377 1998-05-01
##  6     738 1998-Jun  6467147 1998-06-01
##  7     739 1998-Jul  8914755 1998-07-01
##  8     740 1998-Aug  8607428 1998-08-01
##  9     741 1998-Sep  6989888 1998-09-01
## 10     742 1998-Oct  6345620 1998-10-01
## 11     743 1998-Nov  4640410 1998-11-01
## 12     744 1998-Dec  4693479 1998-12-01
## 13     745 1999-Jan  7183759 1999-01-01
## 14     746 1999-Feb  5759262 1999-02-01
## 15     747 1999-Mar  4847656 1999-03-01
## 16     748 1999-Apr  5306592 1999-04-01
## 17     749 1999-May  4426794 1999-05-01
## 18     750 1999-Jun  5500901 1999-06-01
## 19     751 1999-Jul  7444416 1999-07-01
## 20     752 1999-Aug  7564391 1999-08-01
power_df %>% ggplot(aes(x = Date, y = KWH)) + geom_line() + 
  labs(title = "Residential power usage", 
                                  x = "Date", 
                                  y = "Kilowatt per hour")

From the summary output and the time series plot, there is a missing value in the data, visible as a gap in the line around 2008, and there also appears to be an outlier. We can find the date with the missing value below.

# Identify observation with missing value
# Check NA values
na_rows <- power_df[rowSums(is.na(power_df)) > 0,]

na_rows$Date
## [1] "2008-09-01"

PRE PROCESSING

One strategy to fill the missing value is to take an average of the data points from the same month, in this case September. The time series appears to have minimal trend toward the end of the series, and the missing point falls within the consistent seasonal pattern of the data, where values for the same month are similar across years.

# View the data over the same month as the missing value
rolling_avg <- power_df %>% filter(month(Date) == 9) %>% na_ma(weighting = "simple") %>% filter(Date == "2008-09-01") %>% pull(KWH)

power_df[power_df$Date == "2008-09-01", "KWH"] = rolling_avg

# Re-check for missing values 
any(is.na(power_df))
## [1] FALSE
power_df %>% ggplot(aes(y = KWH)) + geom_boxplot() + labs(title = "Distribution of Power Usage 1998-2013")

# View the value of the outlier
power_df %>% filter(Date > "2010-01-01")
## # A tibble: 47 × 4
##    CaseSeq YearMonth     KWH Date      
##      <dbl> <chr>       <dbl> <date>    
##  1     878 2010-Feb  8390677 2010-02-01
##  2     879 2010-Mar  7347915 2010-03-01
##  3     880 2010-Apr  5776131 2010-04-01
##  4     881 2010-May  4919289 2010-05-01
##  5     882 2010-Jun  6696292 2010-06-01
##  6     883 2010-Jul   770523 2010-07-01
##  7     884 2010-Aug  7922701 2010-08-01
##  8     885 2010-Sep  7819472 2010-09-01
##  9     886 2010-Oct  5875917 2010-10-01
## 10     887 2010-Nov  4800733 2010-11-01
## # ℹ 37 more rows

The outlier fell during July 2010. If we assume this data comes from somewhere in the US, where the hotter months consume a lot of energy to power AC units, then replacing the value is a reasonable strategy. Looking at power usage around July in other years, consumption is consistently high in these months. To replace the value we can impute the average power consumption from the two months on either side of July (June and August).

# Impute an NA to that date and re-code values using a rolling statistic 
power_df2 <- power_df

power_df2[power_df2$Date == "2010-07-01", "KWH"] = NA

# Fill na with rolling stat
power_df2 <- power_df2 %>% na_ma(k = 2, weighting = "simple") 

# Recheck and compare summary stats
print("BEFORE OUTLIER REPLACED")
## [1] "BEFORE OUTLIER REPLACED"
summary(power_df)
##     CaseSeq       YearMonth              KWH                Date           
##  Min.   :733.0   Length:192         Min.   :  770523   Min.   :1998-01-01  
##  1st Qu.:780.8   Class :character   1st Qu.: 5434539   1st Qu.:2001-12-24  
##  Median :828.5   Mode  :character   Median : 6314472   Median :2005-12-16  
##  Mean   :828.5                      Mean   : 6508071   Mean   :2005-12-15  
##  3rd Qu.:876.2                      3rd Qu.: 7608792   3rd Qu.:2009-12-08  
##  Max.   :924.0                      Max.   :10655730   Max.   :2013-12-01
print("AFTER OUTLIER REPLACED")
## [1] "AFTER OUTLIER REPLACED"
summary(power_df2)
##     CaseSeq       YearMonth              KWH                Date           
##  Min.   :733.0   Length:192         Min.   : 4313019   Min.   :1998-01-01  
##  1st Qu.:780.8   Class :character   1st Qu.: 5443502   1st Qu.:2001-12-24  
##  Median :828.5   Mode  :character   Median : 6351262   Median :2005-12-16  
##  Mean   :828.5                      Mean   : 6539680   Mean   :2005-12-15  
##  3rd Qu.:876.2                      3rd Qu.: 7608792   3rd Qu.:2009-12-08  
##  Max.   :924.0                      Max.   :10655730   Max.   :2013-12-01
# Outlier value
power_df2[power_df2$Date == "2010-07-01", "KWH"]
## # A tibble: 1 × 1
##        KWH
##      <dbl>
## 1 6839438.

Comparing the mean and median values of the target variable KWH before and after replacing the outlier, the mean remained quite stable (\(meanBefore = 6508071\) vs. \(meanAfter = 6539680\)) and the median increased slightly (\(medianBefore = 6314472\) vs. \(medianAfter = 6351262\)). Since the overall distribution was not seriously affected by replacing the outlier, this final data set was used for fitting and forecasting future power consumption.

# Replace the hyphen with a blank space
power_df2$YearMonth <- str_replace(power_df2$YearMonth, "-", " ")

# Convert the tibble to tsibble
power_ts <- power_df2 %>% select(CaseSeq, YearMonth, KWH) %>% mutate(YearMonth = yearmonth(YearMonth)) %>% as_tsibble(index = YearMonth)

head(power_ts)
## # A tsibble: 6 x 3 [1M]
##   CaseSeq YearMonth     KWH
##     <dbl>     <mth>   <dbl>
## 1     733  1998 Jan 6862583
## 2     734  1998 Feb 5838198
## 3     735  1998 Mar 5420658
## 4     736  1998 Apr 5010364
## 5     737  1998 May 4665377
## 6     738  1998 Jun 6467147
# Time series plot
power_ts %>% autoplot(KWH) + labs(title = "Power Consumption")

# Season 
power_ts %>% gg_season(KWH) + labs(title = "Yearly Trends in Power Usage")

# Lag plots
power_ts %>% gg_lag(KWH, lags = 1:12, geom = "point")

# ACF plot
power_ts %>% ACF(KWH, lag_max = 12) %>% autoplot() + labs(title = "ACF Power Consumption")

The season plot and ACF plot both indicate strong monthly seasonal patterns. The seasonality is consistent across years, with higher usage during the winter and summer months, and these periods are positively correlated with each other (positive correlations at lags 5, 6, and 7). The time series plot also shows a slight upward trend toward the end of the series. Decomposing the data may help visualize this more clearly.

An X-11 decomposition is based on classical decomposition, but includes many extra steps and features to overcome the drawbacks of classical decomposition. In particular, trend-cycle estimates are available for all observations including the end points, and the seasonal component is allowed to vary slowly over time.

#Create the decomposition
x11_dcmp <- power_ts |>
  model(x11 = X_13ARIMA_SEATS(KWH ~ x11())) |>
  components()

# Plot the decompostion
autoplot(x11_dcmp) +
  labs(title =
    "Decomposition of Power Usage using X-11.")

MODELLING

# Conduct KPSS test 
power_ts %>% features(KWH, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      1.64        0.01
# Conduct nsdiff test
power_ts %>% features(KWH, unitroot_nsdiffs)
## # A tibble: 1 × 1
##   nsdiffs
##     <int>
## 1       1

The initial approach for this data was to use a seasonal naive (SNAIVE) model.

# Fit model
fit <- power_ts %>% model(SNAIVE = SNAIVE(KWH))
# Check accuracy 
fit %>% accuracy()
## # A tibble: 1 × 10
##   .model .type        ME    RMSE     MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <chr>     <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE Training 94245. 829452. 618081. 0.679  9.19     1     1 0.301
# Forecast and plot
fc_snaive <- fit %>% forecast(h = "1 year")
fc_snaive %>% autoplot(power_ts) + labs(title = "Seasonal Naive Forecast")

# Check the residuals 
fit %>% gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Conduct a Ljung-box test
fit %>% augment() %>% features(.innov, ljung_box, lag = 24)
## # A tibble: 1 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 SNAIVE    98.6  5.18e-11

While the simple SNAIVE model did a decent job of capturing the seasonal pattern in the data, the residuals did not resemble white noise, so there may be information not accounted for. More complex models are explored next.

fit <- power_ts %>% model(ARIMA(KWH, stepwise = FALSE, approx = FALSE)) 

fit %>% report()
## Series: KWH 
## Model: ARIMA(0,0,3)(2,1,0)[12] w/ drift 
## 
## Coefficients:
##          ma1     ma2     ma3     sar1    sar2   constant
##       0.3420  0.0403  0.2116  -0.7298  -0.434  233616.66
## s.e.  0.0785  0.0896  0.0725   0.0767   0.077   76173.02
## 
## sigma^2 estimated as 3.855e+11:  log likelihood=-2657.82
## AIC=5329.65   AICc=5330.3   BIC=5352
# Check the residuals 
fit %>% gg_tsresiduals()

# Conduct a Ljung-box test
fit %>% augment() %>% features(.innov, ljung_box, lag = 24)
## # A tibble: 1 × 3
##   .model                                       lb_stat lb_pvalue
##   <chr>                                          <dbl>     <dbl>
## 1 ARIMA(KWH, stepwise = FALSE, approx = FALSE)    14.9     0.925

The ARIMA model selected was an \(ARIMA(0,0,3)(2,1,0)[12]~w/~drift\). The residual plots and the Ljung-Box test indicate that the residuals are white noise, so the model was used to forecast the next year (2014).

fc <- fit %>% forecast(h = "1 year")

fc %>%
  autoplot(power_ts) +
  labs(title="Power Consumption Forecasts",
       y="KWH")

The ARIMA model captured the seasonal pattern as well as the linear trend beginning to appear at the end of the series.
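As a sanity check, the training-set accuracy of the SNAIVE benchmark and the selected ARIMA model can be compared side by side; the sketch below simply refits both in one model table with the same settings used above.

# Training accuracy of the SNAIVE benchmark vs. the selected ARIMA model (sketch).
power_ts %>%
  model(SNAIVE = SNAIVE(KWH),
        ARIMA_auto = ARIMA(KWH, stepwise = FALSE, approx = FALSE)) %>%
  accuracy() %>%
  select(.model, RMSE, MAE, MAPE, MASE)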

# Save forecasts in new object
power_forecast <- fc[,c(1,2,4)]
power_forecast$DATA <- "Power_usage"

Part C – BONUS, optional (part or all)

Part C consists of two data sets.

Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx

These are simple 2 column sets, however they have different time stamps.

Your optional assignment is to time-base sequence the data and aggregate based on hour (example of what this looks like, follows).

Note for multiple recordings within an hour, take the mean.
Then to determine if the data is stationary and can it be forecast.
If so, provide a week forward forecast and present results via Rpubs and .rmd

Submit the forecast in an Excel readable file.

EDA

pipe1 <- read_excel("/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_624/Project1/Waterflow_Pipe1.xlsx",
                    col_types = c("date","numeric"))
head(pipe1)
## # A tibble: 6 × 2
##   `Date Time`         WaterFlow
##   <dttm>                  <dbl>
## 1 2015-10-23 00:24:06     23.4 
## 2 2015-10-23 00:40:02     28.0 
## 3 2015-10-23 00:53:51     23.1 
## 4 2015-10-23 00:55:40     30.0 
## 5 2015-10-23 01:19:17      6.00
## 6 2015-10-23 01:23:58     15.9
pipe2 <- read_excel("/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_624/Project1/Waterflow_Pipe2.xlsx", col_types = c("date","numeric"))

head(pipe2)
## # A tibble: 6 × 2
##   `Date Time`         WaterFlow
##   <dttm>                  <dbl>
## 1 2015-10-23 01:00:00      18.8
## 2 2015-10-23 02:00:00      43.1
## 3 2015-10-23 03:00:00      38.0
## 4 2015-10-23 04:00:00      36.1
## 5 2015-10-23 05:00:00      31.9
## 6 2015-10-23 06:00:00      28.2
# combine the rows of both pipe data
pipe_combined <- rbind(pipe1, pipe2)

# Change column names
colnames(pipe_combined) <- c("DateTime", "WaterFlow")
# Aggregate the data - taking the mean water flow at each hour
pipe_agg <- pipe_combined %>% group_by(date(DateTime), hour(DateTime)) %>% summarise(Avg_WaterFlow = mean(WaterFlow)) %>% ungroup()
## `summarise()` has grouped output by 'date(DateTime)'. You can override using
## the `.groups` argument.
# Convert the Date and Hour to a datetime object
pipe_agg$Date_Time <- as_datetime(paste0(pipe_agg$`date(DateTime)`, pipe_agg$`hour(DateTime)`), format="%Y-%m-%d %H")

# Select columns
pipe_agg <- pipe_agg %>% select(Date_Time, Avg_WaterFlow)

# View first few rows
head(pipe_agg)
## # A tibble: 6 × 2
##   Date_Time           Avg_WaterFlow
##   <dttm>                      <dbl>
## 1 2015-10-23 00:00:00          26.1
## 2 2015-10-23 01:00:00          18.8
## 3 2015-10-23 02:00:00          24.5
## 4 2015-10-23 03:00:00          25.6
## 5 2015-10-23 04:00:00          22.4
## 6 2015-10-23 05:00:00          23.6
summary(pipe_agg)
##    Date_Time                   Avg_WaterFlow   
##  Min.   :2015-10-23 00:00:00   Min.   : 1.885  
##  1st Qu.:2015-11-02 10:00:00   1st Qu.:23.837  
##  Median :2015-11-12 20:00:00   Median :33.527  
##  Mean   :2015-11-12 20:00:00   Mean   :35.980  
##  3rd Qu.:2015-11-23 06:00:00   3rd Qu.:47.127  
##  Max.   :2015-12-03 16:00:00   Max.   :77.388
# Check dates with < 24 hours 
pipe_agg %>% group_by(date(pipe_agg$Date_Time)) %>% summarise(n = n()) %>% filter(n < 24)
## # A tibble: 1 × 2
##   `date(pipe_agg$Date_Time)`     n
##   <date>                     <int>
## 1 2015-12-03                    17
pipe_agg
## # A tibble: 1,001 × 2
##    Date_Time           Avg_WaterFlow
##    <dttm>                      <dbl>
##  1 2015-10-23 00:00:00          26.1
##  2 2015-10-23 01:00:00          18.8
##  3 2015-10-23 02:00:00          24.5
##  4 2015-10-23 03:00:00          25.6
##  5 2015-10-23 04:00:00          22.4
##  6 2015-10-23 05:00:00          23.6
##  7 2015-10-23 06:00:00          21.4
##  8 2015-10-23 07:00:00          16.2
##  9 2015-10-23 08:00:00          22.3
## 10 2015-10-23 09:00:00          29.9
## # ℹ 991 more rows

The last day of the series contains only 17 hours of data.
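Before converting to a tsibble, it is worth confirming that the hourly index has no implicit gaps; a short sketch using tsibble's count_gaps():

# Check for missing hours in the aggregated series (sketch); an empty result
# means every hour between the first and last timestamp is present.
as_tsibble(pipe_agg, index = Date_Time) %>% count_gaps()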

# Create a tsibble 
pipe_ts <- as_tsibble(pipe_agg, index = Date_Time)

# View first few rows
head(pipe_ts)
## # A tsibble: 6 x 2 [1h] <UTC>
##   Date_Time           Avg_WaterFlow
##   <dttm>                      <dbl>
## 1 2015-10-23 00:00:00          26.1
## 2 2015-10-23 01:00:00          18.8
## 3 2015-10-23 02:00:00          24.5
## 4 2015-10-23 03:00:00          25.6
## 5 2015-10-23 04:00:00          22.4
## 6 2015-10-23 05:00:00          23.6
# Plot the time series
pipe_ts %>% autoplot(Avg_WaterFlow) + labs(title = "WaterFlow Time Series")

# Conduct a KPSS test
pipe_ts %>% features(Avg_WaterFlow, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      4.06        0.01

Check any necessary differencing and resultant stationarity

# Plot the differenced time series
pipe_ts %>% autoplot(difference(Avg_WaterFlow)) + labs(title = "WaterFlow Time Series")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# Re check KPSS 
pipe_ts %>% features(difference(Avg_WaterFlow), unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1   0.00752         0.1

When applying a first-order difference, the data achieves stationarity: the KPSS p-value rises to 0.1, so we fail to reject the null hypothesis of stationarity.
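The same conclusion can be reached with the automated differencing feature; a short sketch:

# Number of first differences suggested by the KPSS-based feature (sketch);
# a value of 1 would match the result above.
pipe_ts %>% features(Avg_WaterFlow, unitroot_ndiffs)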

# Decompose data using STL 
stl_decomp_c <- pipe_ts %>% model(STL(Avg_WaterFlow)) %>% components()

# Plot the STL decomposition 
stl_decomp_c %>% autoplot() + labs(title = "STL")

pipe_ts %>% gg_lag(Avg_WaterFlow, lags = 1:24, geom = "point")

pipe_ts %>% ACF(Avg_WaterFlow, lag_max = 24) %>% autoplot()

The autocorrelation plot up to lag 24 showed low or very low positive correlations, indicating minimal to no seasonality in the hourly series.

MODELLING

# fit model
fit <- pipe_ts %>% model(ARIMA_auto = ARIMA(Avg_WaterFlow, stepwise = FALSE, approx = FALSE), 
                         ETS_auto = ETS(Avg_WaterFlow)) 
fit %>% select(ARIMA_auto) %>% report()
## Series: Avg_WaterFlow 
## Model: ARIMA(0,1,1)(1,0,0)[24] 
## 
## Coefficients:
##           ma1    sar1
##       -0.9649  0.0783
## s.e.   0.0083  0.0320
## 
## sigma^2 estimated as 211.8:  log likelihood=-4097.13
## AIC=8200.25   AICc=8200.28   BIC=8214.98
fit %>% select(ETS_auto) %>% report()
## Series: Avg_WaterFlow 
## Model: ETS(M,N,N) 
##   Smoothing parameters:
##     alpha = 0.03720267 
## 
##   Initial states:
##      l[0]
##  20.63633
## 
##   sigma^2:  0.1579
## 
##      AIC     AICc      BIC 
## 12168.50 12168.53 12183.23
# Calculate accuracy
fit %>% accuracy()
## # A tibble: 2 × 10
##   .model     .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>      <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 ARIMA_auto Training 0.526  14.5  11.1 -26.3  47.8 0.735 0.739 -0.0231
## 2 ETS_auto   Training 0.630  14.6  11.2 -26.0  47.9 0.738 0.742 -0.0226

The two models produce similar accuracy metrics; comparing RMSE, the ARIMA model performs slightly better.

fit %>% select(ARIMA_auto) %>% gg_tsresiduals()

# Conduct a ljung_box test to determine if residuals are white noise
fit %>% augment() %>% features(.innov, ljung_box, lag = 38)
## # A tibble: 2 × 3
##   .model     lb_stat lb_pvalue
##   <chr>        <dbl>     <dbl>
## 1 ARIMA_auto    58.1    0.0193
## 2 ETS_auto      56.7    0.0259
library(ggpubr)

arima_fc <- fit %>% select(ARIMA_auto) %>% forecast(h = "1 week") 

ets_fc <- fit %>% select(ETS_auto) %>% forecast(h = "1 week") 

ggarrange(ets_fc %>% autoplot(pipe_ts) + labs(title = "ETS(M,N,N)"), 
          arima_fc %>% autoplot(pipe_ts, color = "red") + labs(title = "ARIMA(0,1,1)(1,0,0)[24]"))

According to the Ljung-Box statistics and p-values, neither model’s residuals resemble white noise, but forecasts were still carried out. Neither model captured the patterns in the data: the prediction intervals in both forecasts are very wide, similar to the range of values within the data set. The data seems to have complex patterns, and other modeling techniques, such as the dynamic harmonic regression sketched below, could be considered.
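One candidate among those other techniques is a dynamic harmonic regression, where Fourier terms for the 24-hour cycle are combined with an ARIMA error model. The sketch below is an illustration only and was not fitted for this report; the choice of K = 4 and restricting the seasonal ARIMA part with PDQ(0, 0, 0) are arbitrary assumptions.

# Dynamic harmonic regression sketch: Fourier terms capture the daily cycle
# while the ARIMA part models the remaining short-term dynamics.
fit_dhr <- pipe_ts %>%
  model(DHR = ARIMA(Avg_WaterFlow ~ fourier(period = 24, K = 4) + PDQ(0, 0, 0)))

fit_dhr %>% forecast(h = "1 week") %>% autoplot(pipe_ts)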

partc_df <- bind_rows(arima_fc, ets_fc)
partc_df$DATA <- "Water_flow"
partc_df <- partc_df[,c(1,2,4,5)]

COMBINE DATA

Combine the forecasts from Parts A, B, and C, then save them to an Excel-readable file.

# Standardize the column names across the forecast tables
colnames(atm4_forecasts) <- c(".model", "DATE", ".mean", "DATA")
colnames(power_forecast) <- c(".model", "DATE", ".mean", "DATA")
colnames(partc_df) <- c(".model", "DATE", ".mean", "DATA")

# combine the tsibble forecasts
all_forecasts <- rbind(as_tibble(atm1_forecasts), 
                 as_tibble(atm2_forecasts), 
                 as_tibble(atm3_forecasts), 
                 as_tibble(atm4_forecasts), 
                 as_tibble(power_forecast),
                 as_tibble(partc_df))

write_excel_csv(all_forecasts, "/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_624/all_forecasts.csv")
