library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fable)
## Warning: package 'fable' was built under R version 4.3.3
## Loading required package: fabletools
## Warning: package 'fabletools' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.3.3
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tsibble 1.1.5 ✔ feasts 0.4.1
## ✔ tsibbledata 0.4.1
## Warning: package 'tsibble' was built under R version 4.3.3
## Warning: package 'feasts' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(readxl)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
Instructions
In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010.
Deliverable:
Explain and demonstrate your process, techniques used and not used, and your actual forecast.
I am giving you data via an Excel file. Please provide your written report on your findings, visuals, discussion, and your R code via an RPubs link, along with the actual .rmd file.
Also please submit the forecast which you will put in an Excel readable file.
# Read the data in and convert columns to data types
atm_data <- read_excel("ATM624Data.xlsx", col_types = c("date", "text", "numeric"))
atm_data$DATE <- as.Date(atm_data$DATE)
# View first few rows
head(atm_data, 10)
## # A tibble: 10 × 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-01 ATM2 107
## 3 2009-05-02 ATM1 82
## 4 2009-05-02 ATM2 89
## 5 2009-05-03 ATM1 85
## 6 2009-05-03 ATM2 90
## 7 2009-05-04 ATM1 90
## 8 2009-05-04 ATM2 55
## 9 2009-05-05 ATM1 99
## 10 2009-05-05 ATM2 79
# View last few rows
tail(atm_data, 10)
## # A tibble: 10 × 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-04-21 ATM4 557.
## 2 2010-04-22 ATM4 386.
## 3 2010-04-23 ATM4 165.
## 4 2010-04-24 ATM4 5.45
## 5 2010-04-25 ATM4 542.
## 6 2010-04-26 ATM4 404.
## 7 2010-04-27 ATM4 13.7
## 8 2010-04-28 ATM4 348.
## 9 2010-04-29 ATM4 44.2
## 10 2010-04-30 ATM4 482.
# Summary Stats
summary(atm_data)
## DATE ATM Cash
## Min. :2009-05-01 Length:1474 Min. : 0.0
## 1st Qu.:2009-08-01 Class :character 1st Qu.: 0.5
## Median :2009-11-01 Mode :character Median : 73.0
## Mean :2009-10-31 Mean : 155.6
## 3rd Qu.:2010-02-01 3rd Qu.: 114.0
## Max. :2010-05-14 Max. :10919.8
## NA's :19
# View distribution of the cash in each ATM
atm_data %>% ggplot(aes(x = ATM, y = Cash)) + geom_boxplot() +
labs(title = "Distribution of Cash",
x = "ATM",
y = "Cash (USD)")
## Warning: Removed 19 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
There is a very clear outlier in the 4th ATM that skews the ability to observe the cash distribution of each ATM. Filtering the data for each ATM and then visualizing the distribution of values might be a better approach.
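As a sketch of that per-ATM view (an illustrative aside; the free y-scales are an assumption so that ATM4's outlier does not compress the other panels):

# Boxplot of Cash for each ATM in its own panel, with free y-scales
atm_data %>%
  drop_na(ATM, Cash) %>%
  ggplot(aes(y = Cash)) +
  geom_boxplot() +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(title = "Distribution of Cash by ATM", y = "Cash (USD)")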
atm_data %>% group_by(ATM) %>% summarise(Min_Date = min(DATE, na.rm = T),
Max_Date = max(DATE, na.rm = T),
Min_cash = min(Cash, na.rm = T),
Q1_cash = quantile(Cash, 0.25, na.rm = T),
Median_Cash = median(Cash, na.rm = T),
Mean_Cash = mean(Cash, na.rm = T),
SD_Cash = sd(Cash, na.rm = T),
Q3_Cash = quantile(Cash, 0.75, na.rm = T),
Max_Cash = max(Cash, na.rm = T))
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Min_cash = min(Cash, na.rm = T)`.
## ℹ In group 5: `ATM = NA`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
## # A tibble: 5 × 10
## ATM Min_Date Max_Date Min_cash Q1_cash Median_Cash Mean_Cash SD_Cash
## <chr> <date> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 2009-05-01 2010-04-30 1 73 91 83.9 36.7
## 2 ATM2 2009-05-01 2010-04-30 0 25.5 67 62.6 38.9
## 3 ATM3 2009-05-01 2010-04-30 0 0 0 0.721 7.94
## 4 ATM4 2009-05-01 2010-04-30 1.56 124. 404. 474. 651.
## 5 <NA> 2010-05-01 2010-05-14 Inf NA NA NaN NA
## # ℹ 2 more variables: Q3_Cash <dbl>, Max_Cash <dbl>
# Check NA values
na_count <-sapply(atm_data, function(y) sum(length(which(is.na(y)))))
na_count
## DATE ATM Cash
## 0 14 19
na_rows <- atm_data[rowSums(is.na(atm_data)) > 0,]
na_rows
## # A tibble: 19 × 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-06-13 ATM1 NA
## 2 2009-06-16 ATM1 NA
## 3 2009-06-18 ATM2 NA
## 4 2009-06-22 ATM1 NA
## 5 2009-06-24 ATM2 NA
## 6 2010-05-01 <NA> NA
## 7 2010-05-02 <NA> NA
## 8 2010-05-03 <NA> NA
## 9 2010-05-04 <NA> NA
## 10 2010-05-05 <NA> NA
## 11 2010-05-06 <NA> NA
## 12 2010-05-07 <NA> NA
## 13 2010-05-08 <NA> NA
## 14 2010-05-09 <NA> NA
## 15 2010-05-10 <NA> NA
## 16 2010-05-11 <NA> NA
## 17 2010-05-12 <NA> NA
## 18 2010-05-13 <NA> NA
## 19 2010-05-14 <NA> NA
Most of the missing values fall on the dates we are instructed to forecast, so those observations can be dropped from the original ATM data. Viewing the remaining rows with NA values, there were five observations (3 in ATM1 and 2 in ATM2) where the missing values were in the target variable, Cash. These missing values will need to be handled, and the initial thought was to separate the data into the four ATMs.
# Re-check missing values
atm_data <- atm_data %>% drop_na(ATM)
na_rows <- atm_data[rowSums(is.na(atm_data)) > 0,]
na_rows
## # A tibble: 5 × 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-06-13 ATM1 NA
## 2 2009-06-16 ATM1 NA
## 3 2009-06-18 ATM2 NA
## 4 2009-06-22 ATM1 NA
## 5 2009-06-24 ATM2 NA
# Create separate data frames for each of the ATM machines
atm1 <- atm_data %>% filter(ATM == "ATM1")
atm2 <- atm_data %>% filter(ATM == "ATM2")
atm3 <- atm_data %>% filter(ATM == "ATM3")
atm4 <- atm_data %>% filter(ATM == "ATM4")
Missing values for ATM1 and ATM2 were only found in June. To decide how to deal with them, we can look at the mean and median cash for that month in each ATM. Imputing the mean into the missing data and re-checking how much that affects the spread of the data is a first step in determining whether this simple imputation technique is appropriate.
# ATM1 and ATM2 - Mean and median for the month of June
atm_data %>% filter(ATM %in% c("ATM1", "ATM2")) %>% group_by(ATM) %>% filter(month(DATE) == 6) %>% summarise(mean = mean(Cash, na.rm = T), median = median(Cash, na.rm = T))
## # A tibble: 2 × 3
## ATM mean median
## <chr> <dbl> <dbl>
## 1 ATM1 89.3 104
## 2 ATM2 76.0 82
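As a quick sketch of that simple approach (illustrative only, not the imputation ultimately used), we can fill the missing values with each ATM's overall mean and compare the spread before and after:

# Illustrative sketch: impute the per-ATM mean and compare the spread of Cash
atm_data %>%
  filter(ATM %in% c("ATM1", "ATM2")) %>%
  group_by(ATM) %>%
  mutate(Cash_mean_imp = if_else(is.na(Cash), mean(Cash, na.rm = TRUE), Cash)) %>%
  summarise(sd_original = sd(Cash, na.rm = TRUE),
            sd_mean_imputed = sd(Cash_mean_imp))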
Since there aren’t big gaps in the data, an approach that respects the temporal structure, rolling statistic imputation, will be used. This approach replaces missing values with a statistic such as the mean computed over a defined window. It can be carried out with the imputeTS package; here a simple moving average with a window of 4 was used. This strategy did a better job of preserving the underlying distribution of the original data than imputing the mean cash across the entire time series within each ATM.
library(imputeTS)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
atm_data %>% filter(ATM %in% c("ATM1", "ATM2")) %>%
group_by(ATM) %>%
filter(month(DATE) == 6) %>%
arrange(ATM) %>%
na_ma(k = 4, weighting = "simple") %>%
summarise(mean = mean(Cash, na.rm = T), median = median(Cash, na.rm = T))
## # A tibble: 2 × 3
## ATM mean median
## <chr> <dbl> <dbl>
## 1 ATM1 90.2 103.
## 2 ATM2 76.1 81.3
# Impute missing values for ATM1 and ATM2
atm1 <- atm1 %>% na_ma(k = 4, weighting = "simple")
atm2 <- atm2 %>% na_ma(k = 4, weighting = "simple")
# Convert each data frame into a tsibble
atm1_ts <- as_tsibble(atm1, index = DATE)
atm2_ts <- as_tsibble(atm2, index = DATE)
atm3_ts <- as_tsibble(atm3, index = DATE)
atm4_ts <- as_tsibble(atm4, index = DATE)
# Plot entire time series
atm1_ts %>% autoplot(Cash) + labs(title = "ATM_1 Cash Removed", x = "Date", y = "USD")
# Plot the time series for one month
atm1_ts %>% filter(month(DATE) == 5) %>% autoplot(Cash) + labs(title = "ATM_1 Cash Removed", x = "Date", y = "USD")
Visualizing the time series, we don’t see any linear trend but a strong pattern of large spikes with intermittent smaller fluctuations, indicating seasonality within the data. Visualizing the autocorrelations will help identify that seasonality. There appear to be a few time points where the spikes are higher, but no other outliers are noted.
atm1_ts %>% gg_lag(Cash, lags = 1:14, geom = "point")
atm1_ts %>% ACF(Cash, lag_max = 21) %>% autoplot()
# Decompose data using STL
stl_decomp <- atm1_ts %>% model(STL(Cash ~ trend() + season(), robust = TRUE)) %>% components()
# Plot the STL decomposition
stl_decomp %>% autoplot() + labs(title = "STL")
The autocorrelation plot confirms the seasonality: we see strong lags at multiples of 7, indicating a correlation with the same days in previous weeks. Decomposing the time series helps us understand any underlying patterns, trends, and seasonal variations. Visualizing the components of this data, we see no clear linear trend but perhaps some cyclical behavior. There is a strong seasonal component in the season_week plot, where the large down spike is associated with the same day of the week. We also see a lot of variation in the remainder component, which may indicate that the trend and seasonal components do not account for all of the variability in the data.
Determine if transformation is necessary
A Box-Cox transformation might not be helpful for this particular ATM data, as the variation does not consistently increase or decrease throughout the series. Visualizing the transformation and comparing it to the original data, it does not appear to have stabilized the variability, so we will keep the original data.
lambda <- atm1_ts |>
features(Cash, features = guerrero) |>
pull(lambda_guerrero)
atm1_ts %>%
autoplot(box_cox(Cash, lambda)) +
labs(y = "",
title = paste0(
"Transformed Cash in ATM1 with lambda = ", round(lambda, 4)))
Check for Stationarity and Determine Differencing: Unit Root Tests
A KPSS p-value > 0.05 means we fail to reject the null hypothesis that the data is stationary. On the original data we obtain a p-value of 0.0472, so we reject the null hypothesis in favor of the alternative, which states that the data is not stationary, and differencing may be needed.
# Conduct KPSS test
atm1_ts %>% features(Cash, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.475 0.0472
# Find the number of seasonal differencing
atm1_ts %>% features(Cash, unitroot_nsdiffs)
## # A tibble: 1 × 1
## nsdiffs
## <int>
## 1 1
# Seasonally difference the data
atm1_ts <- atm1_ts %>% mutate(Cash_diff = difference(Cash, lag = 7))
atm1_ts %>% features(Cash_diff, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 0
# Plot differenced data
atm1_ts %>% autoplot(Cash_diff)
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Check KPSS test
atm1_ts %>% features(Cash_diff, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.0217 0.1
This step was carried out in anticipation of fitting and forecasting with an ARIMA model. If the ARIMA function is used to select the best model, we would expect at least one seasonal difference to be applied to obtain stationarity.
Exponential Smoothing Modelling
Holt-Winters’ methods are suggested when dealing with seasonal data. The additive method is preferred when the seasonal variations are roughly constant through the series, which is what we are seeing here. Another model suggested for seasonal data is a damped Holt-Winters’ method with multiplicative seasonality. Below is a comparison of four different Holt-Winters models: MAdM, AAA, MAM, and an automatically selected ETS.
fit <- atm1_ts %>% model(
hw_madm = ETS(Cash ~ error("M") + trend("Ad") + season("M")),
additive = ETS(Cash ~ error("A") + trend("A") + season("A")),
multiplicative = ETS(Cash ~ error("M") + trend("A") + season("M")),
auto = ETS(Cash)
)
fit %>% accuracy()
## # A tibble: 4 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 hw_madm Training -0.982 26.2 16.2 -127. 143. 0.919 0.947 0.0868
## 2 additive Training -0.140 23.8 15.1 -107. 122. 0.856 0.859 0.135
## 3 multiplicative Training -2.07 26.2 16.2 -129. 144. 0.921 0.946 0.0824
## 4 auto Training -0.0910 23.7 15.0 -106. 121. 0.852 0.857 0.137
fit %>% report()
## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.
## # A tibble: 4 × 9
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 hw_madm 0.136 -2273. 4573. 4574. 4623. 688. 702. 0.219
## 2 additive 583. -2233. 4491. 4492. 4537. 565. 569. 15.1
## 3 multiplicative 0.132 -2274. 4571. 4572. 4618. 687. 701. 0.216
## 4 auto 578. -2233. 4485. 4486. 4524. 563. 566. 15.0
fit %>% select(auto) %>% report()
## Series: Cash
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.01696495
## gamma = 0.3311424
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 78.07218 -60.61548 2.148172 8.582304 7.293434 13.07175 15.52 13.99981
##
## sigma^2: 577.6921
##
## AIC AICc BIC
## 4485.400 4486.021 4524.399
Comparing four models using exponential smoothing, the model with additive error, no trend, and additive seasonality produced the lowest AICc and RMSE. We can further evaluate this model by studying the residuals.
fit %>% select(auto) %>% gg_tsresiduals()
# Conduct a ljung_box test to determine if residuals are white noise
fit %>% select(auto) %>% augment() %>% features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 auto 28.7 0.0115
The Ljung-Box test tells us that the residuals are not independent and cannot be considered white noise. There may be a better model that explains more of the data than an ETS model; the next set of models to evaluate are ARIMA models.
Fitting an ARIMA model
It was previously determined that differencing was needed, so we should expect that to be part of the ARIMA model. I chose to let ARIMA() search for the best model by setting stepwise = FALSE and approx = FALSE. The model that came back was an \(ARIMA(0,0,1)(1,1,1)_7\).
# Fit ARIMA MODEL with search function
fit <- atm1_ts %>% model(ARIMA_auto = ARIMA(Cash, stepwise = FALSE, approx = FALSE))
fit %>% report()
## Series: Cash
## Model: ARIMA(0,0,1)(1,1,1)[7]
##
## Coefficients:
## ma1 sar1 sma1
## 0.1810 0.1800 -0.7521
## s.e. 0.0547 0.0792 0.0552
##
## sigma^2 estimated as 555.2: log likelihood=-1639.68
## AIC=3287.36 AICc=3287.47 BIC=3302.88
Model Diagnostics
# Visualize the residuals
fit %>% gg_tsresiduals()
# Conduct a ljung_box test to determine if residuals are white noise
fit %>% augment() %>% features(.innov, ljung_box, lag = 10)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA_auto 11.6 0.315
Visualizing the residuals, there are no significant autocorrelations and the variance is mostly homoscedastic, though a few larger residuals remain. The Ljung-Box test indicates that the residuals are independent and indistinguishable from white noise, so this model can be used to forecast the next month. The selected model included one order of seasonal differencing, which matches what we found earlier when evaluating the stationarity of the original data.
fc <- fit %>% forecast(h = "1 month")
fc %>%
autoplot(atm1_ts) +
labs(title="Cash Retrieval ATM 1",
y="USD")
# Pull out the forecasts for ATM1
atm1_forecasts <- fc[,c(1,2,4)]
atm1_forecasts$DATA <- "ATM1"
# Find the sum of the forecasts for May 2010
sum(atm1_forecasts$.mean)
## [1] 2360.773
# Find sum of all months in the data set
atm1 %>% group_by(month(DATE)) %>% summarise(Cash_total = sum(Cash))
## # A tibble: 12 × 2
## `month(DATE)` Cash_total
## <dbl> <dbl>
## 1 1 2677
## 2 2 2531
## 3 3 2433
## 4 4 2300
## 5 5 2480
## 6 6 2705.
## 7 7 2621
## 8 8 3129
## 9 9 2826
## 10 10 2247
## 11 11 2289
## 12 12 2422
# Plot full time series for ATM2
atm2_ts %>% autoplot(Cash) + labs(title = "ATM_2 Cash Removed", x = "Date", y = "USD")
# Plot the time series for one month
atm2_ts %>% filter(month(DATE) == 5) %>% autoplot(Cash) + labs(title = "ATM_2 Cash Removed", x = "Date", y = "USD")
Visualizing the data for ATM2, there are large fluctuations similar to the first ATM’s data, suggesting seasonality. One difference is the presence of a slight downward trend across the time series.
# Visualize the lag plot
atm2_ts %>% gg_lag(Cash, lags = 1:14, geom = "point")
# Visualize the ACF
atm2_ts %>% ACF(Cash, lag_max = 21) %>% autoplot()
There are moderate autocorrelations at lags that are multiples of 7, and the autocorrelation decreases as the lags get further apart, indicating the presence of a trend.
# Decompose data using STL
stl_decomp <- atm2_ts %>% model(STL(Cash ~ trend() + season(), robust = TRUE)) %>% components()
# Plot the STL decomposition
stl_decomp %>% autoplot() + labs(title = "STL: ATM2")
Decomposing the time series of ATM2, the trend pattern is clearer. There is an interesting period between February and March where the season_week component looks like it gets pinched. This observation matches a period in the time series where the amount of cash taken out seems to rapidly increase and then drop back down, which was not easily seen in the time series plot alone.
As this model exhibits similar characteristics to the data in ATM1, an ARIMA model will be fitted and evaluated.
# Conduct KPSS test
atm2_ts %>% features(Cash, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 2.18 0.01
The original data from ATM2 is not stationary according to the KPSS test, so we should expect some differencing in the ARIMA model. A search method was used to find the ARIMA model with the lowest AICc. The model chosen was a seasonal \(ARIMA(2,0,2)(0,1,1)_7\).
fit <- atm2_ts %>% model(ARIMA(Cash, stepwise = FALSE, approx = FALSE))
fit %>% report()
## Series: Cash
## Model: ARIMA(2,0,2)(0,1,1)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4321 -0.9155 0.4748 0.8099 -0.7555
## s.e. 0.0551 0.0401 0.0860 0.0550 0.0382
##
## sigma^2 estimated as 609.4: log likelihood=-1655.7
## AIC=3323.41 AICc=3323.65 BIC=3346.69
Model Diagnostics
# Visualize the residuals
fit %>% gg_tsresiduals(lag=21)
# Conduct a ljung_box test to determine if residuals are white noise
fit %>% augment() %>% features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Cash, stepwise = FALSE, approx = FALSE) 10.7 0.712
The residual plots and the Ljung-Box test confirm that the residuals are white noise, so forecasting can move forward using this model.
# Forecast the next month
fc <- fit %>% forecast(h = "1 month")
fc %>% autoplot(atm2_ts) +
labs(title="Cash Retrieval ATM 2",
y="USD")
# Pull out the forecasts for ATM2
atm2_forecasts <- fc[,c(1,2,4)]
atm2_forecasts$DATA <- "ATM2"
# Find the sum of the forecasts for May 2010
sum(atm2_forecasts$.mean)
## [1] 1775.202
# Find sum of all months in the data set
atm2 %>% group_by(month(DATE)) %>% summarise(Cash_total = sum(Cash))
## # A tibble: 12 × 2
## `month(DATE)` Cash_total
## <dbl> <dbl>
## 1 1 1811
## 2 2 1517
## 3 3 1602
## 4 4 1842
## 5 5 2338
## 6 6 2282.
## 7 7 2094
## 8 8 2230
## 9 9 1867
## 10 10 1906
## 11 11 1692
## 12 12 1688
atm3_ts %>% autoplot(Cash) + labs(title = "ATM_3 Cash Removed", x = "Date", y = "USD")
# Plot the time series for April to visualize the spike in the data
atm3_ts %>% filter(month(DATE) == 4) %>% autoplot(Cash) + labs(title = "ATM_3 Cash", x = "Date", y = "USD")
tail(atm3_ts)
## # A tsibble: 6 x 3 [1D]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-04-25 ATM3 0
## 2 2010-04-26 ATM3 0
## 3 2010-04-27 ATM3 0
## 4 2010-04-28 ATM3 96
## 5 2010-04-29 ATM3 82
## 6 2010-04-30 ATM3 85
atm3_ts %>% ACF(Cash) %>% autoplot()
Visualizing this data we see that all values except three are zero; the non-zero values occur at the end of the time series. Since there are so few non-zero values to base a forecast on, simpler models will probably be best in this case. The simple model that fits best here is a Naive model, which uses the last observed value for all forecasts. The ACF plot does not provide much information regarding seasonality, cyclic behavior, or trend, as there are only a few non-zero observations. The moderate to strong positive autocorrelations also indicate that previous values might be good predictors of future ones.
fit <- atm3_ts %>% model(NAIVE(Cash))
fit %>% gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
fc <- fit %>% forecast(h = "1 month")
fc %>%
autoplot(atm3_ts) +
labs(title="Cash Retrieval ATM 3",
y="USD")
#Use the hilo() function to get the 95% prediction intervals
hilo_tibble <- fc %>% hilo(level = 95)
hilo_tibble
## # A tsibble: 30 x 5 [1D]
## # Key: .model [1]
## .model DATE
## <chr> <date>
## 1 NAIVE(Cash) 2010-05-01
## 2 NAIVE(Cash) 2010-05-02
## 3 NAIVE(Cash) 2010-05-03
## 4 NAIVE(Cash) 2010-05-04
## 5 NAIVE(Cash) 2010-05-05
## 6 NAIVE(Cash) 2010-05-06
## 7 NAIVE(Cash) 2010-05-07
## 8 NAIVE(Cash) 2010-05-08
## 9 NAIVE(Cash) 2010-05-09
## 10 NAIVE(Cash) 2010-05-10
## # ℹ 20 more rows
## # ℹ 3 more variables: Cash <dist>, .mean <dbl>, `95%` <hilo>
Inspecting the 95% prediction intervals, we see that the range gets larger as the horizon extends, which is typical when forecasting time series. It is not surprising that the prediction intervals are very wide, as there is minimal data to forecast from.
# Pull out the forecasts for ATM3
atm3_forecasts <- fc[,c(1,2,4)]
atm3_forecasts$DATA <- "ATM3"
# Find the sum of the forecasts for May 2010
sum(atm3_forecasts$.mean)
## [1] 2550
atm4_ts %>% autoplot(Cash) + labs(title = "ATM_4 Cash Removed", x = "Date", y = "USD")
# Plot the time series for one month
atm4_ts %>% filter(month(DATE) == 2) %>% autoplot(Cash) + labs(title = "ATM_4 Cash Removed", x = "Date", y = "USD")
While there are no missing values in the data from ATM4, there is a clear outlier occurring at the beginning of February. The remaining data on either side does not seem to follow a specific trend or seasonal pattern. The next step is to determine a strategy for handling this outlier.
# Visualize the lag plot
atm4_ts %>% gg_lag(Cash, lags = 1:14, geom = "point")
# Visualize the ACF
atm4_ts %>% ACF(Cash, lag_max = 21) %>% autoplot()
Plotting up to 14 lags, we can see the influence the outlier has on the rest of the lags, causing them to bunch up at the bottom left of each panel. When plotting the autocorrelations, there are also no consistent patterns indicating trend or seasonality; the autocorrelation plot resembles that of stationary data.
To handle the outlier we might be able to consider this an error in recording. To confirm any other outlier presence, boxplots can be very helpful. The box plot below identifies 3 total outliers, however there is the one value that is far beyond the other two outliers. One thought was to use a rolling average similar to how NA values were imputed in the original data frame.
atm4_ts %>% ggplot(aes(y =Cash)) + geom_boxplot()
summary(atm4)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 1.563
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 124.334
## Median :2009-10-30 Mode :character Median : 403.839
## Mean :2009-10-30 Mean : 474.043
## 3rd Qu.:2010-01-29 3rd Qu.: 704.507
## Max. :2010-04-30 Max. :10919.762
# Q3 + IQR*1.5
atm4_ts %>% filter(Cash >= 1570)
## # A tsibble: 3 x 3 [1D]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-09-22 ATM4 1712.
## 2 2010-01-29 ATM4 1575.
## 3 2010-02-09 ATM4 10920.
# Impute an NA to that date and recode values using a rolling statistic
atm4_ts[atm4_ts$DATE == "2010-02-09", 3] = NA
# Fill na with rolling stat
atm4_ts <- atm4_ts %>% na_ma(k = 10, weighting = "simple")
# Recheck summary stats
summary(atm4_ts)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 1.563
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 124.334
## Median :2009-10-30 Mode :character Median : 403.839
## Mean :2009-10-30 Mean : 445.440
## 3rd Qu.:2010-01-29 3rd Qu.: 704.192
## Max. :2010-04-30 Max. :1712.075
# Visualize the time series plot
atm4_ts %>% autoplot(Cash) + labs(title = "ATM_4 Cash Removed", x = "Date", y = "USD")
# Visualize the lag plot
atm4_ts %>% gg_lag(Cash, lags = 1:14, geom = "point")
# Visualize the ACF
atm4_ts %>% ACF(Cash, lag_max = 21) %>% autoplot()
Re-coding the outlier revealed an underlying pattern that could not be seen with such a highly influential point present. Re-plotting the time series, lag plot, and autocorrelations, we see behavior similar to what was observed in the ATM1 and ATM2 data.
lambda <- atm4_ts |>
features(Cash, features = guerrero) |>
pull(lambda_guerrero)
atm4_ts %>%
autoplot(box_cox(Cash, lambda)) +
labs(y = "",
title = paste0(
"Transformed Cash in ATM4 with lambda = ", round(lambda, 4)))
atm4_ts <- atm4_ts %>% mutate(Cash_trans = box_cox(Cash, lambda))
A Box-Cox transformation was also applied to this ATM’s data in order to make the pattern more consistent across the whole series. The range and spread of values in this ATM were much larger than in the other ATMs, and simpler patterns are usually easier to model and lead to more accurate forecasts.
# Conduct KPSS test
atm4_ts %>% features(Cash_trans, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.0769 0.1
Based on the KPSS statistic and p-value, the data is stationary and we should not expect any differencing to be needed.
fit <- atm4_ts %>% model(ARIMA(Cash_trans, stepwise = FALSE, approx = FALSE))
fit %>% report()
## Series: Cash_trans
## Model: ARIMA(1,0,0)(2,0,0)[7] w/ mean
##
## Coefficients:
## ar1 sar1 sar2 constant
## 0.0762 0.2093 0.1985 15.9681
## s.e. 0.0526 0.0516 0.0525 0.6947
##
## sigma^2 estimated as 185.1: log likelihood=-1469.24
## AIC=2948.48 AICc=2948.65 BIC=2967.98
Model Diagnostics
# Visualize the residuals
fit %>% gg_tsresiduals()
# Conduct a ljung_box test to determine if residuals are white noise
fit %>% augment() %>% features(.innov, ljung_box, lag = 14)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Cash_trans, stepwise = FALSE, approx = FALSE) 15.6 0.336
# Forecast the next month
fc <- fit %>% forecast(h = "1 month")
fc %>%
autoplot(atm4_ts) +
labs(title="Cash Retrieval ATM 4",
y="USD")
fc$mean_trans <- inv_box_cox(fc$.mean, lambda = lambda)
# Save forecasts in new object
atm4_forecasts <- fc[,c(1,2,5)]
atm4_forecasts$DATA <- "ATM4"
# Sum up the forecasted Cash
sum(atm4_forecasts$mean_trans)
## [1] 9623.493
# Find sum of all months in the data set
atm4 %>% group_by(month(DATE)) %>% summarise(Cash_total = sum(Cash))
## # A tibble: 12 × 2
## `month(DATE)` Cash_total
## <dbl> <dbl>
## 1 1 14903.
## 2 2 23084.
## 3 3 13315.
## 4 4 11858.
## 5 5 12622.
## 6 6 13744.
## 7 7 14477.
## 8 8 15903.
## 9 9 17733.
## 10 10 9514.
## 11 11 12908.
## 12 12 12966.
Instructions
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013.
Your assignment is to model these data and provide a monthly forecast for 2014.
The data is given in a single file, ResidentialCustomerForecastLoad-624.xlsx. The variable ‘KWH’ is power consumption in kilowatt hours; the rest is straightforward.
Add this to your existing files above.
power_df <- read_excel("/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_624/Project1/ResidentialCustomerForecastLoad-624.xlsx")
head(power_df)
## # A tibble: 6 × 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
# Re-name columns
colnames(power_df) <- c("CaseSeq", "YearMonth", "KWH")
head(power_df)
## # A tibble: 6 × 3
## CaseSeq YearMonth KWH
## <dbl> <chr> <dbl>
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
summary(power_df)
## CaseSeq YearMonth KWH
## Min. :733.0 Length:192 Min. : 770523
## 1st Qu.:780.8 Class :character 1st Qu.: 5429912
## Median :828.5 Mode :character Median : 6283324
## Mean :828.5 Mean : 6502475
## 3rd Qu.:876.2 3rd Qu.: 7620524
## Max. :924.0 Max. :10655730
## NA's :1
power_df$Date <- paste0(power_df$YearMonth, "-01")
power_df$Date <- as.Date(power_df$Date, format = "%Y-%b-%d")
head(power_df, 20)
## # A tibble: 20 × 4
## CaseSeq YearMonth KWH Date
## <dbl> <chr> <dbl> <date>
## 1 733 1998-Jan 6862583 1998-01-01
## 2 734 1998-Feb 5838198 1998-02-01
## 3 735 1998-Mar 5420658 1998-03-01
## 4 736 1998-Apr 5010364 1998-04-01
## 5 737 1998-May 4665377 1998-05-01
## 6 738 1998-Jun 6467147 1998-06-01
## 7 739 1998-Jul 8914755 1998-07-01
## 8 740 1998-Aug 8607428 1998-08-01
## 9 741 1998-Sep 6989888 1998-09-01
## 10 742 1998-Oct 6345620 1998-10-01
## 11 743 1998-Nov 4640410 1998-11-01
## 12 744 1998-Dec 4693479 1998-12-01
## 13 745 1999-Jan 7183759 1999-01-01
## 14 746 1999-Feb 5759262 1999-02-01
## 15 747 1999-Mar 4847656 1999-03-01
## 16 748 1999-Apr 5306592 1999-04-01
## 17 749 1999-May 4426794 1999-05-01
## 18 750 1999-Jun 5500901 1999-06-01
## 19 751 1999-Jul 7444416 1999-07-01
## 20 752 1999-Aug 7564391 1999-08-01
power_df %>% ggplot(aes(x = Date, y = KWH)) + geom_line() +
labs(title = "Residential power usage",
x = "Date",
y = "Kilowatt per hour")
From the summary output and the time series plot, there is a gap in the data between 2008 and 2010, and there also appears to be an outlier. We can find the date with the missing value below.
# Identify observation with missing value
# Check NA values
na_rows <- power_df[rowSums(is.na(power_df)) > 0,]
na_rows$Date
## [1] "2008-09-01"
One strategy to fill the missing value is to take an average of the data points from the same month, in this case September. The time series appears to have minimal trend toward the end of the series, and the missing point falls within the consistent seasonal pattern of the data, where values are similar for the same month in different years.
# Impute the missing September value using a moving average over the September observations
rolling_avg <- power_df %>% filter(month(Date) == 9) %>% na_ma(weighting = "simple") %>% filter(Date == "2008-09-01") %>% pull(KWH)
power_df[power_df$Date == "2008-09-01", "KWH"] = rolling_avg
# Re-check for missing values
any(is.na(power_df))
## [1] FALSE
power_df %>% ggplot(aes(y = KWH)) + geom_boxplot() + labs(title = "Distribution of Power Usage 1998-2013")
# View the value of the outlier
power_df %>% filter(Date > "2010-01-01")
## # A tibble: 47 × 4
## CaseSeq YearMonth KWH Date
## <dbl> <chr> <dbl> <date>
## 1 878 2010-Feb 8390677 2010-02-01
## 2 879 2010-Mar 7347915 2010-03-01
## 3 880 2010-Apr 5776131 2010-04-01
## 4 881 2010-May 4919289 2010-05-01
## 5 882 2010-Jun 6696292 2010-06-01
## 6 883 2010-Jul 770523 2010-07-01
## 7 884 2010-Aug 7922701 2010-08-01
## 8 885 2010-Sep 7819472 2010-09-01
## 9 886 2010-Oct 5875917 2010-10-01
## 10 887 2010-Nov 4800733 2010-11-01
## # ℹ 37 more rows
The outlier fell during July 2010. If we assume that this data is from somewhere in the US, where hotter months tend to consume a lot of energy to power AC units, then replacing the value may be a good strategy. Viewing the power usage around July in other years, consumption in these months is high. To replace the value, we can try imputing the average power consumption from the months on either side of July (June, August).
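As a quick cross-check (an illustrative aside), the simple average of the adjacent June 2010 and August 2010 readings can be computed directly and compared with the rolling-average imputation applied below:

# Mean of the June 2010 and August 2010 readings, for comparison
power_df %>%
  filter(Date %in% as.Date(c("2010-06-01", "2010-08-01"))) %>%
  summarise(june_aug_mean_kwh = mean(KWH))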
# Impute an NA to that date and re-code values using a rolling statistic
power_df2 <- power_df
power_df2[power_df2$Date == "2010-07-01", "KWH"] = NA
# Fill na with rolling stat
power_df2 <- power_df2 %>% na_ma(k = 2, weighting = "simple")
# Recheck and compare summary stats
print("BEFORE OUTLIER REPLACED")
## [1] "BEFORE OUTLIER REPLACED"
summary(power_df)
## CaseSeq YearMonth KWH Date
## Min. :733.0 Length:192 Min. : 770523 Min. :1998-01-01
## 1st Qu.:780.8 Class :character 1st Qu.: 5434539 1st Qu.:2001-12-24
## Median :828.5 Mode :character Median : 6314472 Median :2005-12-16
## Mean :828.5 Mean : 6508071 Mean :2005-12-15
## 3rd Qu.:876.2 3rd Qu.: 7608792 3rd Qu.:2009-12-08
## Max. :924.0 Max. :10655730 Max. :2013-12-01
print("AFTER OUTLIER REPLACED")
## [1] "AFTER OUTLIER REPLACED"
summary(power_df2)
## CaseSeq YearMonth KWH Date
## Min. :733.0 Length:192 Min. : 4313019 Min. :1998-01-01
## 1st Qu.:780.8 Class :character 1st Qu.: 5443502 1st Qu.:2001-12-24
## Median :828.5 Mode :character Median : 6351262 Median :2005-12-16
## Mean :828.5 Mean : 6539680 Mean :2005-12-15
## 3rd Qu.:876.2 3rd Qu.: 7608792 3rd Qu.:2009-12-08
## Max. :924.0 Max. :10655730 Max. :2013-12-01
# Outlier value
power_df2[power_df2$Date == "2010-07-01", "KWH"]
## # A tibble: 1 × 1
## KWH
## <dbl>
## 1 6839438.
Comparing the mean and median of the target variable KWH after replacing the outlier to before, the mean remained quite stable (\(meanBefore = 6508071\) vs. \(meanAfter = 6539680\)) and the median increased slightly (\(medianBefore = 6314472\) vs. \(medianAfter = 6351262\)). Since the underlying distribution was not seriously affected by replacing the outlier, this final data set was used for fitting and forecasting future power consumption.
# Replace the hyphen with a blank space
power_df2$YearMonth <- str_replace(power_df$YearMonth, "-", " ")
# Convert the tibble to tsibble
power_ts <- power_df2 %>% select(CaseSeq, YearMonth, KWH) %>% mutate(YearMonth = yearmonth(YearMonth)) %>% as_tsibble(index = YearMonth)
head(power_ts)
## # A tsibble: 6 x 3 [1M]
## CaseSeq YearMonth KWH
## <dbl> <mth> <dbl>
## 1 733 1998 Jan 6862583
## 2 734 1998 Feb 5838198
## 3 735 1998 Mar 5420658
## 4 736 1998 Apr 5010364
## 5 737 1998 May 4665377
## 6 738 1998 Jun 6467147
# Time series plot
power_ts %>% autoplot(KWH) + labs(title = "Power Consumption")
# Season
power_ts %>% gg_season(KWH) + labs(title = "Yearly Trends in Power Usage")
# Lag plots
power_ts %>% gg_lag(KWH, lags = 1:12, geom = "point")
# ACF plot
power_ts %>% ACF(KWH, lag_max = 12) %>% autoplot() + labs(title = "ACF Power Consumption")
The season plot and ACF plot both indicate strong monthly seasonal patterns. The seasonality is consistent across years, with higher usage during the winter and summer months, and these periods are positively correlated with each other (positive correlations at lags 5, 6, and 7). In the time series plot there appears to be a slight upward trend toward the end of the series. Decomposing the data may help visualize this more clearly.
The X-11 decomposition is based on classical decomposition, but includes many extra steps and features to overcome its drawbacks. In particular, trend-cycle estimates are available for all observations including the end points, and the seasonal component is allowed to vary slowly over time.
#Create the decomposition
x11_dcmp <- power_ts |>
model(x11 = X_13ARIMA_SEATS(KWH ~ x11())) |>
components()
# Plot the decomposition
autoplot(x11_dcmp) +
labs(title =
"Decomposition of Power Usage using X-11.")
# Conduct KPSS test
power_ts %>% features(KWH, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 1.64 0.01
# Conduct nsdiff test
power_ts %>% features(KWH, unitroot_nsdiffs)
## # A tibble: 1 × 1
## nsdiffs
## <int>
## 1 1
The initial approach for this data was to use a seasonal naive (SNAIVE) model.
# Fit model
fit <- power_ts %>% model(SNAIVE = SNAIVE(KWH))
# Check accuracy
fit %>% accuracy()
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE Training 94245. 829452. 618081. 0.679 9.19 1 1 0.301
# Forecast and plot
fc_snaive <- fit %>% forecast(h = "1 year")
fc_snaive %>% autoplot(power_ts) + labs(title = "Seasonal Naive Forecast")
# Check the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Conduct a Ljung-box test
fit %>% augment() %>% features(.innov, ljung_box, lag = 24)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE 98.6 5.18e-11
While the simple SNAIVE model did a decent job of capturing the seasonal pattern in the data, the residuals did not resemble white noise, so there may be information not accounted for. More complex models are explored next.
fit <- power_ts %>% model(ARIMA(KWH, stepwise = FALSE, approx = FALSE))
fit %>% report()
## Series: KWH
## Model: ARIMA(0,0,3)(2,1,0)[12] w/ drift
##
## Coefficients:
## ma1 ma2 ma3 sar1 sar2 constant
## 0.3420 0.0403 0.2116 -0.7298 -0.434 233616.66
## s.e. 0.0785 0.0896 0.0725 0.0767 0.077 76173.02
##
## sigma^2 estimated as 3.855e+11: log likelihood=-2657.82
## AIC=5329.65 AICc=5330.3 BIC=5352
# Check the residuals
fit %>% gg_tsresiduals()
# Conduct a Ljung-box test
fit %>% augment() %>% features(.innov, ljung_box, lag = 24)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(KWH, stepwise = FALSE, approx = FALSE) 14.9 0.925
The ARIMA model that was selected was an \(ARIMA(0,0,3)(2,1,0)_{12}\) with drift; from the residual plots and the Ljung-Box test we can conclude that the residuals are white noise. The model was used to forecast the next year (2014).
fc <- fit %>% forecast(h = "1 year")
fc %>%
autoplot(power_ts) +
labs(title="Power Consumption Forecasts ",
y="USD")
The ARIMA model captured the seasonal pattern as well as the linear trend beginning to appear at the end of the series.
# Save forecasts in new object
power_forecast <- fc[,c(1,2,4)]
power_forecast$DATA <- "Power_usage"
Part C consists of two data sets: Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx.
These are simple 2-column sets, however they have different time stamps.
Your optional assignment is to time-base sequence the data and aggregate based on hour (an example of what this looks like follows). Note: for multiple recordings within an hour, take the mean.
Then determine whether the data is stationary and whether it can be forecast.
If so, provide a week-forward forecast and present the results via RPubs and the .rmd file.
Submit the forecast in an Excel readable file.
pipe1 <- read_excel("/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_624/Project1/Waterflow_Pipe1.xlsx",
col_types = c("date","numeric"))
head(pipe1)
## # A tibble: 6 × 2
## `Date Time` WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 00:24:06 23.4
## 2 2015-10-23 00:40:02 28.0
## 3 2015-10-23 00:53:51 23.1
## 4 2015-10-23 00:55:40 30.0
## 5 2015-10-23 01:19:17 6.00
## 6 2015-10-23 01:23:58 15.9
pipe2 <- read_excel("/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_624/Project1/Waterflow_Pipe2.xlsx", col_types = c("date","numeric"))
head(pipe2)
## # A tibble: 6 × 2
## `Date Time` WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 01:00:00 18.8
## 2 2015-10-23 02:00:00 43.1
## 3 2015-10-23 03:00:00 38.0
## 4 2015-10-23 04:00:00 36.1
## 5 2015-10-23 05:00:00 31.9
## 6 2015-10-23 06:00:00 28.2
# combine the rows of both pipe data
pipe_combined <- rbind(pipe1, pipe2)
# Change column names
colnames(pipe_combined) <- c("DateTime", "WaterFlow")
# Aggregate the data - taking the mean water flow at each hour
pipe_agg <- pipe_combined %>% group_by(date(DateTime), hour(DateTime)) %>% summarise(Avg_WaterFlow = mean(WaterFlow)) %>% ungroup()
## `summarise()` has grouped output by 'date(DateTime)'. You can override using
## the `.groups` argument.
# Convert the Date and Hour to a datetime object
pipe_agg$Date_Time <- as_datetime(paste0(pipe_agg$`date(DateTime)`, pipe_agg$`hour(DateTime)`), format="%Y-%m-%d %H")
# Select columns
pipe_agg <- pipe_agg %>% select(Date_Time, Avg_WaterFlow)
# View first few rows
head(pipe_agg)
## # A tibble: 6 × 2
## Date_Time Avg_WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 00:00:00 26.1
## 2 2015-10-23 01:00:00 18.8
## 3 2015-10-23 02:00:00 24.5
## 4 2015-10-23 03:00:00 25.6
## 5 2015-10-23 04:00:00 22.4
## 6 2015-10-23 05:00:00 23.6
summary(pipe_agg)
## Date_Time Avg_WaterFlow
## Min. :2015-10-23 00:00:00 Min. : 1.885
## 1st Qu.:2015-11-02 10:00:00 1st Qu.:23.837
## Median :2015-11-12 20:00:00 Median :33.527
## Mean :2015-11-12 20:00:00 Mean :35.980
## 3rd Qu.:2015-11-23 06:00:00 3rd Qu.:47.127
## Max. :2015-12-03 16:00:00 Max. :77.388
# Check dates with < 24 hours
pipe_agg %>% group_by(date(pipe_agg$Date_Time)) %>% summarise(n = n()) %>% filter(n < 24)
## # A tibble: 1 × 2
## `date(pipe_agg$Date_Time)` n
## <date> <int>
## 1 2015-12-03 17
pipe_agg
## # A tibble: 1,001 × 2
## Date_Time Avg_WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 00:00:00 26.1
## 2 2015-10-23 01:00:00 18.8
## 3 2015-10-23 02:00:00 24.5
## 4 2015-10-23 03:00:00 25.6
## 5 2015-10-23 04:00:00 22.4
## 6 2015-10-23 05:00:00 23.6
## 7 2015-10-23 06:00:00 21.4
## 8 2015-10-23 07:00:00 16.2
## 9 2015-10-23 08:00:00 22.3
## 10 2015-10-23 09:00:00 29.9
## # ℹ 991 more rows
The last day of the series contains hourly data for 17 hours.
# Create a tsibble
pipe_ts <- as_tsibble(pipe_agg, index = Date_Time)
# View first few rows
head(pipe_ts)
## # A tsibble: 6 x 2 [1h] <UTC>
## Date_Time Avg_WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 00:00:00 26.1
## 2 2015-10-23 01:00:00 18.8
## 3 2015-10-23 02:00:00 24.5
## 4 2015-10-23 03:00:00 25.6
## 5 2015-10-23 04:00:00 22.4
## 6 2015-10-23 05:00:00 23.6
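Before plotting and modeling, it is worth confirming that the hourly index has no implicit gaps; a quick check with tsibble's gap helpers (not part of the original workflow) is sketched below.

# Check for implicit gaps in the hourly index
pipe_ts %>% has_gaps()
pipe_ts %>% count_gaps()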
# Plot the time series
pipe_ts %>% autoplot(Avg_WaterFlow) + labs(title = "WaterFlow Time Series")
# Conduct a KPSS test
pipe_ts %>% features(Avg_WaterFlow, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 4.06 0.01
Check any necessary differencing and resultant stationarity
# Plot the differenced time series
pipe_ts %>% autoplot(difference(Avg_WaterFlow)) + labs(title = "WaterFlow Time Series")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
# Re check KPSS
pipe_ts %>% features(difference(Avg_WaterFlow), unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.00752 0.1
When applying a first order difference, the data achieves stationarity and the KPSS test no longer rejects the null hypothesis (p-value of 0.1).
# Decompose data using STL
stl_decomp_c <- pipe_ts %>% model(STL(Avg_WaterFlow)) %>% components()
# Plot the STL decomposition
stl_decomp_c %>% autoplot() + labs(title = "STL")
pipe_ts %>% gg_lag(Avg_WaterFlow, lags = 1:24, geom = "point")
pipe_ts %>% ACF(Avg_WaterFlow, lag_max = 24) %>% autoplot()
The autocorrelation plot up to the 24th lag showed low or very low positive correlations, demonstrating minimal to no seasonality in the hourly data.
# fit model
fit <- pipe_ts %>% model(ARIMA_auto = ARIMA(Avg_WaterFlow, stepwise = FALSE, approx = FALSE),
ETS_auto = ETS(Avg_WaterFlow))
fit %>% select(ARIMA_auto) %>% report()
## Series: Avg_WaterFlow
## Model: ARIMA(0,1,1)(1,0,0)[24]
##
## Coefficients:
## ma1 sar1
## -0.9649 0.0783
## s.e. 0.0083 0.0320
##
## sigma^2 estimated as 211.8: log likelihood=-4097.13
## AIC=8200.25 AICc=8200.28 BIC=8214.98
fit %>% select(ETS_auto) %>% report()
## Series: Avg_WaterFlow
## Model: ETS(M,N,N)
## Smoothing parameters:
## alpha = 0.03720267
##
## Initial states:
## l[0]
## 20.63633
##
## sigma^2: 0.1579
##
## AIC AICc BIC
## 12168.50 12168.53 12183.23
# Calculate accuracy
fit %>% accuracy()
## # A tibble: 2 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA_auto Training 0.526 14.5 11.1 -26.3 47.8 0.735 0.739 -0.0231
## 2 ETS_auto Training 0.630 14.6 11.2 -26.0 47.9 0.738 0.742 -0.0226
The accuracy metrics are similar for the two models when comparing RMSE, with the ARIMA model performing slightly better.
fit %>% select(ARIMA_auto) %>% gg_tsresiduals()
# Conduct a ljung_box test to determine if residuals are white noise
fit %>% augment() %>% features(.innov, ljung_box, lag = 38)
## # A tibble: 2 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA_auto 58.1 0.0193
## 2 ETS_auto 56.7 0.0259
library(ggpubr)
arima_fc <- fit %>% select(ARIMA_auto) %>% forecast(h = "1 week")
ets_fc <- fit %>% select(ETS_auto) %>% forecast(h = "1 week")
ggarrange(ets_fc %>% autoplot(pipe_ts) + labs(title = "ETS(M,N,N)"),
arima_fc %>% autoplot(pipe_ts, color = "red") + labs(title = "ARIMA(0,1,1)(1,0,0)[24]"))
According to the Ljung-Box statistics and p-values, neither model’s residuals resemble white noise, but forecasts were still carried out. We can see that neither model captured the patterns in the data: the prediction intervals for both forecasts are very wide, similar to the range of values within the data set. The data seems to have complex patterns, and other modeling techniques need to be considered.
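One possible direction (a sketch only, not evaluated here) is a dynamic harmonic regression: an ARIMA model with Fourier terms to capture any daily periodicity in the hourly data. The period of 24 and K = 3 below are assumptions and would need tuning.

# Sketch of a dynamic harmonic regression: ARIMA errors plus Fourier terms
# for a 24-hour period (K chosen arbitrarily; not tuned or evaluated here)
fit_dhr <- pipe_ts %>%
  model(DHR = ARIMA(Avg_WaterFlow ~ fourier(period = 24, K = 3) + PDQ(0, 0, 0)))
fit_dhr %>% forecast(h = "1 week") %>% autoplot(pipe_ts)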
partc_df <- bind_rows(arima_fc, ets_fc)
partc_df$DATA <- "Water_flow"
partc_df <- partc_df[,c(1,2,4,5)]
Combine data from parts A, B, and C, then save to an Excel readable file.
# Standardize column names across the ATM4, power usage, and water flow forecasts
colnames(atm4_forecasts) <- c(".model", "DATE", ".mean", "DATA")
colnames(power_forecast) <- c(".model", "DATE", ".mean", "DATA")
colnames(partc_df) <- c(".model", "DATE", ".mean", "DATA")
# combine the tsibble forecasts
all_forecasts <- rbind(as_tibble(atm1_forecasts),
as_tibble(atm2_forecasts),
as_tibble(atm3_forecasts),
as_tibble(atm4_forecasts),
as_tibble(power_forecast),
as_tibble(partc_df))
write_excel_csv(all_forecasts, "/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_624/all_forecasts.csv")
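Note that write_excel_csv() writes a CSV file (which Excel can read); if a true .xlsx workbook is required, the writexl package (an assumption, not loaded above) can be used instead.

# Optional alternative: write a true .xlsx workbook (requires writexl)
if (requireNamespace("writexl", quietly = TRUE)) {
  writexl::write_xlsx(all_forecasts, "/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_624/all_forecasts.xlsx")
}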