library(tsibble)
## 
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(tsibbledata)
library(ggfortify)
## Loading required package: ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(USgas)
library(readr)
library(tidyr)
library(readxl)
library(httr)
library(feasts)
## Loading required package: fabletools
library(stats)
library(fpp3)
## -- Attaching packages -------------------------------------------- fpp3 0.4.0 --
## v tibble    3.1.6     v fable     0.3.1
## v lubridate 1.8.0
## -- Conflicts ------------------------------------------------- fpp3_conflicts --
## x lubridate::date()     masks base::date()
## x dplyr::filter()       masks stats::filter()
## x tsibble::intersect()  masks base::intersect()
## x lubridate::interval() masks tsibble::interval()
## x dplyr::lag()          masks stats::lag()
## x tsibble::setdiff()    masks base::setdiff()
## x tsibble::union()      masks base::union()
library(seasonal)
## 
## Attaching package: 'seasonal'
## The following object is masked from 'package:tibble':
## 
##     view
pacman::p_load(tidyverse,janitor,DataExplorer,knitr,arsenal,kableExtra,car,geoR,caret,
               psych,gridExtra,DMwR2,lmtest,pscl,MKmisc,ROCR,survey,stats,rstatix,Rcpp,
               corrplot,forecast,cowplot)

library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(lubridate)
library(tseries)
library(zoo)

library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.1.3

ATM Forecast

Initial Summary

The first steps of this analysis are to import and clean the data. Given that this is an XSLX data set the date values will not import correctly automatically. Using lubridate and the as.date function I converted this column into its appropriate format. I then ran initial summary visualizations to understand the missing values, correlations, outliers, and structure of the data. The amount of data that is missing in this time series is minimal so I can drop these values with minimal impact to the overall model.

atm_data$DATE = ymd(as.Date(atm_data$DATE, origin = "1899-12-30"))

atm_ts = as_tsibble(tibble(atm_data),index='DATE',key='ATM')
plot_str(atm_ts)
plot_density(atm_ts)

plot_boxplot(atm_ts,by="ATM")
## Warning: Removed 19 rows containing non-finite values (stat_boxplot).

plot_missing(atm_ts)

plot_correlation(atm_ts)
## 1 features with more than 20 categories ignored!
## DATE: 379 categories
## Warning: Removed 10 rows containing missing values (geom_text).

plot_histogram(atm_ts)

Model Preparation

Analyzing the cash time series trends we can see that there are no associated seasonal trend components for the various ATMs. ATM 1 and 2 show cyclality/trend components but ATM 3 shows a spike in usage after no usage through most of the lifecycle. ATM 4 is a white noise data set according to the ACF and forecasts will most likely not be effective.

The data set will benefit from a Box-Cox transformation, a replacement of the outlier, and a reexamination of the respective ACFs.

Post transformation the data sets normalized. Despite the transformation the ACFs indicated that the ATM cash withdrawals occurred in a non-predictable trend (white-noise). ATM 3 withdrawals have such a short time horizon with a short spike in usage that the data set cannot be transformed; the ACF was unchanged.

atm4_out = atm_ts %>% filter(ATM=='ATM4') %>% select(Cash)
atm3_out = atm_ts %>% filter(ATM=='ATM3') %>% select(Cash)
atm2_out = atm_ts %>% filter(ATM=='ATM2') %>% select(Cash)
atm1_out = atm_ts %>% filter(ATM=='ATM1') %>% select(Cash)

atm4_out$Cash[tsoutliers(atm4_out$Cash)$index] = tsoutliers(atm4_out$Cash)$replacements

clean_atm_ts = atm_ts %>% drop_na()
plot_missing(clean_atm_ts)

clean_atm_ts %>% filter(ATM=='ATM1') %>% autoplot(.vars=Cash)

clean_atm_ts %>% filter(ATM=='ATM2') %>% autoplot(.vars=Cash)

clean_atm_ts %>% filter(ATM=='ATM3') %>% autoplot(.vars=Cash)

clean_atm_ts %>% filter(ATM=='ATM4') %>% autoplot(.vars=Cash)

atm_ts %>% filter(ATM=='ATM1') %>% ACF() %>% autoplot(.vars=Cash)
## Response variable not specified, automatically selected `var = Cash`

atm_ts %>% filter(ATM=='ATM2') %>% ACF() %>% autoplot(.vars=Cash)
## Response variable not specified, automatically selected `var = Cash`

atm_ts %>% filter(ATM=='ATM3') %>% ACF() %>% autoplot(.vars=Cash)
## Response variable not specified, automatically selected `var = Cash`

atm_ts %>% filter(ATM=='ATM4') %>% ACF() %>% autoplot(.vars=Cash)
## Response variable not specified, automatically selected `var = Cash`

atm4_lambda = atm4_out %>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)

atm4_trans = atm4_out %>% ACF(difference(box_cox(Cash,atm4_lambda),31)) %>% autoplot()

atm4_trans_data = atm4_out %>% mutate(Cash = difference(box_cox(Cash,atm4_lambda),31))

atm3_lambda = atm3_out %>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)
## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value

## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value
atm3_trans = atm3_out %>% ACF(difference(box_cox(Cash,atm3_lambda),31)) %>% autoplot()

atm3_trans_data = atm3_out %>% mutate(Cash = difference(box_cox(Cash,atm3_lambda),31))



atm2_lambda = atm2_out %>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)

atm2_trans = atm2_out %>% ACF(difference(box_cox(Cash,atm2_lambda),31)) %>% autoplot()

atm2_trans_data = atm2_out %>% mutate(Cash = difference(box_cox(Cash,atm2_lambda),31))

atm1_lambda = atm1_out %>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)

atm1_trans = atm1_out %>% ACF(difference(box_cox(Cash,atm1_lambda),31)) %>% autoplot()

atm1_trans_data = atm1_out %>% mutate(Cash = difference(box_cox(Cash,atm1_lambda),31))

atm1_trans

atm2_trans

atm3_trans

atm4_trans

t1 = atm1_trans_data %>% drop_na() 
t2 = atm2_trans_data %>% drop_na()
t3 = atm3_trans_data %>% drop_na()
t4 = atm4_trans_data %>% drop_na()


adf.test(t1$Cash)
## Warning in adf.test(t1$Cash): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  t1$Cash
## Dickey-Fuller = -5.6217, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(t2$Cash)
## Warning in adf.test(t2$Cash): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  t2$Cash
## Dickey-Fuller = -5.877, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(t3$Cash)
## Warning in adf.test(t3$Cash): p-value greater than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  t3$Cash
## Dickey-Fuller = -0.28704, Lag order = 6, p-value = 0.99
## alternative hypothesis: stationary
adf.test(t4$Cash)
## Warning in adf.test(t4$Cash): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  t4$Cash
## Dickey-Fuller = -5.1504, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
t1 %>% autoplot()
## Plot variable not specified, automatically selected `.vars = Cash`

t2 %>% autoplot()
## Plot variable not specified, automatically selected `.vars = Cash`

t3 %>% autoplot()
## Plot variable not specified, automatically selected `.vars = Cash`

t4 %>% autoplot()
## Plot variable not specified, automatically selected `.vars = Cash`

Model Creation

After analyzing the summary statistics and visualizations the best model selection would be to use an auto ARIMA with the search configuration. Given the shape of the data and ETS model would not be appropriate. Financial data is difficult to predict especially in a situation where ATM popularity is also extrapolating the complexity of the data. An auto selection ARIMA would be best because the regressor is the data itself and any additional configurations are handled as best fit to the data.

atmfit_arima_seach1 = atm1_trans_data %>% model(search = ARIMA(Cash,stepwise=FALSE))

atmfit_arima_seach2 = atm2_trans_data %>% model(search = ARIMA(Cash,stepwise=FALSE))

atmfit_arima_seach3 = atm3_trans_data %>% model(search = ARIMA(Cash,stepwise=FALSE))

atmfit_arima_seach4 = atm4_trans_data %>% model(search = ARIMA(Cash,stepwise=FALSE))
report(atmfit_arima_seach1)
## Series: Cash 
## Model: ARIMA(0,0,0)(2,1,0)[7] 
## 
## Coefficients:
##          sar1     sar2
##       -0.5779  -0.2335
## s.e.   0.0557   0.0559
## 
## sigma^2 estimated as 5.274:  log likelihood=-741.05
## AIC=1488.1   AICc=1488.17   BIC=1499.74
report(atmfit_arima_seach2)
## Series: Cash 
## Model: ARIMA(5,0,0)(1,1,0)[7] 
## 
## Coefficients:
##          ar1      ar2      ar3     ar4      ar5     sar1
##       0.0337  -0.1288  -0.0543  0.0237  -0.1541  -0.4574
## s.e.  0.0551   0.0555   0.0555  0.0552   0.0554   0.0511
## 
## sigma^2 estimated as 91.7:  log likelihood=-1203.69
## AIC=2421.37   AICc=2421.69   BIC=2448.54
report(atmfit_arima_seach3)
## Series: Cash 
## Model: ARIMA(2,0,0) w/ mean 
## 
## Coefficients:
##          ar1     ar2  constant
##       0.7748  0.1798    1.8154
## s.e.  0.0541  0.0616    2.2763
## 
## sigma^2 estimated as 1323:  log likelihood=-1688.87
## AIC=3385.73   AICc=3385.85   BIC=3401.33
report(atmfit_arima_seach4)
## Series: Cash 
## Model: ARIMA(0,0,0)(2,0,0)[7] 
## 
## Coefficients:
##         sar1    sar2
##       0.2107  0.2306
## s.e.  0.0538  0.0545
## 
## sigma^2 estimated as 197.6:  log likelihood=-1371.3
## AIC=2748.59   AICc=2748.66   BIC=2760.29
glance(atmfit_arima_seach1)
## # A tibble: 1 x 8
##   .model sigma2 log_lik   AIC  AICc   BIC ar_roots   ma_roots 
##   <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <list>     <list>   
## 1 search   5.27   -741. 1488. 1488. 1500. <cpl [14]> <cpl [0]>
glance(atmfit_arima_seach2)
## # A tibble: 1 x 8
##   .model sigma2 log_lik   AIC  AICc   BIC ar_roots   ma_roots 
##   <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <list>     <list>   
## 1 search   91.7  -1204. 2421. 2422. 2449. <cpl [12]> <cpl [0]>
glance(atmfit_arima_seach3)
## # A tibble: 1 x 8
##   .model sigma2 log_lik   AIC  AICc   BIC ar_roots  ma_roots 
##   <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <list>    <list>   
## 1 search  1323.  -1689. 3386. 3386. 3401. <cpl [2]> <cpl [0]>
glance(atmfit_arima_seach4)
## # A tibble: 1 x 8
##   .model sigma2 log_lik   AIC  AICc   BIC ar_roots   ma_roots 
##   <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <list>     <list>   
## 1 search   198.  -1371. 2749. 2749. 2760. <cpl [14]> <cpl [0]>
atmfit_arima_seach1 %>% gg_tsresiduals()
## Warning: Removed 31 row(s) containing missing values (geom_path).
## Warning: Removed 37 rows containing missing values (geom_point).
## Warning: Removed 37 rows containing non-finite values (stat_bin).

atmfit_arima_seach1 %>% forecast(h=31) %>% autoplot()

atmfit_arima_seach2 %>% gg_tsresiduals()
## Warning: Removed 31 row(s) containing missing values (geom_path).
## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing non-finite values (stat_bin).

atmfit_arima_seach2 %>% forecast(h=31) %>% autoplot()

atmfit_arima_seach3 %>% gg_tsresiduals()
## Warning: Removed 31 row(s) containing missing values (geom_path).
## Warning: Removed 31 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing non-finite values (stat_bin).

atmfit_arima_seach3 %>% forecast(h=31) %>% autoplot()

atmfit_arima_seach4 %>% gg_tsresiduals()
## Warning: Removed 31 row(s) containing missing values (geom_path).
## Warning: Removed 31 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing non-finite values (stat_bin).

atmfit_arima_seach4 %>% forecast(h=31) %>% autoplot()

ar1_f = atmfit_arima_seach1 %>% forecast(h=31)
ar2_f = atmfit_arima_seach2 %>% forecast(h=31)
ar3_f = atmfit_arima_seach3 %>% forecast(h=31)
ar4_f = atmfit_arima_seach4 %>% forecast(h=31)


ar1_f$Label = 'ATM1'
ar2_f$Label = 'ATM2'
ar3_f$Label = 'ATM3'
ar4_f$Label = 'ATM4'
master_atm_df = rbind(as.data.frame(ar1_f[c('DATE','.mean','Label')]),as.data.frame(ar2_f[c('DATE','.mean','Label')]),as.data.frame(ar3_f[c('DATE','.mean','Label')]),as.data.frame(ar4_f[c('DATE','.mean','Label')]))

Power Forecast

Data Summary

Based on the below summary analytics the amount of missing data is less than ~2%. This will not impact the overall analysis and they can even be dropped from the data set. Based on the density chart the data appears to bi-modal. The ACF is indicative of a non white noise data set. Variances are constant showing a non-stationary data set.

The overall trend of the data indicates it is not season and that there is a cyclical trend involved here. A transformation seems necessary to create a stationary set. Given the lack of available predictors and large historical data set available an ARIMA model seems most appropriate.

colnames(power_data) = c('CS','DATE','KWH')
power_data = power_data %>% select(DATE,KWH)
power_data$DATE = yearmonth(as.Date(as.yearmon(power_data$DATE, "%Y-%b")))
power_data$KWH = as.numeric(power_data$KWH)

power_ts = as_tsibble(tibble(power_data),index='DATE')
plot_str(power_ts)
plot_density(power_ts)

plot_missing(power_ts)

plot_histogram(power_ts)

power_ts %>% ACF() %>% autoplot()
## Response variable not specified, automatically selected `var = KWH`

power_ts %>% autoplot(.vars=KWH)

atm4_out$Cash[tsoutliers(atm4_out$Cash)$index] = tsoutliers(atm4_out$Cash)$replacements


power_ts$KWH[tsoutliers(ts(power_ts$KWH, frequency=12))$index] = tsoutliers(ts(power_ts$KWH, frequency=12))$replacements

Model Preparation

The below transformation converted the data set to a stationary time series. The ACF also indicates that the data is still not a white noise making this modelable. The transformed data is also normally distributed now making this perfect for an ARIMA model.

power_lambda = power_ts %>%
  features(KWH, features = guerrero) %>%
  pull(lambda_guerrero)

power_trans = power_ts %>% ACF(difference(box_cox(KWH,power_lambda),12)) %>% autoplot()

power_trans_data = power_ts %>% mutate(KWH = difference(box_cox(KWH,power_lambda),12))

power_trans

power_trans_data %>% autoplot()
## Plot variable not specified, automatically selected `.vars = KWH`
## Warning: Removed 12 row(s) containing missing values (geom_path).

plot_density(power_trans_data)

t1_p = power_trans_data %>% drop_na()

adf.test(t1_p$KWH)
## Warning in adf.test(t1_p$KWH): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  t1_p$KWH
## Dickey-Fuller = -5.0936, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary

Model Creation

Creating the ARIMA model and the below diagnostics indicates that the model is most robust when self-selected. The below model is optimal and the residuals are normally distributed.

power_arima_search = power_trans_data %>% model(search = ARIMA(KWH,stepwise=FALSE))
report(power_arima_search)
## Series: KWH 
## Model: ARIMA(4,0,0)(2,0,0)[12] w/ mean 
## 
## Coefficients:
##          ar1      ar2     ar3      ar4     sar1     sar2  constant
##       0.2556  -0.0062  0.2442  -0.1481  -0.7166  -0.4151    0.0023
## s.e.  0.0750   0.0750  0.0742   0.0742   0.0739   0.0747    0.0008
## 
## sigma^2 estimated as 9.298e-05:  log likelihood=565.58
## AIC=-1115.16   AICc=-1114.38   BIC=-1089.1
glance(power_arima_search)
## # A tibble: 1 x 8
##   .model    sigma2 log_lik    AIC   AICc    BIC ar_roots   ma_roots 
##   <chr>      <dbl>   <dbl>  <dbl>  <dbl>  <dbl> <list>     <list>   
## 1 search 0.0000930    566. -1115. -1114. -1089. <cpl [28]> <cpl [0]>
power_arima_search %>% gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 14 rows containing missing values (geom_point).
## Warning: Removed 14 rows containing non-finite values (stat_bin).

power_arima_search %>% forecast(h=12) %>% autoplot()

pwr_ar1_f = power_arima_search %>% forecast(h=12)
master_pwer_df = pwr_ar1_f[c('DATE','.mean')]