DATA624 Project 1

Author

Banu

Published

October 22, 2024

Code

#Load Packages
library(arrow)
library(dplyr)
library(ggplot2)
library(reshape2)
library(tidymodels)
library(tidyr)
library(broom)
library(patchwork)
library(lubridate)
library(rpart.plot)
library(kableExtra)
library(DataExplorer)
library(skimr)
#install.packages("ranger")
#install.packages("openxlsx")
library(DescTools)
library(corrplot)
library(ggcorrplot)
library(caret)
library(rpart)
library(car)
library(rattle)
library(ROSE)
library(mice)
library(MASS)
library(fable)
library(imputeTS)
library(psych)
library(forecast)
library(openxlsx)
library(plotly)
library(naniar)
library(cowplot)
library(fpp3)

Project

Deliverable

This project consists of 3 parts - two required and one bonus.

Part A – ATM Forecast, ATM624Data.xlsx

Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

Part C – BONUS, optional (part or all), Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx

PART A Forecasting Analysis

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.

Data Exploratory Analysis

Code

library(readxl)
# inputf1 <- read.csv("C:/Users/Banu/Documents/RScriptfiles/DATA 624/PROJECT 1/ATM624Data.csv",stringsAsFactors = F,header=TRUE)

inputf1 <- readxl::read_excel("C:/Users/Banu/Documents/RScriptfiles/DATA 624/PROJECT 1/ATM624Data.xlsx")


#Glimpse variables
introduce(inputf1)

# A tibble: 1 × 9
   rows columns discrete_columns continuous_columns all_missing_columns
  <int>   <int>            <int>              <int>               <int>
1  1474       3                1                  2                   0
# ℹ 4 more variables: total_missing_values <int>, complete_rows <int>,
#   total_observations <int>, memory_usage <dbl>

Code

plot_intro(inputf1)

Code

glimpse(inputf1)

Rows: 1,474
Columns: 3
$ DATE <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
$ ATM  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
$ Cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …

Code

inputf1$DATE <- as.Date(as.numeric(inputf1$DATE),origin = "1899-12-30")
#sorted by date
inputf1 <- inputf1[order(inputf1$DATE),]
inputf1$ATM <- as.factor(inputf1$ATM)


#display
knitr::kable(inputf1[1:10, 1:3], 
             caption = 'ATM data',
             format="html", 
             table.attr="style='width:50%;'") %>% 
  kableExtra::kable_styling()

ATM data
DATE	ATM	Cash
2009-05-01	ATM1	96.0000
2009-05-01	ATM2	107.0000
2009-05-01	ATM3	0.0000
2009-05-01	ATM4	776.9934
2009-05-02	ATM1	82.0000
2009-05-02	ATM2	89.0000
2009-05-02	ATM3	0.0000
2009-05-02	ATM4	524.4180
2009-05-03	ATM1	85.0000
2009-05-03	ATM2	90.0000

Code

knitr::kable(skim(inputf1), 
             format="html") %>% 
  kableExtra::kable_styling()

skim_type	skim_variable	n_missing	complete_rate	Date.min	Date.max	Date.median	Date.n_unique	factor.ordered	factor.n_unique	factor.top_counts	numeric.mean	numeric.sd	numeric.p0	numeric.p25	numeric.p50	numeric.p75	numeric.p100	numeric.hist
Date	DATE	0	1.0000000	2009-05-01	2010-05-14	2009-11-01	379	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
factor	ATM	14	0.9905020	NA	NA	NA	NA	FALSE	4	ATM: 365, ATM: 365, ATM: 365, ATM: 365	NA	NA	NA	NA	NA	NA	NA	NA
numeric	Cash	19	0.9871099	NA	NA	NA	NA	NA	NA	NA	155.582	376.4568	0	0.5	73	114	10919.76	▇▁▁▁▁

Review missing variables

ATM 4 as more cash withdrawals.ATM 3 only has few data points near the end of the year, could be a new machine.ATM 4 Has one usually high outlier so we may have to impute this. Remove values that have both ATM and cash has NA. Then convert the data to tsibble.

Code

# Plot cash taken out 
plot1 <- inputf1 %>%
  group_by(ATM) %>%
  plot_ly(x= ~DATE) %>%
  add_lines(y= ~Cash, color = ~factor(ATM)) %>%
  layout(title = "ATM Withdwals by Date")

plot1

Code

#Look at missing values
plot_missing(inputf1)

Code

knitr::kable(miss_var_summary(inputf1), 
             caption = 'Missing data',
             format="html", 
             table.attr="style='width:50%;'") %>% 
  kableExtra::kable_styling()

Missing data
variable	n_miss	pct_miss
Cash	19	1.29
ATM	14	0.950
DATE	0	0

Code

gg_miss_upset(inputf1)

Code

# summarize the missing data by ATM
inputf1 %>% 
  group_by(as.factor(ATM)) %>%
  summarise(`Missing Values` = sum(is.na(Cash)))

# A tibble: 5 × 2
  `as.factor(ATM)` `Missing Values`
  <fct>                       <int>
1 ATM1                            3
2 ATM2                            2
3 ATM3                            0
4 ATM4                            0
5 <NA>                           14

Code

# Drop any row that are missing both ATM and Cash values
tsibbleATMclean <-   inputf1%>%
  filter(!(is.na(ATM) & is.na(Cash))) %>%
  drop()
# 
# #Create a Tsibble object
#   #tsibbleATM<- read_excel("ATM624DATA.xlsx") %>%
#   # converting into date format
#   tsibbleATMclean
#   
#   %>% mutate(Date = as.Date(as.numeric(DATE),origin = "1899-12-30"),ATM= as.factor(ATM),Cash = as.numeric(Cash)) %>%
#   # renaming column name
#   rename(Date = DATE) %>%
#   # converting to tsibble
# tsibbleATM<-  tsibbleATMclean %>%  mutate(DATE=DATE,ATM=ATM,Cash=Cash) %>%
# as_tsibble(index = Date, key = ATM) 

tsibbleATM <- as_tsibble(tsibbleATMclean, index=DATE, key=ATM)
  
# Confirm 14 rows were dropped
print(c('Original: ', nrow(inputf1), 'After Drop:', nrow(tsibbleATMclean)))

[1] "Original: "  "1474"        "After Drop:" "1460"

Code

tsibbleATM %>%
  autoplot(Cash) +
  facet_wrap(~ATM, scales = "free", nrow = 4) +
  labs(title = " ATM Before Outlier Removal")

Code

 summary(tsibbleATM)

      DATE              ATM           Cash        
 Min.   :2009-05-01   ATM1:365   Min.   :    0.0  
 1st Qu.:2009-07-31   ATM2:365   1st Qu.:    0.5  
 Median :2009-10-30   ATM3:365   Median :   73.0  
 Mean   :2009-10-30   ATM4:365   Mean   :  155.6  
 3rd Qu.:2010-01-29              3rd Qu.:  114.0  
 Max.   :2010-04-30              Max.   :10919.8  
                                 NA's   :5

Review distributions

Let’s take a look at the distributions for the other three ATMs that have steady cash withdrawals first.

Code

plot1 <- tsibbleATM |>
  filter(ATM != "ATM3") |>
  ggplot(aes(x=Cash)) +
  geom_histogram(aes(fill=ATM), bins=30) +
  facet_wrap(~ATM, nrow=1, scales = "free_x") +
  theme(legend.position="none") +
  labs(title = "Histograms of ATM Cash Withdrawals", y = "Count", x = "Cash (Hundreds of $)")

plot2 <- tsibbleATM |>
  filter(ATM != "ATM3") |>
  ggplot(aes(x=Cash)) +
  geom_boxplot(aes(fill=ATM)) +
  facet_wrap(~ATM, nrow=1, scales = "free_x") +
  theme(legend.position="none") +
  labs(title = "Boxplots of ATM Cash Withdrawals", y = "", x = "Cash (Hundreds of $)")

plot_grid(plot1, plot2, ncol=1)

Impute missing values

ATM1 is missing 3 values for Cash. The data appears to be bimodally distributed as can be seen from the boxplot, we will use median imputation for this ATM.

ATM2 is missing 2 values. ATM2 appears to be bimodally distributed with about equal observations in each peak. We will use mode imputation for this ATM.

We will use median imputation to replace the large outlier in ATM4.

Code

# Various NA plots to inspect data
knitr::kable(miss_var_summary(tsibbleATM), 
             caption = 'Missing Values',
             format="html", 
             table.attr="style='width:50%;'") %>% 
  kableExtra::kable_styling()

Missing Values
variable	n_miss	pct_miss
Cash	5	0.342
DATE	0	0
ATM	0	0

Code

tsibbleATM |>
  filter(is.na(Cash)) |>
  knitr::kable()

DATE	ATM	Cash
2009-06-13	ATM1	NA
2009-06-16	ATM1	NA
2009-06-22	ATM1	NA
2009-06-18	ATM2	NA
2009-06-24	ATM2	NA

Code

# function for mode
mode <- function(x) {
   uniq_x <- unique(x)
   uniq_x[which.max(tabulate(match(x, uniq_x)))]
}

tsibbleATM_imputed <- tsibbleATM |>
  mutate(Cash = ifelse(Cash == max(tsibbleATM$Cash, na.rm = T), NA, Cash)) |>
  group_by(ATM) |>
  mutate(Cash = case_when(
    is.na(Cash) & ATM %in% c("ATM1", "ATM4") ~ median(Cash, na.rm=T),
    is.na(Cash) & ATM == "ATM2" ~ mode(Cash),
    TRUE ~ Cash
  ))

#no missing
tsibbleATM_imputed |>
  filter(is.na(Cash)) |>
  knitr::kable()

DATE	ATM	Cash

Code

tsibbleATM_imputed |>
  autoplot(Cash) +
  facet_wrap(~ATM, ncol=1, scales = "free_y") +
  labs(title = "ATM Withdrawal with imputed for Four ATMs (May 2009 - April 2010)")

ATM1 and ATM2 both require seasonal differencing to make the data stationary. So, we can check differencing with a lag 7 to see what happens.The spikes on the ACF are still higher than we want in a residual after differencing.

Code

tsibbleATM_imputed |>
  ACF(Cash) |>
  autoplot() +
  facet_wrap(~ATM, ncol=1, scales = "free_y") +
  labs(title = "ACF Plots after imputation")

Code

tsibbleATM_imputed |>
  features(Cash, unitroot_kpss) |>
  knitr::kable()

ATM	kpss_stat	kpss_pvalue
ATM1	0.4651358	0.0495190
ATM2	1.9660900	0.0100000
ATM3	0.3891606	0.0818273
ATM4	0.1180573	0.1000000

Code

tsibbleATM_imputed |>
  features(Cash, unitroot_ndiffs)

# A tibble: 4 × 2
  ATM   ndiffs
  <fct>  <int>
1 ATM1       1
2 ATM2       1
3 ATM3       0
4 ATM4       0

Code

# atm_finimp |>
#   features(Cash, unitroot_kpss) |>
#   knitr::kable()
# 
# atm_finimp |>
#   features(Cash, unitroot_ndiffs)

tsibbleATM_imputed  |>
  features(difference(Cash,7), unitroot_ndiffs)

# A tibble: 4 × 2
  ATM   ndiffs
  <fct>  <int>
1 ATM1       0
2 ATM2       0
3 ATM3       0
4 ATM4       0

Code

tsibbleATM_imputed |>
  filter(ATM == "ATM1") |>
  gg_tsdisplay(difference(Cash, lag=7), 
               plot_type="partial") +
  labs(title = "ATM1: Seasonally Differenced")

Code

tsibbleATM_imputed |>
  filter(ATM == "ATM2") |>
  gg_tsdisplay(Cash, 
               plot_type="partial") +
  labs(title = "ATM2")

Code

tsibbleATM_imputed |>
  filter(ATM == "ATM4") |>
  gg_tsdisplay(difference(Cash, lag=7), 
               plot_type="partial") +
  labs(title = "ATM4: Seasonally Differenced")

Code

# df = with(ATM.df, ATM.df[(DATE >= "2009-07-01" & DATE <= "2009-08-31"), ])
ts = ggplot(tsibbleATM_imputed, aes(x = DATE, y = Cash, group = ATM, color = ATM)) +
  geom_line() + geom_point() +
  labs(title = "ATM Cash Withdrawal", subtitle ="1 July, 2009 to 31 August, 2009",
       x = "Date") +
  scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K")) +
  theme_minimal() #+ transition_reveal(DATE)

ts

TimeSeries Models

Code

# Transform the dataframe into a time series
temp = tsibbleATM_imputed %>% drop_na() %>% spread(ATM, Cash)
ATM.ts = ts(temp %>% dplyr::select(-DATE), frequency = 7)

ATM1 Subseries using TS

From the time series plot, we can see seasonality in this set, weekly as previously reviewed. While there is no steady trend over the weeks, there are increasing and decreasing activities over periods. The ACF shows slow decrease in every 7th lags due to the trend.Plotting subseries to show the variations during the week

Code

# ggtsdisplay(ATM.ts[,1], main = "ATM #1 Cash Withdrawal", ylab = "Cash Withdrawal (in hundreds)", xlab = "Week")

ggsubseriesplot(ATM.ts[,1]) +
  labs(title = "ATM #1 Cash Withdrawal", subtitle ="1 May, 2009 to 30 April, 2010",
       x = "Days of the Week") +
  scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K"))

Code

df_atm_1 <- tsibbleATM_imputed |>

  filter(ATM == "ATM1")
df_atm_1 %>%
  ACF(Cash)%>%
  autoplot()

ATM 1 Transformation boxcox using TS class

We can try lambda to stabilize the variance similar to the above PART A analysis. The AICc is 2339.304 for the ETS model.

Code

bct = function(ts, title){
  a = autoplot(ts) +
    labs(title = sprintf("Before: %s", title), x = "Week") +
    scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K"))
  lambda = BoxCox.lambda(ts)
  at = autoplot(BoxCox(ts, lambda)) +
    labs(title = sprintf("Transformed: %s", title),
         subtitle = sprintf("lambda = %0.2f", lambda),
         y = "Box-Cox Transformed",
         x = "Week")
  gridExtra::grid.arrange(a, at)
}
lambda = BoxCox.lambda(ATM.ts[,1])
bct(ATM.ts[,1], title = "ATM #1 Cash Withdrawal")

Code

ATM.ts[,1] %>% stl(s.window = 7, robust = TRUE) %>%
  autoplot()  +
  labs(title = "Seasonal & Trend Decomposition for ATM #1 Time Series", x = "Week")

Code

t<- ATM.ts[,1] %>% ets(lambda = lambda, biasadj = TRUE)
t

ETS(A,N,A) 

Call:
ets(y = ., lambda = lambda, biasadj = TRUE)

  Box-Cox transformation: lambda= 0.2446 

  Smoothing parameters:
    alpha = 1e-04 
    gamma = 0.3551 

  Initial states:
    l = 7.5451 
    s = -4.2496 0.846 0.4385 0.5304 1.1743 0.8669
           0.3935

  sigma:  1.2697

     AIC     AICc      BIC 
2338.683 2339.304 2377.682

Code

t %>% autoplot()  +
  labs(title = "ETS for ATM #1 Time Series", x = "Week")

ATM 1 STL and ETS and holtwinter methods using tsibble

I have tried some seasonal methods such as STL and holtwinter since the above data has seasonality and stationarity to it.

Code

fit_dcmp <- df_atm_1  |>
  model(stlf = decomposition_model(
    STL(Cash ~ trend(window = 7), robust = TRUE),
    NAIVE(season_adjust)
  ))
fit_dcmp |>
  forecast() |>
  autoplot(df_atm_1)+
  labs(y = "cash withdrawal",
       title = "ATM1 STL")

Code

fit_dcmp |> gg_tsresiduals()

Code

augment(fit_dcmp)

# A tsibble: 365 x 7 [1D]
# Key:       ATM, .model [1]
   ATM   .model DATE        Cash .fitted  .resid  .innov
   <fct> <chr>  <date>     <dbl>   <dbl>   <dbl>   <dbl>
 1 ATM1  stlf   2009-05-01    96    NA   NA      NA     
 2 ATM1  stlf   2009-05-02    82    NA   NA      NA     
 3 ATM1  stlf   2009-05-03    85    NA   NA      NA     
 4 ATM1  stlf   2009-05-04    90    NA   NA      NA     
 5 ATM1  stlf   2009-05-05    99    NA   NA      NA     
 6 ATM1  stlf   2009-05-06    88    NA   NA      NA     
 7 ATM1  stlf   2009-05-07     8    NA   NA      NA     
 8 ATM1  stlf   2009-05-08   104   100.   3.62    3.62  
 9 ATM1  stlf   2009-05-09    87    90.4 -3.43   -3.43  
10 ATM1  stlf   2009-05-10    93    93.0  0.0293  0.0293
# ℹ 355 more rows

Code

fc <- fit_dcmp  |>
  forecast(h = 24)

fc  |>
autoplot(
    df_atm_1,level=NULL)+
labs(y = "cash withdrawal",
       title = "ATM1 STL forecast")

Code

fit3 <- df_atm_1 |>
  model(
    additive = ETS(Cash ~ error("A") + trend("A") +
                                                season("A")),
    multiplicative = ETS(Cash ~ error("M") + trend("A") +
                                                season("M"))
  )
fc3 <- fit3 |> forecast(h = 24)
fc3 |>
  autoplot(df_atm_1, level = NULL) +
  labs(title="ATM1 ETS",
       y="cash withdrawal") +
  guides(colour = guide_legend(title = "ATM1 STL"))

Code

#augment(fit3)

fit2 <- df_atm_1 %>%
  filter(DATE <= "2010-04-30") |>
  model(
    hw = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
  ) |>
  forecast(h = "2 weeks") |>
  autoplot(df_atm_1 |> filter(DATE <= "2010-05-14")) +
  labs(title = "ATM1 Holt Winter",
       y="cash withdrawal")

fit2a <- df_atm_1 %>%
  filter(DATE <= "2010-04-30") |>
  model(
    hw = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
  )
fit2

Code

report(fit2a)

Series: Cash 
Model: ETS(M,Ad,M) 
  Smoothing parameters:
    alpha = 0.1646764 
    beta  = 0.0001000035 
    gamma = 0.3619369 
    phi   = 0.9760372 

  Initial states:
     l[0]      b[0]      s[0]     s[-1]    s[-2]    s[-3]    s[-4]     s[-5]
 82.93848 -0.917707 0.2975043 0.9715482 1.231799 1.047188 1.103285 0.9785181
    s[-6]
 1.370158

  sigma^2:  0.1368

     AIC     AICc      BIC 
4573.270 4574.307 4623.969

Code

fit2b <- fit2a %>% forecast(h = "2 weeks")
fit2b

# A fable: 14 x 5 [1D]
# Key:     ATM, .model [1]
   ATM   .model DATE               Cash  .mean
   <fct> <chr>  <date>           <dist>  <dbl>
 1 ATM1  hw     2010-05-01   N(83, 951)  83.4 
 2 ATM1  hw     2010-05-02  N(95, 1277)  95.1 
 3 ATM1  hw     2010-05-03   N(70, 704)  69.6 
 4 ATM1  hw     2010-05-04  N(5.1, 3.9)   5.12
 5 ATM1  hw     2010-05-05 N(100, 1527)  99.6 
 6 ATM1  hw     2010-05-06   N(78, 973)  78.4 
 7 ATM1  hw     2010-05-07  N(84, 1141)  83.8 
 8 ATM1  hw     2010-05-08  N(84, 1455)  84.0 
 9 ATM1  hw     2010-05-09  N(96, 1936)  95.9 
10 ATM1  hw     2010-05-10  N(70, 1059)  70.1 
11 ATM1  hw     2010-05-11  N(5.2, 5.8)   5.16
12 ATM1  hw     2010-05-12 N(100, 2262) 100.  
13 ATM1  hw     2010-05-13  N(79, 1431)  79.0 
14 ATM1  hw     2010-05-14  N(84, 1666)  84.4

Code

#augment(fit2a)

fit2c <- df_atm_1 %>%
  filter(DATE <= "2010-04-30") |>
  model(
    hw = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
  ) %>% augment() 

autoplot(fit2c, .innov) +
  labs(y = "$US",
       title = "Residuals from the HW method")

Code

fit2c  |>
  ggplot(aes(x = .innov)) +
  geom_histogram() +
  labs(title = "Histogram of HW residuals")

Code

fit2c  |>
  ACF(.innov) |>
  autoplot() +
  labs(title = "Residuals from the HW method")

Code

components(fit2a) |>
  autoplot() +
  labs(title = "ATM1 Holtwinter")

Code

components(fit3) |>
  autoplot() +
  labs(title = "ATM1 ETS, additive and multiplicative")

ATM 1 ARIMA

We can use seasonal ARIMA modeling to forecast future withdrawals for each ATM.

Let’s use the April 2010 data as a validation set and the rest of the data for training.

We will use an automatically generated ARIMA model and compare with our assumptions from the ACF and PACF plots.

Code

atm1_mod <- df_atm_1 |>
  model(auto = ARIMA(Cash),
        arima001011 = ARIMA(Cash ~ 0 + pdq(0,0,1) + PDQ(0,1,1, period=7)))

atm1_mod |>
  knitr::kable()

ATM	auto	arima001011
ATM1	<ARIMA(0,0,1)(0,1,2)[7]>	<ARIMA(0,0,1)(0,1,1)[7]>

Code

report(atm1_mod) |>
  dplyr::select(.model:BIC) |>
  knitr::kable()

.model	sigma2	log_lik	AIC	AICc	BIC
auto	559.7744	-1641.115	3290.231	3290.344	3305.753
arima001011	564.7218	-1643.061	3292.121	3292.189	3303.763

Code

augment(atm1_mod) |>
  filter(.model == "auto") |>
  features(.innov, ljung_box, lag=14)

# A tibble: 1 × 4
  ATM   .model lb_stat lb_pvalue
  <fct> <chr>    <dbl>     <dbl>
1 ATM1  auto      16.8     0.268

Code

augment(atm1_mod) |>
  filter(.model == "arima001011") |>
  features(.innov, ljung_box, lag=14)

# A tibble: 1 × 4
  ATM   .model      lb_stat lb_pvalue
  <fct> <chr>         <dbl>     <dbl>
1 ATM1  arima001011    19.5     0.147

Code

auto.arima(df_atm_1)

Series: df_atm_1 
ARIMA(0,0,1)(0,1,2)[7] 

Coefficients:
         ma1     sma1     sma2
      0.1695  -0.5867  -0.0989
s.e.  0.0553   0.0508   0.0497

sigma^2 = 559.8:  log likelihood = -1641.12
AIC=3290.23   AICc=3290.34   BIC=3305.75

Code

df_atm_1 %>%

  model(auto = ARIMA(Cash ~ 0 + pdq(0,0,1) + PDQ(0,1,2, period=7)))%>%
       
  residuals() %>% gg_tsdisplay()

Code

fc_atm_1A <- atm1_mod %>%
  forecast(h= 31)

fc_atm_1A %>%
  autoplot(df_atm_1 , level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM1 Forecast in May 2010")

ATM2 Subseries, lambda transformation and SNAIVE, ETS

Code

ggsubseriesplot(ATM.ts[,2]) +
  labs(title = "ATM #2 Cash Withdrawal", subtitle ="1 May, 2009 to 30 April, 2010",
       x = "Days of the Week") +
  scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K"))

Code

df_atm_2 <- tsibbleATM_imputed |>

  filter(ATM == "ATM2")
df_atm_2 %>%
  ACF(Cash)%>%
  autoplot()

Code

lambda_2 <- df_atm_2%>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)

fit_atm_2_a <- df_atm_2%>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_2)),
        ETS = ETS((box_cox(Cash, lambda_2) ~ error("A") + trend("A") +
                                                season("A"))))

fit_atm_2_b <- df_atm_2%>%
  model(ETS = ETS((box_cox(Cash, lambda_2) ~ error("A") + trend("A") +
                                                season("A"))))
fit_atm_2_c <- df_atm_2%>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_2)))

fc_atm_2 <- fit_atm_2_a %>%
  forecast(h= 31)

fc_atm_2%>%
  autoplot(df_atm_2, level = 80) +
  labs(y = "",
       title = latex2exp::TeX(paste0(
         "Transformed gas production with $\\lambda$ = ",
         round(lambda_2,2))))

Code

fit_atm_2_b |> gg_tsresiduals()

Code

fit_atm_2_c |> gg_tsresiduals()

Code

augment(fit_atm_2_a)%>%
  features(.innov, ljung_box, lag = 7)

# A tibble: 2 × 4
  ATM   .model lb_stat lb_pvalue
  <fct> <chr>    <dbl>     <dbl>
1 ATM2  ETS       24.5  0.000934
2 ATM2  SNAIVE    92.9  0

ATM2 ARIMA

Code

atm2 <- df_atm_2 |>
  filter(ATM == "ATM2")

atm2_mod <- atm2 |>
  model(auto = ARIMA(Cash),
        arima000011 = ARIMA(Cash ~ 0 + pdq(0,0,0) + PDQ(0,1,1, period=7)),
        arima000012 = ARIMA(Cash ~ 0 + pdq(0,0,0) + PDQ(0,1,2, period=7)))

atm2_mod |>
  knitr::kable()

ATM	auto	arima000011	arima000012
ATM2	<ARIMA(2,0,2)(0,1,1)[7]>	<ARIMA(0,0,0)(0,1,1)[7]>	<ARIMA(0,0,0)(0,1,2)[7]>

Code

report(atm2_mod) |>
  dplyr::select(.model:BIC) |>
  knitr::kable()

.model	sigma2	log_lik	AIC	AICc	BIC
auto	588.0783	-1649.266	3310.531	3310.771	3333.814
arima000011	620.7961	-1660.382	3324.764	3324.798	3332.525
arima000012	622.0567	-1660.237	3326.473	3326.541	3338.115

Code

augment(atm2_mod)%>%
  features(.innov, ljung_box, lag = 7)

# A tibble: 3 × 4
  ATM   .model      lb_stat lb_pvalue
  <fct> <chr>         <dbl>     <dbl>
1 ATM2  arima000011   21.7    0.00283
2 ATM2  arima000012   21.6    0.00293
3 ATM2  auto           6.12   0.526

Code

fc_atm_2A <- atm2_mod %>%
  forecast(h= 31)

fc_atm_2A %>%
  autoplot(df_atm_2 , level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM1 Forecast in May 2010")

Code

auto.arima(df_atm_2)

Series: df_atm_2 
ARIMA(2,0,2)(0,1,1)[7] 

Coefficients:
          ar1      ar2     ma1     ma2     sma1
      -0.4215  -0.9097  0.4549  0.7936  -0.7489
s.e.   0.0551   0.0435  0.0849  0.0591   0.0387

sigma^2 = 588.1:  log likelihood = -1649.27
AIC=3310.53   AICc=3310.77   BIC=3333.81

Code

df_atm_2 %>%

  model(auto = ARIMA(Cash ~ 0 + pdq(2,0,2) + PDQ(0,1,1, period=7)))%>%
       
  residuals() %>% gg_tsdisplay()

ATM3

Code

ggsubseriesplot(ATM.ts[,3]) +
  labs(title = "ATM #3 Cash Withdrawal", subtitle ="1 May, 2009 to 30 April, 2010",
       x = "Days of the Week") +
  scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K"))

ATM4 Subseries, lambda transformation , ETS and SNAIVE

Code

ggtsdisplay(ATM.ts[,4], main = "ATM #4 Cash Withdrawal", ylab = "Cash Withdrawal (in hundreds)", xlab = "Week")

Code

ggsubseriesplot(ATM.ts[,4]) +
  labs(title = "ATM #4 Cash Withdrawal", subtitle ="1 May, 2009 to 30 April, 2010",
       x = "Days of the Week") +
  scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K"))

Code

df_atm_4 <- tsibbleATM_imputed |>
  filter(ATM == "ATM4")

lambda_4 <- df_atm_4 %>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)

fit_atm_4_a <- df_atm_4%>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_4)),
        ETS = ETS((box_cox(Cash, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))

fit_atm_4_b <- df_atm_4%>%
  model(ETS = ETS((box_cox(Cash, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))
fit_atm_4_c <- df_atm_4%>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_4)))


fc_atm_4 <- fit_atm_4_a %>%
  forecast(h= 31)

fc_atm_4%>%
  autoplot(df_atm_4, level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM4 Forecast in May 2010")

ATM4 ARIMA

Code

atm4 <- tsibbleATM_imputed |>
  filter(ATM == "ATM4")

atm4_mod <- atm4 |>
  model(auto = ARIMA(Cash),
        arima000100 = ARIMA(Cash ~ 0 + pdq(0,0,0) + PDQ(1,0,0, period=7)))

atm4_mod |>
  knitr::kable()

ATM	auto	arima000100
ATM4	<ARIMA(3,0,2)(1,0,0)[7] w/ mean>	<ARIMA(0,0,0)(1,0,0)[7]>

Code

report(atm4_mod) |>
  dplyr::select(.model:BIC) |>
  knitr::kable()

.model	sigma2	log_lik	AIC	AICc	BIC
auto	117510.1	-2645.176	5306.353	5306.757	5337.552
arima000100	171792.3	-2719.439	5442.878	5442.911	5450.678

Code

fc_atm_4A <- atm4_mod %>%
  forecast(h= 31)

fc_atm_4A %>%
  autoplot(atm4 , level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM4 Forecast in May 2010")

Code

auto.arima(atm4)

Series: atm4 
ARIMA(0,0,3)(1,0,0)[7] with non-zero mean 

Coefficients:
         ma1     ma2     ma3    sar1      mean
      0.0874  0.0161  0.0886  0.1838  444.6095
s.e.  0.0532  0.0583  0.0516  0.0526   26.0987

sigma^2 = 119377:  log likelihood = -2648.96
AIC=5309.93   AICc=5310.16   BIC=5333.33

Code

atm4 %>%

  model(auto = ARIMA(Cash ~ 0 + pdq(3,0,2) + PDQ(1,0,0, period=7)))%>%
       
  residuals() %>% gg_tsdisplay()

Forecast

Let’s do ARIMA forecast and NAIVE for the ATM3 model and show forecast plots below.

Code

atm_train <- tsibbleATM_imputed |>
  filter(DATE < "2010-04-01")

atms_mod <- atm_train |>
  filter(ATM != "ATM3") |>
  model(ARIMA(Cash))

fc <- atms_mod |>
  forecast(h=61)

Code

atm3 <- tsibbleATM_imputed |>
  filter(ATM == "ATM3")

atm3_mod <- atm3 |>
  model(NAIVE(Cash))

atm3_fc <- atm3_mod |>
  forecast(h=31)

atm3_fc%>%
  autoplot(atm3, level = 80) +
  labs(y = "",
       title = 
         "ATM3 with NAIVE ",
         )

Code

fc <- fc |>
  bind_rows(atm3_fc)

fc |>
  autoplot(tsibbleATM_imputed)

Forecast - Plot residuals

White we have tried several types of models. We have proceeded with ARIMA as our final forecasts.

Code

atms_mod |>
  filter(ATM == "ATM1") |>
  gg_tsresiduals() +
  labs(title = "ATM1")

Code

atms_mod |>
  filter(ATM == "ATM2") |>
  gg_tsresiduals() +
  labs(title = "ATM2")

Code

atms_mod |>
  filter(ATM == "ATM4") |>
  gg_tsresiduals() +
  labs(title = "ATM4")

Code

train4 <- atm_train|>
  filter(ATM == "ATM4")

lambda_4 <- train4 %>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)

fit_atm_4_a <- train4 %>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_4)),
        ETS = ETS((box_cox(Cash, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))

fit_atm_4_b <- train4 %>%
  model(ETS = ETS((box_cox(Cash, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))
fit_atm_4_c <- train4 %>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_4)))


fc_atm_4 <- fit_atm_4_a %>%
  forecast(h= 31)

fc_atm_4%>%
  autoplot(train4 , level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM4 Forecast in May 2010")

Code

fit_atm_4_c |>
  
  gg_tsresiduals() +
  labs(title = "ATM4")

Code

fit_atm_4_b |>
  
  gg_tsresiduals() +
  labs(title = "ATM4")

Code

fc_atm_4b <- fit_atm_4_b %>%
  forecast(h= 61)

fc_atm_4b%>%
  autoplot(train4 , level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM4 Forecast in May 2010")

Forecast - Write to file final forecast

Write out Arima for the forecasts. For ATM4, the model for ETS was better on review, therefore I have forecast ETS for ATM4.

Code

may_forecasts <- fc |>
  filter(DATE >= "2010-05-01")

may_forecasts

# A fable: 124 x 5 [1D]
# Key:     ATM, .model [4]
   ATM   .model      DATE              Cash .mean
   <fct> <chr>       <date>          <dist> <dbl>
 1 ATM1  ARIMA(Cash) 2010-05-01  N(91, 899)  90.7
 2 ATM1  ARIMA(Cash) 2010-05-02 N(114, 899) 114. 
 3 ATM1  ARIMA(Cash) 2010-05-03  N(70, 899)  70.0
 4 ATM1  ARIMA(Cash) 2010-05-04  N(23, 899)  23.5
 5 ATM1  ARIMA(Cash) 2010-05-05 N(105, 899) 105. 
 6 ATM1  ARIMA(Cash) 2010-05-06  N(86, 955)  86.4
 7 ATM1  ARIMA(Cash) 2010-05-07  N(91, 957)  91.1
 8 ATM1  ARIMA(Cash) 2010-05-08  N(91, 957)  90.7
 9 ATM1  ARIMA(Cash) 2010-05-09 N(114, 957) 114. 
10 ATM1  ARIMA(Cash) 2010-05-10  N(70, 957)  70.0
# ℹ 114 more rows

Code

may_forecasts_ets <- fc_atm_4b|>
  filter(DATE >= "2010-05-01")
may_forecasts_ets

# A fable: 31 x 5 [1D]
# Key:     ATM, .model [1]
   ATM   .model DATE                Cash .mean
   <fct> <chr>  <date>            <dist> <dbl>
 1 ATM4  ETS    2010-05-01 t(N(25, 101))  516.
 2 ATM4  ETS    2010-05-02 t(N(25, 102))  531.
 3 ATM4  ETS    2010-05-03 t(N(25, 102))  506.
 4 ATM4  ETS    2010-05-04 t(N(24, 102))  461.
 5 ATM4  ETS    2010-05-05 t(N(23, 102))  434.
 6 ATM4  ETS    2010-05-06 t(N(12, 103))  160.
 7 ATM4  ETS    2010-05-07 t(N(26, 103))  561.
 8 ATM4  ETS    2010-05-08 t(N(25, 103))  516.
 9 ATM4  ETS    2010-05-09 t(N(25, 103))  531.
10 ATM4  ETS    2010-05-10 t(N(25, 103))  506.
# ℹ 21 more rows

Code

autoplot(may_forecasts_ets)

Code

openxlsx::write.xlsx(data.frame(may_forecasts), "forecasts_ATM_Banu_ARIMA.xlsx")
openxlsx::write.xlsx(data.frame(may_forecasts_ets), "forecasts_ATM_Banu_ETSATM4.xlsx")

Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.

Data Exploration & Plots

First we start reviewing the data and identify any missing variables. The data consists 192 rows and 3 columns. CaseSequence is redundant and can be removed. The date column (YYYY.MMM) is coded as a character. For timeseries analysis we will mutate and create a tsibble with the date after formatting. September 2008 is missing the recorded KWH.

Code

set.seed(2024)
Filepower <- readxl::read_excel("C:/Users/Banu/Documents/RScriptfiles/DATA 624/PROJECT 1/ResidentialCustomerForecastLoad-624.xlsx")


#Glimpse variables
introduce(Filepower)

# A tibble: 1 × 9
   rows columns discrete_columns continuous_columns all_missing_columns
  <int>   <int>            <int>              <int>               <int>
1   192       3                1                  2                   0
# ℹ 4 more variables: total_missing_values <int>, complete_rows <int>,
#   total_observations <int>, memory_usage <dbl>

Code

plot_intro(Filepower)

Code

glimpse(Filepower)

Rows: 192
Columns: 3
$ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
$ `YYYY-MMM`   <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
$ KWH          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…

Code

describe(Filepower)

             vars   n      mean         sd    median   trimmed        mad
CaseSequence    1 192     828.5      55.57     828.5     828.5      71.16
YYYY-MMM*       2 192      96.5      55.57      96.5      96.5      71.16
KWH             3 191 6502474.6 1447570.89 6283324.0 6439474.9 1543073.77
                min      max   range skew kurtosis        se
CaseSequence    733      924     191 0.00    -1.22      4.01
YYYY-MMM*         1      192     191 0.00    -1.22      4.01
KWH          770523 10655730 9885207 0.17     0.45 104742.55

Code

skim(Filepower)

Data summary
Name	Filepower
Number of rows	192
Number of columns	3
_______________________
Column type frequency:
character	1
numeric	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
YYYY-MMM	0	1	8	8	0	192	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
CaseSequence	0	1.00	828.5	55.57	733	780.75	828.5	876.25	924	▇▇▇▇▇
KWH	1	0.99	6502474.6	1447570.89	770523	5429912.00	6283324.0	7620523.50	10655730	▁▁▇▅▁

Code

#Look at missing values

knitr::kable(miss_var_summary(Filepower), 
             caption = 'Missing data',
             format="html", 
             table.attr="style='width:50%;'") %>% 
  kableExtra::kable_styling()

Missing data
variable	n_miss	pct_miss
KWH	1	0.521
CaseSequence	0	0
YYYY-MMM	0	0

Code

Filepower_ts <- Filepower |>
  rename(Date = "YYYY-MMM") |>
  mutate(Date = yearmonth(Date)) |>
  dplyr::select(-CaseSequence) |>
  as_tsibble(index=Date)

Filepower_ts |>
  filter(is.na(KWH)) |>
  knitr::kable()

Date	KWH
2008 Sep	NA

Data Distributions

The line plot shows seasonality of the series. The data follows a clear seasonal trend. There is more power usage in the beginning, middle, and end of the year. There is an outlier point outside of the range during July 2010. We will update the outlier point to NA so we can impute it. The seasonal trend also shows a pattern with usage higher in the beginning, middle and end of the year.

Code

plot1 <- Filepower_ts |>
  autoplot(KWH) +
  labs(title = "Lineplot of Power Usage", x = "Date") +
  scale_y_continuous(labels=scales::comma)

plot2 <- Filepower_ts |>
  ggplot(aes(x = KWH)) +
  geom_histogram(bins=15, fill='lightpink') +
  scale_x_continuous(labels=scales::comma) +
  labs(title = "Histogram of KWH")

plot3 <- Filepower_ts |>
  ggplot(aes(x = KWH)) +
  geom_boxplot(fill='lightpink') +
  scale_x_continuous(labels=scales::comma) +
  labs(title = "Boxplot of KWH")

plot_grid(plot1, plot_grid(plot2, plot3, nrow=1), ncol=1)

Code

#Identify the outlier
Filepower_ts |>
  filter(KWH < 3000000)

# A tsibble: 1 x 2 [1M]
      Date    KWH
     <mth>  <dbl>
1 2010 Jul 770523

Code

outlierpt <- which(Filepower_ts$Date == yearmonth("2010 Jul"))
Filepower_ts$KWH[outlierpt] = NA

Filepower_ts |>
  filter(is.na(KWH)) |>
  knitr::kable()

Date	KWH
2008 Sep	NA
2010 Jul	NA

Code

Filepower_ts |>
  gg_lag(KWH, geom="point", lags = 1:12)

Code

Filepower_ts |>
  gg_season(KWH) +
  scale_y_continuous(labels=scales::comma) +
  labs(title = "Seasonal Decomposition of KWH", x = "Month")

Code

Filepower_ts |>
  features(difference(KWH, lag=12), unitroot_kpss) |>
  knitr::kable()

kpss_stat	kpss_pvalue
0.1323771	0.1

Data Imputation

To impute the missing values, we can use the mean of the same month the 2 years prior. Given the right-skewed nature of the histogram, we can perform a box-cox transformation to reduce the skewness.

Code

set.seed(2024)


imputejul <- Filepower_ts |>
  filter(Date %in% c("2008 Jul", "2009 Jul"))

imputesep <- Filepower_ts |>
  filter(Date %in% c("2006 Sep", "2007 Sep")) 

# impute
Filepower_ts <- Filepower_ts |>
  mutate(KWH = ifelse(Date == yearmonth("2010 Jul"), mean(imputejul$KWH), KWH),
         KWH = ifelse(Date == yearmonth("2008 Sep"), mean(imputesep$KWH), KWH))


ggplot(Filepower_ts, aes(x = KWH)) +
  geom_histogram() +
  labs(title = "Histogram of KWH")

Code

Filepower_ts %>% 
  plot_ly(x= ~year(Date)) %>%
  add_lines(y= ~KWH) %>%
  layout(title = "Power Usage",
         xaxis = list(title="Date"),
         yaxis = list(title="KWH"))

Code

Filepower_ts |>
  autoplot(KWH) +
  labs(title = "Lineplot of Power Usage", x = "Date") +
  scale_y_continuous(labels=scales::comma)

Code

knitr::kable(miss_var_summary(Filepower_ts), 
             caption = 'Missing data',
             format="html", 
             table.attr="style='width:50%;'") %>% 
  kableExtra::kable_styling()

Missing data
variable	n_miss	pct_miss
Date	0	0
KWH	0	0

Models

In light of the observed seasonality,we can try some seasonal methods such as Arima. Here we can conver the dataframe into time series and see the ACF plot. The spikes show outside of the range, indicates it is not white noise and not-normal.

Code

power_ts <- ts(Filepower_ts$KWH, start=1998, frequency = 12)
Acf(power_ts)

Code

ggtsdisplay(power_ts, main="Time series KWH before transformation")

Code

Filepower_ts%>%
  ACF(KWH) %>%
  autoplot() +
  labs(title="KWH consumption before transformation ACF")

The aim now is to find an appropriate ARIMA model based on the ACF and PACF. Both the ACF and PACF plots are sinusoidal decaying and there is a large significant spike at lag 12. There are smaller significant spikes.

We can use auto.arima() function to determine the appropriate seasonal AR and MA models. Also I have used lambda to see if we can tranform the data to minimize the variance.

Train Models

Code

set.seed(2024)

Filepower_ts |>
  gg_tsdisplay(KWH) +
  labs(title = "Power Usage")

Code

Filepower_ts |>
  gg_tsdisplay(difference(KWH, lag=12), 
               plot_type="partial") +
  labs(title = "Power Usage: Seasonally Differenced")

Code

Filepower_ts|>
  features(difference(KWH, lag=12), unitroot_kpss) |>
kable()

kpss_stat	kpss_pvalue
0.1623246	0.1

Code

Filepower_ts |>
  gg_subseries(KWH) +
  labs(title = "Power Usage")

Code

lambda <- Filepower_ts |>
  features(KWH, features = guerrero) |>
  pull(lambda_guerrero)
Filepower_ts  |>
  autoplot(box_cox(KWH, lambda)) +
  labs(y = "",
       title = latex2exp::TeX(paste0(
         "Transformed KWH $\\lambda$ = ",
         round(lambda,2))))

Code

Filepower_ts |>
  model(
    classical_decomposition(KWH, type = "additive")
  ) |>
  components() |>
  autoplot() +
  labs(title = "Classical additive decomposition of total
                  KWH")

Code

Filepower_ts %>% 
  model(
    STL(KWH ~ season(window = "periodic"),
                  
    robust = TRUE)) |>
  components() |>
  autoplot()

Predictions and Evaluation

Here I have tried some seasonal methods such as holt winter and damp holt winter. I have also plotted the histogram showing that for the large part the models show a normal distribution bell curve on the residual, however, there is one large spike on the ACF. We can proceed to look at ARIMA to see how it does by reviewing the AICc value, we can proceed with the lowest AICc on the ARIMA model and write out our forecast.

Code

Filepower_ts %>% auto.arima(lambda = lambda, biasadj = TRUE)

Series: . 
ARIMA(0,0,1)(2,1,0)[12] with drift 
Box Cox transformation: lambda= -0.2048968 

Coefficients:
         ma1     sar1     sar2  drift
      0.2483  -0.7212  -0.3555  1e-04
s.e.  0.0832   0.0751   0.0770  1e-04

sigma^2 = 1.505e-05:  log likelihood = 751.73
AIC=-1493.47   AICc=-1493.12   BIC=-1477.5

Code

Filepower_ts |>
  model(
    holt = ETS(KWH ~ error("A") +
                       trend("A") + season("A")),
    damp = ETS(KWH ~ error("M") +
                       trend("Ad") + season("M"))
  ) |>
  forecast(h = 24) |>
  autoplot(Filepower_ts, level = NULL) +
  labs(title = "Power generation",
       y = "KWH") +
  guides(colour = guide_legend(title = "Forecast holt winters and damp holt winter"))

Code

count_gaps(Filepower_ts)

# A tibble: 0 × 3
# ℹ 3 variables: .from <mth>, .to <mth>, .n <int>

Code

t <- Filepower_ts |>
  model(
    holt = ETS(KWH ~ error("A") +
                       trend("A") + season("A")),
    damp = ETS(KWH ~ error("M") +
                       trend("Ad") + season("M"))
  )

t |> pivot_longer(everything(), names_to = "Model name",
                     values_to = "Orders")

# A mable: 2 x 2
# Key:     Model name [2]
  `Model name`        Orders
  <chr>              <model>
1 holt          <ETS(A,A,A)>
2 damp         <ETS(M,Ad,M)>

Code

t |>  dplyr::select(holt) |> gg_tsresiduals(lag=24)

Code

t |>  dplyr::select(damp ) |> gg_tsresiduals(lag=24)

ARIMA models

The auto arima model <ARIMA(0,0,1)(2,1,0)[12] w/ drift> seems to have the lowest AICc value, hence we will select this model and store our forecast. Additionally, the residuals on this model seem better, however, that the auto arima model and the scores of AICc and BIC are lower as well. ARIMA(0,0,1)(0,1,1)[12] with drift.

Code

power_mods <- 
Filepower_ts |>
  filter(year(Date) < 2012) |>
  model(auto = ARIMA(KWH),
        arima000011 = ARIMA(KWH ~ 0 + pdq(0,0,0) + PDQ(0,1,1, period=12)),
        arima000110 = ARIMA(KWH ~ 0 + pdq(0,0,0) + PDQ(1,1,0, period=12)),
        arima001011 = ARIMA(KWH ~ 0 + pdq(0,0,1) + PDQ(0,1,1, period=12)),
        arima001110 = ARIMA(KWH ~ 0 + pdq(0,0,1) + PDQ(1,1,0, period=12)),
        arima001011drift = ARIMA(KWH ~ 1 + pdq(0,0,1) + PDQ(0,1,1, period=12)),
        arima001110drift = ARIMA(KWH ~ 1 + pdq(0,0,1) + PDQ(1,1,0, period=12)))

power_mods |> pivot_longer(everything(), names_to = "Model name",
                     values_to = "Orders")

# A mable: 7 x 2
# Key:     Model name [7]
  `Model name`                                 Orders
  <chr>                                       <model>
1 auto             <ARIMA(0,0,1)(2,1,0)[12] w/ drift>
2 arima000011               <ARIMA(0,0,0)(0,1,1)[12]>
3 arima000110               <ARIMA(0,0,0)(1,1,0)[12]>
4 arima001011               <ARIMA(0,0,1)(0,1,1)[12]>
5 arima001110               <ARIMA(0,0,1)(1,1,0)[12]>
6 arima001011drift <ARIMA(0,0,1)(0,1,1)[12] w/ drift>
7 arima001110drift <ARIMA(0,0,1)(1,1,0)[12] w/ drift>

Code

report(power_mods) |>
  dplyr::select(.model:BIC) |>
  knitr::kable()

.model	sigma2	log_lik	AIC	AICc	BIC
auto	358748449455	-2298.737	4607.474	4607.874	4622.723
arima000011	473934089563	-2320.148	4644.297	4644.375	4650.396
arima000110	491517415472	-2322.441	4648.882	4648.961	4654.982
arima001011	385874805250	-2304.657	4615.313	4615.471	4624.463
arima001110	429148017595	-2311.241	4628.482	4628.640	4637.632
arima001011drift	356469559457	-2299.458	4606.915	4607.180	4619.115
arima001110drift	418376486644	-2308.873	4625.747	4626.011	4637.946

Code

#power_mods |> summary(auto)

power_mods |>
  forecast(h="3 years") |>
  autoplot(Filepower_ts |> filter(year(Date) > 2010), level= NULL) +
  labs(title = "KWH Forecast Models ARIMA") +
  scale_y_continuous(labels=scales::comma)

Code

power_mods |>
  dplyr::select(auto) |>
  gg_tsresiduals() +
  labs(title = "Auto ARIMA")

Code

power_mods |>
  dplyr::select(arima001011drift) |>
  gg_tsresiduals() +
  labs(title = "ARIMA(0,0,1)(0,1,1)[12] with drift")

Code

fc_2014 <- power_mods |>
dplyr::select(auto) |>
  forecast(h="3 years") |>
  filter(year(Date)  == 2014)

fc_2014a <- power_mods |>
dplyr::select(arima001011drift) |>
  forecast(h="3 years") |>
  filter(year(Date)  == 2014)

fc_2014b <- power_mods |>
dplyr::select(arima001011drift) |>
  forecast(h="3 years") |>
  filter(year(Date)  == 2014)

fc_2014

# A fable: 12 x 4 [1M]
# Key:     .model [1]
   .model     Date                 KWH    .mean
   <chr>     <mth>              <dist>    <dbl>
 1 auto   2014 Jan N(9082193, 5.1e+11) 9082193.
 2 auto   2014 Feb N(8825149, 5.2e+11) 8825149.
 3 auto   2014 Mar N(6827872, 5.2e+11) 6827872.
 4 auto   2014 Apr   N(6e+06, 5.2e+11) 6028873.
 5 auto   2014 May N(5679457, 5.2e+11) 5679457.
 6 auto   2014 Jun N(7723045, 5.2e+11) 7723045.
 7 auto   2014 Jul N(9507586, 5.2e+11) 9507586.
 8 auto   2014 Aug N(9795634, 5.2e+11) 9795634.
 9 auto   2014 Sep N(8816531, 5.2e+11) 8816531.
10 auto   2014 Oct   N(6e+06, 5.2e+11) 6007628.
11 auto   2014 Nov N(6060119, 5.2e+11) 6060119.
12 auto   2014 Dec N(7943802, 5.2e+11) 7943802.

Code

fc_2014a

# A fable: 12 x 4 [1M]
# Key:     .model [1]
   .model               Date                 KWH    .mean
   <chr>               <mth>              <dist>    <dbl>
 1 arima001011drift 2014 Jan N(8922240, 4.9e+11) 8922240.
 2 arima001011drift 2014 Feb N(8223524, 4.9e+11) 8223524.
 3 arima001011drift 2014 Mar N(6801798, 4.9e+11) 6801798.
 4 arima001011drift 2014 Apr   N(6e+06, 4.9e+11) 5959830.
 5 arima001011drift 2014 May N(5557517, 4.9e+11) 5557517.
 6 arima001011drift 2014 Jun N(7198250, 4.9e+11) 7198250.
 7 arima001011drift 2014 Jul N(8709822, 4.9e+11) 8709822.
 8 arima001011drift 2014 Aug N(9089526, 4.9e+11) 9089526.
 9 arima001011drift 2014 Sep N(8429024, 4.9e+11) 8429024.
10 arima001011drift 2014 Oct N(6118739, 4.9e+11) 6118739.
11 arima001011drift 2014 Nov N(5664756, 4.9e+11) 5664756.
12 arima001011drift 2014 Dec N(7291513, 4.9e+11) 7291513.

Code

dataframe1 <- data.frame (fc_2014a)

Write final forecast to a file

In conclusion :

From the book we looked at several models after checking seasonality, stationarity,looked at residuals, ran holt winter, ARIMA models and finally made ARIMA model selection and wrote the forecast to the file.

Definition from the book.

“Transformations such as logarithms can help to stabilise the variance of a time series. Differencing can help stabilise the mean of a time series by removing changes in the level of a time series, and therefore eliminating (or reducing) trend and seasonality.

A stationary time series is one whose properties do not depend on the time at which the series is observed. Thus, time series with trends, or with seasonality, are not stationary — the trend and seasonality will affect the value of the time series at different times. On the other hand, a white noise series is stationary — it does not matter when you observe it, it should look much the same at any point in time.”

Code

date = seq(as.Date('2014-01-01'), as.Date('2014-12-01'), by = 'month') %>% format('%Y-%b')
openxlsx::write.xlsx(data.frame(YYYY.MM = date, KWH = fc_2014a$.mean), "forecasts_Banu.xlsx")

Part C – Bonus

Part C consists of two data sets. These are simple 2 columns sets, however they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (example of what this looks like, follows). Note for multiple recordings within an hour, take the mean. Then to determine if the data is stationary and can it be forecast. If so, provide a week forward forecast and present results via Rpubs and .rmd and the forecast in an Excel readable file.

Code

# Load the Excel data
pipe1_df <- readxl::read_excel('C:/Users/Banu/Documents/RScriptfiles/DATA 624/PROJECT 1/Waterflow_Pipe1.xlsx')
colnames(pipe1_df) <- c('DATE', 'WaterFlow')


pipe2_df <- readxl::read_excel('C:/Users/Banu/Documents/RScriptfiles/DATA 624/PROJECT 1/Waterflow_Pipe2.xlsx')
colnames(pipe2_df) <- c('DATE', 'WaterFlow')

glimpse(pipe1_df)

Rows: 1,000
Columns: 2
$ DATE      <dbl> 42300.02, 42300.03, 42300.04, 42300.04, 42300.06, 42300.06, …
$ WaterFlow <dbl> 23.369599, 28.002881, 23.065895, 29.972809, 5.997953, 15.935…

Let’s format the Date.Time columns to actual date types.

Code

waterpipe1_df <- pipe1_df |>
  mutate(Date.Time = as.POSIXct(pipe1_df$DATE * 86400, origin = "1899-12-30", tz="GMT"))

waterpipe2_df <- pipe2_df |>
  mutate(Date.Time = as.POSIXct(pipe2_df$DATE * 86400, origin = "1899-12-30", tz="GMT"))

Look at missing data , aggregate the mean or average flow and look at hourly data.

Code

head(waterpipe1_df)

# A tibble: 6 × 3
    DATE WaterFlow Date.Time          
   <dbl>     <dbl> <dttm>             
1 42300.     23.4  2015-10-23 00:24:06
2 42300.     28.0  2015-10-23 00:40:02
3 42300.     23.1  2015-10-23 00:53:51
4 42300.     30.0  2015-10-23 00:55:40
5 42300.      6.00 2015-10-23 01:19:17
6 42300.     15.9  2015-10-23 01:23:58

Code

head(waterpipe2_df)

# A tibble: 6 × 3
    DATE WaterFlow Date.Time          
   <dbl>     <dbl> <dttm>             
1 42300.      18.8 2015-10-23 01:00:00
2 42300.      43.1 2015-10-23 01:59:59
3 42300.      38.0 2015-10-23 03:00:00
4 42300.      36.1 2015-10-23 04:00:00
5 42300.      31.9 2015-10-23 04:59:59
6 42300.      28.2 2015-10-23 06:00:00

Code

knitr::kable(miss_var_summary(waterpipe1_df), 
             caption = 'Pipe 1 Missing Data',
             format="html", 
             table.attr="style='width:60%;'") %>% 
  kableExtra::kable_styling()

Pipe 1 Missing Data
variable	n_miss	pct_miss
DATE	0	0
WaterFlow	0	0
Date.Time	0	0

Code

knitr::kable(miss_var_summary(waterpipe2_df), 
             caption = 'Pipe 1 Missing Data',
             format="html", 
             table.attr="style='width:60%;'") %>% 
  kableExtra::kable_styling()

Pipe 1 Missing Data
variable	n_miss	pct_miss
DATE	0	0
WaterFlow	0	0
Date.Time	0	0

Code

hourly_pipe1 <- waterpipe1_df |>
  mutate(Date.Time = floor_date(Date.Time, "hour")) |>
  group_by(Date.Time) |>
  summarize(avg_flow = mean(WaterFlow, na.rm=T)) |>
  as_tsibble(index=Date.Time)

hourly_pipe2 <- waterpipe2_df |>
  mutate(Date.Time = floor_date(Date.Time, "hour")) |>
  group_by(Date.Time) |>
  summarize(avg_flow = mean(WaterFlow, na.rm=T)) |>
  as_tsibble(index=Date.Time)

hourly_pipe1 |>
  features(avg_flow, unitroot_kpss) |>
  knitr::kable()

kpss_stat	kpss_pvalue
0.2465587	0.1

Code

# hourly_pipe1 |>
#  ACF(avg_flow) |>
#   autoplot()

Check for gaps and interpolate

This time series has gaps or missing values, therefore we will use na_interpolation to fill gaps and plot the ACF. The ACF shows one significant spike.

Code

count_gaps(hourly_pipe1)

# A tibble: 4 × 3
  .from               .to                    .n
  <dttm>              <dttm>              <int>
1 2015-10-27 17:00:00 2015-10-27 17:00:00     1
2 2015-10-28 00:00:00 2015-10-28 00:00:00     1
3 2015-11-01 05:00:00 2015-11-01 05:00:00     1
4 2015-11-01 09:00:00 2015-11-01 09:00:00     1

Code

count_gaps(hourly_pipe2)

# A tibble: 333 × 3
   .from               .to                    .n
   <dttm>              <dttm>              <int>
 1 2015-10-23 02:00:00 2015-10-23 02:00:00     1
 2 2015-10-23 05:00:00 2015-10-23 05:00:00     1
 3 2015-10-23 08:00:00 2015-10-23 08:00:00     1
 4 2015-10-23 11:00:00 2015-10-23 11:00:00     1
 5 2015-10-23 14:00:00 2015-10-23 14:00:00     1
 6 2015-10-23 17:00:00 2015-10-23 17:00:00     1
 7 2015-10-23 20:00:00 2015-10-23 20:00:00     1
 8 2015-10-23 23:00:00 2015-10-23 23:00:00     1
 9 2015-10-24 02:00:00 2015-10-24 02:00:00     1
10 2015-10-24 05:00:00 2015-10-24 05:00:00     1
# ℹ 323 more rows

Code

pipes1_tsb <- hourly_pipe1 %>%
  fill_gaps(.full=TRUE) %>%
  na_interpolation(option="spline")

pipes2_tsb <- hourly_pipe2%>%
  fill_gaps(.full=TRUE) %>%
  na_interpolation(option="spline")


pipes1_tsb |> ACF(avg_flow) |>
  autoplot()

Code

pipes2_tsb |> ACF(avg_flow) |>
  autoplot()

Plot pipe1

Code

plot1 <- hourly_pipe1 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  autoplot(avg_flow) +
  labs(title = "Pipe 1: Hourly Water Flow", x = "Date", y = "Average Water Flow")

plot2 <- hourly_pipe1 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  ACF(avg_flow) |>
  autoplot() +
  labs(title = "ACF")

plot_grid(plot1, plot2, ncol=1)

Plot pipe2

Code

plot1 <- hourly_pipe2 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  autoplot(avg_flow) +
  labs(title = "Pipe 2: Hourly Water Flow", x = "Date", y = "Average Water Flow")

plot2 <- hourly_pipe2 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  ACF(avg_flow) |>
  autoplot() +
  labs(title = "ACF")

plot_grid(plot1, plot2, ncol=1)

Check Stationarity

I have checked ome ways to check differencing and then looking at the features to see the effects.

From the book: “Differencing can help stabilise the mean of a time series by removing changes in the level of a time series, and therefore eliminating (or reducing) trend and seasonality.A stationary time series is one whose properties do not depend on the time at which the series is observed.17 Thus, time series with trends, or with seasonality, are not stationary — the trend and seasonality will affect the value of the time series at different times. On the other hand, a white noise series is stationary — it does not matter when you observe it, it should look much the same at any point in time.”

Code

pipes2_tsb |> ACF(difference(avg_flow)) |>
  autoplot() + labs(subtitle = "Hourly Water Flow Pipe 2")

Code

pipes2_tsb |>
  features(avg_flow, unitroot_ndiffs)

# A tibble: 1 × 1
  ndiffs
   <int>
1      0

Code

pipes2_tsb |>
  mutate(diff_avg_flow = difference(avg_flow)) |>
  features(diff_avg_flow, unitroot_kpss)

# A tibble: 1 × 2
  kpss_stat kpss_pvalue
      <dbl>       <dbl>
1   0.00806         0.1

Code

t<- pipes2_tsb |>
  mutate(diff_avg_flow = difference(avg_flow)) 

t %>%
  gg_tsdisplay(diff_avg_flow)

Boxcox,ETS,SNAIVE

This time, the p-value is reported as 0.1 (and so it could be larger than that). We can conclude that the differenced data appear stationary.

Code

lambda_5 <- pipes2_tsb %>%
  features(avg_flow, features = guerrero) %>%
  pull(lambda_guerrero)


fit_atm_4_a <-  pipes2_tsb %>%
  model(SNAIVE = SNAIVE(box_cox(avg_flow, lambda_5)),
        ETS = ETS((box_cox(avg_flow, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))

fit_atm_4_b <- pipes2_tsb%>%
  model(ETS = ETS((box_cox(avg_flow, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))
fit_atm_4_c <- pipes2_tsb %>%
  model(SNAIVE = SNAIVE(box_cox(avg_flow, lambda_4)))


fc_atm_4 <- fit_atm_4_a %>%
  forecast(h= "1 week")

fc_atm_4%>%
  autoplot(pipes2_tsb , level = 80) +
  labs(y = "Hourly Water Flow",
       title = "Hourly Water Flow Pipe 2 with boxcox and SNAIVE, ETS")

Code

fit_atm_4_b |>
  
  gg_tsresiduals() +
  labs(title = "Hourly Water Flow boxcox ETS")

Code

fit_atm_4_c |>
  
  gg_tsresiduals() +
  labs(title = "Hourly Water Flow boxcox SNAIVE")

Code

fc_atm_4b <- fit_atm_4_b %>%
  forecast(h = "1 week")

fc_atm_4c <- fit_atm_4_c %>%
  forecast(h= "1 week")

fc_atm_4b%>%
  autoplot(pipes2_tsb , level = 80) +
  labs(y = "Hourly Water Flow Pipe 2 with boxcox and ETS",
       title = "Hourly Water Flow Pipe 2 with boxcox and ETS")

Code

fc_atm_4c%>%
  autoplot(pipes2_tsb , level = 80) +
  labs(y = "Hourly Water Flow Pipe 2 with boxcox SNAIVE",
       title = "Hourly Water Flow Pipe 2 with boxcox and SNAIVE")

Forecast ARIMA

Code

pipe1_mod <- hourly_pipe1 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  model(ARIMA(avg_flow))

report(pipe1_mod)

Series: avg_flow 
Model: ARIMA(0,0,0) w/ mean 

Coefficients:
      constant
       19.8585
s.e.    0.2743

sigma^2 estimated as 18.14:  log likelihood=-687.79
AIC=1379.59   AICc=1379.64   BIC=1386.55

Code

pipe2_mod <- hourly_pipe2 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  model(ARIMA(avg_flow))

report(pipe2_mod)

Series: avg_flow 
Model: ARIMA(0,0,1)(0,0,1)[24] w/ mean 

Coefficients:
         ma1    sma1  constant
      0.2697  0.1168   39.5858
s.e.  0.0317  0.0311    0.5670

sigma^2 estimated as 161.2:  log likelihood=-3959.02
AIC=7926.04   AICc=7926.08   BIC=7945.67

Code

pipe1_fc <- pipe1_mod |>
  forecast(h="1 week")

pipe1_fc |>
  autoplot(hourly_pipe1) +
  labs(title = "Pipe 1 Week Forecast arima", y = "Average Water Flow", x = "Date")

Code

pipe2_fc <- pipe2_mod |>
  forecast(h="1 week")

pipe2_fc |>
  autoplot(hourly_pipe2) +
  labs(title = "Pipe 2 Week Forecast arima", y = "Average Water Flow", x = "Date")

Code

openxlsx::write.xlsx(data.frame(pipe1_fc), "forecasts_ATM_Banu_pipe1.xlsx")
openxlsx::write.xlsx(data.frame(pipe2_fc), "forecasts_ATM_Banu_pipe2.xlsx")