Author

Banu

Published

October 22, 2024

Code
#Load Packages
library(arrow)
library(dplyr)
library(ggplot2)
library(reshape2)
library(tidymodels)
library(tidyr)
library(broom)
library(patchwork)
library(lubridate)
library(rpart.plot)
library(kableExtra)
library(DataExplorer)
library(skimr)
#install.packages("ranger")
#install.packages("openxlsx")
library(DescTools)
library(corrplot)
library(ggcorrplot)
library(caret)
library(rpart)
library(car)
library(rattle)
library(ROSE)
library(mice)
library(MASS)
library(fable)
library(imputeTS)
library(psych)
library(forecast)
library(openxlsx)
library(plotly)
library(naniar)
library(cowplot)
library(fpp3)

Project

Deliverable

This project consists of 3 parts - two required and one bonus.

Part A – ATM Forecast, ATM624Data.xlsx

Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

Part C – BONUS, optional (part or all), Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx

PART A Forecasting Analysis

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010.  The data is given in a single file.  The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward.   I am being somewhat ambiguous on purpose to make this have a little more business feeling.  Explain and demonstrate your process, techniques used and not used, and your actual forecast.  I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file  Also please submit the forecast which you will put in an Excel readable file.

Data Exploratory Analysis

Code
library(readxl)
# inputf1 <- read.csv("C:/Users/Banu/Documents/RScriptfiles/DATA 624/PROJECT 1/ATM624Data.csv",stringsAsFactors = F,header=TRUE)

inputf1 <- readxl::read_excel("C:/Users/Banu/Documents/RScriptfiles/DATA 624/PROJECT 1/ATM624Data.xlsx")


#Glimpse variables
introduce(inputf1)
# A tibble: 1 × 9
   rows columns discrete_columns continuous_columns all_missing_columns
  <int>   <int>            <int>              <int>               <int>
1  1474       3                1                  2                   0
# ℹ 4 more variables: total_missing_values <int>, complete_rows <int>,
#   total_observations <int>, memory_usage <dbl>
Code
plot_intro(inputf1)

Code
glimpse(inputf1)
Rows: 1,474
Columns: 3
$ DATE <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
$ ATM  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
$ Cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …
Code
inputf1$DATE <- as.Date(as.numeric(inputf1$DATE),origin = "1899-12-30")
#sorted by date
inputf1 <- inputf1[order(inputf1$DATE),]
inputf1$ATM <- as.factor(inputf1$ATM)


#display
knitr::kable(inputf1[1:10, 1:3], 
             caption = 'ATM data',
             format="html", 
             table.attr="style='width:50%;'") %>% 
  kableExtra::kable_styling()
ATM data
DATE ATM Cash
2009-05-01 ATM1 96.0000
2009-05-01 ATM2 107.0000
2009-05-01 ATM3 0.0000
2009-05-01 ATM4 776.9934
2009-05-02 ATM1 82.0000
2009-05-02 ATM2 89.0000
2009-05-02 ATM3 0.0000
2009-05-02 ATM4 524.4180
2009-05-03 ATM1 85.0000
2009-05-03 ATM2 90.0000
Code
knitr::kable(skim(inputf1), 
             format="html") %>% 
  kableExtra::kable_styling()
skim_type skim_variable n_missing complete_rate Date.min Date.max Date.median Date.n_unique factor.ordered factor.n_unique factor.top_counts numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
Date DATE 0 1.0000000 2009-05-01 2010-05-14 2009-11-01 379 NA NA NA NA NA NA NA NA NA NA NA
factor ATM 14 0.9905020 NA NA NA NA FALSE 4 ATM: 365, ATM: 365, ATM: 365, ATM: 365 NA NA NA NA NA NA NA NA
numeric Cash 19 0.9871099 NA NA NA NA NA NA NA 155.582 376.4568 0 0.5 73 114 10919.76 ▇▁▁▁▁

Review missing variables

ATM 4 as more cash withdrawals.ATM 3 only has few data points near the end of the year, could be a new machine.ATM 4 Has one usually high outlier so we may have to impute this. Remove values that have both ATM and cash has NA. Then convert the data to tsibble.

Code
# Plot cash taken out 
plot1 <- inputf1 %>%
  group_by(ATM) %>%
  plot_ly(x= ~DATE) %>%
  add_lines(y= ~Cash, color = ~factor(ATM)) %>%
  layout(title = "ATM Withdwals by Date")

plot1
Code
#Look at missing values
plot_missing(inputf1)

Code
knitr::kable(miss_var_summary(inputf1), 
             caption = 'Missing data',
             format="html", 
             table.attr="style='width:50%;'") %>% 
  kableExtra::kable_styling()
Missing data
variable n_miss pct_miss
Cash 19 1.29
ATM 14 0.950
DATE 0 0
Code
gg_miss_upset(inputf1)

Code
# summarize the missing data by ATM
inputf1 %>% 
  group_by(as.factor(ATM)) %>%
  summarise(`Missing Values` = sum(is.na(Cash)))
# A tibble: 5 × 2
  `as.factor(ATM)` `Missing Values`
  <fct>                       <int>
1 ATM1                            3
2 ATM2                            2
3 ATM3                            0
4 ATM4                            0
5 <NA>                           14
Code
# Drop any row that are missing both ATM and Cash values
tsibbleATMclean <-   inputf1%>%
  filter(!(is.na(ATM) & is.na(Cash))) %>%
  drop()
# 
# #Create a Tsibble object
#   #tsibbleATM<- read_excel("ATM624DATA.xlsx") %>%
#   # converting into date format
#   tsibbleATMclean
#   
#   %>% mutate(Date = as.Date(as.numeric(DATE),origin = "1899-12-30"),ATM= as.factor(ATM),Cash = as.numeric(Cash)) %>%
#   # renaming column name
#   rename(Date = DATE) %>%
#   # converting to tsibble
# tsibbleATM<-  tsibbleATMclean %>%  mutate(DATE=DATE,ATM=ATM,Cash=Cash) %>%
# as_tsibble(index = Date, key = ATM) 

tsibbleATM <- as_tsibble(tsibbleATMclean, index=DATE, key=ATM)
  
# Confirm 14 rows were dropped
print(c('Original: ', nrow(inputf1), 'After Drop:', nrow(tsibbleATMclean)))
[1] "Original: "  "1474"        "After Drop:" "1460"       
Code
tsibbleATM %>%
  autoplot(Cash) +
  facet_wrap(~ATM, scales = "free", nrow = 4) +
  labs(title = " ATM Before Outlier Removal")

Code
 summary(tsibbleATM)
      DATE              ATM           Cash        
 Min.   :2009-05-01   ATM1:365   Min.   :    0.0  
 1st Qu.:2009-07-31   ATM2:365   1st Qu.:    0.5  
 Median :2009-10-30   ATM3:365   Median :   73.0  
 Mean   :2009-10-30   ATM4:365   Mean   :  155.6  
 3rd Qu.:2010-01-29              3rd Qu.:  114.0  
 Max.   :2010-04-30              Max.   :10919.8  
                                 NA's   :5        

Review distributions

Let’s take a look at the distributions for the other three ATMs that have steady cash withdrawals first.

Code
plot1 <- tsibbleATM |>
  filter(ATM != "ATM3") |>
  ggplot(aes(x=Cash)) +
  geom_histogram(aes(fill=ATM), bins=30) +
  facet_wrap(~ATM, nrow=1, scales = "free_x") +
  theme(legend.position="none") +
  labs(title = "Histograms of ATM Cash Withdrawals", y = "Count", x = "Cash (Hundreds of $)")

plot2 <- tsibbleATM |>
  filter(ATM != "ATM3") |>
  ggplot(aes(x=Cash)) +
  geom_boxplot(aes(fill=ATM)) +
  facet_wrap(~ATM, nrow=1, scales = "free_x") +
  theme(legend.position="none") +
  labs(title = "Boxplots of ATM Cash Withdrawals", y = "", x = "Cash (Hundreds of $)")

plot_grid(plot1, plot2, ncol=1)

Impute missing values

ATM1 is missing 3 values for Cash. The data appears to be bimodally distributed as can be seen from the boxplot, we will use median imputation for this ATM.

ATM2 is missing 2 values. ATM2 appears to be bimodally distributed with about equal observations in each peak. We will use mode imputation for this ATM.

We will use median imputation to replace the large outlier in ATM4.

Code
# Various NA plots to inspect data
knitr::kable(miss_var_summary(tsibbleATM), 
             caption = 'Missing Values',
             format="html", 
             table.attr="style='width:50%;'") %>% 
  kableExtra::kable_styling()
Missing Values
variable n_miss pct_miss
Cash 5 0.342
DATE 0 0
ATM 0 0
Code
tsibbleATM |>
  filter(is.na(Cash)) |>
  knitr::kable()
DATE ATM Cash
2009-06-13 ATM1 NA
2009-06-16 ATM1 NA
2009-06-22 ATM1 NA
2009-06-18 ATM2 NA
2009-06-24 ATM2 NA
Code
# function for mode
mode <- function(x) {
   uniq_x <- unique(x)
   uniq_x[which.max(tabulate(match(x, uniq_x)))]
}

tsibbleATM_imputed <- tsibbleATM |>
  mutate(Cash = ifelse(Cash == max(tsibbleATM$Cash, na.rm = T), NA, Cash)) |>
  group_by(ATM) |>
  mutate(Cash = case_when(
    is.na(Cash) & ATM %in% c("ATM1", "ATM4") ~ median(Cash, na.rm=T),
    is.na(Cash) & ATM == "ATM2" ~ mode(Cash),
    TRUE ~ Cash
  ))

#no missing
tsibbleATM_imputed |>
  filter(is.na(Cash)) |>
  knitr::kable()
DATE ATM Cash
Code
tsibbleATM_imputed |>
  autoplot(Cash) +
  facet_wrap(~ATM, ncol=1, scales = "free_y") +
  labs(title = "ATM Withdrawal with imputed for Four ATMs (May 2009 - April 2010)") 

ATM1 and ATM2 both require seasonal differencing to make the data stationary. So, we can check differencing with a lag 7 to see what happens.The spikes on the ACF are still higher than we want in a residual after differencing.

Code
tsibbleATM_imputed |>
  ACF(Cash) |>
  autoplot() +
  facet_wrap(~ATM, ncol=1, scales = "free_y") +
  labs(title = "ACF Plots after imputation") 

Code
tsibbleATM_imputed |>
  features(Cash, unitroot_kpss) |>
  knitr::kable() 
ATM kpss_stat kpss_pvalue
ATM1 0.4651358 0.0495190
ATM2 1.9660900 0.0100000
ATM3 0.3891606 0.0818273
ATM4 0.1180573 0.1000000
Code
tsibbleATM_imputed |>
  features(Cash, unitroot_ndiffs)
# A tibble: 4 × 2
  ATM   ndiffs
  <fct>  <int>
1 ATM1       1
2 ATM2       1
3 ATM3       0
4 ATM4       0
Code
# atm_finimp |>
#   features(Cash, unitroot_kpss) |>
#   knitr::kable()
# 
# atm_finimp |>
#   features(Cash, unitroot_ndiffs)

tsibbleATM_imputed  |>
  features(difference(Cash,7), unitroot_ndiffs)
# A tibble: 4 × 2
  ATM   ndiffs
  <fct>  <int>
1 ATM1       0
2 ATM2       0
3 ATM3       0
4 ATM4       0
Code
tsibbleATM_imputed |>
  filter(ATM == "ATM1") |>
  gg_tsdisplay(difference(Cash, lag=7), 
               plot_type="partial") +
  labs(title = "ATM1: Seasonally Differenced")

Code
tsibbleATM_imputed |>
  filter(ATM == "ATM2") |>
  gg_tsdisplay(Cash, 
               plot_type="partial") +
  labs(title = "ATM2")

Code
tsibbleATM_imputed |>
  filter(ATM == "ATM4") |>
  gg_tsdisplay(difference(Cash, lag=7), 
               plot_type="partial") +
  labs(title = "ATM4: Seasonally Differenced")

Code
# df = with(ATM.df, ATM.df[(DATE >= "2009-07-01" & DATE <= "2009-08-31"), ])
ts = ggplot(tsibbleATM_imputed, aes(x = DATE, y = Cash, group = ATM, color = ATM)) +
  geom_line() + geom_point() +
  labs(title = "ATM Cash Withdrawal", subtitle ="1 July, 2009 to 31 August, 2009",
       x = "Date") +
  scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K")) +
  theme_minimal() #+ transition_reveal(DATE)

ts

TimeSeries Models

Code
# Transform the dataframe into a time series
temp = tsibbleATM_imputed %>% drop_na() %>% spread(ATM, Cash)
ATM.ts = ts(temp %>% dplyr::select(-DATE), frequency = 7)

ATM1 Subseries using TS

From the time series plot, we can see seasonality in this set, weekly as previously reviewed. While there is no steady trend over the weeks, there are increasing and decreasing activities over periods. The ACF shows slow decrease in every 7th lags due to the trend.Plotting subseries to show the variations during the week

Code
# ggtsdisplay(ATM.ts[,1], main = "ATM #1 Cash Withdrawal", ylab = "Cash Withdrawal (in hundreds)", xlab = "Week")

ggsubseriesplot(ATM.ts[,1]) +
  labs(title = "ATM #1 Cash Withdrawal", subtitle ="1 May, 2009 to 30 April, 2010",
       x = "Days of the Week") +
  scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K"))

Code
df_atm_1 <- tsibbleATM_imputed |>

  filter(ATM == "ATM1")
df_atm_1 %>%
  ACF(Cash)%>%
  autoplot()

ATM 1 Transformation boxcox using TS class

We can try lambda to stabilize the variance similar to the above PART A analysis. The AICc is 2339.304 for the ETS model.

Code
bct = function(ts, title){
  a = autoplot(ts) +
    labs(title = sprintf("Before: %s", title), x = "Week") +
    scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K"))
  lambda = BoxCox.lambda(ts)
  at = autoplot(BoxCox(ts, lambda)) +
    labs(title = sprintf("Transformed: %s", title),
         subtitle = sprintf("lambda = %0.2f", lambda),
         y = "Box-Cox Transformed",
         x = "Week")
  gridExtra::grid.arrange(a, at)
}
lambda = BoxCox.lambda(ATM.ts[,1])
bct(ATM.ts[,1], title = "ATM #1 Cash Withdrawal")

Code
ATM.ts[,1] %>% stl(s.window = 7, robust = TRUE) %>%
  autoplot()  +
  labs(title = "Seasonal & Trend Decomposition for ATM #1 Time Series", x = "Week")

Code
t<- ATM.ts[,1] %>% ets(lambda = lambda, biasadj = TRUE)
t
ETS(A,N,A) 

Call:
ets(y = ., lambda = lambda, biasadj = TRUE)

  Box-Cox transformation: lambda= 0.2446 

  Smoothing parameters:
    alpha = 1e-04 
    gamma = 0.3551 

  Initial states:
    l = 7.5451 
    s = -4.2496 0.846 0.4385 0.5304 1.1743 0.8669
           0.3935

  sigma:  1.2697

     AIC     AICc      BIC 
2338.683 2339.304 2377.682 
Code
t %>% autoplot()  +
  labs(title = "ETS for ATM #1 Time Series", x = "Week")

ATM 1 STL and ETS and holtwinter methods using tsibble

I have tried some seasonal methods such as STL and holtwinter since the above data has seasonality and stationarity to it.

Code
fit_dcmp <- df_atm_1  |>
  model(stlf = decomposition_model(
    STL(Cash ~ trend(window = 7), robust = TRUE),
    NAIVE(season_adjust)
  ))
fit_dcmp |>
  forecast() |>
  autoplot(df_atm_1)+
  labs(y = "cash withdrawal",
       title = "ATM1 STL")

Code
fit_dcmp |> gg_tsresiduals()

Code
augment(fit_dcmp)
# A tsibble: 365 x 7 [1D]
# Key:       ATM, .model [1]
   ATM   .model DATE        Cash .fitted  .resid  .innov
   <fct> <chr>  <date>     <dbl>   <dbl>   <dbl>   <dbl>
 1 ATM1  stlf   2009-05-01    96    NA   NA      NA     
 2 ATM1  stlf   2009-05-02    82    NA   NA      NA     
 3 ATM1  stlf   2009-05-03    85    NA   NA      NA     
 4 ATM1  stlf   2009-05-04    90    NA   NA      NA     
 5 ATM1  stlf   2009-05-05    99    NA   NA      NA     
 6 ATM1  stlf   2009-05-06    88    NA   NA      NA     
 7 ATM1  stlf   2009-05-07     8    NA   NA      NA     
 8 ATM1  stlf   2009-05-08   104   100.   3.62    3.62  
 9 ATM1  stlf   2009-05-09    87    90.4 -3.43   -3.43  
10 ATM1  stlf   2009-05-10    93    93.0  0.0293  0.0293
# ℹ 355 more rows
Code
fc <- fit_dcmp  |>
  forecast(h = 24)

fc  |>
autoplot(
    df_atm_1,level=NULL)+
labs(y = "cash withdrawal",
       title = "ATM1 STL forecast")

Code
fit3 <- df_atm_1 |>
  model(
    additive = ETS(Cash ~ error("A") + trend("A") +
                                                season("A")),
    multiplicative = ETS(Cash ~ error("M") + trend("A") +
                                                season("M"))
  )
fc3 <- fit3 |> forecast(h = 24)
fc3 |>
  autoplot(df_atm_1, level = NULL) +
  labs(title="ATM1 ETS",
       y="cash withdrawal") +
  guides(colour = guide_legend(title = "ATM1 STL"))

Code
#augment(fit3)

fit2 <- df_atm_1 %>%
  filter(DATE <= "2010-04-30") |>
  model(
    hw = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
  ) |>
  forecast(h = "2 weeks") |>
  autoplot(df_atm_1 |> filter(DATE <= "2010-05-14")) +
  labs(title = "ATM1 Holt Winter",
       y="cash withdrawal")

fit2a <- df_atm_1 %>%
  filter(DATE <= "2010-04-30") |>
  model(
    hw = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
  )
fit2

Code
report(fit2a)
Series: Cash 
Model: ETS(M,Ad,M) 
  Smoothing parameters:
    alpha = 0.1646764 
    beta  = 0.0001000035 
    gamma = 0.3619369 
    phi   = 0.9760372 

  Initial states:
     l[0]      b[0]      s[0]     s[-1]    s[-2]    s[-3]    s[-4]     s[-5]
 82.93848 -0.917707 0.2975043 0.9715482 1.231799 1.047188 1.103285 0.9785181
    s[-6]
 1.370158

  sigma^2:  0.1368

     AIC     AICc      BIC 
4573.270 4574.307 4623.969 
Code
fit2b <- fit2a %>% forecast(h = "2 weeks")
fit2b
# A fable: 14 x 5 [1D]
# Key:     ATM, .model [1]
   ATM   .model DATE               Cash  .mean
   <fct> <chr>  <date>           <dist>  <dbl>
 1 ATM1  hw     2010-05-01   N(83, 951)  83.4 
 2 ATM1  hw     2010-05-02  N(95, 1277)  95.1 
 3 ATM1  hw     2010-05-03   N(70, 704)  69.6 
 4 ATM1  hw     2010-05-04  N(5.1, 3.9)   5.12
 5 ATM1  hw     2010-05-05 N(100, 1527)  99.6 
 6 ATM1  hw     2010-05-06   N(78, 973)  78.4 
 7 ATM1  hw     2010-05-07  N(84, 1141)  83.8 
 8 ATM1  hw     2010-05-08  N(84, 1455)  84.0 
 9 ATM1  hw     2010-05-09  N(96, 1936)  95.9 
10 ATM1  hw     2010-05-10  N(70, 1059)  70.1 
11 ATM1  hw     2010-05-11  N(5.2, 5.8)   5.16
12 ATM1  hw     2010-05-12 N(100, 2262) 100.  
13 ATM1  hw     2010-05-13  N(79, 1431)  79.0 
14 ATM1  hw     2010-05-14  N(84, 1666)  84.4 
Code
#augment(fit2a)

fit2c <- df_atm_1 %>%
  filter(DATE <= "2010-04-30") |>
  model(
    hw = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
  ) %>% augment() 

autoplot(fit2c, .innov) +
  labs(y = "$US",
       title = "Residuals from the HW method")

Code
fit2c  |>
  ggplot(aes(x = .innov)) +
  geom_histogram() +
  labs(title = "Histogram of HW residuals")

Code
fit2c  |>
  ACF(.innov) |>
  autoplot() +
  labs(title = "Residuals from the HW method")

Code
components(fit2a) |>
  autoplot() +
  labs(title = "ATM1 Holtwinter")

Code
components(fit3) |>
  autoplot() +
  labs(title = "ATM1 ETS, additive and multiplicative")

ATM 1 ARIMA

We can use seasonal ARIMA modeling to forecast future withdrawals for each ATM.

Let’s use the April 2010 data as a validation set and the rest of the data for training.

We will use an automatically generated ARIMA model and compare with our assumptions from the ACF and PACF plots.

Code
atm1_mod <- df_atm_1 |>
  model(auto = ARIMA(Cash),
        arima001011 = ARIMA(Cash ~ 0 + pdq(0,0,1) + PDQ(0,1,1, period=7)))

atm1_mod |>
  knitr::kable()
ATM auto arima001011
ATM1 <ARIMA(0,0,1)(0,1,2)[7]> <ARIMA(0,0,1)(0,1,1)[7]>
Code
report(atm1_mod) |>
  dplyr::select(.model:BIC) |>
  knitr::kable()
.model sigma2 log_lik AIC AICc BIC
auto 559.7744 -1641.115 3290.231 3290.344 3305.753
arima001011 564.7218 -1643.061 3292.121 3292.189 3303.763
Code
augment(atm1_mod) |>
  filter(.model == "auto") |>
  features(.innov, ljung_box, lag=14)
# A tibble: 1 × 4
  ATM   .model lb_stat lb_pvalue
  <fct> <chr>    <dbl>     <dbl>
1 ATM1  auto      16.8     0.268
Code
augment(atm1_mod) |>
  filter(.model == "arima001011") |>
  features(.innov, ljung_box, lag=14)
# A tibble: 1 × 4
  ATM   .model      lb_stat lb_pvalue
  <fct> <chr>         <dbl>     <dbl>
1 ATM1  arima001011    19.5     0.147
Code
auto.arima(df_atm_1) 
Series: df_atm_1 
ARIMA(0,0,1)(0,1,2)[7] 

Coefficients:
         ma1     sma1     sma2
      0.1695  -0.5867  -0.0989
s.e.  0.0553   0.0508   0.0497

sigma^2 = 559.8:  log likelihood = -1641.12
AIC=3290.23   AICc=3290.34   BIC=3305.75
Code
df_atm_1 %>%

  model(auto = ARIMA(Cash ~ 0 + pdq(0,0,1) + PDQ(0,1,2, period=7)))%>%
       
  residuals() %>% gg_tsdisplay()

Code
fc_atm_1A <- atm1_mod %>%
  forecast(h= 31)

fc_atm_1A %>%
  autoplot(df_atm_1 , level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM1 Forecast in May 2010")

ATM2 Subseries, lambda transformation and SNAIVE, ETS

Code
ggsubseriesplot(ATM.ts[,2]) +
  labs(title = "ATM #2 Cash Withdrawal", subtitle ="1 May, 2009 to 30 April, 2010",
       x = "Days of the Week") +
  scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K"))

Code
df_atm_2 <- tsibbleATM_imputed |>

  filter(ATM == "ATM2")
df_atm_2 %>%
  ACF(Cash)%>%
  autoplot()

Code
lambda_2 <- df_atm_2%>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)

fit_atm_2_a <- df_atm_2%>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_2)),
        ETS = ETS((box_cox(Cash, lambda_2) ~ error("A") + trend("A") +
                                                season("A"))))

fit_atm_2_b <- df_atm_2%>%
  model(ETS = ETS((box_cox(Cash, lambda_2) ~ error("A") + trend("A") +
                                                season("A"))))
fit_atm_2_c <- df_atm_2%>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_2)))

fc_atm_2 <- fit_atm_2_a %>%
  forecast(h= 31)

fc_atm_2%>%
  autoplot(df_atm_2, level = 80) +
  labs(y = "",
       title = latex2exp::TeX(paste0(
         "Transformed gas production with $\\lambda$ = ",
         round(lambda_2,2))))

Code
fit_atm_2_b |> gg_tsresiduals()

Code
fit_atm_2_c |> gg_tsresiduals()

Code
augment(fit_atm_2_a)%>%
  features(.innov, ljung_box, lag = 7)
# A tibble: 2 × 4
  ATM   .model lb_stat lb_pvalue
  <fct> <chr>    <dbl>     <dbl>
1 ATM2  ETS       24.5  0.000934
2 ATM2  SNAIVE    92.9  0       

ATM2 ARIMA

Code
atm2 <- df_atm_2 |>
  filter(ATM == "ATM2")

atm2_mod <- atm2 |>
  model(auto = ARIMA(Cash),
        arima000011 = ARIMA(Cash ~ 0 + pdq(0,0,0) + PDQ(0,1,1, period=7)),
        arima000012 = ARIMA(Cash ~ 0 + pdq(0,0,0) + PDQ(0,1,2, period=7)))

atm2_mod |>
  knitr::kable()
ATM auto arima000011 arima000012
ATM2 <ARIMA(2,0,2)(0,1,1)[7]> <ARIMA(0,0,0)(0,1,1)[7]> <ARIMA(0,0,0)(0,1,2)[7]>
Code
report(atm2_mod) |>
  dplyr::select(.model:BIC) |>
  knitr::kable()
.model sigma2 log_lik AIC AICc BIC
auto 588.0783 -1649.266 3310.531 3310.771 3333.814
arima000011 620.7961 -1660.382 3324.764 3324.798 3332.525
arima000012 622.0567 -1660.237 3326.473 3326.541 3338.115
Code
augment(atm2_mod)%>%
  features(.innov, ljung_box, lag = 7)
# A tibble: 3 × 4
  ATM   .model      lb_stat lb_pvalue
  <fct> <chr>         <dbl>     <dbl>
1 ATM2  arima000011   21.7    0.00283
2 ATM2  arima000012   21.6    0.00293
3 ATM2  auto           6.12   0.526  
Code
fc_atm_2A <- atm2_mod %>%
  forecast(h= 31)

fc_atm_2A %>%
  autoplot(df_atm_2 , level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM1 Forecast in May 2010")

Code
auto.arima(df_atm_2) 
Series: df_atm_2 
ARIMA(2,0,2)(0,1,1)[7] 

Coefficients:
          ar1      ar2     ma1     ma2     sma1
      -0.4215  -0.9097  0.4549  0.7936  -0.7489
s.e.   0.0551   0.0435  0.0849  0.0591   0.0387

sigma^2 = 588.1:  log likelihood = -1649.27
AIC=3310.53   AICc=3310.77   BIC=3333.81
Code
df_atm_2 %>%

  model(auto = ARIMA(Cash ~ 0 + pdq(2,0,2) + PDQ(0,1,1, period=7)))%>%
       
  residuals() %>% gg_tsdisplay()

ATM3

Code
ggsubseriesplot(ATM.ts[,3]) +
  labs(title = "ATM #3 Cash Withdrawal", subtitle ="1 May, 2009 to 30 April, 2010",
       x = "Days of the Week") +
  scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K"))

ATM4 Subseries, lambda transformation , ETS and SNAIVE

Code
ggtsdisplay(ATM.ts[,4], main = "ATM #4 Cash Withdrawal", ylab = "Cash Withdrawal (in hundreds)", xlab = "Week")

Code
ggsubseriesplot(ATM.ts[,4]) +
  labs(title = "ATM #4 Cash Withdrawal", subtitle ="1 May, 2009 to 30 April, 2010",
       x = "Days of the Week") +
  scale_y_continuous("Amount of Cash Withdrawal",
                     labels = scales::dollar_format(scale = 0.1, suffix = "K"))

Code
df_atm_4 <- tsibbleATM_imputed |>
  filter(ATM == "ATM4")

lambda_4 <- df_atm_4 %>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)

fit_atm_4_a <- df_atm_4%>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_4)),
        ETS = ETS((box_cox(Cash, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))

fit_atm_4_b <- df_atm_4%>%
  model(ETS = ETS((box_cox(Cash, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))
fit_atm_4_c <- df_atm_4%>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_4)))


fc_atm_4 <- fit_atm_4_a %>%
  forecast(h= 31)

fc_atm_4%>%
  autoplot(df_atm_4, level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM4 Forecast in May 2010")

ATM4 ARIMA

Code
atm4 <- tsibbleATM_imputed |>
  filter(ATM == "ATM4")

atm4_mod <- atm4 |>
  model(auto = ARIMA(Cash),
        arima000100 = ARIMA(Cash ~ 0 + pdq(0,0,0) + PDQ(1,0,0, period=7)))

atm4_mod |>
  knitr::kable()
ATM auto arima000100
ATM4 <ARIMA(3,0,2)(1,0,0)[7] w/ mean> <ARIMA(0,0,0)(1,0,0)[7]>
Code
report(atm4_mod) |>
  dplyr::select(.model:BIC) |>
  knitr::kable()
.model sigma2 log_lik AIC AICc BIC
auto 117510.1 -2645.176 5306.353 5306.757 5337.552
arima000100 171792.3 -2719.439 5442.878 5442.911 5450.678
Code
fc_atm_4A <- atm4_mod %>%
  forecast(h= 31)

fc_atm_4A %>%
  autoplot(atm4 , level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM4 Forecast in May 2010")

Code
auto.arima(atm4) 
Series: atm4 
ARIMA(0,0,3)(1,0,0)[7] with non-zero mean 

Coefficients:
         ma1     ma2     ma3    sar1      mean
      0.0874  0.0161  0.0886  0.1838  444.6095
s.e.  0.0532  0.0583  0.0516  0.0526   26.0987

sigma^2 = 119377:  log likelihood = -2648.96
AIC=5309.93   AICc=5310.16   BIC=5333.33
Code
atm4 %>%

  model(auto = ARIMA(Cash ~ 0 + pdq(3,0,2) + PDQ(1,0,0, period=7)))%>%
       
  residuals() %>% gg_tsdisplay()

Forecast

Let’s do ARIMA forecast and NAIVE for the ATM3 model and show forecast plots below.

Code
atm_train <- tsibbleATM_imputed |>
  filter(DATE < "2010-04-01")

atms_mod <- atm_train |>
  filter(ATM != "ATM3") |>
  model(ARIMA(Cash))

fc <- atms_mod |>
  forecast(h=61)
Code
atm3 <- tsibbleATM_imputed |>
  filter(ATM == "ATM3")

atm3_mod <- atm3 |>
  model(NAIVE(Cash))

atm3_fc <- atm3_mod |>
  forecast(h=31)

atm3_fc%>%
  autoplot(atm3, level = 80) +
  labs(y = "",
       title = 
         "ATM3 with NAIVE ",
         )

Code
fc <- fc |>
  bind_rows(atm3_fc)

fc |>
  autoplot(tsibbleATM_imputed)

Forecast - Plot residuals

White we have tried several types of models. We have proceeded with ARIMA as our final forecasts.

Code
atms_mod |>
  filter(ATM == "ATM1") |>
  gg_tsresiduals() +
  labs(title = "ATM1")

Code
atms_mod |>
  filter(ATM == "ATM2") |>
  gg_tsresiduals() +
  labs(title = "ATM2")

Code
atms_mod |>
  filter(ATM == "ATM4") |>
  gg_tsresiduals() +
  labs(title = "ATM4")

Code
train4 <- atm_train|>
  filter(ATM == "ATM4")

lambda_4 <- train4 %>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)

fit_atm_4_a <- train4 %>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_4)),
        ETS = ETS((box_cox(Cash, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))

fit_atm_4_b <- train4 %>%
  model(ETS = ETS((box_cox(Cash, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))
fit_atm_4_c <- train4 %>%
  model(SNAIVE = SNAIVE(box_cox(Cash, lambda_4)))


fc_atm_4 <- fit_atm_4_a %>%
  forecast(h= 31)

fc_atm_4%>%
  autoplot(train4 , level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM4 Forecast in May 2010")

Code
fit_atm_4_c |>
  
  gg_tsresiduals() +
  labs(title = "ATM4")

Code
fit_atm_4_b |>
  
  gg_tsresiduals() +
  labs(title = "ATM4")

Code
fc_atm_4b <- fit_atm_4_b %>%
  forecast(h= 61)

fc_atm_4b%>%
  autoplot(train4 , level = 80) +
  labs(y = "Cash in hundreds of dollars",
       title = "ATM4 Forecast in May 2010")

Forecast - Write to file final forecast

Write out Arima for the forecasts. For ATM4, the model for ETS was better on review, therefore I have forecast ETS for ATM4.

Code
may_forecasts <- fc |>
  filter(DATE >= "2010-05-01")

may_forecasts 
# A fable: 124 x 5 [1D]
# Key:     ATM, .model [4]
   ATM   .model      DATE              Cash .mean
   <fct> <chr>       <date>          <dist> <dbl>
 1 ATM1  ARIMA(Cash) 2010-05-01  N(91, 899)  90.7
 2 ATM1  ARIMA(Cash) 2010-05-02 N(114, 899) 114. 
 3 ATM1  ARIMA(Cash) 2010-05-03  N(70, 899)  70.0
 4 ATM1  ARIMA(Cash) 2010-05-04  N(23, 899)  23.5
 5 ATM1  ARIMA(Cash) 2010-05-05 N(105, 899) 105. 
 6 ATM1  ARIMA(Cash) 2010-05-06  N(86, 955)  86.4
 7 ATM1  ARIMA(Cash) 2010-05-07  N(91, 957)  91.1
 8 ATM1  ARIMA(Cash) 2010-05-08  N(91, 957)  90.7
 9 ATM1  ARIMA(Cash) 2010-05-09 N(114, 957) 114. 
10 ATM1  ARIMA(Cash) 2010-05-10  N(70, 957)  70.0
# ℹ 114 more rows
Code
may_forecasts_ets <- fc_atm_4b|>
  filter(DATE >= "2010-05-01")
may_forecasts_ets
# A fable: 31 x 5 [1D]
# Key:     ATM, .model [1]
   ATM   .model DATE                Cash .mean
   <fct> <chr>  <date>            <dist> <dbl>
 1 ATM4  ETS    2010-05-01 t(N(25, 101))  516.
 2 ATM4  ETS    2010-05-02 t(N(25, 102))  531.
 3 ATM4  ETS    2010-05-03 t(N(25, 102))  506.
 4 ATM4  ETS    2010-05-04 t(N(24, 102))  461.
 5 ATM4  ETS    2010-05-05 t(N(23, 102))  434.
 6 ATM4  ETS    2010-05-06 t(N(12, 103))  160.
 7 ATM4  ETS    2010-05-07 t(N(26, 103))  561.
 8 ATM4  ETS    2010-05-08 t(N(25, 103))  516.
 9 ATM4  ETS    2010-05-09 t(N(25, 103))  531.
10 ATM4  ETS    2010-05-10 t(N(25, 103))  506.
# ℹ 21 more rows
Code
autoplot(may_forecasts_ets)

Code
openxlsx::write.xlsx(data.frame(may_forecasts), "forecasts_ATM_Banu_ARIMA.xlsx")
openxlsx::write.xlsx(data.frame(may_forecasts_ets), "forecasts_ATM_Banu_ETSATM4.xlsx")

Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013.  Your assignment is to model these data and a monthly forecast for 2014.  The data is given in a single file.  The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward.    Add this to your existing files above. 

Data Exploration & Plots

First we start reviewing the data and identify any missing variables. The data consists 192 rows and 3 columns. CaseSequence is redundant and can be removed. The date column (YYYY.MMM) is coded as a character. For timeseries analysis we will mutate and create a tsibble with the date after formatting. September 2008 is missing the recorded KWH.

Code
set.seed(2024)
Filepower <- readxl::read_excel("C:/Users/Banu/Documents/RScriptfiles/DATA 624/PROJECT 1/ResidentialCustomerForecastLoad-624.xlsx")


#Glimpse variables
introduce(Filepower)
# A tibble: 1 × 9
   rows columns discrete_columns continuous_columns all_missing_columns
  <int>   <int>            <int>              <int>               <int>
1   192       3                1                  2                   0
# ℹ 4 more variables: total_missing_values <int>, complete_rows <int>,
#   total_observations <int>, memory_usage <dbl>
Code
plot_intro(Filepower)

Code
glimpse(Filepower)
Rows: 192
Columns: 3
$ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
$ `YYYY-MMM`   <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
$ KWH          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
Code
describe(Filepower)
             vars   n      mean         sd    median   trimmed        mad
CaseSequence    1 192     828.5      55.57     828.5     828.5      71.16
YYYY-MMM*       2 192      96.5      55.57      96.5      96.5      71.16
KWH             3 191 6502474.6 1447570.89 6283324.0 6439474.9 1543073.77
                min      max   range skew kurtosis        se
CaseSequence    733      924     191 0.00    -1.22      4.01
YYYY-MMM*         1      192     191 0.00    -1.22      4.01
KWH          770523 10655730 9885207 0.17     0.45 104742.55
Code
skim(Filepower)
Data summary
Name Filepower
Number of rows 192
Number of columns 3
_______________________
Column type frequency:
character 1
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
YYYY-MMM 0 1 8 8 0 192 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CaseSequence 0 1.00 828.5 55.57 733 780.75 828.5 876.25 924 ▇▇▇▇▇
KWH 1 0.99 6502474.6 1447570.89 770523 5429912.00 6283324.0 7620523.50 10655730 ▁▁▇▅▁
Code
#Look at missing values

knitr::kable(miss_var_summary(Filepower), 
             caption = 'Missing data',
             format="html", 
             table.attr="style='width:50%;'") %>% 
  kableExtra::kable_styling()
Missing data
variable n_miss pct_miss
KWH 1 0.521
CaseSequence 0 0
YYYY-MMM 0 0
Code
Filepower_ts <- Filepower |>
  rename(Date = "YYYY-MMM") |>
  mutate(Date = yearmonth(Date)) |>
  dplyr::select(-CaseSequence) |>
  as_tsibble(index=Date)

Filepower_ts |>
  filter(is.na(KWH)) |>
  knitr::kable()
Date KWH
2008 Sep NA

Data Distributions

The line plot shows seasonality of the series. The data follows a clear seasonal trend. There is more power usage in the beginning, middle, and end of the year. There is an outlier point outside of the range during July 2010. We will update the outlier point to NA so we can impute it. The seasonal trend also shows a pattern with usage higher in the beginning, middle and end of the year.

Code
plot1 <- Filepower_ts |>
  autoplot(KWH) +
  labs(title = "Lineplot of Power Usage", x = "Date") +
  scale_y_continuous(labels=scales::comma)

plot2 <- Filepower_ts |>
  ggplot(aes(x = KWH)) +
  geom_histogram(bins=15, fill='lightpink') +
  scale_x_continuous(labels=scales::comma) +
  labs(title = "Histogram of KWH")

plot3 <- Filepower_ts |>
  ggplot(aes(x = KWH)) +
  geom_boxplot(fill='lightpink') +
  scale_x_continuous(labels=scales::comma) +
  labs(title = "Boxplot of KWH")

plot_grid(plot1, plot_grid(plot2, plot3, nrow=1), ncol=1)

Code
#Identify the outlier
Filepower_ts |>
  filter(KWH < 3000000)
# A tsibble: 1 x 2 [1M]
      Date    KWH
     <mth>  <dbl>
1 2010 Jul 770523
Code
outlierpt <- which(Filepower_ts$Date == yearmonth("2010 Jul"))
Filepower_ts$KWH[outlierpt] = NA

Filepower_ts |>
  filter(is.na(KWH)) |>
  knitr::kable()
Date KWH
2008 Sep NA
2010 Jul NA
Code
Filepower_ts |>
  gg_lag(KWH, geom="point", lags = 1:12)

Code
Filepower_ts |>
  gg_season(KWH) +
  scale_y_continuous(labels=scales::comma) +
  labs(title = "Seasonal Decomposition of KWH", x = "Month")

Code
Filepower_ts |>
  features(difference(KWH, lag=12), unitroot_kpss) |>
  knitr::kable()
kpss_stat kpss_pvalue
0.1323771 0.1

Data Imputation

To impute the missing values, we can use the mean of the same month the 2 years prior. Given the right-skewed nature of the histogram, we can perform a box-cox transformation to reduce the skewness.

Code
set.seed(2024)


imputejul <- Filepower_ts |>
  filter(Date %in% c("2008 Jul", "2009 Jul"))

imputesep <- Filepower_ts |>
  filter(Date %in% c("2006 Sep", "2007 Sep")) 

# impute
Filepower_ts <- Filepower_ts |>
  mutate(KWH = ifelse(Date == yearmonth("2010 Jul"), mean(imputejul$KWH), KWH),
         KWH = ifelse(Date == yearmonth("2008 Sep"), mean(imputesep$KWH), KWH))


ggplot(Filepower_ts, aes(x = KWH)) +
  geom_histogram() +
  labs(title = "Histogram of KWH")

Code
Filepower_ts %>% 
  plot_ly(x= ~year(Date)) %>%
  add_lines(y= ~KWH) %>%
  layout(title = "Power Usage",
         xaxis = list(title="Date"),
         yaxis = list(title="KWH"))
Code
Filepower_ts |>
  autoplot(KWH) +
  labs(title = "Lineplot of Power Usage", x = "Date") +
  scale_y_continuous(labels=scales::comma)

Code
knitr::kable(miss_var_summary(Filepower_ts), 
             caption = 'Missing data',
             format="html", 
             table.attr="style='width:50%;'") %>% 
  kableExtra::kable_styling()
Missing data
variable n_miss pct_miss
Date 0 0
KWH 0 0

Models

In light of the observed seasonality,we can try some seasonal methods such as Arima. Here we can conver the dataframe into time series and see the ACF plot. The spikes show outside of the range, indicates it is not white noise and not-normal.

Code
power_ts <- ts(Filepower_ts$KWH, start=1998, frequency = 12)
Acf(power_ts)

Code
ggtsdisplay(power_ts, main="Time series KWH before transformation")

Code
Filepower_ts%>%
  ACF(KWH) %>%
  autoplot() +
  labs(title="KWH consumption before transformation ACF")

The aim now is to find an appropriate ARIMA model based on the ACF and PACF. Both the ACF and PACF plots are sinusoidal decaying and there is a large significant spike at lag 12. There are smaller significant spikes.

We can use auto.arima() function to determine the appropriate seasonal AR and MA models. Also I have used lambda to see if we can tranform the data to minimize the variance.

Train Models

Code
set.seed(2024)

Filepower_ts |>
  gg_tsdisplay(KWH) +
  labs(title = "Power Usage")

Code
Filepower_ts |>
  gg_tsdisplay(difference(KWH, lag=12), 
               plot_type="partial") +
  labs(title = "Power Usage: Seasonally Differenced")

Code
Filepower_ts|>
  features(difference(KWH, lag=12), unitroot_kpss) |>
kable()
kpss_stat kpss_pvalue
0.1623246 0.1
Code
Filepower_ts |>
  gg_subseries(KWH) +
  labs(title = "Power Usage")

Code
lambda <- Filepower_ts |>
  features(KWH, features = guerrero) |>
  pull(lambda_guerrero)
Filepower_ts  |>
  autoplot(box_cox(KWH, lambda)) +
  labs(y = "",
       title = latex2exp::TeX(paste0(
         "Transformed KWH $\\lambda$ = ",
         round(lambda,2))))

Code
Filepower_ts |>
  model(
    classical_decomposition(KWH, type = "additive")
  ) |>
  components() |>
  autoplot() +
  labs(title = "Classical additive decomposition of total
                  KWH")

Code
Filepower_ts %>% 
  model(
    STL(KWH ~ season(window = "periodic"),
                  
    robust = TRUE)) |>
  components() |>
  autoplot()

Predictions and Evaluation

Here I have tried some seasonal methods such as holt winter and damp holt winter. I have also plotted the histogram showing that for the large part the models show a normal distribution bell curve on the residual, however, there is one large spike on the ACF. We can proceed to look at ARIMA to see how it does by reviewing the AICc value, we can proceed with the lowest AICc on the ARIMA model and write out our forecast.

Code
Filepower_ts %>% auto.arima(lambda = lambda, biasadj = TRUE)
Series: . 
ARIMA(0,0,1)(2,1,0)[12] with drift 
Box Cox transformation: lambda= -0.2048968 

Coefficients:
         ma1     sar1     sar2  drift
      0.2483  -0.7212  -0.3555  1e-04
s.e.  0.0832   0.0751   0.0770  1e-04

sigma^2 = 1.505e-05:  log likelihood = 751.73
AIC=-1493.47   AICc=-1493.12   BIC=-1477.5
Code
Filepower_ts |>
  model(
    holt = ETS(KWH ~ error("A") +
                       trend("A") + season("A")),
    damp = ETS(KWH ~ error("M") +
                       trend("Ad") + season("M"))
  ) |>
  forecast(h = 24) |>
  autoplot(Filepower_ts, level = NULL) +
  labs(title = "Power generation",
       y = "KWH") +
  guides(colour = guide_legend(title = "Forecast holt winters and damp holt winter"))

Code
count_gaps(Filepower_ts)
# A tibble: 0 × 3
# ℹ 3 variables: .from <mth>, .to <mth>, .n <int>
Code
t <- Filepower_ts |>
  model(
    holt = ETS(KWH ~ error("A") +
                       trend("A") + season("A")),
    damp = ETS(KWH ~ error("M") +
                       trend("Ad") + season("M"))
  )

t |> pivot_longer(everything(), names_to = "Model name",
                     values_to = "Orders")
# A mable: 2 x 2
# Key:     Model name [2]
  `Model name`        Orders
  <chr>              <model>
1 holt          <ETS(A,A,A)>
2 damp         <ETS(M,Ad,M)>
Code
t |>  dplyr::select(holt) |> gg_tsresiduals(lag=24)

Code
t |>  dplyr::select(damp ) |> gg_tsresiduals(lag=24)

ARIMA models

The auto arima model <ARIMA(0,0,1)(2,1,0)[12] w/ drift> seems to have the lowest AICc value, hence we will select this model and store our forecast. Additionally, the residuals on this model seem better, however, that the auto arima model and the scores of AICc and BIC are lower as well. ARIMA(0,0,1)(0,1,1)[12] with drift.

Code
power_mods <- 
Filepower_ts |>
  filter(year(Date) < 2012) |>
  model(auto = ARIMA(KWH),
        arima000011 = ARIMA(KWH ~ 0 + pdq(0,0,0) + PDQ(0,1,1, period=12)),
        arima000110 = ARIMA(KWH ~ 0 + pdq(0,0,0) + PDQ(1,1,0, period=12)),
        arima001011 = ARIMA(KWH ~ 0 + pdq(0,0,1) + PDQ(0,1,1, period=12)),
        arima001110 = ARIMA(KWH ~ 0 + pdq(0,0,1) + PDQ(1,1,0, period=12)),
        arima001011drift = ARIMA(KWH ~ 1 + pdq(0,0,1) + PDQ(0,1,1, period=12)),
        arima001110drift = ARIMA(KWH ~ 1 + pdq(0,0,1) + PDQ(1,1,0, period=12)))

power_mods |> pivot_longer(everything(), names_to = "Model name",
                     values_to = "Orders")
# A mable: 7 x 2
# Key:     Model name [7]
  `Model name`                                 Orders
  <chr>                                       <model>
1 auto             <ARIMA(0,0,1)(2,1,0)[12] w/ drift>
2 arima000011               <ARIMA(0,0,0)(0,1,1)[12]>
3 arima000110               <ARIMA(0,0,0)(1,1,0)[12]>
4 arima001011               <ARIMA(0,0,1)(0,1,1)[12]>
5 arima001110               <ARIMA(0,0,1)(1,1,0)[12]>
6 arima001011drift <ARIMA(0,0,1)(0,1,1)[12] w/ drift>
7 arima001110drift <ARIMA(0,0,1)(1,1,0)[12] w/ drift>
Code
report(power_mods) |>
  dplyr::select(.model:BIC) |>
  knitr::kable()
.model sigma2 log_lik AIC AICc BIC
auto 358748449455 -2298.737 4607.474 4607.874 4622.723
arima000011 473934089563 -2320.148 4644.297 4644.375 4650.396
arima000110 491517415472 -2322.441 4648.882 4648.961 4654.982
arima001011 385874805250 -2304.657 4615.313 4615.471 4624.463
arima001110 429148017595 -2311.241 4628.482 4628.640 4637.632
arima001011drift 356469559457 -2299.458 4606.915 4607.180 4619.115
arima001110drift 418376486644 -2308.873 4625.747 4626.011 4637.946
Code
#power_mods |> summary(auto)

power_mods |>
  forecast(h="3 years") |>
  autoplot(Filepower_ts |> filter(year(Date) > 2010), level= NULL) +
  labs(title = "KWH Forecast Models ARIMA") +
  scale_y_continuous(labels=scales::comma)

Code
power_mods |>
  dplyr::select(auto) |>
  gg_tsresiduals() +
  labs(title = "Auto ARIMA")

Code
power_mods |>
  dplyr::select(arima001011drift) |>
  gg_tsresiduals() +
  labs(title = "ARIMA(0,0,1)(0,1,1)[12] with drift")

Code
fc_2014 <- power_mods |>
dplyr::select(auto) |>
  forecast(h="3 years") |>
  filter(year(Date)  == 2014)

fc_2014a <- power_mods |>
dplyr::select(arima001011drift) |>
  forecast(h="3 years") |>
  filter(year(Date)  == 2014)

fc_2014b <- power_mods |>
dplyr::select(arima001011drift) |>
  forecast(h="3 years") |>
  filter(year(Date)  == 2014)

fc_2014
# A fable: 12 x 4 [1M]
# Key:     .model [1]
   .model     Date                 KWH    .mean
   <chr>     <mth>              <dist>    <dbl>
 1 auto   2014 Jan N(9082193, 5.1e+11) 9082193.
 2 auto   2014 Feb N(8825149, 5.2e+11) 8825149.
 3 auto   2014 Mar N(6827872, 5.2e+11) 6827872.
 4 auto   2014 Apr   N(6e+06, 5.2e+11) 6028873.
 5 auto   2014 May N(5679457, 5.2e+11) 5679457.
 6 auto   2014 Jun N(7723045, 5.2e+11) 7723045.
 7 auto   2014 Jul N(9507586, 5.2e+11) 9507586.
 8 auto   2014 Aug N(9795634, 5.2e+11) 9795634.
 9 auto   2014 Sep N(8816531, 5.2e+11) 8816531.
10 auto   2014 Oct   N(6e+06, 5.2e+11) 6007628.
11 auto   2014 Nov N(6060119, 5.2e+11) 6060119.
12 auto   2014 Dec N(7943802, 5.2e+11) 7943802.
Code
fc_2014a 
# A fable: 12 x 4 [1M]
# Key:     .model [1]
   .model               Date                 KWH    .mean
   <chr>               <mth>              <dist>    <dbl>
 1 arima001011drift 2014 Jan N(8922240, 4.9e+11) 8922240.
 2 arima001011drift 2014 Feb N(8223524, 4.9e+11) 8223524.
 3 arima001011drift 2014 Mar N(6801798, 4.9e+11) 6801798.
 4 arima001011drift 2014 Apr   N(6e+06, 4.9e+11) 5959830.
 5 arima001011drift 2014 May N(5557517, 4.9e+11) 5557517.
 6 arima001011drift 2014 Jun N(7198250, 4.9e+11) 7198250.
 7 arima001011drift 2014 Jul N(8709822, 4.9e+11) 8709822.
 8 arima001011drift 2014 Aug N(9089526, 4.9e+11) 9089526.
 9 arima001011drift 2014 Sep N(8429024, 4.9e+11) 8429024.
10 arima001011drift 2014 Oct N(6118739, 4.9e+11) 6118739.
11 arima001011drift 2014 Nov N(5664756, 4.9e+11) 5664756.
12 arima001011drift 2014 Dec N(7291513, 4.9e+11) 7291513.
Code
dataframe1 <- data.frame (fc_2014a)

Write final forecast to a file

In conclusion :

From the book we looked at several models after checking seasonality, stationarity,looked at residuals, ran holt winter, ARIMA models and finally made ARIMA model selection and wrote the forecast to the file.

Definition from the book.

“Transformations such as logarithms can help to stabilise the variance of a time series. Differencing can help stabilise the mean of a time series by removing changes in the level of a time series, and therefore eliminating (or reducing) trend and seasonality.

A stationary time series is one whose properties do not depend on the time at which the series is observed. Thus, time series with trends, or with seasonality, are not stationary — the trend and seasonality will affect the value of the time series at different times. On the other hand, a white noise series is stationary — it does not matter when you observe it, it should look much the same at any point in time.”

Code
date = seq(as.Date('2014-01-01'), as.Date('2014-12-01'), by = 'month') %>% format('%Y-%b')
openxlsx::write.xlsx(data.frame(YYYY.MM = date, KWH = fc_2014a$.mean), "forecasts_Banu.xlsx")

Part C – Bonus

Part C consists of two data sets.  These are simple 2 columns sets, however they have different time stamps.  Your optional assignment is to time-base sequence the data and aggregate based on hour (example of what this looks like, follows).  Note for multiple recordings within an hour, take the mean.  Then to determine if the data is stationary and can it be forecast.  If so, provide a week forward forecast and present results via Rpubs and .rmd and the forecast in an Excel readable file.   

Code
# Load the Excel data
pipe1_df <- readxl::read_excel('C:/Users/Banu/Documents/RScriptfiles/DATA 624/PROJECT 1/Waterflow_Pipe1.xlsx')
colnames(pipe1_df) <- c('DATE', 'WaterFlow')


pipe2_df <- readxl::read_excel('C:/Users/Banu/Documents/RScriptfiles/DATA 624/PROJECT 1/Waterflow_Pipe2.xlsx')
colnames(pipe2_df) <- c('DATE', 'WaterFlow')

glimpse(pipe1_df)
Rows: 1,000
Columns: 2
$ DATE      <dbl> 42300.02, 42300.03, 42300.04, 42300.04, 42300.06, 42300.06, …
$ WaterFlow <dbl> 23.369599, 28.002881, 23.065895, 29.972809, 5.997953, 15.935…

Let’s format the Date.Time columns to actual date types.

Code
waterpipe1_df <- pipe1_df |>
  mutate(Date.Time = as.POSIXct(pipe1_df$DATE * 86400, origin = "1899-12-30", tz="GMT"))

waterpipe2_df <- pipe2_df |>
  mutate(Date.Time = as.POSIXct(pipe2_df$DATE * 86400, origin = "1899-12-30", tz="GMT"))

Look at missing data , aggregate the mean or average flow and look at hourly data.

Code
head(waterpipe1_df)
# A tibble: 6 × 3
    DATE WaterFlow Date.Time          
   <dbl>     <dbl> <dttm>             
1 42300.     23.4  2015-10-23 00:24:06
2 42300.     28.0  2015-10-23 00:40:02
3 42300.     23.1  2015-10-23 00:53:51
4 42300.     30.0  2015-10-23 00:55:40
5 42300.      6.00 2015-10-23 01:19:17
6 42300.     15.9  2015-10-23 01:23:58
Code
head(waterpipe2_df)
# A tibble: 6 × 3
    DATE WaterFlow Date.Time          
   <dbl>     <dbl> <dttm>             
1 42300.      18.8 2015-10-23 01:00:00
2 42300.      43.1 2015-10-23 01:59:59
3 42300.      38.0 2015-10-23 03:00:00
4 42300.      36.1 2015-10-23 04:00:00
5 42300.      31.9 2015-10-23 04:59:59
6 42300.      28.2 2015-10-23 06:00:00
Code
knitr::kable(miss_var_summary(waterpipe1_df), 
             caption = 'Pipe 1 Missing Data',
             format="html", 
             table.attr="style='width:60%;'") %>% 
  kableExtra::kable_styling()
Pipe 1 Missing Data
variable n_miss pct_miss
DATE 0 0
WaterFlow 0 0
Date.Time 0 0
Code
knitr::kable(miss_var_summary(waterpipe2_df), 
             caption = 'Pipe 1 Missing Data',
             format="html", 
             table.attr="style='width:60%;'") %>% 
  kableExtra::kable_styling()
Pipe 1 Missing Data
variable n_miss pct_miss
DATE 0 0
WaterFlow 0 0
Date.Time 0 0
Code
hourly_pipe1 <- waterpipe1_df |>
  mutate(Date.Time = floor_date(Date.Time, "hour")) |>
  group_by(Date.Time) |>
  summarize(avg_flow = mean(WaterFlow, na.rm=T)) |>
  as_tsibble(index=Date.Time)

hourly_pipe2 <- waterpipe2_df |>
  mutate(Date.Time = floor_date(Date.Time, "hour")) |>
  group_by(Date.Time) |>
  summarize(avg_flow = mean(WaterFlow, na.rm=T)) |>
  as_tsibble(index=Date.Time)

hourly_pipe1 |>
  features(avg_flow, unitroot_kpss) |>
  knitr::kable()
kpss_stat kpss_pvalue
0.2465587 0.1
Code
# hourly_pipe1 |>
#  ACF(avg_flow) |>
#   autoplot() 

Check for gaps and interpolate

This time series has gaps or missing values, therefore we will use na_interpolation to fill gaps and plot the ACF. The ACF shows one significant spike.

Code
count_gaps(hourly_pipe1)
# A tibble: 4 × 3
  .from               .to                    .n
  <dttm>              <dttm>              <int>
1 2015-10-27 17:00:00 2015-10-27 17:00:00     1
2 2015-10-28 00:00:00 2015-10-28 00:00:00     1
3 2015-11-01 05:00:00 2015-11-01 05:00:00     1
4 2015-11-01 09:00:00 2015-11-01 09:00:00     1
Code
count_gaps(hourly_pipe2)
# A tibble: 333 × 3
   .from               .to                    .n
   <dttm>              <dttm>              <int>
 1 2015-10-23 02:00:00 2015-10-23 02:00:00     1
 2 2015-10-23 05:00:00 2015-10-23 05:00:00     1
 3 2015-10-23 08:00:00 2015-10-23 08:00:00     1
 4 2015-10-23 11:00:00 2015-10-23 11:00:00     1
 5 2015-10-23 14:00:00 2015-10-23 14:00:00     1
 6 2015-10-23 17:00:00 2015-10-23 17:00:00     1
 7 2015-10-23 20:00:00 2015-10-23 20:00:00     1
 8 2015-10-23 23:00:00 2015-10-23 23:00:00     1
 9 2015-10-24 02:00:00 2015-10-24 02:00:00     1
10 2015-10-24 05:00:00 2015-10-24 05:00:00     1
# ℹ 323 more rows
Code
pipes1_tsb <- hourly_pipe1 %>%
  fill_gaps(.full=TRUE) %>%
  na_interpolation(option="spline")

pipes2_tsb <- hourly_pipe2%>%
  fill_gaps(.full=TRUE) %>%
  na_interpolation(option="spline")


pipes1_tsb |> ACF(avg_flow) |>
  autoplot() 

Code
pipes2_tsb |> ACF(avg_flow) |>
  autoplot() 

Plot pipe1

Code
plot1 <- hourly_pipe1 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  autoplot(avg_flow) +
  labs(title = "Pipe 1: Hourly Water Flow", x = "Date", y = "Average Water Flow")

plot2 <- hourly_pipe1 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  ACF(avg_flow) |>
  autoplot() +
  labs(title = "ACF")

plot_grid(plot1, plot2, ncol=1)

Plot pipe2

Code
plot1 <- hourly_pipe2 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  autoplot(avg_flow) +
  labs(title = "Pipe 2: Hourly Water Flow", x = "Date", y = "Average Water Flow")

plot2 <- hourly_pipe2 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  ACF(avg_flow) |>
  autoplot() +
  labs(title = "ACF")

plot_grid(plot1, plot2, ncol=1)

Check Stationarity

I have checked ome ways to check differencing and then looking at the features to see the effects.

From the book: “Differencing can help stabilise the mean of a time series by removing changes in the level of a time series, and therefore eliminating (or reducing) trend and seasonality.A stationary time series is one whose properties do not depend on the time at which the series is observed.17 Thus, time series with trends, or with seasonality, are not stationary — the trend and seasonality will affect the value of the time series at different times. On the other hand, a white noise series is stationary — it does not matter when you observe it, it should look much the same at any point in time.”

Code
pipes2_tsb |> ACF(difference(avg_flow)) |>
  autoplot() + labs(subtitle = "Hourly Water Flow Pipe 2")

Code
pipes2_tsb |>
  features(avg_flow, unitroot_ndiffs)
# A tibble: 1 × 1
  ndiffs
   <int>
1      0
Code
pipes2_tsb |>
  mutate(diff_avg_flow = difference(avg_flow)) |>
  features(diff_avg_flow, unitroot_kpss)
# A tibble: 1 × 2
  kpss_stat kpss_pvalue
      <dbl>       <dbl>
1   0.00806         0.1
Code
t<- pipes2_tsb |>
  mutate(diff_avg_flow = difference(avg_flow)) 

t %>%
  gg_tsdisplay(diff_avg_flow)

Boxcox,ETS,SNAIVE

This time, the p-value is reported as 0.1 (and so it could be larger than that). We can conclude that the differenced data appear stationary.

Code
lambda_5 <- pipes2_tsb %>%
  features(avg_flow, features = guerrero) %>%
  pull(lambda_guerrero)


fit_atm_4_a <-  pipes2_tsb %>%
  model(SNAIVE = SNAIVE(box_cox(avg_flow, lambda_5)),
        ETS = ETS((box_cox(avg_flow, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))

fit_atm_4_b <- pipes2_tsb%>%
  model(ETS = ETS((box_cox(avg_flow, lambda_4) ~ error("A") + trend("A") +
                                                season("A"))))
fit_atm_4_c <- pipes2_tsb %>%
  model(SNAIVE = SNAIVE(box_cox(avg_flow, lambda_4)))


fc_atm_4 <- fit_atm_4_a %>%
  forecast(h= "1 week")

fc_atm_4%>%
  autoplot(pipes2_tsb , level = 80) +
  labs(y = "Hourly Water Flow",
       title = "Hourly Water Flow Pipe 2 with boxcox and SNAIVE, ETS")

Code
fit_atm_4_b |>
  
  gg_tsresiduals() +
  labs(title = "Hourly Water Flow boxcox ETS")

Code
fit_atm_4_c |>
  
  gg_tsresiduals() +
  labs(title = "Hourly Water Flow boxcox SNAIVE")

Code
fc_atm_4b <- fit_atm_4_b %>%
  forecast(h = "1 week")

fc_atm_4c <- fit_atm_4_c %>%
  forecast(h= "1 week")

fc_atm_4b%>%
  autoplot(pipes2_tsb , level = 80) +
  labs(y = "Hourly Water Flow Pipe 2 with boxcox and ETS",
       title = "Hourly Water Flow Pipe 2 with boxcox and ETS")

Code
fc_atm_4c%>%
  autoplot(pipes2_tsb , level = 80) +
  labs(y = "Hourly Water Flow Pipe 2 with boxcox SNAIVE",
       title = "Hourly Water Flow Pipe 2 with boxcox and SNAIVE")

Forecast ARIMA

Code
pipe1_mod <- hourly_pipe1 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  model(ARIMA(avg_flow))

report(pipe1_mod)
Series: avg_flow 
Model: ARIMA(0,0,0) w/ mean 

Coefficients:
      constant
       19.8585
s.e.    0.2743

sigma^2 estimated as 18.14:  log likelihood=-687.79
AIC=1379.59   AICc=1379.64   BIC=1386.55
Code
pipe2_mod <- hourly_pipe2 |>
  fill_gaps() |>
  fill(avg_flow, .direction="down") |>
  model(ARIMA(avg_flow))

report(pipe2_mod)
Series: avg_flow 
Model: ARIMA(0,0,1)(0,0,1)[24] w/ mean 

Coefficients:
         ma1    sma1  constant
      0.2697  0.1168   39.5858
s.e.  0.0317  0.0311    0.5670

sigma^2 estimated as 161.2:  log likelihood=-3959.02
AIC=7926.04   AICc=7926.08   BIC=7945.67
Code
pipe1_fc <- pipe1_mod |>
  forecast(h="1 week")

pipe1_fc |>
  autoplot(hourly_pipe1) +
  labs(title = "Pipe 1 Week Forecast arima", y = "Average Water Flow", x = "Date")

Code
pipe2_fc <- pipe2_mod |>
  forecast(h="1 week")

pipe2_fc |>
  autoplot(hourly_pipe2) +
  labs(title = "Pipe 2 Week Forecast arima", y = "Average Water Flow", x = "Date")

Code
openxlsx::write.xlsx(data.frame(pipe1_fc), "forecasts_ATM_Banu_pipe1.xlsx")
openxlsx::write.xlsx(data.frame(pipe2_fc), "forecasts_ATM_Banu_pipe2.xlsx")