Data 624 Project 1

library(knitr)
library(rmdformats)

## Global options
options(max.print="85")
opts_chunk$set(cache=TRUE,
               prompt=FALSE,
               tidy=TRUE,
               comment=NA,
               message=FALSE,
               warning=FALSE)
opts_knit$set(width=31)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(tidyr)
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(xts) #extensible time-series
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(tsbox)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.3     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ✓ purrr   0.3.3
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x psych::%+%()             masks ggplot2::%+%()
## x psych::alpha()           masks ggplot2::alpha()
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x xts::first()             masks dplyr::first()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x xts::last()              masks dplyr::last()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
library(tsibble)
## 
## Attaching package: 'tsibble'
## The following object is masked from 'package:zoo':
## 
##     index
## The following objects are masked from 'package:lubridate':
## 
##     interval, new_interval
library(fpp3)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────── fpp3 0.3 ──
## ✓ tsibbledata 0.2.0     ✓ fable       0.2.1
## ✓ feasts      0.1.5
## ── Conflicts ────────────────────────────────────────────────────────────────────────── fpp3_conflicts ──
## x psych::%+%()            masks ggplot2::%+%()
## x psych::alpha()          masks ggplot2::alpha()
## x lubridate::date()       masks base::date()
## x dplyr::filter()         masks stats::filter()
## x xts::first()            masks dplyr::first()
## x fabletools::forecast()  masks forecast::forecast()
## x tsibble::index()        masks zoo::index()
## x tsibble::interval()     masks lubridate::interval()
## x dplyr::lag()            masks stats::lag()
## x xts::last()             masks dplyr::last()
## x tsibble::new_interval() masks lubridate::new_interval()
library(urca)
library(ggseas)
library(zoo)
library(readxl)
library(DT)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale

Part C [Bonus]: Forecasting Water Pipe Flow

Loading data

pipes1 <- read_excel("./Waterflow_Pipe1.xlsx", col_names = TRUE, col_types = c("date", 
    "numeric")) %>% rename(dt = "Date Time", wf = WaterFlow)

pipes2 <- read_excel("./Waterflow_Pipe2.xlsx", col_names = TRUE, col_types = c("date", 
    "numeric")) %>% rename(dt = "Date Time", wf = WaterFlow)
datatable(pipes1, caption = "Table 1: Waterflow Pipe1.", options = list(pageLength = 8, 
    dom = "tip"), rownames = FALSE)
datatable(pipes2, caption = "Table 1: Waterflow Pipe2.", options = list(pageLength = 8, 
    dom = "tip"), rownames = FALSE)

pipes1 has waterflow data recorded as early as Oct 23 2015 and up to Nov 1 2015.

pipes2 has waterflow data recorded hourly from Oct 23 2015 to Dec 3 2015.

As it is noted in the project description, there are some data consolidation work to be done for pipes1 waterflow data. We essentially need to take the mean of all hourly data.

Pipes2

str(pipes2)
tibble [1,000 × 2] (S3: tbl_df/tbl/data.frame)
 $ dt: POSIXct[1:1000], format: "2015-10-23 01:00:00" "2015-10-23 02:00:00" ...
 $ wf: num [1:1000] 18.8 43.1 38 36.1 31.9 ...
pipes2 %>% group_by(Date = date(dt), `Hour of Day` = chron::hours(dt)) %>% tally()
# A tibble: 1,000 x 3
# Groups:   Date [42]
   Date       `Hour of Day`     n
   <date>             <dbl> <int>
 1 2015-10-23             1     1
 2 2015-10-23             2     1
 3 2015-10-23             3     1
 4 2015-10-23             4     1
 5 2015-10-23             5     1
 6 2015-10-23             6     1
 7 2015-10-23             7     1
 8 2015-10-23             8     1
 9 2015-10-23             9     1
10 2015-10-23            10     1
# … with 990 more rows
pipes2 %>% group_by(Date = date(dt)) %>% tally()
# A tibble: 42 x 2
   Date           n
   <date>     <int>
 1 2015-10-23    23
 2 2015-10-24    24
 3 2015-10-25    24
 4 2015-10-26    24
 5 2015-10-27    24
 6 2015-10-28    24
 7 2015-10-29    24
 8 2015-10-30    24
 9 2015-10-31    24
10 2015-11-01    24
# … with 32 more rows
# making a copy of pipes2 into pipe2 just to conform to the same naming
# convention used for pipes1
pipe2 <- pipes2 %>% transmute(Date = dt, WaterFlow = wf)

Confirming that pipes2 is nice and easy, 1 record per hour, 24 records per day with one exception. Oct 23 2015, there are only 23 records.

Pipes1

pipes1 %>% group_by(Date = date(dt), `Hour of Day` = chron::hours(dt)) %>% tally()
# A tibble: 236 x 3
# Groups:   Date [10]
   Date       `Hour of Day`     n
   <date>             <dbl> <int>
 1 2015-10-23             0     4
 2 2015-10-23             1     5
 3 2015-10-23             2     2
 4 2015-10-23             3     5
 5 2015-10-23             4     2
 6 2015-10-23             5     9
 7 2015-10-23             6     8
 8 2015-10-23             7     3
 9 2015-10-23             8     3
10 2015-10-23             9     3
# … with 226 more rows
pipes1 %>% group_by(Date = date(dt)) %>% tally()
# A tibble: 10 x 2
   Date           n
   <date>     <int>
 1 2015-10-23    96
 2 2015-10-24   109
 3 2015-10-25    95
 4 2015-10-26   118
 5 2015-10-27    99
 6 2015-10-28   104
 7 2015-10-29    96
 8 2015-10-30   111
 9 2015-10-31    91
10 2015-11-01    81

Data transformation is needed for pipes1 as there are multiple records within an hour per day.

pipe1 <- pipes1 %>% group_by(Date = date(dt), Hour = hour(dt)) %>% summarise(WaterFlow = mean(wf), 
    .groups = "drop") %>% mutate(DateHour = ymd_h(paste(Date, Hour))) %>% select(DateHour, 
    WaterFlow)

Notice that pipes1 start the ts at 2015-10-23 at hour 0 (midnight) whereas pipes2 begins its time series data of watterflow at 2015-10-23 at hour 1 (1am). Meaning for factual reporting purposes, it’s never a good idea to merge the 2 ts and work off the same time as if we didn’t do this minute difference. Tsclean is not going to do a precise job unless you can start of a tsibble object where you recorded the start of time series data for pipe1 and pipe2 precisely. Until then, you won’t get tsclean to treat the missing value properly.

For the purposes of forecasting, I am not going to spend time doing tsclean on the 2 ts objects.

pipe1ts <- pipe1 %>% as_tsibble(index = DateHour)
pipe1.ts = ts(pipe1$WaterFlow, start = c(2015, 10, 23, 0, 0, 0), freq = 24)

pipe2ts <- pipe2 %>% as_tsibble()
pipe2.ts = ts(pipe2$WaterFlow, start = c(2015, 10, 23, 1, 0, 0), freq = 24)
# str(cbind(pipe1.ts, pipe2.ts))
pipes.ts <- cbind(pipe1.ts, pipe2.ts)

psych::describe(pipes.ts) %>% select(-vars) %>% kable(digits = 2L, caption = "Descriptive Statistics of Waterflow of Pipes", 
    table.attr = "style='width:22%;'") %>% kableExtra::kable_styling(full_width = F)
Descriptive Statistics of Waterflow of Pipes
n mean sd median trimmed mad min max range skew kurtosis se
pipe1.ts 236 19.89 4.24 19.78 19.84 4.32 8.92 31.73 22.81 0.13 -0.17 0.28
pipe2.ts 1000 39.56 16.05 39.68 39.46 16.73 1.88 78.30 76.42 0.03 -0.55 0.51
boxplot(pipe1$WaterFlow, main = sprintf("%s", "Waterflow of Pipes1"), col = "violet", 
    fill = "purple", xlab = sprintf("# of outliers = %d", length(boxplot(pipes1, 
        plot = F)$out)), ylab = "WaterFlow")

boxplot(pipe2$WaterFlow, main = sprintf("%s", "Waterflow of Pipes2"), col = "violet", 
    fill = "purple", xlab = sprintf("# of outliers = %d", length(boxplot(pipes2, 
        plot = F)$out)), ylab = "WaterFlow")

Great. No outliers in both ts of pipes.

Time Series Analysis

lambda1 <- BoxCox.lambda(pipe1.ts)
pipes1.bc <- BoxCox(pipe1.ts, lambda = lambda1)


# time plot of the original pipe1.ts
ggtsdisplay(pipe1.ts, main = "Waterflow of Pipes1", ylab = "Waterflow", xlab = "hour")

# time series display of the Box-Cox transformed series
ggtsdisplay(pipes1.bc, main = "Waterflow of Pipes1", ylab = "Transformed Waterflow", 
    xlab = "hour")

I’m going to keep the BoxCox transformation

# Number of differences required for a stationary series
ndiffs(pipes1.bc)
[1] 0
# Number of differences required for a seasonally stationary series
nsdiffs(pipes1.bc)
[1] 0
pipes1.bc %>% ur.kpss %>% summary

####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 4 lags. 

Value of test-statistic is: 0.2445 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

The test-statistic is not in critical region.

Both PACF suggested that we have 2 autoregressive terms. There is no moving average term neeed.

By visualizing the PACF and ACF, I came up with a manual order of the arima model ARIMA(p,d,q)(P,D,Q)m of p = 2, P = 0, q = 0, Q = 0, d= 0, D= 0 where m = 24.

pipe1_arima <- Arima(pipe1.ts, order = c(2, 0, 0), seasonal = c(0, 0, 0), lambda = lambda1, 
    , biasadj = TRUE)
pipe1_arima
Series: pipe1.ts 
ARIMA(2,0,0) with non-zero mean 
Box Cox transformation: lambda= 0.8414107 

Coefficients:
         ar1      ar2     mean
      0.0270  -0.0375  13.4800
s.e.  0.0652   0.0651   0.1698

sigma^2 estimated as 7.038:  log likelihood=-563.62
AIC=1135.24   AICc=1135.41   BIC=1149.09
summary(pipe1_arima)
Series: pipe1.ts 
ARIMA(2,0,0) with non-zero mean 
Box Cox transformation: lambda= 0.8414107 

Coefficients:
         ar1      ar2     mean
      0.0270  -0.0375  13.4800
s.e.  0.0652   0.0651   0.1698

sigma^2 estimated as 7.038:  log likelihood=-563.62
AIC=1135.24   AICc=1135.41   BIC=1149.09

Training set error measures:
                       ME     RMSE      MAE       MPE     MAPE      MASE
Training set 0.0005288162 4.227096 3.320638 -5.046513 18.22338 0.6822205
                     ACF1
Training set -0.003222213

Manual Arima:

ARIMA(2,0,0) with non-zero mean 

Accuracy Metrics:

AICc=1135.41

RMSE: 4.227096
pipe1_auto.arima <- pipe1.ts %>% auto.arima(lambda = lambda1, biasadj = T)

summary(pipe1_auto.arima)
Series: . 
ARIMA(0,0,0) with non-zero mean 
Box Cox transformation: lambda= 0.8414107 

Coefficients:
         mean
      13.4802
s.e.   0.1718

sigma^2 estimated as 6.993:  log likelihood=-563.86
AIC=1131.73   AICc=1131.78   BIC=1138.66

Training set error measures:
                       ME     RMSE      MAE       MPE     MAPE      MASE
Training set 0.0004107926 4.231136 3.332445 -5.061922 18.29474 0.6846463
                   ACF1
Training set 0.02101267

Auto.arima also produces the same order, i.e., ARIMA(0,0,0) with non-zero mean.

Accuracy Metrics:

AICc=1131.78

RMSE: 4.231136
forecast(pipe1_auto.arima, h = 168) %>% autoplot

Unfortunately, I don’t think the above way of doing forecast is the right way. It is just a simple exponential smoothing.

Let’s change gear to use STL methods. Forecasting out a week, with frequency = 24.

# stlf models
stlf.ets <- stlf(pipe1.ts, lambda = lambda1, method = "ets", h = 168)



autoplot(pipe1.ts) + autolayer(stlf.ets, PI = FALSE, series = "STLF-ETS") + labs(title = "STLF forecast using ETS", 
    x = "DateHour (Supposed to be)", y = "Waterflow")

# ts converts its index to numeric. A numeric based index is not suitable for the
# requirement for POSIXct datetime labels, which limits my ability to fix the
# x-axis after hours of trying. Going the route of converting everything to
# dataframe would not work as easily as autolayer is needed for automatically
# plotting the object stlf.ets.  tsibble -> dataframe -> ggplot (does not work
# well with stlf.ets)

# head(pipe1) tail(pipe1) min = as.POSIXct('2015-10-23 00:00:00 CET') max =
# as.POSIXct('2015-11-01 23:00:00 CET') pipe1.ts.coerced <- as.ts(pipe1ts)
# pipe1_df <- tsbox::ts_df(pipe1.ts.coerced) pipe.ts_new <- ts(pipe1.ts,
# start=c(2015,10,23,0,0,0), freq = 24) autoplot(pipe1.ts) + autolayer(stlf.ets,
# PI = FALSE, series = 'STLF-ETS')+ scale_x_datetime(breaks = seq(min, max, '1
# hour'), labels = date_format('%a-%d\n%H'), limits = c(min, max)) + xlab('')
# scale_x_datetime(breaks = seq(min, max, '1 hour'), labels =
# date_format('%a-%d\n%H'), limits = c(min, max)) + labs(title = 'STL forecast
# using ETS', x = 'Hour', y = 'Waterflow')

Repeating the steps for Pipes2

lambda2 <- BoxCox.lambda(pipe2.ts)
pipes2.bc <- BoxCox(pipe2.ts, lambda = lambda2)


# time plot of the original pipe2.ts
ggtsdisplay(pipe2.ts, main = "Waterflow of Pipes2", ylab = "Waterflow", xlab = "hour")

# time series display of the Box-Cox transformed series
ggtsdisplay(pipes2.bc, main = "Waterflow of Pipes2", ylab = "Transformed Waterflow", 
    xlab = "hour")

Since we’re going to default to ets. I am going to skip the manual arima and auto.arima steps just knowing that the eventual forecast will be just a simple exponential smoothing, which does us no good.

# stlf models
stlf2.ets <- stlf(pipe2.ts, lambda = lambda2, method = "ets", h = 168)



autoplot(pipe2.ts) + autolayer(stlf2.ets, PI = FALSE, series = "STLF-ETS") + labs(title = "STLF forecast using ETS", 
    x = "DateHour (Supposed to be)", y = "Waterflow")

autoplot(pipe2.ts) + autolayer(stlf2.ets, PI = FALSE, series = "STLF-ETS") + labs(title = "STLF forecast using ETS (close-up)", 
    x = "DateHour (Supposed to be)", y = "Waterflow") + coord_cartesian(xlim = c(2053, 
    2171))

Final Forecast

  • Pipes1: ETS( ) with a BoxCox transformation of \(\lambda = 0.84\).

  • Pipes2: ETS( ) with a BoxCox transformation of \(\lambda = 0.94\).

date.pipe1 = seq(ymd_hms("2015-11-01 24:00:00"), ymd_hms("2015-11-08 23:00:00"), 
    by = as.difftime(hours(1)))
date.pipe2 = seq(ymd_hms("2015-12-03 17:00:00"), ymd_hms("2015-12-10 16:00:00"), 
    by = as.difftime(hours(1)))

# openxlsx::write.xlsx(data.frame('Date.Time' = date.pipe1, 'WaterFlow' =
# stlf.ets$mean), 'pipes1_forecasts.xlsx')
# openxlsx::write.xlsx(data.frame('Date.Time' = date.pipe2, 'WaterFlow' =
# stlf2.ets$mean), 'pipes2_forecasts.xlsx')