Part C consists of two data sets. These are simple 2 columns sets, however they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (example of what this looks like, follows). Note for multiple recordings within an hour, take the mean. Then to determine if the data is stationary and can it be forecast. If so, provide a week forward forecast and present results via RPubs and .rmd and the forecast in an Excel readable file.
Note As this optional assignment is mainly focused on data management and determining if it is better to forecast the pipes together or separate, I will be using the auto-arima function.
data_raw_1 <- read_excel ('./data/Waterflow_Pipe1.xlsx', col_names = TRUE, col_types = c("date", "numeric"))
data_raw_2 <- read_excel ('./data/Waterflow_Pipe2.xlsx', col_names = TRUE, col_types = c("date", "numeric"))
colnames(data_raw_1)= c("DateTime", "FLOW")
colnames(data_raw_2)= c("DateTime", "FLOW")
pipe1 <- data_raw_1
pipe1 <- pipe1%>%
mutate(Date = date(DateTime),
Hour = hour(DateTime)+1)%>%
group_by(Date, Hour)%>%
summarise(FLOW = mean(FLOW))%>%
ungroup()%>%
mutate(DateTime = ymd_h(paste(Date, Hour)))%>%
select(DateTime, FLOW)
## `summarise()` regrouping output by 'Date' (override with `.groups` argument)
head(pipe1)
## # A tibble: 6 x 2
## DateTime FLOW
## <dttm> <dbl>
## 1 2015-10-23 01:00:00 26.1
## 2 2015-10-23 02:00:00 18.9
## 3 2015-10-23 03:00:00 15.2
## 4 2015-10-23 04:00:00 23.1
## 5 2015-10-23 05:00:00 15.5
## 6 2015-10-23 06:00:00 22.7
summary(pipe1)
## DateTime FLOW
## Min. :2015-10-23 01:00:00 Min. : 8.923
## 1st Qu.:2015-10-25 11:45:00 1st Qu.:17.033
## Median :2015-10-27 23:30:00 Median :19.784
## Mean :2015-10-27 23:38:53 Mean :19.893
## 3rd Qu.:2015-10-30 11:15:00 3rd Qu.:22.789
## Max. :2015-11-02 00:00:00 Max. :31.730
boxplot(pipe1$FLOW,
ylab = "Flow Rate")
out <- boxplot.stats(pipe1$FLOW)$out
out
## [1] 31.72993
tspipe1 <- ts(pipe1$FLOW, frequency = 24, start = c(2015,1))
grid.arrange(nrow=2,
ggseasonplot(tspipe1),
ggsubseriesplot(tspipe1))
ggtsdisplay(tspipe1)
#ndiffs(tspipe1)
tspipe1 %>% ur.kpss %>% summary
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 4 lags.
##
## Value of test-statistic is: 0.2466
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
tspipe1_lambda <- BoxCox.lambda(tspipe1)
auto_pipe1 <- auto.arima(tspipe1, approximation = FALSE, lambda = tspipe1_lambda)
summary(auto_pipe1)
## Series: tspipe1
## ARIMA(0,0,0) with non-zero mean
## Box Cox transformation: lambda= 0.8414107
##
## Coefficients:
## mean
## 13.4802
## s.e. 0.1718
##
## sigma^2 estimated as 6.993: log likelihood=-563.86
## AIC=1131.73 AICc=1131.78 BIC=1138.66
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.07255518 4.231758 3.331073 -4.680888 18.2215 0.6843644
## ACF1
## Training set 0.02101267
pipe2 <- data_raw_2
summary(pipe2)
## DateTime FLOW
## Min. :2015-10-23 01:00:00 Min. : 1.885
## 1st Qu.:2015-11-02 10:45:00 1st Qu.:28.140
## Median :2015-11-12 20:30:00 Median :39.682
## Mean :2015-11-12 20:30:00 Mean :39.556
## 3rd Qu.:2015-11-23 06:15:00 3rd Qu.:50.782
## Max. :2015-12-03 16:00:00 Max. :78.303
boxplot(pipe2$FLOW,
ylab = "Flow Rate")
out <- boxplot.stats(pipe2$FLOW)$out
out
## numeric(0)
tspipe2 <- ts(pipe2$FLOW, frequency = 24, start = c(2015,1))
grid.arrange(nrow=2,
ggseasonplot(tspipe2),
ggsubseriesplot(tspipe2))
ggtsdisplay(tspipe2)
#ndiffs(tspipe2)
tspipe2 %>% ur.kpss %>% summary
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 7 lags.
##
## Value of test-statistic is: 0.1054
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
tspipe2_lambda <- BoxCox.lambda(tspipe2)
auto_pipe2 <- auto.arima(tspipe2, approximation = FALSE, lambda = tspipe2_lambda)
summary(auto_pipe2)
## Series: tspipe2
## ARIMA(0,0,0)(1,0,0)[24] with non-zero mean
## Box Cox transformation: lambda= 0.9447974
##
## Coefficients:
## sar1 mean
## 0.0861 32.9725
## s.e. 0.0320 0.4525
##
## sigma^2 estimated as 172.1: log likelihood=-3992.06
## AIC=7990.13 AICc=7990.15 BIC=8004.85
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.193946 15.98305 13.05623 -33.92608 56.91807 0.7504193
## ACF1
## Training set -0.02616276
pipe_join <- rbind(pipe1, pipe2)
pipe_join <- pipe_join%>%
mutate(Date = date(DateTime),
Hour = hour(DateTime)+1)%>%
group_by(Date, Hour)%>%
summarise(FLOW = mean(FLOW))%>%
ungroup()%>%
mutate(DateTime = ymd_h(paste(Date, Hour)))%>%
select(DateTime, FLOW)
## `summarise()` regrouping output by 'Date' (override with `.groups` argument)
head(pipe_join)
## # A tibble: 6 x 2
## DateTime FLOW
## <dttm> <dbl>
## 1 2015-10-23 02:00:00 22.5
## 2 2015-10-23 03:00:00 31.0
## 3 2015-10-23 04:00:00 26.6
## 4 2015-10-23 05:00:00 29.6
## 5 2015-10-23 06:00:00 23.7
## 6 2015-10-23 07:00:00 25.5
summary(pipe_join)
## DateTime FLOW
## Min. :2015-10-23 02:00:00 Min. : 1.885
## 1st Qu.:2015-11-02 11:45:00 1st Qu.:26.374
## Median :2015-11-12 21:30:00 Median :35.728
## Mean :2015-11-12 21:30:00 Mean :37.229
## 3rd Qu.:2015-11-23 07:15:00 3rd Qu.:47.131
## Max. :2015-12-03 17:00:00 Max. :77.388
boxplot(pipe_join$FLOW,
ylab = "Flow Rate")
out <- boxplot.stats(pipe_join$FLOW)$out
out
## numeric(0)
tspipeall <- ts(pipe_join$FLOW, frequency = 24, start = c(2015,1))
grid.arrange(nrow=2,
ggseasonplot(tspipeall),
ggsubseriesplot(tspipeall))
ggtsdisplay(tspipeall)
#ndiffs(tspipeall)
tspipeall %>% ur.kpss %>% summary
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 7 lags.
##
## Value of test-statistic is: 2.3359
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
lambda <- BoxCox.lambda(tspipeall)
ggtsdisplay(tspipeall %>% BoxCox(lambda))
ur.kpss(tspipeall %>% BoxCox(lambda) %>% diff()) %>% summary
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 7 lags.
##
## Value of test-statistic is: 0.0053
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ggtsdisplay(diff(tspipeall, 2), points = FALSE)
autopipe_all <- auto.arima(tspipeall, lambda = lambda, approximation = FALSE)
summary(autopipe_all)
## Series: tspipeall
## ARIMA(0,1,1)(1,0,0)[24]
## Box Cox transformation: lambda= -0.2394658
##
## Coefficients:
## ma1 sar1
## -0.9862 0.0548
## s.e. 0.0067 0.0319
##
## sigma^2 estimated as 0.05379: log likelihood=41.56
## AIC=-77.13 AICc=-77.1 BIC=-62.41
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 4.925571 15.58797 12.31813 -12.29636 45.51632 0.7888465
## ACF1
## Training set -0.003024184
pipe_forecast_1 <- forecast(auto_pipe1, h = 7*24)
autoplot(pipe_forecast_1)
pipe_forecast_2 <- forecast(auto_pipe2, h = 7*24)
autoplot(pipe_forecast_2)
pipe_forecast_3 <- forecast(autopipe_all, h = 7*24)
autoplot(pipe_forecast_3)