Time-base Sequence - Project 1 Part C

There are simple 2 columns sets, however they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (example of what this looks like, follows). Note for multiple recordings within an hour, take the mean. Then to test appropriate assumptions and forecast a week forward with confidence bands (80 and 95%). Add these to your existing files above – clearly labeled.

Merge, Join and clean data into a single hourly timeseries with weighted avg waterflows

# Read CSV into R
#path<-paste0(getwd(),'/')
#file1<-'Waterflow_Pipe1.csv'
#file2<-'Waterflow_Pipe2.csv'
#data1 <- read.csv(paste0(path,file1), header=TRUE, sep=",")
#data2 <- read.csv(paste0(path,file2), header=TRUE, sep=",")
url1<-"https://raw.githubusercontent.com/scottkarr/IS624-HW-Project1-PartC/master/Waterflow_Pipe1.csv"
url2<-"https://raw.githubusercontent.com/scottkarr/IS624-HW-Project1-PartC/master/Waterflow_Pipe2.csv"
data1 <- read.csv(url1, header=TRUE, sep=",")
data2 <- read.csv(url2, header=TRUE, sep=",")

# Convert to dataframe
df_wf1<-as.data.frame(data1)
df_wf1<-df_wf1 %>% 
  select(DateTime,WaterFlow) %>% 
  mutate(DateTimeHrly = as.POSIXct(df_wf1$DateTime,format="%Y-%m-%d %H")) 

df_wf1<-df_wf1 %>% 
  select(DateTimeHrly,WaterFlow) %>% 
  group_by(DateTimeHrly) %>% 
  summarise(
    AvgWaterFlow = mean(WaterFlow, na.rm = TRUE),
    n = n())

df_wf2<-as.data.frame(data2)
df_wf2<-df_wf2 %>% 
  select(DateTime,WaterFlow) %>% 
  mutate(DateTimeHrly = as.POSIXct(df_wf2$DateTime,format="%Y-%m-%d %H"))

df_wf2<-df_wf2 %>% 
  select(DateTimeHrly,WaterFlow) %>% 
  group_by(DateTimeHrly) %>% 
  summarise(
    AvgWaterFlow = mean(WaterFlow, na.rm = TRUE),
    n = n())


mymergedata <- merge(x = df_wf2, y = df_wf1, by = "DateTimeHrly", all = TRUE)
mymergedata[is.na(mymergedata )]<- 0


df_merged_wf<-
  mymergedata %>% 
  select(AvgWaterFlow.x,n.x,AvgWaterFlow.y,n.y) %>% 
  mutate(DateTime = mymergedata$DateTimeHrly,
         WAvgWaterFlow = ((AvgWaterFlow.x * n.x + AvgWaterFlow.y * n.y)/(n.x + n.y)),
         n=(n.x + n.y)
  ) 

df_merged_wf<-
  df_merged_wf %>% 
    select(DateTime,WAvgWaterFlow,n)

head(df_merged_wf,n=20)

##               DateTime WAvgWaterFlow  n
## 1  2015-10-23 00:00:00      26.10280  4
## 2  2015-10-23 01:00:00      18.84515  6
## 3  2015-10-23 02:00:00      24.46806  3
## 4  2015-10-23 03:00:00      25.56366  6
## 5  2015-10-23 04:00:00      22.36159  3
## 6  2015-10-23 05:00:00      23.63798 10
## 7  2015-10-23 06:00:00      21.43968  9
## 8  2015-10-23 07:00:00      16.24117  4
## 9  2015-10-23 08:00:00      22.26368  4
## 10 2015-10-23 09:00:00      29.85751  4
## 11 2015-10-23 10:00:00      26.00906  4
## 12 2015-10-23 11:00:00      33.65960  4
## 13 2015-10-23 12:00:00      29.17997  4
## 14 2015-10-23 13:00:00      26.41384  4
## 15 2015-10-23 14:00:00      23.92929  7
## 16 2015-10-23 15:00:00      25.14217  5
## 17 2015-10-23 16:00:00      29.96475  7
## 18 2015-10-23 17:00:00      24.46020  4
## 19 2015-10-23 18:00:00      24.02635  5
## 20 2015-10-23 19:00:00      17.27455  4

Consider the df_merged_wf series — the merged hourly waterflows dataset. The df_merged_wf ts contains 1000 records sampling hourly waterflows from a pipe.

Use the ses() function in R to find the optimal values of \({\alpha}\) and \({\ell}_{0}\), and generate forecasts for the next four months. Note this is hourly data and shouldn’t be overfit. We are using a simple exponential smoothing model that weights the more recent data more heavily.

Compute a 80% & 95% prediction interval for the first forecast using \(\hat{y} {\pm} 1.96s\) is the standard deviation of the residuals. Compare your interval with the interval produced by R.

# Get timeseries
df_merged_wf_ts<-ts(df_merged_wf[2])
# Get 1 week forecasted estimate parameters from model
round(ses(df_merged_wf_ts, h=24*7)$model$par[1:2],4)

##   alpha       l 
##  0.0362 24.8597

# Forecast out 1 week for visibility on graph
fc_df_merged_wf_ses<-ses(df_merged_wf_ts, h=24*7)
# Accuracy of one-step-ahead training errors
round(accuracy(fc_df_merged_wf_ses),2)

##                ME  RMSE   MAE    MPE  MAPE MASE  ACF1
## Training set 0.53 14.58 11.14 -26.41 47.92 0.71 -0.02

# see how SES model was fitted
fc_df_merged_wf_ses$model

## Simple exponential smoothing 
## 
## Call:
##  ses(y = df_merged_wf_ts, h = 24 * 7) 
## 
##   Smoothing parameters:
##     alpha = 0.0362 
## 
##   Initial states:
##     l = 24.8597 
## 
##   sigma:  14.5904
## 
##      AIC     AICc      BIC 
## 12285.75 12285.78 12300.48

# get 1 week of forecasts
#tsCV(df_merged_wf_ts,ses,h=24*7)[1:4,]
# 95% prediction interval for the first forecast
fc_df_merged_wf_ses$upper[1, "95%"]

##      95% 
## 72.60698

fc_df_merged_wf_ses$lower[1, "95%"]

##      95% 
## 15.41355

# 80% prediction interval for the first forecast
fc_df_merged_wf_ses$upper[1, "80%"]

##      80% 
## 62.70865

fc_df_merged_wf_ses$lower[1, "80%"]

##      80% 
## 25.31188

# calculate standard deviation with and without model
s <- sd(fc_df_merged_wf_ses$residuals)
s

## [1] 14.57355

# s = 10273.69 vs s (estimated) 10308.58
# calculate 95% prediction interval with and without model
fc_df_merged_wf_ses$mean[1] + 1.96*s

## [1] 72.57443

fc_df_merged_wf_ses$mean[1] - 1.96*s

## [1] 15.44611

mean(df_merged_wf_ts) + 1.96*s

## [1] 64.54455

mean(df_merged_wf_ts)  - 1.96*s

## [1] 7.41623

# calculate 80% prediction interval with and without model
fc_df_merged_wf_ses$mean[1] + 1.28*s

## [1] 62.66441

fc_df_merged_wf_ses$mean[1] - 1.28*s

## [1] 25.35612

mean(df_merged_wf_ts) + 1.28*s

## [1] 54.63453

mean(df_merged_wf_ts)  - 1.28*s

## [1] 17.32624

# Note that forecast using ses doesn't have a trend component.
fc_df_merged_wf_ses<-ses(df_merged_wf_ts, h=24*7)
autoplot(fc_df_merged_wf_ses) + 
    autolayer(fitted(fc_df_merged_wf_ses), series="fitted") +
    ylab("Predicted Water flow rate (base on weighted avg) ") + xlab("hourly")

df_predicted_wf<-as.data.frame(fc_df_merged_wf_ses)
df_predicted_wf<-df_predicted_wf %>% 
  mutate(WAvgWaterFlow =`Point Forecast`,
         DateTime=as.POSIXct("2015-12-03 16:00:00",format="%Y-%m-%d %H:%M:%OS") + 3600 * row_number(),
         n=1)

df_predicted_wf<-
  df_predicted_wf %>% 
    select(DateTime,WAvgWaterFlow,n)

# write combined output of 1000 training recs plus 168 predictions (1 week)
df_wf_combined<-rbind(df_merged_wf, df_predicted_wf)
#write.csv(df_wf_combined, file = "df_wf_combined.csv")