library(tidyverse)
library(timetk)
library(lubridate)
library(DataExplorer)
library(ggfortify)
library(forecast)
library(caret)
library(fpp)
library(DMwR)
library(imputeTS)
library(zoo)
library(kableExtra)

setwd('C:/Users/robbj/OneDrive/CUNY SPS/DATA624/Project1')

Part 1: ATM withdrawals

Objective: Forecast cash withdrawals for 4 ATMs for May 2010.

Import & clean data

The raw data contain 3 variables:
* Date: character of sequential dates in mm/dd/YYYY format beginning on 5/1/2009 and ending 4/30/2010
* ATM: character indicating unique ATM associated with the observation
* Cash: integer indicating total withdrawal amount associated with the observation in hundreds of USD

The data are in long form and include 1474 observations. Some basic data wrangling was performed, including coercing the Date and ATM variables to Date and factor types, respectively. Fourteen observations with missing ATM and Cash values were removed, leaving 1460 observations.

# import raw dataset and separate the date from the time component
ATM_raw <- read.csv('ATM624Data.csv',na.strings="NA") %>% 
  separate(DATE, c('Date','Time'), sep=" ") %>% 
  select(-Time)

# change data to a Date type
ATM_raw$Date <- as.Date(ATM_raw$Date, format="%m/%d/%Y")

# make ATM ID a factor
ATM_raw$ATM <- as.factor(ATM_raw$ATM)

# write to different df object
ATM_df <- ATM_raw

# remove observations with blank ATM
ATM_df <- ATM_df %>% filter(ATM!="")
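
As a quick sanity check (a minimal sketch using the objects just created), the before-and-after row counts can be confirmed; per the note above, they should be 1474, 14, and 1460:

# sanity check: raw rows, blank-ATM rows dropped, rows remaining
nrow(ATM_raw)
sum(ATM_raw$ATM == "", na.rm = TRUE)
nrow(ATM_df)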

Exploratory Data Analysis

The data were examined to provide high-level insights. The distributions of the 4 ATMs were visualized using boxplots and density plots, output below. Note the density plots omit observations where Cash is below 5 or above 200.

# boxplots 
ATM_df %>%
  ggplot(aes(x=Cash)) +
    geom_boxplot(fill="dodgerblue", alpha=0.5)+ 
    facet_wrap(~ATM, scales="free") +
    theme(panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          panel.background = element_blank(), 
          axis.line = element_line(colour = "black")
       )

# density plots
ATM_df %>% 
  ggplot(aes(x=Cash))+
    geom_density(fill="dodgerblue", lwd=1, alpha=0.5) + 
    xlim(5,200)+
    facet_wrap(~ATM, scales="free") +
    theme(panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          panel.background = element_blank(), 
          axis.line = element_line(colour = "black")
          ) 

  • ATM1 is approximately normally distributed
  • ATM2 has greater variance
  • ATM3 has only 3 nonzero values
  • ATM4 contains at least one extreme outlier

Individual ATMs show different patterns of cash withdrawals over the year of data provided, so each ATM is analyzed separately. For each ATM, missing values are examined (and imputed where applicable), outliers are removed, autocorrelation is checked, and day-of-week and seasonal patterns are visualized.

ATM1

ATM1 has three missing values in June of 2009 and no observations egregiously outside the norm. The three missing values were imputed using linear interpolation.

The interactive time series plot, fit with a smooth line, shows that cash withdrawals were relatively constant throughout the year, though slightly higher in the early spring and summer months. Thursday withdrawals showed the most drastic difference, much lower than the other days of the week (median Cash of roughly 16, about $1,600 given Cash is recorded in hundreds of USD); the next lowest was Monday, with a median of about 85 ($8,500). The partial autocorrelation function underscores the importance of day of the week on cash withdrawals - there are significant spikes at lags of 7 and 14.

# create df for just ATM1
ATM1_df <- ATM_df %>% 
  filter(ATM=='ATM1') 

# create ts object
ATM1_ts <- ATM1_df %>% 
  select(Cash) %>% 
  ts(frequency=365)

## which observations are missing: ##
# ATM1_df %>% 
#   filter(is.na(Cash))

# impute missing values
ATM1_ts <- na_interpolation(ATM1_ts)

# update the df
ATM1_df <- tk_tbl(ATM1_ts, preserve_index  = F) %>% 
  bind_cols(Date=ATM1_df$Date)

# plot time series
ATM1_df %>%  
  plot_time_series(Date,Cash, .title="ATM1", .plotly_slider=T)
# check for anomalies
# ATM1_df %>% plot_anomaly_diagnostics(Date,Cash)

# seasonal diagnostics
ATM1_df %>% 
  plot_seasonal_diagnostics(Date, Cash, .feature_set=c("wday.lbl"),.title="ATM1: Day of Week")
ATM1_df %>% 
  plot_seasonal_diagnostics(Date, Cash, .feature_set=c("month.lbl" ),.title="ATM1: Month of Year")
# check autocorrelation
par(mfrow=c(1,2))
acf(ATM1_ts)
pacf(ATM1_ts)

ATM2

ATM2 has two missing values and no extreme values over the year. The missing values were interpolated. As with ATM1, ATM2 shows cash withdrawals that are lower on Thursdays and during cold-weather months.

# create df for just ATM2
ATM2_df <- ATM_df %>% 
  filter(ATM=='ATM2') 

# create ts object
ATM2_ts <- ATM2_df %>% 
  select(Cash) %>% 
  ts(start=c(2009,5), frequency=365)

# impute missing values
ATM2_ts <- na_interpolation(ATM2_ts)

# update the df
ATM2_df <- tk_tbl(ATM2_ts, preserve_index  = F) %>% 
  bind_cols(Date=ATM2_df$Date)

# check for anomalies
# ATM2_df %>% 
  # plot_anomaly_diagnostics(Date,Cash)

# which observations are missing
# ATM2_df %>% filter(is.na(Cash))

# plot time series
ATM2_df %>%  
  plot_time_series(Date,Cash, .title="ATM2", .plotly_slider=T)
# seasonal diagnostics
ATM2_df %>% 
  plot_seasonal_diagnostics(Date, Cash, .feature_set=c("wday.lbl"),.title="ATM2: Day of Week")
ATM2_df %>% 
  plot_seasonal_diagnostics(Date, Cash, .feature_set=c("month.lbl" ),.title="ATM2: Month of Year")
# check autocorrelations
par(mfrow=c(1,2))
acf(ATM2_ts)
pacf(ATM2_ts)

ATM4

ATM4 has one clear outlier in the time series, which was removed and replaced with an interpolated value. There were no other missing values. The same general patterns hold: withdrawals are lower in the middle of the week and slightly higher in the warmer months. The correlograms indicate significant autocorrelation at a lag of 7 days.

# create df for just ATM4
ATM4_df <- ATM_df %>% 
  filter(ATM=='ATM4') 

# create ts object
ATM4_ts <- ATM4_df %>% 
  select(Cash) %>% 
  ts(start=c(2009,5,1), end=c(2010,4,30),frequency=365)

# check for anomalies
ATM4_df %>% 
  plot_anomaly_diagnostics(Date,Cash)
# replace the outlier flagged above (Cash = 10920) with NA
ATM4_ts[ATM4_ts==10920] <- NA

# impute missing values
ATM4_ts <- na_interpolation(ATM4_ts)

# which observations are missing? NONE
# ATM4_df %>% 
#   filter(is.na(Cash))

# update the df
ATM4_df <- tk_tbl(ATM4_ts, preserve_index  = F) %>% 
  bind_cols(Date=ATM4_df$Date)

# plot time series
ATM4_df %>%  
  plot_time_series(Date,Cash)
# seasonal diagnostics
ATM4_df %>% 
  plot_seasonal_diagnostics(Date, Cash, .feature_set=c("wday.lbl"),.title="ATM4: Day of Week")
ATM4_df %>% 
  plot_seasonal_diagnostics(Date, Cash, .feature_set=c("month.lbl" ),.title="ATM4: Month of Year")
# check autocorrelations
par(mfrow=c(1,2))
acf(ATM4_ts)
pacf(ATM4_ts)
mtext("Autocorrelations", side = 3, line =-1, outer = TRUE)

ATM3

ATM3 has only three nonzero values, covering the last three days of the series. These are not enough to forecast future ATM3 withdrawals, but they can be used to determine whether another ATM can serve as a proxy. ATM1 and ATM3 have identical values for 4/28/2010 - 4/30/2010 (a strange coincidence). The model used to forecast ATM1 will therefore also be used for the ATM3 forecast.

# create df for just ATM3
ATM3_df <- ATM_df %>% 
  filter(ATM=='ATM3') 

# create ts object
ATM3_ts <- ATM3_df %>% 
  select(Cash) %>% 
  ts(start=c(2009,5), frequency=365)

# only the last 3 days have nonzero values
ATM3_df %>% plot_time_series(Date,Cash)
table <- data.frame(c(ATM1_df %>% tail(3),
                      ATM2_df %>% tail(3),
                      ATM3_df %>% tail(3),
                      ATM4_df %>% tail(3)
                      )
                    )
table %>% select(Date, 'ATM1'=Cash, 'ATM2'=Cash.1, "ATM3"=Cash.2, "ATM4"=Cash.3 ) %>% kbl() %>% kable_styling(bootstrap_options = c("striped", "hover"))
Date        ATM1  ATM2  ATM3  ATM4
2010-04-28    96   107    96   348
2010-04-29    82    89    82    44
2010-04-30    85    90    85   482
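
The match can also be confirmed programmatically (a one-line check assuming the data frames built above; it should return TRUE):

# confirm ATM1 and ATM3 share identical Cash values over the final 3 days
all(tail(ATM1_df$Cash, 3) == tail(ATM3_df$Cash, 3))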

From the analysis of the individual ATMs, several useful pieces of information will govern the modeling process:
* ATM1, ATM2, and ATM4 have different distributions, means, and variances, and require separate forecasting models
* Each ATM showed similar patterns: day of week and month of year are important
* ATM3 has only 3 nonzero values; ATM1's data will be used as a proxy
* There is no need for a mathematical transformation, such as Box-Cox
* There are complexities in the data due to both day-of-week and time-of-year variation; that is, there are multiple seasonal effects (with only one year of data, a time-of-year seasonality cannot be confirmed, but the pattern was strong enough in each ATM to make the assumption)

One of several techniques could be used:
* The daily data could be aggregated by week or month to reduce the complexity.
* Multiple seasonality could be incorporated into the model: one cycle based on the month and one based on the day of the week (a hedged sketch of this option follows the code below).
* Since there is only one year of data, the time-of-year seasonality could be omitted and only weekly seasonality considered in the model. This option is less complex than option 2, and is supported by the strong correlation at PACF lag 7 and the clear pattern in the day-of-week boxplots.

The third option was selected to limit complexity and leverage the day-of-week patterns in each ATM's data. The time series objects were adjusted in R to define 7 days as the 'seasonality' (frequency) of each series.

ATM1_ts <- ATM1_df %>% 
  select(Cash) %>% 
  ts(frequency=7)

ATM2_ts <- ATM2_df %>% 
  select(Cash) %>% 
  ts(frequency=7)

ATM4_ts <- ATM4_df %>% 
  select(Cash) %>% 
  ts(frequency=7)
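
For reference, the second option (multiple seasonality) could be implemented with the forecast package's msts() and tbats() functions. This is a hedged sketch only; it was not used in the analysis, and the object name ATM1_msts is illustrative.

# sketch of option 2 (not used here): encode weekly and annual
# seasonality together and fit a TBATS model
ATM1_msts <- msts(ATM1_df$Cash, seasonal.periods = c(7, 365.25))
# tbats(ATM1_msts) %>% forecast(h = 31) %>% autoplot()   # can be slow to fit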

Create Models

For each ATM, four candidate models were created:

  • seasonal naive
  • random walk forecast
  • ARIMA
  • ETS
# seasonal naive
naive1 <- snaive(ATM1_ts)
naive2 <- snaive(ATM2_ts)
naive4 <- snaive(ATM4_ts)

# random walk
rwf1 <- rwf(ATM1_ts)
rwf2 <- rwf(ATM2_ts)
rwf4 <- rwf(ATM4_ts)

# ARIMA
AR1 <- auto.arima(ATM1_ts, stepwise=F, approximation = F)
AR2 <- auto.arima(ATM2_ts, stepwise=F, approximation = F)
AR4 <- auto.arima(ATM4_ts, stepwise=F, approximation = F)

# ETS
ETS1 <- ets(ATM1_ts, model="ZZZ")
ETS2 <- ets(ATM2_ts, model="ZZZ")
ETS4 <- ets(ATM4_ts, model="ZZZ")

Compare Models

The data were not split into train/test sets given the small data set. Instead, each candidate model's accuracy was evaluated on its fitted values. Although not ideal, this gives a general sense of which family of models is best for each ATM. A cross-validation alternative is sketched below.
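
A more rigorous option is time-series cross-validation with forecast::tsCV(), which scores true out-of-sample errors. A minimal sketch, using the seasonal naive model as an example (the other models would need small wrapper functions):

# hedged sketch: rolling-origin cross-validation errors for snaive
e <- tsCV(ATM1_ts, snaive, h = 7)
sqrt(mean(e^2, na.rm = TRUE))   # CV RMSE across horizons 1-7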

# accuracy diagnostics
## ATM1
ATM1_acc <- rbind(accuracy(naive1), accuracy(rwf1), accuracy(AR1),accuracy(ETS1))
row.names(ATM1_acc) <- c("Naive", "Random Walk", "ARIMA", "ETS")

## ATM2
ATM2_acc <- rbind(accuracy(naive2), accuracy(rwf2), accuracy(AR2),accuracy(ETS2))
row.names(ATM2_acc) <- c("Naive", "Random Walk", "ARIMA", "ETS")

## ATM4
ATM4_acc <- rbind(accuracy(naive4), accuracy(rwf4), accuracy(AR4),accuracy(ETS4))
row.names(ATM4_acc) <- c("Naive", "Random Walk", "ARIMA", "ETS")

# tables of results
ATM1_acc %>% kbl(caption = "ATM1: Model Accuracy") %>% kable_styling(bootstrap_options = c("striped", "hover"))
ATM1: Model Accuracy
Model                ME      RMSE       MAE         MPE      MAPE       MASE        ACF1
Naive        -0.0363128  27.74593  17.69832   -96.53364  116.2264  1.0000000   0.1879628
Random Walk  -0.0302198  49.94443  37.20604  -132.05619  166.7693  2.1022354  -0.3557445
ARIMA        -0.0667763  23.23545  14.53882  -102.49739  117.4116  0.8214799  -0.0086713
ETS          -0.0482208  23.78972  15.07958  -106.83543  121.8655  0.8520341   0.1439166
ATM2_acc %>% kbl(caption = "ATM2: Model Accuracy") %>% kable_styling(bootstrap_options = c("striped", "hover"))
ATM2: Model Accuracy
Model                ME      RMSE       MAE   MPE  MAPE       MASE        ACF1
Naive         0.0223464  30.19721  20.78771  -Inf   Inf  1.0000000   0.0481966
Random Walk  -0.0467033  54.18008  42.54670  -Inf   Inf  2.0467240  -0.3072004
ARIMA        -0.8907648  24.13932  17.04479  -Inf   Inf  0.8199456  -0.0042753
ETS          -0.6885457  25.14201  17.86610  -Inf   Inf  0.8594547   0.0198839
(The Inf percentage errors arise because ATM2 contains zero-valued observations.)
ATM4_acc %>% kbl(caption = "ATM4: Model Accuracy") %>% kable_styling(bootstrap_options = c("striped", "hover"))
ATM4: Model Accuracy
Model                ME      RMSE       MAE        MPE      MAPE       MASE        ACF1
Naive        -3.6955307  453.0350  346.3883  -383.0109  438.0732  1.0000000   0.0621953
Random Walk  -0.8104396  476.7094  386.2060  -459.4828  518.9954  1.1149513  -0.4639654
ARIMA        -0.2045415  342.4063  282.5505  -501.9492  534.1686  0.8157047  -0.0007724
ETS         -30.3919593  354.0891  276.5696  -434.2023  466.1756  0.7984380   0.0570724
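
To make the comparison explicit, a small hypothetical helper (not part of the original analysis) can report the lowest-RMSE model from each accuracy matrix:

# hypothetical helper: name of the model with the lowest RMSE
best_by_rmse <- function(acc) rownames(acc)[which.min(acc[, "RMSE"])]
sapply(list(ATM1 = ATM1_acc, ATM2 = ATM2_acc, ATM4 = ATM4_acc), best_by_rmse)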

Model Selection & Forecasts

The ARIMA models performed slightly better than ETS (lowest RMSE) for each ATM:

  • ATM1: ARIMA(0,0,1)(1,1,1)[7]
  • ATM2: ARIMA(2,0,2)(0,1,1)[7]
  • ATM4: ARIMA(1,0,0)(2,0,0)[7]
  • ATM3: will use ATM1 forecasts

The residual plots support valid models: residuals have near-constant variance over time, are not autocorrelated, and are approximately normally distributed (although ATM4 shows some skew in its residual distribution). The May 2010 forecasts are output below and are available as csv files.

# final model output
# AR1
# AR2
# AR4
# check residuals of final model
checkresiduals(AR1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(1,1,1)[7]
## Q* = 15.095, df = 11, p-value = 0.1782
## 
## Model df: 3.   Total lags used: 14
checkresiduals(AR2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 10.231, df = 9, p-value = 0.3321
## 
## Model df: 5.   Total lags used: 14
checkresiduals(AR4)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(2,0,0)[7] with non-zero mean
## Q* = 15.694, df = 10, p-value = 0.1087
## 
## Model df: 4.   Total lags used: 14
# forecast plots
AR1 %>% forecast(h=31) %>% autoplot(main="ATM1 time series with forecast")

AR2 %>% forecast(h=31) %>% autoplot(main="ATM2 time series with forecast")

AR4 %>% forecast(h=31) %>% autoplot(main="ATM4 time series with forecast")

# extract the mean forecasts (for the table and csv export below)
forecast_ATM1 <- AR1 %>% forecast(h=31)
forecast_ATM1_mean<- forecast_ATM1$mean

forecast_ATM2 <- AR2 %>% forecast(h=31)
forecast_ATM2_mean<- forecast_ATM2$mean

forecast_ATM3_mean<- forecast_ATM1$mean

forecast_ATM4 <- AR4 %>% forecast(h=31)
forecast_ATM4_mean <- forecast_ATM4$mean

# row labels for May 1-31
rows <- paste("May", 1:31)

data.frame(date= rows,
           ATM1=forecast_ATM1_mean,
           ATM2=forecast_ATM2_mean,
           ATM3=forecast_ATM3_mean,
           ATM4=forecast_ATM4_mean) %>% 
   kbl(caption = "May 2010 Forecasts") %>% kable_styling(bootstrap_options = c("striped", "hover"))
May 2010 Forecasts
date ATM1 ATM2 ATM3 ATM4
May 1 87.040565 66.436775 87.040565 375.1464
May 2 104.202236 72.717930 104.202236 429.8268
May 3 73.512143 13.674789 73.512143 468.2596
May 4 9.181767 5.297995 9.181767 333.0300
May 5 99.565861 97.634046 99.565861 442.4000
May 6 80.483913 88.290342 80.483913 378.1255
May 7 87.019776 64.988924 87.019776 417.3209
May 8 87.993527 66.444443 87.993527 383.3132
May 9 103.315610 72.727023 103.315610 452.7264
May 10 73.421988 13.663861 73.421988 442.6114
May 11 10.139357 5.294414 10.139357 378.1444
May 12 100.224832 97.645570 100.224832 432.4067
May 13 80.203740 88.288633 80.203740 388.2084
May 14 87.393031 64.979141 87.393031 444.0295
May 15 88.169634 66.450229 88.169634 426.8144
May 16 103.151761 72.733455 103.151761 443.2771
May 17 73.405327 13.655799 73.405327 446.1807
May 18 10.316320 5.292024 10.316320 421.2408
May 19 100.346610 97.653963 100.346610 441.7239
May 20 80.151964 88.287189 80.151964 427.8751
May 21 87.462008 64.972102 87.462008 440.5682
May 22 88.202178 66.454588 88.202178 434.1488
May 23 103.121482 72.737999 103.121482 444.5036
May 24 73.402248 13.649857 73.402248 443.7749
May 25 10.349022 5.290443 10.349022 432.7379
May 26 100.369114 97.660071 100.369114 441.9529
May 27 80.142396 88.285994 80.142396 434.8643
May 28 87.474755 64.967041 87.474755 443.1111
May 29 88.208193 66.457865 88.208193 440.1992
May 30 103.115887 72.741204 103.115887 443.6042
May 31 73.401679 13.645480 73.401679 443.8288
# write the combined forecasts to a csv (commented out):
# data.frame(date=rows, ATM1=forecast_ATM1_mean, ATM2=forecast_ATM2_mean,
#            ATM3=forecast_ATM3_mean, ATM4=forecast_ATM4_mean) %>%
#   write.csv("ATM_forecasts.csv")

Part 2: Residential Power Usage

Objective: Provide monthly forecasts of residential power consumption for 2014.

Import & clean data

This dataset contains 192 observations of residential power consumption in kilowatt-hours (KWH), aggregated monthly from January 1998 through December 2013.

# import 
raw_data <- read.csv('ResidentialCustomerForecastLoad-624.csv', na.strings="NA")

# get structure
str(raw_data)
## 'data.frame':    192 obs. of  3 variables:
##  $ CaseSequence: int  733 734 735 736 737 738 739 740 741 742 ...
##  $ YYYY.MMM    : chr  "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
##  $ KWH         : int  6862583 5838198 5420658 5010364 4665377 6467147 8914755 8607428 6989888 6345620 ...
# change date type 
raw_data$date <-  as.yearmon(raw_data$YYYY.MMM, '%Y-%b') %>% 
  as.Date()

# keep only the date and KWH columns
power_df <- raw_data %>% 
  select(date, KWH)

# convert to time series
power_ts <- power_df %>%
  select(KWH) %>% 
  ts(start=1998, frequency=12)

Missing and Extreme values

There is only one missing value, for September 2008, and one extreme outlier, in July 2010 (a recorded value of 770,523 KWH, far below neighboring months). Both values were replaced via linear interpolation.

power_df %>% 
  plot_time_series(date, KWH, .title="Power Consumption time series")
# remove July 2010 outlier
power_ts[power_ts==770523] <- NA

# impute the missing values
power_ts <- na_interpolation(power_ts)
power_df <- tk_tbl(power_ts)
power_df$index <- as.Date(power_df$index)

Trend and Seasonal Analysis

From the interactive time series plot, an upward trend in consumption is seen, especially after 2008; this is more clearly visible in the trend component of the decomposition. The polar seasonal plot shows two seasonal peaks per year, one during the winter months and one during the summer, which makes sense given the energy required to heat and cool buildings. There is some increase in variance with the level of the time series, so a Box-Cox transformation will be considered.

# seasonal diagnostics
power_df %>% 
  plot_seasonal_diagnostics(index, KWH, .feature_set=c("month.lbl"), .title="Power consumption by month [KWH]")
power_df %>% 
  plot_seasonal_diagnostics(index, KWH, .feature_set=c("year"), .title="Power consumption by year [KWH]")
# seasonal plot
ggseasonplot(power_ts, polar=T, main="Power consumption season plot") +    
        theme(panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          panel.background = element_blank(), 
          axis.line = element_line(colour = "black")
       )

# decomposition
decompose(power_ts) %>% autoplot

#power_df %>% tk_stl_diagnostics(index, KWH) 
# ggsubseriesplot(power_ts)

Transformation

A Box-Cox transformation can stabilize the variance of the data, and one was applied. The computed optimal value was lambda = -0.207, but lambda = 0 was selected because it is more interpretable (a log transformation). No additional transformations were applied to the data.

paste("optimal lambda: ", BoxCox.lambda(power_ts) %>% round(3)) 
## [1] "optimal lambda:  -0.207"
power_ts_transform <- BoxCox(power_ts, lambda = 0)

power_ts_transform %>% 
  autoplot(main="Log transformed power consumption time series", 
           ylab="log of power consuption in KWH", color="dodgerblue", lwd=2)+
    theme(panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          panel.background = element_blank(), 
          axis.line = element_line(colour = "black")
       )

Test/Train set split

The data were split into train and test sets so the candidate models can be evaluated on unseen data.

train <- window(power_ts_transform, end=c(2010,12))
test <- window(power_ts_transform, start=c(2011,1))
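
For reference (assuming the series spans January 1998 through December 2013), the split leaves 156 months for training and 36 for testing:

# split sizes: 156 train months (1998-2010), 36 test months (2011-2013)
length(train)
length(test)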

Create Models

Four candidate models were created:

  • seasonal naive
  • random walk forecast
  • ARIMA
  • ETS
# seasonal naive
naive <- snaive(train)

# random walk
rwf <- rwf(train)

# ARIMA
AR <- auto.arima(train, stepwise = F, approximation = F)

# ETS
ETS <- ets(train, model="ZZZ")

Compare Models

Two tables of results are provided - one for the training set and one for the test set.

# accuracy diagnostics
## training set
acc_train <- rbind(accuracy(naive), accuracy(rwf), accuracy(AR),accuracy(ETS))
row.names(acc_train) <- c("Naive", "Random Walk", "ARIMA", "ETS")

# 1. table of training set accuracy 
acc_train %>% 
  kbl(caption="Training set accuracy") %>% 
  kable_styling(bootstrap_options = c("striped", "hover"))
Training set accuracy
Model                ME       RMSE        MAE         MPE       MAPE       MASE        ACF1
Naive         0.0088178  0.1139787  0.0913879   0.0537439  0.5840611  1.0000000   0.2563240
Random Walk  -0.0007046  0.2048886  0.1715962  -0.0130818  1.0971481  1.8776687   0.1726307
ARIMA        -0.0003971  0.0783363  0.0614673  -0.0043181  0.3930279  0.6725973  -0.0017689
ETS           0.0089396  0.0840044  0.0651353   0.0544209  0.4161158  0.7127344   0.2848504
# 2. table of test set accuracy
AR_test_acc <- AR %>% 
  forecast(h=24) %>% 
  accuracy(test)
ETS_test_acc <- ETS %>% 
  forecast(h=24) %>% 
  accuracy(test)
acc <- rbind(accuracy(naive,test), accuracy(rwf,test), AR_test_acc, ETS_test_acc ) 

# get just the test set results
acc <- acc[c(2,4,6,8),]
row.names(acc) <- c("Naive", "Random Walk", "ARIMA", "ETS")

acc %>% 
  kbl(caption = "Test set accuracy") %>% 
  kable_styling(bootstrap_options = c("striped", "hover"))
Test set accuracy
Model               ME       RMSE        MAE        MPE       MAPE      MASE       ACF1  Theil's U
Naive        0.0792027  0.1612895  0.1363807  0.4965889  0.8601012  1.492328  0.3665427  0.7680838
Random Walk  0.2081387  0.3136470  0.2648156  1.2922402  1.6570476  2.897709  0.3955775  1.2454857
ARIMA        0.1014687  0.1420595  0.1116912  0.6374437  0.7029380  1.222166  0.2163076  0.6840179
ETS          0.1083409  0.1466202  0.1229850  0.6810322  0.7748039  1.345747  0.1390291  0.7056191

Model Selection & Forecast

Based on the metrics calculated on the test set, the best model for the data is the ARIMA model: ARIMA(4,0,0)(0,1,1)[12]. It was refit on the full transformed series below.

A check of the residuals confirms a valid model: the residuals have approximately constant variance over the span of the series, are approximately normally distributed, and are not autocorrelated. Since a log transformation was applied earlier, a back-transformation is needed to provide the final forecasts, which are output below and available as a csv file: power_forecast.csv

AR <- arima(power_ts_transform, order = c(4L, 0L, 0L),
      seasonal = list(order = c(0L, 1L, 1L), period = 12))

checkresiduals(AR)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,0,0)(0,1,1)[12]
## Q* = 16.788, df = 19, p-value = 0.6042
## 
## Model df: 5.   Total lags used: 24
AR %>% 
  forecast(h=12) %>% 
  autoplot(main="Power consumption time series with 2014 forecasts (log-scale)" )+
  theme(panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          panel.background = element_blank(), 
          axis.line = element_line(colour = "black")
       )

# back-transformation: invert the log with exp()
forecast_trans <- AR %>% forecast(h=12)
forecast_orig <- exp(forecast_trans$mean) %>% data.frame() 
colnames(forecast_orig) <- "KWH_2014"
rownames(forecast_orig) <- c(1:12)
forecast_orig %>% ts(start=c(2014,1),frequency=12)
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 2014 10512431  8375656  7083053  5914195  5626279  7596631  8501090  9073518
##           Sep      Oct      Nov      Dec
## 2014  7900939  5662400  5543183  7699813
# write to csv
# forecast_orig %>% write.csv("power_forecast.csv")
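
Note that exponentiating the point forecasts back-transforms the forecast medians, not the means. A hedged alternative (the object name AR_bc is illustrative) is to fit on the original scale with the lambda argument so forecast() handles the back-transformation, with optional bias adjustment:

# alternative sketch: lambda = 0 back-transforms automatically;
# biasadj = TRUE returns approximate means instead of medians
AR_bc <- Arima(power_ts, order = c(4, 0, 0), seasonal = c(0, 1, 1), lambda = 0)
AR_bc %>% forecast(h = 12, biasadj = TRUE)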

Part 3

These are simple two-column data sets with different time stamps. The task is to time-base sequence the data and aggregate by hour, taking the mean when there are multiple recordings within an hour; then to determine whether the data are stationary and can be forecast. If so, provide a week-forward forecast and present the results via RPubs and .rmd, with the forecast in an Excel-readable file.

Import & clean data

Data were imported and cleaned. A POSIXct data type was used for the Date.Time variable, and the data were aggregated into hourly intervals.

library(lubridate)
pipe1 <- read.csv('Waterflow_Pipe1.csv')
pipe2 <- read.csv('Waterflow_Pipe2.csv')

tail(pipe1)
##             Date.Time WaterFlow
## 995  11/1/15 10:09 PM  15.28143
## 996  11/1/15 10:09 PM  26.33668
## 997  11/1/15 10:25 PM  29.06427
## 998  11/1/15 11:08 PM  22.84410
## 999  11/1/15 11:34 PM  16.21870
## 1000 11/1/15 11:35 PM  21.21171
tail(pipe2)
##             Date.Time WaterFlow
## 995  12/3/15 11:00 AM  72.96677
## 996  12/3/15 12:00 PM  31.48311
## 997   12/3/15 1:00 PM  66.81673
## 998   12/3/15 2:00 PM  42.93666
## 999   12/3/15 3:00 PM  33.40133
## 1000  12/3/15 4:00 PM  66.68147
# parse Date.Time strings into POSIXct
pipe1$Date.Time <- as.POSIXct(strptime(pipe1$Date.Time, "%m/%d/%y %I:%M %p"))
pipe2$Date.Time <- as.POSIXct(strptime(pipe2$Date.Time, "%m/%d/%y %I:%M %p"))


pipe1_agg <- pipe1 %>% 
  summarize_by_time(Date.Time,
                    .by="hour",
                    Flow=mean(WaterFlow))

pipe2_agg <- pipe2 %>% 
  summarize_by_time(Date.Time,
                    .by="hour",
                    Flow=mean(WaterFlow))
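
For reference, an equivalent hourly aggregation (a sketch, not the code used above) with dplyr and lubridate::floor_date():

# equivalent aggregation: truncate each timestamp to its hour, then average
pipe1 %>%
  mutate(Date.Time = floor_date(Date.Time, "hour")) %>%
  group_by(Date.Time) %>%
  summarize(Flow = mean(WaterFlow), .groups = "drop")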

Pipe 1

Visuals

Based on the time series plot and the ACF/PACF, the time series appears stationary. A supplementary unit-root check is sketched after the correlograms below.

# plot time series
pipe1_agg %>% plot_time_series(Date.Time,Flow)
# create ts object
pipe1_ts <- pipe1_agg$Flow %>% ts(frequency = 24)

# check autocorrelation
pipe1_ts %>% acf(lag.max = 50)

pipe1_ts %>% pacf(lag.max = 50)
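
As a supplementary check (not part of the original analysis), forecast::ndiffs() applies a KPSS unit-root test and reports how many differences are needed; 0 corroborates the visual judgment of stationarity:

# KPSS-based check: 0 means no differencing required (stationary)
ndiffs(pipe1_ts)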

Model & Forecast

The model selected by auto.arima is ARIMA(0,0,0) - white noise around the series mean, so the point forecast is flat at that mean (a random walk, by contrast, would carry the last observation forward). The forecasts are output below.

pipe1_arima <- auto.arima(pipe1_ts)
pipe1_arima %>% forecast(h=7) %>% autoplot

pipe1_forecast <- pipe1_arima %>% forecast(h=7)
#pipe1_forecast$mean %>% data.frame() %>% write.csv("pipe1_forecast.csv")
pipe1_forecast$mean
## Time Series:
## Start = c(10, 22) 
## End = c(11, 4) 
## Frequency = 24 
## [1] 19.88428 19.88428 19.88428 19.88428 19.88428 19.88428 19.88428

Pipe 2

Visuals

Based on the time series plot and the ACF/PACF, the time series appears stationary. The series is given a frequency of 24 (hourly observations with a daily cycle).

# plot time series
pipe2_agg %>% plot_time_series(Date.Time,Flow)
# create ts object
pipe2_ts <- pipe2_agg$Flow %>% ts(frequency = 24)

# check autocorrelation
pipe2_ts %>% acf

pipe2_ts %>% pacf

Model & Forecast

The model selected by auto.arima is ARIMA(0,0,0)(0,0,1)[24] with non-zero mean - white noise plus a seasonal MA(1) term at the daily lag.

pipe2_arima <- auto.arima(pipe2_ts)
pipe2_arima %>% forecast(h=7) %>% autoplot

pipe2_forecast <- pipe2_arima %>% forecast(h=7)
#pipe2_forecast$mean %>% data.frame() %>% write.csv("pipe2_forecast.csv")
pipe2_forecast$mean
## Time Series:
## Start = c(42, 17) 
## End = c(42, 23) 
## Frequency = 24 
## [1] 39.56376 40.87797 41.88069 39.63039 41.10972 40.19510 38.33596
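
Note the forecasts above use h = 7, i.e., seven hours ahead. A full week of hourly forecasts is h = 7 * 24 = 168 steps; a hedged sketch (the file name is illustrative) for exporting both pipes' week-ahead means to an Excel-readable csv:

# week-ahead (168 hourly steps) mean forecasts for both pipes
week_fc <- data.frame(
  hour_ahead = 1:168,
  Pipe1 = as.numeric(forecast(pipe1_arima, h = 168)$mean),
  Pipe2 = as.numeric(forecast(pipe2_arima, h = 168)$mean)
)
# write.csv(week_fc, "pipes_week_forecast.csv", row.names = FALSE)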