library(tidyverse)
library(timetk)
library(lubridate)
library(DataExplorer)
library(ggfortify)
library(forecast)
library(caret)
library(fpp)
library(DMwR)
library(imputeTS)
library(zoo)
library(kableExtra)
setwd('C:/Users/robbj/OneDrive/CUNY SPS/DATA624/Project1')
Objective: Forecast cash withdrawals for 4 ATMs for May 2010.
The raw data contain 3 variables:
* Date: character of sequential dates in mm/dd/YYYY format beginning on 5/1/2009 and ending 4/30/2010
* ATM: character indicating unique ATM associated with the observation
* Cash: integer indicating total withdrawal amount associated with the observation in hundreds of USD
The data are in long form and include 1474 observations. Some basic data wrangling was performed, including coercing the Date and ATM variables to Date and factor types, respectively. Fourteen observations contained missing ATM and Cash values and were removed, leaving 1460 observations.
# import raw dataset and separate out the DateTime
ATM_raw <- read.csv('ATM624Data.csv',na.strings="NA") %>%
separate(DATE, c('Date','Time'), sep=" ") %>%
select(-Time)
# change data to a Date type
ATM_raw$Date <- as.Date(ATM_raw$Date, format="%m/%d/%Y")
# make ATM ID a factor
ATM_raw$ATM <- as.factor(ATM_raw$ATM)
# write to different df object
ATM_df <- ATM_raw
# remove observations with blank ATM
ATM_df <- ATM_df %>% filter(ATM!="")
The data were examined to provide high-level insights. The distributions of the 4 ATMs were visualized using boxplots and density plots, output below. Note the density plots omit observations where Cash is below 5 or above 200.
# boxplots
ATM_df %>%
ggplot(aes(x=Cash)) +
geom_boxplot(fill="dodgerblue", alpha=0.5)+
facet_wrap(~ATM, scales="free") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")
)
# density plots
ATM_df %>%
ggplot(aes(x=Cash))+
geom_density(fill="dodgerblue", lwd=1, alpha=0.5) +
xlim(5,200)+
facet_wrap(~ATM, scales="free") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")
)
Individual ATMs show different patterns of cash withdrawals over the year of data provided. Each ATM will be analyzed separately: missing values will be examined (and imputed if applicable), outliers will be removed, autocorrelation will be checked, and day-of-week and seasonal patterns will be visualized.
ATM1 has three missing values in June of 2009 and no observations egregiously outside the norm. The three missing values were imputed using linear interpolation.
The interactive time series plot, fit with a smooth line, shows that cash withdrawals were relatively constant throughout the year but slightly higher in the early spring and summer months. Thursday withdrawals showed the most drastic difference, much lower than the other days of the week (median of 16,000); the next lowest was Monday, with a median of 85,000. The partial autocorrelation function shows the importance of day of the week on cash withdrawals: there are significant spikes at lags of 7 and 14.
# create df for just ATM1
ATM1_df <- ATM_df %>%
filter(ATM=='ATM1')
# create ts object
ATM1_ts <- ATM1_df %>%
select(Cash) %>%
ts(frequency=365)
## which observations are missing: ##
# ATM1_df %>%
# filter(is.na(Cash))
# impute missing values
ATM1_ts <- na_interpolation(ATM1_ts)
# update the df
ATM1_df <- tk_tbl(ATM1_ts, preserve_index = F) %>%
bind_cols(Date=ATM1_df$Date)
# plot time series
ATM1_df %>%
plot_time_series(Date,Cash, .title="ATM1", .plotly_slider=T)
# check for anomalies
# ATM1_df %>% plot_anomaly_diagnostics(Date,Cash)
# seasonal diagnostics
ATM1_df %>%
plot_seasonal_diagnostics(Date, Cash, .feature_set=c("wday.lbl"),.title="ATM1: Day of Week")
ATM1_df %>%
plot_seasonal_diagnostics(Date, Cash, .feature_set=c("month.lbl" ),.title="ATM1: Month of Year")
# check autocorrelation
par(mfrow=c(1,2))
acf(ATM1_ts)
pacf(ATM1_ts)
ATM2 has 2 missing values and no extreme values for the year. The missing values were interpolated. As with ATM1, ATM2 shows cash withdrawals that are lower on Thursdays and in cold-weather months.
# create df for just ATM2
ATM2_df <- ATM_df %>%
filter(ATM=='ATM2')
# create ts object
ATM2_ts <- ATM2_df %>%
select(Cash) %>%
ts(start=c(2009,5), frequency=365)
# impute missing values
ATM2_ts <- na_interpolation(ATM2_ts)
# update the df
ATM2_df <- tk_tbl(ATM2_ts, preserve_index = F) %>%
bind_cols(Date=ATM2_df$Date)
# check for anomalies
# ATM2_df %>%
# plot_anomaly_diagnostics(Date,Cash)
# which observations are missing
# ATM2_df %>% filter(is.na(Cash))
# plot time series
ATM2_df %>%
plot_time_series(Date,Cash, .title="ATM2", .plotly_slider=T)
# seasonal diagnostics
ATM2_df %>%
plot_seasonal_diagnostics(Date, Cash, .feature_set=c("wday.lbl"),.title="ATM2: Day of Week")
ATM2_df %>%
plot_seasonal_diagnostics(Date, Cash, .feature_set=c("month.lbl" ),.title="ATM2: Month of Year")
# check autocorrelations
par(mfrow=c(1,2))
acf(ATM2_ts)
pacf(ATM2_ts)
ATM4 has a clear outlier in the time series that was removed and replaced with an interpolated value. There were no other missing values in the time series. The same general patterns hold: the middle of the week has lower withdrawals, and warmer months are associated with slightly higher withdrawals. The correlograms indicate correlation of values lagged by 7 days.
# create df for just ATM4
ATM4_df <- ATM_df %>%
filter(ATM=='ATM4')
# create ts object
ATM4_ts <- ATM4_df %>%
select(Cash) %>%
ts(start=c(2009,5), frequency=365)
# check for anomalies
ATM4_df %>%
plot_anomaly_diagnostics(Date,Cash)
# Make the outlier an NA
ATM4_ts[ATM4_ts==10920] <- NA
# impute missing values
ATM4_ts <- na_interpolation(ATM4_ts)
# which observations are missing? NONE
# ATM4_df %>%
# filter(is.na(Cash))
# update the df
ATM4_df <- tk_tbl(ATM4_ts, preserve_index = F) %>%
bind_cols(Date=ATM4_df$Date)
# plot time series
ATM4_df %>%
plot_time_series(Date,Cash)
# seasonal diagnostics
ATM4_df %>%
plot_seasonal_diagnostics(Date, Cash, .feature_set=c("wday.lbl"),.title="ATM4: Day of Week")
ATM4_df %>%
plot_seasonal_diagnostics(Date, Cash, .feature_set=c("month.lbl" ),.title="ATM4: Month of Year")
# check autocorrelations
par(mfrow=c(1,2))
acf(ATM4_ts)
pacf(ATM4_ts)
mtext("Autocorrelations", side = 3, line =-1, outer = TRUE)
ATM3 only has three values, for the last three days of the time series. Three values are not enough to forecast future ATM3 withdrawals, but they can be used to determine whether another ATM can serve as a proxy. ATM1 and ATM3 have identical values for 4/28/2010 - 4/30/2010 (a strange coincidence), verified programmatically below the table. The model used to forecast ATM1's future Cash values will also be used for the ATM3 forecast.
# create df for just ATM3
ATM3_df <- ATM_df %>%
filter(ATM=='ATM3')
# create ts object
ATM3_ts <- ATM3_df %>%
select(Cash) %>%
ts(start=c(2009,5), frequency=365)
# only the last 3 dates are available
ATM3_df %>% plot_time_series(Date,Cash)
table <- data.frame(c(ATM1_df %>% tail(3),
ATM2_df %>% tail(3),
ATM3_df %>% tail(3),
ATM4_df %>% tail(3)
)
)
table %>% select(Date, 'ATM1'=Cash, 'ATM2'=Cash.1, "ATM3"=Cash.2, "ATM4"=Cash.3 ) %>% kbl() %>% kable_styling(bootstrap_options = c("striped", "hover"))
Date | ATM1 | ATM2 | ATM3 | ATM4 |
---|---|---|---|---|
2010-04-28 | 96 | 107 | 96 | 348 |
2010-04-29 | 82 | 89 | 82 | 44 |
2010-04-30 | 85 | 90 | 85 | 482 |
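The coincidence can be checked directly; a minimal sketch using the data frames built above:
# sanity check: the last 3 Cash values of ATM1 and ATM3 should be identical
all(tail(ATM1_df$Cash, 3) == tail(ATM3_df$Cash, 3))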
From the analysis of individual ATM cash withdrawal, there are several useful pieces of information that will govern the modeling process:
* ATM1, ATM2, and ATM4 have different distributions, means, and variances and require different models for forecasting
* Each ATM showed similar patterns: day of week and month of year are important
* ATM3 only has 3 values; ATM1’s data will be used as a proxy.
* There is no need for a mathematical transformation, such as Box-Cox (a quick check follows this list)
* There are complexities in the data set due to variations in day of the week and seasonal influence; that is, there are multiple seasonal effects. (Since there is only one year of data, we cannot be certain of a time-of-year seasonality, but the pattern was strong enough for each of the ATMs to make that assumption.)
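As a spot-check of the transformation point, the Guerrero estimates of the Box-Cox lambda can be inspected (a minimal sketch; values near 1 suggest a transformation would add little):
# estimate Box-Cox lambda for each ATM series
sapply(list(ATM1 = ATM1_ts, ATM2 = ATM2_ts, ATM4 = ATM4_ts), BoxCox.lambda)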
One of several techniques could be used:
* The daily data could be aggregated by week or month to reduce the complexity.
* Multiple seasonality could be incorporated into the model, one based on month and one based on day of the week (a sketch follows this list).
* Since there is only one year of data, the time of year seasonality is omitted and only weekly seasonality is considered in the model. This option is less complex than option 2, and is supported by the strong correlation on the PACF lag 7, and clear pattern on the day of week boxplots.
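For reference, the multiple-seasonality option could look like the following sketch using msts() and tbats() from the forecast package (not the approach taken here, and tbats() can be slow on daily data):
# sketch of option 2: weekly and annual seasonal periods modeled together
ATM1_msts <- msts(ATM1_df$Cash, seasonal.periods = c(7, 365.25))
ATM1_tbats <- tbats(ATM1_msts)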
The third option was selected to limit complexity and leverage the day-of-week patterns in each ATM's data. The time series objects were redefined in R with 7 days as the seasonal period.
ATM1_ts <- ATM1_df %>%
select(Cash) %>%
ts(frequency=7)
ATM2_ts <- ATM2_df %>%
select(Cash) %>%
ts(frequency=7)
ATM4_ts <- ATM4_df %>%
select(Cash) %>%
ts(frequency=7)
For each ATM, four candidate models were created:
# seasonal naive
naive1 <- snaive(ATM1_ts)
naive2 <- snaive(ATM2_ts)
naive4 <- snaive(ATM4_ts)
# random walk
rwf1 <- rwf(ATM1_ts)
rwf2 <- rwf(ATM2_ts)
rwf4 <- rwf(ATM4_ts)
# ARIMA
AR1 <- auto.arima(ATM1_ts, stepwise=F, approximation = F)
AR2 <- auto.arima(ATM2_ts, stepwise=F, approximation = F)
AR4 <- auto.arima(ATM4_ts, stepwise=F, approximation = F)
# ETS
ETS1 <- ets(ATM1_ts, model="ZZZ")
ETS2 <- ets(ATM2_ts, model="ZZZ")
ETS4 <- ets(ATM4_ts, model="ZZZ")
Given the small data set, the data were not split into train/test sets to evaluate model performance. Instead, candidate model accuracy was evaluated on fitted values. Although not ideal, this gives a general understanding of which family of models will be best for each ATM.
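A more rigorous alternative would be time series cross-validation; a minimal sketch using forecast::tsCV with the seasonal naive method on ATM1:
# 1-step-ahead cross-validation errors for the seasonal naive method
e <- tsCV(ATM1_ts, snaive, h = 1)
sqrt(mean(e^2, na.rm = TRUE))  # cross-validated RMSE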
# accuracy diagnostics
## ATM1
ATM1_acc <- rbind(accuracy(naive1), accuracy(rwf1), accuracy(AR1),accuracy(ETS1))
row.names(ATM1_acc) <- c("Naive", "Random Walk", "ARIMA", "ETS")
## ATM2
ATM2_acc <- rbind(accuracy(naive2), accuracy(rwf2), accuracy(AR2),accuracy(ETS2))
row.names(ATM2_acc) <- c("Naive", "Random Walk", "ARIMA", "ETS")
## ATM4
ATM4_acc <- rbind(accuracy(naive4), accuracy(rwf4), accuracy(AR4),accuracy(ETS4))
row.names(ATM4_acc) <- c("Naive", "Random Walk", "ARIMA", "ETS")
# table of results
ATM1_acc %>% kbl(caption = "ATM1: Model Accuracy") %>% kable_styling(bootstrap_options = c("striped", "hover"))
 | ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 |
---|---|---|---|---|---|---|---|
Naive | -0.0363128 | 27.74593 | 17.69832 | -96.53364 | 116.2264 | 1.0000000 | 0.1879628 |
Random Walk | -0.0302198 | 49.94443 | 37.20604 | -132.05619 | 166.7693 | 2.1022354 | -0.3557445 |
ARIMA | -0.0667763 | 23.23545 | 14.53882 | -102.49739 | 117.4116 | 0.8214799 | -0.0086713 |
ETS | -0.0482208 | 23.78972 | 15.07958 | -106.83543 | 121.8655 | 0.8520341 | 0.1439166 |
ATM2_acc %>% kbl(caption = "ATM2: Model Accuracy") %>% kable_styling(bootstrap_options = c("striped", "hover"))
 | ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 |
---|---|---|---|---|---|---|---|
Naive | 0.0223464 | 30.19721 | 20.78771 | -Inf | Inf | 1.0000000 | 0.0481966 |
Random Walk | -0.0467033 | 54.18008 | 42.54670 | -Inf | Inf | 2.0467240 | -0.3072004 |
ARIMA | -0.8907648 | 24.13932 | 17.04479 | -Inf | Inf | 0.8199456 | -0.0042753 |
ETS | -0.6885457 | 25.14201 | 17.86610 | -Inf | Inf | 0.8594547 | 0.0198839 |
ATM4_acc %>% kbl(caption = "ATM4: Model Accuracy") %>% kable_styling(bootstrap_options = c("striped", "hover"))
 | ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 |
---|---|---|---|---|---|---|---|
Naive | -3.6955307 | 453.0350 | 346.3883 | -383.0109 | 438.0732 | 1.0000000 | 0.0621953 |
Random Walk | -0.8104396 | 476.7094 | 386.2060 | -459.4828 | 518.9954 | 1.1149513 | -0.4639654 |
ARIMA | -0.2045415 | 342.4063 | 282.5505 | -501.9492 | 534.1686 | 0.8157047 | -0.0007724 |
ETS | -30.3919593 | 354.0891 | 276.5696 | -434.2023 | 466.1756 | 0.7984380 | 0.0570724 |
The ARIMA models performed slightly better than ETS for each ATM.
The residual plots show valid models: residuals have near-constant variance over time, are not autocorrelated, and approximate normal distributions, although ATM4 shows some skew in its residual distribution. The May 2010 forecasts are output below and are available as csv files.
# check residuals of final model
checkresiduals(AR1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(1,1,1)[7]
## Q* = 15.095, df = 11, p-value = 0.1782
##
## Model df: 3. Total lags used: 14
checkresiduals(AR2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 10.231, df = 9, p-value = 0.3321
##
## Model df: 5. Total lags used: 14
checkresiduals(AR4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(2,0,0)[7] with non-zero mean
## Q* = 15.694, df = 10, p-value = 0.1087
##
## Model df: 4. Total lags used: 14
# forecast plots
AR1 %>% forecast(h=31) %>% autoplot(main="ATM1 time series with forecast")
AR2 %>% forecast(h=31) %>% autoplot(main="ATM2 time series with forecast")
AR4 %>% forecast(h=31) %>% autoplot(main="ATM4 time series with forecast")
# write to a csv
forecast_ATM1 <- AR1 %>% forecast(h=31)
forecast_ATM1_mean<- forecast_ATM1$mean
forecast_ATM2 <- AR2 %>% forecast(h=31)
forecast_ATM2_mean<- forecast_ATM2$mean
forecast_ATM3_mean<- forecast_ATM1$mean
forecast_ATM4 <- AR4 %>% forecast(h=31)
forecast_ATM4_mean <- forecast_ATM4$mean
rows <- paste("May", 1:31)
data.frame(date= rows,
ATM1=forecast_ATM1_mean,
ATM2=forecast_ATM2_mean,
ATM3=forecast_ATM3_mean,
ATM4=forecast_ATM4_mean) %>%
kbl(caption = "May 2010 Forecasts") %>% kable_styling(bootstrap_options = c("striped", "hover"))
date | ATM1 | ATM2 | ATM3 | ATM4 |
---|---|---|---|---|
May 1 | 87.040565 | 66.436775 | 87.040565 | 375.1464 |
May 2 | 104.202236 | 72.717930 | 104.202236 | 429.8268 |
May 3 | 73.512143 | 13.674789 | 73.512143 | 468.2596 |
May 4 | 9.181767 | 5.297995 | 9.181767 | 333.0300 |
May 5 | 99.565861 | 97.634046 | 99.565861 | 442.4000 |
May 6 | 80.483913 | 88.290342 | 80.483913 | 378.1255 |
May 7 | 87.019776 | 64.988924 | 87.019776 | 417.3209 |
May 8 | 87.993527 | 66.444443 | 87.993527 | 383.3132 |
May 9 | 103.315610 | 72.727023 | 103.315610 | 452.7264 |
May 10 | 73.421988 | 13.663861 | 73.421988 | 442.6114 |
May 11 | 10.139357 | 5.294414 | 10.139357 | 378.1444 |
May 12 | 100.224832 | 97.645570 | 100.224832 | 432.4067 |
May 13 | 80.203740 | 88.288633 | 80.203740 | 388.2084 |
May 14 | 87.393031 | 64.979141 | 87.393031 | 444.0295 |
May 15 | 88.169634 | 66.450229 | 88.169634 | 426.8144 |
May 16 | 103.151761 | 72.733455 | 103.151761 | 443.2771 |
May 17 | 73.405327 | 13.655799 | 73.405327 | 446.1807 |
May 18 | 10.316320 | 5.292024 | 10.316320 | 421.2408 |
May 19 | 100.346610 | 97.653963 | 100.346610 | 441.7239 |
May 20 | 80.151964 | 88.287189 | 80.151964 | 427.8751 |
May 21 | 87.462008 | 64.972102 | 87.462008 | 440.5682 |
May 22 | 88.202178 | 66.454588 | 88.202178 | 434.1488 |
May 23 | 103.121482 | 72.737999 | 103.121482 | 444.5036 |
May 24 | 73.402248 | 13.649857 | 73.402248 | 443.7749 |
May 25 | 10.349022 | 5.290443 | 10.349022 | 432.7379 |
May 26 | 100.369114 | 97.660071 | 100.369114 | 441.9529 |
May 27 | 80.142396 | 88.285994 | 80.142396 | 434.8643 |
May 28 | 87.474755 | 64.967041 | 87.474755 | 443.1111 |
May 29 | 88.208193 | 66.457865 | 88.208193 | 440.1992 |
May 30 | 103.115887 | 72.741204 | 103.115887 | 443.6042 |
May 31 | 73.401679 | 13.645480 | 73.401679 | 443.8288 |
# the same data frame can be piped to write.csv("ATM_forecasts.csv") to export the forecasts
Objective: Provide monthly forecasts of residential power consumption for 2014.
The dataset contains 192 observations of power consumption in kilowatt-hours (KWH), aggregated monthly from 1998 to 2013.
# import
raw_data <- read.csv('ResidentialCustomerForecastLoad-624.csv', na.strings="NA")
# get structure
str(raw_data)
## 'data.frame': 192 obs. of 3 variables:
## $ CaseSequence: int 733 734 735 736 737 738 739 740 741 742 ...
## $ YYYY.MMM : chr "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
## $ KWH : int 6862583 5838198 5420658 5010364 4665377 6467147 8914755 8607428 6989888 6345620 ...
# change date type
raw_data$date <- as.yearmon(raw_data$YYYY.MMM, '%Y-%b') %>%
as.Date()
# convert to tibble
power_df <- raw_data %>%
select(date, KWH)
# convert to time series
power_ts <- power_df %>%
select(KWH) %>%
ts(start=1998, frequency=12)
There is only one missing value, for September 2008, and there is an outlier in July 2010 to be removed. Both values were imputed with linear interpolation.
power_df %>%
plot_time_series(date, KWH, .title="Power Consumption time series")
# remove July 2010 outlier
power_ts[power_ts==770523] <- NA
# impute the missing values
power_ts <- na_interpolation(power_ts)
power_df <- tk_tbl(power_ts)
power_df$index <- as.Date(power_df$index)
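For reference, the same cleaning could be done in one step with forecast's tsclean(), which replaces outliers (flagged internally by tsoutliers()) and interpolates missing values; a sketch applied to the raw series, not the approach used above:
# one-step alternative on the raw, uncleaned series
power_raw_ts <- ts(raw_data$KWH, start = 1998, frequency = 12)
power_ts_alt <- tsclean(power_raw_ts)  # should replace the July 2010 outlier and fill the NA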
From the interactive time series plot, an upward trend in consumption is seen, especially after 2008; this is more clearly visible in the trend component of the decomposition. The polar seasonal plot shows two seasonal peaks per year, one during the winter months and one during the summer. This makes sense considering the energy required to heat buildings in winter and cool them in summer. There is some increase in variance with the level of the time series, so a Box-Cox transformation will be considered.
# seasonal diagnostics
power_df %>%
plot_seasonal_diagnostics(index, KWH, .feature_set=c("month.lbl"), .title="Power consumption by month [KWH]")
power_df %>%
plot_seasonal_diagnostics(index, KWH, .feature_set=c("year"), .title="Power consumption by year [KWH]")
# seasonal plot
ggseasonplot(power_ts, polar=T, main="Power consumption season plot") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")
)
# decomposition
decompose(power_ts) %>% autoplot
#power_df %>% tk_stl_diagnostics(index, KWH)
# ggsubseriesplot(power_ts)
A Box-Cox transformation can stabilize the variance and was applied. Although an optimal value of lambda = -0.207 was calculated, lambda = 0 was selected because it is more interpretable (a log transformation). No additional transformations were applied to the data.
paste("optimal lambda: ", BoxCox.lambda(power_ts) %>% round(3))
## [1] "optimal lambda: -0.207"
power_ts_transform <- BoxCox(power_ts, lambda = 0)
power_ts_transform %>%
autoplot(main="Log transformed power consumption time series",
ylab="log of power consuption in KWH", color="dodgerblue", lwd=2)+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")
)
The data were split into train and test sets so candidate models can be evaluated on unseen data.
train <- window(power_ts_transform, end=c(2010,12))
test <- window(power_ts_transform, start=c(2011,1))
Four candidate models were created:
# seasonal naive
naive <- snaive(train)
# random walk
rwf <- rwf(train)
# ARIMA
AR <- auto.arima(train, stepwise = F, approximation = F)
# ETS
ETS <- ets(train, model="ZZZ")
Two tables of results are provided - one for the training set and one for the test set.
# accuracy diagnostics
acc_train <- rbind(accuracy(naive), accuracy(rwf), accuracy(AR),accuracy(ETS))
row.names(acc_train) <- c("Naive", "Random Walk", "ARIMA", "ETS")
# 1. table of training set accuracy
acc_train %>%
kbl(caption="Training set accuracy") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
 | ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 |
---|---|---|---|---|---|---|---|
Naive | 0.0088178 | 0.1139787 | 0.0913879 | 0.0537439 | 0.5840611 | 1.0000000 | 0.2563240 |
Random Walk | -0.0007046 | 0.2048886 | 0.1715962 | -0.0130818 | 1.0971481 | 1.8776687 | 0.1726307 |
ARIMA | -0.0003971 | 0.0783363 | 0.0614673 | -0.0043181 | 0.3930279 | 0.6725973 | -0.0017689 |
ETS | 0.0089396 | 0.0840044 | 0.0651353 | 0.0544209 | 0.4161158 | 0.7127344 | 0.2848504 |
# 2. table of test set accuracy
AR_test_acc <- AR %>%
forecast(h=24) %>%
accuracy(test)
ETS_test_acc <- ETS %>%
forecast(h=24) %>%
accuracy(test)
acc <- rbind(accuracy(naive,test), accuracy(rwf,test), AR_test_acc, ETS_test_acc )
# get just the test set results
acc <- acc[c(2,4,6,8),]
row.names(acc) <- c("Naive", "Random Walk", "ARIMA", "ETS")
acc %>%
kbl(caption = "Test set accuracy") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
 | ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 | Theil’s U |
---|---|---|---|---|---|---|---|---|
Naive | 0.0792027 | 0.1612895 | 0.1363807 | 0.4965889 | 0.8601012 | 1.492328 | 0.3665427 | 0.7680838 |
Random Walk | 0.2081387 | 0.3136470 | 0.2648156 | 1.2922402 | 1.6570476 | 2.897709 | 0.3955775 | 1.2454857 |
ARIMA | 0.1014687 | 0.1420595 | 0.1116912 | 0.6374437 | 0.7029380 | 1.222166 | 0.2163076 | 0.6840179 |
ETS | 0.1083409 | 0.1466202 | 0.1229850 | 0.6810322 | 0.7748039 | 1.345747 | 0.1390291 | 0.7056191 |
Based on the metrics calculated on the test set, the best model for the data is the ARIMA model: ARIMA(4,0,0)(0,1,1)[12].
A check of the residuals confirms a valid model: the residuals have approximately constant variance across the span of the series, are approximately normally distributed, and are not autocorrelated. Since a log transformation was applied earlier, a back-transformation is needed to provide the final forecasts, which are output below and available as a csv file: power_forecast.csv
AR <- arima(power_ts_transform, order = c(4L, 0L, 0L),
seasonal = list(order = c(0L, 1L, 1L), period = 12))
checkresiduals(AR)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,0,0)(0,1,1)[12]
## Q* = 16.788, df = 19, p-value = 0.6042
##
## Model df: 5. Total lags used: 24
AR %>%
forecast(h=12) %>%
autoplot(main="Power consumption time series with 2014 forecasts (log-scale)" )+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")
)
# backtransformation
forecast_trans <- AR %>% forecast(h=12)
forecast_orig <- exp(forecast_trans$mean) %>% data.frame()
colnames(forecast_orig) <- "KWH_2014"
rownames(forecast_orig) <- c(1:12)
forecast_orig %>% ts(start=c(2014,1),frequency=12)
## Jan Feb Mar Apr May Jun Jul Aug
## 2014 10512431 8375656 7083053 5914195 5626279 7596631 8501090 9073518
## Sep Oct Nov Dec
## 2014 7900939 5662400 5543183 7699813
# write to csv
# forecast_orig %>% write.csv("power_forecast.csv")
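One caveat on the exp() back-transformation: exponentiating the point forecasts returns medians (under normal errors on the log scale), not means. As a sketch, the same model could be fit with forecast::Arima using lambda = 0 and biasadj = TRUE so the back-transformed forecasts approximate means:
# alternative: let Arima() handle the log transform and bias adjustment
AR_alt <- Arima(power_ts, order = c(4, 0, 0),
                seasonal = list(order = c(0, 1, 1), period = 12),
                lambda = 0, biasadj = TRUE)
forecast(AR_alt, h = 12)$mean  # forecasts already on the KWH scale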
These are simple 2-column data sets; however, they have different time stamps. The optional assignment is to time-base sequence the data and aggregate based on hour (for multiple recordings within an hour, take the mean), then determine whether the data are stationary and can be forecast. If so, provide a week-forward forecast and present the results via RPubs and .rmd, with the forecast in an Excel-readable file.
Data were imported and cleaned. A POSIXct data type was used for the Date.Time variable, and the data were aggregated into hourly intervals.
library(lubridate)
pipe1 <- read.csv('Waterflow_Pipe1.csv')
pipe2 <- read.csv('Waterflow_Pipe2.csv')
tail(pipe1)
## Date.Time WaterFlow
## 995 11/1/15 10:09 PM 15.28143
## 996 11/1/15 10:09 PM 26.33668
## 997 11/1/15 10:25 PM 29.06427
## 998 11/1/15 11:08 PM 22.84410
## 999 11/1/15 11:34 PM 16.21870
## 1000 11/1/15 11:35 PM 21.21171
tail(pipe2)
## Date.Time WaterFlow
## 995 12/3/15 11:00 AM 72.96677
## 996 12/3/15 12:00 PM 31.48311
## 997 12/3/15 1:00 PM 66.81673
## 998 12/3/15 2:00 PM 42.93666
## 999 12/3/15 3:00 PM 33.40133
## 1000 12/3/15 4:00 PM 66.68147
# convert Date.Time to a POSIXct data type
pipe1$Date.Time <- as.POSIXct(strptime(pipe1$Date.Time, "%m/%d/%y %I:%M %p"))
pipe2$Date.Time <- as.POSIXct(strptime(pipe2$Date.Time, "%m/%d/%y %I:%M %p"))
pipe1_agg <- pipe1 %>%
summarize_by_time(Date.Time,
.by="hour",
Flow=mean(WaterFlow))
pipe2_agg <- pipe2 %>%
summarize_by_time(Date.Time,
.by="hour",
Flow=mean(WaterFlow))
Based on the time series plot and the ACF/PACF, the time series appears stationary.
# plot time series
pipe1_agg %>% plot_time_series(Date.Time,Flow)
# create ts object
pipe1_ts <- pipe1_agg$Flow %>% ts(frequency = 24)
# check autocorrelation
pipe1_ts %>% acf(lag.max = 50)
pipe1_ts %>% pacf(lag.max = 50)
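The visual assessment can be corroborated with a unit root test; a minimal sketch (tseries is attached via the fpp package, and this assumes no gaps remain after the hourly aggregation). The same check applies to pipe2_ts below.
# augmented Dickey-Fuller test: a small p-value supports stationarity
adf.test(pipe1_ts)
# estimated number of differences required; 0 supports stationarity
ndiffs(pipe1_ts)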
auto.arima selected an ARIMA(0,0,0) model, i.e., white noise around a constant mean, so the point forecasts are flat at the series mean (a mean forecast rather than a random walk). The forecasts are output below.
pipe1_arima <- auto.arima(pipe1_ts)
pipe1_arima %>% forecast(h=7) %>% autoplot
pipe1_forecast <- pipe1_arima %>% forecast(h=7)
#pipe1_forecast$mean %>% data.frame() %>% write.csv("pipe1_forecast.csv")
pipe1_forecast$mean
## Time Series:
## Start = c(10, 22)
## End = c(11, 4)
## Frequency = 24
## [1] 19.88428 19.88428 19.88428 19.88428 19.88428 19.88428 19.88428
Based on the time series plot and the ACF/PACF, the time series appears stationary. As with Pipe 1, the series is given a frequency of 24.
# plot time series
pipe2_agg %>% plot_time_series(Date.Time,Flow)
# create ts object
pipe2_ts <- pipe2_agg$Flow %>% ts(frequency = 24)
# check autocorrelation
pipe2_ts %>% acf
pipe2_ts %>% pacf
auto.arima selected an ARIMA(0,0,0)(0,0,1)[24] model with non-zero mean, which adds a seasonal moving average term at the daily lag. The forecasts are output below.
pipe2_arima <- auto.arima(pipe2_ts)
pipe2_arima %>% forecast(h=7) %>% autoplot
pipe2_forecast <- pipe2_arima %>% forecast(h=7)
#pipe2_forecast$mean %>% data.frame() %>% write.csv("pipe2_forecast.csv")
pipe2_forecast$mean
## Time Series:
## Start = c(42, 17)
## End = c(42, 23)
## Frequency = 24
## [1] 39.56376 40.87797 41.88069 39.63039 41.10972 40.19510 38.33596