Source Code: https://github.com/djlofland/DATA624_PredictiveAnalytics/tree/master/Project_1
library(fpp2) # Hyndman's Forecasting Principles and Practice companion package
# === libraries imported by fpp2 ===
# library(forecast)
# library(ggplot2)
# library(purrr)
# --- fpp2 suggests ---
library(GGally) # ggplot2 extensions (pairwise plots)
library(gridExtra) # arrange multiple grid-based plots (grid.arrange)
library(seasonal) # interface to X-13ARIMA-SEATS seasonal adjustment
library(tidyverse) # dplyr, tidyr, ggplot2 and friends
library(vars) # vector autoregression (VAR) modeling
# === Time Series Tools ===
library(timetk) # toolkit for working with time series in tidy data frames
library(imputeTS) # Tools for imputing missing time series data points
library(tseries) # Time series analysis and computational finance
library(zoo) # Infrastructure for Regular and Irregular Time Series
library(prophet) # Facebook forecasting library
library(fable) # Tidy wrapper for forecasting
library(feasts) # collection of tools for the analysis of time series data
library(tsibble) # Tidy Time series data wrapper
library(fable.prophet) # enable FB prophet through fable
library(dtw) # Dynamic time warping to align and compare time series
library(dtwclust) # DTW clustering tools (includes DBA averaging)
# === Other Support ===
library(data.table)
library(ModelMetrics) # additional criteria for evaluating models
library(dbplyr)
library(plotly) # interactive charts
library(lubridate) # date time manipulation
library(naniar) # tools for working with na's
#library(caret)
#library(mlbench)
#library(AppliedPredictiveModeling)
#library(corrplot)
#library(RColorBrewer)
#library(e1071)
library(lattice)
#library(hablar)
#library(broom)
#' Print common ACF plots to help determine if stationary
#'
#' @param series A time series
#' @param lambda the Box Cox lambda transform to use
#' @examples
#' acf_plots(my_series, 0.5)
#' @export
acf_plots <- function(series, lambda) {
  series_bc <- BoxCox(series, lambda)

  p1 <- autoplot(series) + ggtitle('Original Time Series')
  p2 <- autoplot(series_bc) + ggtitle(paste('Box Cox Transformed Time Series with lambda=', lambda))
  p3 <- ggAcf(series_bc)
  p4 <- ggPacf(series_bc)
  grid.arrange(p1, p2, p3, p4, nrow=2)

  # ADF tests on the 1st- and 2nd-order differenced series
  series_bc_diff <- diff(series)
  a1 <- adf.test(series_bc_diff)
  print(a1)

  # b1 <- Box.test(series_bc_diff, lag=10, type='Ljung-Box')
  # print(b1)
  # k1 <- kpss.test(series_bc_diff, null='Trend')
  # print(k1)

  series_bc_diff2 <- diff(series_bc_diff)
  a2 <- adf.test(series_bc_diff2)
  print(a2)

  # b2 <- Box.test(series_bc_diff2, lag=10, type='Ljung-Box')
  # print(b2)
  # k2 <- kpss.test(series_bc_diff2, null='Trend')
  # print(k2)

  p1 <- autoplot(series_bc_diff) + ggtitle('Differencing Order 1')
  p2 <- autoplot(series_bc_diff2) + ggtitle('Differencing Order 2')
  grid.arrange(p1, p2, nrow=1)
}
#' Return a properly rounded Box Cox lambda (-n, ..., -1, -0.5, 0, 1, ..., n)
#'
#' @param series A time series
#' @examples
#' round_lambda(my_series)
#' @return new_lambda
#' @export
round_lambda <- function(series) {
  lambda <- BoxCox.lambda(series)

  if ((lambda > 0.25) & (lambda < 0.75)) {
    new_lambda <- 0.5
  } else if ((lambda > -0.75) & (lambda < -0.25)) {
    new_lambda <- -0.5
  } else {
    new_lambda <- round(lambda)
  }

  print(paste('lambda:', lambda, ', rounded lambda:', new_lambda))
  return(new_lambda)
}

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable 'Cash' is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to give this a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file. Please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast in an Excel readable file.
# Load the Excel data
atm_df <- readxl::read_excel('./datasets/ATM624Data.xlsx')
# Fix the excel date format - convert to standard R date
atm_df$DATE <- as.Date(atm_df$DATE, origin = "1899-12-30")
# Make sure our Data is sorted by date
atm_df <- atm_df[order(atm_df$DATE),]
# inspect data.frame to make sure things look fine
knitr::kable(atm_df[1:10, 1:3],
caption = 'ATM Dataset (1st few rows)',
format="html",
table.attr="style='width:50%;'") %>%
kableExtra::kable_styling()

| DATE | ATM | Cash |
|---|---|---|
| 2009-05-01 | ATM1 | 96.0000 |
| 2009-05-01 | ATM2 | 107.0000 |
| 2009-05-01 | ATM3 | 0.0000 |
| 2009-05-01 | ATM4 | 776.9934 |
| 2009-05-02 | ATM1 | 82.0000 |
| 2009-05-02 | ATM2 | 89.0000 |
| 2009-05-02 | ATM3 | 0.0000 |
| 2009-05-02 | ATM4 | 524.4180 |
| 2009-05-03 | ATM1 | 85.0000 |
| 2009-05-03 | ATM2 | 90.0000 |
Let's do a quick plot of the raw data to get a sense for how it looks, its scale, and any issues or trends.
# Plot data for inspection using ggplot. Note I used a log_scale to handle the huge outlier
# ggplot(atm_df, aes(x=DATE, y=Cash, color=ATM, group=ATM)) +
# geom_line() +
# scale_y_log10()
# Plot data for inspection using plotly for interactive exploration. Note I used a log y-scale to handle the huge outlier
fig <- atm_df %>%
group_by(ATM) %>%
plot_ly(x= ~DATE) %>%
add_lines(y= ~Cash, color = ~factor(ATM)) %>%
layout(title = "Raw ATM Withdrawals by Date",
xaxis = list(title="Date"),
yaxis = list(title="Daily Cash Withdrawal (Hundreds of Dollars)", type = "log"))
fig

Initial takeaways are:

- ATM 1 and ATM 2 operate at broadly similar levels and show a repeating weekly pattern.
- ATM 3 reports $0 for almost the entire year, with real values only in the last few days.
- ATM 4 has a much higher daily withdrawal level and one extreme outlier.
I'm using the naniar package to inspect for missing values in the dataset. As we can see, there are 14 rows missing both the ATM identifier and the Cash value. In addition, we have 5 rows missing only the Cash value.
# Various NA plots to inspect data
knitr::kable(miss_var_summary(atm_df),
caption = 'Missing Values',
format="html",
table.attr="style='width:50%;'") %>%
kableExtra::kable_styling()

| variable | n_miss | pct_miss |
|---|---|---|
| Cash | 19 | 1.2890095 |
| ATM | 14 | 0.9497965 |
| DATE | 0 | 0.0000000 |
We have 14 empty rows with a DATE but no ATM identifier or Cash value - these were probably added as placeholders for the dates we need to forecast. I'm going to drop them for now so they don't interfere. We'll re-add forecast rows later.
# Drop any rows that are missing both ATM and Cash values
atm_clean_df <- atm_df %>%
  filter(!(is.na(ATM) & is.na(Cash)))

# Confirm 14 rows were dropped
print(c('Original: ', nrow(atm_df), 'After Drop:', nrow(atm_clean_df)))

## [1] "Original: " "1474" "After Drop:" "1460"
We have 5 rows with an ATM identifier but no Cash value. I will impute these values using the na_kalman() function from the imputeTS package. Since financial transactions often have weekly cycles, I want to make sure the imputation captures the weekly cycle along with any overall trend. Note, I knew there was one problematic outlier from the exploratory graph above. If we had found several outliers, I probably would have moved to a z-score based approach to flag all values more than +3 standard deviations above the mean (only on the high side, since low values down to $0 are fine). I would have then replaced all those outlier values with NA and imputed them with na_kalman() below. Note that ATM 3 didn't have any missing values; however, it reports all 0's until the last few days - we'll handle that problem in a later section.
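As a minimal sketch of that z-score approach (not actually run here; atm_z_df and the per-machine grouping are illustrative assumptions, with the +3 SD cutoff taken from the text above):

# Sketch: flag high-side outliers per machine with a z-score and set them to NA
# so that na_kalman() would impute them along with the genuinely missing values.
atm_z_df <- atm_clean_df %>%
  group_by(ATM) %>%
  mutate(z = (Cash - mean(Cash, na.rm = TRUE)) / sd(Cash, na.rm = TRUE),
         Cash = ifelse(!is.na(z) & z > 3, NA_real_, Cash)) %>%
  ungroup() %>%
  select(-z)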
# Get highest cash value.
max_cash <- max(atm_clean_df$Cash, na.rm = TRUE)
max_date <- atm_clean_df %>%
filter(Cash==max_cash)
# Replace the outlier with NA so we impute it in the next step
atm_clean_df$Cash[atm_clean_df$ATM==max_date$ATM & atm_clean_df$DATE==max_date$DATE] <- NA

We have one extreme outlier for ATM4 on 2010-02-09 with a value of 10,919.76 (in hundreds of dollars, i.e. roughly $1.09M). Since this doesn't line up with a known holiday that we might need to account for, I'm going to set it to NA, and when we impute, we'll replace the outlier with a more reasonable value.
# Get each unique ATM machine so we can loop over them
machines <- unique(atm_clean_df$ATM)
# As we impute each machine, we'll construct a new data.frame
atm_impute_df <- data.frame()
# Loop thru ATM machines
for (machine in machines) {
  # Grab the rows for just one machine at a time
  single_machine <- atm_clean_df %>%
    filter(ATM == machine)

  # Are there any NA's?
  has_na <- sum(is.na(single_machine$Cash))

  # If we found an NA, then impute, otherwise skip the impute (so no errors)
  if (has_na > 0) {
    imp <- single_machine %>%
      na_kalman()

    # Plot the time series with imputed values for visual inspection
    print(ggplot_na_imputations(single_machine$Cash, imp$Cash) +
            ggtitle(paste('Imputed Values for ', machine)))
  } else {
    imp <- single_machine
  }

  # Construct new data.frame for model building
  atm_impute_df <- bind_rows(atm_impute_df, imp)
}

So, it appears ATM 3 is a new machine that was brought online at the end of the data-tracking period, so we only have a few real values. We have a couple of options: model ATM 3 from the handful of observations it does have, or back-fill its history from machines that behave similarly (ATM 1 and ATM 2) and model that.
Let’s check the cross correlation first.
atm1 <- atm_impute_df %>% filter(ATM=='ATM1') %>% select(Cash)
atm2 <- atm_impute_df %>% filter(ATM=='ATM2') %>% select(Cash)
atm4 <- atm_impute_df %>% filter(ATM=='ATM4') %>% select(Cash)
# Pairwise Cross correlation
ccf(atm1, atm2, type="correlation")

# Plot data for inspection using plotly for interactive exploration. Note I used a log y-scale to handle the huge outlier
fig <- atm_impute_df %>%
group_by(ATM) %>%
plot_ly(x= ~DATE) %>%
add_lines(y= ~Cash, color = ~factor(ATM)) %>%
layout(title = "ATM Withdrawals by Date (after imputation)",
xaxis = list(title="Date"),
yaxis = list(title="Daily Cash Withdrawal (Hundreds of Dollars)", type = "log"))
fig

We see a significant correlation at lag=0 along with the increments \(\pm7\). Comparing the different combinations, ATM 1 & 2 have the most significant lag correlation spike. Given this, it appears the ATMs, while they operate at different levels, move in similar daily directions and have strong weekly patterns. The few data points we have for ATM 3 are fairly close to what we see in ATM 1 and ATM 2. Also, we note that ATM 4 has a significantly higher daily withdrawal than 1, 2 or 3. With this in mind, we will proceed to construct a mean time series based on ATM 1 and ATM 2. We will be using Dynamic Time Warping (DTW) via the dtw and dtwclust packages. Rather than a simple day-over-day comparison (ATM 1 on day 4 compared with ATM 2 on day 4), DTW is a more advanced approach for comparing and aligning time series: it maps patterns and finds nearest points, allowing both x and y to vary, and it can warp the data, stretching or compressing one time series to find better matches. The dba function from the dtwclust package is useful because it returns a mean time series, given any number of input time series, that best represents the patterns seen across them all. Behind the scenes, dba does pairwise hierarchical clustering: if we had 4 ATMs, it would average 1 & 2, then 3 & 4, then average the mean from 1 & 2 with the mean from 3 & 4, giving a final mean from all 4 ATMs.
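As a brief illustration of the pairwise comparison the dtw package provides (a hedged sketch, not part of the forecasting pipeline; alignment is just an illustrative name):

# Sketch: DTW alignment between ATM 1 and ATM 2. dtw() finds the optimal warping
# path; a smaller normalized distance means the two series move more alike.
alignment <- dtw(atm1$Cash, atm2$Cash, keep.internals = TRUE)
alignment$normalizedDistance # per-step alignment cost
plot(alignment, type = "twoway", offset = 200) # offset just separates the two curves visually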
While I'm going to use the mean (mainly because I want to play with dtw), we could also rationalize using, for each date, the max of ATM 1 and ATM 2 as our daily value for ATM 3. This really comes down to the cost function: is it worse to stock extra money that isn't withdrawn, or to let an ATM run out of money? Using the mean assumes the penalty is the same for each condition. If we knew that money was tight and it's OK to run out, we might use a min function; if we are more concerned with lost opportunity when a machine runs out, then we'd go with the max function.
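For completeness, a minimal sketch of the max/min alternative described above (illustrative only; atm3_alt is a hypothetical name):

# Sketch: element-wise max / min of ATM 1 and ATM 2 as alternative back-fills for ATM 3.
atm3_alt <- atm_impute_df %>%
  filter(ATM %in% c('ATM1', 'ATM2')) %>%
  pivot_wider(names_from = ATM, values_from = Cash) %>%
  mutate(ATM3_max = pmax(ATM1, ATM2), # never run short relative to ATM 1 & 2
         ATM3_min = pmin(ATM1, ATM2)) # never overstock relative to ATM 1 & 2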
I also round all the values up to the nearest ten-dollar amount (i.e. 1 decimal place, since Cash is in hundreds of dollars); for example, 96.43 (i.e. $9,643) rounds up to 96.5 ($9,650). Most ATM machines only dispense $20 increments, but some older ATMs dispensed $10 increments.
# Build a wide data.frame with only ATM 1 and ATM 2 (ATM 3's history will be rebuilt from these)
atm_12_df <- atm_impute_df %>%
filter(ATM=='ATM1' | ATM=='ATM2') %>%
pivot_wider(names_from = ATM, values_from=Cash) %>%
select(-DATE)
# the `dba` function from `dtwclust` package requires our data be transposed with
# a row for each time series and columns are the dates. It must also be a matrix
atm_12t_df <- transpose(atm_12_df)
colnames(atm_12t_df) <- atm_impute_df$DATE[1:365]
rownames(atm_12t_df) <- colnames(atm_12_df)
# Convert our data.frame to a matrix
atm_12t_m <- data.matrix(atm_12t_df, rownames.force=TRUE)
# Run dba to get a mean time series. This uses the dtwclust package, which uses
# dynamic time warping to align the series before averaging them.
atm_12_avg <- DBA(atm_12t_m, centroid = atm_12t_m[1,], trace=TRUE)

## DBA Iteration: 1, 2, 3, 4, 5, 6 - Converged!
# Turn the new mean into a time series and plot for inspection
start_ts <- min(single_machine$DATE)
start_ts <- c(year(start_ts), yday(start_ts))
end_ts <- max(single_machine$DATE)
end_ts <- c(year(end_ts), yday(end_ts))
atm_12_avg_ts <- ts(atm_12_avg, start=start_ts, frequency = 7)
autoplot(atm_12_avg_ts) + ggtitle('ATM 3 (Based on ATM 1 & 2 Average)')

# Put all our ATM time series back into a single data.frame so we can move on to modeling
atm_final_df <- atm_12_df
atm_final_df$ATM3 <- atm_12_avg_ts
atm4 <- atm_impute_df %>%
filter(ATM=='ATM4') %>%
select(Cash)
atm_final_df$ATM4 <- atm4$Cash
# I'm assuming ATMs don't give out smaller than $10 increments, so we round up to the
# nearest tenth (note `Cash` is in hundreds of dollars).
atm_final_df <- ceiling(atm_final_df * 10) / 10
# Finally switch this back to a long format with a DATE.
atm_final_df$DATE <- colnames(atm_12t_df)
atm_final_df <- atm_final_df %>%
pivot_longer(-DATE, names_to = "ATM", values_to = "Cash")

Based on the Augmented Dickey-Fuller tests and visual inspection below, we can say each ATM series is stationary. We see a strong lag-7 pattern, which aligns with an expected weekly seasonal pattern.
# Get each unique ATM machine so we can loop over them
machines <- unique(atm_final_df$ATM)
for (machine in machines) {
  # Grab the rows for just one machine at a time
  single_machine <- atm_final_df %>%
    filter(ATM == machine) %>%
    select(DATE, Cash)

  start_ts <- min(single_machine$DATE)
  start_ts <- c(year(start_ts), yday(start_ts))
  end_ts <- max(single_machine$DATE)
  end_ts <- c(year(end_ts), yday(end_ts))

  atm_ts <- ts(single_machine$Cash, start=start_ts, end=end_ts, frequency=365)

  lambda <- round_lambda(atm_ts)
  acf_plots(atm_ts, lambda)
}

## [1] "lambda: 1 , rounded lambda: 1"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -11.989, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -19.513, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
## [1] "lambda: 1 , rounded lambda: 1"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -11.718, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -18.477, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
## [1] "lambda: 1 , rounded lambda: 1"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -11.114, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -19.202, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
## [1] "lambda: 1 , rounded lambda: 1"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -10.409, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -16.803, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Below is also a plot of the distribution of withdrawals by day of the week over the entire duration. This could be used by the bank manager to help determine how much cash to provide on a daily basis. Since we saw no other seasonal patterns, cycles or trends, this weekly seasonal cycle is probably a good baseline. In fact, we could technically stop the analysis right here and use the upr value as our conservative prediction. If we are more concerned with putting out too much cash, then we would choose the mean.
library(Hmisc)
atm_final_df$week <- strftime(atm_final_df$DATE, '%V')
atm_final_df$day <- weekdays(as.Date(atm_final_df$DATE))
atm_grp2_df <- atm_final_df %>%
  group_by(ATM, day) %>%
  summarise(ci = list(mean_cl_normal(Cash) %>%
                        rename(mean=y, lwr=ymin, upr=ymax))) %>%
  unnest(cols = c(ci))
atm_grp2_df$mean <- ceiling(atm_grp2_df$mean * 10) / 10
atm_grp2_df$upr <- ceiling(atm_grp2_df$upr * 10) / 10
atm_grp2_df$lwr <- ceiling(atm_grp2_df$lwr * 10) / 10
ggplot(atm_grp2_df, aes(x=day, y=mean, colour=ATM)) +
geom_errorbar(aes(ymin=lwr, ymax=upr), width=.1) +
geom_line() +
geom_point() +
coord_trans(y = "log10") +
ggtitle('Cash Withdrawal by Day of Week') +
xlab('Day of Week') +
ylab('Hundreds of Dollars, log scale')

# inspect data.frame to make sure things look fine
knitr::kable(atm_grp2_df,
caption = 'Cash Withdrawal by Day of Week (Hundreds of $)',
format="html",
table.attr="style='width:50%;'") %>%
kableExtra::kable_styling()

| ATM | day | mean | lwr | upr |
|---|---|---|---|---|
| ATM1 | Friday | 98.7 | 90.3 | 107.0 |
| ATM1 | Monday | 86.1 | 80.3 | 91.9 |
| ATM1 | Saturday | 96.6 | 92.7 | 100.4 |
| ATM1 | Sunday | 102.7 | 97.0 | 108.5 |
| ATM1 | Thursday | 31.7 | 22.6 | 40.9 |
| ATM1 | Tuesday | 89.6 | 76.4 | 102.9 |
| ATM1 | Wednesday | 82.2 | 75.2 | 89.2 |
| ATM2 | Friday | 92.1 | 81.7 | 102.4 |
| ATM2 | Monday | 58.8 | 49.8 | 67.8 |
| ATM2 | Saturday | 76.0 | 68.4 | 83.7 |
| ATM2 | Sunday | 67.2 | 60.3 | 74.1 |
| ATM2 | Thursday | 26.6 | 16.8 | 36.4 |
| ATM2 | Tuesday | 73.3 | 61.8 | 84.8 |
| ATM2 | Wednesday | 44.4 | 35.4 | 53.4 |
| ATM3 | Friday | 97.6 | 89.7 | 105.4 |
| ATM3 | Monday | 85.0 | 78.8 | 91.2 |
| ATM3 | Saturday | 96.3 | 91.8 | 100.8 |
| ATM3 | Sunday | 99.7 | 94.5 | 104.8 |
| ATM3 | Thursday | 35.9 | 27.2 | 44.5 |
| ATM3 | Tuesday | 85.3 | 73.5 | 97.1 |
| ATM3 | Wednesday | 82.2 | 75.8 | 88.7 |
| ATM4 | Friday | 573.6 | 474.5 | 672.8 |
| ATM4 | Monday | 481.4 | 403.4 | 559.4 |
| ATM4 | Saturday | 491.6 | 401.8 | 581.5 |
| ATM4 | Sunday | 539.2 | 435.6 | 642.7 |
| ATM4 | Thursday | 170.1 | 99.6 | 240.5 |
| ATM4 | Tuesday | 445.8 | 324.2 | 567.5 |
| ATM4 | Wednesday | 413.8 | 344.0 | 483.6 |
Now we loop through each ATM machine, convert its data to a time series, and build a forecasting model. Based on the strong pACF spike at lag 7 for each machine, we apply a diff(7) when inspecting each series. Once we have the model, we'll predict the next 14 days for that machine and add it to our final dataset.
# Get each unique ATM machine so we can loop over them
machines <- unique(atm_final_df$ATM)
# Build the final output DF. Note we are using the original data minus the empty rows.
# The task is to provide output for the future 2 weeks, not to report our cleaned data.
atm_output_df <- atm_df %>%
  filter(!(is.na(ATM) & is.na(Cash)))
pred_df <- data.frame()
# Loop through each machine
for (machine in machines) {
  # Grab the rows for just one machine at a time
  single_machine <- atm_final_df %>%
    filter(ATM == machine)

  # Calculate time series start and end dates
  start_ts <- min(single_machine$DATE)
  start_ts <- c(year(start_ts), yday(start_ts))
  end_ts <- max(single_machine$DATE)
  end_ts <- c(year(end_ts), yday(end_ts))

  # Build the time series object (note we set the weekly seasonal pattern)
  atm_ts <- ts(single_machine$Cash, start=start_ts, frequency=7)
  atm_ts %>% diff(7) %>% ggtsdisplay(main=paste(machine, ' with Differencing=7'))

  model <- auto.arima(atm_ts)
  summary(model)

  print(model %>%
          forecast(14) %>%
          autoplot() +
          ggtitle(paste(machine, 'Arima Model')))
  print(checkresiduals(model))

  # Forecast the next 14 days and collect the predictions
  data <- predict(model, n.ahead=14)
  st <- as.Date(max(single_machine$DATE)) + 1
  en <- st + 13
  data$DATE <- seq.Date(st, en, by="1 day")

  predictions <- cbind(as.character(data$DATE), rep(machine, 14), as.numeric(data$pred))
  colnames(predictions) <- c('DATE', 'ATM', 'Cash')

  pred_df <- rbind(pred_df, predictions)
  pred_df$Cash <- as.numeric(pred_df$Cash)
  pred_df$Cash <- ceiling(pred_df$Cash)
}

## Series: atm_ts
## ARIMA(0,0,1)(0,1,2)[7]
##
## Coefficients:
## ma1 sma1 sma2
## 0.1697 -0.5866 -0.0990
## s.e. 0.0552 0.0508 0.0497
##
## sigma^2 estimated as 559.7: log likelihood=-1641.09
## AIC=3290.18 AICc=3290.29 BIC=3305.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.07363627 23.33147 14.60095 -102.7316 117.5976 0.8247044
## ACF1
## Training set -0.008129974
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,2)[7]
## Q* = 16.763, df = 11, p-value = 0.1151
##
## Model df: 3. Total lags used: 14
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,2)[7]
## Q* = 16.763, df = 11, p-value = 0.1151
## Series: atm_ts
## ARIMA(2,0,2)(0,1,1)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4318 -0.9159 0.4740 0.8107 -0.7552
## s.e. 0.0550 0.0400 0.0859 0.0548 0.0383
##
## sigma^2 estimated as 609: log likelihood=-1655.58
## AIC=3323.15 AICc=3323.39 BIC=3346.44
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.8946498 24.26803 17.12705 -Inf Inf 0.8188195 -0.004982575
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 10.638, df = 9, p-value = 0.3014
##
## Model df: 5. Total lags used: 14
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 10.638, df = 9, p-value = 0.3014
## Series: atm_ts
## ARIMA(0,0,1)(0,1,1)[7]
##
## Coefficients:
## ma1 sma1
## 0.1414 -0.6562
## s.e. 0.0521 0.0441
##
## sigma^2 estimated as 514.1: log likelihood=-1626.37
## AIC=3258.73 AICc=3258.8 BIC=3270.37
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.5063048 22.39344 14.94065 -58.47454 72.88051 0.82734 0.00165455
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,1)[7]
## Q* = 23.059, df = 12, p-value = 0.02723
##
## Model df: 2. Total lags used: 14
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,1)[7]
## Q* = 23.059, df = 12, p-value = 0.02723
## Series: atm_ts
## ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
##
## Coefficients:
## ma1 ma2 ma3 sar1 mean
## 0.0871 0.0156 0.0884 0.1839 444.759
## s.e. 0.0531 0.0583 0.0516 0.0526 26.079
##
## sigma^2 estimated as 119371: log likelihood=-2648.96
## AIC=5309.91 AICc=5310.15 BIC=5333.31
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.2162626 343.1262 287.3652 -514.7456 548.019 0.829546
## ACF1
## Training set -0.008222277
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
## Q* = 16.061, df = 9, p-value = 0.06562
##
## Model df: 5. Total lags used: 14
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
## Q* = 16.061, df = 9, p-value = 0.06562
# Append forecast values to end of original dataset
atm_output_df <- rbind(atm_output_df, pred_df)
# Round Cash up to the nearest integer
atm_output_df$Cash <- as.numeric(atm_output_df$Cash)
atm_output_df$Cash <- ceiling(atm_output_df$Cash)
# Save final dataset with forecasts appended to end
write.csv(atm_output_df, './datasets/atm_final.csv')

Note the residuals for each ATM ARIMA model look good. I just used auto.arima(). The final models are:

- ATM1: ARIMA(0,0,1)(0,1,2)[7]
- ATM2: ARIMA(2,0,2)(0,1,1)[7]
- ATM3: ARIMA(0,0,1)(0,1,1)[7]
- ATM4: ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
The final predictions are:
# inspect data.frame to make sure things look fine
pred_df <- pred_df %>%
pivot_wider(names_from = ATM, values_from=Cash)
knitr::kable(pred_df,
caption = '2 Week Forecasts for ATM Machines',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| DATE | ATM1 | ATM2 | ATM3 | ATM4 |
|---|---|---|---|---|
| 2010-05-01 | 88 | 67 | 88 | 363 |
| 2010-05-02 | 104 | 73 | 95 | 433 |
| 2010-05-03 | 74 | 14 | 74 | 451 |
| 2010-05-04 | 9 | 6 | 8 | 366 |
| 2010-05-05 | 101 | 98 | 102 | 428 |
| 2010-05-06 | 81 | 89 | 78 | 372 |
| 2010-05-07 | 87 | 65 | 87 | 452 |
| 2010-05-08 | 89 | 67 | 88 | 430 |
| 2010-05-09 | 103 | 73 | 95 | 443 |
| 2010-05-10 | 74 | 14 | 74 | 446 |
| 2010-05-11 | 9 | 6 | 8 | 431 |
| 2010-05-12 | 101 | 98 | 102 | 442 |
| 2010-05-13 | 81 | 89 | 78 | 432 |
| 2010-05-14 | 87 | 65 | 87 | 447 |
**Final forecast results are on GitHub** (see the repository linked at the top of this document).
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and provide a monthly forecast for 2014. The data is given in a single file. The variable 'KWH' is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.
# Load the Excel data
power_df <- readxl::read_excel('./datasets/ResidentialCustomerForecastLoad-624.xlsx',
col_names=FALSE, skip=1)
colnames(power_df) <- c('CASE', 'DATE', 'KWH')
# Fix the excel date format - convert to standard R date
power_df$DATE_YM <- yearmonth(power_df$DATE)
power_df$DATE <- as.Date(power_df$DATE_YM)
# Make sure our Data is sorted by date
power_df <- power_df[order(power_df$DATE),]
# inspect data.frame to make sure things look fine
knitr::kable(power_df[1:10, 1:4],
caption = 'Power DataFrame top 10 Rows',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| CASE | DATE | KWH | DATE_YM |
|---|---|---|---|
| 733 | 1998-01-01 | 6862583 | 1998 Jan |
| 734 | 1998-02-01 | 5838198 | 1998 Feb |
| 735 | 1998-03-01 | 5420658 | 1998 Mar |
| 736 | 1998-04-01 | 5010364 | 1998 Apr |
| 737 | 1998-05-01 | 4665377 | 1998 May |
| 738 | 1998-06-01 | 6467147 | 1998 Jun |
| 739 | 1998-07-01 | 8914755 | 1998 Jul |
| 740 | 1998-08-01 | 8607428 | 1998 Aug |
| 741 | 1998-09-01 | 6989888 | 1998 Sep |
| 742 | 1998-10-01 | 6345620 | 1998 Oct |
# Plot data for inspection using plotly for interactive exploration
fig <- power_df %>%
plot_ly(x= ~DATE) %>%
add_lines(y= ~KWH) %>%
layout(title = "Power Usage (KWH) by Month",
xaxis = list(title="Date"),
yaxis = list(title="Power (KWH)"))
fig

Takeaways:

- There is a clear yearly seasonal pattern, with peaks in the summer and winter months.
- There is a gradual upward trend over the years.
- There is one anomalously low reading that we will treat as an outlier below.
We need to check for any missing values in our data set. First let's check for explicit missing values (i.e. NAs). Note there is 1 missing KWH value.
# Various NA plots to inspect data
knitr::kable(miss_var_summary(power_df),
caption = 'Missing Values',
format="html",
table.attr="style='width:50%;'") %>%
kableExtra::kable_styling()

| variable | n_miss | pct_miss |
|---|---|---|
| KWH | 1 | 0.5208333 |
| CASE | 0 | 0.0000000 |
| DATE | 0 | 0.0000000 |
| DATE_YM | 0 | 0.0000000 |
We will use na_kalman() from the imputeTS package to impute any explicitly missing values. This approach has the advantage that it uses trend and seasonal patterns, so the filled-in values fit the context of the surrounding observations.
# Get the lowest KWH value (the suspiciously low outlier)
min_kwh <- min(power_df$KWH, na.rm = TRUE)
min_date <- power_df %>%
filter(KWH==min_kwh)
# Replace the outlier with NA so we impute it in the next step
power_df$KWH[power_df$DATE==min_date$DATE] <- NA
# Impute the missing values
imp <- power_df %>%
na_kalman()
# Plot the time series with imputed values for visual inspection
print(ggplot_na_imputations(power_df$KWH, imp$KWH) +
ggtitle(paste('Imputed Values')))

Next we double-check whether there are any implicit missing values, i.e. are there any gaps in our time series? Ideally, we want a continuous time series with a data point at each incremental step. We convert our DF to a tsibble and leverage the count_gaps() function. We see no gaps - cool.
# Convert our time series to a tsibble for convenience
power_tsb <- as_tsibble(power_df, index=DATE_YM, regular = TRUE, .drop = TRUE)
# Are there any gaps in the data (i.e. no data for a given hour)
count_gaps(power_tsb)

## # A tibble: 0 x 3
## # … with 3 variables: .from <mth>, .to <mth>, .n <int>
Let's check whether a transformation will be helpful by calculating the Box Cox lambda.
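The chunk that produced the output below is not shown in this rendering; a sketch of how it could look using the round_lambda() helper defined earlier (power_ts is a hypothetical name, and the exact series the author used may have differed slightly):

# Sketch: monthly time series from the imputed KWH values, then check the Box Cox lambda
power_ts <- ts(imp$KWH, start = c(1998, 1), frequency = 12)
round_lambda(power_ts)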
## [1] "lambda: -0.206647402717936 , rounded lambda: 0"
Recommended lambda is 0, so no transformation will be needed.
Let's visualize the seasonal and trend patterns. This will help us understand whether differencing will be helpful before modeling.
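The plotting chunk itself is not shown; one way to produce such a seasonal/trend view is an STL decomposition (a hedged sketch, reusing the hypothetical power_ts from above):

# Sketch: STL decomposition to separate the trend, annual seasonality and remainder
power_ts %>%
  stl(s.window = "periodic") %>%
  autoplot() +
  ggtitle('Residential Power Usage - STL Decomposition')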
We do see a general upward trend in the data. Given this, we will want to check the impact of differencing.
Checking the Box Cox lambda, we get a recommended \(\lambda=0.5\). Looking at the ACF and pACF plots along with the Dickey-Fuller test, there is a clear spike at lag 7, which would correspond to ~7 months. We would expect residential power to peak in summer and winter (cooling and heating), so a lag of 7 is close to, but not exactly, the ~6-month spacing we might expect. We will want to difference at lag 7 when modeling with ARIMA.
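Again, the chunk behind the output below is not shown; it presumably resembles a call to the acf_plots() helper (a sketch, with the same hypothetical power_ts caveat as above):

# Sketch: ACF/pACF plots and ADF tests via the helper defined at the top of the report
lambda_power <- round_lambda(power_ts)
acf_plots(power_ts, lambda_power)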
## [1] "lambda: 0.453125398020207 , rounded lambda: 0.5"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -10.409, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -16.803, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
We will create training and testing data sets to allow us to evaluate model performance on data that hasn't been seen. Since we will be forecasting future data, we want to evaluate with out-of-sample data. This is a balancing act: removing months at the end of 2013 from training leaves the model with less recent data, so it will perform even worse on 2014. I'll slice off the last 6 months and use that as a holdout test set. We'll train our various models using data through mid-2013 and evaluate on late 2013. This should help us identify the best candidate model for then predicting 2014. If we don't apply a holdout, we could end up with an overfit model that performs well on known data but poorly on future data.
# FB Prophet requires the column names to be ds and y
colnames(power_df) <- c('CASE', 'ds', 'y')
power_df$ds <- as.Date(power_df$ds)
# We want to create a train/test split to evaluate model performance on out of sample data
holdout_size <- 6
# Build the training DF and TS
power_train_df <- head(power_df, nrow(power_df) - holdout_size)
train_start <- min(power_train_df$ds)
train_start <- c(year(train_start), month(train_start))
power_train_ts <- ts(power_train_df$y, start=train_start, frequency = 12)
# Build the testing DF and TS
power_test_df <- tail(power_df, holdout_size)
test_start <- min(power_test_df$ds)
test_start <- c(year(test_start), month(test_start))
power_test_ts <- ts(power_test_df$y, start=test_start, frequency = 12)

# Build ARIMA model using the built-in auto.arima()
model2 <- auto.arima(power_train_ts, stepwise = FALSE, parallel = TRUE, approximation = FALSE)
summary(model2)

## Series: power_train_ts
## ARIMA(0,0,3)(2,1,0)[12] with drift
##
## Coefficients:
## ma1 ma2 ma3 sar1 sar2 drift
## 0.3458 0.0650 0.2303 -0.7213 -0.4695 8298.887
## s.e. 0.0758 0.0855 0.0692 0.0746 0.0793 2937.879
##
## sigma^2 estimated as 3.603e+11: log likelihood=-2563.72
## AIC=5141.44 AICc=5142.11 BIC=5163.55
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -7948.288 570427.7 424291.9 -0.7686357 6.531847 0.6854824
## ACF1
## Training set -0.006623981
print(model2 %>%
        forecast(6) %>%
        autoplot() +
        ggtitle('Training Set - Predict Holdout') +
        autolayer(power_test_ts)
)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(2,1,0)[12] with drift
## Q* = 12.398, df = 18, p-value = 0.826
##
## Model df: 6. Total lags used: 24
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(2,1,0)[12] with drift
## Q* = 12.398, df = 18, p-value = 0.826
pred_value <- model2 %>% forecast(holdout_size)
model2_rmse <- ModelMetrics::rmse(pred_value$mean, power_test_df$y)
print(paste('auto.arima() RMSE on Holdout Test Data: ', model2_rmse))

## [1] "auto.arima() RMSE on Holdout Test Data: 1077118.14410098"
power_output_df <- power_df
colnames(power_output_df) <- c('CaseSequence', 'DATE', 'KWH', 'YYYY-MMM')
power_output_df <- power_output_df %>%
select('CaseSequence', 'YYYY-MMM', 'KWH')
power_output_df$`YYYY-MMM` <- strftime(power_output_df$`YYYY-MMM`, '%Y-%b')
print(model2 %>%
        forecast(18) %>%
        autoplot() +
        ggtitle('Residential Power Forecast - Next 18 Months (6 holdout months + 2014)') +
        autolayer(power_test_ts)
)

data <- model2 %>% forecast(18)
st <- as.Date(max(power_train_df$ds)) %m+% months(1)
en <- st %m+% months(17)
data$ds <- seq.Date(st, en, by="1 month")
data$ds <- strftime(data$ds, '%Y-%b')
cs_st <- max(power_train_df$CASE) + 1
predictions <- cbind(seq(cs_st, cs_st+17), as.character(data$ds), as.numeric(data$mean))
colnames(predictions) <- c('CaseSequence', 'YYYY-MMM', 'KWH')
pred_df <- as.data.frame(predictions)
pred_df$KWH <- as.numeric(pred_df$KWH)
pred_df$KWH <- ceiling(pred_df$KWH)
# Append forecast values to end of original dataset
power_output_df <- rbind(power_output_df, pred_df)
# Save final dataset with forecasts appended to end
write.csv(power_output_df, './datasets/residential_power_final.csv')

knitr::kable(pred_df,
caption = '18 Month Forecast for Residential Power (last 6 months of 2013 + 2014)',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| CaseSequence | YYYY-MMM | KWH |
|---|---|---|
| 919 | 2013-Jul | 8719951 |
| 920 | 2013-Aug | 9395634 |
| 921 | 2013-Sep | 8261838 |
| 922 | 2013-Oct | 5942261 |
| 923 | 2013-Nov | 5619185 |
| 924 | 2013-Dec | 7032069 |
| 925 | 2014-Jan | 9393251 |
| 926 | 2014-Feb | 8539119 |
| 927 | 2014-Mar | 6619857 |
| 928 | 2014-Apr | 5991440 |
| 929 | 2014-May | 5915303 |
| 930 | 2014-Jun | 8195403 |
| 931 | 2014-Jul | 9624986 |
| 932 | 2014-Aug | 10096804 |
| 933 | 2014-Sep | 8623193 |
| 934 | 2014-Oct | 5909577 |
| 935 | 2014-Nov | 6116912 |
| 936 | 2014-Dec | 7726610 |
**Final forecast results are on GitHub** (see the repository linked at the top of this document).
Part C consists of two data sets. These are simple 2-column sets; however, they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (an example of what this looks like follows). Note for multiple recordings within an hour, take the mean. Then determine if the data is stationary and can be forecast. If so, provide a week-forward forecast and present the results via RPubs and the .rmd file, and the forecast in an Excel readable file.
# Load the Excel data
pipe1_df <- readxl::read_excel('./datasets/Waterflow_Pipe1.xlsx')
colnames(pipe1_df) <- c('DATE', 'WaterFlow')
pipe2_df <- readxl::read_excel('./datasets/Waterflow_Pipe2.xlsx')
colnames(pipe2_df) <- c('DATE', 'WaterFlow')
# Fix the excel date format - convert to standard R date
pipe1_df$DATE <- as.POSIXct(pipe1_df$DATE * 86400, origin = "1899-12-30", tz="GMT")
pipe2_df$DATE <- as.POSIXct(pipe2_df$DATE * 86400, origin = "1899-12-30", tz="GMT")
# Make sure our Data is sorted by date
pipe1_df <- pipe1_df[order(pipe1_df$DATE),]
pipe2_df <- pipe2_df[order(pipe2_df$DATE),]
# inspect data.frame to make sure things look fine
knitr::kable(pipe1_df[1:10, 1:2],
caption = 'Pipe 1 First 10 Rows',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| DATE | WaterFlow |
|---|---|
| 2015-10-23 00:24:06 | 23.369599 |
| 2015-10-23 00:40:02 | 28.002881 |
| 2015-10-23 00:53:51 | 23.065895 |
| 2015-10-23 00:55:40 | 29.972809 |
| 2015-10-23 01:19:17 | 5.997953 |
| 2015-10-23 01:23:58 | 15.935223 |
| 2015-10-23 01:50:05 | 26.617330 |
| 2015-10-23 01:55:33 | 33.282900 |
| 2015-10-23 01:59:15 | 12.426692 |
| 2015-10-23 02:51:51 | 21.833494 |
knitr::kable(pipe2_df[1:10, 1:2],
caption = 'Pipe 2 First 10 Rows',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| DATE | WaterFlow |
|---|---|
| 2015-10-23 01:00:00 | 18.810791 |
| 2015-10-23 01:59:59 | 43.087025 |
| 2015-10-23 03:00:00 | 37.987705 |
| 2015-10-23 04:00:00 | 36.120379 |
| 2015-10-23 04:59:59 | 31.851259 |
| 2015-10-23 06:00:00 | 28.238090 |
| 2015-10-23 07:00:00 | 9.863582 |
| 2015-10-23 07:59:59 | 26.679610 |
| 2015-10-23 09:00:00 | 55.773785 |
| 2015-10-23 10:00:00 | 54.156889 |
We have no missing data to deal with.
knitr::kable(miss_var_summary(pipe1_df),
caption = 'Pipe 1 Missing Data',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| variable | n_miss | pct_miss |
|---|---|---|
| DATE | 0 | 0 |
| WaterFlow | 0 | 0 |
knitr::kable(miss_var_summary(pipe2_df),
caption = 'Pipe 2 Missing Data',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| variable | n_miss | pct_miss |
|---|---|---|
| DATE | 0 | 0 |
| WaterFlow | 0 | 0 |
# Aggregate Pipe1 to the hour, sum flow
pipe1_hourly_df <- pipe1_df %>%
mutate(pipe='PIPE1',
hour = as.POSIXct(strftime(DATE, tz="GMT", format='%Y-%m-%d %H:00:00'))) %>%
group_by(hour) %>%
summarise(WaterFlow_tot = sum(WaterFlow))
# Aggregate Pipe2 to the hour, sum flow
pipe2_hourly_df <- pipe2_df %>%
mutate(pipe='PIPE2',
hour = as.POSIXct(strftime(DATE, tz="GMT", format='%Y-%m-%d %H:00:00'))) %>%
group_by(hour) %>%
summarise(WaterFlow_tot = sum(WaterFlow))
# Combine both Pipe 1 hourly reading and Pipe 2 hourly reading
pipes_hourly_df <- rbind(pipe1_hourly_df, pipe2_hourly_df)
# Aggregate both pipes by averaging their hourly flow
pipes_hourly_df <- pipes_hourly_df %>%
group_by(hour) %>%
summarise(WaterFlow_tot = mean(WaterFlow_tot))

Check for gaps in the time series - we will need to impute those gaps so we have a consistent, regular interval. There are 256 gaps throughout the data. I'm filling them with the na_interpolation() function from the imputeTS package, using the `spline` option.
# Convert our time series to a tsibble for convenience
pipes_tsb <- as_tsibble(pipes_hourly_df, index=hour)
# Are there any gaps in the data (i.e. no data for a given hour)
count_gaps(pipes_tsb)

## # A tibble: 256 x 3
## .from .to .n
## <dttm> <dttm> <int>
## 1 2015-10-27 17:00:00 2015-10-27 17:00:00 1
## 2 2015-11-01 01:00:00 2015-11-01 01:00:00 1
## 3 2015-11-01 05:00:00 2015-11-01 05:00:00 1
## 4 2015-11-02 02:00:00 2015-11-02 02:00:00 1
## 5 2015-11-02 05:00:00 2015-11-02 05:00:00 1
## 6 2015-11-02 08:00:00 2015-11-02 08:00:00 1
## 7 2015-11-02 11:00:00 2015-11-02 11:00:00 1
## 8 2015-11-02 14:00:00 2015-11-02 14:00:00 1
## 9 2015-11-02 17:00:00 2015-11-02 17:00:00 1
## 10 2015-11-02 20:00:00 2015-11-02 20:00:00 1
## # … with 246 more rows
# Fill the gaps and impute the missing hours using spline interpolation
pipes2_tsb <- pipes_tsb %>%
fill_gaps(.full=TRUE) %>%
na_interpolation(option="spline")
autoplot(pipes2_tsb)

We will apply a Box Cox transform with \(\lambda=0.5\). The series is stationary. We see a clear 3-hour-lag "seasonal" cycle, so we'll want to diff by 3 when building the model.
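The chunk behind the output below is not shown; a sketch using the helpers defined earlier (pipes_check_ts is a hypothetical name):

# Sketch: Box Cox lambda and stationarity checks on the hourly aggregated water flow
pipes_check_ts <- ts(pipes2_tsb$WaterFlow_tot)
lambda_pipes <- round_lambda(pipes_check_ts)
acf_plots(pipes_check_ts, lambda_pipes)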
## [1] "lambda: 0.343487379652572 , rounded lambda: 0.5"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -16.863, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -20.796, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
pipes2_ts <- as.ts(pipes2_tsb$WaterFlow_tot, frequency=3)
pipes2_ts %>% diff(3) %>% ggtsdisplay(main=paste('Pipes Waterflow with Differencing=3'))

# Build ARIMA model using the built-in auto.arima()
model3 <- auto.arima(pipes2_ts, stepwise = FALSE, parallel = TRUE, approximation = FALSE)
summary(model3)

## Series: pipes2_ts
## ARIMA(4,1,1)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1
## 0.1614 -0.3218 0.2678 -0.2427 -0.9752
## s.e. 0.0313 0.0307 0.0307 0.0312 0.0084
##
## sigma^2 estimated as 683.7: log likelihood=-4686.74
## AIC=9385.49 AICc=9385.57 BIC=9414.94
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.7803287 26.06852 19.88712 -27.35529 51.98199 0.6409768
## ACF1
## Training set 0.009053284
print(model3 %>%
        forecast(7*24) %>%
        autoplot() +
        ggtitle('Forecast WaterFlow (1 week forward)')
)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,1,1)
## Q* = 57.732, df = 5, p-value = 3.572e-11
##
## Model df: 5. Total lags used: 10
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,1,1)
## Q* = 57.732, df = 5, p-value = 3.572e-11
data <- model3 %>% forecast(7*24)

# Build the hourly timestamps for the forecast horizon (168 hours = 1 week forward)
st <- max(pipes2_tsb$hour) + as.difftime(1, units = "hours")
data$ds <- seq.POSIXt(st, by = "hour", length.out = 7*24)

predictions <- cbind(as.character(data$ds), as.numeric(data$mean))
colnames(predictions) <- c('DateTime', 'WaterFlow')
pred_df <- as.data.frame(predictions)
pred_df$WaterFlow <- as.numeric(pred_df$WaterFlow)
# Append forecast values to end of original dataset
pipes2_df <- data.frame(DateTime=pipes2_tsb$hour, WaterFlow=pipes2_tsb$WaterFlow_tot)
waterflow_output_df <- rbind(pipes2_df, pred_df)
colnames(waterflow_output_df) <- c('Date Time', 'WaterFlow')
# Save final dataset with forecasts appended to end
write.csv(waterflow_output_df, './datasets/waterflow_final.csv')

**Final forecast results are on GitHub** (see the repository linked at the top of this document).