Source Code: https://github.com/djlofland/DATA624_PredictiveAnalytics/tree/master/Project_1
library(fpp2) # Hyndman's Forecasting Principles and Practice companion package
# === libraries imported by fpp2 ===
# library(forecast)
# library(ggplot2)
# library(purrr)
# --- fpp2 suggests ---
library(GGally) # ggplot2 extensions (pairwise plots)
library(gridExtra) # arrange multiple grid-based plots (grid.arrange)
library(seasonal) # interface to X-13ARIMA-SEATS seasonal adjustment
library(tidyverse) # dplyr, tidyr, ggplot2 and friends
library(vars) # vector autoregression (VAR) modeling
# === Time Series Tools ===
library(timetk) # toolkit for working with time series in tidy data frames
library(imputeTS) # Tools for imputing missing time series data points
library(tseries) # Time series analysis and computational finance
library(zoo) # Infrastructure for Regular and Irregular Time Series
library(prophet) # Facebook forecasting library
library(fable) # Tidy wrapper for forecasting
library(feasts) # collection of tools for the analysis of time series data
library(tsibble) # Tidy Time series data wrapper
library(fable.prophet) # enable FB prophet through fable
library(dtw) # Dynamic time warping to align and compare time series
library(dtwclust) # DTW clustering tools (includes DBA averaging)
# === Other Support ===
library(data.table)
library(ModelMetrics) # additional criteria for evaluating models
library(dbplyr)
library(plotly) # interactive charts
library(lubridate) # date time manipulation
library(naniar) # tools for working with na's
#library(caret)
#library(mlbench)
#library(AppliedPredictiveModeling)
#library(corrplot)
#library(RColorBrewer)
#library(e1071)
library(lattice)
#library(hablar)
#library(broom)
#' Print common ACF plots to help determine if stationary
#'
#' @param series A time series
#' @param lambda the Box Cox lambda transform to use
#' @examples
#' acf_plots(my_series, 0.5)
#' @export
acf_plots <- function(series, lambda) {
  series_bc <- BoxCox(series, lambda)

  p1 <- autoplot(series) + ggtitle('Original Time Series')
  p2 <- autoplot(series_bc) + ggtitle(paste('Box Cox Transformed Time Series with lambda=', lambda))
  p3 <- ggAcf(series_bc)
  p4 <- ggPacf(series_bc)
  grid.arrange(p1, p2, p3, p4, nrow=2)

  # ADF tests on the 1st- and 2nd-order differenced series
  series_bc_diff <- diff(series)
  a1 <- adf.test(series_bc_diff)
  print(a1)

  # b1 <- Box.test(series_bc_diff, lag=10, type='Ljung-Box')
  # print(b1)
  # k1 <- kpss.test(series_bc_diff, null='Trend')
  # print(k1)

  series_bc_diff2 <- diff(series_bc_diff)
  a2 <- adf.test(series_bc_diff2)
  print(a2)

  # b2 <- Box.test(series_bc_diff2, lag=10, type='Ljung-Box')
  # print(b2)
  # k2 <- kpss.test(series_bc_diff2, null='Trend')
  # print(k2)

  p1 <- autoplot(series_bc_diff) + ggtitle('Differencing Order 1')
  p2 <- autoplot(series_bc_diff2) + ggtitle('Differencing Order 2')
  grid.arrange(p1, p2, nrow=1)
}
#' Return a properly rounded Box Cox lambda (-n, ..., -1, -0.5, 0, 1, ..., n)
#'
#' @param series A time series
#' @examples
#' round_lambda(my_series)
#' @return new_lambda
#' @export
round_lambda <- function(series) {
  lambda <- BoxCox.lambda(series)

  if ((lambda > 0.25) & (lambda < 0.75)) {
    new_lambda <- 0.5
  } else if ((lambda > -0.75) & (lambda < -0.25)) {
    new_lambda <- -0.5
  } else {
    new_lambda <- round(lambda)
  }

  print(paste('lambda:', lambda, ', rounded lambda:', new_lambda))
  return(new_lambda)
}

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable 'Cash' is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to give this a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file. Please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast in an Excel readable file.
# Load the Excel data
atm_df <- readxl::read_excel('./datasets/ATM624Data.xlsx')
# Fix the excel date format - convert to standard R date
atm_df$DATE <- as.Date(atm_df$DATE, origin = "1899-12-30")
# Make sure our Data is sorted by date
atm_df <- atm_df[order(atm_df$DATE),]
# inspect data.frame to make sure things look fine
knitr::kable(atm_df[1:10, 1:3],
caption = 'ATM Dataset (1st few rows)',
format="html",
table.attr="style='width:50%;'") %>%
kableExtra::kable_styling()

| DATE | ATM | Cash |
|---|---|---|
| 2009-05-01 | ATM1 | 96.0000 |
| 2009-05-01 | ATM2 | 107.0000 |
| 2009-05-01 | ATM3 | 0.0000 |
| 2009-05-01 | ATM4 | 776.9934 |
| 2009-05-02 | ATM1 | 82.0000 |
| 2009-05-02 | ATM2 | 89.0000 |
| 2009-05-02 | ATM3 | 0.0000 |
| 2009-05-02 | ATM4 | 524.4180 |
| 2009-05-03 | ATM1 | 85.0000 |
| 2009-05-03 | ATM2 | 90.0000 |
Let's do a quick plot of the raw data to get a sense for how it looks, its scale, and any issues or trends.
# Plot data for inspection using ggplot. Note I used a log_scale to handle the huge outlier
# ggplot(atm_df, aes(x=DATE, y=Cash, color=ATM, group=ATM)) +
# geom_line() +
# scale_y_log10()
# Plot data for inspection using plotly for interactive exploration. Note I used a log y-scale to handle the huge outlier
fig <- atm_df %>%
group_by(ATM) %>%
plot_ly(x= ~DATE) %>%
add_lines(y= ~Cash, color = ~factor(ATM)) %>%
layout(title = "Raw ATM Withdrawals by Date",
xaxis = list(title="Date"),
yaxis = list(title="Daily Cash Withdrawal (Hundreds of Dollars)", type = "log"))
fig

Initial takeaways are:

- ATM 1 and ATM 2 operate at broadly similar levels and show a repeating weekly pattern.
- ATM 3 reports $0 for almost the entire year, with real values only in the last few days.
- ATM 4 has a much higher daily withdrawal level and one extreme outlier.
I'm using the naniar package to inspect for missing values in the dataset. As we can see, there are 14 rows missing both the ATM identifier and the Cash value. In addition, we have 5 rows missing only the Cash value.
# Various NA plots to inspect data
knitr::kable(miss_var_summary(atm_df),
caption = 'Missing Values',
format="html",
table.attr="style='width:50%;'") %>%
kableExtra::kable_styling()

| variable | n_miss | pct_miss |
|---|---|---|
| Cash | 19 | 1.2890095 |
| ATM | 14 | 0.9497965 |
| DATE | 0 | 0.0000000 |
We have 14 empty rows with a DATE but no ATM identifier or Cash value - these were probably added as placeholders for the dates we need to forecast. I'm going to drop them for now so they don't interfere. We'll re-add forecast rows later.
# Drop any rows that are missing both ATM and Cash values
atm_clean_df <- atm_df %>%
  filter(!(is.na(ATM) & is.na(Cash)))

# Confirm 14 rows were dropped
print(c('Original: ', nrow(atm_df), 'After Drop:', nrow(atm_clean_df)))

## [1] "Original: " "1474" "After Drop:" "1460"
We have 5 rows with an ATM identifier but no Cash value. I will impute these values using the na_kalman() function from the imputeTS package. Since financial transactions often have weekly cycles, I want to make sure the imputation captures the weekly cycle along with any overall trend. Note, I knew there was one problematic outlier from the exploratory graph above. If we had found several outliers, I probably would have moved to a z-score based approach to flag all values more than +3 standard deviations above the mean (only on the high side, since low values down to $0 are fine). I would have then replaced all those outlier values with NA and imputed them with na_kalman() below. Note that ATM 3 didn't have any missing values; however, it reports all 0's until the last few days - we'll handle that problem in a later section.
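As a minimal sketch of that z-score approach (not actually run here; atm_z_df and the per-machine grouping are illustrative assumptions, with the +3 SD cutoff taken from the text above):

# Sketch: flag high-side outliers per machine with a z-score and set them to NA
# so that na_kalman() would impute them along with the genuinely missing values.
atm_z_df <- atm_clean_df %>%
  group_by(ATM) %>%
  mutate(z = (Cash - mean(Cash, na.rm = TRUE)) / sd(Cash, na.rm = TRUE),
         Cash = ifelse(!is.na(z) & z > 3, NA_real_, Cash)) %>%
  ungroup() %>%
  select(-z)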
# Get highest cash value.
max_cash <- max(atm_clean_df$Cash, na.rm = TRUE)
max_date <- atm_clean_df %>%
filter(Cash==max_cash)
# Replace the outlier with NA so we impute it in the next step
atm_clean_df$Cash[atm_clean_df$ATM==max_date$ATM & atm_clean_df$DATE==max_date$DATE] <- NA

We have one extreme outlier for ATM4 on 2010-02-09 with a value of 10,919.76 (in hundreds of dollars, i.e. roughly $1.09M). Since this doesn't line up with a known holiday that we might need to account for, I'm going to set it to NA, and when we impute, we'll replace the outlier with a more reasonable value.
# Get each unique ATM machine so we can loop over them
machines <- unique(atm_clean_df$ATM)
# As we impute each machine, we'll construct a new data.frame
atm_impute_df <- data.frame()
# Loop thru ATM machines
for (machine in machines) {
  # Grab the rows for just one machine at a time
  single_machine <- atm_clean_df %>%
    filter(ATM == machine)

  # Are there any NA's?
  has_na <- sum(is.na(single_machine$Cash))

  # If we found an NA, then impute, otherwise skip the impute (so no errors)
  if (has_na > 0) {
    imp <- single_machine %>%
      na_kalman()

    # Plot the time series with imputed values for visual inspection
    print(ggplot_na_imputations(single_machine$Cash, imp$Cash) +
            ggtitle(paste('Imputed Values for ', machine)))
  } else {
    imp <- single_machine
  }

  # Construct new data.frame for model building
  atm_impute_df <- bind_rows(atm_impute_df, imp)
}

So, it appears ATM 3 is a new machine that was brought online at the end of the data-tracking period, so we only have a few real values. We have a couple of options: model ATM 3 from the handful of observations it does have, or back-fill its history from machines that behave similarly (ATM 1 and ATM 2) and model that.
Let’s check the cross correlation first.
atm1 <- atm_impute_df %>% filter(ATM=='ATM1') %>% select(Cash)
atm2 <- atm_impute_df %>% filter(ATM=='ATM2') %>% select(Cash)
atm4 <- atm_impute_df %>% filter(ATM=='ATM4') %>% select(Cash)
# Pairwise Cross correlation
ccf(atm1, atm2, type="correlation")

# Plot data for inspection using plotly for interactive exploration. Note I used a log y-scale to handle the huge outlier
fig <- atm_impute_df %>%
group_by(ATM) %>%
plot_ly(x= ~DATE) %>%
add_lines(y= ~Cash, color = ~factor(ATM)) %>%
layout(title = "ATM Withdrawals by Date (after imputation)",
xaxis = list(title="Date"),
yaxis = list(title="Daily Cash Withdrawal (Hundreds of Dollars)", type = "log"))
fig

We see a significant correlation at lag=0 along with the increments \(\pm7\). Comparing the different combinations, ATM 1 & 2 have the most significant lag correlation spike. Given this, it appears the ATMs, while they operate at different levels, move in similar daily directions and have strong weekly patterns. The few data points we have for ATM 3 are fairly close to what we see in ATM 1 and ATM 2. Also, we note that ATM 4 has a significantly higher daily withdrawal than 1, 2 or 3. With this in mind, we will proceed to construct a mean time series based on ATM 1 and ATM 2. We will be using Dynamic Time Warping (DTW) via the dtw and dtwclust packages. Rather than a simple day-over-day comparison (ATM 1 on day 4 compared with ATM 2 on day 4), DTW is a more advanced approach for comparing and aligning time series: it maps patterns and finds nearest points, allowing both x and y to vary, and it can warp the data, stretching or compressing one time series to find better matches. The dba function from the dtwclust package is useful because it returns a mean time series, given any number of input time series, that best represents the patterns seen across them all. Behind the scenes, dba does pairwise hierarchical clustering: if we had 4 ATMs, it would average 1 & 2, then 3 & 4, then average the mean from 1 & 2 with the mean from 3 & 4, giving a final mean from all 4 ATMs.
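As a brief illustration of the pairwise comparison the dtw package provides (a hedged sketch, not part of the forecasting pipeline; alignment is just an illustrative name):

# Sketch: DTW alignment between ATM 1 and ATM 2. dtw() finds the optimal warping
# path; a smaller normalized distance means the two series move more alike.
alignment <- dtw(atm1$Cash, atm2$Cash, keep.internals = TRUE)
alignment$normalizedDistance # per-step alignment cost
plot(alignment, type = "twoway", offset = 200) # offset just separates the two curves visually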
While I'm going to use the mean (mainly because I want to play with dtw), we could also rationalize using, for each date, the max of ATM 1 and ATM 2 as our daily value for ATM 3. This really comes down to the cost function: is it worse to stock extra money that isn't withdrawn, or to let an ATM run out of money? Using the mean assumes the penalty is the same for each condition. If we knew that money was tight and it's OK to run out, we might use a min function; if we are more concerned with lost opportunity when a machine runs out, then we'd go with the max function.
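For completeness, a minimal sketch of the max/min alternative described above (illustrative only; atm3_alt is a hypothetical name):

# Sketch: element-wise max / min of ATM 1 and ATM 2 as alternative back-fills for ATM 3.
atm3_alt <- atm_impute_df %>%
  filter(ATM %in% c('ATM1', 'ATM2')) %>%
  pivot_wider(names_from = ATM, values_from = Cash) %>%
  mutate(ATM3_max = pmax(ATM1, ATM2), # never run short relative to ATM 1 & 2
         ATM3_min = pmin(ATM1, ATM2)) # never overstock relative to ATM 1 & 2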
I also round all the values up to the nearest ten-dollar amount (i.e. 1 decimal place, since Cash is in hundreds of dollars); for example, 96.43 (i.e. $9,643) rounds up to 96.5 ($9,650). Most ATM machines only dispense $20 increments, but some older ATMs dispensed $10 increments.
# Build a wide data.frame with only ATM 1 and ATM 2 (ATM 3's history will be rebuilt from these)
atm_12_df <- atm_impute_df %>%
filter(ATM=='ATM1' | ATM=='ATM2') %>%
pivot_wider(names_from = ATM, values_from=Cash) %>%
select(-DATE)
# the `dba` function from `dtwclust` package requires our data be transposed with
# a row for each time series and columns are the dates. It must also be a matrix
atm_12t_df <- transpose(atm_12_df)
colnames(atm_12t_df) <- atm_impute_df$DATE[1:365]
rownames(atm_12t_df) <- colnames(atm_12_df)
# Convert our data.frame to a matrix
atm_12t_m <- data.matrix(atm_12t_df, rownames.force=TRUE)
# Run dba to get a mean time series. This uses the dtwclust package, which uses
# dynamic time warping to align the series before averaging them.
atm_12_avg <- DBA(atm_12t_m, centroid = atm_12t_m[1,], trace=TRUE)

## DBA Iteration: 1, 2, 3, 4, 5, 6 - Converged!
# Turn the new mean into a time series and plot for inspection
start_ts <- min(single_machine$DATE)
start_ts <- c(year(start_ts), yday(start_ts))
end_ts <- max(single_machine$DATE)
end_ts <- c(year(end_ts), yday(end_ts))
atm_12_avg_ts <- ts(atm_12_avg, start=start_ts, frequency = 7)
autoplot(atm_12_avg_ts) + ggtitle('ATM 3 (Based on ATM 1 & 2 Average)')

# Put all our ATM time series back into a single data.frame so we can move on to modeling
atm_final_df <- atm_12_df
atm_final_df$ATM3 <- atm_12_avg_ts
atm4 <- atm_impute_df %>%
filter(ATM=='ATM4') %>%
select(Cash)
atm_final_df$ATM4 <- atm4$Cash
# I'm assuming ATMs don't give out smaller than $10 increments, so we round up to the
# nearest tenth (note `Cash` is in hundreds of dollars).
atm_final_df <- ceiling(atm_final_df * 10) / 10
# Finally switch this back to a long format with a DATE.
atm_final_df$DATE <- colnames(atm_12t_df)
atm_final_df <- atm_final_df %>%
pivot_longer(-DATE, names_to = "ATM", values_to = "Cash")

Based on the Augmented Dickey-Fuller tests and visual inspection below, we can say each ATM series is stationary. We see a strong lag-7 pattern, which aligns with an expected weekly seasonal pattern.
# Get each unique ATM machine so we can loop over them
machines <- unique(atm_final_df$ATM)
for (machine in machines) {
  # Grab the rows for just one machine at a time
  single_machine <- atm_final_df %>%
    filter(ATM == machine) %>%
    select(DATE, Cash)

  start_ts <- min(single_machine$DATE)
  start_ts <- c(year(start_ts), yday(start_ts))
  end_ts <- max(single_machine$DATE)
  end_ts <- c(year(end_ts), yday(end_ts))

  atm_ts <- ts(single_machine$Cash, start=start_ts, end=end_ts, frequency=365)

  lambda <- round_lambda(atm_ts)
  acf_plots(atm_ts, lambda)
}

## [1] "lambda: 1 , rounded lambda: 1"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -11.989, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -19.513, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
## [1] "lambda: 1 , rounded lambda: 1"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -11.718, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -18.477, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
## [1] "lambda: 1 , rounded lambda: 1"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -11.114, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -19.202, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
## [1] "lambda: 1 , rounded lambda: 1"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -10.409, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -16.803, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Below is also a plot of the distribution of withdrawals by day of the week over the entire duration. This could be used by the bank manager to help determine how much cash to provide on a daily basis. Since we saw no other seasonal patterns, cycles or trends, this weekly seasonal cycle is probably a good baseline. In fact, we could technically stop the analysis right here and use the upr value as our conservative prediction. If we are more concerned with putting out too much cash, then we would choose the mean.
library(Hmisc)
atm_final_df$week <- strftime(atm_final_df$DATE, '%V')
atm_final_df$day <- weekdays(as.Date(atm_final_df$DATE))
atm_grp2_df <- atm_final_df %>%
  group_by(ATM, day) %>%
  summarise(ci = list(mean_cl_normal(Cash) %>%
                        rename(mean=y, lwr=ymin, upr=ymax))) %>%
  unnest(cols = c(ci))
atm_grp2_df$mean <- ceiling(atm_grp2_df$mean * 10) / 10
atm_grp2_df$upr <- ceiling(atm_grp2_df$upr * 10) / 10
atm_grp2_df$lwr <- ceiling(atm_grp2_df$lwr * 10) / 10
ggplot(atm_grp2_df, aes(x=day, y=mean, colour=ATM)) +
geom_errorbar(aes(ymin=lwr, ymax=upr), width=.1) +
geom_line() +
geom_point() +
coord_trans(y = "log10") +
ggtitle('Cash Withdrawal by Day of Week') +
xlab('Day of Week') +
ylab('Hundreds of Dollars, log scale')

# inspect data.frame to make sure things look fine
knitr::kable(atm_grp2_df,
caption = 'Cash Withdrawal by Day of Week (Hundreds of $)',
format="html",
table.attr="style='width:50%;'") %>%
kableExtra::kable_styling()

| ATM | day | mean | lwr | upr |
|---|---|---|---|---|
| ATM1 | Friday | 98.7 | 90.3 | 107.0 |
| ATM1 | Monday | 86.1 | 80.3 | 91.9 |
| ATM1 | Saturday | 96.6 | 92.7 | 100.4 |
| ATM1 | Sunday | 102.7 | 97.0 | 108.5 |
| ATM1 | Thursday | 31.7 | 22.6 | 40.9 |
| ATM1 | Tuesday | 89.6 | 76.4 | 102.9 |
| ATM1 | Wednesday | 82.2 | 75.2 | 89.2 |
| ATM2 | Friday | 92.1 | 81.7 | 102.4 |
| ATM2 | Monday | 58.8 | 49.8 | 67.8 |
| ATM2 | Saturday | 76.0 | 68.4 | 83.7 |
| ATM2 | Sunday | 67.2 | 60.3 | 74.1 |
| ATM2 | Thursday | 26.6 | 16.8 | 36.4 |
| ATM2 | Tuesday | 73.3 | 61.8 | 84.8 |
| ATM2 | Wednesday | 44.4 | 35.4 | 53.4 |
| ATM3 | Friday | 97.6 | 89.7 | 105.4 |
| ATM3 | Monday | 85.0 | 78.8 | 91.2 |
| ATM3 | Saturday | 96.3 | 91.8 | 100.8 |
| ATM3 | Sunday | 99.7 | 94.5 | 104.8 |
| ATM3 | Thursday | 35.9 | 27.2 | 44.5 |
| ATM3 | Tuesday | 85.3 | 73.5 | 97.1 |
| ATM3 | Wednesday | 82.2 | 75.8 | 88.7 |
| ATM4 | Friday | 573.6 | 474.5 | 672.8 |
| ATM4 | Monday | 481.4 | 403.4 | 559.4 |
| ATM4 | Saturday | 491.6 | 401.8 | 581.5 |
| ATM4 | Sunday | 539.2 | 435.6 | 642.7 |
| ATM4 | Thursday | 170.1 | 99.6 | 240.5 |
| ATM4 | Tuesday | 445.8 | 324.2 | 567.5 |
| ATM4 | Wednesday | 413.8 | 344.0 | 483.6 |
Now we loop through each ATM machine, convert its data to a time series, and build a forecasting model. Based on the strong pACF spike at lag 7 for each machine, we apply a diff(7) when inspecting each series. Once we have the model, we'll predict the next 14 days for that machine and add it to our final dataset.
# Get each unique ATM machine so we can loop over them
machines <- unique(atm_final_df$ATM)
# Build the final output DF. Note we are using the original data minus the empty rows.
# The task is to provide output for the future 2 weeks, not to report our cleaned data.
atm_output_df <- atm_df %>%
  filter(!(is.na(ATM) & is.na(Cash)))
pred_df <- data.frame()
# Loop through each machine
for (machine in machines) {
  # Grab the rows for just one machine at a time
  single_machine <- atm_final_df %>%
    filter(ATM == machine)

  # Calculate time series start and end dates
  start_ts <- min(single_machine$DATE)
  start_ts <- c(year(start_ts), yday(start_ts))
  end_ts <- max(single_machine$DATE)
  end_ts <- c(year(end_ts), yday(end_ts))

  # Build the time series object (note we set the weekly seasonal pattern)
  atm_ts <- ts(single_machine$Cash, start=start_ts, frequency=7)
  atm_ts %>% diff(7) %>% ggtsdisplay(main=paste(machine, ' with Differencing=7'))

  model <- auto.arima(atm_ts)
  summary(model)

  print(model %>%
          forecast(14) %>%
          autoplot() +
          ggtitle(paste(machine, 'Arima Model')))
  print(checkresiduals(model))

  # Forecast the next 14 days and collect the predictions
  data <- predict(model, n.ahead=14)
  st <- as.Date(max(single_machine$DATE)) + 1
  en <- st + 13
  data$DATE <- seq.Date(st, en, by="1 day")

  predictions <- cbind(as.character(data$DATE), rep(machine, 14), as.numeric(data$pred))
  colnames(predictions) <- c('DATE', 'ATM', 'Cash')

  pred_df <- rbind(pred_df, predictions)
  pred_df$Cash <- as.numeric(pred_df$Cash)
  pred_df$Cash <- ceiling(pred_df$Cash)
}

## Series: atm_ts
## ARIMA(0,0,1)(0,1,2)[7]
##
## Coefficients:
## ma1 sma1 sma2
## 0.1697 -0.5866 -0.0990
## s.e. 0.0552 0.0508 0.0497
##
## sigma^2 estimated as 559.7: log likelihood=-1641.09
## AIC=3290.18 AICc=3290.29 BIC=3305.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.07363627 23.33147 14.60095 -102.7316 117.5976 0.8247044
## ACF1
## Training set -0.008129974
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,2)[7]
## Q* = 16.763, df = 11, p-value = 0.1151
##
## Model df: 3. Total lags used: 14
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,2)[7]
## Q* = 16.763, df = 11, p-value = 0.1151
## Series: atm_ts
## ARIMA(2,0,2)(0,1,1)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4318 -0.9159 0.4740 0.8107 -0.7552
## s.e. 0.0550 0.0400 0.0859 0.0548 0.0383
##
## sigma^2 estimated as 609: log likelihood=-1655.58
## AIC=3323.15 AICc=3323.39 BIC=3346.44
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.8946498 24.26803 17.12705 -Inf Inf 0.8188195 -0.004982575
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 10.638, df = 9, p-value = 0.3014
##
## Model df: 5. Total lags used: 14
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 10.638, df = 9, p-value = 0.3014
## Series: atm_ts
## ARIMA(0,0,1)(0,1,1)[7]
##
## Coefficients:
## ma1 sma1
## 0.1414 -0.6562
## s.e. 0.0521 0.0441
##
## sigma^2 estimated as 514.1: log likelihood=-1626.37
## AIC=3258.73 AICc=3258.8 BIC=3270.37
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.5063048 22.39344 14.94065 -58.47454 72.88051 0.82734 0.00165455
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,1)[7]
## Q* = 23.059, df = 12, p-value = 0.02723
##
## Model df: 2. Total lags used: 14
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,1)[7]
## Q* = 23.059, df = 12, p-value = 0.02723
## Series: atm_ts
## ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
##
## Coefficients:
## ma1 ma2 ma3 sar1 mean
## 0.0871 0.0156 0.0884 0.1839 444.759
## s.e. 0.0531 0.0583 0.0516 0.0526 26.079
##
## sigma^2 estimated as 119371: log likelihood=-2648.96
## AIC=5309.91 AICc=5310.15 BIC=5333.31
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.2162626 343.1262 287.3652 -514.7456 548.019 0.829546
## ACF1
## Training set -0.008222277
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
## Q* = 16.061, df = 9, p-value = 0.06562
##
## Model df: 5. Total lags used: 14
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
## Q* = 16.061, df = 9, p-value = 0.06562
# Append forecast values to end of original dataset
atm_output_df <- rbind(atm_output_df, pred_df)
# Round Cash up to the nearest integer
atm_output_df$Cash <- as.numeric(atm_output_df$Cash)
atm_output_df$Cash <- ceiling(atm_output_df$Cash)
# Save final dataset with forecasts appended to end
write.csv(atm_output_df, './datasets/atm_final.csv')

Note the residuals for each ATM ARIMA model look good. I just used auto.arima(). The final models are:

- ATM1: ARIMA(0,0,1)(0,1,2)[7]
- ATM2: ARIMA(2,0,2)(0,1,1)[7]
- ATM3: ARIMA(0,0,1)(0,1,1)[7]
- ATM4: ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
The final predictions are:
# inspect data.frame to make sure things look fine
pred_df <- pred_df %>%
pivot_wider(names_from = ATM, values_from=Cash)
knitr::kable(pred_df,
caption = '2 Week Forecasts for ATM Machines',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| DATE | ATM1 | ATM2 | ATM3 | ATM4 |
|---|---|---|---|---|
| 2010-05-01 | 88 | 67 | 88 | 363 |
| 2010-05-02 | 104 | 73 | 95 | 433 |
| 2010-05-03 | 74 | 14 | 74 | 451 |
| 2010-05-04 | 9 | 6 | 8 | 366 |
| 2010-05-05 | 101 | 98 | 102 | 428 |
| 2010-05-06 | 81 | 89 | 78 | 372 |
| 2010-05-07 | 87 | 65 | 87 | 452 |
| 2010-05-08 | 89 | 67 | 88 | 430 |
| 2010-05-09 | 103 | 73 | 95 | 443 |
| 2010-05-10 | 74 | 14 | 74 | 446 |
| 2010-05-11 | 9 | 6 | 8 | 431 |
| 2010-05-12 | 101 | 98 | 102 | 442 |
| 2010-05-13 | 81 | 89 | 78 | 432 |
| 2010-05-14 | 87 | 65 | 87 | 447 |
**Final forecast results are on GitHub** (see the repository linked at the top of this document).
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and provide a monthly forecast for 2014. The data is given in a single file. The variable 'KWH' is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.
# Load the Excel data
power_df <- readxl::read_excel('./datasets/ResidentialCustomerForecastLoad-624.xlsx',
col_names=FALSE, skip=1)
colnames(power_df) <- c('CASE', 'DATE', 'KWH')
# Fix the excel date format - convert to standard R date
power_df$DATE_YM <- yearmonth(power_df$DATE)
power_df$DATE <- as.Date(power_df$DATE_YM)
# Make sure our Data is sorted by date
power_df <- power_df[order(power_df$DATE),]
# inspect data.frame to make sure things look fine
knitr::kable(power_df[1:10, 1:4],
caption = 'Power DataFrame top 10 Rows',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| CASE | DATE | KWH | DATE_YM |
|---|---|---|---|
| 733 | 1998-01-01 | 6862583 | 1998 Jan |
| 734 | 1998-02-01 | 5838198 | 1998 Feb |
| 735 | 1998-03-01 | 5420658 | 1998 Mar |
| 736 | 1998-04-01 | 5010364 | 1998 Apr |
| 737 | 1998-05-01 | 4665377 | 1998 May |
| 738 | 1998-06-01 | 6467147 | 1998 Jun |
| 739 | 1998-07-01 | 8914755 | 1998 Jul |
| 740 | 1998-08-01 | 8607428 | 1998 Aug |
| 741 | 1998-09-01 | 6989888 | 1998 Sep |
| 742 | 1998-10-01 | 6345620 | 1998 Oct |
# Plot data for inspection using plotly for interactive exploration
fig <- power_df %>%
plot_ly(x= ~DATE) %>%
add_lines(y= ~KWH) %>%
layout(title = "Power Usage (KWH) by Month",
xaxis = list(title="Date"),
yaxis = list(title="Power (KWH)"))
fig

Takeaways:

- There is a clear yearly seasonal pattern, with peaks in the summer and winter months.
- There is a gradual upward trend over the years.
- There is one anomalously low reading that we will treat as an outlier below.
We need to check for any missing values in our data set. First let's check for explicit missing values (i.e. NAs). Note there is 1 missing KWH value.
# Various NA plots to inspect data
knitr::kable(miss_var_summary(power_df),
caption = 'Missing Values',
format="html",
table.attr="style='width:50%;'") %>%
kableExtra::kable_styling()

| variable | n_miss | pct_miss |
|---|---|---|
| KWH | 1 | 0.5208333 |
| CASE | 0 | 0.0000000 |
| DATE | 0 | 0.0000000 |
| DATE_YM | 0 | 0.0000000 |
We will use na_kalman() from the imputeTS package to impute any explicitly missing values. This approach has the advantage that it uses trend and seasonal patterns, so the filled-in values fit the context of the surrounding observations.
# Get the lowest KWH value (the suspiciously low outlier)
min_kwh <- min(power_df$KWH, na.rm = TRUE)
min_date <- power_df %>%
filter(KWH==min_kwh)
# Replace the outlier with NA so we impute it in the next step
power_df$KWH[power_df$DATE==min_date$DATE] <- NA
# Impute the missing values
imp <- power_df %>%
na_kalman()
# Plot the time series with imputed values for visual inspection
print(ggplot_na_imputations(power_df$KWH, imp$KWH) +
ggtitle(paste('Imputed Values')))

Next we double-check whether there are any implicit missing values, i.e. are there any gaps in our time series? Ideally, we want a continuous time series with a data point at each incremental step. We convert our DF to a tsibble and leverage the count_gaps() function. We see no gaps - cool.
# Convert our time series to a tsibble for convenience
power_tsb <- as_tsibble(power_df, index=DATE_YM, regular = TRUE, .drop = TRUE)
# Are there any gaps in the data (i.e. no data for a given hour)
count_gaps(power_tsb)

## # A tibble: 0 x 3
## # … with 3 variables: .from <mth>, .to <mth>, .n <int>
Let's check whether a transformation will be helpful by calculating the Box Cox lambda.
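The chunk that produced the output below is not shown in this rendering; a sketch of how it could look using the round_lambda() helper defined earlier (power_ts is a hypothetical name, and the exact series the author used may have differed slightly):

# Sketch: monthly time series from the imputed KWH values, then check the Box Cox lambda
power_ts <- ts(imp$KWH, start = c(1998, 1), frequency = 12)
round_lambda(power_ts)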
## [1] "lambda: -0.206647402717936 , rounded lambda: 0"
Recommended lambda is 0, so no transformation will be needed.
Let's visualize the seasonal and trend patterns. This will help us understand whether differencing will be helpful before modeling.
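The plotting chunk itself is not shown; one way to produce such a seasonal/trend view is an STL decomposition (a hedged sketch, reusing the hypothetical power_ts from above):

# Sketch: STL decomposition to separate the trend, annual seasonality and remainder
power_ts %>%
  stl(s.window = "periodic") %>%
  autoplot() +
  ggtitle('Residential Power Usage - STL Decomposition')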
We do see a general upward trend in the data. Given this, we will want to check the impact of differencing.
Checking the Box Cox lambda, we get a recommended \(\lambda=0.5\). Looking at the ACF and pACF plots along with the Dickey-Fuller test, there is a clear spike at lag 7, which would correspond to ~7 months. We would expect residential power to peak in summer and winter (cooling and heating), so a lag of 7 is close to, but not exactly, the ~6-month spacing we might expect. We will want to difference at lag 7 when modeling with ARIMA.
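Again, the chunk behind the output below is not shown; it presumably resembles a call to the acf_plots() helper (a sketch, with the same hypothetical power_ts caveat as above):

# Sketch: ACF/pACF plots and ADF tests via the helper defined at the top of the report
lambda_power <- round_lambda(power_ts)
acf_plots(power_ts, lambda_power)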
## [1] "lambda: 0.453125398020207 , rounded lambda: 0.5"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -10.409, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -16.803, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
We will create training and testing data sets to allow us to evaluate model performance on data that hasn't been seen. Since we will be forecasting future data, we want to evaluate with out-of-sample data. This is a balancing act: removing months at the end of 2013 from training leaves the model with less recent data, so it will perform even worse on 2014. I'll slice off the last 6 months and use that as a holdout test set. We'll train our various models using data through mid-2013 and evaluate on late 2013. This should help us identify the best candidate model for then predicting 2014. If we don't apply a holdout, we could end up with an overfit model that performs well on known data but poorly on future data.
# FB Prophet requires the column names to be ds and y
colnames(power_df) <- c('CASE', 'ds', 'y')
power_df$ds <- as.Date(power_df$ds)
# We want to create a train/test split to evaluate model performance on out of sample data
holdout_size <- 6
# Build the training DF and TS
power_train_df <- head(power_df, nrow(power_df) - holdout_size)
train_start <- min(power_train_df$ds)
train_start <- c(year(train_start), month(train_start))
power_train_ts <- ts(power_train_df$y, start=train_start, frequency = 12)
# Build the testing DF and TS
power_test_df <- tail(power_df, holdout_size)
test_start <- min(power_test_df$ds)
test_start <- c(year(test_start), month(test_start))
power_test_ts <- ts(power_test_df$y, start=test_start, frequency = 12)

# Build ARIMA model using the built-in auto.arima()
model2 <- auto.arima(power_train_ts, stepwise = FALSE, parallel = TRUE, approximation = FALSE)
summary(model2)

## Series: power_train_ts
## ARIMA(0,0,3)(2,1,0)[12] with drift
##
## Coefficients:
## ma1 ma2 ma3 sar1 sar2 drift
## 0.3458 0.0650 0.2303 -0.7213 -0.4695 8298.887
## s.e. 0.0758 0.0855 0.0692 0.0746 0.0793 2937.879
##
## sigma^2 estimated as 3.603e+11: log likelihood=-2563.72
## AIC=5141.44 AICc=5142.11 BIC=5163.55
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -7948.288 570427.7 424291.9 -0.7686357 6.531847 0.6854824
## ACF1
## Training set -0.006623981
print(model2 %>%
        forecast(6) %>%
        autoplot() +
        ggtitle('Training Set - Predict Holdout') +
        autolayer(power_test_ts)
)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(2,1,0)[12] with drift
## Q* = 12.398, df = 18, p-value = 0.826
##
## Model df: 6. Total lags used: 24
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(2,1,0)[12] with drift
## Q* = 12.398, df = 18, p-value = 0.826
pred_value <- model2 %>% forecast(holdout_size)
model2_rmse <- ModelMetrics::rmse(pred_value$mean, power_test_df$y)
print(paste('auto.arima() RMSE on Holdout Test Data: ', model2_rmse))

## [1] "auto.arima() RMSE on Holdout Test Data: 1077118.14410098"
power_output_df <- power_df
colnames(power_output_df) <- c('CaseSequence', 'DATE', 'KWH', 'YYYY-MMM')
power_output_df <- power_output_df %>%
select('CaseSequence', 'YYYY-MMM', 'KWH')
power_output_df$`YYYY-MMM` <- strftime(power_output_df$`YYYY-MMM`, '%Y-%b')
print(model2 %>%
        forecast(18) %>%
        autoplot() +
        ggtitle('Residential Power Forecast - Next 18 Months (6 holdout months + 2014)') +
        autolayer(power_test_ts)
)

data <- model2 %>% forecast(18)
st <- as.Date(max(power_train_df$ds)) %m+% months(1)
en <- st %m+% months(17)
data$ds <- seq.Date(st, en, by="1 month")
data$ds <- strftime(data$ds, '%Y-%b')
cs_st <- max(power_train_df$CASE) + 1
predictions <- cbind(seq(cs_st, cs_st+17), as.character(data$ds), as.numeric(data$mean))
colnames(predictions) <- c('CaseSequence', 'YYYY-MMM', 'KWH')
pred_df <- as.data.frame(predictions)
pred_df$KWH <- as.numeric(pred_df$KWH)
pred_df$KWH <- ceiling(pred_df$KWH)
# Append forecast values to end of original dataset
power_output_df <- rbind(power_output_df, pred_df)
# Save final dataset with forecasts appended to end
write.csv(power_output_df, './datasets/residential_power_final.csv')

knitr::kable(pred_df,
caption = '18 Month Forecast for Residential Power (last 6 months of 2013 + 2014)',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| CaseSequence | YYYY-MMM | KWH |
|---|---|---|
| 919 | 2013-Jul | 8719951 |
| 920 | 2013-Aug | 9395634 |
| 921 | 2013-Sep | 8261838 |
| 922 | 2013-Oct | 5942261 |
| 923 | 2013-Nov | 5619185 |
| 924 | 2013-Dec | 7032069 |
| 925 | 2014-Jan | 9393251 |
| 926 | 2014-Feb | 8539119 |
| 927 | 2014-Mar | 6619857 |
| 928 | 2014-Apr | 5991440 |
| 929 | 2014-May | 5915303 |
| 930 | 2014-Jun | 8195403 |
| 931 | 2014-Jul | 9624986 |
| 932 | 2014-Aug | 10096804 |
| 933 | 2014-Sep | 8623193 |
| 934 | 2014-Oct | 5909577 |
| 935 | 2014-Nov | 6116912 |
| 936 | 2014-Dec | 7726610 |
**Final forecast results are on GitHub** (see the repository linked at the top of this document).
Part C consists of two data sets. These are simple 2-column sets; however, they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (an example of what this looks like follows). Note for multiple recordings within an hour, take the mean. Then determine if the data is stationary and can be forecast. If so, provide a week-forward forecast and present the results via RPubs and the .rmd file, and the forecast in an Excel readable file.
# Load the Excel data
pipe1_df <- readxl::read_excel('./datasets/Waterflow_Pipe1.xlsx')
colnames(pipe1_df) <- c('DATE', 'WaterFlow')
pipe2_df <- readxl::read_excel('./datasets/Waterflow_Pipe2.xlsx')
colnames(pipe2_df) <- c('DATE', 'WaterFlow')
# Fix the excel date format - convert to standard R date
pipe1_df$DATE <- as.POSIXct(pipe1_df$DATE * 86400, origin = "1899-12-30", tz="GMT")
pipe2_df$DATE <- as.POSIXct(pipe2_df$DATE * 86400, origin = "1899-12-30", tz="GMT")
# Make sure our Data is sorted by date
pipe1_df <- pipe1_df[order(pipe1_df$DATE),]
pipe2_df <- pipe2_df[order(pipe2_df$DATE),]
# inspect data.frame to make sure things look fine
knitr::kable(pipe1_df[1:10, 1:2],
caption = 'Pipe 1 First 10 Rows',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| DATE | WaterFlow |
|---|---|
| 2015-10-23 00:24:06 | 23.369599 |
| 2015-10-23 00:40:02 | 28.002881 |
| 2015-10-23 00:53:51 | 23.065895 |
| 2015-10-23 00:55:40 | 29.972809 |
| 2015-10-23 01:19:17 | 5.997953 |
| 2015-10-23 01:23:58 | 15.935223 |
| 2015-10-23 01:50:05 | 26.617330 |
| 2015-10-23 01:55:33 | 33.282900 |
| 2015-10-23 01:59:15 | 12.426692 |
| 2015-10-23 02:51:51 | 21.833494 |
knitr::kable(pipe2_df[1:10, 1:2],
caption = 'Pipe 2 First 10 Rows',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| DATE | WaterFlow |
|---|---|
| 2015-10-23 01:00:00 | 18.810791 |
| 2015-10-23 01:59:59 | 43.087025 |
| 2015-10-23 03:00:00 | 37.987705 |
| 2015-10-23 04:00:00 | 36.120379 |
| 2015-10-23 04:59:59 | 31.851259 |
| 2015-10-23 06:00:00 | 28.238090 |
| 2015-10-23 07:00:00 | 9.863582 |
| 2015-10-23 07:59:59 | 26.679610 |
| 2015-10-23 09:00:00 | 55.773785 |
| 2015-10-23 10:00:00 | 54.156889 |
We have no missing data to deal with.
knitr::kable(miss_var_summary(pipe1_df),
caption = 'Pipe 1 Missing Data',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| variable | n_miss | pct_miss |
|---|---|---|
| DATE | 0 | 0 |
| WaterFlow | 0 | 0 |
knitr::kable(miss_var_summary(pipe2_df),
caption = 'Pipe 2 Missing Data',
format="html",
table.attr="style='width:60%;'") %>%
kableExtra::kable_styling()

| variable | n_miss | pct_miss |
|---|---|---|
| DATE | 0 | 0 |
| WaterFlow | 0 | 0 |
# Aggregate Pipe1 to the hour, sum flow
pipe1_hourly_df <- pipe1_df %>%
mutate(pipe='PIPE1',
hour = as.POSIXct(strftime(DATE, tz="GMT", format='%Y-%m-%d %H:00:00'))) %>%
group_by(hour) %>%
summarise(WaterFlow_tot = sum(WaterFlow))
# Aggregate Pipe2 to the hour, sum flow
pipe2_hourly_df <- pipe2_df %>%
mutate(pipe='PIPE2',
hour = as.POSIXct(strftime(DATE, tz="GMT", format='%Y-%m-%d %H:00:00'))) %>%
group_by(hour) %>%
summarise(WaterFlow_tot = sum(WaterFlow))
# Combine both Pipe 1 hourly reading and Pipe 2 hourly reading
pipes_hourly_df <- rbind(pipe1_hourly_df, pipe2_hourly_df)
# Aggregate both pipes by averaging their hourly flow
pipes_hourly_df <- pipes_hourly_df %>%
group_by(hour) %>%
summarise(WaterFlow_tot = mean(WaterFlow_tot))

Check for gaps in the time series - we will need to impute those gaps so we have a consistent, regular interval. There are 256 gaps throughout the data. I'm filling them with the na_interpolation() function from the imputeTS package, using the `spline` option.
# Convert our time series to a tsibble for convenience
pipes_tsb <- as_tsibble(pipes_hourly_df, index=hour)
# Are there any gaps in the data (i.e. no data for a given hour)
count_gaps(pipes_tsb)

## # A tibble: 256 x 3
## .from .to .n
## <dttm> <dttm> <int>
## 1 2015-10-27 17:00:00 2015-10-27 17:00:00 1
## 2 2015-11-01 01:00:00 2015-11-01 01:00:00 1
## 3 2015-11-01 05:00:00 2015-11-01 05:00:00 1
## 4 2015-11-02 02:00:00 2015-11-02 02:00:00 1
## 5 2015-11-02 05:00:00 2015-11-02 05:00:00 1
## 6 2015-11-02 08:00:00 2015-11-02 08:00:00 1
## 7 2015-11-02 11:00:00 2015-11-02 11:00:00 1
## 8 2015-11-02 14:00:00 2015-11-02 14:00:00 1
## 9 2015-11-02 17:00:00 2015-11-02 17:00:00 1
## 10 2015-11-02 20:00:00 2015-11-02 20:00:00 1
## # … with 246 more rows
# Fill the gaps and impute the missing hours using spline interpolation
pipes2_tsb <- pipes_tsb %>%
fill_gaps(.full=TRUE) %>%
na_interpolation(option="spline")
autoplot(pipes2_tsb)

We will apply a Box Cox transform with \(\lambda=0.5\). The series is stationary. We see a clear 3-hour-lag "seasonal" cycle, so we'll want to diff by 3 when building the model.
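The chunk behind the output below is not shown; a sketch using the helpers defined earlier (pipes_check_ts is a hypothetical name):

# Sketch: Box Cox lambda and stationarity checks on the hourly aggregated water flow
pipes_check_ts <- ts(pipes2_tsb$WaterFlow_tot)
lambda_pipes <- round_lambda(pipes_check_ts)
acf_plots(pipes_check_ts, lambda_pipes)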
## [1] "lambda: 0.343487379652572 , rounded lambda: 0.5"
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff
## Dickey-Fuller = -16.863, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
##
##
## Augmented Dickey-Fuller Test
##
## data: series_bc_diff2
## Dickey-Fuller = -20.796, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
pipes2_ts <- as.ts(pipes2_tsb$WaterFlow_tot, frequency=3)
pipes2_ts %>% diff(3) %>% ggtsdisplay(main=paste('Pipes Waterflow with Differencing=3'))

# Build ARIMA model using the built-in auto.arima()
model3 <- auto.arima(pipes2_ts, stepwise = FALSE, parallel = TRUE, approximation = FALSE)
summary(model3)

## Series: pipes2_ts
## ARIMA(4,1,1)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1
## 0.1614 -0.3218 0.2678 -0.2427 -0.9752
## s.e. 0.0313 0.0307 0.0307 0.0312 0.0084
##
## sigma^2 estimated as 683.7: log likelihood=-4686.74
## AIC=9385.49 AICc=9385.57 BIC=9414.94
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.7803287 26.06852 19.88712 -27.35529 51.98199 0.6409768
## ACF1
## Training set 0.009053284
print(model3 %>%
        forecast(7*24) %>%
        autoplot() +
        ggtitle('Forecast WaterFlow (1 week forward)')
)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,1,1)
## Q* = 57.732, df = 5, p-value = 3.572e-11
##
## Model df: 5. Total lags used: 10
##
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,1,1)
## Q* = 57.732, df = 5, p-value = 3.572e-11
data <- model3 %>% forecast(7*24)

# Build the hourly timestamps for the forecast horizon (168 hours = 1 week forward)
st <- max(pipes2_tsb$hour) + as.difftime(1, units = "hours")
data$ds <- seq.POSIXt(st, by = "hour", length.out = 7*24)

predictions <- cbind(as.character(data$ds), as.numeric(data$mean))
colnames(predictions) <- c('DateTime', 'WaterFlow')
pred_df <- as.data.frame(predictions)
pred_df$WaterFlow <- as.numeric(pred_df$WaterFlow)
# Append forecast values to end of original dataset
pipes2_df <- data.frame(DateTime=pipes2_tsb$hour, WaterFlow=pipes2_tsb$WaterFlow_tot)
waterflow_output_df <- rbind(pipes2_df, pred_df)
colnames(waterflow_output_df) <- c('Date Time', 'WaterFlow')
# Save final dataset with forecasts appended to end
write.csv(waterflow_output_df, './datasets/waterflow_final.csv')

**Final forecast results are on GitHub** (see the repository linked at the top of this document).