DATA 624 Project One - Part A - ATM Time Series Analysis
Read in Data and Split into Individual ATMs
Read in the ATM Data
Impute Nulls with Zeros
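The code chunks for these first two steps are not shown in the rendered document, so below is a minimal sketch of how they could look. The use of readxl and the file name "ATM624Data.xlsx" are assumptions; the column names DATE, ATM, and Cash match what is used in the rest of the analysis.
## packages used throughout the analysis
library(readxl)     # assumption: the source data is an Excel workbook
library(plyr)       # ddply() -- loaded before dplyr to avoid masking issues
library(dplyr)
library(lubridate)  # floor_date()
library(forecast)   # auto.arima(), tsCV(), hw(), autoplot(), etc.
library(ggplot2)

## read the raw ATM data (the file name is an assumption)
ATM624Data <- read_excel("ATM624Data.xlsx")
ATM624Data$DATE <- as.Date(ATM624Data$DATE)  # in case DATE is read in as a datetime

## impute the missing Cash values with zeros
ATM624Data$Cash[is.na(ATM624Data$Cash)] <- 0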
Split the Data out into the Individual ATMs
## Split up into the respective ATMs
ATM1 <- ATM624Data %>% filter(ATM == 'ATM1')
## monthly summation
ATM1$Month <- floor_date(ATM1$DATE,"month")
ATM1_monthly <-ddply(ATM1, "Month", summarise, Cash = sum(Cash))
ATM2 <- ATM624Data %>% filter(ATM == 'ATM2')
## monthly summation
ATM2$Month <- floor_date(ATM2$DATE,"month")
ATM2_monthly <-ddply(ATM2, "Month", summarise, Cash = sum(Cash))
ATM3 <- ATM624Data %>% filter(ATM == 'ATM3')
## monthly summation
ATM3$Month <- floor_date(ATM3$DATE,"month")
ATM3_monthly <-ddply(ATM3, "Month", summarise, Cash = sum(Cash))
ATM4 <- ATM624Data %>% filter(ATM == 'ATM4')
## monthly summation
ATM4$Month <- floor_date(ATM4$DATE,"month")
ATM4_monthly <-ddply(ATM4, "Month", summarise, Cash = sum(Cash))
Take a Look at the Time Series via the autoplot function
## create a time series version of the different datasets
ATM1_ts <- ts(ATM1 %>% select(Cash))
## use the autoplot function
autoplot(ATM1_ts)
## create a time series version of the different datasets
ATM2_ts <- ts(ATM2 %>% select(Cash))
## use the autoplot function
autoplot(ATM2_ts)
## create a time series version of the different datasets
ATM3_ts <- ts(ATM3 %>% select(Cash))
## use the autoplot function
autoplot(ATM3_ts)
Compare the 4 ATMs to See the Aggregate Monthly Patterns Over Time
From the graphs below we can see that ATM #4 has by far the most activity, ATM #1 and ATM #2 have moderate activity, and ATM #3 only has activity very recently. These will be important considerations when building out the time series models.
ATM1_monthly_ts <- ts(ATM1_monthly[,-1],
frequency=12, start=c(2009,5))
ATM2_monthly_ts <- ts(ATM2_monthly[,-1],
frequency=12, start=c(2009,5))
ATM3_monthly_ts <- ts(ATM3_monthly[,-1],
frequency=12, start=c(2009,5))
ATM4_monthly_ts <- ts(ATM4_monthly[,-1],
frequency=12, start=c(2009,5))
autoplot(ATM1_monthly_ts)
## column bind the different monthly time series datasets to compare the monthly patterns across the different datasets
library(dplyr)
temp1 <- ATM1_monthly %>% dplyr::rename(Cash1 = Cash)
temp2 <- ATM2_monthly %>% dplyr::rename(Cash2 = Cash)
temp3 <- ATM3_monthly %>% dplyr::rename(Cash3 = Cash)
temp4 <- ATM4_monthly %>% dplyr::rename(Cash4 = Cash)
df <- merge(temp1,temp2)
df <- merge(df,temp3)
df <- merge(df,temp4)
## monthly patterns of each of the dataset compared
df_ts <- ts(df[,2:5],frequency = 12, start=c(2009,5))
autoplot(df_ts)
ATM1
Create Daily Time Series with Frequency of 7
There definitely seems to be some seasonality, most likely weekly, and the mean appears relatively stationary.
ATM1_daily <- ATM1 %>% select(DATE,Cash)
ATM1_daily_ts <- ts(ATM1_daily[,-1],frequency = 7)
autoplot(ATM1_daily_ts)
In the winter it appears that Thursdays are much lower.
Thursdays Drop Off in the Winter but Fridays and Saturdays Really Pick Up
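No code is shown for the day-of-week view described above; a subseries plot is one way to produce it. This sketch uses ggsubseriesplot() from the forecast package and is an assumption about how the original figure was made.
## one panel per day of the week; the Thursday drop and the Friday/Saturday
## pickup late in the series can be seen in the individual panels
ggsubseriesplot(ATM1_daily_ts)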
Build the Auto ARIMA Function for the ATM #1 Time Series
atm1_daily_aa <- auto.arima(ATM1_daily_ts)
atm1_daily_aa_forecast = forecast(atm1_daily_aa,h=31)
atm1_may_daily_df <- forecast(atm1_daily_aa,h=31)
plot(atm1_daily_aa_forecast)
ATM1_May_Daily_Predictions <- as.data.frame(atm1_may_daily_df$mean)
colnames(ATM1_May_Daily_Predictions) <- c("Cash")
atm1_arima_forecast <- function(x, h) {
  forecast(Arima(x, order = c(0, 0, 0), seasonal = c(0, 1, 1)), h = h)
}
Look at the PACF and ACF Plots for ATM1
It is interesting because we see a definite weekly pattern, with lags 7, 14, and 21 all being very high, which makes a lot of sense: all Fridays are highly correlated with one another. The other interesting signal captured here is a strong negative correlation at lag 2 and lag 5, which also makes sense: if there was a lot of activity on a Friday, there might not be as much activity on Sunday because people have already taken their money out. The same reasoning might explain the lag 5 correlation: if people took money out the previous Sunday, they might need to take money out again on Friday.
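The ACF and PACF plot code is not shown above; a simple way to reproduce the kind of plot being described is ggtsdisplay() from the forecast package, which shows the series along with its ACF and PACF (a sketch, not necessarily the original call).
## series, ACF, and PACF in one display; the spikes at lags 7, 14, and 21
## reflect the weekly pattern discussed above
ggtsdisplay(ATM1_daily_ts)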
Looking at the Fit for the Holt-Winters Exponential Model
fit1 <- hw(ATM1_daily_ts,seasonal="additive",h=31)
autoplot(ATM1_daily_ts) +
autolayer(fit1, series="HW additive forecasts", PI=FALSE)
Write a loop to look at cross-validation for each time series forecast method
The Auto ARIMA model was the best-performing model in this cross-validation experiment, followed by the stlf model, the forecast model, and the seasonal naive model. I think this suggests that this data has a very strong weekly seasonality.
method <- c("atm1_arima_forecast","hw","snaive","rwf","croston","holt","ses","forecast","stlf")
for (i in method){
  # Compute cross-validated errors for up to 31 steps ahead
  e <- tsCV(ATM1_daily_ts, forecastfunction = get(i), h = 31)
  # Compute the MSE values and remove missing values
  mse <- colMeans(e^2, na.rm = TRUE)
  print(paste("The residual mean squared error of the",i,"model is:",sqrt(mean(e^2, na.rm = TRUE))))
}
## [1] "The residual mean squared error of the atm1_arima_forecast model is: 29.1065899435762"
## [1] "The residual mean squared error of the hw model is: 34.5888153397788"
## [1] "The residual mean squared error of the snaive model is: 34.2174197194286"
## [1] "The residual mean squared error of the rwf model is: 52.7049119687405"
## [1] "The residual mean squared error of the croston model is: 38.0672562340957"
## [1] "The residual mean squared error of the holt model is: 93.1366051062762"
## [1] "The residual mean squared error of the ses model is: 38.0252391767253"
## [1] "The residual mean squared error of the forecast model is: 30.7509172015664"
## [1] "The residual mean squared error of the stlf model is: 30.1257231767895"
Produce the Output Dataset
Predictions_ATM1 <- as.data.frame(atm1_daily_aa_forecast$mean)
colnames(Predictions_ATM1) <- c("Cash")
Predictions_ATM1$Dates <- seq(from = as.Date("2010-05-01"), to = as.Date("2010-05-31"), by = 'day')
#Predictions_ATM1$Dates <- format(as.Date(Predictions_ATM1$Dates), "%Y-%m")
write.csv(Predictions_ATM1,file = "ATM1_Predictions.csv")
ATM2
Look at the Patterns for ATM2
ATM2_daily <- ATM2 %>% select(DATE,Cash)
ATM2_daily_ts <- ts(ATM2_daily[,-1],frequency = 7)
autoplot(ATM2_daily_ts)
ATM2 has a very similar pattern to ATM1, with Fridays and Saturdays really picking up at the end of the year and Wednesdays and Thursdays really falling off.
Look at the PACF and ACF Plots for ATM2
ATM2 has this same pattern of being highly correlated on a day-of-week basis, but also on a lag 2 and a lag 5 basis, for the reasons I outlined previously.
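As with ATM1, the plotting call is not shown; a ggtsdisplay() sketch along the same lines would be:
## series, ACF, and PACF for ATM2
ggtsdisplay(ATM2_daily_ts)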
Look at the Auto Arima Model for the Daily Data of ATM2
atm2_daily_aa <- auto.arima(ATM2_daily_ts)
atm2_daily_aa_forecast = forecast(atm2_daily_aa,h=31)
atm2_may_daily_df <- forecast(atm2_daily_aa,h=31)
plot(atm2_daily_aa_forecast)
atm2_May_Daily_Predictions <- as.data.frame(atm2_may_daily_df$mean)
colnames(atm2_May_Daily_Predictions) <- c("Cash")
atm2_arima_forecast <- function(x, h) {
  forecast(Arima(x, order = c(2, 0, 2), seasonal = c(0, 1, 1)), h = h)
}
Write a loop to look at cross-validation for each time series forecast method
method <- c("atm2_arima_forecast","hw","snaive","rwf","croston","holt","ses","forecast","stlf")
for (i in method){
  # Compute cross-validated errors for up to 31 steps ahead
  e <- tsCV(ATM2_daily_ts, forecastfunction = get(i), h = 31)
  # Compute the MSE values and remove missing values
  mse <- colMeans(e^2, na.rm = TRUE)
  print(paste("The residual mean squared error of the",i,"model is:",sqrt(mean(e^2, na.rm = TRUE))))
}
## [1] "The residual mean squared error of the atm2_arima_forecast model is: 29.4072668588952"
## [1] "The residual mean squared error of the hw model is: 39.8609680065869"
## [1] "The residual mean squared error of the snaive model is: 32.3215720436129"
## [1] "The residual mean squared error of the rwf model is: 54.616660214384"
## [1] "The residual mean squared error of the croston model is: 39.0371364130515"
## [1] "The residual mean squared error of the holt model is: 51.3220082384641"
## [1] "The residual mean squared error of the ses model is: 39.2157011040056"
## [1] "The residual mean squared error of the forecast model is: 37.291695905464"
## [1] "The residual mean squared error of the stlf model is: 29.6972764945933"
Produce the Output Dataset
Predictions_ATM2 <- as.data.frame(atm2_daily_aa_forecast$mean)
colnames(Predictions_ATM2) <- c("Cash")
Predictions_ATM2$Dates <- seq(from = as.Date("2010-05-01"), to = as.Date("2010-05-31"), by = 'day')
write.csv(Predictions_ATM2,file = "ATM2_Predictions.csv")
The ARIMA model was the best-performing model, so this is the one that we will choose. Look at how the predictions get worse as you forecast further and further ahead.
e <- tsCV(ATM2_daily_ts, forecastfunction = atm2_arima_forecast, h = 31)
# Compute the MSE values and remove missing values
mse <- colMeans(e^2, na.rm = TRUE)
print(mean(e^2, na.rm = TRUE))
## [1] 864.7873
# Plot the MSE values against the forecast horizon
data.frame(h = 1:31, MSE = mse) %>%
ggplot(aes(x = h, y = MSE)) + geom_point()
ATM3
ATM3 has practically no data, so there are no real day-of-week patterns present.
ATM3_daily <- ATM3 %>% select(DATE,Cash)
ATM3_daily_ts <- ts(ATM3_daily[,-1],frequency = 7)
autoplot(ATM3_daily_ts)
Look at the Daily Data for ATM3
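The chunk for this step is missing from the rendered document; one minimal way to look at the handful of days with any activity is to filter out the zero-cash rows (a sketch, not necessarily the original code).
## ATM3 only shows withdrawals on its last few days
ATM3_daily %>% filter(Cash > 0)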
Produce the Output Dataset
For ATM3 I am just using the average of the very few observations that we have to estimate the cash volume for May
## estimate the May cash volume as the mean of the non-zero observations
## (assumed definition -- `estimate` is not defined elsewhere in the document)
estimate <- mean(ATM3_daily$Cash[ATM3_daily$Cash > 0])
Predictions_ATM3 <- as.data.frame(seq(from = as.Date("2010-05-01"), to = as.Date("2010-05-31"), by = 'day'))
Predictions_ATM3$Cash <- estimate
colnames(Predictions_ATM3) <- c("Dates","Cash")
write.csv(Predictions_ATM3,file = "ATM3_Predictions.csv")
ATM4
Look at the Patterns for ATM4 - I replace the maximum observation with the time series mean since it is an outlier
ATM4_daily <- ATM4 %>% select(DATE,Cash)
ATM4_daily$Cash[ATM4_daily$Cash == max(ATM4_daily$Cash)] <- mean(ATM4_daily$Cash)
ATM4_daily_ts <- ts(ATM4_daily[,-1],frequency = 7)
autoplot(ATM4_daily_ts)
ATM4 has some of the same patterns as the other time series. The biggest difference is that Friday is relatively high the entire year but takes a dip in the third quarter and then rebounds in the fourth quarter.
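A day-of-week subseries plot is one way to see the Friday behavior described above; this sketch uses ggsubseriesplot() from the forecast package (an assumption, since the original plot code is not shown).
## one panel per day of the week; this makes the Friday pattern described
## above easier to see
ggsubseriesplot(ATM4_daily_ts)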
Look at the Auto Arima Model for the Daily Data of ATM4
atm4_daily_aa <- auto.arima(ATM4_daily_ts)
atm4_daily_aa_forecast = forecast(atm4_daily_aa,h=31)
atm4_may_daily_df <- forecast(atm4_daily_aa,h=31)
plot(atm4_daily_aa_forecast)
atm4_May_Daily_Predictions <- as.data.frame(atm4_may_daily_df$mean)
colnames(atm4_May_Daily_Predictions) <- c("Cash")
atm4_arima_forecast <- function(x, h) {
  forecast(Arima(x, order = c(0, 0, 3), seasonal = c(1, 0, 0)), h = h)
}
Write a loop to look at cross-validation for each time series forecast method
In this circumstance for ATM4, the stlf model was among the best performers in the cross-validation process (essentially tied with the ARIMA model and only slightly behind the generic forecast call), so the STL approach is the one built out below.
method <- c("atm4_arima_forecast","hw","snaive","rwf","croston","holt","ses","forecast","stlf")
for (i in method){
  # Compute cross-validated errors for up to 31 steps ahead
  e <- tsCV(ATM4_daily_ts, forecastfunction = get(i), h = 31)
  # Compute the MSE values and remove missing values
  mse <- colMeans(e^2, na.rm = TRUE)
  print(paste("The residual mean squared error of the",i,"model is:",sqrt(mean(e^2, na.rm = TRUE))))
}
## [1] "The residual mean squared error of the atm4_arima_forecast model is: 353.185687483537"
## [1] "The residual mean squared error of the hw model is: 417.851618570007"
## [1] "The residual mean squared error of the snaive model is: 462.839307087467"
## [1] "The residual mean squared error of the rwf model is: 497.809131158165"
## [1] "The residual mean squared error of the croston model is: 365.683954212915"
## [1] "The residual mean squared error of the holt model is: 623.458955536311"
## [1] "The residual mean squared error of the ses model is: 359.313978870583"
## [1] "The residual mean squared error of the forecast model is: 348.852156295643"
## [1] "The residual mean squared error of the stlf model is: 353.106559731865"
Build out the STL Model
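The chunk that produced the output below is not shown; it was most likely a call to stlf() on the daily ATM4 series with a 31-day horizon, as in this sketch (the exact arguments and object name are assumptions).
## STL decomposition with an ETS forecast for the 31 days of May
atm4_stlf_forecast <- stlf(ATM4_daily_ts, h = 31)
summary(atm4_stlf_forecast)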
##
## Forecast method: STL + ETS(A,N,N)
##
## Model Information:
## ETS(A,N,N)
##
## Call:
## ets(y = x, model = etsmodel, allow.multiplicative.trend = allow.multiplicative.trend)
##
## Smoothing parameters:
## alpha = 1e-04
##
## Initial states:
## l = 444.0484
##
## sigma: 302.0805
##
## AIC AICc BIC
## 6326.263 6326.330 6337.963
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.06823511 301.2518 234.929 -375.4241 407.5933 0.6781699
## ACF1
## Training set 0.1187377
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 53.14286 410.80723 23.67545 797.9390 -181.25973 1002.8742
## 53.28571 455.95663 68.82485 843.0884 -136.11033 1048.0236
## 53.42857 452.18884 65.05706 839.3206 -139.87813 1044.2558
## 53.57143 66.55626 -320.57553 453.6880 -525.51072 658.6232
## 53.71429 607.52712 220.39533 994.6589 15.46014 1199.5941
## 53.85714 492.93405 105.80227 880.0658 -99.13292 1085.0010
## 54.00000 623.06195 235.93016 1010.1937 30.99497 1215.1289
## 54.14286 410.80723 23.67544 797.9390 -181.25975 1002.8742
## 54.28571 455.95663 68.82484 843.0884 -136.11035 1048.0236
## 54.42857 452.18884 65.05705 839.3206 -139.87815 1044.2558
## 54.57143 66.55626 -320.57554 453.6881 -525.51074 658.6232
## 54.71429 607.52712 220.39532 994.6589 15.46012 1199.5941
## 54.85714 492.93405 105.80225 880.0659 -99.13294 1085.0011
## 55.00000 623.06195 235.93015 1010.1938 30.99495 1215.1290
## 55.14286 410.80723 23.67543 797.9390 -181.25977 1002.8742
## 55.28571 455.95663 68.82483 843.0884 -136.11037 1048.0236
## 55.42857 452.18884 65.05703 839.3207 -139.87817 1044.2559
## 55.57143 66.55626 -320.57556 453.6881 -525.51076 658.6233
## 55.71429 607.52712 220.39530 994.6589 15.46010 1199.5941
## 55.85714 492.93405 105.80224 880.0659 -99.13296 1085.0011
## 56.00000 623.06195 235.93014 1010.1938 30.99493 1215.1290
## 56.14286 410.80723 23.67541 797.9390 -181.25979 1002.8743
## 56.28571 455.95663 68.82481 843.0885 -136.11039 1048.0237
## 56.42857 452.18884 65.05702 839.3207 -139.87819 1044.2559
## 56.57143 66.55626 -320.57557 453.6881 -525.51078 658.6233
## 56.71429 607.52712 220.39529 994.6589 15.46008 1199.5942
## 56.85714 492.93405 105.80223 880.0659 -99.13298 1085.0011
## 57.00000 623.06195 235.93012 1010.1938 30.99491 1215.1290
## 57.14286 410.80723 23.67540 797.9391 -181.25981 1002.8743
## 57.28571 455.95663 68.82480 843.0885 -136.11041 1048.0237
## 57.42857 452.18884 65.05701 839.3207 -139.87821 1044.2559
Produce the Output Dataset
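The final output chunk for ATM4 is missing from the rendered document; following the same pattern used for ATM1 and ATM2, it would look something like the sketch below, which uses the atm4_stlf_forecast object from the earlier sketch (an assumption about which model was exported).
Predictions_ATM4 <- as.data.frame(atm4_stlf_forecast$mean)
colnames(Predictions_ATM4) <- c("Cash")
Predictions_ATM4$Dates <- seq(from = as.Date("2010-05-01"), to = as.Date("2010-05-31"), by = 'day')
write.csv(Predictions_ATM4,file = "ATM4_Predictions.csv")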