## Libraries
#install.packages("tidyverse")
library(forecast)
library(fpp2)
library(ggplot2)
#library(tidyverse)
library(lubridate)
library(scales)
library(tidyverse)
library(dplyr)
In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.
## Loading the ATM data
df_atm <- readxl::read_excel("ATM624Data.xlsx")
df_atm<- drop_na(df_atm)
head(df_atm)
## # A tibble: 6 x 3
## DATE ATM Cash
## <dbl> <chr> <dbl>
## 1 39934 ATM1 96
## 2 39934 ATM2 107
## 3 39935 ATM1 82
## 4 39935 ATM2 89
## 5 39936 ATM1 85
## 6 39936 ATM2 90
df_atm$DATE =as.Date(df_atm$DATE, origin = "1899-12-30")
# Remove na's
df_atm = df_atm[!is.na(df_atm$ATM),]
df_atm <- drop_na(df_atm)
head(df_atm)
## # A tibble: 6 x 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-01 ATM2 107
## 3 2009-05-02 ATM1 82
## 4 2009-05-02 ATM2 89
## 5 2009-05-03 ATM1 85
## 6 2009-05-03 ATM2 90
summary(df_atm)
## DATE ATM Cash
## Min. :2009-05-01 Length:1455 Min. : 0.0
## 1st Qu.:2009-08-01 Class :character 1st Qu.: 0.5
## Median :2009-10-31 Mode :character Median : 73.0
## Mean :2009-10-30 Mean : 155.6
## 3rd Qu.:2010-01-29 3rd Qu.: 114.0
## Max. :2010-04-30 Max. :10919.8
str(df_atm)
## tibble [1,455 × 3] (S3: tbl_df/tbl/data.frame)
## $ DATE: Date[1:1455], format: "2009-05-01" "2009-05-01" ...
## $ ATM : chr [1:1455] "ATM1" "ATM2" "ATM1" "ATM2" ...
## $ Cash: num [1:1455] 96 107 82 89 85 90 90 55 99 79 ...
## Create Time Series and Visualize data.
df_atm_ts <- ts(df_atm$Cash, start = c(2009,5,1), frequency = 365)
autoplot(df_atm_ts)+xlab("days") + ylab("Dollars(hundreds)") + ggtitle("CASH WITHDRAWAL FOR ATM1,ATM2,ATM3,ATM4")
Based on the below plots for ATM1 & ATM2 they seem to have similar characteristics, i.e., seasonality effects, along with the range of cash withdrawals.
ATM 3 did not have much activity until April 28, 2010, which is starkly different than the previous 2 ATMs. It may have been recently installed. Given the very small number of withdrawals from this ATM, forecasting may be a simple method like taking the mean or median as there is not enough data points for modelling.
2010-04-25 ATM3 0
2010-04-26 ATM3 0
2010-04-27 ATM3 0
2010-04-28 ATM3 96
2010-04-29 ATM3 82
2010-04-30 ATM3 85
ATM 4 has similar characteristics as ATM1 and ATM2 except for a major spike in cash withdrawals ($1,091,900) on Feb 9th 2010. An initial assessment is that this is an outlier but further investigation is needed to confirm that it is indeed an outlier. If so, removing or replacing this data point with the median or mean may be appropriate.
Given the variations in the ATMs cash withdrawals. I will move forward with forecasting cash withdrawals for each ATM separately.
Subsetting data for each ATM to work on separately.
#ATM1
df_atm1<- filter(df_atm, ATM == "ATM1")
df_atm1_ts <- ts(df_atm1$Cash, start = c(2009,5,1), frequency = 365)
autoplot(df_atm1_ts)+xlab("days") + ylab("Dollars(hundreds)") + ggtitle("ATM1")
#ATM2
df_atm2<- filter(df_atm, ATM == "ATM2")
df_atm2_ts <- ts(df_atm2$Cash, start = c(2009,5,1), frequency = 365)
autoplot(df_atm2_ts)+xlab("days") + ylab("Dollars(hundreds)") + ggtitle("ATM2")
#ATM3
df_atm3<- filter(df_atm, ATM == "ATM3")
df_atm3_ts <- ts(df_atm3$Cash, start = c(2009,5,1), frequency = 365)
autoplot(df_atm3_ts)+xlab("days") + ylab("Dollars(hundreds)") + ggtitle("ATM3")
#ATM4
df_atm4<- filter(df_atm, ATM == "ATM4")
df_atm4_ts <- ts(df_atm4$Cash, start = c(2009,5,1), frequency = 365)
autoplot(df_atm4_ts)+xlab("days") + ylab("Dollars(hundreds)") + ggtitle("ATM4")
#tail(df_atm2)
#tail(df_atm3)
#max(df_atm4$Cash)
#$1,091,900
Given that all cash withdrawals from ATM4 was within $150,000 and the spike withdrawal was over 1 Million dollars, it is more likely an outlier. Based on this, this data point will be replaced with the mean value of cash withdrawals for ATM4.
max(df_atm4$Cash)
## [1] 10919.76
df_atm4<- filter(df_atm, ATM == "ATM4")
df_atm4$Cash[which.max(df_atm4$Cash)] <- median(df_atm4$Cash, na.rm = TRUE)
df_atm4_ts <- ts(df_atm4$Cash, start = c(2009,5,1), frequency = 365)
autoplot(df_atm4_ts)+xlab("days") + ylab("Dollars(hundreds)") + ggtitle("ATM4")
#Testing for stationary:
tseries::adf.test(df_atm1_ts)
##
## Augmented Dickey-Fuller Test
##
## data: df_atm1_ts
## Dickey-Fuller = -4.6043, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
From the results of the adf test (P-value < 0.05 ) we see that the ATM data is stationary. Setting the frequency to 7.
#From the adf plot there is a weekly seasonality present in the data:
df_atm1_ts <- ts(df_atm1$Cash, start = c(2009,5,1), frequency = 7)
ggtsdisplay(df_atm1_ts, main = "ATM1 With Frequency = 7")
The spikes at 7,14,21 days(lag) on the ACF and PACF plots suggests that there is a seasonal and nonseasonal component to the series. Based on this assessment using the ARIMA for forecasting ATM1 cash withdrawal seems appropriate.
Using the auto.arima function to find pdq,PDQ which will do a grid search to find the best model parameters
auto_1 = auto.arima(df_atm1_ts, seasonal = TRUE,trace = FALSE)
auto_1
## Series: df_atm1_ts
## ARIMA(0,0,1)(0,1,1)[7]
##
## Coefficients:
## ma1 sma1
## 0.1674 -0.5813
## s.e. 0.0565 0.0472
##
## sigma^2 estimated as 666.6: log likelihood=-1658.32
## AIC=3322.63 AICc=3322.7 BIC=3334.25
Using the above pdq/PDQ results to fit the ARIMA model:
atm1_lambda<-BoxCox.lambda(df_atm1_ts)
# Using lamba and pdq/PDQ to fit the Arima model
df_atm1_ts_fit<-Arima(df_atm1_ts, order = c(0, 0, 1), seasonal = c(0, 1, 1), lambda = atm1_lambda)
atm1_forecast<-forecast(df_atm1_ts_fit, 31)
autoplot(atm1_forecast) + labs(title = "ATM1 ARIMA(0,0,1)(0,1,1) FORECAST", x = "Week", y = NULL)
Checking model fit: Based on the below ACF and the normal distribution of residuals. This Arima model fits well and will be used for predicting cash withdrawals for ATM1
# Understanding how well the model fits.
ggtsdisplay(residuals(auto_1), plot.type = "histogram",lag.max = 45, main = 'ATM1 Arima Model Fit Residuals')
#Testing for stationary:
tseries::adf.test(df_atm2_ts)
##
## Augmented Dickey-Fuller Test
##
## data: df_atm2_ts
## Dickey-Fuller = -6.009, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
From the results of the adf test (P-value < 0.05 ) we see that the ATM data is stationary. Setting the frequency to 7.
#From the adf plot there is a weekly seasonality present in the data:
df_atm2_ts <- ts(df_atm2$Cash, start = c(2009,5,1), frequency = 7)
ggtsdisplay(df_atm2_ts, main = "ATM2 With Frequency = 7")
The spikes at 7,14,21 days(lag) on the ACF and PACF plots suggests that there is a seasonal and nonseasonal component to the series. Based on this assessment using the ARIMA for forecasting ATM2 cash withdrawal seems appropriate.
Using the auto.arima function to find pdq,PDQ which will do a grid search to find the best model parameters
auto_atm2 = auto.arima(df_atm2_ts, seasonal = TRUE,trace = FALSE)
auto_atm2
## Series: df_atm2_ts
## ARIMA(2,0,2)(0,1,2)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1 sma2
## -0.4148 -0.8922 0.4008 0.7534 -0.7220 0.0801
## s.e. 0.0674 0.0510 0.1027 0.0688 0.0567 0.0550
##
## sigma^2 estimated as 708.1: log likelihood=-1671.98
## AIC=3357.96 AICc=3358.28 BIC=3385.08
Using the above pdq/PDQ results in the ARIMA model:
atm2_lambda<-BoxCox.lambda(df_atm2_ts)
# Using lamba and pdq/PDQ to fit the Arima model
df_atm2_ts_fit<-Arima(df_atm2_ts, order = c(0, 0, 1), seasonal = c(0, 1, 1), lambda = atm2_lambda)
atm2_forecast<-forecast(df_atm2_ts_fit, 31)
autoplot(atm2_forecast) + labs(title = "ATM2 ARIMA(2,0,2)(0,1,2) FORECAST", x = "Week", y = NULL)
Checking model fit: Based on the below ACF and the normal distribution of residuals. This Arima model fits well and will be used for predicting cash withdrawals for ATM2
# Understanding how well the model fits.
ggtsdisplay(residuals(auto_atm2), plot.type = "histogram",lag.max = 45, main = 'ATM2 Arima Model Fit Residuals')
Due to the fact that ATM3 on have data at the very end of the period with only a few withdrawals, I will forecast it using the average/mean.
head(df_atm_ts)
## Time Series:
## Start = c(2009, 5)
## End = c(2009, 10)
## Frequency = 365
## [1] 96 107 82 89 85 90
#head(df_atm3)
df_atm3[df_atm3==0] <- NA
df_atm3<-df_atm3[complete.cases(df_atm3),]
df_atm3_ts<-ts(df_atm3['Cash'], start = c(2010, 4, 28), frequency = 3)
atm3_forecast<-meanf(df_atm3_ts, 31)
autoplot(atm3_forecast) + labs(title = "ATM3: mean", x = "Day", y = NULL) + theme(legend.position = "none")
#head(df_atm3)
df_atm3[df_atm3==0] <- NA
df_atm3<-df_atm3[complete.cases(df_atm3),]
df_atm3_ts<-ts(df_atm3['Cash'], start = c(2010, 4, 28), frequency = 3)
#Testing for stationary:
tseries::adf.test(df_atm4_ts)
##
## Augmented Dickey-Fuller Test
##
## data: df_atm4_ts
## Dickey-Fuller = -5.6201, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
From the results of the adf test (P-value < 0.05 ) we see that the ATM data is stationary. Setting the frequency to 7.
#From the adf plot there is a weekly seasonality present in the data:
#df_atm4_ts <- ts(df_atm4$Cash, start = c(2009,5,1), frequency = 7)
#ggtsdisplay(df_atm4_ts, main = "ATM4 With Frequency = 7")
df_atm4$Cash[which.max(df_atm4$Cash)] <- mean(df_atm4$Cash, na.rm = TRUE)
df_atm4_ts <- ts(df_atm4$Cash, start = c(2009,5,1), frequency = 7)
ggtsdisplay(df_atm4_ts, main = "ATM4 With Frequency = 7")
The spikes at 7,14,21 days(lag) on the ACF and PACF plots suggests that there is a seasonal and nonseasonal component to the series. Based on this assessment using the ARIMA for forecasting ATM4 cash withdrawal seems appropriate.
Using the auto.arima function to find pdq,PDQ which will do a grid search to find the best model parameters
auto_atm4 = auto.arima(df_atm4_ts, seasonal = TRUE,trace = FALSE)
auto_atm4
## Series: df_atm4_ts
## ARIMA(2,0,3)(2,0,0)[7] with non-zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 sar1 sar2 mean
## -1.0378 -0.8320 1.1296 0.9639 0.1849 0.2255 0.0943 439.9880
## s.e. 0.0895 0.0973 0.0982 0.1139 0.0546 0.0576 0.0554 28.7199
##
## sigma^2 estimated as 111724: log likelihood=-2635.55
## AIC=5289.1 AICc=5289.6 BIC=5324.2
Using the above pdq/PDQ results in the ARIMA model:
atm4_lambda<-BoxCox.lambda(df_atm4_ts)
# Using lamba and pdq/PDQ to fit the Arima model
df_atm4_ts_fit<-Arima(df_atm4_ts, order = c(2, 0, 3), seasonal = c(2, 0, 0), lambda = atm4_lambda)
atm4_forecast<-forecast(df_atm4_ts_fit, 31)
autoplot(atm4_forecast) + labs(title = "ATM4 ARIMA(2,0,3)(2,0,0) FORECAST", x = "Week", y = NULL)
Checking model fit: Based on the below ACF and the normal distribution of residuals. This Arima model fits well and will be used for predicting cash withdrawals for ATM4. There is a bit of a right skewed histogram for the residual but still a I believe a decent fit for forecasting.
# Understanding how well the model fits.
ggtsdisplay(residuals(auto_atm4), plot.type = "histogram",lag.max = 45, main = 'ATM4 Arima Model Fit Residuals')
#Excel File
#data_frame(DATE = rep(max(df$DATE) + 1:31, 4), ATM = rep(names(df)[-1], each = 31), Cash = c(atm1_forecast$mean, atm2_forecast$mean, atm3_forecast$mean, atm4_forecast$mean)) %>%
# write_csv("calvin_cox_project1_atm_cash_prediction.csv")
Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.
df_power <- readxl::read_excel('ResidentialCustomerForecastLoad-624.xlsx')
df_power<- drop_na(df_power)
tail(df_power)
## # A tibble: 6 x 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 919 2013-Jul 8415321
## 2 920 2013-Aug 9080226
## 3 921 2013-Sep 7968220
## 4 922 2013-Oct 5759367
## 5 923 2013-Nov 5769083
## 6 924 2013-Dec 9606304
summary(df_power)
## CaseSequence YYYY-MMM KWH
## Min. :733.0 Length:191 Min. : 770523
## 1st Qu.:780.5 Class :character 1st Qu.: 5429912
## Median :828.0 Mode :character Median : 6283324
## Mean :828.3 Mean : 6502475
## 3rd Qu.:876.5 3rd Qu.: 7620524
## Max. :924.0 Max. :10655730
str(df_power)
## tibble [191 × 3] (S3: tbl_df/tbl/data.frame)
## $ CaseSequence: num [1:191] 733 734 735 736 737 738 739 740 741 742 ...
## $ YYYY-MMM : chr [1:191] "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
## $ KWH : num [1:191] 6862583 5838198 5420658 5010364 4665377 ...
df_power_ts<-ts(df_power[, "KWH"], start = c(1998, 1), frequency = 12)
autoplot(df_power_ts) + ggtitle('Residential Customer Power Usage')
#Testing for stationary:
tseries::adf.test(df_power_ts)
##
## Augmented Dickey-Fuller Test
##
## data: df_power_ts
## Dickey-Fuller = -4.8347, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
Checking KWH ACF / PACF Plots
ggtsdisplay(df_power_ts, main = "Checking KWH ACF / PACF")
Based on the ADF test, this series is stationary. In reviewing the ACT / PACF plots there appears to be seasonality of freq 12, however there are also large spikes at the 6 month periods as well.
Will try BoxCox to see if we can get a better series
df_power_ts_lambda<-BoxCox.lambda(df_power_ts)
df_power_ts_boxcox<-BoxCox(df_power_ts, df_power_ts_lambda)
ggtsdisplay(diff(df_power_ts_boxcox,12), main = "Checking KWH with Box Cox ACF / PACF")
From the ACF and PACF plot we can see significant spikes at 12,24,36(lag) that there is a seasonal and nonseasonal component to the series and as such decided to use ARIMA for forecasting KWH
Using the auto.arima function to find pdq,PDQ which will do a grid search to find the best model parameters
auto_kwh = auto.arima(df_power_ts_boxcox, seasonal = TRUE,trace = FALSE)
auto_kwh
## Series: df_power_ts_boxcox
## ARIMA(0,1,2)(0,0,2)[12]
##
## Coefficients:
## ma1 ma2 sma1 sma2
## -0.7492 -0.2268 0.2085 0.1959
## s.e. 0.0722 0.0711 0.0782 0.0718
##
## sigma^2 estimated as 0.3874: log likelihood=-179.35
## AIC=368.7 AICc=369.03 BIC=384.94
Using the above pdq/PDQ results in the ARIMA model:
df_power_ts_lambda<-BoxCox.lambda(df_power_ts)
# Using lamba and pdq/PDQ to fit the Arima model
df_power_ts_fit<-Arima(df_power_ts, order = c(0, 1, 2), seasonal = c(2, 0, 0), lambda = df_power_ts_lambda)
#df_atm4_ts_fit<-Arima(df_atm4_ts, order = c(0, 0, 1), lambda = atm4_lambda)
kwh_forecast<-forecast(df_power_ts_fit, 12)
autoplot(kwh_forecast) + labs(title = "KWH ARIMA(0,1,2)(2,0,0) FORECAST", x = "Week", y = NULL)
Checking model fit: Based on the below ACF and the normal distribution of residuals. This Arima model fits well and will be used for predicting KWH
# Understanding how well the model fits.
ggtsdisplay(residuals(auto_kwh), plot.type = "histogram",lag.max = 45, main = '(0,1,0) Model Residuals')
data_frame(`YYYY-MMM` = paste0(2014, "-", month.abb), KWH = kwh_forecast$mean) %>%
write_csv("calvin_cox_project1_kwh_prediction.csv")