We are asked to forecast how much cash is taken out of 4 different ATM machines for May 2010. We are given data in a single file with variable cash provided in hundreds of dollars. Explain and demonstrate you process, techniques used and not used and your actual forecast.
# Load the dataset
data <- read_excel("ATM624Data.xlsx")
# View the first few rows of the dataset
head(data)
## # A tibble: 6 × 3
## DATE ATM Cash
## <dbl> <chr> <dbl>
## 1 39934 ATM1 96
## 2 39934 ATM2 107
## 3 39935 ATM1 82
## 4 39935 ATM2 89
## 5 39936 ATM1 85
## 6 39936 ATM2 90
# Check for missing values
summarise_all(data, funs(sum(is.na(.))))
## # A tibble: 1 × 3
## DATE ATM Cash
## <int> <int> <int>
## 1 0 14 19
# Summary statistics for 'Cash'
summary(data$Cash)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 0.5 73.0 155.6 114.0 10919.8 19
There is 19 missing values. To handle missing values, we can either remove rows with missing data or impute them.
data_clean <- na.omit(data)
Now, we are plotting the data.
# Convert DATE to Date type
data_clean$DATE <- as.Date(data_clean$DATE)
# Plot cash withdrawals over time by ATM
ggplot(data_clean, aes(x = DATE, y = Cash, color = ATM)) +
geom_line() +
labs(title = "Cash Withdrawals Over Time by ATM", x = "Date", y = "Cash Withdrawn (in Hundreds of Dollars)") +
theme_minimal()
# Example for ATM1
data_atm1 <- data_clean %>%
filter(ATM == "ATM1") %>%
arrange(DATE)
# Ensure DATE is a Date object and Cash is numeric
data_atm1$DATE <- as.Date(data_atm1$DATE)
data_atm1$Cash <- as.numeric(data_atm1$Cash)
# Convert to ts (time series) object, assuming monthly data starting from May 2009
ts_data_atm1 <- ts(data_atm1$Cash, start = c(2009, 5), frequency = 12)
# season plotting
ggseasonplot(ts_data_atm1)
# Plotting Time series for ATM1
tsdisplay(ts_data_atm1)
# Performing seasonal decomposition
plot(decompose(ts_data_atm1))
ATM 1 shows no trend but there is obvious seasonality, shown by the spikes at 7, 14, and 21 in the ACF plot.
# Fit an ARIMA model
model_atm1 <- auto.arima(ts_data_atm1)
# Showing ARIMA model
model_atm1
## Series: ts_data_atm1
## ARIMA(5,0,3)(1,0,0)[12] with non-zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## 0.0081 -0.7882 0.0220 -0.1709 -0.3984 0.1017 0.7305 -0.1646
## s.e. 0.0812 0.0547 0.0916 0.0557 0.0539 0.0804 0.0411 0.0722
## sar1 mean
## 0.1999 83.7966
## s.e. 0.0585 1.4214
##
## sigma^2 = 946.9: log likelihood = -1750.44
## AIC=3522.89 AICc=3523.64 BIC=3565.7
# Forecast for May 2010 (1 month ahead)
forecast_data <- forecast(model_atm1, h = 1)
# Print forecast
print(forecast_data)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jul 2039 89.93613 50.50065 129.3716 29.62477 150.2475
# Example plot of forecasts
autoplot(forecast_data) +
labs(title = "Forecast of Cash Withdrawals for ATM1", x = "Date", y = "Cash Withdrawn (in Hundreds of Dollars)")
# Assuming 'forecast_data' contains your final forecasts
# Replace 'forecast_data' with your actual forecast data frame
# write.xlsx(forecast_data, file = "forecast_data.xlsx")
# Filter data for ATM2
data_atm2 <- data_clean %>% filter(ATM == "ATM2") %>% arrange(DATE)
# Convert to ts object for ATM2
ts_data_atm2 <- ts(data_atm2$Cash, start = c(2009, 5), frequency = 12)
# season plotting
ggseasonplot(ts_data_atm2)
# Plotting Time series for ATM2
tsdisplay(ts_data_atm2)
# Performing seasonal decomposition
plot(decompose(ts_data_atm2))
# Fit an ARIMA model for ATM2
model_atm2 <- auto.arima(ts_data_atm2)
# Showing ARIMA model
model_atm2
## Series: ts_data_atm2
## ARIMA(3,1,1)(1,0,2)[12] with drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 sar1 sma1 sma2 drift
## -0.0731 -0.3713 0.0382 -0.9800 0.1661 -0.3317 -0.1563 -0.0661
## s.e. 0.0587 0.0629 0.0625 0.0131 0.2625 0.2613 0.0953 0.0200
##
## sigma^2 = 1206: log likelihood = -1796.98
## AIC=3611.96 AICc=3612.47 BIC=3646.98
# Forecast for May 2010 for ATM2
forecast_data_atm2 <- forecast(model_atm2, h = 1)
# Print forecast for ATM2
print(forecast_data_atm2)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Aug 2039 32.17034 -12.33733 76.67801 -35.89827 100.239
# Example plot of forecasts for ATM2
autoplot(forecast_data_atm2) +
labs(title = "Forecast of Cash Withdrawals for ATM2", x = "Date", y = "Cash Withdrawn (in Hundreds of Dollars)")
# Example for ATM3
data_atm3 <- data_clean %>%
filter(ATM == "ATM3") %>%
arrange(DATE)
# Ensure DATE is a Date object and Cash is numeric
data_atm3$DATE <- as.Date(data_atm3$DATE)
data_atm3$Cash <- as.numeric(data_atm3$Cash)
# Convert to ts (time series) object, assuming monthly data starting from May 2009
ts_data_atm3 <- ts(data_atm3$Cash, start = c(2009, 5), frequency = 12)
# season plotting
#ggseasonplot(ts_data_atm3)
# Plotting Time series for ATM3
tsdisplay(ts_data_atm3)
# Performing seasonal decomposition
plot(decompose(ts_data_atm3))
# Fit an ARIMA model
model_atm3 <- auto.arima(ts_data_atm3)
# Showing ARIMA model
model_atm3
## Series: ts_data_atm3
## ARIMA(0,0,2) with zero mean
##
## Coefficients:
## ma1 ma2
## 0.8392 0.8557
## s.e. 0.0496 0.0611
##
## sigma^2 = 25.4: log likelihood = -1108.69
## AIC=2223.39 AICc=2223.46 BIC=2235.09
# Forecast for May 2010 (1 month ahead)
forecast_data <- forecast(model_atm3, h = 1)
# Print forecast
print(forecast_data)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Oct 2039 2.610608 -3.848409 9.069625 -7.267606 12.48882
# Example plot of forecasts
# autoplot(forecast_data) +
# labs(title = "Forecast of Cash Withdrawals for ATM3", x = "Date", y = "Cash Withdrawn (in Hundreds of Dollars)")
I will not be forecasting ATM 3 as it only contains 3 point, which indicates to me that this is a newly installed ATM. I would say to wait at least 3 months and then attempt to forecast at that time.Imputing the3 entries was not fruitful because there were only 3 entries.
# Example for ATM4
data_atm4 <- data_clean %>%
filter(ATM == "ATM4") %>%
arrange(DATE)
# Ensure DATE is a Date object and Cash is numeric
data_atm4$DATE <- as.Date(data_atm4$DATE)
data_atm4$Cash <- as.numeric(data_atm4$Cash)
# Convert to ts (time series) object, assuming monthly data starting from May 2009
ts_data_atm4 <- ts(data_atm4$Cash, start = c(2009, 5), frequency = 12)
# season plotting
ggseasonplot(ts_data_atm4)
# Plotting Time series for ATM4
tsdisplay(ts_data_atm4)
# Performing seasonal decomposition
plot(decompose(ts_data_atm4))
# Fit an ARIMA model
model_atm4 <- auto.arima(ts_data_atm4)
# Showing ARIMA model
model_atm4
## Series: ts_data_atm4
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 474.0433
## s.e. 34.0248
##
## sigma^2 = 423718: log likelihood = -2882.03
## AIC=5768.06 AICc=5768.1 BIC=5775.86
# Forecast for May 2010 (1 month ahead)
forecast_data <- forecast(model_atm4, h = 1)
# Print forecast
print(forecast_data)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Oct 2039 474.0433 -360.1647 1308.251 -801.7678 1749.854
# Example plot of forecasts
autoplot(forecast_data) +
labs(title = "Forecast of Cash Withdrawals for ATM4", x = "Date", y = "Cash Withdrawn (in Hundreds of Dollars)")
ATM 4 shows the same seasonality information. The spike from the outlier is no longer present because we used tsclean. I wonder if it would be worth it to show the before and after.
Part B consists of a simple data set of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.
data <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
head(data)
## # A tibble: 6 × 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
data$KWH <- as.numeric(data$KWH)
summary(data$KWH)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 770523 5429912 6283324 6502475 7620524 10655730 1
Handling Missing Values
data_b <- na.omit(data)
summary(data_b$KWH)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 770523 5429912 6283324 6502475 7620524 10655730
# Convert the data into time series
ts_data_b <- ts(data_b$KWH, start = c(1998, 1), frequency = 12)
# Plotting Time series
tsdisplay(ts_data_b)
# Performing seasonal decomposition
plot(decompose(ts_data_b))
It does not look as though tsclean replaced our outlier like I thought it would. I’m going to go back to manually change the outlier to the average. The decomposition plot shows an upward trend. Once again, I’ll use the Dickey-Fuller test to test for deterministic or stochastic trend.
model_b <- auto.arima(ts_data_b)
model_b
## Series: ts_data_b
## ARIMA(0,0,1)(0,1,2)[12] with drift
##
## Coefficients:
## ma1 sma1 sma2 drift
## 0.2431 -0.7287 0.1929 8501.749
## s.e. 0.0773 0.0800 0.0861 3639.647
##
## sigma^2 = 9.396e+11: log likelihood = -2722.65
## AIC=5455.3 AICc=5455.65 BIC=5471.24
forecast_b <- forecast(model_b, h = 12) # Forecasting for the next 12 months (2014)
autoplot(forecast_b) +
labs(title = "Forecast of Monthly Residential Power Usage for 2014",
x = "Year", y = "Power Usage (KWH)") +
autolayer(fitted(model_b), series = "Fitted")
The chart is used to visualize the historical trend of power usage and to forecast future consumption, allowing for planning and resource allocation based on expected demand.