Data 624 - Project 1

Part A – ATM Forecast

We are asked to forecast how much cash is taken out of 4 different ATM machines for May 2010. We are given data in a single file with variable cash provided in hundreds of dollars. Explain and demonstrate you process, techniques used and not used and your actual forecast.

# Load the dataset
data <- read_excel("ATM624Data.xlsx")

# View the first few rows of the dataset
head(data)

## # A tibble: 6 × 3
##    DATE ATM    Cash
##   <dbl> <chr> <dbl>
## 1 39934 ATM1     96
## 2 39934 ATM2    107
## 3 39935 ATM1     82
## 4 39935 ATM2     89
## 5 39936 ATM1     85
## 6 39936 ATM2     90

# Check for missing values
summarise_all(data, funs(sum(is.na(.))))

## # A tibble: 1 × 3
##    DATE   ATM  Cash
##   <int> <int> <int>
## 1     0    14    19

# Summary statistics for 'Cash'
summary(data$Cash)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     0.5    73.0   155.6   114.0 10919.8      19

Handling Missing Values

There is 19 missing values. To handle missing values, we can either remove rows with missing data or impute them.

data_clean <- na.omit(data)

Now, we are plotting the data.

# Convert DATE to Date type
data_clean$DATE <- as.Date(data_clean$DATE)

# Plot cash withdrawals over time by ATM
ggplot(data_clean, aes(x = DATE, y = Cash, color = ATM)) + 
  geom_line() + 
  labs(title = "Cash Withdrawals Over Time by ATM", x = "Date", y = "Cash Withdrawn (in Hundreds of Dollars)") +
  theme_minimal()

Data Preparation and Model Selection

ATM1

# Example for ATM1
data_atm1 <- data_clean %>% 
  filter(ATM == "ATM1") %>% 
  arrange(DATE)

# Ensure DATE is a Date object and Cash is numeric
data_atm1$DATE <- as.Date(data_atm1$DATE)
data_atm1$Cash <- as.numeric(data_atm1$Cash)

# Convert to ts (time series) object, assuming monthly data starting from May 2009
ts_data_atm1 <- ts(data_atm1$Cash, start = c(2009, 5), frequency = 12)

# season plotting
ggseasonplot(ts_data_atm1)

# Plotting Time series for ATM1
tsdisplay(ts_data_atm1)

# Performing seasonal decomposition
plot(decompose(ts_data_atm1))

ATM 1 shows no trend but there is obvious seasonality, shown by the spikes at 7, 14, and 21 in the ACF plot.

# Fit an ARIMA model
model_atm1 <- auto.arima(ts_data_atm1)

# Showing ARIMA model
model_atm1

## Series: ts_data_atm1 
## ARIMA(5,0,3)(1,0,0)[12] with non-zero mean 
## 
## Coefficients:
##          ar1      ar2     ar3      ar4      ar5     ma1     ma2      ma3
##       0.0081  -0.7882  0.0220  -0.1709  -0.3984  0.1017  0.7305  -0.1646
## s.e.  0.0812   0.0547  0.0916   0.0557   0.0539  0.0804  0.0411   0.0722
##         sar1     mean
##       0.1999  83.7966
## s.e.  0.0585   1.4214
## 
## sigma^2 = 946.9:  log likelihood = -1750.44
## AIC=3522.89   AICc=3523.64   BIC=3565.7

# Forecast for May 2010 (1 month ahead)
forecast_data <- forecast(model_atm1, h = 1)

# Print forecast
print(forecast_data)

##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Jul 2039       89.93613 50.50065 129.3716 29.62477 150.2475

# Example plot of forecasts
autoplot(forecast_data) + 
  labs(title = "Forecast of Cash Withdrawals for ATM1", x = "Date", y = "Cash Withdrawn (in Hundreds of Dollars)")

# Assuming 'forecast_data' contains your final forecasts
# Replace 'forecast_data' with your actual forecast data frame
# write.xlsx(forecast_data, file = "forecast_data.xlsx")

ATM2

# Filter data for ATM2
data_atm2 <- data_clean %>% filter(ATM == "ATM2") %>% arrange(DATE)

# Convert to ts object for ATM2
ts_data_atm2 <- ts(data_atm2$Cash, start = c(2009, 5), frequency = 12)

# season plotting
ggseasonplot(ts_data_atm2)

# Plotting Time series for ATM2
tsdisplay(ts_data_atm2)

# Performing seasonal decomposition
plot(decompose(ts_data_atm2))

# Fit an ARIMA model for ATM2
model_atm2 <- auto.arima(ts_data_atm2)

# Showing ARIMA model
model_atm2

## Series: ts_data_atm2 
## ARIMA(3,1,1)(1,0,2)[12] with drift 
## 
## Coefficients:
##           ar1      ar2     ar3      ma1    sar1     sma1     sma2    drift
##       -0.0731  -0.3713  0.0382  -0.9800  0.1661  -0.3317  -0.1563  -0.0661
## s.e.   0.0587   0.0629  0.0625   0.0131  0.2625   0.2613   0.0953   0.0200
## 
## sigma^2 = 1206:  log likelihood = -1796.98
## AIC=3611.96   AICc=3612.47   BIC=3646.98

# Forecast for May 2010 for ATM2
forecast_data_atm2 <- forecast(model_atm2, h = 1)

# Print forecast for ATM2
print(forecast_data_atm2)

##          Point Forecast     Lo 80    Hi 80     Lo 95   Hi 95
## Aug 2039       32.17034 -12.33733 76.67801 -35.89827 100.239

# Example plot of forecasts for ATM2
autoplot(forecast_data_atm2) + 
  labs(title = "Forecast of Cash Withdrawals for ATM2", x = "Date", y = "Cash Withdrawn (in Hundreds of Dollars)")

ATM3

# Example for ATM3
data_atm3 <- data_clean %>% 
  filter(ATM == "ATM3") %>% 
  arrange(DATE)

# Ensure DATE is a Date object and Cash is numeric
data_atm3$DATE <- as.Date(data_atm3$DATE)
data_atm3$Cash <- as.numeric(data_atm3$Cash)

# Convert to ts (time series) object, assuming monthly data starting from May 2009
ts_data_atm3 <- ts(data_atm3$Cash, start = c(2009, 5), frequency = 12)

# season plotting
#ggseasonplot(ts_data_atm3)

# Plotting Time series for ATM3
tsdisplay(ts_data_atm3)

# Performing seasonal decomposition
plot(decompose(ts_data_atm3))

# Fit an ARIMA model
model_atm3 <- auto.arima(ts_data_atm3)

# Showing ARIMA model
model_atm3

## Series: ts_data_atm3 
## ARIMA(0,0,2) with zero mean 
## 
## Coefficients:
##          ma1     ma2
##       0.8392  0.8557
## s.e.  0.0496  0.0611
## 
## sigma^2 = 25.4:  log likelihood = -1108.69
## AIC=2223.39   AICc=2223.46   BIC=2235.09

# Forecast for May 2010 (1 month ahead)
forecast_data <- forecast(model_atm3, h = 1)

# Print forecast
print(forecast_data)

##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Oct 2039       2.610608 -3.848409 9.069625 -7.267606 12.48882

# Example plot of forecasts
# autoplot(forecast_data) + 
#   labs(title = "Forecast of Cash Withdrawals for ATM3", x = "Date", y = "Cash Withdrawn (in Hundreds of Dollars)")

I will not be forecasting ATM 3 as it only contains 3 point, which indicates to me that this is a newly installed ATM. I would say to wait at least 3 months and then attempt to forecast at that time.Imputing the3 entries was not fruitful because there were only 3 entries.

ATM4

# Example for ATM4
data_atm4 <- data_clean %>% 
  filter(ATM == "ATM4") %>% 
  arrange(DATE)

# Ensure DATE is a Date object and Cash is numeric
data_atm4$DATE <- as.Date(data_atm4$DATE)
data_atm4$Cash <- as.numeric(data_atm4$Cash)

# Convert to ts (time series) object, assuming monthly data starting from May 2009
ts_data_atm4 <- ts(data_atm4$Cash, start = c(2009, 5), frequency = 12)

# season plotting
ggseasonplot(ts_data_atm4)

# Plotting Time series for ATM4
tsdisplay(ts_data_atm4)

# Performing seasonal decomposition
plot(decompose(ts_data_atm4))

# Fit an ARIMA model
model_atm4 <- auto.arima(ts_data_atm4)

# Showing ARIMA model
model_atm4

## Series: ts_data_atm4 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##           mean
##       474.0433
## s.e.   34.0248
## 
## sigma^2 = 423718:  log likelihood = -2882.03
## AIC=5768.06   AICc=5768.1   BIC=5775.86

# Forecast for May 2010 (1 month ahead)
forecast_data <- forecast(model_atm4, h = 1)

# Print forecast
print(forecast_data)

##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Oct 2039       474.0433 -360.1647 1308.251 -801.7678 1749.854

# Example plot of forecasts
autoplot(forecast_data) + 
  labs(title = "Forecast of Cash Withdrawals for ATM4", x = "Date", y = "Cash Withdrawn (in Hundreds of Dollars)")

ATM 4 shows the same seasonality information. The spike from the outlier is no longer present because we used tsclean. I wonder if it would be worth it to show the before and after.

Part B – Forecasting Power

Part B consists of a simple data set of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.

data <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")

head(data)

## # A tibble: 6 × 3
##   CaseSequence `YYYY-MMM`     KWH
##          <dbl> <chr>        <dbl>
## 1          733 1998-Jan   6862583
## 2          734 1998-Feb   5838198
## 3          735 1998-Mar   5420658
## 4          736 1998-Apr   5010364
## 5          737 1998-May   4665377
## 6          738 1998-Jun   6467147

data$KWH <- as.numeric(data$KWH)

summary(data$KWH)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   770523  5429912  6283324  6502475  7620524 10655730        1

Handling Missing Values

data_b <- na.omit(data)

summary(data_b$KWH)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   770523  5429912  6283324  6502475  7620524 10655730

# Convert the data into time series
ts_data_b <- ts(data_b$KWH, start = c(1998, 1), frequency = 12)

# Plotting Time series
tsdisplay(ts_data_b)

# Performing seasonal decomposition
plot(decompose(ts_data_b))

It does not look as though tsclean replaced our outlier like I thought it would. I’m going to go back to manually change the outlier to the average. The decomposition plot shows an upward trend. Once again, I’ll use the Dickey-Fuller test to test for deterministic or stochastic trend.

Model Selection and Forecasting

model_b <- auto.arima(ts_data_b)

model_b

## Series: ts_data_b 
## ARIMA(0,0,1)(0,1,2)[12] with drift 
## 
## Coefficients:
##          ma1     sma1    sma2     drift
##       0.2431  -0.7287  0.1929  8501.749
## s.e.  0.0773   0.0800  0.0861  3639.647
## 
## sigma^2 = 9.396e+11:  log likelihood = -2722.65
## AIC=5455.3   AICc=5455.65   BIC=5471.24

forecast_b <- forecast(model_b, h = 12) # Forecasting for the next 12 months (2014)

Visualization

autoplot(forecast_b) + 
  labs(title = "Forecast of Monthly Residential Power Usage for 2014",
       x = "Year", y = "Power Usage (KWH)") +
  autolayer(fitted(model_b), series = "Fitted")

The chart is used to visualize the historical trend of power usage and to forecast future consumption, allowing for planning and resource allocation based on expected demand.