##Part A
TO start we need to explore the data, reading the excel file into R and configuring the variable format.
head(atm_data)
## # A tibble: 6 × 3
## DATE ATM Cash
## <dttm> <chr> <dbl>
## 1 2009-05-01 00:00:00 ATM1 96
## 2 2009-05-01 00:00:00 ATM2 107
## 3 2009-05-02 00:00:00 ATM1 82
## 4 2009-05-02 00:00:00 ATM2 89
## 5 2009-05-03 00:00:00 ATM1 85
## 6 2009-05-03 00:00:00 ATM2 90
Taking a look at the data set structure. noticing that we need to configure it to tsibble. We can see that the data starts at 2009-05-01 up until 2010-04-30 with remaining values up to 2010-05-14 empty.
str(atm_data)
## tibble [1,474 × 3] (S3: tbl_df/tbl/data.frame)
## $ DATE: POSIXct[1:1474], format: "2009-05-01" "2009-05-01" ...
## $ ATM : chr [1:1474] "ATM1" "ATM2" "ATM1" "ATM2" ...
## $ Cash: num [1:1474] 96 107 82 89 85 90 90 55 99 79 ...
summary(atm_data)
## DATE ATM Cash
## Min. :2009-05-01 00:00:00.00 Length:1474 Min. : 0.0
## 1st Qu.:2009-08-01 00:00:00.00 Class :character 1st Qu.: 0.5
## Median :2009-11-01 00:00:00.00 Mode :character Median : 73.0
## Mean :2009-10-31 19:11:48.27 Mean : 155.6
## 3rd Qu.:2010-02-01 00:00:00.00 3rd Qu.: 114.0
## Max. :2010-05-14 00:00:00.00 Max. :10919.8
## NA's :19
Using the code we can look closely into the dataset and examine what values are missing. We can see that there are 14 missing values from ATM and 19 missing values for cash.
sum(is.na(atm_data))
## [1] 33
colSums(is.na(atm_data))
## DATE ATM Cash
## 0 14 19
Upon examining we can see the amount of cash withdrawn from each ATM on average for each transaction and total. We can see that ATM 4 was the most used in terms of cash with an average of 474 per transaction and 173 thousand dollars total for the year.
atm_counts <- atm_data %>%
group_by(ATM) %>%
summarise(Count = n(),
Avg_Cash = mean(Cash, na.rm = TRUE),
Sum_Cash = sum(Cash, na.rm = TRUE))
print(atm_counts)
## # A tibble: 5 × 4
## ATM Count Avg_Cash Sum_Cash
## <chr> <int> <dbl> <dbl>
## 1 ATM1 365 83.9 30367
## 2 ATM2 365 62.6 22716
## 3 ATM3 365 0.721 263
## 4 ATM4 365 474. 173026.
## 5 <NA> 14 NaN 0
Tidying and preparing the data
#since we do not have values past 04/30/2010 we can exclude these from the time series.
atm_data <- atm_data %>%
mutate(DATE = as.Date(DATE)) %>%
filter(DATE <= as.Date("2010-04-30"))
Convert to a tsibble (time series tibble) with DATE as index and ATM as key
atm_tsibble <- atm_data %>%
as_tsibble(index = DATE, key = ATM)
Check if the data has any gaps with the dates.
atm_tsibble %>% has_gaps()
## # A tibble: 4 × 2
## ATM .gaps
## <chr> <lgl>
## 1 ATM1 FALSE
## 2 ATM2 FALSE
## 3 ATM3 FALSE
## 4 ATM4 FALSE
Fill in the NA values and prepare them with cash is 0 for missing cash values, assuming there was no transaction balance taken out.
atm_data <- atm_data %>%
filter(!is.na(Cash))
atm_data <- atm_data %>%
mutate(Cash = as.numeric(Cash))
atm_tsibble <- atm_data %>%
as_tsibble(index = DATE, key = ATM) %>%
fill_gaps(Cash = 0)
atm_tsibble <- atm_data %>%
as_tsibble(index = DATE, key = ATM)
atm_tsibble <- atm_tsibble %>%
fill_gaps()
atm_forecasts <- atm_tsibble %>%
model(ARIMA(Cash)) %>%
forecast(h = "1 month")
print(atm_forecasts)
## # A fable: 120 x 5 [1D]
## # Key: ATM, .model [4]
## ATM .model DATE Cash .mean
## <chr> <chr> <date> <dist> <dbl>
## 1 ATM1 ARIMA(Cash) 2010-05-01 N(85, 602) 85.2
## 2 ATM1 ARIMA(Cash) 2010-05-02 N(98, 615) 98.1
## 3 ATM1 ARIMA(Cash) 2010-05-03 N(75, 616) 75.5
## 4 ATM1 ARIMA(Cash) 2010-05-04 N(3.5, 616) 3.53
## 5 ATM1 ARIMA(Cash) 2010-05-05 N(98, 616) 98.5
## 6 ATM1 ARIMA(Cash) 2010-05-06 N(78, 616) 78.1
## 7 ATM1 ARIMA(Cash) 2010-05-07 N(84, 616) 84.0
## 8 ATM1 ARIMA(Cash) 2010-05-08 N(85, 785) 84.9
## 9 ATM1 ARIMA(Cash) 2010-05-09 N(97, 789) 97.5
## 10 ATM1 ARIMA(Cash) 2010-05-10 N(78, 789) 77.6
## # ℹ 110 more rows
atm_forecasts %>%
filter_index("2010-05") %>%
autoplot(atm_tsibble) +
labs(title = "Forecasted Cash Withdrawals for May 2010 by ATM",
x = "Date",
y = "Cash Withdrawals (in hundreds of dollars)")
The forecast plot for May 2010 shows cash withdrawals for each ATM machine with an ARIMA model. Both ATM1 and ATM2 display consistent, high-frequency cash withdrawal patterns with regular peaks and troughs.ATM3 has a low withdrawal rate until a sudden spike near the end of the time series.
Additional Diagnosis of the Data
atm_diagnostics <- atm_tsibble %>%
model(ARIMA(Cash))
atm_diagnostics %>%
augment() %>%
ggplot(aes(x = DATE, y = .resid, color = ATM)) +
geom_line() +
facet_wrap(~ATM, scales = "free_y") +
labs(title = "Residuals of ARIMA Model for Each ATM",
y = "Residuals")
For ATM 1 and ATM2 both have relatively centered residuals around zero, which is a good sign, indicating that the ARIMA model captures the trend and seasonality reasonably well for these ATMs.The residuals appear to have random scatter, which indicates that the model may have captured most of the structure in the data, although there is some variance that the model didn’t account for. The residuals for ATM3 are mostly close to zero until a sharp increase around April 2010.This could indicate an anomaly or a sudden change in cash withdrawal patterns for ATM3 that the ARIMA model was not able to capture, possibly due to an event or error in the data.
The residuals for ATM4 also mostly center around zero but show one large spike, which could indicate an outlier or a sudden change in cash demand.The variance in residuals for ATM4 is generally lower, though, suggesting the model fits reasonably well except for that one anomaly.
Additional Models for ATM3 and ATM4
atm_data_3_4 <- atm_data %>% filter(ATM %in% c("ATM3", "ATM4"))
atm_data_3_4_tsibble <- atm_data_3_4 %>% as_tsibble(index = DATE, key = ATM)
ets_model <- atm_data_3_4_tsibble %>%
model(
ETS_model = ETS(Cash ~ error("A") + trend("A") + season("A"))
)
sarima_model <- atm_data_3_4_tsibble %>%
model(
SARIMA_model = ARIMA(Cash ~ pdq(1, 1, 1) + PDQ(1, 1, 1))
)
## Warning in sqrt(diag(best$var.coef)): NaNs produced
report(ets_model)
## Warning in report.mdl_df(ets_model): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 2 × 10
## ATM .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM3 ETS_model 26.0 -1666. 3356. 3357. 3403. 25.2 44.3 0.324
## 2 ATM4 ETS_model 423177. -3436. 6895. 6896. 6942. 410424. 411573. 302.
report(sarima_model)
## Warning in report.mdl_df(sarima_model): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 2 × 9
## ATM .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ATM3 SARIMA_model 26.1 -1087. 2184. 2184. 2203. <cpl [8]> <cpl [7]>
## 2 ATM4 SARIMA_model 416051. -2833. 5677. 5677. 5696. <cpl [8]> <cpl [8]>
Here’s code to attempt a SARIMA model for ATM4:
# Fit a SARIMA model specifically for ATM4
sarima_model_atm4 <- atm_data %>%
filter(ATM == "ATM4") %>%
as_tsibble(index = DATE) %>%
model(SARIMA_model = ARIMA(Cash ~ pdq(1,1,1) + PDQ(1,1,1)))
# Report the results for ATM4 SARIMA model
report(sarima_model_atm4)
## Series: Cash
## Model: ARIMA(1,1,1)(1,1,1)[7]
##
## Coefficients:
## ar1 ma1 sar1 sma1
## -0.0025 -1.000 0.0384 -1.0000
## s.e. 0.0532 0.022 0.0533 0.0346
##
## sigma^2 estimated as 416051: log likelihood=-2833.28
## AIC=5676.57 AICc=5676.74 BIC=5695.96
For ATM3, sigma² (residual variance) is very low for both the ETS and SARIMA models (26.0 and 26.1, respectively), suggesting that both models fit well in terms of residual stability. The SARIMA model is likely a better choice for ATM3 based on the lower AIC/BIC and higher log-likelihood, indicating a slightly better fit compared to the ETS model.
SARIMA is the better-performing model based on lower AIC/BIC and higher log-likelihood values. This is particularly true for ATM3, where SARIMA appears well-suited to capture patterns in cash withdrawals. For ATM4, although SARIMA outperforms ETS, high variance and residuals suggest that further adjustments or alternative models may be necessary to achieve accurate forecasts.
##Part B
First we load the data from our folder using read_excel, also fixing the data format to work with Time series analysis.
res_cus_data <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
res_cus_data <- res_cus_data %>% rename(date = 'YYYY-MMM')
res_cus_data <- res_cus_data %>% mutate(date = as.Date(paste0('01-', date), '%d-%Y-%b'))
res_cus_data
## # A tibble: 192 × 3
## CaseSequence date KWH
## <dbl> <date> <dbl>
## 1 733 1998-01-01 6862583
## 2 734 1998-02-01 5838198
## 3 735 1998-03-01 5420658
## 4 736 1998-04-01 5010364
## 5 737 1998-05-01 4665377
## 6 738 1998-06-01 6467147
## 7 739 1998-07-01 8914755
## 8 740 1998-08-01 8607428
## 9 741 1998-09-01 6989888
## 10 742 1998-10-01 6345620
## # ℹ 182 more rows
head(res_cus_data)
## # A tibble: 6 × 3
## CaseSequence date KWH
## <dbl> <date> <dbl>
## 1 733 1998-01-01 6862583
## 2 734 1998-02-01 5838198
## 3 735 1998-03-01 5420658
## 4 736 1998-04-01 5010364
## 5 737 1998-05-01 4665377
## 6 738 1998-06-01 6467147
str(res_cus_data)
## tibble [192 × 3] (S3: tbl_df/tbl/data.frame)
## $ CaseSequence: num [1:192] 733 734 735 736 737 738 739 740 741 742 ...
## $ date : Date[1:192], format: "1998-01-01" "1998-02-01" ...
## $ KWH : num [1:192] 6862583 5838198 5420658 5010364 4665377 ...
we can find which data point is missing using the following code. we can see that value for Sep 2008 is missing.
which(is.na(res_cus_data), arr.ind=TRUE)
## row col
## [1,] 129 3
We can mpdel-based imputation with ARIMA using the following code to impute a value for the missing KWH value for Sep 2008.
res_cus_data$KWH <- na_interpolation(res_cus_data$KWH, option = "linear")
After we fit the missing value we are ready to input the data into a time series model but first lets take a look at the consumption through out the past years in the following plot.
Upon further investigation we can see that the reading for July 2010 was very low which might indicate an error, possibility a missing digit. Looking at the trends from previous year, it seems to be sufficient to correct the value for the sake of simplicity since its clearly a typo error.
print(res_cus_data[res_cus_data$date == "2010-07-01", ])
## # A tibble: 1 × 3
## CaseSequence date KWH
## <dbl> <date> <dbl>
## 1 883 2010-07-01 770523
res_cus_data$KWH[res_cus_data$date == "2010-07-01"] <- 7705230
Time series analysis
res_cus_ts <- ts(res_cus_data$KWH, start=c(1998, 1), frequency=12)
plot(res_cus_ts, main = "Monthly Household Power Consumption", ylab = "KWH", xlab = "Year")
ggtsdisplay(res_cus_ts, points = FALSE, plot.type = "histogram")
We can see that there is a strong seasonality observed in both the time
series and ACF plots indicates that a seasonal model, such as SARIMA or
ETS, could be appropriate.The histogram indicates slight skewness, which
may affect certain model assumptions. If it’s significant, consider a
transformation (e.g., log transformation) to make the data more normally
distributed for certain model types.
power_ts <- ts(res_cus_data$KWH, start = c(1998, 1), frequency = 12)
# Fit the SARIMA model to the time series data
# Use auto.arima with seasonal=TRUE to let it choose the best SARIMA parameters
fit_sarima <- auto.arima(power_ts, seasonal = TRUE)
# Forecast for the next 12 months (2014)
forecast_2014 <- forecast(fit_sarima, h = 12)
# Plot the forecast
plot(forecast_2014, main = "SARIMA Forecast for Household Power Consumption (2014)", xlab = "Year", ylab = "KWH")
# Display forecasted values
print(forecast_2014)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2014 10209236 9411475 11006997 8989166 11429306
## Feb 2014 8723941 7883731 9564150 7438951 10008930
## Mar 2014 7135163 6292723 7977604 5846762 8423565
## Apr 2014 5917544 5058401 6776687 4603598 7231490
## May 2014 5945947 5086503 6805390 4631541 7260352
## Jun 2014 8385300 7525841 9244760 7070870 9699730
## Jul 2014 9361568 8499742 10223394 8043519 10679617
## Aug 2014 10021740 9158861 10884618 8702081 11341399
## Sep 2014 8549487 7686572 9412403 7229772 9869202
## Oct 2014 5779008 4915693 6642322 4458682 7099333
## Nov 2014 6196189 5332325 7060053 4875023 7517355
## Dec 2014 8370178 7506198 9234158 7048835 9691522
he SARIMA model has captured the seasonal patterns well, with the forecasted values reflecting similar periodic fluctuations in power consumption observed historically. This is indicative of the SARIMA model’s ability to account for monthly seasonality effectively. The forecast reflects expected peaks in the winter and summer months, following the historical pattern where energy consumption is generally higher during these seasons due to heating and cooling demands. In summary, the SARIMA model provides a reasonable forecast of household power consumption for 2014, reflecting seasonal trends and a modest upward trend in energy demand