Part A – ATM Forecast

Load libraries and data

library(readxl)
library(scales)
library(gridExtra)
library(dplyr)
library(tidyr)
library(tsoutliers)
library(tseries)
library(tsibble)
library(fable)
library(fabletools)
library(latex2exp)
library(forecast)
library(fpp3)
library(ggplot2)
atm <- read_excel("ATM624DATA.xlsx", col_types = c('date', 'text', 'numeric')) %>%
  mutate(DATE = as_date(DATE)) %>%
  as_tsibble(index = DATE, key = ATM) %>%
  filter(DATE < "2010-05-01")

head(atm)
## # A tsibble: 6 x 3 [1D]
## # Key:       ATM [1]
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-05-01 ATM1     96
## 2 2009-05-02 ATM1     82
## 3 2009-05-03 ATM1     85
## 4 2009-05-04 ATM1     90
## 5 2009-05-05 ATM1     99
## 6 2009-05-06 ATM1     88

A little explanation of the code above: first, I loaded the data from an Excel file, specifying the data types for the date, ATM identifier, and cash amount. Then I converted the dataset into a tsibble with DATE as the index and ATM as the key identifying the different machines. Finally, I filtered the data to include only observations before May 2010, so this historical data can be used to forecast cash withdrawals for May 2010. This process structures the data for time series analysis and forecasting.
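As a quick sanity check (a small sketch, not part of the original workflow), we can confirm that the filter leaves one full year of daily observations for each of the four ATMs:

# Each ATM should have 365 daily rows (2009-05-01 through 2010-04-30)
atm %>% as_tibble() %>% count(ATM)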

summary(atm)
##       DATE                ATM                 Cash        
##  Min.   :2009-05-01   Length:1460        Min.   :    0.0  
##  1st Qu.:2009-07-31   Class :character   1st Qu.:    0.5  
##  Median :2009-10-30   Mode  :character   Median :   73.0  
##  Mean   :2009-10-30                      Mean   :  155.6  
##  3rd Qu.:2010-01-29                      3rd Qu.:  114.0  
##  Max.   :2010-04-30                      Max.   :10919.8  
##                                          NA's   :5

I need to deal with the missing values before proceeding to forecasting.

Handling missing values

missing_ATMs <- atm %>% filter(is.na(Cash)) %>% select(DATE, ATM)
missing_ATMs
## # A tsibble: 5 x 2 [1D]
## # Key:       ATM [2]
##   DATE       ATM  
##   <date>     <chr>
## 1 2009-06-13 ATM1 
## 2 2009-06-16 ATM1 
## 3 2009-06-22 ATM1 
## 4 2009-06-18 ATM2 
## 5 2009-06-24 ATM2

We have 3 missing values in ATM1 and 2 in ATM2.

# Filter data for ATM1 and ATM2
ATM1_data <- atm %>% filter(ATM == "ATM1")
ATM2_data <- atm %>% filter(ATM == "ATM2")

# Fit ARIMA model to ATM1
ATM1_arima <- auto.arima(ATM1_data$Cash, seasonal = TRUE)

# Forecast missing values for ATM1 using the fitted ARIMA model
ATM1_data$Cash_filled <- ifelse(is.na(ATM1_data$Cash), 
                                 forecast(ATM1_arima, h = sum(is.na(ATM1_data$Cash)))$mean, 
                                 ATM1_data$Cash)

# Fit ARIMA model to ATM2
ATM2_arima <- auto.arima(ATM2_data$Cash, seasonal = TRUE)

# Forecast missing values for ATM2 using the fitted ARIMA model
ATM2_data$Cash_filled <- ifelse(is.na(ATM2_data$Cash), 
                                 forecast(ATM2_arima, h = sum(is.na(ATM2_data$Cash)))$mean, 
                                 ATM2_data$Cash)


ATM_filled <- atm %>%
  filter(!(ATM %in% c("ATM1", "ATM2"))) %>%
  bind_rows(ATM1_data %>% select(ATM, Cash = Cash_filled), 
            ATM2_data %>% select(ATM, Cash = Cash_filled))

# Checking if it worked
final_missing <- sum(is.na(ATM_filled$Cash))
cat("Final combined data - Missing values:", final_missing, "\n")
## Final combined data - Missing values: 0

In my analysis, I used ARIMA models to handle missing values in the cash flow data for the ATMs. According to Section 13.9 of Hyndman and Athanasopoulos’s “Forecasting: Principles and Practice,” ARIMA models are effective for filling in gaps in time series data. By fitting the ARIMA model to the cash flow data from both ATM1 and ATM2, I was able to predict the missing values. This approach helped me create complete datasets for both ATMs, improving the overall accuracy of my analysis.
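For comparison, the forecast package also provides a one-line interpolation routine; a minimal sketch of an alternative approach (not what was run above), shown here for ATM1 only:

# na.interp() fills NAs by linear interpolation (or an STL-based method for seasonal series)
ATM1_interp <- ATM1_data %>%
  mutate(Cash_interp = as.numeric(forecast::na.interp(Cash)))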

Data Exploration

ATM_filled %>%
  autoplot(Cash) +
  facet_wrap(~ATM, scales = "free", nrow = 4) +
  labs(title = " ATM Withdrawals")

It is very apparent that there is a large outlier in ATM4, which is best dealt with before modeling.

To identify outliers in the Cash column of the ATM4 data, we can use several methods. One common approach is the interquartile range (IQR) method, which flags points that fall below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.

ATM_4 <- ATM_filled %>%
  filter(ATM == "ATM4")

Q1 <- quantile(ATM_4$Cash, 0.25, na.rm = TRUE)
Q3 <- quantile(ATM_4$Cash, 0.75, na.rm = TRUE)
IQR_value <- IQR(ATM_4$Cash, na.rm = TRUE)

# Determine the outlier thresholds
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Identify outliers
outliers <- ATM_4 %>% 
  filter(Cash < lower_bound | Cash > upper_bound)

# Display outliers
print(outliers)
## # A tsibble: 3 x 3 [1D]
## # Key:       ATM [1]
##   DATE       ATM     Cash
##   <date>     <chr>  <dbl>
## 1 2009-09-22 ATM4   1712.
## 2 2010-01-29 ATM4   1575.
## 3 2010-02-09 ATM4  10920.

Given that 10919.762 is much greater than the other values, it is clearly an outlier based on both visual inspection and the IQR method. I’ll use tsoutliers() to replace it in ATM4.

ATM4_ts <- ts(ATM_4$Cash, frequency = 365, start = c(2009, 1))  # Adjust start as necessary

# Identify outliers using tsoutliers
outlier_results <- tsoutliers(ATM4_ts)

# Print outlier information
print(outlier_results)
## $index
## [1] 285
## 
## $replacements
## [1] 230.1752
ATM_4$Cash[outlier_results$index] <- outlier_results$replacements
print(ATM_4$Cash[outlier_results$index])
## [1] 230.1752
autoplot(ATM_4) +
  labs(title="ATM4", subtitle="Cash withdrawals daily", y="Hundreds USD")
## Plot variable not specified, automatically selected `.vars = Cash`

ATM_filled <- ATM_filled %>% 
  rename(date=DATE, atm=ATM, cash=Cash) %>%
  mutate(date = as.Date(date)) %>% 
  pivot_wider(names_from=atm, values_from = cash)

Individual ATM Exploration and Modeling

ATM1

ATM_1 <- ATM_filled %>% 
  select(date, ATM1)

ATM_1 %>%
  autoplot(ATM1) +
  ggtitle("Original ATM1")

# Create the second plot (STL Decomposition)
 ATM_1 %>%
  model(STL(ATM1 ~ season(window = "periodic"), robust = TRUE)) %>%
  components() %>%
  autoplot() +
  labs(title = "STL Decomposition for ATM1")

The first plot (the original, untransformed series) displays the raw data over time, showing high variability and strong weekly seasonality. Peaks and troughs are apparent, indicating periodic changes in cash withdrawals. The STL decomposition of the ATM cash withdrawal data reveals three key insights. The trend component shows a general increase in cash withdrawals from mid-2009, peaking and then slightly declining in early 2010. The seasonal component captures a consistent weekly pattern, with regular fluctuations that repeat every week, reflecting predictable withdrawal behavior tied to weekly cycles. Lastly, the remainder component highlights short-term irregularities or noise, with noticeable spikes around early 2010, suggesting unexpected events influencing withdrawals during that period.

ATM_1 %>% 
  ACF(ATM1, lag_max = 28) %>% 
  autoplot()

The ACF plot shows significant spikes at lags 2, 5, and 7, with lag 7 being the strongest. The slow decay of the ACF could mean the data is non-stationary and may need differencing.

ndiffs(ATM_1$ATM1)
## [1] 0

Turns out it doesn’t need differencing.

A brief rationale for the models considered: the additive ETS model is suitable because the seasonality is additive, as shown in the STL seasonal component, and there is no clear trend. The SNAIVE model is a natural benchmark given the strong, consistent weekly seasonality in the data. Finally, an ARIMA model is included because, while the focus is on seasonality, it can capture autocorrelation remaining in the remainder component.

lambda <- ATM_1 %>%
  features(ATM1, features = guerrero) %>%
  pull(lambda_guerrero)


atm1_fit <- ATM_1 %>% 
  model(
    additive = ETS(ATM1 ~ error("A") + trend("N") + season("A")),
    snaive = SNAIVE(ATM1),
    ARIMA = ARIMA(box_cox(ATM1, lambda))
  )


forecast_ATM1 <- atm1_fit %>%
  forecast(h = 30)

# Plot the forecasts
forecast_ATM1 %>%
  autoplot(ATM_1, level = NULL) +
  facet_wrap(~ .model, scales = "free", ncol = 2) +  # Adjust ncol to control number of columns
  guides(colour = guide_legend(title = "Forecast")) +
  labs(title = "ATM1 Forecasts") +
  xlab("Date") +
  ylab("Cash values in Hundreds")

accuracy(atm1_fit) %>%
  select(.model, RMSE:MAPE)
## # A tibble: 3 × 5
##   .model    RMSE   MAE    MPE  MAPE
##   <chr>    <dbl> <dbl>  <dbl> <dbl>
## 1 additive  23.8  15.1 -106.   121.
## 2 snaive    27.9  17.8  -96.6  116.
## 3 ARIMA     25.1  16.2  -87.8  107.

Based on the RMSE (23.80689) and MAE (15.10543) metrics, the additive ETS model appears to be the best overall fit for our data, as it has the lowest values for both metrics.

Forecasting

ATM1_bestfit <- ATM_1 %>% 
  model(
    ETS = ETS(ATM1))

 forecast_ATM1 <- ATM1_bestfit %>%
  forecast(h = 31) %>%
  filter(.model=='ETS')

forecast_ATM1 %>% 
  autoplot(ATM_1) +
  labs(title = "Forecast for ATM1 using ETS",
       y = "Cash Values in Hundreds")

ATM1_bestfit%>%
  gg_tsresiduals() +
  labs(title="Residuals for ETS")

ATM2

ATM_2 <- ATM_filled %>% 
  select(date, ATM2)


ATM_2 %>%
  autoplot(ATM2) +
  ggtitle("Original ATM2")

# STL decomposition
ATM_2 %>%
  model(STL(ATM2 ~ season(window = "periodic"), robust = TRUE)) %>%
  components() %>%
  autoplot() +
  labs(title = "STL Decomposition for ATM2")

ATM_2 %>% 
  ACF(ATM2, lag_max = 28) %>% 
  autoplot()

As before, the data may be non-stationary and could benefit from differencing. We should check whether that is the case.

ndiffs(ATM_2$ATM2)
## [1] 1
ATM_2_diff <- ATM_2 %>%
  mutate(ATM2_diff = ATM2 - lag(ATM2, 1)) %>%  
  filter(!is.na(ATM2_diff))  
# Perform KPSS test
kpss_test <- kpss.test(ATM_2_diff$ATM2_diff)
print(kpss_test)
## 
##  KPSS Test for Level Stationarity
## 
## data:  ATM_2_diff$ATM2_diff
## KPSS Level = 0.014714, Truncation lag parameter = 5, p-value = 0.1
mean_value <- mean(ATM_2_diff$ATM2_diff[ATM_2_diff$ATM2_diff >= 0], na.rm = TRUE)
ATM_2_diff$ATM2_diff[ATM_2_diff$ATM2_diff < 0] <- mean_value

Since our p-value (0.1) is greater than the common significance level (0.05), we fail to reject the null hypothesis. This suggests that the series is stationary.

Also, while reviewing ATM_2_diff I noticed a substantial number of negative values. That happens because the differencing operation calculates the change in withdrawals between consecutive days: if ATM2 decreases from one day to the next, ATM2_diff is negative. For example, if withdrawals were $100 one day and $80 the next, the difference would be -$20. To address this, I imputed the negative values by replacing them with the mean of the non-negative differences.
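A tiny illustration of that point with made-up numbers:

# Withdrawals of 100, 80, 120 give first differences of -20 and 40;
# the negative difference is then replaced by the mean of the non-negative ones.
x <- c(100, 80, 120)
d <- diff(x)
d[d < 0] <- mean(d[d >= 0])
d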

# Calculate lambda for Box-Cox transformation on the differenced data
lambda <- ATM_2_diff %>%
  features(ATM2_diff, features = guerrero) %>%
  pull(lambda_guerrero)

# Fit the models using the differenced data
atm2_fit <- ATM_2_diff %>%
  model(
    additive = ETS(ATM2_diff ~ error("A") + trend("N") + season("A")),
    snaive = SNAIVE(ATM2_diff),
    ARIMA = ARIMA(box_cox(ATM2_diff, lambda))
  )

# Forecast for the next 30 periods
forecast_ATM2 <- atm2_fit %>%
  forecast(h = 30)

# Plot the forecasts
forecast_ATM2 %>%
  autoplot(ATM_2_diff, level = NULL) +
  facet_wrap(~ .model, scales = "free", ncol = 2) +  # Adjust ncol to control number of columns
  guides(colour = guide_legend(title = "Forecast")) +
  labs(title = "ATM2 Forecasts") +
  xlab("Date") +
  ylab("Cash values in Hundreds")

# Get stats for the models
model_stats <- left_join(
  glance(atm2_fit) %>% select(.model:BIC), 
  accuracy(atm2_fit) %>% select(.model, RMSE)
) %>%
  arrange(RMSE)

# Display accuracy metrics
accuracy(atm2_fit) %>%
  select(.model, RMSE:MAPE)
## # A tibble: 3 × 5
##   .model    RMSE   MAE   MPE  MAPE
##   <chr>    <dbl> <dbl> <dbl> <dbl>
## 1 additive  20.8  13.9  -Inf   Inf
## 2 snaive    27.6  15.9  -Inf   Inf
## 3 ARIMA     22.1  13.6  -Inf   Inf
# Display the model statistics
model_stats
## # A tibble: 3 × 7
##   .model   sigma2 log_lik   AIC  AICc   BIC  RMSE
##   <chr>     <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 additive   444.  -2178. 4376. 4377. 4415.  20.8
## 2 ARIMA     1622.  -1862. 3732. 3733. 3748.  22.1
## 3 snaive     765.     NA    NA    NA    NA   27.6

Based on the metrics above, the ARIMA model appears to be the best fit for the differenced ATM2 data: it has the lowest AIC and BIC and the lowest MAE, and its RMSE is close to that of the additive ETS model.

Forecast

ATM2_bestfit <- ATM_2_diff %>% 
  model(
    ARIMA = ARIMA(ATM2_diff))

ATM2_bestfit %>% select(.model = "ARIMA") %>% report()
## Series: ATM2_diff 
## Model: ARIMA(0,0,0)(2,0,0)[7] w/ mean 
## 
## Coefficients:
##         sar1    sar2  constant
##       0.3051  0.3525   17.6802
## s.e.  0.0489  0.0502    1.1016
## 
## sigma^2 estimated as 495.1:  log likelihood=-1646.04
## AIC=3300.09   AICc=3300.2   BIC=3315.68
forecast_ATM2 <- ATM2_bestfit %>%
  forecast(h = 31) %>%
  filter(.model=='ARIMA')

# forecast plot
forecast_ATM2 %>%
  autoplot(ATM_2_diff) +
  ggtitle(TeX("ATM 2 Forecast with $ARIMA(0,0,0)(2,0,0)_7$"))

# residuals
ATM2_bestfit %>%
  select(ARIMA) %>%
  gg_tsresiduals() +
  ggtitle(TeX("Residuals for $ARIMA(0,0,0)(2,0,0)_7$"))

ATM3

ATM_filled  %>%
  autoplot(ATM3) +
  ggtitle("Original ATM3")

Forecast

forecast_atm3 <- ATM3_fit %>%
  forecast(h = 31) 

# forecast plot
forecast_atm3 %>%
  autoplot(ATM_filled) +
  ggtitle("ATM 3 Forecast with the MEAN() Model")

ATM4

We’re using ATM_4, in which we previously replaced the outlier.

ATM_4 %>%
  autoplot(Cash) +
  labs(title = "Original ATM4")

ATM_4 %>%
  model(STL(Cash ~ season(window = "periodic"), robust = TRUE)) %>%
  components() %>%
  autoplot() +
  labs(title = "STL Decomposition for ATM4")

ATM_4 %>% 
  ACF(Cash, lag_max = 28) %>% 
  autoplot()

The ACF plot shows a notable spike at lag 7 and a smaller spike at lag 28. After lag 7, the ACF gradually declines and then picks up again around lag 28. This slow decline, along with the rebound at lag 28, suggests the series may be autocorrelated over time, which could be a sign of non-stationarity. Again, it would be reasonable to test whether differencing is needed.

ndiffs(ATM_4$Cash)
## [1] 0

Turns out it doesn’t need differencing.

I’ll check the following models: ARIMA, SNAIVE, additive ETS, and multiplicative ETS (on Box-Cox transformed data). ARIMA because it can handle both trend and seasonal patterns, the Box-Cox transformation stabilizes the variance, and the model can capture autocorrelation. SNAIVE serves as a simple seasonal benchmark. Additive or multiplicative ETS (transformed) because the transformation helps with the variance and ETS is well suited to capturing seasonality.

lambda <- ATM_4 %>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)


atm4_fit <- ATM_4 %>%
  model(
    additive = ETS(Cash ~ error("A") + trend("N") + season("A")),
    snaive = SNAIVE(Cash),
    multiplicative_ts = ETS(box_cox(Cash, lambda) ~ error("M") + trend("N") + season("M")),
    ARIMA = ARIMA(box_cox(Cash, lambda))
  )
forecast_ATM4 <- atm4_fit %>%
  forecast(h = 30)

forecast_ATM4 %>%
  autoplot(ATM_4, level = NULL) +
  facet_wrap(~ .model, scales = "free", ncol = 2) +  # Adjust ncol to control number of columns
  guides(colour = guide_legend(title = "Forecast")) +
  labs(title = "ATM4 Forecasts") +
  xlab("Date") +
  ylab("Cash values in Hundreds")

# stats for the models
left_join(glance(atm4_fit) %>% select(.model:BIC), 
          accuracy(atm4_fit) %>% select(.model, RMSE)) %>%
  arrange(RMSE)
## # A tibble: 4 × 7
##   .model                sigma2 log_lik   AIC  AICc   BIC  RMSE
##   <chr>                  <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 additive          110850.     -3192. 6404. 6405. 6443.  329.
## 2 multiplicative_ts      0.219  -2003. 4026. 4027. 4065.  343.
## 3 ARIMA                176.     -1461. 2931. 2931. 2951.  352.
## 4 snaive            205805.        NA    NA    NA    NA   453.
accuracy(atm4_fit) %>%
  select(.model, RMSE:MAPE)
## # A tibble: 4 × 5
##   .model             RMSE   MAE   MPE  MAPE
##   <chr>             <dbl> <dbl> <dbl> <dbl>
## 1 additive           329.  265. -515.  545.
## 2 snaive             453.  346. -392.  447.
## 3 multiplicative_ts  343.  262. -374.  417.
## 4 ARIMA              352.  274. -368.  413.

ARIMA appears to be the best overall model when balancing model fit and forecast error: it has the lowest AIC and BIC among the candidates and the lowest MPE and MAPE, even though the additive ETS has a slightly lower RMSE. This suggests it captures the underlying structure with relatively little complexity.

ATM4_bestfit <- ATM_4 %>% 
  model(
    ARIMA = ARIMA(Cash))

ATM4_bestfit %>% select(.model = "ARIMA") %>% report()
## Series: Cash 
## Model: ARIMA(3,0,2)(1,0,0)[7] w/ mean 
## 
## Coefficients:
##           ar1      ar2     ar3     ma1     ma2    sar1  constant
##       -0.8778  -0.6769  0.1700  0.9634  0.8065  0.2484  795.0206
## s.e.   0.1004   0.1186  0.0564  0.0906  0.1079  0.0556   48.8937
## 
## sigma^2 estimated as 117528:  log likelihood=-2645.21
## AIC=5306.42   AICc=5306.82   BIC=5337.62
forecast_ATM4 <- ATM4_bestfit %>%
  forecast(h = 31) %>%
  filter(.model=='ARIMA')

# forecast plot
forecast_ATM4 %>%
  autoplot(ATM_4) +
  ggtitle(TeX("ATM 4 Forecast with $ARIMA(3,0,2)(1,0,0)_7$"))

# residuals
ATM4_bestfit %>%
  select(ARIMA) %>%
  gg_tsresiduals() +
  ggtitle(TeX("Residuals for $ARIMA(3,0,2)(1,0,0)_7$"))

Results

forecast_ATM1 <- forecast_ATM1 %>%
  rename(atm1 = .mean)

forecast_ATM2 <- forecast_ATM2 %>%
  rename(atm2 = .mean)

forecast_atm3 <- forecast_atm3 %>%
  rename(atm3 = .mean)

forecast_ATM4 <- forecast_ATM4 %>%
  rename(atm4 = .mean)
combined_forecasts <- bind_cols(forecast_ATM1, forecast_ATM2, forecast_atm3, forecast_ATM4)
combined_forecasts <- combined_forecasts %>%
    rename(date = date...2)
final_forecast <- combined_forecasts %>%
  select(date, atm1, atm2, atm3, atm4)

final_forecast %>% write.csv("final_forecasts.csv")

………………………………………………………………………….

Part B – Forecasting Power

Part B consists of a simple dataset of residential power usage from January 1998 until December 2013. The assignment is to model these data and provide a monthly forecast for 2014. The data are given in a single file. The variable ‘KWH’ is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.

Loading the data

res <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
res_ts <- res %>% 
  rename(Date = 'YYYY-MMM') %>% 
  mutate(Date = yearmonth(Date)) %>% 
  as_tsibble(index = Date)

head(res_ts)
## # A tsibble: 6 x 3 [1M]
##   CaseSequence     Date     KWH
##          <dbl>    <mth>   <dbl>
## 1          733 1998 Jan 6862583
## 2          734 1998 Feb 5838198
## 3          735 1998 Mar 5420658
## 4          736 1998 Apr 5010364
## 5          737 1998 May 4665377
## 6          738 1998 Jun 6467147

We used read_excel to load the data into R. We renamed the ‘YYYY-MMM’ column to ‘Date’ for clarity and transformed it into a proper year-month format, so R understands it as dates. Lastly, we converted the data into a tsibble.

res_ts %>%
  autoplot(KWH)

By observing the visualization above, we can clearly notice a possible outlier. We’ll move on to investigating the data for missing values and outliers, and come up with reasonable ways to handle both.

Data Exploration

summary(res_ts$KWH)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   770523  5429912  6283324  6502475  7620524 10655730        1

There’s one missing value in our data.

As before, we’ll fit an ARIMA model to handle the missing value.

missing_KWH<- res_ts %>% filter(is.na(KWH)) %>% select(Date, KWH)
missing_KWH
## # A tsibble: 1 x 2 [1M]
##       Date   KWH
##      <mth> <dbl>
## 1 2008 Sep    NA
res_arima <- auto.arima(res_ts$KWH, seasonal = TRUE)

# Count the number of missing values in 'KWH'
num_missing <- sum(is.na(res_ts$KWH))

# If there are missing values, forecast them using the fitted ARIMA model
if (num_missing > 0) {
  forecast_values <- forecast(res_arima, h = num_missing)$mean  
  
  # Create a new column 'KWH_filled' by filling NA with forecast values
  res_filled <- res_ts %>% 
    mutate(KWH_filled = ifelse(is.na(KWH), forecast_values, KWH))
} else {
  res_filled <- res_ts %>% 
    mutate(KWH_filled = KWH)
}

res_filled <- res_filled %>% 
  select(CaseSequence,Date, KWH_filled)

summary(res_filled$KWH_filled) ## Check if the missing value is handled
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   770523  5434539  6314472  6516367  7649733 10655730

Outlier

We can manually identify values that deviate significantly from the rest of the series. We’ll consider as outliers any values that fall below Q1 - 1.5*IQR or above Q3 + 1.5*IQR, where IQR is the interquartile range.

Q1 <- quantile(res_filled$KWH_filled, 0.25)
Q3 <- quantile(res_filled$KWH_filled, 0.75)
IQR <- Q3 - Q1

# Define upper and lower bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Identify outliers
potential_outliers <- res_filled$KWH_filled[res_filled$KWH_filled < lower_bound | res_filled$KWH_filled > upper_bound]

print(potential_outliers)
## [1] 770523
KWH_filled_with_na <- res_filled$KWH_filled
KWH_filled_with_na[KWH_filled_with_na < lower_bound | KWH_filled_with_na > upper_bound] <- NA

# Step 2: Fit an ARIMA model (including NA values)
arima_model <- auto.arima(KWH_filled_with_na)

# Step 3: Forecast missing values
forecasts <- forecast(arima_model, h = sum(is.na(KWH_filled_with_na)))

# Impute the NA values with the forecasted values
KWH_imputed <- KWH_filled_with_na
KWH_imputed[is.na(KWH_imputed)] <- forecasts$mean

res_filled$KWH_filled <- KWH_imputed

head(res_filled)
## # A tsibble: 6 x 3 [1M]
##   CaseSequence     Date KWH_filled
##          <dbl>    <mth>      <dbl>
## 1          733 1998 Jan    6862583
## 2          734 1998 Feb    5838198
## 3          735 1998 Mar    5420658
## 4          736 1998 Apr    5010364
## 5          737 1998 May    4665377
## 6          738 1998 Jun    6467147

In the process above, we first identified outliers in the KWH_filled values by checking them against the calculated lower and upper bounds, setting any outlier values to NA. Next, we fitted an ARIMA model to the KWH_filled data, including the NA values, to capture the underlying patterns in the time series. We then used this model to forecast the missing values, replacing the NA entries with the predicted values. Finally, we updated res_filled with the imputed KWH_filled values and displayed the first rows to confirm the changes. Next, we’ll review the plot.
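A one-step alternative worth noting (a sketch only; it is not what was run above) is forecast::tsclean(), which both replaces flagged outliers and interpolates missing values:

# tsclean() returns the monthly series with outliers replaced and NAs interpolated
KWH_clean_alt <- forecast::tsclean(ts(res_ts$KWH, start = c(1998, 1), frequency = 12))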

res_filled <- res_filled %>%
  mutate(KWH_filled = KWH_filled / 1000)
head(res_filled)
## # A tsibble: 6 x 3 [1M]
##   CaseSequence     Date KWH_filled
##          <dbl>    <mth>      <dbl>
## 1          733 1998 Jan      6863.
## 2          734 1998 Feb      5838.
## 3          735 1998 Mar      5421.
## 4          736 1998 Apr      5010.
## 5          737 1998 May      4665.
## 6          738 1998 Jun      6467.

Dividing by 1,000 converts the series to thousands of KWH, which makes the values easier to interpret, especially when dealing with large quantities of energy, and also simplifies calculations and visualizations, enhancing the overall clarity of the analysis.

res_filled %>%
  autoplot(KWH_filled) +
  ggtitle("Residential Power Usage (cleaned)")

res_filled %>%
  model(STL(KWH_filled ~ season(window = "periodic"), robust = TRUE)) %>%
  components() %>%
  autoplot() +
  labs(title = "STL Decomposition (cleaned data)")

The data shows a clear increasing trend, but it accounts for less variation compared to other factors. There is also an annual pattern, with power usage peaking in the summer and winter months. The random fluctuations include a significant spike in December 2013. A seasonal plot confirms the increase in power consumption during peak months, while the off-peak months likely change each year due to weather differences.

res_filled$KWH_filled <- as.numeric(res_filled$KWH_filled)

# Create the time series object
plot_season <- ts(res_filled$KWH_filled, start = c(1998, 1), frequency = 12)


ggseasonplot(plot_season) +
  ggtitle("Residential Power Usage Seasonally Each Year")

The chart above shows the seasonal trend in residential power usage from 1998 to 2013, with usage peaking during the summer months (June to August) and dipping significantly during spring (March to May) and fall (September to November). This suggests higher power consumption during warmer months, likely due to increased air conditioning use.

power_lambda <- res_filled %>%
  features(KWH_filled, features = guerrero) %>%
  pull(lambda_guerrero)

res_filled <- res_filled %>% 
    mutate(KWH_bc = box_cox(KWH_filled, power_lambda))


summary(res_filled$KWH_bc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.040   3.058   3.069   3.069   3.082   3.103
res_filled %>% 
  autoplot(KWH_bc) +
  labs(y = "KWH in Thousands")

In the code above, we first used the Guerrero method to calculate the optimal Box-Cox transformation parameter (lambda) for the KWH_filled data to stabilize its variance. Then, we applied the Box-Cox transformation to the KWH_filled values, creating a new variable (KWH_bc). Finally, we summarized the transformed data and created a plot to visualize the transformed values (KWH_bc). This process helps improve the normality and homoscedasticity of the data.
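For positive data, box_cox() computes (y^lambda - 1) / lambda when lambda is non-zero and log(y) when lambda = 0; a quick check (a sketch) that the new column matches this formula:

# Verify the transformed column against the Box-Cox formula (assumes power_lambda != 0)
all.equal(res_filled$KWH_bc,
          (res_filled$KWH_filled^power_lambda - 1) / power_lambda)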

res_filled %>% 
  ACF(KWH_bc, lag_max = 36) %>% 
  autoplot()

The ACF plot shows significant autocorrelations at lags that are multiples of 12, indicating strong seasonality in the data, and its gradual decline suggests a non-stationary series. Based on this, differencing, and possibly seasonal differencing, may be needed to achieve stationarity before modeling.

ndiffs(res_filled$KWH_bc)
## [1] 1

The output of 1 indicates that we need to difference our KWH_bc data once to achieve stationarity.
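Since the discussion above also raises seasonal differencing, a complementary check (a sketch) uses the seasonal unit-root feature:

# 0 means no seasonal difference is suggested; 1 means one seasonal difference
res_filled %>%
  features(KWH_bc, unitroot_nsdiffs)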

diff_res <- res_filled %>% 
  mutate(diff_KWH= difference(KWH_filled), diff_KWH_bc = difference(KWH_bc)) %>% 
  select(-KWH_filled, -KWH_bc) %>% 
  slice(-1)


print(kpss.test(diff_res$diff_KWH_bc))
## Warning in kpss.test(diff_res$diff_KWH_bc): p-value greater than printed
## p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff_res$diff_KWH_bc
## KPSS Level = 0.039109, Truncation lag parameter = 4, p-value = 0.1
ndiffs(diff_res$diff_KWH_bc)
## [1] 0

We can conclude that the differenced Box-Cox transformed data (diff_KWH_bc) appears to be stationary.

Modeling

diff_train <- diff_res %>% 
  filter(year(Date) < 2013)

diff_fit <- diff_train %>% 
    model(
    ARIMA = ARIMA(diff_KWH),
    `Auto ARIMA` = ARIMA(diff_KWH, stepwise = FALSE, approx = FALSE)
  )


diff_fc <- diff_fit %>% 
  forecast(h = "1 year")

#plot
diff_fc %>%
  autoplot(diff_res, level = NULL)+
  facet_wrap( ~ .model, scales = "free_y") +
  guides(colour = guide_legend(title = "Forecast"))+
  labs(title= "KWH Forecasts")+
  xlab("Date") +
  ylab("KWH in Thousands") 

In the code above, we filtered the differenced data (diff_res) to include only observations before 2013 and then fitted ARIMA and Auto ARIMA models to this data. After that, we generated forecasts for the year 2013 based on these models. Finally, we created a plot to visualize the forecasts from each model, showing how they predict KWH usage over time.

train <- res_filled %>% 
  filter(year(Date) < 2013)

#models
fit <- train %>% 
    model(
    ETS = ETS(KWH_filled),
    `Additive ETS` = ETS(KWH_filled ~ error("A") + trend("A") + season("A")),
    SNAIVE = SNAIVE(KWH_filled)
  )

#forecast of 2013
fc <- fit %>% 
  forecast(h = "1 year")

#plot
fc %>%
  autoplot(res_filled, level = NULL)+
  facet_wrap( ~ .model, scales = "free_y",ncol=2) +
  guides(colour = guide_legend(title = "Forecast"))+
  labs(title= "KWH Forecasts",
       subtitle = "January 2013 - December 2013")+
  xlab("Month") +
  ylab("KWH in Thousands") 

ETS shows the lowest RMSE (1011.443) and a competitive MAE (680.7843), indicating it’s the best-performing model among those listed.

Results

ETS_fit <- res_filled %>% 
  model(`Additive ETS` = ETS(KWH_filled ~ error("A") + trend("A") + season("A")))


ETS_forecast <- ETS_fit %>% 
  forecast(h=12)

ETS_forecast %>%
  autoplot(res_filled) +
  labs(title = " Residential Power Usage using Additive ETS ",
       y = "KWH in Thousands")

The generated forecast seems reliable, as it effectively reflects the seasonal patterns observed, indicating higher demand during the summer and winter months. It also accurately falls between the peaks and valleys recorded in the more recent years.

final_forec <- ETS_forecast %>%
  rename(KWH_2014 = .mean) %>%
  select(Date, KWH_2014)
final_forec %>% write.csv("KWH_forecasts.csv")

………………………………………………………………………….

Part C – BONUS, optional (part or all).

Part C consists of two datasets. These are simple two-column sets; however, they have different time stamps. The optional assignment is to time-base sequence the data and aggregate based on hour (an example of what this looks like follows). Note that for multiple recordings within an hour, we take the mean. Then determine whether the data is stationary and whether it can be forecast. If so, provide a week-ahead forecast and present the results via RPubs and .rmd, with the forecast in an Excel-readable file.

Loading the data

Our data needs some cleaning and restructuring before it can be analyzed.

p1 <- read_excel("Waterflow_Pipe1.xlsx",col_types = c("date", "numeric"))
p2 <- read_excel("Waterflow_Pipe2.xlsx",col_types = c("date", "numeric"))
colnames(p1) <- c("Datetime", "Waterflow") 
colnames(p2) <- c("Datetime", "Waterflow")

# Process data
p1 <- p1 %>% 
  mutate(
    Date = as.Date(Datetime),
    Time = hour(Datetime)
  ) %>% 
  group_by(Date, Time) %>% 
  summarise(Water = mean(Waterflow, na.rm = TRUE)) %>%  # na.rm = TRUE guards against any missing readings
  ungroup() %>% 
  mutate(Datetime = ymd_h(paste(Date, Time))) %>% 
  select(Datetime, Water)
## `summarise()` has grouped output by 'Date'. You can override using the
## `.groups` argument.
p2 <- p2 %>% 
  mutate(
    Date = as.Date(Datetime),
    Time = hour(Datetime)
  ) %>% 
  group_by(Date, Time) %>% 
  summarise(Water = mean(Waterflow)) %>%  
  ungroup() %>% 
  mutate(Datetime = ymd_h(paste(Date, Time))) %>% 
  select(Datetime, Water)
## `summarise()` has grouped output by 'Date'. You can override using the
## `.groups` argument.
head(p1)
## # A tibble: 6 × 2
##   Datetime            Water
##   <dttm>              <dbl>
## 1 2015-10-23 00:00:00  26.1
## 2 2015-10-23 01:00:00  18.9
## 3 2015-10-23 02:00:00  15.2
## 4 2015-10-23 03:00:00  23.1
## 5 2015-10-23 04:00:00  15.5
## 6 2015-10-23 05:00:00  22.7
head(p2)
## # A tibble: 6 × 2
##   Datetime            Water
##   <dttm>              <dbl>
## 1 2015-10-23 01:00:00  18.8
## 2 2015-10-23 02:00:00  43.1
## 3 2015-10-23 03:00:00  38.0
## 4 2015-10-23 04:00:00  36.1
## 5 2015-10-23 05:00:00  31.9
## 6 2015-10-23 06:00:00  28.2

Data Exploration

There are no missing values in our data.

colSums(is.na(p1))
## Datetime    Water 
##        0        0
colSums(is.na(p2))
## Datetime    Water 
##        0        0
library(gridExtra)
x_limits <- range(c(p1$Datetime, p2$Datetime))
y_limits <- range(c(p1$Water, p2$Water))
p1_plot <- ggplot(p1, aes(x = Datetime, y = Water)) +
  geom_line(color = "blue") +
  labs(title = "Water Flow - Pipe 1", x = "Date Time", y = "Water Flow") +
  scale_x_datetime(limits = x_limits) +  # Set x limits
  scale_y_continuous(limits = y_limits) +  # Set y limits
  theme_minimal()

# Plot for Pipe 2 with same axis limits
p2_plot <- ggplot(p2, aes(x = Datetime, y = Water)) +
  geom_line(color = "red") +
  labs(title = "Water Flow - Pipe 2", x = "Date Time", y = "Water Flow") +
  scale_x_datetime(limits = x_limits) +  # Set x limits
  scale_y_continuous(limits = y_limits) +  # Set y limits
  theme_minimal()

# Display both plots side by side
grid.arrange(p1_plot, p2_plot, ncol = 2)

In Pipe 1, water flow is generally stable, with values around 20, but the data cover only a short window from late October to early November. In Pipe 2, water flow fluctuates widely, with values ranging roughly between 0 and 80, and spans a longer period from late October to early December.

p1_ts <- ts(p1$Water, start = c(2015, 10, 23), frequency = 24)


p2_ts <- ts(p2$Water, start = c(2015, 10, 23), frequency = 24)



ggtsplot_combined <- function(ts1, ts2, title1, title2) {
  # Time series plots
  p1 <- autoplot(ts1) +
    scale_y_continuous(labels = comma) +
    ggtitle(title1) +
    theme(axis.title = element_blank(), 
          axis.text.x = element_blank(), 
          axis.ticks.x = element_blank())
  
  p2 <- autoplot(ts2) +
    scale_y_continuous(labels = comma) +
    ggtitle(title2) +
    theme(axis.title = element_blank(), 
          axis.text.x = element_blank(), 
          axis.ticks.x = element_blank())

  # ACF and PACF plots
  acf1 <- ggAcf(ts1) + ggtitle("ACF - Pipe 1") + theme_minimal()
  pacf1 <- ggPacf(ts1) + ggtitle("PACF - Pipe 1") + theme_minimal()
  acf2 <- ggAcf(ts2) + ggtitle("ACF - Pipe 2") + theme_minimal()
  pacf2 <- ggPacf(ts2) + ggtitle("PACF - Pipe 2") + theme_minimal()

  # Combine all plots into a single grid
  grid.arrange(
    p1, p2, 
    acf1, pacf1, 
    acf2, pacf2, 
    ncol = 2
  )
}

ggtsplot_combined(p1_ts, p2_ts, "Water Flow - Pipe 1", "Water Flow - Pipe 2")

For Pipe 1, water flow fluctuates within a smaller range, approximately 15 to 30. The ACF and PACF plots for Pipe 1 show low, mostly insignificant correlations, indicating weak dependencies over time. For Pipe 2, water flow varies more widely, approximately 0 to 80. The ACF and PACF plots for Pipe 2 also show low correlations, though with slightly more structure, suggesting some periodic dependence but overall weak correlation as well.

# Function to check stationarity
check_stationarity <- function(ts) {
  results <- kpss.test(ts)
  return(results$p.value > 0.05)  # TRUE if stationary, FALSE otherwise
}

# Check stationarity for both pipes
p1_stationary <- check_stationarity(p1_ts)
p2_stationary <- check_stationarity(p2_ts)

# Return message based on the results
if (p1_stationary && p2_stationary) {
  message <- "Both pipes data is stationary."
} else {
  message <- "At least one of the pipes data is not stationary."
}

print(message)
## [1] "Both pipes data is stationary."
p1_subseries_plot <- ggsubseriesplot(p1_ts) +
  labs(title = "Subseries Plot for Water Flow - Pipe 1") +
  theme(axis.text.x = element_text(size = 6.5))  

p2_subseries_plot <- ggsubseriesplot(p2_ts) +
  labs(title = "Subseries Plot for Water Flow - Pipe 2") +
  theme(axis.text.x = element_text(size = 6.5)) 

# Arrange the plots side by side
grid.arrange(p1_subseries_plot, p2_subseries_plot, ncol = 2)

The water flow in Pipe 1 varies only slightly across the hours of the day, fluctuating between 15 and 30, with no clear daily pattern; the flow generally centers around 20, with no consistent peaks at particular hours. The water flow in Pipe 2 shows greater variability within each hour of the day, ranging widely from 0 to 80. However, similar to Pipe 1, there is no strong daily pattern, as the values do not repeat consistently across the cycle.

Pipe 1 Forecasts

p1_tsibble <- p1 %>% as_tsibble(index = Datetime)
p1_tsibble <- p1_tsibble %>% 
  fill_gaps() 
lambda <- p1_tsibble %>%
  features(Water, features = guerrero) %>%
  pull(lambda_guerrero)

# Step 2: Fit ARIMA and Box-Cox ARIMA models
p1_fit <- p1_tsibble %>%
  model(
    ARIMA_bc = ARIMA(box_cox(Water, lambda)),
    ARIMA = ARIMA(Water)
  )

left_join(glance(p1_fit) %>% select(.model:BIC), 
          accuracy(p1_fit) %>% select(.model, RMSE)) %>%
  arrange(RMSE)
## # A tibble: 2 × 7
##   .model   sigma2 log_lik   AIC  AICc   BIC  RMSE
##   <chr>     <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA      17.9   -675. 1362. 1362. 1383.  4.22
## 2 ARIMA_bc   22.4   -702. 1412. 1412. 1426.  4.23
accuracy(p1_fit) %>%
  select(.model, RMSE:MAPE)
## # A tibble: 2 × 5
##   .model    RMSE   MAE   MPE  MAPE
##   <chr>    <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA_bc  4.23  3.33 -5.15  18.3
## 2 ARIMA     4.22  3.31 -5.01  18.2

For Pipe 1, the untransformed ARIMA model performs slightly better than the ARIMA_bc model across the error metrics (RMSE, MAE, MAPE) and also reports lower AIC, AICc, and BIC. Based on these results, the ARIMA model is preferable for forecasting; the Box-Cox transformation does not appear to improve model performance here.

p1_final_fit <- p1_tsibble %>%
  model(
    ARIMA = ARIMA(Water)
  )

p1_final_fit %>% 
    select(.model = "ARIMA") %>% report()
## Series: Water 
## Model: ARIMA(2,0,2) w/ mean 
## 
## Coefficients:
##          ar1     ar2      ma1      ma2  constant
##       0.1693  0.5913  -0.1540  -0.6364    4.7616
## s.e.  0.4699  0.4205   0.4477   0.4012    0.0581
## 
## sigma^2 estimated as 17.92:  log likelihood=-674.93
## AIC=1361.86   AICc=1362.22   BIC=1382.74
p1_forecast <- p1_final_fit %>%
  forecast(h = 24 * 7)  # h = 168 hours gives a one-week (7-day) forecast


# Plot the forecast
autoplot(p1_forecast, p1_tsibble) +
  labs(title = "ARIMA(2,0,2) w/ mean  Model Forecast for  Pipe 1 Water Flow", y = "Water Flow", x = "Datetime")

# Check residuals to ensure they resemble white noise
p1_final_fit %>%
  gg_tsresiduals() 

The residuals resemble white noise, indicating that the chosen model, ARIMA(2,0,2), captures the underlying patterns in the data well.
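To complement the visual check, a portmanteau test on the innovation residuals can be run (a sketch; the lag and degrees-of-freedom values are assumptions tied to the ARIMA(2,0,2) fit):

# Ljung-Box test: a large p-value is consistent with white-noise residuals
augment(p1_final_fit) %>%
  features(.innov, ljung_box, lag = 24, dof = 4)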

Pipe 2 Forecasts

p2_tsibble <- p2 %>% as_tsibble(index = Datetime)
p2_tsibble <- p2_tsibble %>% 
  fill_gaps() 

lambda <- p2_tsibble %>%
  features(Water, features = guerrero) %>%
  pull(lambda_guerrero)

# Step 2: Fit ARIMA and Box-Cox ARIMA models
p2_fit <- p2_tsibble %>%
  model(
    ARIMA_bc = ARIMA(box_cox(Water, lambda)),
    ARIMA = ARIMA(Water)
  )

left_join(glance(p2_fit) %>% select(.model:BIC), 
          accuracy(p2_fit) %>% select(.model, RMSE)) %>%
  arrange(RMSE)
## # A tibble: 2 × 7
##   .model   sigma2 log_lik   AIC  AICc   BIC  RMSE
##   <chr>     <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA      256.  -4191. 8387. 8387. 8402.  16.0
## 2 ARIMA_bc   172.  -3992. 7990. 7990. 8005.  16.0
accuracy(p2_fit) %>%
  select(.model, RMSE:MAPE)
## # A tibble: 2 × 5
##   .model    RMSE   MAE   MPE  MAPE
##   <chr>    <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA_bc  16.0  13.1 -33.9  56.9
## 2 ARIMA     16.0  13.1 -34.6  57.2

Based on the performance metrics, the two models have essentially the same predictive accuracy, so the simpler untransformed ARIMA model is the safer choice for making predictions.

p2_final_fit <- p2_tsibble %>%
  model(
    ARIMA = ARIMA(Water)
  )

p2_final_fit %>% 
    select(.model = "ARIMA") %>% report()
## Series: Water 
## Model: ARIMA(0,0,0)(0,0,1)[24] w/ mean 
## 
## Coefficients:
##         sma1  constant
##       0.0846   39.5732
## s.e.  0.0314    0.5472
## 
## sigma^2 estimated as 256:  log likelihood=-4190.54
## AIC=8387.08   AICc=8387.1   BIC=8401.8
p2_forecast <- p2_final_fit %>%
  forecast(h = 24*7) 

# Plot the forecast
autoplot(p2_forecast, p2_tsibble) +
  labs(title = "ARIMA(0,0,0)(0,0,1)[24] Model Forecast for Pipe 2 Water Flow", y = "Water Flow", x = "Datetime")

# Check residuals to ensure they resemble white noise
p2_final_fit %>%
  gg_tsresiduals() 

With a large forecast horizon h=168, the predictions tend to settle around the mean value of the data. The residuals also look like white noise.

Final step

I believe the generated forecasts for water flow are of limited reliability because they do not reflect the short-term patterns in the actual data, even though ARIMA fit the historical data reasonably well. This is likely due to the long forecast horizon, which causes the predictions to gravitate toward the mean.

p1_forecast <- p1_forecast %>%
  as.data.frame() %>%
  select(Datetime, .mean) %>%
  rename(Water1 = .mean)


p2_forecast <- p2_forecast %>%
  as.data.frame() %>%
  select(Datetime, .mean) %>%
  rename(Water2 = .mean)
write.csv(list('1-Pipe' = p1_forecast, '2-Pipe' = p2_forecast), file = 'WaterPipesForecasts.csv')  # I tried combining both pipes into 1 dataset and export that but because they have very different time stamps it was not visually appealing
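If the writexl package is available (an assumption; it is not loaded above), the two forecasts could instead be written to separate sheets of a single Excel workbook, which sidesteps the mismatched time stamps:

# One .xlsx file with one sheet per pipe
writexl::write_xlsx(list(Pipe1 = p1_forecast, Pipe2 = p2_forecast),
                    path = "WaterPipesForecasts.xlsx")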