library(readxl)
library(scales)
library(gridExtra)
library(dplyr)
library(tidyr)
library(tsoutliers)
library(tseries)
library(tsibble)
library(fable)
library(fabletools)
library(latex2exp)
library(forecast)
library(fpp3)
library(ggplot2)
atm <- read_excel("ATM624DATA.xlsx", col_types = c('date', 'text', 'numeric')) %>%
mutate(DATE = as_date(DATE)) %>%
as_tsibble(index = DATE, key = ATM) %>%
filter(DATE < "2010-05-01")
head(atm)
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-02 ATM1 82
## 3 2009-05-03 ATM1 85
## 4 2009-05-04 ATM1 90
## 5 2009-05-05 ATM1 99
## 6 2009-05-06 ATM1 88
summary(atm)
## DATE ATM Cash
## Min. :2009-05-01 Length:1460 Min. : 0.0
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 0.5
## Median :2009-10-30 Mode :character Median : 73.0
## Mean :2009-10-30 Mean : 155.6
## 3rd Qu.:2010-01-29 3rd Qu.: 114.0
## Max. :2010-04-30 Max. :10919.8
## NA's :5
missing_ATMs <- atm %>% filter(is.na(Cash)) %>% select(DATE, ATM)
missing_ATMs
## # A tsibble: 5 x 2 [1D]
## # Key: ATM [2]
## DATE ATM
## <date> <chr>
## 1 2009-06-13 ATM1
## 2 2009-06-16 ATM1
## 3 2009-06-22 ATM1
## 4 2009-06-18 ATM2
## 5 2009-06-24 ATM2
# Filter data for ATM1 and ATM2
ATM1_data <- atm %>% filter(ATM == "ATM1")
ATM2_data <- atm %>% filter(ATM == "ATM2")
# Fit ARIMA model to ATM1
ATM1_arima <- auto.arima(ATM1_data$Cash, seasonal = TRUE)
# Forecast missing values for ATM1 using the fitted ARIMA model
ATM1_data$Cash_filled <- ATM1_data$Cash
ATM1_data$Cash_filled[is.na(ATM1_data$Cash)] <-
forecast(ATM1_arima, h = sum(is.na(ATM1_data$Cash)))$mean
# Fit ARIMA model to ATM2
ATM2_arima <- auto.arima(ATM2_data$Cash, seasonal = TRUE)
# Forecast missing values for ATM2 using the fitted ARIMA model
ATM2_data$Cash_filled <- ATM2_data$Cash
ATM2_data$Cash_filled[is.na(ATM2_data$Cash)] <-
forecast(ATM2_arima, h = sum(is.na(ATM2_data$Cash)))$mean
ATM_filled <- atm %>%
filter(!(ATM %in% c("ATM1", "ATM2"))) %>%
bind_rows(ATM1_data %>% select(ATM, Cash = Cash_filled),
ATM2_data %>% select(ATM, Cash = Cash_filled))
# Checking if it worked
final_missing <- sum(is.na(ATM_filled$Cash))
cat("Final combined data - Missing values:", final_missing, "\n")
## Final combined data - Missing values: 0
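As a cross-check on the forecast-based fill above, fabletools' `interpolate()` can impute interior gaps directly from a model fitted to each series. This is only a sketch of that alternative, not the method used for the results below.
# Sketch: model-based interpolation of the missing ATM1/ATM2 observations
atm_interp <- atm %>%
filter(ATM %in% c("ATM1", "ATM2")) %>%
model(ARIMA(Cash)) %>%
interpolate(atm %>% filter(ATM %in% c("ATM1", "ATM2")))
sum(is.na(atm_interp$Cash)) # should be 0 if the gaps were filled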
ATM_filled %>%
autoplot(Cash) +
facet_wrap(~ATM, scales = "free", nrow = 4) +
labs(title = " ATM Withdrawals")
ATM_4 <- ATM_filled %>%
filter(ATM == "ATM4")
Q1 <- quantile(ATM_4$Cash, 0.25, na.rm = TRUE)
Q3 <- quantile(ATM_4$Cash, 0.75, na.rm = TRUE)
IQR_value <- IQR(ATM_4$Cash, na.rm = TRUE)
# Determine the outlier thresholds
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Identify outliers
outliers <- ATM_4 %>%
filter(Cash < lower_bound | Cash > upper_bound)
# Display outliers
print(outliers)
## # A tsibble: 3 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-09-22 ATM4 1712.
## 2 2010-01-29 ATM4 1575.
## 3 2010-02-09 ATM4 10920.
ATM4_ts <- ts(ATM_4$Cash, frequency = 365, start = c(2009, 121)) # daily series; 2009-05-01 is day 121 of 2009
# Identify outliers using tsoutliers
outlier_results <- tsoutliers(ATM4_ts)
# Print outlier information
print(outlier_results)
## $index
## [1] 285
##
## $replacements
## [1] 230.1752
ATM_4$Cash[outlier_results$index] <- outlier_results$replacements
print(ATM_4$Cash[285])
## [1] 230.1752
autoplot(ATM_4, Cash) +
labs(title = "ATM4", subtitle = "Daily cash withdrawals", y = "Hundreds USD")
ATM_filled <- ATM_filled %>%
rename(date=DATE, atm=ATM, cash=Cash) %>%
mutate(date = as.Date(date)) %>%
pivot_wider(names_from=atm, values_from = cash)
ATM_1 <- ATM_filled %>%
select(date, ATM1)
ATM_1%>%
autoplot(ATM1) +
ggtitle("Original ATM1")
# Create the second plot (STL Decomposition)
ATM_1 %>%
model(STL(ATM1 ~ season(window = "periodic"), robust = TRUE)) %>%
components() %>%
autoplot() +
labs(title = "STL Decomposition for ATM1")
ATM_1 %>%
ACF(ATM1, lag_max = 28) %>%
autoplot()
#### The ACF plot shows significant spikes at lags 2, 5, and 7, with lag 7 the strongest, which points to a weekly pattern. The slow decay of the ACF could also mean the series is non-stationary and may need differencing.
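Since the lag-7 spike suggests a weekly cycle, a quick season plot can make that pattern easier to see before choosing seasonal models; this is just a sketch, using `gg_season()` from feasts (loaded through fpp3).
ATM_1 %>%
gg_season(ATM1, period = "week") +
labs(title = "ATM1 withdrawals by day of week (sketch)")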
ndiffs(ATM_1$ATM1)
## [1] 0
lambda <- ATM_1 %>%
features(ATM1, features = guerrero) %>%
pull(lambda_guerrero)
atm1_fit <- ATM_1 %>%
model(
additive = ETS(ATM1 ~ error("A") + trend("N") + season("A")),
snaive = SNAIVE(ATM1),
ARIMA = ARIMA(box_cox(ATM1, lambda))
)
forecast_ATM1 <- atm1_fit %>%
forecast(h = 30)
# Plot the forecasts
forecast_ATM1 %>%
autoplot(ATM_1, level = NULL) +
facet_wrap(~ .model, scales = "free", ncol = 2) + # Adjust ncol to control number of columns
guides(colour = guide_legend(title = "Forecast")) +
labs(title = "ATM1 Forecasts") +
xlab("Date") +
ylab("Cash values in Hundreds")
accuracy(atm1_fit) %>%
select(.model, RMSE:MAPE)
## # A tibble: 3 × 5
## .model RMSE MAE MPE MAPE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 additive 23.8 15.1 -106. 121.
## 2 snaive 27.9 17.8 -96.6 116.
## 3 ARIMA 25.1 16.2 -87.8 107.
ATM1_bestfit <- ATM_1 %>%
model(
ETS = ETS(ATM1))
forecast_ATM1 <- ATM1_bestfit %>%
forecast(h = 31) %>%
filter(.model=='ETS')
forecast_ATM1 %>%
autoplot(ATM_1) +
labs(title = "Forecast for ATM1 using ETS",
y = "Cash Values in Hundreds")
ATM1_bestfit%>%
gg_tsresiduals() +
labs(title="Residuals for ETS")
ATM_2 <- ATM_filled %>%
select(date, ATM2)
ATM_2 %>%
autoplot(ATM2) +
ggtitle("Original ATM2")
# STL decomposition
ATM_2 %>%
model(STL(ATM2 ~ season(window = "periodic"), robust = TRUE)) %>%
components() %>%
autoplot() +
labs(title = "STL Decomposition for ATM2")
ATM_2 %>%
ACF(ATM2, lag_max = 28) %>%
autoplot()
ndiffs(ATM_2$ATM2)
## [1] 1
ATM_2_diff <- ATM_2 %>%
mutate(ATM2_diff = ATM2 - lag(ATM2, 1)) %>%
filter(!is.na(ATM2_diff))
# Perform KPSS test
kpss_test <- kpss.test(ATM_2_diff$ATM2_diff)
print(kpss_test)
##
## KPSS Test for Level Stationarity
##
## data: ATM_2_diff$ATM2_diff
## KPSS Level = 0.014714, Truncation lag parameter = 5, p-value = 0.1
# Replace negative differences with the mean of the non-negative differences,
# keeping the series non-negative for the guerrero/Box-Cox step below
mean_value <- mean(ATM_2_diff$ATM2_diff[ATM_2_diff$ATM2_diff >= 0], na.rm = TRUE)
ATM_2_diff$ATM2_diff[ATM_2_diff$ATM2_diff < 0] <- mean_value
# Calculate lambda for Box-Cox transformation on the differenced data
lambda <- ATM_2_diff %>%
features(ATM2_diff, features = guerrero) %>%
pull(lambda_guerrero)
# Fit the models using the differenced data
atm2_fit <- ATM_2_diff %>%
model(
additive = ETS(ATM2_diff ~ error("A") + trend("N") + season("A")),
snaive = SNAIVE(ATM2_diff),
ARIMA = ARIMA(box_cox(ATM2_diff, lambda))
)
# Forecast for the next 30 periods
forecast_ATM2 <- atm2_fit %>%
forecast(h = 30)
# Plot the forecasts
forecast_ATM2 %>%
autoplot(ATM_2_diff, level = NULL) +
facet_wrap(~ .model, scales = "free", ncol = 2) + # Adjust ncol to control number of columns
guides(colour = guide_legend(title = "Forecast")) +
labs(title = "ATM2 Forecasts") +
xlab("Date") +
ylab("Cash values in Hundreds")
# Get stats for the models
model_stats <- left_join(
glance(atm2_fit) %>% select(.model:BIC),
accuracy(atm2_fit) %>% select(.model, RMSE)
) %>%
arrange(RMSE)
# Display accuracy metrics
accuracy(atm2_fit) %>%
select(.model, RMSE:MAPE)
## # A tibble: 3 × 5
## .model RMSE MAE MPE MAPE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 additive 20.8 13.9 -Inf Inf
## 2 snaive 27.6 15.9 -Inf Inf
## 3 ARIMA 22.1 13.6 -Inf Inf
# Display the model statistics
model_stats
## # A tibble: 3 × 7
## .model sigma2 log_lik AIC AICc BIC RMSE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 additive 444. -2178. 4376. 4377. 4415. 20.8
## 2 ARIMA 1622. -1862. 3732. 3733. 3748. 22.1
## 3 snaive 765. NA NA NA NA 27.6
ATM2_bestfit <- ATM_2_diff %>%
model(
ARIMA = ARIMA(ATM2_diff))
ATM2_bestfit %>% select(.model = "ARIMA") %>% report()
## Series: ATM2_diff
## Model: ARIMA(0,0,0)(2,0,0)[7] w/ mean
##
## Coefficients:
## sar1 sar2 constant
## 0.3051 0.3525 17.6802
## s.e. 0.0489 0.0502 1.1016
##
## sigma^2 estimated as 495.1: log likelihood=-1646.04
## AIC=3300.09 AICc=3300.2 BIC=3315.68
forecast_ATM2 <- ATM2_bestfit %>%
forecast(h = 31) %>%
filter(.model=='ARIMA')
# forcasted plot
forecast_ATM2 %>%
autoplot(ATM_2_diff) +
ggtitle(TeX(paste0("ATM 2 Forecast with $ARIMA(1,0,0)(0,1,1))_7$ and lambda = ",
round(lambda,2))))
# residuals
ATM2_bestfit %>%
select(ARIMA) %>%
gg_tsresiduals() +
ggtitle(TeX(paste0("Residuals for $ARIMA(1,0,0)(0,1,1)_7$ with lambda = ",
round(lambda,2))))
ATM_filled %>%
autoplot(ATM3) +
ggtitle("Original ATM3")
ATM3_fit <- ATM_filled %>%
filter(ATM3 != 0) %>%
model(MEAN(ATM3))
forecast_atm3 <- ATM3_fit %>%
forecast(h = 31)
# forcasted plot
forecast_atm3 %>%
autoplot(ATM_filled) +
ggtitle("ATM 3 Forecast with the MEAN() Model")
ATM_4 %>%
autoplot(Cash) +
labs(title = "Original ATM4")
ATM_4%>%
model(STL(Cash ~ season(window = "periodic"), robust = TRUE)) %>%
components() %>%
autoplot() +
labs(title = "STL Decomposition for ATM4 ")
ATM_4 %>%
ACF(Cash, lag_max = 28) %>%
autoplot()
ndiffs(ATM_4$Cash)
## [1] 0
lambda <- ATM_4 %>%
features(Cash, features = guerrero) %>%
pull(lambda_guerrero)
atm4_fit <- ATM_4 %>%
model(
additive = ETS(Cash ~ error("A") + trend("N") + season("A")),
snaive = SNAIVE(Cash),
multiplicative_ts = ETS(box_cox(Cash,lambda) ~ error("M") + trend("N") + season("M")),
ARIMA = ARIMA(box_cox(Cash, lambda))
)
forecast_ATM4 <- atm4_fit %>%
forecast(h = 30)
forecast_ATM4 %>%
autoplot(ATM_4, level = NULL) +
facet_wrap(~ .model, scales = "free", ncol = 2) + # Adjust ncol to control number of columns
guides(colour = guide_legend(title = "Forecast")) +
labs(title = "ATM4 Forecasts") +
xlab("Date") +
ylab("Cash values in Hundreds")
# stats for the models
left_join(glance(atm4_fit) %>% select(.model:BIC),
accuracy(atm4_fit) %>% select(.model, RMSE)) %>%
arrange(RMSE)
## # A tibble: 4 × 7
## .model sigma2 log_lik AIC AICc BIC RMSE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 additive 110850. -3192. 6404. 6405. 6443. 329.
## 2 multiplicative_ts 0.219 -2003. 4026. 4027. 4065. 343.
## 3 ARIMA 176. -1461. 2931. 2931. 2951. 352.
## 4 snaive 205805. NA NA NA NA 453.
accuracy(atm4_fit) %>%
select(.model, RMSE:MAPE)
## # A tibble: 4 × 5
## .model RMSE MAE MPE MAPE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 additive 329. 265. -515. 545.
## 2 snaive 453. 346. -392. 447.
## 3 multiplicative_ts 343. 262. -374. 417.
## 4 ARIMA 352. 274. -368. 413.
ATM4_bestfit <- ATM_4 %>%
model(
ARIMA = ARIMA(Cash))
ATM4_bestfit %>% select(.model = "ARIMA") %>% report()
## Series: Cash
## Model: ARIMA(3,0,2)(1,0,0)[7] w/ mean
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 sar1 constant
## -0.8778 -0.6769 0.1700 0.9634 0.8065 0.2484 795.0206
## s.e. 0.1004 0.1186 0.0564 0.0906 0.1079 0.0556 48.8937
##
## sigma^2 estimated as 117528: log likelihood=-2645.21
## AIC=5306.42 AICc=5306.82 BIC=5337.62
forecast_ATM4 <- ATM4_bestfit %>%
forecast(h = 31) %>%
filter(.model=='ARIMA')
# forcasted plot
forecast_ATM4 %>%
autoplot(ATM_4) +
ggtitle(TeX(paste0("ATM 4 Forecast with $ARIMA(3,0,2)(1,0,0)_7$ and lambda = ",
round(lambda,2))))
# residuals
ATM4_bestfit %>%
select(ARIMA) %>%
gg_tsresiduals() +
ggtitle(TeX(paste0("Residuals for $ARIMA(3,0,2)(1,0,0)_7$ with lambda = ",
round(lambda,2))))
forecast_ATM1 <- forecast_ATM1 %>%
rename(atm1 = .mean)
forecast_ATM2 <- forecast_ATM2 %>%
rename(atm2 = .mean)
forecast_atm3 <- forecast_atm3 %>%
rename(atm3 = .mean)
forecast_ATM4 <- forecast_ATM4 %>%
rename(atm4 = .mean)
combined_forecasts <- bind_cols(forecast_ATM1, forecast_ATM2, forecast_atm3, forecast_ATM4)
combined_forecasts <- combined_forecasts %>%
rename(date = date...2)
final_forecast <- combined_forecasts %>%
select(date, atm1, atm2, atm3, atm4)
final_forecast %>% write.csv("final_forecasts.csv")
res <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
res_ts <- res %>%
rename(Date = 'YYYY-MMM') %>%
mutate(Date = yearmonth(Date)) %>%
as_tsibble(index = Date)
head(res_ts)
## # A tsibble: 6 x 3 [1M]
## CaseSequence Date KWH
## <dbl> <mth> <dbl>
## 1 733 1998 Jan 6862583
## 2 734 1998 Feb 5838198
## 3 735 1998 Mar 5420658
## 4 736 1998 Apr 5010364
## 5 737 1998 May 4665377
## 6 738 1998 Jun 6467147
res_ts %>%
autoplot(KWH)
summary(res_ts$KWH)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 770523 5429912 6283324 6502475 7620524 10655730 1
missing_KWH<- res_ts %>% filter(is.na(KWH)) %>% select(Date, KWH)
missing_KWH
## # A tsibble: 1 x 2 [1M]
## Date KWH
## <mth> <dbl>
## 1 2008 Sep NA
res_arima <- auto.arima(res_ts$KWH, seasonal = TRUE)
# Count the number of missing values in 'KWH'
num_missing <- sum(is.na(res_ts$KWH))
# If there are missing values, forecast them using the fitted ARIMA model
if (num_missing > 0) {
forecast_values <- forecast(res_arima, h = num_missing)$mean
# Create a new column 'KWH_filled' by filling NA with forecast values
res_filled <- res_ts %>%
mutate(KWH_filled = ifelse(is.na(KWH), forecast_values, KWH))
} else {
res_filled <- res_ts %>%
mutate(KWH_filled = KWH)
}
res_filled <- res_filled %>%
select(CaseSequence,Date, KWH_filled)
summary(res_filled$KWH_filled) ## Check if the missing value is handled
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 770523 5434539 6314472 6516367 7649733 10655730
Q1 <- quantile(res_filled$KWH_filled, 0.25)
Q3 <- quantile(res_filled$KWH_filled, 0.75)
IQR <- Q3 - Q1
# Define upper and lower bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Identify outliers
potential_outliers <- res_filled$KWH_filled[res_filled$KWH_filled < lower_bound | res_filled$KWH_filled > upper_bound]
print(potential_outliers)
## [1] 770523
KWH_filled_with_na <- res_filled$KWH_filled
KWH_filled_with_na[KWH_filled_with_na < lower_bound | KWH_filled_with_na > upper_bound] <- NA
# Step 2: Fit an ARIMA model (including NA values)
arima_model <- auto.arima(KWH_filled_with_na)
# Step 3: Forecast missing values
forecasts <- forecast(arima_model, h = sum(is.na(KWH_filled_with_na)))
# Impute the NA values with the forecasted values
KWH_imputed <- KWH_filled_with_na
KWH_imputed[is.na(KWH_imputed)] <- forecasts$mean
res_filled$KWH_filled <- KWH_imputed
head(res_filled)
## # A tsibble: 6 x 3 [1M]
## CaseSequence Date KWH_filled
## <dbl> <mth> <dbl>
## 1 733 1998 Jan 6862583
## 2 734 1998 Feb 5838198
## 3 735 1998 Mar 5420658
## 4 736 1998 Apr 5010364
## 5 737 1998 May 4665377
## 6 738 1998 Jun 6467147
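For reference, `forecast::tsclean()` bundles outlier replacement and NA interpolation into one call. The sketch below builds a monthly `ts` from the raw KWH series and is shown only for comparison with the manual steps above.
# Sketch: one-step cleaning of the raw KWH series
kwh_ts <- ts(res_ts$KWH, start = c(1998, 1), frequency = 12)
kwh_clean <- tsclean(kwh_ts) # replaces outliers and interpolates missing values
summary(as.numeric(kwh_clean))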
res_filled <- res_filled %>%
mutate(KWH_filled = KWH_filled / 1000)
head(res_filled)
## # A tsibble: 6 x 3 [1M]
## CaseSequence Date KWH_filled
## <dbl> <mth> <dbl>
## 1 733 1998 Jan 6863.
## 2 734 1998 Feb 5838.
## 3 735 1998 Mar 5421.
## 4 736 1998 Apr 5010.
## 5 737 1998 May 4665.
## 6 738 1998 Jun 6467.
res_filled%>%
autoplot(KWH_filled) +
ggtitle("Residential Power Usage cleaned plot")
res_filled %>%
model(STL(KWH_filled ~ season(window = "periodic"), robust = TRUE)) %>%
components() %>%
autoplot() +
labs(title = "STL Decomposition cleaned plot")
res_filled$KWH_filled <- as.numeric(res_filled$KWH_filled)
# Create the time series object
plot_season <- ts(res_filled$KWH_filled, start = c(1998, 1), frequency = 12)
ggseasonplot(plot_season) +
ggtitle("Residential Power Usage Seasonal Plot by Year")
power_lambda <- res_filled %>%
features(KWH_filled, features = guerrero) %>%
pull(lambda_guerrero)
res_filled <- res_filled %>%
mutate(KWH_bc = box_cox(KWH_filled, power_lambda))
summary(res_filled$KWH_bc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.040 3.058 3.069 3.069 3.082 3.103
res_filled %>%
autoplot(KWH_bc) +
labs(y = "KWH in Thousands")
res_filled %>%
ACF(KWH_bc, lag_max = 36) %>%
autoplot()
ndiffs(res_filled$KWH_bc)
## [1] 1
diff_res <- res_filled %>%
mutate(diff_KWH= difference(KWH_filled), diff_KWH_bc = difference(KWH_bc)) %>%
select(-KWH_filled, -KWH_bc) %>%
slice(-1)
print(kpss.test(diff_res$diff_KWH_bc))
## Warning in kpss.test(diff_res$diff_KWH_bc): p-value greater than printed
## p-value
##
## KPSS Test for Level Stationarity
##
## data: diff_res$diff_KWH_bc
## KPSS Level = 0.039109, Truncation lag parameter = 4, p-value = 0.1
ndiffs(diff_res$diff_KWH_bc)
## [1] 0
diff_train <- diff_res %>%
filter(year(Date) < 2013)
diff_fit <- diff_train %>%
model(
ARIMA = ARIMA(diff_KWH),
`Auto ARIMA` = ARIMA(diff_KWH, stepwise = FALSE, approx = FALSE)
)
diff_fc <- diff_fit %>%
forecast(h = "1 year")
#plot
diff_fc %>%
autoplot(diff_res, level = NULL)+
facet_wrap( ~ .model, scales = "free_y") +
guides(colour = guide_legend(title = "Forecast"))+
labs(title= "KWH Forecasts")+
xlab("Date") +
ylab("KWH in Thousands")
train <- res_filled %>%
filter(year(Date) < 2013)
#models
fit <- train %>%
model(
ETS = ETS(KWH_filled),
`Additive ETS` = ETS(KWH_filled ~ error("A") + trend("A") + season("A")),
SNAIVE = SNAIVE(KWH_filled)
)
#forecast of 2013
fc <- fit %>%
forecast(h = "1 year")
#plot
fc %>%
autoplot(res_filled, level = NULL)+
facet_wrap( ~ .model, scales = "free_y",ncol=2) +
guides(colour = guide_legend(title = "Forecast"))+
labs(title= "KWH Forecasts",
subtitle = "January 2013 - December 2013")+
xlab("Month") +
ylab("KWH in Thousands")
accuracy(diff_fc, diff_res) %>%
select(.model, RMSE:MAE)
## # A tibble: 2 × 3
## .model RMSE MAE
## <chr> <dbl> <dbl>
## 1 ARIMA 1252. 904.
## 2 Auto ARIMA 1187. 823.
accuracy(fc, res_filled) %>%
select(.model, RMSE:MAE)
## # A tibble: 3 × 3
## .model RMSE MAE
## <chr> <dbl> <dbl>
## 1 Additive ETS 1047. 647.
## 2 ETS 1011. 681.
## 3 SNAIVE 1036. 619.
ETS_fit <- res_filled %>%
model(`Additive ETS` = ETS(KWH_filled ~ error("A") + trend("A") + season("A")))
ETS_forecast <- ETS_fit %>%
forecast(h=12)
ETS_forecast %>%
autoplot(res_filled) +
labs(title = " Residential Power Usage using Additive ETS ",
y = "KWH in Thousands")
final_forec <- ETS_forecast %>%
as_tibble() %>%
rename(KWH_2014 = .mean) %>%
select(Date, KWH_2014)
final_forec %>% write.csv("KWH_forecasts.csv")
p1 <- read_excel("Waterflow_Pipe1.xlsx",col_types = c("date", "numeric"))
p2 <- read_excel("Waterflow_Pipe2.xlsx",col_types = c("date", "numeric"))
colnames(p1) <- c("Datetime", "Waterflow")
colnames(p2) <- c("Datetime", "Waterflow")
# Process data
p1 <- p1 %>%
mutate(
Date = as.Date(Datetime),
Time = hour(Datetime)
) %>%
group_by(Date, Time) %>%
summarise(Water = mean(Waterflow, na.rm = TRUE)) %>% # average readings within each hour; na.rm handles any missing values
ungroup() %>%
mutate(Datetime = ymd_h(paste(Date, Time))) %>%
select(Datetime, Water)
## `summarise()` has grouped output by 'Date'. You can override using the
## `.groups` argument.
p2 <- p2 %>%
mutate(
Date = as.Date(Datetime),
Time = hour(Datetime)
) %>%
group_by(Date, Time) %>%
summarise(Water = mean(Waterflow, na.rm = TRUE)) %>%
ungroup() %>%
mutate(Datetime = ymd_h(paste(Date, Time))) %>%
select(Datetime, Water)
## `summarise()` has grouped output by 'Date'. You can override using the
## `.groups` argument.
head(p1)
## # A tibble: 6 × 2
## Datetime Water
## <dttm> <dbl>
## 1 2015-10-23 00:00:00 26.1
## 2 2015-10-23 01:00:00 18.9
## 3 2015-10-23 02:00:00 15.2
## 4 2015-10-23 03:00:00 23.1
## 5 2015-10-23 04:00:00 15.5
## 6 2015-10-23 05:00:00 22.7
head(p2)
## # A tibble: 6 × 2
## Datetime Water
## <dttm> <dbl>
## 1 2015-10-23 01:00:00 18.8
## 2 2015-10-23 02:00:00 43.1
## 3 2015-10-23 03:00:00 38.0
## 4 2015-10-23 04:00:00 36.1
## 5 2015-10-23 05:00:00 31.9
## 6 2015-10-23 06:00:00 28.2
colSums(is.na(p1))
## Datetime Water
## 0 0
colSums(is.na(p2))
## Datetime Water
## 0 0
library(gridExtra)
x_limits <- range(c(p1$Datetime, p2$Datetime))
y_limits <- range(c(p1$Water, p2$Water))
p1_plot <- ggplot(p1, aes(x = Datetime, y = Water)) +
geom_line(color = "blue") +
labs(title = "Water Flow - Pipe 1", x = "Date Time", y = "Water Flow") +
scale_x_datetime(limits = x_limits) + # Set x limits
scale_y_continuous(limits = y_limits) + # Set y limits
theme_minimal()
# Plot for Pipe 2 with same axis limits
p2_plot <- ggplot(p2, aes(x = Datetime, y = Water)) +
geom_line(color = "red") +
labs(title = "Water Flow - Pipe 2", x = "Date Time", y = "Water Flow") +
scale_x_datetime(limits = x_limits) + # Set x limits
scale_y_continuous(limits = y_limits) + # Set y limits
theme_minimal()
# Display both plots side by side
grid.arrange(p1_plot, p2_plot, ncol = 2)
p1_ts <- ts(p1$Water, frequency = 24) # hourly data with daily seasonality; series starts 2015-10-23
p2_ts <- ts(p2$Water, frequency = 24)
ggtsplot_combined <- function(ts1, ts2, title1, title2) {
# Time series plots
p1 <- autoplot(ts1) +
scale_y_continuous(labels = comma) +
ggtitle(title1) +
theme(axis.title = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
p2 <- autoplot(ts2) +
scale_y_continuous(labels = comma) +
ggtitle(title2) +
theme(axis.title = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
# ACF and PACF plots
acf1 <- ggAcf(ts1) + ggtitle("ACF - Pipe 1") + theme_minimal()
pacf1 <- ggPacf(ts1) + ggtitle("PACF - Pipe 1") + theme_minimal()
acf2 <- ggAcf(ts2) + ggtitle("ACF - Pipe 2") + theme_minimal()
pacf2 <- ggPacf(ts2) + ggtitle("PACF - Pipe 2") + theme_minimal()
# Combine all plots into a single grid
grid.arrange(
p1, p2,
acf1, pacf1,
acf2, pacf2,
ncol = 2
)
}
ggtsplot_combined(p1_ts, p2_ts, "Water Flow - Pipe 1", "Water Flow - Pipe 2")
# Function to check stationarity
check_stationarity <- function(x) {
results <- kpss.test(x)
# TRUE if the KPSS null of level stationarity is not rejected at the 5% level
# (note: kpss.test() caps reported p-values at 0.01 and 0.10)
return(results$p.value > 0.05)
}
# Check stationarity for both pipes
p1_stationary <- check_stationarity(p1_ts)
p2_stationary <- check_stationarity(p2_ts)
# Return message based on the results
if (p1_stationary && p2_stationary) {
message <- "Both pipes data is stationary."
} else {
message <- "At least one of the pipes data is not stationary."
}
print(message)
## [1] "Both pipes data is stationary."
p1_subseries_plot <- ggsubseriesplot(p1_ts) +
labs(title = "Subseries Plot for Water Flow - Pipe 1") +
theme(axis.text.x = element_text(size = 6.5))
p2_subseries_plot <- ggsubseriesplot(p2_ts) +
labs(title = "Subseries Plot for Water Flow - Pipe 2") +
theme(axis.text.x = element_text(size = 6.5))
# Arrange the plots side by side
grid.arrange(p1_subseries_plot, p2_subseries_plot, ncol = 2)
p1_tsibble <- p1 %>% as_tsibble(index = Datetime)
p1_tsibble <- p1_tsibble %>%
fill_gaps()
lambda <- p1_tsibble %>%
features(Water, features = guerrero) %>%
pull(lambda_guerrero)
# Step 2: Fit ARIMA and Box-Cox ARIMA models
p1_fit <- p1_tsibble %>%
model(
ARIMA_bc = ARIMA(box_cox(Water, lambda)),
ARIMA = ARIMA(Water)
)
left_join(glance(p1_fit) %>% select(.model:BIC),
accuracy(p1_fit) %>% select(.model, RMSE)) %>%
arrange(RMSE)
## # A tibble: 2 × 7
## .model sigma2 log_lik AIC AICc BIC RMSE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA 17.9 -675. 1362. 1362. 1383. 4.22
## 2 ARIMA_bc 22.4 -702. 1412. 1412. 1426. 4.23
accuracy(p1_fit) %>%
select(.model, RMSE:MAPE)
## # A tibble: 2 × 5
## .model RMSE MAE MPE MAPE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA_bc 4.23 3.33 -5.15 18.3
## 2 ARIMA 4.22 3.31 -5.01 18.2
p1_final_fit <- p1_tsibble %>%
model(
ARIMA = ARIMA(Water)
)
p1_final_fit %>%
select(.model = "ARIMA") %>% report()
## Series: Water
## Model: ARIMA(2,0,2) w/ mean
##
## Coefficients:
## ar1 ar2 ma1 ma2 constant
## 0.1693 0.5913 -0.1540 -0.6364 4.7616
## s.e. 0.4699 0.4205 0.4477 0.4012 0.0581
##
## sigma^2 estimated as 17.92: log likelihood=-674.93
## AIC=1361.86 AICc=1362.22 BIC=1382.74
p1_forecast <- p1_final_fit %>%
forecast(h = 24 * 7) # h = 168 hours, i.e. a one-week (7-day) forecast
# Plot the forecast
autoplot(p1_forecast, p1_tsibble) +
labs(title = "ARIMA(2,0,2) w/ mean Model Forecast for Pipe 1 Water Flow", y = "Water Flow", x = "Datetime")
# Check residuals to ensure they resemble white noise
p1_final_fit %>%
gg_tsresiduals()
p2_tsibble <- p2 %>% as_tsibble(index = Datetime)
p2_tsibble <- p2_tsibble %>%
fill_gaps()
lambda <- p2_tsibble %>%
features(Water, features = guerrero) %>%
pull(lambda_guerrero)
# Step 2: Fit ARIMA and Box-Cox ARIMA models
p2_fit <- p2_tsibble %>%
model(
ARIMA_bc = ARIMA(box_cox(Water, lambda)),
ARIMA = ARIMA(Water)
)
left_join(glance(p2_fit) %>% select(.model:BIC),
accuracy(p2_fit) %>% select(.model, RMSE)) %>%
arrange(RMSE)
## # A tibble: 2 × 7
## .model sigma2 log_lik AIC AICc BIC RMSE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA 256. -4191. 8387. 8387. 8402. 16.0
## 2 ARIMA_bc 172. -3992. 7990. 7990. 8005. 16.0
accuracy(p2_fit) %>%
select(.model, RMSE:MAPE)
## # A tibble: 2 × 5
## .model RMSE MAE MPE MAPE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA_bc 16.0 13.1 -33.9 56.9
## 2 ARIMA 16.0 13.1 -34.6 57.2
p2_final_fit <- p2_tsibble %>%
model(
ARIMA = ARIMA(Water)
)
p2_final_fit %>%
select(.model = "ARIMA") %>% report()
## Series: Water
## Model: ARIMA(0,0,0)(0,0,1)[24] w/ mean
##
## Coefficients:
## sma1 constant
## 0.0846 39.5732
## s.e. 0.0314 0.5472
##
## sigma^2 estimated as 256: log likelihood=-4190.54
## AIC=8387.08 AICc=8387.1 BIC=8401.8
p2_forecast <- p2_final_fit %>%
forecast(h = 24*7)
# Plot the forecast
autoplot(p2_forecast, p2_tsibble) +
labs(title = "ARIMA(0,0,0)(0,0,1)[24] Model Forecast for Pipe 2 Water Flow", y = "Water Flow", x = "Datetime")
# Check residuals to ensure they resemble white noise
p2_final_fit %>%
gg_tsresiduals()
p1_forecast <- p1_forecast %>%
as.data.frame() %>%
select(Datetime, .mean) %>%
rename(Water1 = .mean)
p2_forecast <- p2_forecast %>%
as.data.frame() %>%
select(Datetime, .mean) %>%
rename(Water2 = .mean)
# The two pipes end at different timestamps, so a single joined long table is awkward to read;
# export both forecasts side by side in one file instead
write.csv(list('1-Pipe' = p1_forecast, '2-Pipe' = p2_forecast), file = 'WaterPipesForecasts.csv')
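If separate files are preferred, each pipe's forecast can simply be written on its own; the sketch below does that, and the file names are only placeholders.
# Sketch: one CSV per pipe
write.csv(p1_forecast, "Pipe1_Forecast.csv", row.names = FALSE)
write.csv(p2_forecast, "Pipe2_Forecast.csv", row.names = FALSE)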