suppressMessages(suppressWarnings(library(fpp2)))
suppressMessages(suppressWarnings(library(readxl)))
suppressMessages(suppressWarnings(library(seasonal)))
suppressMessages(suppressWarnings(library(rdatamarket)))
suppressMessages(suppressWarnings(library(tseries)))
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(scales)))
suppressMessages(suppressWarnings(library(forecast)))
suppressMessages(suppressWarnings(library(lubridate)))
# read the data
df_atm <- readxl::read_excel("C:/Users/rites/Documents/GitHub/Data624-Project1/ATM624Data.xlsx")
## readxl works best with a newer version of the tibble package.
## You currently have tibble v1.4.2.
## Falling back to column name repair from tibble <= v1.4.2.
## Message displays once per session.
# data preprocessing:
df_atm <- df_atm %>%
drop_na() %>%
spread(ATM, Cash) %>%
mutate(DATE = as.Date(DATE, origin = "1899-12-30")) # in Excel, 1 == 1/1/1900
# convert to timeseries
atm_ts <- ts(df_atm %>% select(-DATE))
df_atm %>% gather(ATM, Cash, -DATE) %>%
ggplot(aes(x = DATE, y = Cash, col = ATM)) +
geom_line(show.legend = FALSE) +
facet_wrap(~ ATM, ncol = 1, scales = "free_y") +
labs(title = "Cash withdrawn from ATMs", x = "Date") +
scale_y_continuous("Cash withdrawn (hundreds)", labels = dollar)
# ATM1 & ATM2
atm1 <- atm_ts[, "ATM1"]
atm2 <- atm_ts[, "ATM2"]
#last 3 observations of ATM3 & convert to ts
atm3 <- atm_ts[(nrow(atm_ts) - 2):nrow(atm_ts), "ATM3"]
atm3 <- ts(atm3, start = 363)
atm3 <- atm_ts[, "ATM3"]
atm3[which(atm3 == 0)] <- NA
# ATM4 & impute spike with median
atm4 <- atm_ts[, "ATM4"]
atm4[which.max(atm4)] <- median(atm4, na.rm = TRUE)
autoplot(ts(atm_ts[1:61, ])) +
labs(title = "Cash withdrawn from ATMs", x = "Day") +
scale_y_continuous("Cash withdrawn (hundreds)", labels = dollar) +
scale_color_discrete(NULL)
atm1 <- ts(atm1, frequency = 7)
atm2 <- ts(atm2, frequency = 7)
# impute NA value
atm1[which(is.na(atm1))] <- median(atm1, na.rm = TRUE)
atm2[which(is.na(atm2))] <- median(atm2, na.rm = TRUE)
atm4 <- ts(atm4, frequency = 7)
ggtsdisplay(atm1, points = FALSE, plot.type = "spectrum",
main = "Withdrawals from atm1", xlab = "Week", ylab = "Cash (hundreds)")
ggtsdisplay(diff(atm1, 7), points = FALSE,
main = "Dif lag7 withdrawals from atm1",
xlab = "Week", ylab = "Cash (hundreds)")
# lambda for Box-cox transformation
atm1_lambda <- BoxCox.lambda(atm1)
# define function to create models & return AIC values for timeseries
atm_aic <- function(p, d, q, P, D, Q) {
# create model with Box-Cox and specified ARIMA parameters; extract AIC
AIC(Arima(atm1, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm1_lambda))
}
# create possible combinations of p, q, P, Q except all zero
expand.grid(p = 0:1, q = 0:1, P = 0:1, Q = 0:1) %>%
filter(p > 0 | q > 0 | P > 0 | Q > 0) %>%
# calc AIC for models
mutate(aic = pmap_dbl(list(p, 0, q, P, 1, Q), atm_aic)) %>%
# return best AIC
slice(which.min(aic))
## p q P Q aic
## 1 1 1 0 1 1188.862
atm1_fit <- Arima(atm1, order = c(1, 0, 1), seasonal = c(0, 1, 1), lambda = atm1_lambda)
Box.test(resid(atm1_fit), type = "L", fitdf = 3, lag = 7)
##
## Box-Ljung test
##
## data: resid(atm1_fit)
## X-squared = 8.2634, df = 4, p-value = 0.08239
ggtsdisplay(resid(atm1_fit), points = FALSE, plot.type = "histogram",
main = "Residuals for ARIMA(1,0,1)(0,1,1) fit of atm1 withdrawals",
xlab = "Week", ylab = "Residual")
ggtsdisplay(atm2, points = FALSE,
main = "Withdrawals from atm2", xlab = "Week", ylab = "Cash (hundreds)")
ggtsdisplay(diff(atm2, 7), points = FALSE,
main = "Dif lag7 withdrawals from atm2",
xlab = "Week", ylab = "Cash (hundreds)")
#lambda for Box-cox transformation
atm2_lambda <- BoxCox.lambda(atm2)
# repurpose above function for atm2
atm_aic <- function(p, d, q, P, D, Q) {
# create model with Box-Cox and specified ARIMA parameters; extract AIC
AIC(Arima(atm2, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm2_lambda))
}
# create possible combinations of p, q, P, Q except all zero
expand.grid(p = c(0, 2, 5), q = c(0, 2, 5), P = 0:1, Q = 0:1) %>%
filter(p > 0 | q > 0 | P > 0 | Q > 0) %>%
# calc AIC for models
mutate(aic = pmap_dbl(list(p, 0, q, P, 1, Q), atm_aic)) %>%
# return best AIC
slice(which.min(aic))
## p q P Q aic
## 1 5 5 0 1 2544.791
atm2_fit <- Arima(atm2, order = c(5, 0, 5), seasonal = c(0, 1, 1), lambda = atm2_lambda)
Box.test(resid(atm2_fit), type = "L", fitdf = 11, lag = 14)
##
## Box-Ljung test
##
## data: resid(atm2_fit)
## X-squared = 4.696, df = 3, p-value = 0.1955
ggtsdisplay(resid(atm2_fit), points = FALSE, plot.type = "histogram",
main = "Residuals for ARIMA(5,0,5)(0,1,1) of atm2 withdrawals",
xlab = "Week", ylab = "Residual")
ggtsdisplay(atm4, points = FALSE,
main = "Withdrawals from atm4", xlab = "Week", ylab = "Cash (hundreds)")
ggtsdisplay(diff(atm4, 7), points = FALSE,
main = "Dif lag7 withdrawals from atm4",
xlab = "Week", ylab = "Cash (hundreds)")
# lambda for Box-cox transformation
atm4_lambda <- BoxCox.lambda(atm4)
# repurpose above function for atm4
atm_aic <- function(p, d, q, P, D, Q) {
# create model with Box-Cox and specified ARIMA parameters; extract AIC
AIC(Arima(atm4, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm4_lambda))
}
# create possible combinations of p, q, P, Q except all zero
expand.grid(p = c(0, 2, 5), q = c(0, 2, 5), P = 0:1, Q = 0:1) %>%
filter(p > 0 | q > 0 | P > 0 | Q > 0) %>%
# calc AIC for models
mutate(aic = pmap_dbl(list(p, 0, q, P, 1, Q), atm_aic)) %>%
# return best AIC
slice(which.min(aic))
## p q P Q aic
## 1 5 2 0 1 2863.061
atm4_fit <- Arima(atm4, order = c(0, 0, 2), seasonal = c(0, 1, 1), lambda = atm4_lambda)
Box.test(resid(atm4_fit), type = "L", fitdf = 3, lag = 7)
##
## Box-Ljung test
##
## data: resid(atm4_fit)
## X-squared = 8.6819, df = 4, p-value = 0.06956
ggtsdisplay(resid(atm4_fit), points = FALSE, plot.type = "histogram",
main = "Residuals for ARIMA(0,0,2)(0,1,1) of atm4 withdrawals",
xlab = "Week", ylab = "Residual")
atm1_forecast <- forecast(atm1_fit, 31, level = 95)
atm2_forecast <- forecast(atm2_fit, 31, level = 95)
atm3_forecast <- meanf(atm3, 31, level = 95)
atm4_forecast <- forecast(atm4_fit, 31, level = 95)
gridExtra::grid.arrange(
autoplot(atm1_forecast) +
labs(title = "atm1: ARIMA(1,0,1)(0,1,1)", x = "Week", y = NULL) +
theme(legend.position = "none"),
autoplot(atm2_forecast) +
labs(title = "atm2: ARIMA(5,0,5)(0,1,1)", x = "Week", y = NULL) +
theme(legend.position = "none"),
autoplot(atm3_forecast) +
labs(title = "atm3: mean", x = "Day", y = NULL) +
theme(legend.position = "none"),
autoplot(atm4_forecast) +
labs(title = "atm4: ARIMA(0,0,2)(0,1,1)", x = "Week", y = NULL) +
theme(legend.position = "none"),
a = grid::textGrob("Forecasted atm withdrawals")
)
## Warning: Removed 362 rows containing missing values (geom_path).
data_frame(DATE = rep(max(df_atm$DATE) + 1:31, 4),
atm = rep(names(df_atm)[-1], each = 31),
Cash = c(atm1_forecast$mean, atm2_forecast$mean,
atm3_forecast$mean, atm4_forecast$mean)) %>%
write_csv("C:/Users/rites/Documents/GitHub/Data624-Project1/project1_atm.csv")
# read the data
pu_data <- readxl::read_excel("C:/Users/rites/Documents/GitHub/Data624-Project1/ResidentialCustomerForecastLoad-624.xlsx")
#timeseries
pu_data =ts(pu_data[,"KWH"],start = c(1998,1),frequency = 12)
autoplot(pu_data) +
labs(title = "Monthly residential power usage")
# Box-cox paramter
pu_lambda <- BoxCox.lambda(pu_data)
pu_trans <- BoxCox(pu_data, pu_lambda)
ggtsdisplay(diff(pu_trans, 12), points = FALSE,
main = "Dif lag12 Box-Cox transformed residential power usage")
kWh timeseries with \(D = 1\) and \(d = 0\):# redefine function
pu_aic <- function(p, q, P, Q) {
# create model with Box-Cox and specified ARIMA parameters; extract AIC
AIC(Arima(pu_data, order = c(p, 0, q), seasonal = c(P, 1, Q), lambda = pu_lambda))
}
# create possible combinations except all zero & p = q = 4; P = Q = 1 (returns error)
expand.grid(p = c(0, 1, 4), q = c(0, 1, 4), P = 0:1, Q = 0:1) %>%
filter(p > 0 | q > 0 | P > 0 | Q > 0, p < 4 | q < 4 | P < 1 | Q < 1) %>%
# calc AIC for models
mutate(aic = pmap_dbl(list(p, q, P, Q), pu_aic)) %>%
# return best AIC
slice(which.min(aic))
## p q P Q aic
## 1 1 0 0 1 575.2659
pu_fit <- Arima(pu_data, order = c(1, 0, 0), seasonal = c(0, 1, 1), lambda = pu_lambda)
Box.test(resid(pu_fit), type = "L", fitdf = 3, lag = 12)
##
## Box-Ljung test
##
## data: resid(pu_fit)
## X-squared = 6.4543, df = 9, p-value = 0.6937
ggtsdisplay(resid(pu_fit), points = FALSE,
main = "Residuals for ARIMA(1,0,0)(0,1,1) of residential power usage")
expand.grid(p = c(1, 3), q = c(1, 3)) %>%
mutate(aic = pmap_dbl(list(p, q, 0, 1), pu_aic))
## p q aic
## 1 1 1 575.7090
## 2 3 1 579.7385
## 3 1 3 579.9391
## 4 3 3 581.4511
ggtsdisplay(resid(pu_fit), points = FALSE, plot.type = "histogram",
main = "Residuals for ARIMA(1,0,0)(0,1,1) of residential power usage")
## Warning: Removed 1 rows containing non-finite values (stat_bin).
Using the ARIMA(1,0,0)(0,1,1) model, the next year (12 months) is forecast, and this forecast is plotted:
pu_forecast <- forecast(pu_fit, 12, level = 95)
autoplot(pu_forecast) +
labs(title = "Forecasted residential enery use",
subtitle = "Using ARIMA(1,0,0)(0,1,1) model", x = "Month", y = "kWh") +
theme(legend.position = "none")
data_frame(`YYYY-MMM` = paste0(2014, "-", month.abb),
PU = pu_forecast$mean) %>%
write_csv("C:/Users/rites/Documents/GitHub/Data624-Project1/project1_kWh.csv")
# read the data
water1 <- readxl::read_excel("C:/Users/rites/Documents/GitHub/Data624-Project1/Waterflow_Pipe1.xlsx")
water2 <- readxl::read_excel("C:/Users/rites/Documents/GitHub/Data624-Project1/Waterflow_Pipe2.xlsx")
water1 <- water1 %>%
# separate date & hour components of readings
mutate(Date = date(DateTime),
# convert hour to hour-ending to match pipeline 2
Hour = hour(DateTime) + 1) %>%
# get average reading for each date & hour
group_by(Date, Hour) %>%
summarize(WaterFlow = mean(WaterFlow)) %>%
# convert back to DateTime and drop separate date/hour columns
ungroup() %>%
mutate(DateTime = ymd_h(paste(Date, Hour))) %>%
select(DateTime, WaterFlow)
# create df with both observations for each hour
water_df <- full_join(water1, water2, by = "DateTime", suffix = c("_1", "_2")) %>%
# convert missing pipeline 1 readings to zero
mutate(WaterFlow_1 = ifelse(is.na(WaterFlow_1), 0, WaterFlow_1)) %>%
# get total waterflow by hour
mutate(WaterFlow = WaterFlow_1 + WaterFlow_2) %>%
# drop individual numbers
select(DateTime, WaterFlow)
# create hourly timeseries object
water_ts <- ts(water_df$WaterFlow, frequency = 24)
autoplot(water_ts) +
labs(title = "Hourly water flow through two pipelines", x = "Day", y = "Total waterflow")
# Box-cox paramter & transform
water_lambda <- BoxCox.lambda(water_ts)
water_trans <- BoxCox(water_ts, water_lambda)
# plot differenced transformed series
ggtsdisplay(diff(water_trans), points = FALSE,
main = "Differenced Box-Cox transformed water flow")
water_fit <- Arima(water_ts, order = c(1, 1, 1), lambda = water_lambda)
Box.test(resid(water_fit), type = "L")
##
## Box-Ljung test
##
## data: resid(water_fit)
## X-squared = 2.0096e-05, df = 1, p-value = 0.9964
ggtsdisplay(resid(water_fit), points = FALSE, plot.type = "histogram",
main = "Residuals for ARIMA(1,1,1) of water flow")
water_forecast <- forecast(water_fit, 168, level = 95)
autoplot(water_forecast) +
labs(title = "Forecasted water flow",
x = "Day", y = "Total flow") +
theme(legend.position = "none")
data_frame(DateTime = max(water_df$DateTime) + hours(1:168),
WaterFlow = water_forecast$mean) %>%
write_csv("C:/Users/rites/Documents/GitHub/Data624-Project1/project1_water.csv")