In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file; please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also, please submit the forecast in an Excel-readable file.
First, let’s take a look at the data.
# Load ATM data and convert to proper format
atm<- read_excel("/Users/ulianaplotnikova/Desktop/Data624/ATM624Data.xlsx",col_types = c('date', 'text', 'numeric'))
# make all column names lowercase
atm <- atm |>
rename_with(tolower)
glimpse(atm)
## Rows: 1,474
## Columns: 3
## $ date <dttm> 2009-05-01, 2009-05-01, 2009-05-02, 2009-05-02, 2009-05-03, 2009…
## $ atm <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …
The dates need to be converted to R's Date class; we then convert the data frame into a tsibble indexed by date and keyed by the ATM identifier.
# read_excel already parsed the dates as POSIXct, so as.Date() simply drops the time component
atm <- atm |>
  mutate(
    date = as.Date(date)
  )
# Converting the dataset into a tsibble
atm_ts <- atm |>
as_tsibble(
index = date,
key = atm
)
head(atm_ts)
## # A tsibble: 6 x 3 [1D]
## # Key: atm [1]
## date atm cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-02 ATM1 82
## 3 2009-05-03 ATM1 85
## 4 2009-05-04 ATM1 90
## 5 2009-05-05 ATM1 99
## 6 2009-05-06 ATM1 88
Let’s take a look at the daily cash withdrawal amounts.
atm_ts |>
autoplot(cash) +
facet_wrap(~atm, scales = "free", nrow = 4) +
labs(title = "ATM Withdrawals: May 2009 - April 2010",
subtitle = "Daily cash withdrawal amounts",
x = "Date",
y = "Cash withdrawals (hundreds)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
plot.subtitle = element_text(hjust = 0.5, size = 11),
strip.text = element_text(face = "bold"),
legend.position = "none")
ATM1 and ATM2 show relatively consistent withdrawal patterns throughout the year, fluctuating around means of roughly 50-75 (hundreds of dollars). ATM3 shows essentially no withdrawals until the last few days of April 2010. ATM4 looks markedly different: its withdrawals appear low for most of the period only because a single extreme spike in early 2010 dominates the scale, which points to a possible recording error or an unusual one-off event at that machine. Overall, the data suggest time-based variation in ATM usage, with higher withdrawals in certain months and on certain days of the week.
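Before treating it later on, we can inspect the extreme ATM4 observation directly; a quick sketch using the tsibble built above:
# Largest ATM4 observations; the top value dwarfs the rest of the series
atm_ts |>
  filter(atm == "ATM4") |>
  as_tibble() |>
  slice_max(cash, n = 3)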
atm_ts |>
ggplot(aes(x = cash, fill = atm)) +
geom_histogram(bins = 30, alpha = 0.8, color = "white") +
facet_wrap(~atm, ncol = 2, scales = "free") +
labs(title = "Distribution of ATM Withdrawals: May 2009 - April 2010",
subtitle = "Histograms showing withdrawal frequency distribution",
x = "Withdrawal amount (hundreds)",
y = "Frequency") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
plot.subtitle = element_text(hjust = 0.5, size = 11),
strip.text = element_text(face = "bold"),
legend.position = "none") +
scale_fill_brewer(palette = "Set1")
Check for null entries
# Check for null entries
atm |>
group_by(atm) |>
summarise(
null_count = sum(is.na(cash))
)
## # A tibble: 5 × 2
## atm null_count
## <chr> <int>
## 1 ATM1 3
## 2 ATM2 2
## 3 ATM3 0
## 4 ATM4 0
## 5 <NA> 14
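The 14 rows with a missing atm label can be inspected directly; a quick sketch confirming they are the empty trailing dates in the file:
# The unlabeled rows are placeholder dates at the end of the file
atm |>
  filter(is.na(atm)) |>
  summarise(first_date = min(date), last_date = max(date), n = n())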
atm_ts |>
filter(
is.na(cash),
!is.na(atm)
)
## # A tsibble: 5 x 3 [1D]
## # Key: atm [2]
## date atm cash
## <date> <chr> <dbl>
## 1 2009-06-13 ATM1 NA
## 2 2009-06-16 ATM1 NA
## 3 2009-06-22 ATM1 NA
## 4 2009-06-18 ATM2 NA
## 5 2009-06-24 ATM2 NA
The 14 records with a missing atm label appear to be empty placeholder rows for the May 2010 forecast period at the end of the file, so we drop them. In this section we clean the dataset by handling the remaining missing values and the ATM4 outlier, and we create a separate time series for each machine.
# Filter out the NA values
atm_ts <- atm_ts |>
filter(
!is.na(atm)
)
# Create timeseries
atm1 <- atm_ts |>
filter(
atm == "ATM1"
) |>
mutate(
cash = na.approx(cash, na.rm = FALSE)
)
atm2 <- atm_ts |>
filter(
atm == "ATM2"
) |>
mutate(
cash = na.approx(cash, na.rm = FALSE)
)
atm3 <- atm_ts |>
filter(
atm == "ATM3"
)
atm4 <- atm_ts |>
filter(
atm == "ATM4"
) |>
mutate(
cash = if_else(cash > 9000, NA_real_, cash)
) |>
mutate(
cash = na.approx(cash, na.rm = FALSE)
)
# Combining all the ATM datasets after preprocessing
atm_clean <- bind_rows(atm1, atm2, atm3, atm4)
# Check the cleaned dataset
head(atm_clean)
## # A tsibble: 6 x 3 [1D]
## # Key: atm [1]
## date atm cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-02 ATM1 82
## 3 2009-05-03 ATM1 85
## 4 2009-05-04 ATM1 90
## 5 2009-05-05 ATM1 99
## 6 2009-05-06 ATM1 88
summary(atm_clean)
## date atm cash
## Min. :2009-05-01 Length:1460 Min. : 0.0
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 1.0
## Median :2009-10-30 Mode :character Median : 73.5
## Mean :2009-10-30 Mean : 148.1
## 3rd Qu.:2010-01-29 3rd Qu.: 114.0
## Max. :2010-04-30 Max. :1712.1
To improve forecasting, we will create additional time-based features that could help identify patterns.
# Add temporal features
atm_features <- atm_clean %>%
mutate(
day_of_week = wday(date, label = TRUE),
month = month(date, label = TRUE),
day_of_month = day(date),
is_weekend = if_else(wday(date) %in% c(1, 7), TRUE, FALSE)
)
# View the enhanced dataset
head(atm_features)
## # A tsibble: 6 x 7 [1D]
## # Key: atm [1]
## date atm cash day_of_week month day_of_month is_weekend
## <date> <chr> <dbl> <ord> <ord> <int> <lgl>
## 1 2009-05-01 ATM1 96 Fri May 1 FALSE
## 2 2009-05-02 ATM1 82 Sat May 2 TRUE
## 3 2009-05-03 ATM1 85 Sun May 3 TRUE
## 4 2009-05-04 ATM1 90 Mon May 4 FALSE
## 5 2009-05-05 ATM1 99 Tue May 5 FALSE
## 6 2009-05-06 ATM1 88 Wed May 6 FALSE
Let’s conduct some further analysis to find out whether there are any weekly, monthly, or other time-based patterns in the data for each ATM.
atm_features %>%
ggplot(aes(x = day_of_week, y = cash, fill = day_of_week)) +
geom_boxplot() +
facet_wrap(~atm, scales = "free_y") +
labs(title = "Weekly Distribution of ATM Cash Withdrawals",
x = "Day of Week",
y = "Cash (hundreds)") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
# Monthly patterns
atm_features %>%
ggplot(aes(x = month, y = cash, fill = month)) +
geom_boxplot() +
facet_wrap(~atm, scales = "free_y") +
labs(title = "Monthly Patterns in ATM Cash Withdrawals",
x = "Month",
y = "Cash (hundreds)") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
# Monthly trend analysis
atm_features %>%
group_by(atm, month) %>%
summarise(
total_cash = sum(cash, na.rm = TRUE),
avg_daily_cash = mean(cash, na.rm = TRUE),
.groups = "drop"
) %>%
ggplot(aes(x = month, y = total_cash, group = atm, color = atm)) +
geom_line(linewidth = 1) +
geom_point(size = 3, shape = 21, fill = "white") +
labs(
title = "Monthly Total Cash Withdrawals by ATM",
subtitle = "Showing seasonal patterns across locations",
x = "Month",
y = "Total cash withdrawals (hundreds)",
color = "ATM machine"
) +
scale_color_brewer(palette = "Dark2") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom"
)
The graph shows that total withdrawals are higher in some months and lower in others, a seasonal pattern: for several ATMs there is a peak around September and a decline toward the end of the calendar year.
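We can substantiate this by pulling each ATM's peak month directly; a quick sketch using the features built above:
# Peak month per ATM by total withdrawals
atm_features |>
  as_tibble() |>
  group_by(atm, month) |>
  summarise(total_cash = sum(cash, na.rm = TRUE), .groups = "drop") |>
  group_by(atm) |>
  slice_max(total_cash, n = 1)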
atm_features %>%
group_by(atm, is_weekend) %>%
summarise(
avg_withdrawal = mean(cash, na.rm = TRUE),
med_withdrawal = median(cash, na.rm = TRUE),
sd_withdrawal = sd(cash, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(day_type = if_else(is_weekend, "Weekend", "Weekday")) %>%
ggplot(aes(x = atm, y = avg_withdrawal, fill = day_type)) +
geom_col(position = "dodge", color = "white") +
geom_errorbar(
aes(ymin = avg_withdrawal - sd_withdrawal,
ymax = avg_withdrawal + sd_withdrawal),
position = position_dodge(0.9),
width = 0.2
) +
labs(
title = "Weekend vs Weekday Cash Withdrawal Patterns",
subtitle = "Average withdrawals with standard deviation error bars",
x = "ATM Location",
y = "Average cash withdrawal (hundreds)",
fill = "Day Type"
) +
scale_fill_brewer(palette = "Set1") +
theme_light() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
legend.position = "bottom"
)
Let’s decompose each ATM’s time series to better understand trend, seasonality, and residuals.
# decompose and plot time series
decompose_and_plot_enhanced <- function(atm_features, atm_name, color_scheme) {
# Convert to ts object
ts_data <- ts(atm_features$cash, frequency = 7) # Weekly seasonality
# Decompose
decomp <- decompose(ts_data, type = "additive")
# Create custom plot with enhanced appearance
component_names <- c("Observed", "Trend", "Seasonal", "Random")
component_data <- data.frame(
Date = rep(atm_features$date, 4),
Value = c(decomp$x, decomp$trend, decomp$seasonal, decomp$random),
Component = rep(component_names, each = length(decomp$x))
)
# Remove NA values
component_data <- component_data[!is.na(component_data$Value), ]
# Create the plot
ggplot(component_data, aes(x = Date, y = Value)) +
geom_line(color = color_scheme, linewidth = 0.8) +
facet_wrap(~Component, scales = "free_y", ncol = 1) +
labs(
title = paste("Time Series Decomposition:", atm_name),
subtitle = "Additive decomposition showing trend, seasonal, and random components",
y = "Cash withdrawals (hundreds)",
caption = paste("Data period:", min(atm_features$date), "to", max(atm_features$date))
) +
theme_minimal() +
theme(
strip.text = element_text(face = "bold", size = 12),
strip.background = element_rect(fill = "lightgrey", color = NA),
plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
plot.subtitle = element_text(hjust = 0.5, color = "darkgrey", size = 11),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 1, face = "italic", size = 9)
)
}
atm1_decomp<- decompose_and_plot_enhanced(atm1, "ATM1", "steelblue")
atm2_decomp <- decompose_and_plot_enhanced(atm2, "ATM2", "darkred")
atm3_decomp <- decompose_and_plot_enhanced(atm3, "ATM3", "forestgreen")
atm4_decomp <- decompose_and_plot_enhanced(atm4, "ATM4", "darkorange")
# Display the plots
atm1_decomp
atm2_decomp
atm3_decomp
atm4_decomp
To make our predictions, we’ll employ a variety of models, evaluate their outcomes, and select the top performer for forecasting. The models in our toolkit include:
- ARIMA
- Additive ETS
- Multiplicative ETS
- SNAIVE
- Transformed Additive
- Transformed Multiplicative
- Transformed SNAIVE
atm1 |>
gg_tsdisplay(cash, plot_type = 'partial') +
labs(title = 'ATM1 Cash Withdrawals')
lambda <- atm1 |>
features(cash, features = guerrero) |>
pull(lambda_guerrero)
atm1_fit <- atm1 |>
model(
"ARIMA" = ARIMA(box_cox(cash,lambda)),
"Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
"Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
"SNAIVE" = SNAIVE(cash),
"Transformed Additive" = ETS(box_cox(cash,lambda) ~ error("A") + trend("N") + season("A")),
"Transformed Multiplicative" = ETS(box_cox(cash,lambda) ~ error("M") + trend("N") + season("M")),
"Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda))
)
left_join(glance(atm1_fit) |>
select(.model:BIC),
accuracy(atm1_fit) |>
select(.model, RMSE, MAE)) |>
arrange(RMSE)
## Joining with `by = join_by(.model)`
## # A tibble: 7 × 8
## .model sigma2 log_lik AIC AICc BIC RMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Transformed Multiplicative 0.0383 -1219. 2458. 2458. 2497. 23.3 15.3
## 2 Additive ETS 580. -2234. 4487. 4488. 4526. 23.8 15.1
## 3 Transformed Additive 1.80 -1180. 2380. 2380. 2419. 24.9 15.9
## 4 ARIMA 1.76 -610. 1228. 1228. 1244. 24.9 16.0
## 5 Multiplicative ETS 0.134 -2273. 4566. 4567. 4605. 26.3 16.2
## 6 Transformed SNAIVE 2.48 NA NA NA NA 27.7 17.7
## 7 SNAIVE 772. NA NA NA NA 27.7 17.7
When comparing these results, keep in mind that information criteria (AIC, AICc, BIC) are only comparable between models fitted to the same response on the same scale, and that ETS and ARIMA likelihoods are computed differently, so the table's AIC column cannot rank all seven models directly. With that caveat, the ARIMA model has the lowest information criteria among the Box-Cox models and RMSE/MAE close to the best, so we select ARIMA as the best fit.
atm1_fit |>
select(.model = "ARIMA") |>
report()
## Series: cash
## Model: ARIMA(0,0,2)(0,1,1)[7]
## Transformation: box_cox(cash, lambda)
##
## Coefficients:
## ma1 ma2 sma1
## 0.1126 -0.1094 -0.6418
## s.e. 0.0524 0.0520 0.0432
##
## sigma^2 estimated as 1.764: log likelihood=-609.99
## AIC=1227.98 AICc=1228.09 BIC=1243.5
forecast_atm1 <- atm1_fit |>
forecast(h = 31) |>
filter(.model=='ARIMA')
atm1_fit |>
select(ARIMA) |>
gg_tsresiduals()
The ACF plot of the residuals shows no significant autocorrelation, the histogram of residuals is approximately normal, and the time plot of the innovation residuals indicates that the ARIMA model fits the data reasonably well, with only minor departures from constant variance and normality.
forecast_atm1 |>
autoplot(atm1)
Perform diagnostic checks using the Ljung-Box and Box-Pierce tests. (Strictly, dof should equal the number of estimated parameters, three for this ARIMA(0,0,2)(0,1,1)[7] model; with dof = 0 the p-values are mildly optimistic, but the conclusion below is unchanged.)
atm1_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, box_pierce, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 .model 6.37 0.784
atm1_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 .model 6.50 0.772
The ARIMA model fits the data well, as indicated by the high p-values from the Ljung-Box and Box-Pierce tests, suggesting the residuals are random.
atm2 |>
gg_tsdisplay(cash, plot_type = 'partial') +
labs(title = 'ATM2 CASH Withdrawals')
lambda2 <- atm2 |>
features(cash, features = guerrero) |>
pull(lambda_guerrero)
atm2_fit <- atm2 |>
model(
"ARIMA" = ARIMA(box_cox(cash,lambda2)),
"Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
"Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
"SNAIVE" = SNAIVE(cash),
"Transformed Additive" = ETS(box_cox(cash,lambda2) ~ error("A") + trend("N") + season("A")),
"Transformed Multiplicative" = ETS(box_cox(cash,lambda2) ~ error("M") + trend("N") + season("M")),
"Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda2))
)
left_join(glance(atm2_fit) |>
select(.model:BIC),
accuracy(atm2_fit) |>
select(.model, RMSE, MAE)) |>
arrange(RMSE)
## Joining with `by = join_by(.model)`
## # A tibble: 7 × 8
## .model sigma2 log_lik AIC AICc BIC RMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA 68.2 -1263. 2539. 2539. 2562. 24.5 17.1
## 2 Additive ETS 648. -2254. 4527. 4528. 4566. 25.1 17.9
## 3 Transformed Additive 72.4 -1854. 3728. 3728. 3767. 25.4 17.8
## 4 SNAIVE 914. NA NA NA NA 30.2 20.8
## 5 Transformed SNAIVE 101. NA NA NA NA 30.2 20.8
## 6 Transformed Multiplicative 0.308 -2011. 4043. 4043. 4082. 34.1 27.2
## 7 Multiplicative ETS 0.424 -2399. 4818. 4819. 4857. 34.4 27.0
ARIMA achieves the lowest RMSE and MAE of all seven candidates (and the lowest information criteria within its class), so we again go with ARIMA.
atm2_fit |>
select(.model = "ARIMA") |>
report()
## Series: cash
## Model: ARIMA(2,0,2)(0,1,1)[7]
## Transformation: box_cox(cash, lambda2)
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4256 -0.9051 0.4719 0.7967 -0.7271
## s.e. 0.0588 0.0444 0.0888 0.0590 0.0406
##
## sigma^2 estimated as 68.16: log likelihood=-1263.33
## AIC=2538.66 AICc=2538.9 BIC=2561.94
forecast_atm2 <- atm2_fit |>
forecast(h = 31) |>
filter(.model=='ARIMA')
atm2_fit |>
select(ARIMA) |>
gg_tsresiduals()
The ACF plot shows some autocorrelation, but the residuals plot and histogram indicate that the model is a reasonable fit, with residuals that are approximately normally distributed and show no obvious patterns. The model’s performance seems satisfactory.
forecast_atm2 |>
autoplot(atm2)
Perform diagnostic checks using the Ljung-Box and Box-Pierce tests.
atm2_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, box_pierce, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 .model 8.88 0.544
atm2_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 .model 9.06 0.527
Both p-values (bp_pvalue and lb_pvalue) are greater than the typical significance level of 0.05. This indicates that there is no significant autocorrelation in the residuals at the specified lag (10). In other words, the model adequately captures the autocorrelation in the data. The ARIMA model appears to be a good fit for the data.
ATM3 records no withdrawals until its final few days in April 2010, so there is too little usable history to support anything more complex than a random walk (naive) model.
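A quick sketch listing ATM3's non-zero observations shows how thin the usable history is:
# Only the final few days of ATM3 carry any information
atm3 |>
  filter(cash > 0)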
atm3 |>
gg_tsdisplay(cash, plot_type = 'partial') +
labs(title = 'ATM3 Cash Withdrawals')
atm3_fit <- atm3 |>
model(NAIVE(cash))
forecast_atm3 <- atm3_fit |>
forecast(h = 31)
forecast_atm3 |>
autoplot(atm3)
atm4 |>
gg_tsdisplay(cash, plot_type = 'partial') +
labs(title = 'ATM4 Cash Withdrawals')
lambda4 <- atm4 |>
features(cash, features = guerrero) |>
pull(lambda_guerrero)
atm4_fit <- atm4 |>
model(
"ARIMA" = ARIMA(box_cox(cash,lambda4)),
"Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
"Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
"SNAIVE" = SNAIVE(cash),
"Transformed_Additive" = ETS(box_cox(cash,lambda4) ~ error("A") + trend("N") + season("A")),
"Transformed Multiplicative" = ETS(box_cox(cash,lambda4) ~ error("M") + trend("N") + season("M")),
"Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda4)),
"Additive_Transformed_Non_Seasonal" = ETS(box_cox(cash,lambda4) ~ error("A") + trend("N") + season("N")),
"Multiplicative_Transformed_Non_Seasonal" = ETS(box_cox(cash,lambda4) ~ error("M") + trend("N") + season("N"))
)
left_join(glance(atm4_fit) |>
select(.model:BIC),
accuracy(atm4_fit) |>
select(.model, RMSE, MAE)) |>
arrange(RMSE)
## Joining with `by = join_by(.model)`
## # A tibble: 9 × 8
## .model sigma2 log_lik AIC AICc BIC RMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Additive ETS 1.11e+5 -3192. 6404. 6405. 6443. 329. 265.
## 2 Transformed_Additive 1.67e+2 -2006. 4032. 4033. 4071. 338. 260.
## 3 Transformed Multiplicative 2.19e-1 -2003. 4026. 4027. 4065. 343. 262.
## 4 ARIMA 1.76e+2 -1461. 2931. 2931. 2951. 352. 274.
## 5 Multiplicative ETS 7.28e-1 -3192. 6404. 6405. 6443. 354. 277.
## 6 Additive_Transformed_Non_Season… 1.97e+2 -2040. 4085. 4085. 4097. 363. 292.
## 7 Multiplicative_Transformed_Non_… 2.38e-1 -2040. 4085. 4085. 4097. 363. 292.
## 8 SNAIVE 2.06e+5 NA NA NA NA 453. 346.
## 9 Transformed SNAIVE 2.89e+2 NA NA NA NA 453. 346.
Here the rankings are mixed: Additive ETS has the lowest RMSE and Transformed_Additive the lowest MAE, with ARIMA close behind on both, and the information criteria are not comparable across the different scales and model classes. Since ARIMA's accuracy is competitive and its residual diagnostics (below) are clean, we again go with ARIMA for the forecast.
atm4_fit |>
select(.model = "ARIMA") |>
report()
## Series: cash
## Model: ARIMA(0,0,1)(2,0,0)[7] w/ mean
## Transformation: box_cox(cash, lambda4)
##
## Coefficients:
## ma1 sar1 sar2 constant
## 0.0790 0.2078 0.2023 16.8928
## s.e. 0.0527 0.0516 0.0525 0.7318
##
## sigma^2 estimated as 176.5: log likelihood=-1460.57
## AIC=2931.13 AICc=2931.3 BIC=2950.63
forecast_atm4 <- atm4_fit |>
forecast(h = 31) |>
filter(.model=='ARIMA')
atm4_fit |>
select(ARIMA) |>
gg_tsresiduals()
The residual time plot stays stable around zero with no clear pattern, though there are a few noticeable outliers. In the ACF plot, the spikes generally stay within the confidence bounds, again showing no clear structure. The histogram suggests a nearly normal distribution, although its peak does not sit exactly at zero.
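As a rough check on that apparent offset, we can compute the mean innovation residual; a quick sketch, where a value near zero would suggest the shift in the histogram is minor:
# Mean innovation residual for the ATM4 ARIMA model (a quick sanity check)
atm4_fit |>
  select(.model = "ARIMA") |>
  augment() |>
  as_tibble() |>
  summarise(mean_innov = mean(.innov, na.rm = TRUE))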
forecast_atm4 |>
autoplot(atm4)
Perform diagnostic checks using the Ljung-Box and Box-Pierce tests.
atm4_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, box_pierce, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 .model 14.2 0.163
atm4_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 .model 14.6 0.147
As with the other machines, both p-values are well above 0.05, so the residuals show no significant remaining autocorrelation and the ARIMA model is adequate for ATM4.
forecast <- bind_rows(forecast_atm1, forecast_atm2, forecast_atm3, forecast_atm4)
Export the combined May 2010 forecasts to an Excel-readable file.
# Keep only Excel-serializable columns; the fable's distribution column cannot be written out
forecast_out <- forecast |>
  as_tibble() |>
  select(atm, date, cash = .mean)
write.xlsx(forecast_out, file = "/Users/ulianaplotnikova/Downloads/ATM624Data.xlsx",
           sheetName = "ATM May Forecast", overwrite = TRUE)
ATM_FORECAST <- read_excel("/Users/ulianaplotnikova/Downloads/ATM624Data.xlsx", sheet = "ATM May Forecast")
Part B consists of a simple dataset of residential power usage from January 1998 until December 2013. Your assignment is to model these data and provide a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in kilowatt-hours; the rest is straightforward. Add this to your existing files above.
power_data <- read.csv("https://raw.githubusercontent.com/uplotnik/DATA-624/refs/heads/main/ResidentialCustomerForecastLoad-624.csv")
# Initial data exploration
head(power_data)
## CaseSequence YYYY.MMM KWH
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
str(power_data)
## 'data.frame': 192 obs. of 3 variables:
## $ CaseSequence: int 733 734 735 736 737 738 739 740 741 742 ...
## $ YYYY.MMM : chr "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
## $ KWH : int 6862583 5838198 5420658 5010364 4665377 6467147 8914755 8607428 6989888 6345620 ...
summary(power_data)
## CaseSequence YYYY.MMM KWH
## Min. :733.0 Length:192 Min. : 770523
## 1st Qu.:780.8 Class :character 1st Qu.: 5429912
## Median :828.5 Mode :character Median : 6283324
## Mean :828.5 Mean : 6502475
## 3rd Qu.:876.2 3rd Qu.: 7620524
## Max. :924.0 Max. :10655730
## NA's :1
Convert the data into a time series object and visualize
# Convert the data into a time series object
ts_data <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)
autoplot(ts_data) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE) +
  labs(title = "Residential Power Usage (1998-2013)", x = "Time", y = "KWH") +
  theme_minimal()
Handle missing values:
- Check for missing values
- Impute missing values using linear interpolation
- Visualize the imputed time series
# Check for missing values
sum(is.na(ts_data))
## [1] 1
# Impute missing values using linear interpolation
ts_data_imputed <- na_interpolation(ts_data, option = "linear")
# Verify that missing values are handled
sum(is.na(ts_data_imputed))
## [1] 0
# Visualize the imputed time series
autoplot(ts_data_imputed) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE) +
  labs(title = "Residential Power Usage (1998-2013), Missing Value Imputed", x = "Time", y = "KWH") +
  theme_minimal()
After imputing the missing value, the plot still shows one extreme outlier: the sharp dip in mid-2010. In the next step we identify it with the boxplot (IQR) rule, which flags values outside [Q1 - 1.5 IQR, Q3 + 1.5 IQR], and replace it with the series median.
Handle Outliers
Detect outliers using the boxplot method
boxplot_stats <- boxplot(ts_data_imputed, plot = FALSE)
outliers <- ts_data_imputed %in% boxplot_stats$out
outlier_indices <- which(outliers)
print(outlier_indices)
## [1] 151
print(ts_data_imputed[outlier_indices])
## [1] 770523
The single flagged observation is index 151 (July 2010), where usage drops to 770,523 KWH, roughly a tenth of typical monthly consumption, which is most likely a recording error. We replace it with the series median (a month-specific median would also be defensible) and re-plot.
# Handle outliers by replacement with the median
median_value <- median(ts_data_imputed, na.rm = TRUE)
ts_data_imputed[outlier_indices] <- median_value
# Verify that outliers are handled
boxplot_stats_after <- boxplot(ts_data_imputed, plot = FALSE)
outliers_after <- ts_data_imputed %in% boxplot_stats_after$out
sum(outliers_after) #checking if any outliers are remaining
## [1] 0
autoplot(ts_data_imputed, main = "Residential Power Usage (1998-2013) - Outliers Handled", xlab = "Time", ylab = "KWH") +
geom_line() +
geom_smooth(method = "loess", se = FALSE) +
scale_x_continuous(breaks = seq(1998, 2013, by = 1), labels = function(x) floor(x)) +
theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'
Decompose the time series
# Decompose the time series
decomposed_ts <- decompose(ts_data_imputed)
# Plot the decomposed time series
autoplot(decomposed_ts) +
theme_minimal() +
labs(title = "Decomposition of Residential Power Usage",
subtitle = "Classical Decomposition",
caption = "Source: Your Data Source")
Visualize the seasonality
ggseasonplot(ts_data_imputed,
year.labels=TRUE, year.labels.left=TRUE) +
ylab("KWH") +
ggtitle("Seasonal plot: Residential Power Usage") +
theme_minimal()
Split Data into Training and Testing Sets
train_data <- window(ts_data_imputed, start = c(1998, 1), end = c(2012, 12))
test_data <- window(ts_data_imputed, start = c(2013, 1), end = c(2013, 12))
head(train_data)
## Jan Feb Mar Apr May Jun
## 1998 6862583 5838198 5420658 5010364 4665377 6467147
head(test_data)
## Jan Feb Mar Apr May Jun
## 2013 10655730 7681798 6517514 6105359 5940475 7920627
# Define evaluation metrics
calculate_rmse <- function(forecast_values, actual_values) {
sqrt(mean((forecast_values - actual_values)^2))
}
calculate_mae <- function(forecast_values, actual_values) {
mean(abs(forecast_values - actual_values))
}
# Define the horizon
h <- 12
# Estimate lambda for Box-Cox transformation
lambda <- BoxCox.lambda(train_data)
# Define models
models <- list(
Additive_ETS = list(fit = function(x) ets(x, model="AAN", damped = FALSE), forecast = function(fit, h) forecast(fit, h=h)),
Multiplicative_ETS = list(fit = function(x) ets(x, model="MAM", damped = FALSE), forecast = function(fit, h) forecast(fit, h=h)),
Additive_Damped = list(fit = function(x) ets(x, model="AAN", damped = TRUE), forecast = function(fit, h) forecast(fit, h=h)),
SNAIVE = list(fit = function(x) snaive(x, h=h), forecast = function(fit, h) forecast(fit, h=h)),
Additive_BC_ETS = list(fit = function(x) ets(x, model="AAN", damped = FALSE, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
BC_Additive_Damped = list(fit = function(x) ets(x, model="AAN", damped = TRUE, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
BC_SNAIVE = list(fit = function(x) snaive(x, h=h, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
ARIMA = list(fit = function(x) auto.arima(x), forecast = function(fit, h) forecast(fit, h=h)),
ARIMA_BC = list(fit = function(x) auto.arima(x, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h))
)
Evaluate models
# Evaluate models
results <- lapply(names(models), function(model_name) {
model <- models[[model_name]]
fit <- model$fit(train_data)
forecast_values <- model$forecast(fit, h)$mean
rmse <- calculate_rmse(forecast_values, test_data)
mae <- calculate_mae(forecast_values, test_data)
if(model_name %in% c("SNAIVE", "BC_SNAIVE")) {
aic <- NA
bic <- NA
} else {
aic <- AIC(fit)
bic <- BIC(fit)
}
data.frame(
Model = model_name,
AIC = aic,
BIC = bic,
RMSE = rmse,
MAE = mae
)
})
results <- do.call(rbind, results)
# Print the results
print(results)
## Model AIC BIC RMSE MAE
## 1 Additive_ETS 6039.5454 6055.5102 1600434.6 1392817.4
## 2 Multiplicative_ETS 5739.3988 5793.6790 1021694.1 697734.0
## 3 Additive_Damped 6020.7978 6039.9555 1617817.4 1402941.9
## 4 SNAIVE NA NA 1035537.6 618605.6
## 5 Additive_BC_ETS -637.9292 -621.9644 1594553.9 1310730.0
## 6 BC_Additive_Damped -635.1095 -615.9518 1578975.3 1354114.5
## 7 BC_SNAIVE NA NA 1035537.6 618605.6
## 8 ARIMA 4963.0198 4991.1355 944443.3 616922.5
## 9 ARIMA_BC -1245.9730 -1230.3532 942928.2 631961.6
From the comparison, ARIMA_BC is the strongest candidate: it achieves the lowest RMSE on the held-out 2013 test set, narrowly ahead of the untransformed ARIMA. (Its AIC and BIC look dramatically lower only because the likelihood is computed on the Box-Cox scale; information criteria are not comparable across transformations.) We therefore proceed with ARIMA_BC.
Fit the Best Model and Generate Forecasts
# Fit the best model on the training data
arima_bc_fit <- auto.arima(train_data, lambda = lambda, biasadj = TRUE)
# Report the model details
arima_bc_fit
## Series: train_data
## ARIMA(1,0,0)(2,1,0)[12] with drift
## Box Cox transformation: lambda= -0.1777466
##
## Coefficients:
## ar1 sar1 sar2 drift
## 0.2759 -0.7395 -0.4097 1e-04
## s.e. 0.0752 0.0746 0.0799 1e-04
##
## sigma^2 = 3.437e-05: log likelihood = 627.99
## AIC=-1245.97 AICc=-1245.6 BIC=-1230.35
# Generate forecasts
arima_bc_forecast <- forecast(arima_bc_fit, h = 12)
arima_bc_forecast
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2013 9052914 7918355 10257382 7408489 11007118
## Feb 2013 8703168 7580813 9896838 7078922 10643458
## Mar 2013 6983552 6114030 7906279 5722837 8480019
## Apr 2013 5896085 5182828 6651658 4860393 7119260
## May 2013 5513817 4854424 6211852 4555780 6643054
## Jun 2013 7712422 6735469 8750269 6297234 9397449
## Jul 2013 8340141 7269425 9478562 6790247 10190076
## Aug 2013 9441293 8203222 10759460 7651225 11586344
## Sep 2013 8416225 7334061 9566934 6849892 10286318
## Oct 2013 5914450 5198578 6672818 4874990 7142191
## Nov 2013 5647758 4969557 6365878 4662601 6809778
## Dec 2013 7163335 6267344 8114420 5864556 8706242
# Plot the residuals
checkresiduals(arima_bc_fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(2,1,0)[12] with drift
## Q* = 29.789, df = 21, p-value = 0.09631
##
## Model df: 3. Total lags used: 24
The Ljung-Box p-value of 0.096 is above 0.05, so there is no significant evidence of remaining autocorrelation in the residuals.
# Autoplot of the forecast
autoplot(arima_bc_forecast) +
autolayer(fitted(arima_bc_forecast), series="Fitted") +
ylab("KWH") +
xlab("Time") +
ggtitle("Forecasts from ARIMA_BC Model") +
theme_minimal()
Model Accuracy
accuracy(arima_bc_forecast)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 6095.62 602379.8 471265.9 -0.4836496 7.302183 0.7406487 0.1032934
A training-set MAPE of about 7.3% and a MASE below 1 (i.e., better than the naive benchmark) indicate a reasonable in-sample fit.
Forecast Values
# Print the forecast values
print(arima_bc_forecast$mean)
## Jan Feb Mar Apr May Jun Jul Aug Sep
## 2013 9052914 8703168 6983552 5896085 5513817 7712422 8340141 9441293 8416225
## Oct Nov Dec
## 2013 5914450 5647758 7163335
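One caveat before exporting: the forecasts above cover 2013, the held-out test year, whereas the assignment asks for monthly 2014 values. The final deliverable should therefore come from the same specification refitted on the full cleaned series. A minimal sketch (reusing the lambda estimated above; note auto.arima may choose a slightly different order on the full data):
# Refit on the full 1998-2013 series and forecast the 12 months of 2014
final_fit <- auto.arima(ts_data_imputed, lambda = lambda, biasadj = TRUE)
forecast_2014 <- forecast(final_fit, h = 12)
forecast_2014$mean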
# Export the 2014 forecast as a plain data frame (write.xlsx cannot serialize a forecast object cleanly)
power_out <- data.frame(
  Month = format(zoo::as.yearmon(time(forecast_2014$mean)), "%Y-%b"),
  KWH = as.numeric(forecast_2014$mean)
)
write.xlsx(power_out, file = "/Users/ulianaplotnikova/Downloads/ATM624Data1.xlsx",
           sheetName = "Forecasting Power", overwrite = TRUE)
library(tidyverse)
library(fpp3)
library(tseries)
library(forecast)
library(kableExtra)
library(reactable)
library(seasonal)
library(tsibble)
library(openxlsx)
library(readxl)
library(mice)
library(caret)
library(zoo)
library(lubridate)
library(imputeTS)
library(naniar)
library(timeplyr)
library(rstatix)
library(timetk)
# Load ATM data and convert to proper format
atm<- read_excel("/Users/ulianaplotnikova/Desktop/Data624/ATM624Data.xlsx",col_types = c('date', 'text', 'numeric'))
# make all column names lowercase
atm <- atm |>
rename_with(tolower)
glimpse(atm)
# read_excel already parsed the dates as POSIXct, so as.Date() simply drops the time component
atm <- atm |>
  mutate(
    date = as.Date(date)
  )
# Converting the dataset into a tsibble
atm_ts <- atm |>
as_tsibble(
index = date,
key = atm
)
head(atm_ts)
atm_ts |>
autoplot(cash) +
facet_wrap(~atm, scales = "free", nrow = 4) +
labs(title = "ATM Withdrawals: May 2009 - April 2010",
subtitle = "Daily cash withdrawal amounts",
x = "Date",
y = "Cash withdrawals (hundreds)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
plot.subtitle = element_text(hjust = 0.5, size = 11),
strip.text = element_text(face = "bold"),
legend.position = "none")
atm_ts |>
ggplot(aes(x = cash, fill = atm)) +
geom_histogram(bins = 30, alpha = 0.8, color = "white") +
facet_wrap(~atm, ncol = 2, scales = "free") +
labs(title = "Distribution of ATM Withdrawals: May 2009 - April 2010",
subtitle = "Histograms showing withdrawal frequency distribution",
x = "Withdrawal amount (hundreds)",
y = "Frequency") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
plot.subtitle = element_text(hjust = 0.5, size = 11),
strip.text = element_text(face = "bold"),
legend.position = "none") +
scale_fill_brewer(palette = "Set1")
# Check for null entries
atm |>
group_by(atm) |>
summarise(
null_count = sum(is.na(cash))
)
atm_ts |>
filter(
is.na(cash),
!is.na(atm)
)
# Filter out the NA values
atm_ts <- atm_ts |>
filter(
!is.na(atm)
)
# Create timeseries
atm1 <- atm_ts |>
filter(
atm == "ATM1"
) |>
mutate(
cash = na.approx(cash, na.rm = FALSE)
)
atm2 <- atm_ts |>
filter(
atm == "ATM2"
) |>
mutate(
cash = na.approx(cash, na.rm = FALSE)
)
atm3 <- atm_ts |>
filter(
atm == "ATM3"
)
atm4 <- atm_ts |>
filter(
atm == "ATM4"
) |>
mutate(
cash = if_else(cash > 9000, NA_real_, cash)
) |>
mutate(
cash = na.approx(cash, na.rm = FALSE)
)
# Combining all the ATM datasets after preprocessing
atm_clean <- bind_rows(atm1, atm2, atm3, atm4)
# Check the cleaned dataset
head(atm_clean)
summary(atm_clean)
# Add temporal features
atm_features <- atm_clean %>%
mutate(
day_of_week = wday(date, label = TRUE),
month = month(date, label = TRUE),
day_of_month = day(date),
is_weekend = if_else(wday(date) %in% c(1, 7), TRUE, FALSE)
)
# View the enhanced dataset
head(atm_features)
atm_features %>%
ggplot(aes(x = day_of_week, y = cash, fill = day_of_week)) +
geom_boxplot() +
facet_wrap(~atm, scales = "free_y") +
labs(title = "Weekly Distribution of ATM Cash Withdrawals",
x = "Day of Week",
y = "Cash (hundreds)") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
# Monthly patterns
atm_features %>%
ggplot(aes(x = month, y = cash, fill = month)) +
geom_boxplot() +
facet_wrap(~atm, scales = "free_y") +
labs(title = "Monthly Patterns in ATM Cash Withdrawals",
x = "Month",
y = "Cash (hundreds)") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
# Monthly trend analysis
atm_features %>%
group_by(atm, month) %>%
summarise(
total_cash = sum(cash, na.rm = TRUE),
avg_daily_cash = mean(cash, na.rm = TRUE),
.groups = "drop"
) %>%
ggplot(aes(x = month, y = total_cash, group = atm, color = atm)) +
geom_line(linewidth = 1) +
geom_point(size = 3, shape = 21, fill = "white") +
labs(
title = "Monthly Total Cash Withdrawals by ATM",
subtitle = "Showing seasonal patterns across locations",
x = "Month",
y = "Total cash withdrawals (hundreds)",
color = "ATM machine"
) +
scale_color_brewer(palette = "Dark2") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom"
)
atm_features %>%
group_by(atm, is_weekend) %>%
summarise(
avg_withdrawal = mean(cash, na.rm = TRUE),
med_withdrawal = median(cash, na.rm = TRUE),
sd_withdrawal = sd(cash, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(day_type = if_else(is_weekend, "Weekend", "Weekday")) %>%
ggplot(aes(x = atm, y = avg_withdrawal, fill = day_type)) +
geom_col(position = "dodge", color = "white") +
geom_errorbar(
aes(ymin = avg_withdrawal - sd_withdrawal,
ymax = avg_withdrawal + sd_withdrawal),
position = position_dodge(0.9),
width = 0.2
) +
labs(
title = "Weekend vs Weekday Cash Withdrawal Patterns",
subtitle = "Average withdrawals with standard deviation error bars",
x = "ATM Location",
y = "Average cash withdrawal (hundreds)",
fill = "Day Type"
) +
scale_fill_brewer(palette = "Set1") +
theme_light() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
legend.position = "bottom"
)
# decompose and plot time series
decompose_and_plot_enhanced <- function(atm_features, atm_name, color_scheme) {
# Convert to ts object
ts_data <- ts(atm_features$cash, frequency = 7) # Weekly seasonality
# Decompose
decomp <- decompose(ts_data, type = "additive")
# Create custom plot with enhanced appearance
component_names <- c("Observed", "Trend", "Seasonal", "Random")
component_data <- data.frame(
Date = rep(atm_features$date, 4),
Value = c(decomp$x, decomp$trend, decomp$seasonal, decomp$random),
Component = rep(component_names, each = length(decomp$x))
)
# Remove NA values
component_data <- component_data[!is.na(component_data$Value), ]
# Create the plot
ggplot(component_data, aes(x = Date, y = Value)) +
geom_line(color = color_scheme, linewidth = 0.8) +
facet_wrap(~Component, scales = "free_y", ncol = 1) +
labs(
title = paste("Time Series Decomposition:", atm_name),
subtitle = "Additive decomposition showing trend, seasonal, and random components",
y = "Cash withdrawals (hundreds)",
caption = paste("Data period:", min(atm_features$date), "to", max(atm_features$date))
) +
theme_minimal() +
theme(
strip.text = element_text(face = "bold", size = 12),
strip.background = element_rect(fill = "lightgrey", color = NA),
plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
plot.subtitle = element_text(hjust = 0.5, color = "darkgrey", size = 11),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 1, face = "italic", size = 9)
)
}
atm1_decomp<- decompose_and_plot_enhanced(atm1, "ATM1", "steelblue")
atm2_decomp <- decompose_and_plot_enhanced(atm2, "ATM2", "darkred")
atm3_decomp <- decompose_and_plot_enhanced(atm3, "ATM3", "forestgreen")
atm4_decomp <- decompose_and_plot_enhanced(atm4, "ATM4", "darkorange")
# Display the plots
atm1_decomp
atm2_decomp
atm3_decomp
atm4_decomp
atm1 |>
gg_tsdisplay(cash, plot_type = 'partial') +
labs(title = 'ATM1 Cash Withdrawals')
lambda <- atm1 |>
features(cash, features = guerrero) |>
pull(lambda_guerrero)
atm1_fit <- atm1 |>
model(
"ARIMA" = ARIMA(box_cox(cash,lambda)),
"Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
"Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
"SNAIVE" = SNAIVE(cash),
"Transformed Additive" = ETS(box_cox(cash,lambda) ~ error("A") + trend("N") + season("A")),
"Transformed Multiplicative" = ETS(box_cox(cash,lambda) ~ error("M") + trend("N") + season("M")),
"Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda))
)
left_join(glance(atm1_fit) |>
select(.model:BIC),
accuracy(atm1_fit) |>
select(.model, RMSE, MAE)) |>
arrange(RMSE)
atm1_fit |>
select(.model = "ARIMA") |>
report()
forecast_atm1 <- atm1_fit |>
forecast(h = 31) |>
filter(.model=='ARIMA')
atm1_fit |>
select(ARIMA) |>
gg_tsresiduals()
forecast_atm1 |>
autoplot(atm1)
atm1_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, box_pierce, lag = 10, dof = 0)
atm1_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, ljung_box, lag = 10, dof = 0)
atm2 |>
gg_tsdisplay(cash, plot_type = 'partial') +
labs(title = 'ATM2 CASH Withdrawals')
lambda2 <- atm2 |>
features(cash, features = guerrero) |>
pull(lambda_guerrero)
atm2_fit <- atm2 |>
model(
"ARIMA" = ARIMA(box_cox(cash,lambda2)),
"Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
"Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
"SNAIVE" = SNAIVE(cash),
"Transformed Additive" = ETS(box_cox(cash,lambda2) ~ error("A") + trend("N") + season("A")),
"Transformed Multiplicative" = ETS(box_cox(cash,lambda2) ~ error("M") + trend("N") + season("M")),
"Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda2))
)
left_join(glance(atm2_fit) |>
select(.model:BIC),
accuracy(atm2_fit) |>
select(.model, RMSE, MAE)) |>
arrange(RMSE)
atm2_fit |>
select(.model = "ARIMA") |>
report()
forecast_atm2 <- atm2_fit |>
forecast(h = 31) |>
filter(.model=='ARIMA')
atm2_fit |>
select(ARIMA) |>
gg_tsresiduals()
forecast_atm2 |>
autoplot(atm2)
atm2_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, box_pierce, lag = 10, dof = 0)
atm2_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, ljung_box, lag = 10, dof = 0)
atm3 |>
gg_tsdisplay(cash, plot_type = 'partial') +
labs(title = 'ATM3 Cash Withdrawals')
atm3_fit <- atm3 |>
model(NAIVE(cash))
forecast_atm3 <- atm3_fit |>
forecast(h = 31)
forecast_atm3 |>
autoplot(atm3)
atm4 |>
gg_tsdisplay(cash, plot_type = 'partial') +
labs(title = 'ATM4 Cash Withdrawals')
lambda4 <- atm4 |>
features(cash, features = guerrero) |>
pull(lambda_guerrero)
atm4_fit <- atm4 |>
model(
"ARIMA" = ARIMA(box_cox(cash,lambda4)),
"Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
"Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
"SNAIVE" = SNAIVE(cash),
"Transformed_Additive" = ETS(box_cox(cash,lambda4) ~ error("A") + trend("N") + season("A")),
"Transformed Multiplicative" = ETS(box_cox(cash,lambda4) ~ error("M") + trend("N") + season("M")),
"Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda4)),
"Additive_Transformed_Non_Seasonal" = ETS(box_cox(cash,lambda4) ~ error("A") + trend("N") + season("N")),
"Multiplicative_Transformed_Non_Seasonal" = ETS(box_cox(cash,lambda4) ~ error("M") + trend("N") + season("N"))
)
left_join(glance(atm4_fit) |>
select(.model:BIC),
accuracy(atm4_fit) |>
select(.model, RMSE, MAE)) |>
arrange(RMSE)
atm4_fit |>
select(.model = "ARIMA") |>
report()
forecast_atm4 <- atm4_fit |>
forecast(h = 31) |>
filter(.model=='ARIMA')
atm4_fit |>
select(ARIMA) |>
gg_tsresiduals()
forecast_atm4 |>
autoplot(atm4)
atm4_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, box_pierce, lag = 10, dof = 0)
atm4_fit |>
select(.model = "ARIMA") |>
augment() |>
features(.innov, ljung_box, lag = 10, dof = 0)
forecast <- bind_rows(forecast_atm1, forecast_atm2, forecast_atm3, forecast_atm4)
# Keep only Excel-serializable columns; the fable's distribution column cannot be written out
forecast_out <- forecast |>
  as_tibble() |>
  select(atm, date, cash = .mean)
write.xlsx(forecast_out, file = "/Users/ulianaplotnikova/Downloads/ATM624Data.xlsx",
           sheetName = "ATM May Forecast", overwrite = TRUE)
ATM_FORECAST <- read_excel("/Users/ulianaplotnikova/Downloads/ATM624Data.xlsx", sheet = "ATM May Forecast")
power_data <- read.csv("https://raw.githubusercontent.com/uplotnik/DATA-624/refs/heads/main/ResidentialCustomerForecastLoad-624.csv")
# Initial data exploration
head(power_data)
str(power_data)
summary(power_data)
# Convert the data into a time series object
ts_data <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)
autoplot(ts_data) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE) +
  labs(title = "Residential Power Usage (1998-2013)", x = "Time", y = "KWH") +
  theme_minimal()
# Check for missing values
sum(is.na(ts_data))
# Impute missing values using linear interpolation
ts_data_imputed <- na_interpolation(ts_data, option = "linear")
# Verify that missing values are handled
sum(is.na(ts_data_imputed))
# Visualize the imputed time series
autoplot(ts_data_imputed) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE) +
  labs(title = "Residential Power Usage (1998-2013), Missing Value Imputed", x = "Time", y = "KWH") +
  theme_minimal()
boxplot_stats <- boxplot(ts_data_imputed, plot = FALSE)
outliers <- ts_data_imputed %in% boxplot_stats$out
outlier_indices <- which(outliers)
print(outlier_indices)
print(ts_data_imputed[outlier_indices])
# Handle outliers by replacement with the median
median_value <- median(ts_data_imputed, na.rm = TRUE)
ts_data_imputed[outlier_indices] <- median_value
# Verify that outliers are handled
boxplot_stats_after <- boxplot(ts_data_imputed, plot = FALSE)
outliers_after <- ts_data_imputed %in% boxplot_stats_after$out
sum(outliers_after) #checking if any outliers are remaining
autoplot(ts_data_imputed, main = "Residential Power Usage (1998-2013) - Outliers Handled", xlab = "Time", ylab = "KWH") +
geom_line() +
geom_smooth(method = "loess", se = FALSE) +
scale_x_continuous(breaks = seq(1998, 2013, by = 1), labels = function(x) floor(x)) +
theme_minimal()
# Decompose the time series
decomposed_ts <- decompose(ts_data_imputed)
# Plot the decomposed time series
autoplot(decomposed_ts) +
theme_minimal() +
labs(title = "Decomposition of Residential Power Usage",
subtitle = "Classical Decomposition",
caption = "Source: Your Data Source")
ggseasonplot(ts_data_imputed,
year.labels=TRUE, year.labels.left=TRUE) +
ylab("KWH") +
ggtitle("Seasonal plot: Residential Power Usage") +
theme_minimal()
train_data <- window(ts_data_imputed, start = c(1998, 1), end = c(2012, 12))
test_data <- window(ts_data_imputed, start = c(2013, 1), end = c(2013, 12))
head(train_data)
head(test_data)
# Define evaluation metrics
calculate_rmse <- function(forecast_values, actual_values) {
sqrt(mean((forecast_values - actual_values)^2))
}
calculate_mae <- function(forecast_values, actual_values) {
mean(abs(forecast_values - actual_values))
}
# Define the horizon
h <- 12
# Estimate lambda for Box-Cox transformation
lambda <- BoxCox.lambda(train_data)
# Define models
models <- list(
Additive_ETS = list(fit = function(x) ets(x, model="AAN", damped = FALSE), forecast = function(fit, h) forecast(fit, h=h)),
Multiplicative_ETS = list(fit = function(x) ets(x, model="MAM", damped = FALSE), forecast = function(fit, h) forecast(fit, h=h)),
Additive_Damped = list(fit = function(x) ets(x, model="AAN", damped = TRUE), forecast = function(fit, h) forecast(fit, h=h)),
SNAIVE = list(fit = function(x) snaive(x, h=h), forecast = function(fit, h) forecast(fit, h=h)),
Additive_BC_ETS = list(fit = function(x) ets(x, model="AAN", damped = FALSE, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
BC_Additive_Damped = list(fit = function(x) ets(x, model="AAN", damped = TRUE, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
BC_SNAIVE = list(fit = function(x) snaive(x, h=h, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
ARIMA = list(fit = function(x) auto.arima(x), forecast = function(fit, h) forecast(fit, h=h)),
ARIMA_BC = list(fit = function(x) auto.arima(x, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h))
)
# Evaluate models
results <- lapply(names(models), function(model_name) {
model <- models[[model_name]]
fit <- model$fit(train_data)
forecast_values <- model$forecast(fit, h)$mean
rmse <- calculate_rmse(forecast_values, test_data)
mae <- calculate_mae(forecast_values, test_data)
if(model_name %in% c("SNAIVE", "BC_SNAIVE")) {
aic <- NA
bic <- NA
} else {
aic <- AIC(fit)
bic <- BIC(fit)
}
data.frame(
Model = model_name,
AIC = aic,
BIC = bic,
RMSE = rmse,
MAE = mae
)
})
results <- do.call(rbind, results)
# Print the results
print(results)
# Fit the best model on the training data
arima_bc_fit <- auto.arima(train_data, lambda = lambda, biasadj = TRUE)
# Report the model details
arima_bc_fit
# Generate forecasts
arima_bc_forecast <- forecast(arima_bc_fit, h = 12)
arima_bc_forecast
# Plot the residuals
checkresiduals(arima_bc_fit)
# Autoplot of the forecast
autoplot(arima_bc_forecast) +
autolayer(fitted(arima_bc_forecast), series="Fitted") +
ylab("KWH") +
xlab("Time") +
ggtitle("Forecasts from ARIMA_BC Model") +
theme_minimal()
accuracy(arima_bc_forecast)
# Print the forecast values
print(arima_bc_forecast$mean)
# Refit on the full series, forecast the 12 months of 2014, and export a plain data frame
final_fit <- auto.arima(ts_data_imputed, lambda = lambda, biasadj = TRUE)
forecast_2014 <- forecast(final_fit, h = 12)
power_out <- data.frame(
  Month = format(zoo::as.yearmon(time(forecast_2014$mean)), "%Y-%b"),
  KWH = as.numeric(forecast_2014$mean)
)
write.xlsx(power_out, file = "/Users/ulianaplotnikova/Downloads/ATM624Data1.xlsx",
           sheetName = "Forecasting Power", overwrite = TRUE)