Part A – ATM Forecast

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file; please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast in an Excel-readable file.

Explore Data

First, let’s take a look at the data.

# Load ATM data and convert to proper format
atm<- read_excel("/Users/ulianaplotnikova/Desktop/Data624/ATM624Data.xlsx",col_types = c('date', 'text', 'numeric'))
# make all column names lowercase
atm <- atm |>
  rename_with(tolower)

glimpse(atm)
## Rows: 1,474
## Columns: 3
## $ date <dttm> 2009-05-01, 2009-05-01, 2009-05-02, 2009-05-02, 2009-05-03, 2009…
## $ atm  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …

read_excel returns the date column as a datetime, so we convert it to a plain Date. The atm identifier stays as a character column and serves as the key when we convert the data frame into a tsibble.

# Converting the date column to a date datatype
atm <- atm |>
  mutate(
    date = as.Date(date, origin = "1900-01-01")
  )

# Converting the dataset into a tsibble
atm_ts <- atm |>
  as_tsibble(
    index = date,
    key = atm
  )

head(atm_ts)
## # A tsibble: 6 x 3 [1D]
## # Key:       atm [1]
##   date       atm    cash
##   <date>     <chr> <dbl>
## 1 2009-05-01 ATM1     96
## 2 2009-05-02 ATM1     82
## 3 2009-05-03 ATM1     85
## 4 2009-05-04 ATM1     90
## 5 2009-05-05 ATM1     99
## 6 2009-05-06 ATM1     88

Let’s take a look at the daily cash withdrawal amounts.

atm_ts |>
  autoplot(cash) +
  facet_wrap(~atm, scales = "free", nrow = 4) +
  labs(title = "ATM Withdrawals: May 2009 - April 2010",
       subtitle = "Daily cash withdrawal amounts",
       x = "Date",
       y = "Cash withdrawals (hundreds)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
        plot.subtitle = element_text(hjust = 0.5, size = 11),
        strip.text = element_text(face = "bold"),
        legend.position = "none")

ATM1 and ATM2 show relatively consistent withdrawal patterns throughout the year, fluctuating around a mean of roughly 50-75 (hundreds of dollars).
ATM3 records no withdrawals at all until the last few days of April 2010. ATM4 displays a markedly different pattern: very low withdrawals for most of the period, a single sharp spike in early 2010, and then a return to low levels, which points to an anomaly or data-entry error rather than genuine demand. Overall, the series also hint at day-of-week and seasonal variation in ATM usage, with higher withdrawals in certain months; we examine this below.
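A quick numeric check (a sketch using the tsibble we already built; we coerce to a tibble so dplyr’s summarise applies) backs up these visual impressions:

# Per-ATM summary: number of days, share of zero-withdrawal days, and range
atm_ts |>
  as_tibble() |>
  filter(!is.na(atm)) |>
  group_by(atm) |>
  summarise(
    days = n(),
    zero_share = mean(cash == 0, na.rm = TRUE),
    min_cash = min(cash, na.rm = TRUE),
    max_cash = max(cash, na.rm = TRUE)
  )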

atm_ts |>
  ggplot(aes(x = cash, fill = atm)) +
  geom_histogram(bins = 30, alpha = 0.8, color = "white") +
  facet_wrap(~atm, ncol = 2, scales = "free") +
  labs(title = "Distribution of ATM Withdrawals: May 2009 - April 2010",
       subtitle = "Histograms showing withdrawal frequency distribution",
       x = "Withdrawal amount (hundreds)",
       y = "Frequency") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
        plot.subtitle = element_text(hjust = 0.5, size = 11),
        strip.text = element_text(face = "bold"),
        legend.position = "none") +
  scale_fill_brewer(palette = "Set1")

Check for null entries

# Check for null entries
atm |>
  group_by(atm) |>
  summarise(
    null_count = sum(is.na(cash))
  )
## # A tibble: 5 × 2
##   atm   null_count
##   <chr>      <int>
## 1 ATM1           3
## 2 ATM2           2
## 3 ATM3           0
## 4 ATM4           0
## 5 <NA>          14
atm_ts |>
  filter(
    is.na(cash),
    !is.na(atm)
  )
## # A tsibble: 5 x 3 [1D]
## # Key:       atm [2]
##   date       atm    cash
##   <date>     <chr> <dbl>
## 1 2009-06-13 ATM1     NA
## 2 2009-06-16 ATM1     NA
## 3 2009-06-22 ATM1     NA
## 4 2009-06-18 ATM2     NA
## 5 2009-06-24 ATM2     NA

Data Pre-Processing

In this section we process and clean the dataset: we drop the 14 rows with a missing ATM label (which also carry no cash values), linearly interpolate the handful of missing cash observations for ATM1 and ATM2, flag the extreme ATM4 observation (cash > 9000) as missing and interpolate it, and create a separate time series for each ATM.

# Filter out the NA values
atm_ts <- atm_ts |>
  filter(
    !is.na(atm)
  )

# Create timeseries 
atm1 <- atm_ts |>
  filter(
    atm == "ATM1"
  ) |>
  mutate(
    cash = na.approx(cash, na.rm = FALSE)
  )

atm2 <- atm_ts |>
  filter(
    atm == "ATM2"
  ) |>
  mutate(
    cash = na.approx(cash, na.rm = FALSE)
  )


atm3 <- atm_ts |>
  filter(
    atm == "ATM3"
  )


atm4 <- atm_ts |>
  filter(
    atm == "ATM4"
  ) |>
  mutate(
    cash = if_else(cash > 9000, NA_real_, cash)
  ) |>
  mutate(
    cash = na.approx(cash, na.rm = FALSE)
  )
# Combining all the ATM datasets after preprocessing

atm_clean <- bind_rows(atm1, atm2, atm3, atm4)

# Check the cleaned dataset

head(atm_clean)
## # A tsibble: 6 x 3 [1D]
## # Key:       atm [1]
##   date       atm    cash
##   <date>     <chr> <dbl>
## 1 2009-05-01 ATM1     96
## 2 2009-05-02 ATM1     82
## 3 2009-05-03 ATM1     85
## 4 2009-05-04 ATM1     90
## 5 2009-05-05 ATM1     99
## 6 2009-05-06 ATM1     88
summary(atm_clean)
##       date                atm                 cash       
##  Min.   :2009-05-01   Length:1460        Min.   :   0.0  
##  1st Qu.:2009-07-31   Class :character   1st Qu.:   1.0  
##  Median :2009-10-30   Mode  :character   Median :  73.5  
##  Mean   :2009-10-30                      Mean   : 148.1  
##  3rd Qu.:2010-01-29                      3rd Qu.: 114.0  
##  Max.   :2010-04-30                      Max.   :1712.1

To improve forecasting, we will create additional time-based features that could help identify patterns.

# Add temporal features
atm_features <- atm_clean %>%
  mutate(
    day_of_week = wday(date, label = TRUE),
    month = month(date, label = TRUE),
    day_of_month = day(date),
    is_weekend = if_else(wday(date) %in% c(1, 7), TRUE, FALSE)
  )

# View the enhanced dataset
head(atm_features)
## # A tsibble: 6 x 7 [1D]
## # Key:       atm [1]
##   date       atm    cash day_of_week month day_of_month is_weekend
##   <date>     <chr> <dbl> <ord>       <ord>        <int> <lgl>     
## 1 2009-05-01 ATM1     96 Fri         May              1 FALSE     
## 2 2009-05-02 ATM1     82 Sat         May              2 TRUE      
## 3 2009-05-03 ATM1     85 Sun         May              3 TRUE      
## 4 2009-05-04 ATM1     90 Mon         May              4 FALSE     
## 5 2009-05-05 ATM1     99 Tue         May              5 FALSE     
## 6 2009-05-06 ATM1     88 Wed         May              6 FALSE

Let’s conduct some further analysis to find out whether there are any weekly, monthly, or other time-based patterns in the data for each ATM.

atm_features %>%
  ggplot(aes(x = day_of_week, y = cash, fill = day_of_week)) +
  geom_boxplot() +
  facet_wrap(~atm, scales = "free_y") +
  labs(title = "Weekly Distribution of ATM Cash Withdrawals",
       x = "Day of Week",
       y = "Cash (hundreds)") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

# Monthly patterns
atm_features %>%
  ggplot(aes(x = month, y = cash, fill = month)) +
  geom_boxplot() +
  facet_wrap(~atm, scales = "free_y") +
  labs(title = "Monthly Patterns in ATM Cash Withdrawals",
       x = "Month",
       y = "Cash (hundreds)") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

# Monthly trend analysis
atm_features %>%
  group_by(atm, month) %>%
  summarise(
    total_cash = sum(cash, na.rm = TRUE),
    avg_daily_cash = mean(cash, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = month, y = total_cash, group = atm, color = atm)) +
  geom_line(linewidth = 1) +
  geom_point(size = 3, shape = 21, fill = "white") +
  labs(
    title = "Monthly Total Cash Withdrawals by ATM",
    subtitle = "Showing seasonal patterns across locations",
    x = "Month",
    y = "Total cash withdrawals (hundreds)",
    color = "ATM machine"
  ) +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )

The graph shows that withdrawals are generally higher in some months and lower in others (a seasonal pattern). For example, totals peak in early fall (September) and decrease toward the end of the year.
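To pin that claim down, we can rank months by total withdrawals across all four machines (a quick sketch; we coerce to a tibble first so dplyr’s summarise applies):

# Rank months by total cash withdrawn across all ATMs
atm_features |>
  as_tibble() |>
  group_by(month) |>
  summarise(total_cash = sum(cash, na.rm = TRUE)) |>
  arrange(desc(total_cash))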

atm_features %>%
  group_by(atm, is_weekend) %>%
  summarise(
    avg_withdrawal = mean(cash, na.rm = TRUE),
    med_withdrawal = median(cash, na.rm = TRUE),
    sd_withdrawal = sd(cash, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(day_type = if_else(is_weekend, "Weekend", "Weekday")) %>%
  ggplot(aes(x = atm, y = avg_withdrawal, fill = day_type)) +
  geom_col(position = "dodge", color = "white") +
  geom_errorbar(
    aes(ymin = avg_withdrawal - sd_withdrawal,
        ymax = avg_withdrawal + sd_withdrawal),
    position = position_dodge(0.9),
    width = 0.2
  ) +
  labs(
    title = "Weekend vs Weekday Cash Withdrawal Patterns",
    subtitle = "Average withdrawals with standard deviation error bars",
    x = "ATM Location",
    y = "Average cash withdrawal (hundreds)",
    fill = "Day Type"
  ) +
  scale_fill_brewer(palette = "Set1") +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "bottom"
  )

Let’s decompose each ATM’s time series to better understand trend, seasonality, and residuals.

# decompose and plot time series
decompose_and_plot_enhanced <- function(atm_features, atm_name, color_scheme) {
  # Convert to ts object
  ts_data <- ts(atm_features$cash, frequency = 7)  # Weekly seasonality

  # Decompose
  decomp <- decompose(ts_data, type = "additive")

  # Create custom plot with enhanced appearance
  component_names <- c("Observed", "Trend", "Seasonal", "Random")
  component_data <- data.frame(
    Date = rep(atm_features$date, 4),
    Value = c(decomp$x, decomp$trend, decomp$seasonal, decomp$random),
    Component = rep(component_names, each = length(decomp$x))
  )

  # Remove NA values
  component_data <- component_data[!is.na(component_data$Value), ]

  # Create the plot
  ggplot(component_data, aes(x = Date, y = Value)) +
    geom_line(color = color_scheme, linewidth = 0.8) +
    facet_wrap(~Component, scales = "free_y", ncol = 1) +
    labs(
      title = paste("Time Series Decomposition:", atm_name),
      subtitle = "Additive decomposition showing trend, seasonal, and random components",
      y = "Cash withdrawals (hundreds)",
      caption = paste("Data period:", min(atm_features$date), "to", max(atm_features$date))
    ) +
    theme_minimal() +
    theme(
      strip.text = element_text(face = "bold", size = 12),
      strip.background = element_rect(fill = "lightgrey", color = NA),
      plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
      plot.subtitle = element_text(hjust = 0.5, color = "darkgrey", size = 11),
      panel.grid.minor = element_blank(),
      plot.caption = element_text(hjust = 1, face = "italic", size = 9)
    )
}

atm1_decomp <- decompose_and_plot_enhanced(atm1, "ATM1", "steelblue")
atm2_decomp <- decompose_and_plot_enhanced(atm2, "ATM2", "darkred")
atm3_decomp <- decompose_and_plot_enhanced(atm3, "ATM3", "forestgreen")
atm4_decomp <- decompose_and_plot_enhanced(atm4, "ATM4", "darkorange")

# Display the plots
atm1_decomp
atm2_decomp
atm3_decomp
atm4_decomp

Forecasting

To make our predictions, we’ll employ a variety of models, evaluate their outcomes, and select the top performer for forecasting. The models in our toolkit include:

- ARIMA
- Additive ETS
- Multiplicative ETS
- SNAIVE
- Transformed Additive
- Transformed Multiplicative
- Transformed SNAIVE

ATM 1

atm1 |>
  gg_tsdisplay(cash, plot_type = 'partial') + 
  labs(title = 'ATM1 Cash Withdrawals')

lambda <- atm1 |>
  features(cash, features = guerrero) |>
  pull(lambda_guerrero)

atm1_fit <- atm1 |>
  model(
    "ARIMA" = ARIMA(box_cox(cash,lambda)),
    "Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
    "Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
    "SNAIVE" = SNAIVE(cash),
    "Transformed Additive" = ETS(box_cox(cash,lambda) ~ error("A") + trend("N") + season("A")),
    "Transformed Multiplicative" = ETS(box_cox(cash,lambda) ~ error("M") + trend("N") + season("M")),
    "Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda))
    
  )


left_join(glance(atm1_fit) |>
            select(.model:BIC), 
          accuracy(atm1_fit) |> 
            select(.model, RMSE, MAE)) |>
  arrange(RMSE)
## Joining with `by = join_by(.model)`
## # A tibble: 7 × 8
##   .model                       sigma2 log_lik   AIC  AICc   BIC  RMSE   MAE
##   <chr>                         <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Transformed Multiplicative   0.0383  -1219. 2458. 2458. 2497.  23.3  15.3
## 2 Additive ETS               580.      -2234. 4487. 4488. 4526.  23.8  15.1
## 3 Transformed Additive         1.80    -1180. 2380. 2380. 2419.  24.9  15.9
## 4 ARIMA                        1.76     -610. 1228. 1228. 1244.  24.9  16.0
## 5 Multiplicative ETS           0.134   -2273. 4566. 4567. 4605.  26.3  16.2
## 6 Transformed SNAIVE           2.48       NA    NA    NA    NA   27.7  17.7
## 7 SNAIVE                     772.         NA    NA    NA    NA   27.7  17.7

Comparing AIC, AICc, and BIC across the models, the ARIMA model posts by far the lowest values (with the caveat that information criteria are only directly comparable between models of the same class fitted to the same, identically transformed, response). Its RMSE and MAE are also within a whisker of the best in the table, so we will go with the ARIMA model as the best fit.
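As a robustness check (a sketch, not part of the original run), rolling-origin cross-validation compares candidates on genuine out-of-sample accuracy and sidesteps the comparability issue; the .init and .step values below are illustrative choices:

# Rolling-origin CV: refit on growing windows, forecast 7 days ahead,
# and score the forecasts against the held-out observations
atm1 |>
  stretch_tsibble(.init = 300, .step = 7) |>
  model(
    ARIMA = ARIMA(box_cox(cash, lambda)),
    SNAIVE = SNAIVE(cash)
  ) |>
  forecast(h = 7) |>
  accuracy(atm1)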

atm1_fit |>
  select(.model = "ARIMA") |>
  report()
## Series: cash 
## Model: ARIMA(0,0,2)(0,1,1)[7] 
## Transformation: box_cox(cash, lambda) 
## 
## Coefficients:
##          ma1      ma2     sma1
##       0.1126  -0.1094  -0.6418
## s.e.  0.0524   0.0520   0.0432
## 
## sigma^2 estimated as 1.764:  log likelihood=-609.99
## AIC=1227.98   AICc=1228.09   BIC=1243.5
forecast_atm1 <- atm1_fit |>
  forecast(h = 31) |>
  filter(.model=='ARIMA')
atm1_fit |>
  select(ARIMA) |>
  gg_tsresiduals()

The ACF of the innovation residuals shows no significant remaining autocorrelation once the weekly seasonal difference and the MA terms are accounted for. The histogram of residuals is approximately normal, and the time plot of the innovation residuals indicates that the ARIMA model fits the data reasonably well, with only minor departures from constant variance and normality.

forecast_atm1 |>
  autoplot(atm1) 

Perform diagnostic checks using the Ljung-Box and Box-Pierce tests.

atm1_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, box_pierce, lag = 10, dof = 0)
## # A tibble: 1 × 3
##   .model bp_stat bp_pvalue
##   <chr>    <dbl>     <dbl>
## 1 .model    6.37     0.784
atm1_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 .model    6.50     0.772

The ARIMA model fits the data well, as indicated by the high p-values from the Ljung-Box and Box-Pierce tests, suggesting the residuals are random.

ATM 2

atm2 |>
  gg_tsdisplay(cash, plot_type = 'partial') + 
  labs(title = 'ATM2 Cash Withdrawals')

lambda2 <- atm2 |>
  features(cash, features = guerrero) |>
  pull(lambda_guerrero)

atm2_fit <- atm2 |>
  model(
    "ARIMA" = ARIMA(box_cox(cash,lambda2)),
    "Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
    "Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
    "SNAIVE" = SNAIVE(cash),
    "Transformed Additive" = ETS(box_cox(cash,lambda2) ~ error("A") + trend("N") + season("A")),
    "Transformed Multiplicative" = ETS(box_cox(cash,lambda2) ~ error("M") + trend("N") + season("M")),
    "Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda2))
 
  )


left_join(glance(atm2_fit) |>
            select(.model:BIC), 
          accuracy(atm2_fit) |> 
            select(.model, RMSE, MAE)) |>
  arrange(RMSE)
## Joining with `by = join_by(.model)`
## # A tibble: 7 × 8
##   .model                      sigma2 log_lik   AIC  AICc   BIC  RMSE   MAE
##   <chr>                        <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA                       68.2    -1263. 2539. 2539. 2562.  24.5  17.1
## 2 Additive ETS               648.     -2254. 4527. 4528. 4566.  25.1  17.9
## 3 Transformed Additive        72.4    -1854. 3728. 3728. 3767.  25.4  17.8
## 4 SNAIVE                     914.        NA    NA    NA    NA   30.2  20.8
## 5 Transformed SNAIVE         101.        NA    NA    NA    NA   30.2  20.8
## 6 Transformed Multiplicative   0.308  -2011. 4043. 4043. 4082.  34.1  27.2
## 7 Multiplicative ETS           0.424  -2399. 4818. 4819. 4857.  34.4  27.0

The ARIMA model has the lowest AIC, AICc, BIC, RMSE, and MAE in the table, winning on both the information criteria and the accuracy measures. So, again, we will go with ARIMA.

atm2_fit |>
  select(.model = "ARIMA") |> 
  report()
## Series: cash 
## Model: ARIMA(2,0,2)(0,1,1)[7] 
## Transformation: box_cox(cash, lambda2) 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2     sma1
##       -0.4256  -0.9051  0.4719  0.7967  -0.7271
## s.e.   0.0588   0.0444  0.0888  0.0590   0.0406
## 
## sigma^2 estimated as 68.16:  log likelihood=-1263.33
## AIC=2538.66   AICc=2538.9   BIC=2561.94
forecast_atm2 <- atm2_fit |>
  forecast(h = 31) |>
  filter(.model=='ARIMA')

atm2_fit |>
  select(ARIMA) |>
  gg_tsresiduals()

The ACF plot shows some autocorrelation, but the residuals plot and histogram indicate that the model is a reasonable fit, with residuals that are approximately normally distributed and show no obvious patterns. The model’s performance seems satisfactory.

forecast_atm2 |>
  autoplot(atm2) 

Perform diagnostic checks using the Ljung-Box and Box-Pierce tests.

atm2_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, box_pierce, lag = 10, dof = 0)
## # A tibble: 1 × 3
##   .model bp_stat bp_pvalue
##   <chr>    <dbl>     <dbl>
## 1 .model    8.88     0.544
atm2_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 .model    9.06     0.527

Both p-values (bp_pvalue and lb_pvalue) are greater than the typical significance level of 0.05. This indicates that there is no significant autocorrelation in the residuals at the specified lag (10). In other words, the model adequately captures the autocorrelation in the data. The ARIMA model appears to be a good fit for the data.
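One refinement worth noting (a sketch, not run above): for a fitted ARIMA the portmanteau degrees of freedom should equal the number of estimated coefficients, five here, which makes the test slightly stricter:

# Ljung-Box with dof set to the 5 estimated ARIMA coefficients
atm2_fit |>
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, ljung_box, lag = 14, dof = 5)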

ATM 3

ATM3 has almost no usable history: every observation is zero until the final days of April 2010, so there is not enough information to support anything more sophisticated than a random-walk (naive) forecast based on the most recent values.
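A quick check (sketch) shows just how little usable history ATM3 has:

# ATM3 activity: every observation is zero except the last few days
atm3 |>
  as_tibble() |>
  filter(cash > 0)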

atm3 |>
  gg_tsdisplay(cash, plot_type = 'partial') + 
  labs(title = 'ATM3 Cash Withdrawals')

atm3_fit <- atm3 |>
  model(NAIVE(cash))

forecast_atm3  <- atm3_fit |>
  forecast(h = 31)

forecast_atm3  |>
  autoplot(atm3)

ATM 4

atm4 |>
  gg_tsdisplay(cash, plot_type = 'partial') + 
  labs(title = 'ATM4 Cash Withdrawals')

lambda4 <- atm4 |>
  features(cash, features = guerrero) |>
  pull(lambda_guerrero)

atm4_fit <- atm4 |>
  model(
    "ARIMA" = ARIMA(box_cox(cash,lambda4)),
    "Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
    "Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
    "SNAIVE" = SNAIVE(cash),
    "Transformed_Additive" = ETS(box_cox(cash,lambda4) ~ error("A") + trend("N") + season("A")),
    "Transformed Multiplicative" = ETS(box_cox(cash,lambda4) ~ error("M") + trend("N") + season("M")),
    "Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda4)),
    "Additive_Transformed_Non_Seasonal" = ETS(box_cox(cash,lambda4) ~ error("A") + trend("N") + season("N")),
    "Multiplicative_Transformed_Non_Seasonal" = ETS(box_cox(cash,lambda4) ~ error("M") + trend("N") + season("N"))
  )


left_join(glance(atm4_fit) |> 
            select(.model:BIC), 
          accuracy(atm4_fit) |> 
            select(.model, RMSE, MAE)) |>
  arrange(RMSE)
## Joining with `by = join_by(.model)`
## # A tibble: 9 × 8
##   .model                            sigma2 log_lik   AIC  AICc   BIC  RMSE   MAE
##   <chr>                              <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Additive ETS                     1.11e+5  -3192. 6404. 6405. 6443.  329.  265.
## 2 Transformed_Additive             1.67e+2  -2006. 4032. 4033. 4071.  338.  260.
## 3 Transformed Multiplicative       2.19e-1  -2003. 4026. 4027. 4065.  343.  262.
## 4 ARIMA                            1.76e+2  -1461. 2931. 2931. 2951.  352.  274.
## 5 Multiplicative ETS               7.28e-1  -3192. 6404. 6405. 6443.  354.  277.
## 6 Additive_Transformed_Non_Season… 1.97e+2  -2040. 4085. 4085. 4097.  363.  292.
## 7 Multiplicative_Transformed_Non_… 2.38e-1  -2040. 4085. 4085. 4097.  363.  292.
## 8 SNAIVE                           2.06e+5     NA    NA    NA    NA   453.  346.
## 9 Transformed SNAIVE               2.89e+2     NA    NA    NA    NA   453.  346.

ARIMA shows the lowest AIC, AICc, and BIC among all the models, and although the Additive ETS edges it slightly on in-sample RMSE and MAE, the differences are modest. So, again, we will go with ARIMA for the forecast.

atm4_fit |>
  select(.model = "ARIMA") |> 
  report()
## Series: cash 
## Model: ARIMA(0,0,1)(2,0,0)[7] w/ mean 
## Transformation: box_cox(cash, lambda4) 
## 
## Coefficients:
##          ma1    sar1    sar2  constant
##       0.0790  0.2078  0.2023   16.8928
## s.e.  0.0527  0.0516  0.0525    0.7318
## 
## sigma^2 estimated as 176.5:  log likelihood=-1460.57
## AIC=2931.13   AICc=2931.3   BIC=2950.63
forecast_atm4 <- atm4_fit |>
  forecast(h = 31) |>
  filter(.model=='ARIMA')
atm4_fit |>
  select(ARIMA) |>
  gg_tsresiduals()

The residual time plot stays centered around zero with no clear patterns, though there are a few noticeable outliers. In the ACF plot the spikes generally stay within the confidence bounds, again showing no clear structure. The histogram suggests a nearly normal distribution, even though the peak of the residuals does not sit exactly at zero.
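Since the residual histogram looks slightly off-center, a quick check of the mean innovation residual (a sketch) quantifies any bias:

# Mean innovation residual; a value near zero indicates an unbiased fit
atm4_fit |>
  select(.model = "ARIMA") |>
  augment() |>
  as_tibble() |>
  summarise(mean_innov = mean(.innov, na.rm = TRUE))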

forecast_atm4 |>
  autoplot(atm4) 

Perform diagnostic checks using the Ljung-Box and Box-Pierce tests.

atm4_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, box_pierce, lag = 10, dof = 0)
## # A tibble: 1 × 3
##   .model bp_stat bp_pvalue
##   <chr>    <dbl>     <dbl>
## 1 .model    14.2     0.163
atm4_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 .model    14.6     0.147
forecast <- bind_rows(forecast_atm1, forecast_atm2, forecast_atm3, forecast_atm4)

Export the Forecast to Excel

write.xlsx(forecast, file = "/Users/ulianaplotnikova/Downloads/ATM624Data.xlsx",
      sheetName = "ATM May Forecast", overwrite = TRUE)
ATM_FORECAST <- read_excel("/Users/ulianaplotnikova/Downloads/ATM624Data.xlsx", sheet = "ATM May Forecast")
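One caveat: the fable returned by forecast() carries a distribution column that does not serialize cleanly to Excel. A safer export (a sketch; forecast_export and the file name are our own choices) keeps only the point forecasts:

# Keep only plain columns before writing to Excel
forecast_export <- forecast |>
  as_tibble() |>
  select(date, atm, forecast_cash = .mean)
write.xlsx(forecast_export,
           file = "/Users/ulianaplotnikova/Downloads/ATM624Forecast.xlsx",
           sheetName = "ATM May Forecast", overwrite = TRUE)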

Part B – Forecasting Power

Part B consists of a simple dataset of residential power usage from January 1998 until December 2013. Your assignment is to model these data and produce a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.

Explore Data

power_data <- read.csv("https://raw.githubusercontent.com/uplotnik/DATA-624/refs/heads/main/ResidentialCustomerForecastLoad-624.csv")
# Initial data exploration

print(head(power_data))
##   CaseSequence YYYY.MMM     KWH
## 1          733 1998-Jan 6862583
## 2          734 1998-Feb 5838198
## 3          735 1998-Mar 5420658
## 4          736 1998-Apr 5010364
## 5          737 1998-May 4665377
## 6          738 1998-Jun 6467147
print(str(power_data))
## 'data.frame':    192 obs. of  3 variables:
##  $ CaseSequence: int  733 734 735 736 737 738 739 740 741 742 ...
##  $ YYYY.MMM    : chr  "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
##  $ KWH         : int  6862583 5838198 5420658 5010364 4665377 6467147 8914755 8607428 6989888 6345620 ...
## NULL
summary(power_data)
##   CaseSequence     YYYY.MMM              KWH          
##  Min.   :733.0   Length:192         Min.   :  770523  
##  1st Qu.:780.8   Class :character   1st Qu.: 5429912  
##  Median :828.5   Mode  :character   Median : 6283324  
##  Mean   :828.5                      Mean   : 6502475  
##  3rd Qu.:876.2                      3rd Qu.: 7620524  
##  Max.   :924.0                      Max.   :10655730  
##                                     NA's   :1

Convert the data into a time series object and visualize

# Convert the data into a time series object

ts_data <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)
autoplot(ts_data, main = "Residential Power Usage (1998-2013)", xlab = "Time", ylab = "KWH") +
geom_line() + 
geom_smooth(method = "loess", se = FALSE) + 
scale_x_continuous(breaks = seq(1998, 2013, by = 1), labels = function(x) floor(x)) + 
theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'

Data Pre-processing

Handle missing values:

- Check for missing values
- Impute missing values using linear interpolation
- Visualize the imputed time series

# Check for missing values

sum(is.na(ts_data))
## [1] 1
# Impute missing values using linear interpolation

ts_data_imputed <- na_interpolation(ts_data, option = "linear")

# Verify that missing values are handled

sum(is.na(ts_data_imputed))
## [1] 0
# Visualize the imputed time series
autoplot(ts_data_imputed, main = "Residential Power Usage (1998-2013)", xlab = "Time", ylab = "KWH") +
  geom_line() +
  geom_smooth(method = "loess", se = FALSE) +
  scale_x_continuous(breaks = seq(1998, 2013, by = 1), labels = function(x) floor(x)) +
  theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'

After imputing the missing value, the plot still shows an outlier (a single very low reading). In the next step we identify it and replace it with the median.

Handle Outliers

Detect outliers using the boxplot method

boxplot_stats <- boxplot(ts_data_imputed, plot = FALSE)
outliers <- ts_data_imputed %in% boxplot_stats$out
outlier_indices <- which(outliers)
print(outlier_indices)
## [1] 151
print(ts_data_imputed[outlier_indices])
## [1] 770523

Handle outliers by replacement with the median and visualize

# Handle outliers by replacement with the median
median_value <- median(ts_data_imputed, na.rm = TRUE)
ts_data_imputed[outlier_indices] <- median_value

# Verify that outliers are handled
boxplot_stats_after <- boxplot(ts_data_imputed, plot = FALSE)
outliers_after <- ts_data_imputed %in% boxplot_stats_after$out
sum(outliers_after) # check whether any outliers remain
## [1] 0
Visualize the time series after outlier handling:

autoplot(ts_data_imputed, main = "Residential Power Usage (1998-2013) - Outliers Handled", xlab = "Time", ylab = "KWH") +
  geom_line() +
  geom_smooth(method = "loess", se = FALSE) +
  scale_x_continuous(breaks = seq(1998, 2013, by = 1), labels = function(x) floor(x)) +
  theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'
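The outlier falls in July, so replacing it with the overall median flattens the seasonal level for that month. An alternative we did not apply here (a sketch for reference only) is the median of the same calendar month across years:

# Alternative: replace the outlier with the same-month median (not applied above)
m <- cycle(ts_data_imputed)[outlier_indices]            # calendar month of the outlier
same_month <- ts_data_imputed[cycle(ts_data_imputed) == m]
# ts_data_imputed[outlier_indices] <- median(same_month, na.rm = TRUE)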

Decompose the time series

# Decompose the time series
decomposed_ts <- decompose(ts_data_imputed)

# Plot the decomposed time series
autoplot(decomposed_ts) +
  theme_minimal() +
  labs(title = "Decomposition of Residential Power Usage",
       subtitle = "Classical Decomposition")

Visualize the seasonality

ggseasonplot(ts_data_imputed,
             year.labels = TRUE, year.labels.left = TRUE) +
  ylab("KWH") +
  ggtitle("Seasonal plot: Residential Power Usage") +
  theme_minimal()

Split Data into Training and Testing Sets

train_data <- window(ts_data_imputed, start = c(1998, 1), end = c(2012, 12))
test_data <- window(ts_data_imputed, start = c(2013, 1), end = c(2013, 12))

head(train_data)
##          Jan     Feb     Mar     Apr     May     Jun
## 1998 6862583 5838198 5420658 5010364 4665377 6467147
head(test_data)
##           Jan      Feb      Mar      Apr      May      Jun
## 2013 10655730  7681798  6517514  6105359  5940475  7920627

Modeling and Forecasting

# Define evaluation metrics
calculate_rmse <- function(forecast_values, actual_values) {
  sqrt(mean((forecast_values - actual_values)^2))
}

calculate_mae <- function(forecast_values, actual_values) {
  mean(abs(forecast_values - actual_values))
}

# Define the horizon
h <- 12

# Estimate lambda for Box-Cox transformation
lambda <- BoxCox.lambda(train_data)

# Define models
models <- list(
  Additive_ETS = list(fit = function(x) ets(x, model="AAN", damped = FALSE), forecast = function(fit, h) forecast(fit, h=h)),
  Multiplicative_ETS = list(fit = function(x) ets(x, model="MAM", damped = FALSE), forecast = function(fit, h) forecast(fit, h=h)),
  Additive_Damped = list(fit = function(x) ets(x, model="AAN", damped = TRUE), forecast = function(fit, h) forecast(fit, h=h)),
  SNAIVE = list(fit = function(x) snaive(x, h=h), forecast = function(fit, h) forecast(fit, h=h)),
  Additive_BC_ETS = list(fit = function(x) ets(x, model="AAN", damped = FALSE, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
  BC_Additive_Damped = list(fit = function(x) ets(x, model="AAN", damped = TRUE, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
  BC_SNAIVE = list(fit = function(x) snaive(x, h=h, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
  ARIMA = list(fit = function(x) auto.arima(x), forecast = function(fit, h) forecast(fit, h=h)),
  ARIMA_BC = list(fit = function(x) auto.arima(x, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h))
)

Evaluate models

# Evaluate models
results <- lapply(names(models), function(model_name) {
  model <- models[[model_name]]
  fit <- model$fit(train_data)
  forecast_values <- model$forecast(fit, h)$mean
  rmse <- calculate_rmse(forecast_values, test_data)
  mae <- calculate_mae(forecast_values, test_data)
  if (model_name %in% c("SNAIVE", "BC_SNAIVE")) {
    aic <- NA
    bic <- NA
  } else {
    aic <- AIC(fit)
    bic <- BIC(fit)
  }
  data.frame(
    Model = model_name,
    AIC = aic,
    BIC = bic,
    RMSE = rmse,
    MAE = mae
  )
})

results <- do.call(rbind, results)

# Print the results
print(results)
##                Model        AIC        BIC      RMSE       MAE
## 1       Additive_ETS  6039.5454  6055.5102 1600434.6 1392817.4
## 2 Multiplicative_ETS  5739.3988  5793.6790 1021694.1  697734.0
## 3    Additive_Damped  6020.7978  6039.9555 1617817.4 1402941.9
## 4             SNAIVE         NA         NA 1035537.6  618605.6
## 5    Additive_BC_ETS  -637.9292  -621.9644 1594553.9 1310730.0
## 6 BC_Additive_Damped  -635.1095  -615.9518 1578975.3 1354114.5
## 7          BC_SNAIVE         NA         NA 1035537.6  618605.6
## 8              ARIMA  4963.0198  4991.1355  944443.3  616922.5
## 9           ARIMA_BC -1245.9730 -1230.3532  942928.2  631961.6

From the comparison, ARIMA_BC stands out as the top performer: it has the lowest RMSE on the 2013 hold-out and the lowest AIC and BIC in the table (bearing in mind that information criteria are only directly comparable between models of the same class on the same transformation). Therefore, we’ll proceed with ARIMA_BC for our next analysis.
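Ranking on hold-out RMSE alone, which is on the original KWH scale and therefore comparable across all nine models (a quick sketch):

# Hold-out RMSE and MAE are comparable for every model
results[order(results$RMSE), c("Model", "RMSE", "MAE")]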

Fit the Best Model and Generate Forecasts

# Fit the best model on the training data
arima_bc_fit <- auto.arima(train_data, lambda = lambda, biasadj = TRUE)

# Report the model details
arima_bc_fit
## Series: train_data 
## ARIMA(1,0,0)(2,1,0)[12] with drift 
## Box Cox transformation: lambda= -0.1777466 
## 
## Coefficients:
##          ar1     sar1     sar2  drift
##       0.2759  -0.7395  -0.4097  1e-04
## s.e.  0.0752   0.0746   0.0799  1e-04
## 
## sigma^2 = 3.437e-05:  log likelihood = 627.99
## AIC=-1245.97   AICc=-1245.6   BIC=-1230.35
# Generate forecasts
arima_bc_forecast <- forecast(arima_bc_fit, h = 12)
arima_bc_forecast
##          Point Forecast   Lo 80    Hi 80   Lo 95    Hi 95
## Jan 2013        9052914 7918355 10257382 7408489 11007118
## Feb 2013        8703168 7580813  9896838 7078922 10643458
## Mar 2013        6983552 6114030  7906279 5722837  8480019
## Apr 2013        5896085 5182828  6651658 4860393  7119260
## May 2013        5513817 4854424  6211852 4555780  6643054
## Jun 2013        7712422 6735469  8750269 6297234  9397449
## Jul 2013        8340141 7269425  9478562 6790247 10190076
## Aug 2013        9441293 8203222 10759460 7651225 11586344
## Sep 2013        8416225 7334061  9566934 6849892 10286318
## Oct 2013        5914450 5198578  6672818 4874990  7142191
## Nov 2013        5647758 4969557  6365878 4662601  6809778
## Dec 2013        7163335 6267344  8114420 5864556  8706242
# Plot the residuals
checkresiduals(arima_bc_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(2,1,0)[12] with drift
## Q* = 29.789, df = 21, p-value = 0.09631
## 
## Model df: 3.   Total lags used: 24
# Autoplot of the forecast
autoplot(arima_bc_forecast) +
  autolayer(fitted(arima_bc_forecast), series = "Fitted") +
  ylab("KWH") +
  xlab("Time") +
  ggtitle("Forecasts from ARIMA_BC Model") +
  theme_minimal()

Model Accuracy

accuracy(arima_bc_forecast)
##                   ME     RMSE      MAE        MPE     MAPE      MASE      ACF1
## Training set 6095.62 602379.8 471265.9 -0.4836496 7.302183 0.7406487 0.1032934
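The table above reports training-set accuracy only. Since 2013 was held out, the out-of-sample comparison is one call away (a sketch):

# Score the 2013 forecasts against the hold-out test set
accuracy(arima_bc_forecast, test_data)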

Forecast Values

# Print the forecast values
print(arima_bc_forecast$mean)
##          Jan     Feb     Mar     Apr     May     Jun     Jul     Aug     Sep
## 2013 9052914 8703168 6983552 5896085 5513817 7712422 8340141 9441293 8416225
##          Oct     Nov     Dec
## 2013 5914450 5647758 7163335
write.xlsx(as.data.frame(arima_bc_forecast), file = "/Users/ulianaplotnikova/Downloads/ATM624Data1.xlsx",
      sheetName = "Forecasting Power", overwrite = TRUE)
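Finally, the assignment asks for a 2014 forecast, while the model above was fit only through 2012 so that 2013 could serve as a hold-out. A natural last step (a sketch; final_fit, forecast_2014, and the output file name are our own) is to refit the chosen specification on the full imputed series and forecast twelve months ahead:

# Refit on the complete 1998-2013 series and forecast Jan-Dec 2014
final_fit <- auto.arima(ts_data_imputed,
                        lambda = BoxCox.lambda(ts_data_imputed),
                        biasadj = TRUE)
forecast_2014 <- forecast(final_fit, h = 12)
write.xlsx(as.data.frame(forecast_2014),
           file = "/Users/ulianaplotnikova/Downloads/PowerForecast2014.xlsx",
           sheetName = "Forecasting Power", overwrite = TRUE)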

Appendix: All code for this report

library(tidyverse)
library(fpp3)
library(tseries)
library(forecast)
library(kableExtra)
library(reactable)
library(seasonal)
library(tsibble)
library(openxlsx)
library(readxl)
library(mice)
library(caret)
library(zoo)
library(lubridate)
library(imputeTS)
library(naniar)
library(timeplyr)
library(rstatix)
library(timetk)
# Load ATM data and convert to proper format
atm<- read_excel("/Users/ulianaplotnikova/Desktop/Data624/ATM624Data.xlsx",col_types = c('date', 'text', 'numeric'))

# make all column names lowercase
atm <- atm |>
  rename_with(tolower)

glimpse(atm)
# Converting the date column to a date datatype
atm <- atm |>
  mutate(
    date = as.Date(date, origin = "1900-01-01")
  )

# Converting the dataset into a tsibble
atm_ts <- atm |>
  as_tsibble(
    index = date,
    key = atm
  )

head(atm_ts)
atm_ts |>
  autoplot(cash) +
  facet_wrap(~atm, scales = "free", nrow = 4) +
  labs(title = "ATM Withdrawals: May 2009 - April 2010",
       subtitle = "Daily cash withdrawal amounts",
       x = "Date",
       y = "Cash withdrawals (hundreds)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
        plot.subtitle = element_text(hjust = 0.5, size = 11),
        strip.text = element_text(face = "bold"),
        legend.position = "none")

atm_ts |>
  ggplot(aes(x = cash, fill = atm)) +
  geom_histogram(bins = 30, alpha = 0.8, color = "white") +
  facet_wrap(~atm, ncol = 2, scales = "free") +
  labs(title = "Distribution of ATM Withdrawals: May 2009 - April 2010",
       subtitle = "Histograms showing withdrawal frequency distribution",
       x = "Withdrawal amount (hundreds)",
       y = "Frequency") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
        plot.subtitle = element_text(hjust = 0.5, size = 11),
        strip.text = element_text(face = "bold"),
        legend.position = "none") +
  scale_fill_brewer(palette = "Set1")
# Check for null entries
atm |>
  group_by(atm) |>
  summarise(
    null_count = sum(is.na(cash))
  )
atm_ts |>
  filter(
    is.na(cash),
    !is.na(atm)
  )
# Filter out the NA values
atm_ts <- atm_ts |>
  filter(
    !is.na(atm)
  )

# Create timeseries 
atm1 <- atm_ts |>
  filter(
    atm == "ATM1"
  ) |>
  mutate(
    cash = na.approx(cash, na.rm = FALSE)
  )

atm2 <- atm_ts |>
  filter(
    atm == "ATM2"
  ) |>
  mutate(
    cash = na.approx(cash, na.rm = FALSE)
  )


atm3 <- atm_ts |>
  filter(
    atm == "ATM3"
  )


atm4 <- atm_ts |>
  filter(
    atm == "ATM4"
  ) |>
  mutate(
    cash = if_else(cash > 9000, NA_real_, cash)
  ) |>
  mutate(
    cash = na.approx(cash, na.rm = FALSE)
  )
# Combining all the ATM datasets after preprocessing

atm_clean <- bind_rows(atm1, atm2, atm3, atm4)

# Check the cleaned dataset

head(atm_clean)

summary(atm_clean)
# Add temporal features
atm_features <- atm_clean %>%
  mutate(
    day_of_week = wday(date, label = TRUE),
    month = month(date, label = TRUE),
    day_of_month = day(date),
    is_weekend = if_else(wday(date) %in% c(1, 7), TRUE, FALSE)
  )

# View the enhanced dataset
head(atm_features)
atm_features %>%
  ggplot(aes(x = day_of_week, y = cash, fill = day_of_week)) +
  geom_boxplot() +
  facet_wrap(~atm, scales = "free_y") +
  labs(title = "Weekly Distribution of ATM Cash Withdrawals",
       x = "Day of Week",
       y = "Cash (hundreds)") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

# Monthly patterns
atm_features %>%
  ggplot(aes(x = month, y = cash, fill = month)) +
  geom_boxplot() +
  facet_wrap(~atm, scales = "free_y") +
  labs(title = "Monthly Patterns in ATM Cash Withdrawals",
       x = "Month",
       y = "Cash (hundreds)") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

# Monthly trend analysis
atm_features %>%
  group_by(atm, month) %>%
  summarise(
    total_cash = sum(cash, na.rm = TRUE),
    avg_daily_cash = mean(cash, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = month, y = total_cash, group = atm, color = atm)) +
  geom_line(linewidth = 1) +
  geom_point(size = 3, shape = 21, fill = "white") +
  labs(
    title = "Monthly Total Cash Withdrawals by ATM",
    subtitle = "Showing seasonal patterns across locations",
    x = "Month",
    y = "Total cash withdrawals (hundreds)",
    color = "ATM machine"
  ) +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )

atm_features %>%
  group_by(atm, is_weekend) %>%
  summarise(
    avg_withdrawal = mean(cash, na.rm = TRUE),
    med_withdrawal = median(cash, na.rm = TRUE),
    sd_withdrawal = sd(cash, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(day_type = if_else(is_weekend, "Weekend", "Weekday")) %>%
  ggplot(aes(x = atm, y = avg_withdrawal, fill = day_type)) +
  geom_col(position = "dodge", color = "white") +
  geom_errorbar(
    aes(ymin = avg_withdrawal - sd_withdrawal,
        ymax = avg_withdrawal + sd_withdrawal),
    position = position_dodge(0.9),
    width = 0.2
  ) +
  labs(
    title = "Weekend vs Weekday Cash Withdrawal Patterns",
    subtitle = "Average withdrawals with standard deviation error bars",
    x = "ATM Location",
    y = "Average cash withdrawal (hundreds)",
    fill = "Day Type"
  ) +
  scale_fill_brewer(palette = "Set1") +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "bottom"
  )
# decompose and plot time series
decompose_and_plot_enhanced <- function(atm_features, atm_name, color_scheme) {
  # Convert to ts object
  ts_data <- ts(atm_features$cash, frequency = 7)  # Weekly seasonality

  # Decompose
  decomp <- decompose(ts_data, type = "additive")

  # Create custom plot with enhanced appearance
  component_names <- c("Observed", "Trend", "Seasonal", "Random")
  component_data <- data.frame(
    Date = rep(atm_features$date, 4),
    Value = c(decomp$x, decomp$trend, decomp$seasonal, decomp$random),
    Component = rep(component_names, each = length(decomp$x))
  )

  # Remove NA values
  component_data <- component_data[!is.na(component_data$Value), ]

  # Create the plot
  ggplot(component_data, aes(x = Date, y = Value)) +
    geom_line(color = color_scheme, linewidth = 0.8) +
    facet_wrap(~Component, scales = "free_y", ncol = 1) +
    labs(
      title = paste("Time Series Decomposition:", atm_name),
      subtitle = "Additive decomposition showing trend, seasonal, and random components",
      y = "Cash withdrawals (hundreds)",
      caption = paste("Data period:", min(atm_features$date), "to", max(atm_features$date))
    ) +
    theme_minimal() +
    theme(
      strip.text = element_text(face = "bold", size = 12),
      strip.background = element_rect(fill = "lightgrey", color = NA),
      plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
      plot.subtitle = element_text(hjust = 0.5, color = "darkgrey", size = 11),
      panel.grid.minor = element_blank(),
      plot.caption = element_text(hjust = 1, face = "italic", size = 9)
    )
}

atm1_decomp <- decompose_and_plot_enhanced(atm1, "ATM1", "steelblue")
atm2_decomp <- decompose_and_plot_enhanced(atm2, "ATM2", "darkred")
atm3_decomp <- decompose_and_plot_enhanced(atm3, "ATM3", "forestgreen")
atm4_decomp <- decompose_and_plot_enhanced(atm4, "ATM4", "darkorange")

# Display the plots
atm1_decomp
atm2_decomp
atm3_decomp
atm4_decomp
atm1 |>
  gg_tsdisplay(cash, plot_type = 'partial') + 
  labs(title = 'ATM1 Cash Withdrawals')
lambda <- atm1 |>
  features(cash, features = guerrero) |>
  pull(lambda_guerrero)

atm1_fit <- atm1 |>
  model(
    "ARIMA" = ARIMA(box_cox(cash,lambda)),
    "Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
    "Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
    "SNAIVE" = SNAIVE(cash),
    "Transformed Additive" = ETS(box_cox(cash,lambda) ~ error("A") + trend("N") + season("A")),
    "Transformed Multiplicative" = ETS(box_cox(cash,lambda) ~ error("M") + trend("N") + season("M")),
    "Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda))
    
  )


left_join(glance(atm1_fit) |>
            select(.model:BIC), 
          accuracy(atm1_fit) |> 
            select(.model, RMSE, MAE)) |>
  arrange(RMSE)
atm1_fit |>
  select(.model = "ARIMA") |>
  report()
forecast_atm1 <- atm1_fit |>
  forecast(h = 31) |>
  filter(.model=='ARIMA')
atm1_fit |>
  select(ARIMA) |>
  gg_tsresiduals()
forecast_atm1 |>
  autoplot(atm1) 
atm1_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, box_pierce, lag = 10, dof = 0)
atm1_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, ljung_box, lag = 10, dof = 0)
atm2 |>
  gg_tsdisplay(cash, plot_type = 'partial') + 
  labs(title = 'ATM2 Cash Withdrawals')
lambda2 <- atm2 |>
  features(cash, features = guerrero) |>
  pull(lambda_guerrero)

atm2_fit <- atm2 |>
  model(
    "ARIMA" = ARIMA(box_cox(cash,lambda2)),
    "Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
    "Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
    "SNAIVE" = SNAIVE(cash),
    "Transformed Additive" = ETS(box_cox(cash,lambda2) ~ error("A") + trend("N") + season("A")),
    "Transformed Multiplicative" = ETS(box_cox(cash,lambda2) ~ error("M") + trend("N") + season("M")),
    "Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda2))
 
  )


left_join(glance(atm2_fit) |>
            select(.model:BIC), 
          accuracy(atm2_fit) |> 
            select(.model, RMSE, MAE)) |>
  arrange(RMSE)
atm2_fit |>
  select(.model = "ARIMA") |> 
  report()
forecast_atm2 <- atm2_fit |>
  forecast(h = 31) |>
  filter(.model=='ARIMA')

atm2_fit |>
  select(ARIMA) |>
  gg_tsresiduals()
forecast_atm2 |>
  autoplot(atm2) 
atm2_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, box_pierce, lag = 10, dof = 0)
atm2_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, ljung_box, lag = 10, dof = 0)
atm3 |>
  gg_tsdisplay(cash, plot_type = 'partial') + 
  labs(title = 'ATM3 Cash Withdrawals')
atm3_fit <- atm3 |>
  model(NAIVE(cash))

forecast_atm3  <- atm3_fit |>
  forecast(h = 31)

forecast_atm3  |>
  autoplot(atm3)
atm4 |>
  gg_tsdisplay(cash, plot_type = 'partial') + 
  labs(title = 'ATM4 Cash Withdrawals')
lambda4 <- atm4 |>
  features(cash, features = guerrero) |>
  pull(lambda_guerrero)

atm4_fit <- atm4 |>
  model(
    "ARIMA" = ARIMA(box_cox(cash,lambda4)),
    "Additive ETS" = ETS(cash ~ error("A") + trend("N") + season("A")),
    "Multiplicative ETS" = ETS(cash ~ error("M") + trend("N") + season("M")),
    "SNAIVE" = SNAIVE(cash),
  "Transformed_Additive" = ETS(box_cox(cash,lambda4) ~ error("A") + trend("N") + season("A")),
    "Transformed Multiplicative" = ETS(box_cox(cash,lambda4) ~ error("M") + trend("N") + season("M")),
    "Transformed SNAIVE" = SNAIVE(box_cox(cash,lambda4)),
    "Additive_Transformed_Non_Seasonal" = ETS(box_cox(cash,lambda4) ~ error("A") + trend("N") + season("N")),
    "Multiplicative_Transformed_Non_Seasonal" = ETS(box_cox(cash,lambda4) ~ error("M") + trend("N") + season("N"))
  )


left_join(glance(atm4_fit) |> 
            select(.model:BIC), 
          accuracy(atm4_fit) |> 
            select(.model, RMSE, MAE)) |>
  arrange(RMSE)
  
 
atm4_fit |>
  select(.model = "ARIMA") |> 
  report()
forecast_atm4 <- atm4_fit |>
  forecast(h = 31) |>
  filter(.model=='ARIMA')
atm4_fit |>
  select(ARIMA) |>
  gg_tsresiduals()
forecast_atm4 |>
  autoplot(atm4) 
atm4_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, box_pierce, lag = 10, dof = 0)
atm4_fit |> 
  select(.model = "ARIMA") |>
  augment() |>
  features(.innov, ljung_box, lag = 10, dof = 0)
forecast <- bind_rows(forecast_atm1, forecast_atm2, forecast_atm3, forecast_atm4)
write.xlsx(forecast, file = "/Users/ulianaplotnikova/Downloads/ATM624Data.xlsx",
      sheetName = "ATM May Forecast", overwrite = TRUE)
ATM_FORECAST <- read_excel("/Users/ulianaplotnikova/Downloads/ATM624Data.xlsx", sheet = "ATM May Forecast")

power_data <- read.csv("https://raw.githubusercontent.com/uplotnik/DATA-624/refs/heads/main/ResidentialCustomerForecastLoad-624.csv")
# Initial data exploration

print(head(power_data))
print(str(power_data))
summary(power_data)
# Convert the data into a time series object

ts_data <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)
autoplot(ts_data, main = "Residential Power Usage (1998-2013)", xlab = "Time", ylab = "KWH") +
geom_line() + 
geom_smooth(method = "loess", se = FALSE) + 
scale_x_continuous(breaks = seq(1998, 2013, by = 1), labels = function(x) floor(x)) + 
theme_minimal()
# Check for missing values
sum(is.na(ts_data))

# Impute missing values using linear interpolation
ts_data_imputed <- na_interpolation(ts_data, option = "linear")

# Verify that missing values are handled
sum(is.na(ts_data_imputed))

# Visualize the imputed time series
autoplot(ts_data_imputed, main = "Residential Power Usage (1998-2013)", xlab = "Time", ylab = "KWH") +
  geom_line() +
  geom_smooth(method = "loess", se = FALSE) +
  scale_x_continuous(breaks = seq(1998, 2013, by = 1), labels = function(x) floor(x)) +
  theme_minimal()

boxplot_stats <- boxplot(ts_data_imputed, plot = FALSE)
outliers <- ts_data_imputed %in% boxplot_stats$out
outlier_indices <- which(outliers)
print(outlier_indices)
print(ts_data_imputed[outlier_indices])
# Handle outliers by replacement with the median
median_value <- median(ts_data_imputed, na.rm = TRUE)
ts_data_imputed[outlier_indices] <- median_value

# Verify that outliers are handled
boxplot_stats_after <- boxplot(ts_data_imputed, plot = FALSE)
outliers_after <- ts_data_imputed %in% boxplot_stats_after$out
sum(outliers_after) # check whether any outliers remain

autoplot(ts_data_imputed, main = "Residential Power Usage (1998-2013) - Outliers Handled", xlab = "Time", ylab = "KWH") +
  geom_line() +
  geom_smooth(method = "loess", se = FALSE) +
  scale_x_continuous(breaks = seq(1998, 2013, by = 1), labels = function(x) floor(x)) +
  theme_minimal()
# Decompose the time series
decomposed_ts <- decompose(ts_data_imputed)

# Plot the decomposed time series
autoplot(decomposed_ts) +
  theme_minimal() +
  labs(title = "Decomposition of Residential Power Usage",
       subtitle = "Classical Decomposition")

ggseasonplot(ts_data_imputed,
             year.labels = TRUE, year.labels.left = TRUE) +
  ylab("KWH") +
  ggtitle("Seasonal plot: Residential Power Usage") +
  theme_minimal()


train_data <- window(ts_data_imputed, start = c(1998, 1), end = c(2012, 12))
test_data <- window(ts_data_imputed, start = c(2013, 1), end = c(2013, 12))

head(train_data)
head(test_data)

# Define evaluation metrics
calculate_rmse <- function(forecast_values, actual_values) {
  sqrt(mean((forecast_values - actual_values)^2))
}

calculate_mae <- function(forecast_values, actual_values) {
  mean(abs(forecast_values - actual_values))
}

# Define the horizon
h <- 12

# Estimate lambda for Box-Cox transformation
lambda <- BoxCox.lambda(train_data)

# Define models
models <- list(
  Additive_ETS = list(fit = function(x) ets(x, model="AAN", damped = FALSE), forecast = function(fit, h) forecast(fit, h=h)),
  Multiplicative_ETS = list(fit = function(x) ets(x, model="MAM", damped = FALSE), forecast = function(fit, h) forecast(fit, h=h)),
  Additive_Damped = list(fit = function(x) ets(x, model="AAN", damped = TRUE), forecast = function(fit, h) forecast(fit, h=h)),
  SNAIVE = list(fit = function(x) snaive(x, h=h), forecast = function(fit, h) forecast(fit, h=h)),
  Additive_BC_ETS = list(fit = function(x) ets(x, model="AAN", damped = FALSE, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
  BC_Additive_Damped = list(fit = function(x) ets(x, model="AAN", damped = TRUE, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
  BC_SNAIVE = list(fit = function(x) snaive(x, h=h, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h)),
  ARIMA = list(fit = function(x) auto.arima(x), forecast = function(fit, h) forecast(fit, h=h)),
  ARIMA_BC = list(fit = function(x) auto.arima(x, lambda = lambda), forecast = function(fit, h) forecast(fit, h=h))
)


# Evaluate models
results <- lapply(names(models), function(model_name) {
  model <- models[[model_name]]
  fit <- model$fit(train_data)
  forecast_values <- model$forecast(fit, h)$mean
  rmse <- calculate_rmse(forecast_values, test_data)
  mae <- calculate_mae(forecast_values, test_data)
  if (model_name %in% c("SNAIVE", "BC_SNAIVE")) {
    aic <- NA
    bic <- NA
  } else {
    aic <- AIC(fit)
    bic <- BIC(fit)
  }
  data.frame(
    Model = model_name,
    AIC = aic,
    BIC = bic,
    RMSE = rmse,
    MAE = mae
  )
})

results <- do.call(rbind, results)

# Print the results
print(results)
# Fit the best model on the training data
arima_bc_fit <- auto.arima(train_data, lambda = lambda, biasadj = TRUE)

# Report the model details
arima_bc_fit

# Generate forecasts
arima_bc_forecast <- forecast(arima_bc_fit, h = 12)
arima_bc_forecast

# Plot the residuals
checkresiduals(arima_bc_fit)

# Autoplot of the forecast
autoplot(arima_bc_forecast) +
  autolayer(fitted(arima_bc_forecast), series = "Fitted") +
  ylab("KWH") +
  xlab("Time") +
  ggtitle("Forecasts from ARIMA_BC Model") +
  theme_minimal()
accuracy(arima_bc_forecast)

# Print the forecast values
print(arima_bc_forecast$mean)

write.xlsx(as.data.frame(arima_bc_forecast), file = "/Users/ulianaplotnikova/Downloads/ATM624Data1.xlsx",
      sheetName = "Forecasting Power", overwrite = TRUE)