Project 1

Author

Naomi Buell

Part A – ATM Forecast, ATM624Data.xlsx

In part A, I forecast how much cash is withdrawn from four different ATMs for May 2010.

Data Exploration

First, I load and prep the time series data. I convert the date column to a date format and the cash column to thousands of dollars. I then convert the data to a tsibble.

# Read in CSV
atm <- read_csv(
    "https://github.com/naomibuell/DATA624/raw/refs/heads/main/ATM624Data.csv"
) |>
    clean_names() |>
    mutate(
        date = as.Date(date, format = "%m/%d/%Y %I:%M:%S %p"), # Convert to date format
        cash = cash / 10, # Convert dollars from hundreds to thousands
        atm = as.factor(atm) # Convert ATM to a factor for better handling
    ) |>
    as_tsibble(index = date, key = atm) |>
    fill_gaps(.full = TRUE) # Fill gaps with NA values for daily data

I then plot the data to get a sense of it.

atm |>
    autoplot() +
    theme_classic() +
    labs(
        title = "Cash withdrawals from 4 different ATMs",
        subtitle = "Raw data shows implausible outlier and NA ATM keys.",
        x = "Date",
        y = "Cash Withdrawals (in thousands)"
    ) +
    scale_y_continuous(labels = scales::dollar_format())

# Drop NA
atm_drop <- atm |>
    drop_na()

# Remove outlier
format_thousands <- function(x) {
    scales::dollar_format()(x * 1000) # Scale values in thousands back to dollars and format
}
outlier <- max(atm_drop$cash) |> format_thousands()
atm_drop <- atm_drop |> filter(cash < max(cash))

# Plot
atm_drop |>
    autoplot() +
    theme_classic() +
    labs(
        title = "Cash withdrawals from 4 different ATMs",
        subtitle = "Removed implausible outlier and NAs.",
        x = "Date",
        y = "Cash Withdrawals (in thousands)"
    ) +
    scale_y_continuous(labels = scales::dollar_format()) +
    facet_wrap(~atm, scales = "free_y", ncol = 2)

At first glance, we see a high outlier in the spring of 2010, as well as transactions with a missing ATM key and withdrawal amount. We will remove the NA observations and the high outlier, since it is unlikely that $1,092,000 was withdrawn from a single ATM in one day.

Next, we will investigate the distribution of cash withdrawals by ATM.

atm_drop |>
    ggplot(aes(x = cash)) +
    geom_histogram() +
    facet_wrap(~atm, scales = "free", ncol = 2) +
    theme_classic() +
    labs(
        title = "Distribution of cash withdrawals by ATM",
        subtitle = "ATM3 has a lot of 0 withdrawal days.",
        x = "Cash Withdrawals (in thousands)",
        y = "Count"
    ) +
    scale_x_continuous(labels = scales::dollar_format())

# Function to calculate mode
get_mode <- function(x) {
    ux <- unique(x)
    ux[which.max(tabulate(match(x, ux)))]
}

# get modes
atm_modes <- atm_drop |>
    as_tibble() |>
    group_by(atm) |>
    summarize(mode = get_mode(cash)) |>
    pivot_wider(names_from = atm, values_from = mode) |>
    mutate(across(everything(), ~ format_thousands(.)))

# Pivot data wide
atm_wide <- atm_drop |>
    as_tibble() |>
    pivot_wider(names_from = atm, values_from = cash)

# Get max values
max_2 <- max(atm_wide$ATM2, na.rm = TRUE) |> format_thousands()
max_4 <- max(atm_wide$ATM4, na.rm = TRUE) |> format_thousands()
  1. ATM1 looks bimodal, with many small transactions skewing a distribution that is otherwise centered around $9,000.

  2. ATM2 has a more uniform distribution, with a variety of small and large transactions between $0 and $14,700.

  3. We see that ATM3 has a lot of zero-withdrawal days; it has only 3 days of nonzero data. These zeros are likely erroneous.

  4. ATM4 is heavily skewed right, with a much larger max: $171,200. The mode is $1,600.

atm_wide |>
    arrange(desc(date)) |>
    head()
# A tibble: 6 × 5
  date        ATM1  ATM2  ATM3  ATM4
  <date>     <dbl> <dbl> <dbl> <dbl>
1 2010-04-30   8.5   9     8.5  48.2
2 2010-04-29   8.2   8.9   8.2   4.4
3 2010-04-28   9.6  10.7   9.6  34.8
4 2010-04-27   0.4   0.2   0     1.4
5 2010-04-26   7.4   1.1   0    40.4
6 2010-04-25  10.9   8.9   0    54.2

We observe that ATM1 and ATM3 show identical values during their overlapping active periods, specifically in the last 3 days of the time series. Based on this pattern, ATM3 appears to be a duplicate of ATM1, the only difference being that ATM3 contains zeros before 2010-04-28. I will consolidate these two ATMs into one. This consolidation assumes that both ATMs will follow the same forecasting patterns.
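
As a quick check (a minimal sketch using the atm_wide table built above), we can confirm that the overlap is exact:

# Confirm ATM3 matches ATM1 wherever ATM3 is nonzero
atm_wide |>
    filter(!is.na(ATM3) & ATM3 > 0) |>
    summarize(all_match = all(ATM1 == ATM3))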

atm_combined <- atm_wide |>
    select(-ATM3) |>
    rename(ATM1_3 = ATM1)

Data Imputation

atm_combined |>
    as_tibble() |>
    skim()
Data summary
Name as_tibble(atm_combined)
Number of rows 365
Number of columns 4
_______________________
Column type frequency:
Date 1
numeric 3
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 2009-05-01 2010-04-30 2009-10-30 365

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ATM1_3 3 0.99 8.39 3.67 0.1 7.30 9.10 10.80 18.0 ▂▁▇▃▁
ATM2 2 0.99 6.26 3.89 0.0 2.55 6.70 9.30 14.7 ▇▅▇▇▂
ATM4 1 1.00 44.53 35.14 0.2 12.38 40.35 70.43 171.2 ▇▅▃▁▁

Some ATMs have more daily data than others: ATM1_3 has 3 missing values, ATM2 has 2, and ATM4 has 1. I will impute the missing values prior to forecasting with the caret::preProcess() function, using median imputation. Median imputation is appropriate here because it is robust to outliers and skewed data, as evidenced in the histograms above.

atm_wide_ts <- atm_combined |>
    mutate(date = as.Date(date)) |> # Convert to Date type
    as_tsibble(index = date)

# Impute missing values
impute <- preProcess(
    atm_wide_ts,
    method = c("medianImpute")
)

# display imputation summary
impute
Created from 359 samples and 4 variables

Pre-processing:
  - ignored (1)
  - median imputation (3)
# Replace missings with imputed values
atm_impute <- predict(impute, atm_wide_ts)

# Get counts of each ATM key
atm_impute |>
    as_tibble() |>
    skim()
Data summary
Name as_tibble(atm_impute)
Number of rows 365
Number of columns 4
_______________________
Column type frequency:
Date 1
numeric 3
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 2009-05-01 2010-04-30 2009-10-30 365

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ATM1_3 0 1 8.39 3.65 0.1 7.3 9.10 10.8 18.0 ▂▁▇▃▁
ATM2 0 1 6.26 3.88 0.0 2.6 6.70 9.3 14.7 ▇▅▇▇▂
ATM4 0 1 44.52 35.09 0.2 12.4 40.35 70.4 171.2 ▇▅▃▁▁

Now there are no missing values, and we can proceed to forecasting.

Forecast

We will start by examining a decomposition of the time series to determine trend and seasonality, using the STL method for this daily data (unlike SEATS and X-11, STL handles any type of seasonality, not only monthly and quarterly data).

# Create empty list to store plots
plots <- list()

# ATM names
atm_names <- c("ATM1_3", "ATM2", "ATM4")

# Create plots in a loop
for (atm_name in atm_names) {
    plots[[atm_name]] <- atm_impute |>
        model(STL(!!sym(atm_name))) |>
        components() |>
        autoplot() +
        theme_classic() +
        scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
        labs(title = "", x = "")
}

# Combine plots using patchwork
(plots[["ATM1_3"]] / plots[["ATM2"]] / plots[["ATM4"]]) +
    plot_annotation(
        title = "Decomposition of withdrawals trends shows weekly seasonality, no clear trend.",
        subtitle = "Shock to magnitude of seasonality in February 2010."
    )

There appears to be a shock that happened in February 2010, where the seasonal (weekly) pattern’s magnitude decreased. We will use the guerrero feature to choose a good value of \(\lambda\) to make the size of the seasonal variation about the same across the whole series, as that makes the forecasting model simpler.
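
For reference, the Box-Cox transformation applied by box_cox() is, for a positive series \(y_t\),

\[
w_t =
\begin{cases}
\ln(y_t), & \lambda = 0, \\
(y_t^{\lambda} - 1)/\lambda, & \lambda \neq 0.
\end{cases}
\]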

# Create empty lists to store lambdas and plots
lambdas <- list()
atm_transformed <- atm_impute
# Calculate lambdas and create plots in a loop
for (atm_name in atm_names) {
    # Calculate lambda
    lambdas[[atm_name]] <- atm_impute |>
        features(!!sym(atm_name), features = guerrero) |>
        pull(lambda_guerrero)

    atm_transformed <- atm_transformed |>
        mutate(!!sym(atm_name) := box_cox(!!sym(atm_name), lambdas[[atm_name]]))
}

I’ll try the following models:

  1. ARIMA: For ARIMA to be appropriate, the data needs to be stationary. We will use the built-in differencing of the ARIMA() function.

  2. ETS: We let the ETS() function select the model by minimising the AICc (defined just below).
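
For reference, the corrected AIC minimised by ETS() (and also used by ARIMA() for order selection) is

\[
\text{AIC}_c = \text{AIC} + \frac{2k(k+1)}{n - k - 1},
\]

where \(k\) is the number of estimated parameters and \(n\) the number of observations.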

Because the series is relatively long, we can afford to use a training and a test set. We create a training set from 2009-05-01 to the end of March 2010 and select an ARIMA and an ETS model using the ARIMA() and ETS() functions. The test set will include data from the month of April 2010.

ATM1_3

The output below shows the model selected and estimated by ARIMA() and ETS().

# Split data into training and test sets
train <- atm_transformed |> filter_index(. ~ "2010-03")
test <- atm_transformed |> filter_index("2010-04" ~ .)

# Fit models for ATM1_3
atm1_3_fit <- train |>
    select(ATM1_3) |>
    model(
        arima = ARIMA(),
        ets = ETS()
    )

# Report on models selected
atm1_3_fit
# A mable: 1 x 2
                     arima          ets
                   <model>      <model>
1 <ARIMA(0,0,2)(0,1,1)[7]> <ETS(A,N,A)>
# Save model names
arima_model_1_3 <- atm1_3_fit$arima |> format()
ets_model <- atm1_3_fit$ets |> format()

The ARIMA model selected for ATM1_3 is <ARIMA(0,0,2)(0,1,1)[7]>. This model has no autoregressive (AR) terms, 2 non-seasonal moving average (MA) terms, one seasonal difference at lag 7 (weekly), and one seasonal MA term. This suggests the series needs no non-seasonal differencing but has strong weekly seasonality.
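
In backshift notation, with \(B\) the backshift operator and \(y_t\) the Box-Cox-transformed withdrawals, this model can be written as

\[
(1 - B^{7})\, y_t = (1 + \theta_1 B + \theta_2 B^{2})(1 + \Theta_1 B^{7})\, \varepsilon_t .
\]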

The ETS models automatically selected for all ATMs, as we will see, are <ETS(A,N,A)> models. ETS(A,N,A) has additive errors, no trend, and additive seasonality, indicating regular weekly patterns without an overall trend across all machines.
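
In its state-space form (following the fpp3 conventions), ETS(A,N,A) with seasonal period \(m = 7\) for these daily series is

\[
\begin{aligned}
y_t &= \ell_{t-1} + s_{t-m} + \varepsilon_t, \\
\ell_t &= \ell_{t-1} + \alpha \varepsilon_t, \\
s_t &= s_{t-m} + \gamma \varepsilon_t,
\end{aligned}
\]

where \(\ell_t\) is the level, \(s_t\) the seasonal component, and \(\alpha, \gamma\) are smoothing parameters.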

The output below shows the residuals of the models selected by ARIMA() and ETS().

atm1_3_fit |>
    select(arima) |>
    gg_tsresiduals() +
    labs(
        title = paste(arima_model_1_3, "residuals for ATM1_3"),
        subtitle = "The model does well in capturing all the dynamics in the data as the residuals seem to be white noise."
    )

atm1_3_fit |>
    select(ets) |>
    gg_tsresiduals() +
    labs(
        title = paste(ets_model, "residuals for ATM1_3"),
        subtitle = "The model does well as the residuals seem to be white noise, despite a couple autocorrelations above the critical point."
    )

The residuals of both models appear to be approximately white noise. The ETS model shows a couple more autocorrelations above the critical bounds than the ARIMA model, but both models seem to fit the data well.

The output below evaluates the forecasting performance of the two competing models over the test set.

bind_rows(
    atm1_3_fit |> accuracy(),
    atm1_3_fit |>
        forecast(h = 30) |>
        accuracy(test |> select(ATM1_3))
) |>
    select(-ME, -MPE, -ACF1)
# A tibble: 4 × 7
  .model .type     RMSE   MAE  MAPE    MASE   RMSSE
  <chr>  <chr>    <dbl> <dbl> <dbl>   <dbl>   <dbl>
1 arima  Training 0.730 0.401 Inf     0.874   0.831
2 ets    Training 0.740 0.403 Inf     0.877   0.842
3 arima  Test     0.369 0.242  16.9 NaN     NaN    
4 ets    Test     0.371 0.242  17.0 NaN     NaN    
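
For reference, with forecast errors \(e_t = y_t - \hat{y}_t\), the reported measures are

\[
\text{RMSE} = \sqrt{\operatorname{mean}(e_t^2)}, \qquad
\text{MAE} = \operatorname{mean}(|e_t|), \qquad
\text{MAPE} = \operatorname{mean}\!\left(\left|100\, e_t / y_t\right|\right).
\]

MAPE is infinite on the training set likely because some (transformed) actual values are exactly zero, making percentage errors undefined; the NaN values for MASE and RMSSE arise because the test-only tsibble passed to accuracy() lacks the training observations needed to compute their scaling factors.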

In this case, the ARIMA model seems to be the slightly more accurate model based on the test set RMSE and MAPE.

Below we generate and plot forecasts from the ARIMA model for May 2010.

forecast_1_3 <- atm_impute |>
    model(ARIMA(box_cox(ATM1_3, lambdas$ATM1_3))) |>
    forecast(h = "1 month")

forecast_1_3 |>
    autoplot(atm_impute) +
    labs(
        title = paste("ATM1_3 forecast through May 2010 with", arima_model_1_3),
        y = "Withdrawals (in thousands)",
        x = "Date"
    ) +
    scale_y_continuous(labels = scales::dollar_format()) +
    scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    theme_classic()

ATM2

The output below shows the model selected and estimated by ARIMA() and ETS() for ATM2.

# Fit models for ATM2
atm2_fit <- train |>
    select(ATM2) |>
    model(
        arima = ARIMA(),
        ets = ETS()
    )

# Report on models selected
atm2_fit
# A mable: 1 x 2
                     arima          ets
                   <model>      <model>
1 <ARIMA(2,0,2)(0,1,1)[7]> <ETS(A,N,A)>
# Save model name
arima_model_2 <- atm2_fit$arima |> format()

# Plot residuals for ARIMA
atm2_fit |>
    select(arima) |>
    gg_tsresiduals() +
    labs(
        title = paste(arima_model_2, "residuals for ATM2"),
        subtitle = "The model does well in capturing all the dynamics in the data as the residuals seem to be white noise, despite an autocorrelation above the critical point."
    )

# Plot residuals for ETS
atm2_fit |>
    select(ets) |>
    gg_tsresiduals() +
    labs(
        title = paste(ets_model, "residuals for ATM2"),
        subtitle = "The model does well as the residuals seem to be white noise, despite a several autocorrelations near or above the critical point."
    )

<ARIMA(2,0,2)(0,1,1)[7]> was the ARIMA model selected for ATM2. This model uses 2 AR terms and 2 MA terms, with weekly seasonal differencing, suggesting both short-term dependencies and weekly patterns. As with the previous ATM, the ETS model automatically selected is <ETS(A,N,A)>. The residuals of both models appear to be white noise, with few significant autocorrelations.

The output below evaluates the forecasting performance of the two competing models over the test set.

bind_rows(
    atm2_fit |> accuracy(),
    atm2_fit |>
        forecast(h = 30) |>
        accuracy(test |> select(ATM2))
) |>
    select(-ME, -MPE, -ACF1)
# A tibble: 4 × 7
  .model .type     RMSE   MAE  MAPE    MASE   RMSSE
  <chr>  <chr>    <dbl> <dbl> <dbl>   <dbl>   <dbl>
1 arima  Training  1.57 1.10   92.1   0.823   0.811
2 ets    Training  1.63 1.14   91.3   0.856   0.842
3 arima  Test      1.33 0.977  45.5 NaN     NaN    
4 ets    Test      1.24 0.913  56.5 NaN     NaN    

In this case, the ETS model seems to be the slightly more accurate model based on the test set RMSE and MAE (although the MAPE is more favorable to the ARIMA model).

Below we generate and plot forecasts from the ETS model for May 2010.

# Generate and plot forecast
forecast_2 <- atm_impute |>
    model(ETS(box_cox(ATM2, lambdas$ATM2))) |>
    forecast(h = "1 month")

forecast_2 |>
    autoplot(atm_impute) +
    labs(
        title = paste("ATM2 forecast through May 2010 with", ets_model),
        y = "Withdrawals (in thousands)",
        x = "Date"
    ) +
    scale_y_continuous(labels = scales::dollar_format()) +
    scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    theme_classic()

ATM4

For our last ATM, we will once again fit the ARIMA and ETS models to ATM4. The output below shows the model selected and estimated by ARIMA() and ETS().

# Fit models for ATM4
atm4_fit <- train |>
    select(ATM4) |>
    model(
        arima = ARIMA(),
        ets = ETS()
    )

# Report on models selected
atm4_fit
# A mable: 1 x 2
                             arima          ets
                           <model>      <model>
1 <ARIMA(0,0,0)(2,0,0)[7] w/ mean> <ETS(A,N,A)>
# Save model name
arima_model_4 <- atm4_fit$arima |> format()

# Plot residuals for ARIMA
atm4_fit |>
    select(arima) |>
    gg_tsresiduals() +
    labs(
        title = paste(arima_model_4, "residuals for ATM4"),
        subtitle = "The model does well in capturing all the dynamics in the data as the residuals seem to be white noise, despite an autocorrelation above the critical point."
    )

# Plot residuals for ETS
atm4_fit |>
    select(ets) |>
    gg_tsresiduals() +
    labs(
        title = paste(ets_model, "residuals for ATM4"),
        subtitle = "The model does well as the residuals seem to be white noise."
    )

<ARIMA(0,0,0)(2,0,0)[7] w/ mean> is the ARIMA model selected for ATM4. This model has no non-seasonal AR, differencing, or MA components, but it includes two seasonal autoregressive (SAR) terms with a period of 7. The inclusion of two SAR terms implies that past weeks’ withdrawal amounts influence future values. The presence of a mean term indicates the series fluctuates around a stable average over time, rather than drifting. As with the previous ATMs, the ETS model automatically selected is <ETS(A,N,A)>. The residuals of both models appear to be white noise.
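
Written out, with \(y_t\) the transformed series, this model is simply

\[
y_t = c + \Phi_1 y_{t-7} + \Phi_2 y_{t-14} + \varepsilon_t,
\]

where the constant \(c = \mu(1 - \Phi_1 - \Phi_2)\) corresponds to the fitted mean \(\mu\).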

The output below evaluates the forecasting performance of the two competing models over the test set.

# Compare accuracy
bind_rows(
    atm4_fit |> accuracy(),
    atm4_fit |>
        forecast(h = 30) |>
        accuracy(test |> select(ATM4))
) |>
    select(-ME, -MPE, -ACF1)
# A tibble: 4 × 7
  .model .type     RMSE   MAE  MAPE    MASE   RMSSE
  <chr>  <chr>    <dbl> <dbl> <dbl>   <dbl>   <dbl>
1 arima  Training  4.80  4.01  Inf    0.825   0.777
2 ets    Training  4.56  3.66  Inf    0.753   0.738
3 arima  Test      4.28  3.45  216. NaN     NaN    
4 ets    Test      5.10  4.14  245. NaN     NaN    

In this case, the ARIMA model seems to be the slightly more accurate model based on the test set RMSE, MAE, and MAPE.

Below we generate and plot forecasts from the ARIMA model for May 2010.

# Generate and plot forecast
forecast_4 <- atm_impute |>
    model(ARIMA(box_cox(ATM4, lambdas$ATM4))) |>
    forecast(h = "1 month")

forecast_4 |>
    autoplot(atm_impute) +
    labs(
        title = paste("ATM4 forecast through May 2010 with", arima_model_4),
        y = "Withdrawals (in thousands)",
        x = "Date"
    ) +
    scale_y_continuous(labels = scales::dollar_format()) +
    scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    theme_classic()

Conclusion

Based on the analysis, here are the key findings for each ATM:

  1. ATM1_3 (consolidated ATMs): The ARIMA(0,0,2)(0,1,1)[7] model performed best, capturing strong weekly seasonality. The forecast suggests withdrawal patterns will continue with clear weekly cycles and no strong trend.

  2. ATM2: The ETS(A,N,A) model was optimal, indicating additive errors and seasonality without trend. The forecast suggests withdrawal amounts will remain stable, with regular weekly fluctuations but no strong upward or downward movement.

  3. ATM4: The ARIMA(0,0,0)(2,0,0)[7] with mean model provided the best fit, suggesting withdrawals fluctuate around a stable average, with patterns primarily driven by weekly cycles.

I prepare my forecasts for submission in an Excel file.

excel_forecasts <- list()
excel_forecasts[["ATM1_3"]] <- forecast_1_3 |>
    as_tsibble() |>
    select(.model, date, .mean) |>
    pivot_wider(names_from = .model, values_from = .mean)

excel_forecasts[["ATM2"]] <- forecast_2 |>
    as_tsibble() |>
    select(.model, date, .mean) |>
    pivot_wider(names_from = .model, values_from = .mean)

excel_forecasts[["ATM4"]] <- forecast_4 |>
    as_tsibble() |>
    select(.model, date, .mean) |>
    pivot_wider(names_from = .model, values_from = .mean)

Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

In part B, I forecast residential power usage in kilowatt-hours (kWh) for 2014 based on data from January 1998 through December 2013.

Data Exploration

First, I load and prep the time series data. I convert the YYYY-MMM column to a year-month date. I then convert the data to a tsibble, filling any gaps in the monthly data. I also convert the power usage to millions of kWh.

# Read in CSV
power <- read_csv(
    "https://github.com/naomibuell/DATA624/raw/refs/heads/main/ResidentialCustomerForecastLoad-624.csv"
) |>
    clean_names() |>
    mutate(
        date = yearmonth(yyyy_mmm), # Convert to yearmonth type
        kwh = kwh / 1e6 # Convert kwh to millions
    ) |>
    as_tsibble(index = date) |>
    select(kwh) |>
    fill_gaps(.full = TRUE) # Fill gaps in the monthly data

# Browse data
power |>
    as_tibble() |>
    skim()
Data summary
Name as_tibble(power)
Number of rows 192
Number of columns 2
_______________________
Column type frequency:
character 1
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
date 0 1 5 5 0 192 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
kwh 1 0.99 6.5 1.45 0.77 5.43 6.28 7.62 10.66 ▁▁▇▅▁
# Get missingness info
missing_date <- power |>
    filter(is.na(kwh)) |>
    pull(date)

I note that power data is missing for September 2008, which we will need to impute before forecasting.

I plot the data.

# Plot the power data
power |>
    autoplot() +
    labs(
        title = "Monthly residential power usage shows upward trend and seasonality.",
        subtitle = "There is a low outlier in 2010.",
        x = "",
        y = "Power Usage (Millions of kWh)"
    ) +
    theme_classic() +
    scale_x_yearmonth(date_breaks = "1 year", date_labels = "%Y")

# Histogram of power usage
power |>
    ggplot(aes(x = kwh)) +
    geom_histogram() +
    theme_classic() +
    labs(
        title = "Power usage is skewed right with low outlier.",
        x = "Power Usage (Millions of kWh)",
        y = "Count of months, January 1998 - December 2013"
    )

# Store original outlier value
outlier <- power$kwh |> min(na.rm = TRUE)

# Replace outlier with 10x its value (assuming a dropped digit)
power_outlier <- power |>
    mutate(
        kwh = case_when(
            kwh == outlier ~ kwh * 10, # Use the stored outlier value from above
            TRUE ~ kwh
        )
    )

The time plot shows an upward trend and seasonality. The distribution appears skewed right. There is a low outlier in 2010, at 0.770523 million kWh. It appears that this value is missing a digit, and we will assume it should be 7.70523 million kWh.

power_outlier |>
    autoplot() +
    labs(
        title = "After fixing outlier, power usage values look more reasonable.",
        x = "",
        y = "Power Usage (Millions of kWh)"
    ) +
    theme_classic() +
    scale_x_yearmonth(date_breaks = "1 year", date_labels = "%Y")

Data Imputation

Next we impute the missing value we caught earlier with the caret::preProcess() function, estimating missing values via bagged trees. Imputation via bagging fits a bagged tree model for each predictor (as a function of all the others). This method is simple and accurate for handling non-linear patterns and outliers. The main trade-off is computational cost, but for a moderate-sized power usage dataset, this isn’t a significant concern.

# Impute missing values using bagged tree imputation
impute <- preProcess(
    power_outlier,
    method = c("bagImpute")
)
impute
Created from 191 samples and 2 variables

Pre-processing:
  - bagged tree imputation (1)
  - ignored (1)
# Replace missings with imputed values
power_impute <- predict(impute, power_outlier)

# Get counts of missings after imputation
power_impute |>
    as_tibble() |>
    skim()
Data summary
Name as_tibble(power_impute)
Number of rows 192
Number of columns 2
_______________________
Column type frequency:
character 1
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
date 0 1 5 5 0 192 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
kwh 0 1 6.54 1.39 4.31 5.44 6.31 7.65 10.66 ▇▇▇▂▁
# Get imputed observation at missing date
imputed_obs <- power_impute |>
    filter(date == missing_date) |>
    pull(kwh) *
    1e6

The missing value for September 2008 was imputed as approximately 6.26 million kWh (6,261,568.5 kWh). Now our data is complete (no missing values) and we can begin forecasting.

Forecast

We will start by examining a decomposition of the time series to determine trend and seasonality, using the STL method, as we did with the ATM data (STL’s additive form is sketched after this list). STL has several advantages over the classical, SEATS, and X-11 decomposition methods:

  • The seasonal component is allowed to change over time, and the rate of change can be controlled by the user.

  • The smoothness of the trend-cycle can also be controlled by the user.

  • It can be robust to outliers (i.e., the user can specify a robust decomposition), so that occasional unusual observations will not affect the estimates of the trend-cycle and seasonal components. They will, however, affect the remainder component.
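
For reference, STL estimates the additive decomposition

\[
y_t = \hat{T}_t + \hat{S}_t + \hat{R}_t,
\]

where \(\hat{T}_t\) is the trend-cycle, \(\hat{S}_t\) the seasonal component, and \(\hat{R}_t\) the remainder, each estimated via iterated loess smoothing.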

power_impute |>
    model(STL(kwh)) |>
    components() |>
    autoplot() +
    theme_classic() +
    scale_x_yearmonth(date_breaks = "1 year", date_labels = "%Y") +
    labs(
        title = "Yearly seasonality and upward trend, esp. from 2009.",
        subtitle = "Subtle increase in magnitude of seasonality over time.",
        x = "",
        y = "Power Usage (Millions of kWh)"
    )

The decomposition shows a clear upward trend in power usage, especially from 2009 onwards, with annual seasons. The seasonal component appears to increase in magnitude slightly over time.

We will use the guerrero feature to choose a good value of \(\lambda\) to make the size of the seasonal variation about the same across the whole series, as that makes the forecasting model simpler.

lambda <- power_impute |>
    features(kwh, features = guerrero) |>
    pull(lambda_guerrero)

num_years <- max(year(power_impute$date)) - min(year(power_impute$date)) + 1 # 16 years of data, inclusive

I’ll try ARIMA and ETS models, and compare results for accuracy.

Because the series spans 16 years of monthly data (192 observations), we can afford to split it into training and test sets for model validation.

The training set covers the start of the series through the end of 2012; the test set runs from January 2013 onward. These will be used to fit ARIMA and ETS models that will ultimately forecast the data through 2014.

# Get training and test data
train <- power_impute |> filter_index(. ~ "2012-12-01")
test <- power_impute |> filter_index("2013-01-01" ~ .)

# Fit models for power usage
power_fit <- train |>
    model(
        arima = ARIMA(box_cox(kwh, lambda)),
        ets = ETS(box_cox(kwh, lambda))
    )

# Report on models selected
power_fit
# A mable: 1 x 2
                               arima          ets
                             <model>      <model>
1 <ARIMA(1,0,0)(2,1,0)[12] w/ drift> <ETS(A,N,A)>
# Save model names
arima_model <- power_fit$arima |> format()
ets_model <- power_fit$ets |> format()

# Plot residuals for ARIMA
power_fit |>
    select(arima) |>
    gg_tsresiduals() +
    labs(
        title = paste(arima_model, "residuals indicate the model does well."),
        subtitle = "The residuals look like white noise and normally distributed, couple autocorrelations above the critical point."
    )

# Plot residuals for ETS
power_fit |>
    select(ets) |>
    gg_tsresiduals() +
    labs(
        title = paste(ets_model, "residuals indicate the model does well."),
        subtitle = "Residuals look like white noise centered around 0 and normally distributed, despite a few autocorrelations above the critical point."
    )

The ARIMA model selected for power usage is <ARIMA(1,0,0)(2,1,0)[12] w/ drift>. This model has 1 non-seasonal AR term, 2 seasonal AR terms, and one seasonal difference at lag 12 (annual seasonality in this monthly data), plus drift. This suggests both short-term dependencies and yearly seasonality with an underlying trend. The ETS model automatically selected is <ETS(A,N,A)>. The ETS(A,N,A) model indicates additive errors and seasonality with no trend component, suggesting regular seasonal patterns but no consistent trend direction.
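
In backshift notation, with \(w_t = \text{box\_cox}(\text{kwh}, \lambda)\), this ARIMA model is

\[
(1 - \phi_1 B)(1 - \Phi_1 B^{12} - \Phi_2 B^{24})(1 - B^{12})\, w_t = c + \varepsilon_t,
\]

where the constant \(c\) produces the drift.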

The residuals of both models suggest they fit the data well, with residuals appearing random, normally distributed, and centered around 0.

The output below evaluates the forecasting performance of the two competing models over the test set.

bind_rows(
    power_fit |> accuracy(),
    power_fit |>
        forecast(h = 12) |>
        accuracy(test |> mutate(kwh = box_cox(kwh, lambda)))
) |>
    select(-ME, -MPE, -ACF1)
# A tibble: 4 × 7
  .model .type     RMSE   MAE   MAPE    MASE   RMSSE
  <chr>  <chr>    <dbl> <dbl>  <dbl>   <dbl>   <dbl>
1 arima  Training 0.578 0.431   6.64   0.691   0.718
2 ets    Training 0.583 0.441   6.79   0.708   0.724
3 arima  Test     5.97  5.83  353.   NaN     NaN    
4 ets    Test     5.73  5.60  339.   NaN     NaN    

In this case, the ETS model seems to be the slightly more accurate model based on the test set RMSE, MAE, and MAPE.

Below we generate and plot forecasts from the ETS model for 2014.

# Generate and plot forecast
forecast <- power_impute |>
    model(ETS(box_cox(kwh, lambda))) |>
    forecast(h = "12 months")

forecast |>
    autoplot(power_impute) +
    labs(
        title = paste("Power usage forecasted for 2014 with", ets_model),
        y = "Power Usage (Millions of kWh)",
        x = ""
    ) +
    scale_x_yearmonth(date_breaks = "1 year", date_labels = "%Y") +
    theme_classic()

Conclusion

Based on the analysis, the residential power usage forecast for 2014 follows strong seasonal patterns, with peaks in winter and summer and lower usage in spring and fall. The ETS(A,N,A) model, which captures additive seasonality without trend, provided the best fit, accurately reflecting yearly cycles. While test metrics (RMSE, MAE, MAPE) indicate strong performance, the forecast assumes historical patterns will persist and does not account for potential external changes like extreme weather.

I prepare my forecasts for submission in an Excel file.

excel_forecasts[["power"]] <- forecast |>
    as_tsibble() |>
    select(.model, date, .mean) |>
    pivot_wider(names_from = .model, values_from = .mean) |>
    mutate(date = as_date(date)) # Convert for Excel compatibility

Part C – BONUS (3% extra credit to overall grade), optional (part or all), Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx

In Part C, I work with two data sets with different time stamps.

Data Exploration

First, I load and prep the time series data. I then convert the data to a tsibble.

water1 <- read_csv(
    "https://github.com/naomibuell/DATA624/raw/refs/heads/main/Waterflow_Pipe1.csv"
) |>
    rename(water_flow1 = WaterFlow)
water2 <- read_csv(
    "https://github.com/naomibuell/DATA624/raw/refs/heads/main/Waterflow_Pipe2.csv"
) |>
    rename(water_flow2 = WaterFlow)

I merge the two files and tidy the data. I aggregate based on hour (for multiple recordings within an hour, I take the mean). I also plot the data below.

water <- full_join(water1, water2) |>
    clean_names() |>
    mutate(date_time = mdy_hm(date_time)) |>
    group_by(date_time = floor_date(date_time, "hour")) |>
    summarize(
        water_flow1 = mean(water_flow1, na.rm = TRUE),
        water_flow2 = mean(water_flow2, na.rm = TRUE)
    ) |>
    as_tsibble() |>
    fill_gaps()

# Browse data
water |>
    as_tibble() |>
    skim()
Data summary
Name as_tibble(water)
Number of rows 1001
Number of columns 3
_______________________
Column type frequency:
numeric 2
POSIXct 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
water_flow1 765 0.24 19.89 4.24 8.92 17.03 19.78 22.79 31.73 ▁▅▇▅▁
water_flow2 1 1.00 39.56 16.05 1.88 28.14 39.68 50.78 78.30 ▂▆▇▅▂

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
date_time 0 1 2015-10-23 2015-12-03 16:00:00 2015-11-12 20:00:00 1001
# Plot
water |>
    pivot_longer(
        cols = starts_with("water_flow"),
        names_to = "pipe",
        values_to = "flow"
    ) |>
    ggplot(aes(x = date_time, y = flow, color = pipe)) +
    geom_line() +
    theme_classic() +
    labs(
        title = "Hourly water flow over time for two pipes in 2015",
        subtitle = "Pipe 1 has earlier data and many gaps. Pipe 2 has later, complete data.",
        x = "",
        y = "Water Flow",
        color = "Pipe"
    ) +
    scale_x_datetime(date_breaks = "1 week", date_labels = "%b %d") +
    facet_wrap(~pipe, scales = "free_y", ncol = 1)

I note that water_flow1 ends much earlier than water_flow2 and starts an hour earlier. water_flow1 also has many missing values over time that we will need to impute. water_flow2 is complete within its time range.

Data Imputation

I use the bagged-tree method again for imputation, as it is robust to outliers and can handle non-linear patterns. This method will help us fill in the missing values for water_flow1 based on the available data from both pipes. Note that I keep values only within the observed date ranges of water_flow1 and water_flow2, because we don’t want imputed values outside the range of available data.

# Impute missing values using bagged tree imputation for water_flow1
impute <- preProcess(
    water,
    method = c("bagImpute")
)
impute
Created from 235 samples and 3 variables

Pre-processing:
  - bagged tree imputation (2)
  - ignored (1)
water_impute <- predict(impute, water)

# get start and end dates
range1 <- water |>
    select(water_flow1) |>
    filter(!is.na(water_flow1)) |>
    pull(date_time) |>
    range()
range2 <- water |>
    select(water_flow2) |>
    filter(!is.na(water_flow2)) |>
    pull(date_time) |>
    range()

# Remove imputed data outside of these ranges
water_ranged <- water_impute |>
    mutate(
        water_flow1 = case_when(
            date_time >= range1[1] & date_time <= range1[2] ~ water_flow1,
            TRUE ~ NA # Set to NA for any date outside the range of available data for water_flow1
        ),
        water_flow2 = case_when(
            date_time >= range2[1] & date_time <= range2[2] ~ water_flow2,
            TRUE ~ NA # Set to NA for any date outside the range of available data for water_flow2
        )
    )

Now that the water_flow1 data has been imputed, we can decompose each series to investigate its trend and seasonality components.

# Create empty list to store plots
plots <- list()

# Create decomposition plot for water_flow1
plots[["pipe1"]] <- water_ranged |>
    select(water_flow1) |>
    drop_na() |>
    model(STL()) |>
    components() |>
    autoplot() +
    labs(
        subtitle = "Not enough data for weekly seasonality.",
        x = "",
        y = "Water Flow"
    ) +
    theme_classic()

# Create decomposition plot for water_flow2
plots[["pipe2"]] <- water_ranged |>
    select(water_flow2) |>
    drop_na() |>
    model(STL()) |>
    components() |>
    autoplot() +
    labs(
        subtitle = "Daily and weekly seasonality.",
        x = "",
        y = ""
    ) +
    theme_classic() +
    scale_x_datetime(date_breaks = "1 week", date_labels = "%b %d")

# Combine plots using patchwork
plots[["pipe1"]] +
    plots[["pipe2"]] +
    plot_annotation(
        title = "Comparison of trend and seasonality components of both pipes.",
        subtitle = "Daily seasonality for both pipes have variability. Trends do not appear to match.",
        caption = "Note: Timeline on x axis differs between pipes."
    )

It is likely that water flow in both pipes has both daily seasonality (e.g., higher flow during the day and lower at night) and weekly seasonality (e.g., higher flow on weekdays than weekends). However, for water_flow1, there is not enough data to measure weekly seasonality. Daily seasonality for both pipes varies in magnitude, so I check whether a box_cox() transformation is needed to stabilize the variance before modelling.

water_ranged |>
    select(water_flow1) |>
    drop_na() |>
    features(water_flow1, features = guerrero) |>
    pull(lambda_guerrero)
[1] 0.994421
water_ranged |>
    select(water_flow2) |>
    drop_na() |>
    features(water_flow2, features = guerrero) |>
    pull(lambda_guerrero)
[1] 0.9447908

The lambdas generated above are very close to 1, indicating the data is already in a good form for modelling, so I opt not to transform the data for water_flow1 and water_flow2 using the Box-Cox transformation.

Forecast

I determine if the data is stationary with a unit root test before forecasting. The KPSS test p-value is reported below.

  1. H0: The data is stationary.

  2. HA: The data is not stationary.

water_ranged |>
    features(water_flow1, unitroot_kpss)
# A tibble: 1 × 2
  kpss_stat kpss_pvalue
      <dbl>       <dbl>
1     0.233         0.1
water_ranged |>
    features(water_flow2, unitroot_kpss)
# A tibble: 1 × 2
  kpss_stat kpss_pvalue
      <dbl>       <dbl>
1     0.105         0.1

The reported p-value is 0.1 for both pipes (the KPSS table caps reported p-values at 0.1). We fail to reject the null hypothesis and conclude that the data is stationary as-is, with no differencing needed.
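
As a quick cross-check (a sketch; output not shown), the unitroot_ndiffs feature from feasts estimates how many first differences a series requires, and given the KPSS results we would expect it to return 0 for both pipes:

# Cross-check: number of differences required for stationarity (expect 0)
water_ranged |>
    features(water_flow1, unitroot_ndiffs)
water_ranged |>
    features(water_flow2, unitroot_ndiffs)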

After verifying stationarity, I try both ARIMA and ETS models to produce one-week-ahead forecasts. I split the data into training and test sets for validation, with each test set spanning the final week of its series.

# Get training and test data for water_flow1
train1 <- water_ranged |>
    select(water_flow1) |>
    filter(!is.na(water_flow1)) |>
    filter_index(. ~ "2015-10-25 23:00")
test1 <- water_ranged |>
    select(water_flow1) |>
    filter(!is.na(water_flow1)) |>
    filter_index("2015-10-26 00:00" ~ .)
# Get training and test data for water_flow2
train2 <- water_ranged |>
    select(water_flow2) |>
    filter(!is.na(water_flow2)) |>
    filter_index(. ~ "2015-11-26 16:00")
test2 <- water_ranged |>
    select(water_flow2) |>
    filter(!is.na(water_flow2)) |>
    filter_index("2015-11-26 17:00:00" ~ .)

# Fit models for water usage
fit1 <- train1 |>
    model(
        arima = ARIMA(water_flow1),
        ets = ETS(water_flow1)
    )

fit2 <- train2 |>
    model(
        arima = ARIMA(water_flow2),
        ets = ETS(water_flow2)
    )

# Report on models selected
fit1
# A mable: 1 x 2
                   arima          ets
                 <model>      <model>
1 <ARIMA(0,0,0) w/ mean> <ETS(M,N,N)>
fit2
# A mable: 1 x 2
                              arima          ets
                            <model>      <model>
1 <ARIMA(0,0,0)(0,0,1)[24] w/ mean> <ETS(M,N,N)>
# Save model names
arima_model1 <- fit1$arima |> format()
ets_model1 <- fit1$ets |> format()
arima_model2 <- fit2$arima |> format()
ets_model2 <- fit2$ets |> format()

# Plot residuals for water_flow1
fit1 |>
    select(arima) |>
    gg_tsresiduals() +
    labs(
        title = paste(
            arima_model1,
            "residuals indicate water_flow1 model does well."
        ),
        subtitle = "The residuals look like white noise and normally distributed, one autocorrelation above the critical point."
    )

fit1 |>
    select(ets) |>
    gg_tsresiduals() +
    labs(
        title = paste(
            ets_model1,
            "residuals indicate water_flow1 model does well."
        ),
        subtitle = "Like ARIMA plot above, the residuals look like white noise and normally distributed, one autocorrelation above the critical point."
    )

# Plot residuals for water_flow2
fit2 |>
    select(arima) |>
    gg_tsresiduals() +
    labs(
        title = paste(
            arima_model2,
            "residuals indicate water_flow2 model does well."
        ),
        subtitle = "Residuals look like white noise and normally distributed, couple autocorrelations above the critical point."
    )

fit2 |>
    select(ets) |>
    gg_tsresiduals() +
    labs(
        title = paste(
            ets_model2,
            "residuals indicate water_flow2 model does well."
        ),
        subtitle = "Residuals look like white noise and normally distributed, several autocorrelations above the critical point."
    )

  • For water_flow1, both models essentially forecast the mean. ARIMA(0,0,0) with mean has no autoregressive component (p = 0), no differencing (d = 0), and no moving average component (q = 0), so it simply predicts the historical mean of the series. ETS(M,N,N) is simple exponential smoothing with multiplicative errors; with no trend or seasonal components, its forecasts are flat at the last estimated level.

  • water_flow2 was also fit with an ETS(M,N,N) model, indicating multiplicative errors, no trend component, and no seasonal component. Its ARIMA counterpart, ARIMA(0,0,0)(0,0,1)[24] with mean, indicates the series fluctuates around a constant level with one seasonal moving average term at lag 24 (daily seasonality), so errors from the same hour on the previous day influence current values, as written out below.
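
Written out, with \(c\) the constant implied by each fitted mean, the two selected ARIMA models are

\[
\text{water\_flow1:}\quad y_t = c + \varepsilon_t, \qquad
\text{water\_flow2:}\quad y_t = c + \varepsilon_t + \Theta_1 \varepsilon_{t-24},
\]

so pipe 1 is forecast as a constant level plus noise, while pipe 2 adds a seasonal MA term linking errors 24 hours apart.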

Next, I test the accuracy of each of the forecasts.

# Compare accuracy for water_flow1
bind_rows(
    fit1 |> accuracy(),
    fit1 |>
        forecast(h = "1 week") |>
        accuracy(test1)
) |>
    select(-ME, -MPE, -ACF1)
# A tibble: 4 × 7
  .model .type     RMSE   MAE  MAPE    MASE   RMSSE
  <chr>  <chr>    <dbl> <dbl> <dbl>   <dbl>   <dbl>
1 arima  Training  4.33  3.49  19.4   0.612   0.637
2 ets    Training  4.33  3.49  19.4   0.612   0.637
3 arima  Test      4.17  3.21  17.0 NaN     NaN    
4 ets    Test      4.17  3.21  17.0 NaN     NaN    
bind_rows(
    fit2 |> accuracy(),
    fit2 |>
        forecast(h = "1 week") |>
        accuracy(test2)
) |>
    select(-ME, -MPE, -ACF1)
# A tibble: 4 × 7
  .model .type     RMSE   MAE  MAPE    MASE   RMSSE
  <chr>  <chr>    <dbl> <dbl> <dbl>   <dbl>   <dbl>
1 arima  Training  15.8  12.9  56.4   0.748   0.738
2 ets    Training  15.9  12.9  56.6   0.749   0.741
3 arima  Test      16.6  13.7  61.2 NaN     NaN    
4 ets    Test      16.7  13.7  61.9 NaN     NaN    

For water_flow1, the accuracies are exactly the same. We are indifferent between either model, since they are both essentially forecasting the mean of the data. For water_flow2, the ARIMA model works best as it minimizes the test RMSE and MAPE.

# Generate and plot forecast for water_flow1
forecast1 <- water_ranged |>
    select(water_flow1) |>
    filter(!is.na(water_flow1)) |>
    model(ARIMA(water_flow1)) |>
    forecast(h = "1 week")

forecast1 |>
    autoplot(water_ranged |> select(water_flow1)) +
    labs(
        subtitle = "Forecast a week foreward for water_flow1.",
        x = "",
        y = "Water Flow"
    ) +
    theme_classic()

# Generate and plot forecast for water_flow2
forecast2 <- water_ranged |>
    select(water_flow2) |>
    filter(!is.na(water_flow2)) |>
    model(ARIMA(water_flow2)) |>
    forecast(h = "1 week")

forecast2 |>
    autoplot(water_ranged |> select(water_flow2)) +
    labs(
        subtitle = "Forecast a week foreward for water_flow2.",
        x = "",
        y = "Water Flow"
    ) +
    theme_classic()

Export all forecasts to Excel

excel_forecasts[["water_flow1"]] <- forecast1 |>
    as_tsibble() |>
    select(.model, date_time, .mean) |>
    pivot_wider(names_from = .model, values_from = .mean)

excel_forecasts[["water_flow2"]] <- forecast2 |>
    as_tsibble() |>
    select(.model, date_time, .mean) |>
    pivot_wider(names_from = .model, values_from = .mean)

write_xlsx(excel_forecasts, "Project_1_forecasts_NB.xlsx")