library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tsibble 1.1.5 ✔ fable 0.3.4
## ✔ tsibbledata 0.4.1 ✔ fabletools 0.4.2
## ✔ feasts 0.3.2
## Warning: package 'tsibble' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(readxl)
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.3.3
In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file; please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast, which you will put in an Excel-readable file.
# Import and inspect data
atm_data <- read_xlsx('ATM624Data.xlsx')
print(head(atm_data))
## # A tibble: 6 × 3
## DATE ATM Cash
## <dbl> <chr> <dbl>
## 1 39934 ATM1 96
## 2 39934 ATM2 107
## 3 39935 ATM1 82
## 4 39935 ATM2 89
## 5 39936 ATM1 85
## 6 39936 ATM2 90
print(tail(atm_data))
## # A tibble: 6 × 3
## DATE ATM Cash
## <dbl> <chr> <dbl>
## 1 40293 ATM4 542.
## 2 40294 ATM4 404.
## 3 40295 ATM4 13.7
## 4 40296 ATM4 348.
## 5 40297 ATM4 44.2
## 6 40298 ATM4 482.
# Convert the Date to a usable format, and convert the tibble to a tsibble
atm_data$DATE <- as.Date(atm_data$DATE, origin = '1899-12-30')
atm_data <- atm_data |>
as_tsibble(key = 'ATM', index = 'DATE') |>
arrange(DATE)
print(head(atm_data))
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [4]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-01 ATM2 107
## 3 2009-05-01 ATM3 0
## 4 2009-05-01 ATM4 777.
## 5 2009-05-02 ATM1 82
## 6 2009-05-02 ATM2 89
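As a quick spot-check of the conversion, Excel serial dates (Windows convention) count days from 1899-12-30, so the first serial value in the raw import should map to the first date above:
# Verify the origin: serial 39934 should convert to 2009-05-01
as.Date(39934, origin = '1899-12-30')  # "2009-05-01", matching the tsibble above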
# Filter for ATM 1 and plot data
atm_1_ts <- atm_data |> filter(ATM == 'ATM1')
atm_1_ts |> autoplot(Cash)
This data appears to be seasonal, with a significant drop-off occurring on a regular basis. The ACF plot below confirms this, showing clear peaks in correlation at lags of 7, 14, and 21 days, consistent with weekly seasonality.
atm_1_ts |>
ACF(Cash) |>
autoplot()
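A day-of-week subseries plot (a sketch, not part of the original analysis) makes the weekly pattern explicit by faceting the series by day of the week:
# Sketch: facet daily withdrawals by day of week to show the weekly pattern
atm_1_ts |> gg_subseries(Cash, period = "week")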
The size of the spikes in the data is inconsistent, so a Box-Cox transformation was applied in an attempt to stabilize the variance. However, the transformed data showed less consistent variation, so the transformation was not used.
# Find lambda
lambda_atm1 <- atm_1_ts |>
features(Cash, features = guerrero) |>
pull(lambda_guerrero)
# Apply Box-Cox transformation
atm_1_bc <- atm_1_ts |>
mutate(b_c = box_cox(Cash, lambda_atm1)) |>
select(DATE, b_c)
atm_1_bc |> autoplot(b_c)
Finally, missing values were replaced with the median of the data set.
atm_1_ts$Cash[is.na(atm_1_ts$Cash)] <- median(atm_1_ts$Cash, na.rm=TRUE)
A test/train split was created, with 80% of the data points assigned to the training set and the remaining 20% to the test set. Three models were fit to the training set and used to predict the test set: a seasonal naive model (SNAIVE), an exponential smoothing model (ETS), and an ARIMA model.
The model with the lowest RMSE was the ETS model, so it was used for the final forecast.
# Create Test/Train Split
atm_1_train <- atm_1_ts[1:round(.8*nrow(atm_1_ts)),]
atm_1_test <- anti_join(atm_1_ts, atm_1_train, by = 'DATE')
# Use model to predict the test data
atm_1_model <- atm_1_train |>
model('snaive' = SNAIVE(Cash),
'ets' = ETS(Cash),
'arima' = ARIMA(Cash)) |>
forecast(h = nrow(atm_1_test))
# Evaluate model accuracy
accuracy(atm_1_model, atm_1_ts) |> select(.model, RMSE) |> arrange(RMSE)
## # A tibble: 3 × 2
## .model RMSE
## <chr> <dbl>
## 1 ets 51.8
## 2 arima 54.2
## 3 snaive 62.5
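For reference, the models are ranked by root mean squared error over the \(h\) test-set observations: \[\text{RMSE} = \sqrt{\frac{1}{h}\sum_{t=1}^{h}\left(y_t - \hat{y}_t\right)^2}\] where \(y_t\) is the observed value and \(\hat{y}_t\) the forecast.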
The selected ETS model produces residuals that look like white noise, as they do not display a pattern or significant outliers.
atm_1_ts |>
model(ETS(Cash)) |>
gg_tsresiduals()
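A Ljung-Box test (a sketch, not included in the original run) can quantify the white-noise claim; a large p-value is consistent with uncorrelated residuals:
# Ljung-Box test on the ETS innovation residuals; lag 14 covers two weekly cycles
atm_1_ts |>
  model(ETS(Cash)) |>
  augment() |>
  features(.innov, ljung_box, lag = 14)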
A 31-day forecast was generated (one full month).
# Generate forecast
atm_1_prediction <- atm_1_ts |>
model(ETS(Cash)) |>
forecast(h = 31)
atm_1_prediction |> autoplot(atm_1_ts, level = NULL)
# Produce final output
atm_1_final <- atm_1_prediction[,-c(2,4)] |>
select(2,1,3) |>
rename(Cash = .mean)
print(head(atm_1_final))
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-05-01 ATM1 87.1
## 2 2010-05-02 ATM1 101.
## 3 2010-05-03 ATM1 73.1
## 4 2010-05-04 ATM1 5.74
## 5 2010-05-05 ATM1 100.
## 6 2010-05-06 ATM1 79.4
# Filter for ATM 2 and plot data
atm_2_ts <- atm_data |> filter(ATM == 'ATM2')
atm_2_ts |> autoplot(Cash)
As with the previous ATM, this data appears to be seasonal. The ACF plot again confirms weekly seasonality, showing spikes at lags of 7, 14, and 21 days.
atm_2_ts |>
ACF(Cash) |>
autoplot()
Next, missing values were replaced with the median of the data set.
atm_2_ts$Cash[is.na(atm_2_ts$Cash)] <- median(atm_2_ts$Cash, na.rm=TRUE)
As with the previous ATM, the Box-Cox transformation does not appear to have made the variance meaningfully more consistent, so it was not utilized.
# Find lambda
lambda_atm2 <- atm_2_ts |>
features(Cash, features = guerrero) |>
pull(lambda_guerrero)
# Apply Box-Cox transformation
atm_2_bc <- atm_2_ts |>
mutate(b_c = box_cox(Cash, lambda_atm2)) |>
select(DATE, b_c)
atm_2_bc |> autoplot(b_c)
A test/train split was created, with 80% of the data points assigned to the training set and the remaining 20% to the test set. The same three models were fit to the training set and used to predict the test set: SNAIVE, ETS, and ARIMA.
The model with the lowest RMSE was the seasonal naive (SNAIVE) model, so it was used for the final forecast.
# Create Test/Train Split
atm_2_train <- atm_2_ts[1:round(.8*nrow(atm_2_ts)),]
atm_2_test <- anti_join(atm_2_ts, atm_2_train, by = 'DATE')
# Use model to predict the test data
atm_2_model <- atm_2_train |>
model('snaive' = SNAIVE(Cash),
'ets' = ETS(Cash),
'arima' = ARIMA(Cash)) |>
forecast(h = nrow(atm_2_test))
# Evaluate model accuracy
accuracy(atm_2_model, atm_2_ts) |> select(.model, RMSE) |> arrange(RMSE)
## # A tibble: 3 × 2
## .model RMSE
## <chr> <dbl>
## 1 snaive 59.7
## 2 ets 64.3
## 3 arima 64.9
The selected SNAIVE model produces nearly normal residuals with consistent variance, although the ACF plot suggests that some seasonality may not be fully captured.
atm_2_ts |>
model(SNAIVE(Cash)) |>
gg_tsresiduals()
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).
A 31-day forecast was generated (one full month).
# Generate forecast
atm_2_prediction <- atm_2_ts |>
model(SNAIVE(Cash)) |>
forecast(h = 31)
atm_2_prediction |> autoplot(atm_2_ts, level = NULL)
# Produce final output
atm_2_final <- atm_2_prediction[,-c(2,4)] |>
select(2,1,3) |>
rename(Cash = .mean)
print(head(atm_2_final))
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-05-01 ATM2 61
## 2 2010-05-02 ATM2 89
## 3 2010-05-03 ATM2 11
## 4 2010-05-04 ATM2 2
## 5 2010-05-05 ATM2 107
## 6 2010-05-06 ATM2 89
# Filter for ATM 3 and plot data
atm_3_ts <- atm_data |> filter(ATM == 'ATM3')
atm_3_ts |> autoplot(Cash)
Clearly, this ATM was only activated recently (or the provided data is incomplete). Observations with a value of 0 were removed.
atm_3_ts <- atm_3_ts |> filter(Cash != 0)
atm_3_ts |> autoplot(Cash)
Only three data points remain for this ATM. Without additional context, the most defensible forecast is a simple average of the existing observations.
# Generate forecast
atm_3_prediction <- atm_3_ts |>
model(MEAN(Cash)) |>
forecast(h = 31)
atm_3_prediction |> autoplot(atm_3_ts, level = NULL)
# Produce final output
atm_3_final <- atm_3_prediction[,-c(2,4)] |>
select(2,1,3) |>
rename(Cash = .mean)
print(head(atm_3_final))
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-05-01 ATM3 87.7
## 2 2010-05-02 ATM3 87.7
## 3 2010-05-03 ATM3 87.7
## 4 2010-05-04 ATM3 87.7
## 5 2010-05-05 ATM3 87.7
## 6 2010-05-06 ATM3 87.7
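Since MEAN() simply averages the available observations, the point forecast can be verified directly:
# The MEAN model's point forecast is the arithmetic mean of the three observations
mean(atm_3_ts$Cash)  # approximately 87.7, matching the forecast above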
# Filter for ATM 4 and plot data
atm_4_ts <- atm_data |> filter(ATM == 'ATM4')
atm_4_ts |> autoplot(Cash)
This data set clearly shows a single extreme outlier. This outlier, along with missing values, was replaced with the median of the data set.
atm_4_ts$Cash[atm_4_ts$Cash == max(atm_4_ts$Cash, na.rm = TRUE)] <- median(atm_4_ts$Cash, na.rm = TRUE)
atm_4_ts$Cash[is.na(atm_4_ts$Cash)] <- median(atm_4_ts$Cash, na.rm=TRUE)
atm_4_ts |> autoplot(Cash)
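Replacing the maximum assumes the largest value is the only outlier. A more systematic rule (a sketch, not used in this analysis) flags any value falling outside a multiple of the interquartile range:
# Hypothetical IQR rule: indices of values more than 3 IQRs beyond the quartiles
q <- quantile(atm_4_ts$Cash, c(0.25, 0.75), na.rm = TRUE)
fence <- q + c(-3, 3) * diff(q)
which(atm_4_ts$Cash < fence[1] | atm_4_ts$Cash > fence[2])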
The ACF plot for this data set confirmed the same weekly seasonality shown in ATM1 and ATM2.
atm_4_ts |>
ACF(Cash) |>
autoplot()
As with ATM1 and ATM2, a Box-Cox transformation did not make the size of the variations more consistent, so it was not included.
# Find lambda
lambda_atm4 <- atm_4_ts |>
features(Cash, features = guerrero) |>
pull(lambda_guerrero)
# Apply Box-Cox transformation
atm_4_bc <- atm_4_ts |>
mutate(b_c = box_cox(Cash, lambda_atm4)) |>
select(DATE, b_c)
atm_4_bc |> autoplot(b_c)
A test/train split was created, with 80% of the data points assigned to the training set and the remaining 20% to the test set. The same three models were fit to the training set and used to predict the test set: SNAIVE, ETS, and ARIMA.
The model with the lowest RMSE was the ARIMA model, so it was tried first for the final forecast.
# Create Test/Train Split
atm_4_train <- atm_4_ts[1:round(.8*nrow(atm_4_ts)),]
atm_4_test <- anti_join(atm_4_ts, atm_4_train, by = 'DATE')
# Use model to predict the test data
atm_4_model <- atm_4_train |>
model('snaive' = SNAIVE(Cash),
'ets' = ETS(Cash),
'arima' = ARIMA(Cash)) |>
forecast(h = nrow(atm_4_test))
# Evaluate model accuracy
accuracy(atm_4_model, atm_4_ts) |> select(.model, RMSE) |> arrange(RMSE)
## # A tibble: 3 × 2
## .model RMSE
## <chr> <dbl>
## 1 arima 321.
## 2 ets 395.
## 3 snaive 551.
The selected ARIMA model does not produce normally distributed residuals, indicating that another model might be a better fit.
atm_4_ts |>
model(ARIMA(Cash)) |>
gg_tsresiduals()
# Generate forecast
atm_4_prediction <- atm_4_ts |>
model(ARIMA(Cash)) |>
forecast(h = 31)
atm_4_prediction |> autoplot(atm_4_ts, level = NULL)
It is clear from the plot that this forecast is not appropriate for this data set, so I tested the model with the next-lowest RMSE (ETS). However, this model also produced an odd-looking forecast.
# Generate forecast
atm_4_prediction <- atm_4_ts |>
model(ETS(Cash)) |>
forecast(h = 31)
atm_4_prediction |> autoplot(atm_4_ts, level = NULL)
Finally, I tried the seasonal naive (SNAIVE) model. It produced the most appropriate-looking forecast of the three, so it was used for the final forecast.
# Generate forecast
atm_4_prediction <- atm_4_ts |>
model(SNAIVE(Cash)) |>
forecast(h = 31)
atm_4_prediction |> autoplot(atm_4_ts, level = NULL)
# Produce final output
atm_4_final <- atm_4_prediction[,-c(2,4)] |>
select(2,1,3) |>
rename(Cash = .mean)
print(atm_4_final)
## # A tsibble: 31 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-05-01 ATM4 5.45
## 2 2010-05-02 ATM4 542.
## 3 2010-05-03 ATM4 404.
## 4 2010-05-04 ATM4 13.7
## 5 2010-05-05 ATM4 348.
## 6 2010-05-06 ATM4 44.2
## 7 2010-05-07 ATM4 482.
## 8 2010-05-08 ATM4 5.45
## 9 2010-05-09 ATM4 542.
## 10 2010-05-10 ATM4 404.
## # ℹ 21 more rows
Additionally, this model’s residual diagnostics are the closest of the three to white noise.
atm_4_ts |>
model(SNAIVE(Cash)) |>
gg_tsresiduals()
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Combine predictions for all four ATMs into a single Data Frame
full_atm_predictions <- rbind(as_tibble(atm_1_final), as_tibble(atm_2_final), as_tibble(atm_3_final), as_tibble(atm_4_final)) |> arrange(DATE)
# Create wide-format predictions
atm_wide <- full_atm_predictions |> pivot_wider(names_from = ATM, values_from = Cash)
# Write predictions to CSV files
write.csv(full_atm_predictions, 'myrianthopoulos_atm_forecast_long.csv')
write.csv(atm_wide, 'myrianthopoulos_atm_forecast_wide.csv')
# Inspect results
print(head(full_atm_predictions))
## # A tibble: 6 × 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-05-01 ATM1 87.1
## 2 2010-05-01 ATM2 61
## 3 2010-05-01 ATM3 87.7
## 4 2010-05-01 ATM4 5.45
## 5 2010-05-02 ATM1 101.
## 6 2010-05-02 ATM2 89
print(head(atm_wide))
## # A tibble: 6 × 5
## DATE ATM1 ATM2 ATM3 ATM4
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2010-05-01 87.1 61 87.7 5.45
## 2 2010-05-02 101. 89 87.7 542.
## 3 2010-05-03 73.1 11 87.7 404.
## 4 2010-05-04 5.74 2 87.7 13.7
## 5 2010-05-05 100. 107 87.7 348.
## 6 2010-05-06 79.4 89 87.7 44.2
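Since the assignment asks for an Excel-readable file and openxlsx is already loaded, the same results could also be written directly to a workbook (a sketch; the file name is illustrative):
# Sketch: write both layouts to a single .xlsx workbook, one sheet per layout
write.xlsx(list(long = full_atm_predictions, wide = atm_wide),
           'myrianthopoulos_atm_forecast.xlsx')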
Part B consists of a simple dataset of residential power usage from January 1998 until December 2013. Your assignment is to model these data and provide a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.
power_data <- read_xlsx('ResidentialCustomerForecastLoad-624.xlsx')
print(head(power_data))
## # A tibble: 6 × 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
print(tail(power_data))
## # A tibble: 6 × 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 919 2013-Jul 8415321
## 2 920 2013-Aug 9080226
## 3 921 2013-Sep 7968220
## 4 922 2013-Oct 5759367
## 5 923 2013-Nov 5769083
## 6 924 2013-Dec 9606304
power_data$`YYYY-MMM` <- yearmonth(power_data$`YYYY-MMM`)
power_data <- power_data |>
rename(Month = `YYYY-MMM`) |>
select(Month, KWH) |>
as_tsibble(index = 'Month')
print(head(power_data))
## # A tsibble: 6 x 2 [1M]
## Month KWH
## <mth> <dbl>
## 1 1998 Jan 6862583
## 2 1998 Feb 5838198
## 3 1998 Mar 5420658
## 4 1998 Apr 5010364
## 5 1998 May 4665377
## 6 1998 Jun 6467147
power_data |> autoplot(KWH)
This data appears to be seasonal, and the ACF plot was examined to confirm this. It shows clear peaks in correlation at lags of 6, 12, and 18 months and clear troughs at 3, 9, 15, and 21 months, confirming the seasonal nature of the data.
power_data |>
ACF(KWH) |>
autoplot()
In addition to revealing the seasonal nature of the data, the initial plot shows a single clear outlier. This value, along with any missing values, was replaced with the median of the data set.
power_data$KWH[power_data$KWH == min(power_data$KWH, na.rm = TRUE)] <- median(power_data$KWH, na.rm = TRUE)
power_data$KWH[is.na(power_data$KWH)] <- median(power_data$KWH, na.rm=TRUE)
power_data |> autoplot(KWH)
The variance here looks fairly consistent, but a Box-Cox transformation was still checked. The plot of the transformed data did not look meaningfully different from the original, and since unnecessary transformations should be avoided, the Box-Cox transformation was not applied.
# Find lambda
lambda_power <- power_data |>
features(KWH, features = guerrero) |>
pull(lambda_guerrero)
# Apply Box-Cox transformation
power_data_bc <- power_data |>
mutate(b_c = box_cox(KWH, lambda_power)) |>
select(Month, b_c)
power_data_bc |> autoplot(b_c)
A test/train split was created, with 80% of the data points assigned to the training set and the remaining 20% to the test set. The same three models were fit to the training set and used to predict the test set: SNAIVE, ETS, and ARIMA.
The model with the lowest RMSE was the ARIMA model, so it was used for the final forecast.
# Create Test/Train Split
power_data_train <- power_data[1:round(.8*nrow(power_data)),]
power_data_test <- anti_join(power_data, power_data_train, by = 'Month')
# Use model to predict the test data
power_data_model <- power_data_train |>
model('snaive' = SNAIVE(KWH),
'ets' = ETS(KWH),
'arima' = ARIMA(KWH)) |>
forecast(h = nrow(power_data_test))
# Evaluate model accuracy
accuracy(power_data_model, power_data) |> select(.model, RMSE) |> arrange(RMSE)
## # A tibble: 3 × 2
## .model RMSE
## <chr> <dbl>
## 1 arima 1123229.
## 2 ets 1240425.
## 3 snaive 1265484.
The selected ARIMA model’s residuals appear to be white noise, suggesting that the model is appropriate for the data.
power_data |>
model(ARIMA(KWH)) |>
gg_tsresiduals()
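To see which specification ARIMA() selected automatically, report() can be used (output not shown here):
# Inspect the automatically selected ARIMA specification
power_data |>
  model(ARIMA(KWH)) |>
  report()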
A 12-month forecast was generated (one full year).
# Generate forecast
kwh_prediction <- power_data |>
model(ARIMA(KWH)) |>
forecast(h = 12)
kwh_prediction |> autoplot(power_data, level = NULL)
# Produce final output
kwh_final <- kwh_prediction[,-c(1,3)] |>
rename(KWH = .mean)
print(kwh_final)
## # A tsibble: 12 x 2 [1M]
## Month KWH
## <mth> <dbl>
## 1 2014 Jan 10138894.
## 2 2014 Feb 8757344.
## 3 2014 Mar 6702667.
## 4 2014 Apr 6008940.
## 5 2014 May 5943702.
## 6 2014 Jun 8207930.
## 7 2014 Jul 9520071.
## 8 2014 Aug 10010697.
## 9 2014 Sep 8488807.
## 10 2014 Oct 5867038.
## 11 2014 Nov 6155281.
## 12 2014 Dec 8307433.
# Write forecast to a .csv file
write.csv(kwh_final, 'myrianthopoulos_kwh_forecast.csv')
Part C consists of two data sets. These are simple two-column sets; however, they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (for multiple recordings within an hour, take the mean). Then determine whether the data is stationary and whether it can be forecast. If so, provide a week-forward forecast and present the results via RPubs and the .rmd file, along with the forecast in an Excel-readable file.
# Import and inspect the data
pipe1_data <- read_xlsx('Waterflow_Pipe1.xlsx')
pipe2_data <- read_xlsx('Waterflow_Pipe2.xlsx')
print(head(pipe1_data))
## # A tibble: 6 × 2
## `Date Time` WaterFlow
## <dbl> <dbl>
## 1 42300. 23.4
## 2 42300. 28.0
## 3 42300. 23.1
## 4 42300. 30.0
## 5 42300. 6.00
## 6 42300. 15.9
print(head(pipe2_data))
## # A tibble: 6 × 2
## `Date Time` WaterFlow
## <dbl> <dbl>
## 1 42300. 18.8
## 2 42300. 43.1
## 3 42300. 38.0
## 4 42300. 36.1
## 5 42300. 31.9
## 6 42300. 28.2
# Average the WaterFlow values within each hour and build an hourly index
pipe_data <- rbind(pipe1_data, pipe2_data) |> rename(date_time = 'Date Time') |> arrange(date_time)
pipe_data$date_time <- convertToDateTime(pipe_data$date_time)
pipe_data$date <- as.Date(pipe_data$date_time)
pipe_data$hour <- format(pipe_data$date_time, format = "%H")
pipe_data <- pipe_data |>
group_by(date, hour) |>
mutate(hour_avg = mean(WaterFlow)) |>
select(date, hour, hour_avg) |>
ungroup() |>
distinct() |>
mutate(date_time = paste0(date, " ", hour, ':00:00'))
pipe_data$date_time <- as.POSIXct(pipe_data$date_time)
pipe_data <- pipe_data |> select(date_time, hour_avg) |> as_tsibble(index = 'date_time')
print(head(pipe_data))
## # A tsibble: 6 x 2 [1h] <?>
## date_time hour_avg
## <dttm> <dbl>
## 1 2015-10-23 00:00:00 26.1
## 2 2015-10-23 01:00:00 18.8
## 3 2015-10-23 02:00:00 24.5
## 4 2015-10-23 03:00:00 25.6
## 5 2015-10-23 04:00:00 22.4
## 6 2015-10-23 05:00:00 23.6
pipe_data |> autoplot(hour_avg)
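The same hourly aggregation can be written more compactly with lubridate::floor_date (a sketch producing an equivalent tsibble):
# Sketch: truncate each timestamp to the hour, then average within each hour
pipe_alt <- rbind(pipe1_data, pipe2_data) |>
  rename(date_time = 'Date Time') |>
  mutate(date_time = floor_date(convertToDateTime(date_time), unit = "hour")) |>
  group_by(date_time) |>
  summarise(hour_avg = mean(WaterFlow)) |>
  as_tsibble(index = date_time)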
The variance is very inconsistent across this data set, so a Box-Cox transformation was applied to make it more uniform.
# Find lambda
lambda_pipe <- pipe_data |>
features(hour_avg, features = guerrero) |>
pull(lambda_guerrero)
# Apply Box-Cox transformation
pipe_data_bc <- pipe_data |>
mutate(b_c = box_cox(hour_avg, lambda_pipe)) |>
select(date_time, b_c)
pipe_data_bc |> autoplot(b_c)
With the data transformed, the unit-root features recommend no seasonal differencing and one order of ordinary differencing to obtain stationary data.
pipe_data_bc |> features(b_c, unitroot_nsdiffs)
## # A tibble: 1 × 1
## nsdiffs
## <int>
## 1 0
pipe_data_bc |>
features(b_c, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 1
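The ndiffs feature is based on successive KPSS tests, and the underlying statistic can be inspected directly (under the KPSS null of stationarity, a small p-value is evidence that differencing is needed):
# KPSS unit-root test on the undifferenced transformed series
pipe_data_bc |> features(b_c, unitroot_kpss)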
After one order of differencing, the data exhibits no trend or seasonality and appears stationary (albeit with occasional outliers). It can therefore be forecast with an ARIMA model.
pipe_data_bc |>
mutate(diff_bc = difference(b_c)) |>
autoplot(diff_bc)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
Since \(c=0\) and \(d=1\), the long-term forecasts stabilize at a non-zero constant (here, a level near the center of the Box-Cox transformed data), as per Section 9.5 of the textbook.
# Fill gaps in data
pipe_data_bc <- fill_gaps(pipe_data_bc)
# Generate forecast
pipe_prediction <- pipe_data_bc |>
model(ARIMA(b_c)) |>
forecast(h = 168)
pipe_prediction |> autoplot(pipe_data_bc, level = NULL)
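Inspecting the fitted model (a sketch; output not shown in the original) would confirm the selected differencing order and the absence of a constant:
# Check the selected order (d) and constant of the automatically fitted ARIMA
pipe_data_bc |>
  model(ARIMA(b_c)) |>
  report()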
# Produce final output
pipe_final <- pipe_prediction[,-c(1,3)] |>
rename(b_c = .mean) |>
mutate(hour_avg = inv_box_cox(b_c, lambda_pipe)) |>
select(date_time, hour_avg)
print(head(pipe_final))
## # A tsibble: 6 x 2 [1h] <?>
## date_time hour_avg
## <dttm> <dbl>
## 1 2015-12-04 00:00:00 29.3
## 2 2015-12-04 01:00:00 29.6
## 3 2015-12-04 02:00:00 29.6
## 4 2015-12-04 03:00:00 29.6
## 5 2015-12-04 04:00:00 29.6
## 6 2015-12-04 05:00:00 29.6
# Write forecast to a .csv file
write.csv(pipe_final, 'myrianthopoulos_pipe_forecast.csv')