library(fpp3)
## ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
## ✔ tibble      3.2.1     ✔ tsibble     1.1.4
## ✔ dplyr       1.1.3     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.0     ✔ feasts      0.3.1
## ✔ lubridate   1.9.3     ✔ fable       0.3.3
## ✔ ggplot2     3.4.4     ✔ fabletools  0.3.4
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(dplyr)
library(ggplot2)
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ readr   2.1.4
## ✔ purrr   1.0.2     ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(fable)
library(tidyr)
library(lubridate)
library(fabletools)
library(distributional)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following object is masked from 'package:tsibble':
## 
##     key
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(writexl)
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following object is masked from 'package:tsibble':
## 
##     index
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(seasonal)
## 
## Attaching package: 'seasonal'
## 
## The following object is masked from 'package:tibble':
## 
##     view

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.

kwh_data <- read_excel("/Users/briansingh/Desktop/CUNY/Spring 2024/Data 624/Project1/ResidentialCustomerForecastLoad-624.xlsx",col_names = TRUE)
names(kwh_data)[names(kwh_data) == "YYYY-MMM"] <- "Date"
summary(kwh_data)
##   CaseSequence       Date                KWH          
##  Min.   :733.0   Length:192         Min.   :  770523  
##  1st Qu.:780.8   Class :character   1st Qu.: 5429912  
##  Median :828.5   Mode  :character   Median : 6283324  
##  Mean   :828.5                      Mean   : 6502475  
##  3rd Qu.:876.2                      3rd Qu.: 7620524  
##  Max.   :924.0                      Max.   :10655730  
##                                     NA's   :1
#convert from character to monthly for time series
kwh_data$Date <- yearmonth(as.Date(as.yearmon(kwh_data$Date, "%Y-%b")))

#one missing value in KWH -- replace with median so no gaps in time series
kwh_data$KWH[is.na(kwh_data$KWH)] = median(kwh_data$KWH, na.rm = TRUE)

#case sequence has no significance in this
kwh_timeseries <- kwh_data %>%
    as_tsibble(index=Date) %>%
    select(-CaseSequence)
kwh_timeseries %>%
  pivot_longer(-Date) %>%
  ggplot(aes(x = Date, y = value, colour = name)) +
  geom_line() +
  facet_grid(name ~ ., scales = "free_y")

I can see seasonal data with a slight upwards trend after approximately 2011.

kwh_timeseries %>%
    gg_season(KWH)

July has a large outlier which we can address and normalize for modeling. Lets look at the decomposition to identify trend and seasonality.

kwh_timeseries %>%
  model(
    classical_decomposition(KWH, type = "multiplicative")
  ) %>%
  components() %>%
  autoplot() +
  labs(title = "Classical decomposition with type = multiplicative")
## Warning: Removed 6 rows containing missing values (`geom_line()`).

As seen above, definitely seasonal data with an upward trend beginning after 2010. It is not steadily increasing, so drifting would not be ideal for forecasting. We can use seasonal naive.

fit_kwh_snaive <- kwh_timeseries %>%
    model(SNAIVE(KWH))

fit_kwh_snaive %>% gg_tsresiduals()
## Warning: Removed 12 rows containing missing values (`geom_line()`).
## Warning: Removed 12 rows containing missing values (`geom_point()`).
## Warning: Removed 12 rows containing non-finite values (`stat_bin()`).

The residuals appear to be centered around zero and follow a normal distribution. Also, there appears to be constant variance, with the exception of a couple of outliers, which holds for a good model.

lambda <- kwh_timeseries %>%
    features(KWH,features = guerrero) %>%
    pull(lambda_guerrero)
kwh_timeseries %>%
  autoplot(box_cox(KWH, lambda))

Lambda of 0.1041 can be used to apply box-cox to normalize the data. However, did not appear very helpful.

kwh_snaive_forecast <- fit_kwh_snaive %>%
    forecast(h=12)

kwh_snaive_forecast
## # A fable: 12 x 4 [1M]
## # Key:     .model [1]
##    .model          Date                 KWH    .mean
##    <chr>          <mth>              <dist>    <dbl>
##  1 SNAIVE(KWH) 2014 Jan N(1.1e+07, 1.4e+12) 10655730
##  2 SNAIVE(KWH) 2014 Feb N(7681798, 1.4e+12)  7681798
##  3 SNAIVE(KWH) 2014 Mar N(6517514, 1.4e+12)  6517514
##  4 SNAIVE(KWH) 2014 Apr N(6105359, 1.4e+12)  6105359
##  5 SNAIVE(KWH) 2014 May N(5940475, 1.4e+12)  5940475
##  6 SNAIVE(KWH) 2014 Jun N(7920627, 1.4e+12)  7920627
##  7 SNAIVE(KWH) 2014 Jul N(8415321, 1.4e+12)  8415321
##  8 SNAIVE(KWH) 2014 Aug N(9080226, 1.4e+12)  9080226
##  9 SNAIVE(KWH) 2014 Sep   N(8e+06, 1.4e+12)  7968220
## 10 SNAIVE(KWH) 2014 Oct N(5759367, 1.4e+12)  5759367
## 11 SNAIVE(KWH) 2014 Nov N(5769083, 1.4e+12)  5769083
## 12 SNAIVE(KWH) 2014 Dec N(9606304, 1.4e+12)  9606304
kwh_snaive_forecast %>%
    autoplot(kwh_timeseries)

kwh_snaive_forecast_df <- data.frame(kwh_snaive_forecast)
write_xlsx(kwh_snaive_forecast_df, "forecast_kwh.xlsx")

It does appear to be a good model and follows the overall trend and seasonality of the original data.