#setting up WD/packages and looking at the columns/rows

setwd("C:/Users/Matthew/OneDrive/Desktop/predictive analytics")
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.5.3
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.3 ──
## ✔ tibble      3.3.1     ✔ tsibble     1.2.0
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.2     ✔ ggtime      0.2.0
## ✔ lubridate   1.9.4     ✔ feasts      0.5.0
## ✔ ggplot2     4.0.2     ✔ fable       0.5.0
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tsibble' was built under R version 4.5.3
## Warning: package 'tsibbledata' was built under R version 4.5.3
## Warning: package 'ggtime' was built under R version 4.5.3
## Warning: package 'feasts' was built under R version 4.5.3
## Warning: package 'fabletools' was built under R version 4.5.3
## Warning: package 'fable' was built under R version 4.5.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
glimpse(aus_retail)
## Rows: 64,532
## Columns: 5
## Key: State, Industry [152]
## $ State       <chr> "Australian Capital Territory", "Australian Capital Territ…
## $ Industry    <chr> "Cafes, restaurants and catering services", "Cafes, restau…
## $ `Series ID` <chr> "A3349849A", "A3349849A", "A3349849A", "A3349849A", "A3349…
## $ Month       <mth> 1982 Apr, 1982 May, 1982 Jun, 1982 Jul, 1982 Aug, 1982 Sep…
## $ Turnover    <dbl> 4.4, 3.4, 3.6, 4.0, 3.6, 4.2, 4.8, 5.4, 6.9, 3.8, 4.2, 4.0…
ggplot(aus_retail, aes(x = Month, y = Turnover, color = State)) +
         geom_line()

#overall line plot
#more precise data picks

aus_retail %>%
  filter(Industry == "Cafes, restaurants and catering services") %>%
  ggplot(aes(x = Month, y = Turnover, color = State)) +
  geom_line()

#model setup

retail_one <- aus_retail %>%
  filter(
    State == "New South Wales",
    Industry == "Cafes, restaurants and catering services"
  )
turnover_model <- retail_one %>%
  model(
    classical = classical_decomposition(Turnover, type = "additive")
  )
components(turnover_model)
## # A dable: 441 x 9 [1M]
## # Key:     State, Industry, .model [1]
## # :        Turnover = trend + seasonal + random
##    State   Industry .model    Month Turnover trend seasonal random season_adjust
##    <chr>   <chr>    <chr>     <mth>    <dbl> <dbl>    <dbl>  <dbl>         <dbl>
##  1 New So… Cafes, … class… 1982 Apr     61.8  NA     -7.71   NA             69.5
##  2 New So… Cafes, … class… 1982 May     60.8  NA     -5.64   NA             66.4
##  3 New So… Cafes, … class… 1982 Jun     58.7  NA    -22.7    NA             81.4
##  4 New So… Cafes, … class… 1982 Jul     60.3  NA     -9.23   NA             69.5
##  5 New So… Cafes, … class… 1982 Aug     56.1  NA     -4.07   NA             60.2
##  6 New So… Cafes, … class… 1982 Sep     58.1  NA     -0.896  NA             59.0
##  7 New So… Cafes, … class… 1982 Oct     53.9  59.9    7.91  -13.9           46.0
##  8 New So… Cafes, … class… 1982 Nov     61.2  60.2   11.5   -10.4           49.7
##  9 New So… Cafes, … class… 1982 Dec     75.7  60.5   51.2   -36.0           24.5
## 10 New So… Cafes, … class… 1983 Jan     54.2  60.7    1.43   -7.97          52.8
## # ℹ 431 more rows
components(turnover_model) %>%
  autoplot()
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).

stl_model <- retail_one %>%
  model(
    stl=STL(Turnover ~ trend(window = 7) +
              season(window = "periodic"),
            robust = TRUE))

components(stl_model) %>%
autoplot()

Turnover seems about to be the same, the trend in the decompositon method seems smoother compared to the STL decomposition. The seasonal components are almost the same in both methods, showing a strong and consistent yearly seasonal pattern in retail turnover. The remainder/random ones seem to differ a lot more, where they seem entirely different. I believe this is because STL leaves a different residual pattern of variation with LOESS to estimate seasonality trend, which would allow it to capture more complex changes in the series. Overall it appears that they both show similar trends/seasonal components, but STL is better overall because it is able to provide a better representation of the underlying structure of the data.

round(nrow(retail_one) * .7) #find what row to cut off the training
## [1] 309
#find cutoff date

retail_one %>%
  mutate(row = row_number())
## # A tsibble: 441 x 6 [1M]
## # Key:       State, Industry [1]
##    State           Industry                  `Series ID`    Month Turnover   row
##    <chr>           <chr>                     <chr>          <mth>    <dbl> <int>
##  1 New South Wales Cafes, restaurants and c… A3349709X   1982 Apr     61.8     1
##  2 New South Wales Cafes, restaurants and c… A3349709X   1982 May     60.8     2
##  3 New South Wales Cafes, restaurants and c… A3349709X   1982 Jun     58.7     3
##  4 New South Wales Cafes, restaurants and c… A3349709X   1982 Jul     60.3     4
##  5 New South Wales Cafes, restaurants and c… A3349709X   1982 Aug     56.1     5
##  6 New South Wales Cafes, restaurants and c… A3349709X   1982 Sep     58.1     6
##  7 New South Wales Cafes, restaurants and c… A3349709X   1982 Oct     53.9     7
##  8 New South Wales Cafes, restaurants and c… A3349709X   1982 Nov     61.2     8
##  9 New South Wales Cafes, restaurants and c… A3349709X   1982 Dec     75.7     9
## 10 New South Wales Cafes, restaurants and c… A3349709X   1983 Jan     54.2    10
## # ℹ 431 more rows
#train data setup
train_data <- retail_one %>%
  filter(Month <= yearmonth("2007 Dec"))
#test data setup
test_data <- retail_one %>%
  filter(Month > yearmonth("2007 Dec"))
#Model setup
models <- train_data %>%
  model(
    mean = MEAN(Turnover),
    naive = NAIVE(Turnover),
    snaive = SNAIVE(Turnover),
    drift = RW(Turnover ~ drift())
    )
#forecast for only the last 132 since we trained on the first 309
forecast <- models %>%
  forecast(h = 132)
#Plotting
forecast %>%
  autoplot(retail_one, level = NULL)

accuracy(forecast, test_data) %>% #get the entire tibble + MSE
  mutate(MSE = RMSE^2)
## # A tibble: 4 × 13
##   .model State    Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <chr>    <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 drift  New Sou… Cafes, … Test   43.6  105.  88.2  3.72  15.3   NaN   NaN 0.866
## 2 mean   New Sou… Cafes, … Test  355.   381. 355.  60.2   60.2   NaN   NaN 0.922
## 3 naive  New Sou… Cafes, … Test  126.   187. 152.  17.0   24.3   NaN   NaN 0.922
## 4 snaive New Sou… Cafes, … Test  148.   202. 165.  21.3   26.1   NaN   NaN 0.936
## # ℹ 1 more variable: MSE <dbl>

The drift model seems to be the most accurate of the bunch as it more closely aligns with how reality turned out. it had the lowest RMSE, MAE, MAPE and MSE. The mean model performed the worst out of all the models as from the graph it is nowhere near where the line comes out to be. The Naive and SNaive models were second and third best respectively. These results showcase that accounting for strong upward trend in retail turnover is important for producing results that resemble closely to reality.