#setting up WD/packages and looking at the columns/rows
setwd("C:/Users/Matthew/OneDrive/Desktop/predictive analytics")
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.5.3
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.3 ──
## ✔ tibble 3.3.1 ✔ tsibble 1.2.0
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.2 ✔ ggtime 0.2.0
## ✔ lubridate 1.9.4 ✔ feasts 0.5.0
## ✔ ggplot2 4.0.2 ✔ fable 0.5.0
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tsibble' was built under R version 4.5.3
## Warning: package 'tsibbledata' was built under R version 4.5.3
## Warning: package 'ggtime' was built under R version 4.5.3
## Warning: package 'feasts' was built under R version 4.5.3
## Warning: package 'fabletools' was built under R version 4.5.3
## Warning: package 'fable' was built under R version 4.5.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
glimpse(aus_retail)
## Rows: 64,532
## Columns: 5
## Key: State, Industry [152]
## $ State <chr> "Australian Capital Territory", "Australian Capital Territ…
## $ Industry <chr> "Cafes, restaurants and catering services", "Cafes, restau…
## $ `Series ID` <chr> "A3349849A", "A3349849A", "A3349849A", "A3349849A", "A3349…
## $ Month <mth> 1982 Apr, 1982 May, 1982 Jun, 1982 Jul, 1982 Aug, 1982 Sep…
## $ Turnover <dbl> 4.4, 3.4, 3.6, 4.0, 3.6, 4.2, 4.8, 5.4, 6.9, 3.8, 4.2, 4.0…
ggplot(aus_retail, aes(x = Month, y = Turnover, color = State)) +
geom_line()
#overall line plot
#more precise data picks
aus_retail %>%
filter(Industry == "Cafes, restaurants and catering services") %>%
ggplot(aes(x = Month, y = Turnover, color = State)) +
geom_line()
#model setup
retail_one <- aus_retail %>%
filter(
State == "New South Wales",
Industry == "Cafes, restaurants and catering services"
)
turnover_model <- retail_one %>%
model(
classical = classical_decomposition(Turnover, type = "additive")
)
components(turnover_model)
## # A dable: 441 x 9 [1M]
## # Key: State, Industry, .model [1]
## # : Turnover = trend + seasonal + random
## State Industry .model Month Turnover trend seasonal random season_adjust
## <chr> <chr> <chr> <mth> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 New So… Cafes, … class… 1982 Apr 61.8 NA -7.71 NA 69.5
## 2 New So… Cafes, … class… 1982 May 60.8 NA -5.64 NA 66.4
## 3 New So… Cafes, … class… 1982 Jun 58.7 NA -22.7 NA 81.4
## 4 New So… Cafes, … class… 1982 Jul 60.3 NA -9.23 NA 69.5
## 5 New So… Cafes, … class… 1982 Aug 56.1 NA -4.07 NA 60.2
## 6 New So… Cafes, … class… 1982 Sep 58.1 NA -0.896 NA 59.0
## 7 New So… Cafes, … class… 1982 Oct 53.9 59.9 7.91 -13.9 46.0
## 8 New So… Cafes, … class… 1982 Nov 61.2 60.2 11.5 -10.4 49.7
## 9 New So… Cafes, … class… 1982 Dec 75.7 60.5 51.2 -36.0 24.5
## 10 New So… Cafes, … class… 1983 Jan 54.2 60.7 1.43 -7.97 52.8
## # ℹ 431 more rows
components(turnover_model) %>%
autoplot()
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).
stl_model <- retail_one %>%
model(
stl=STL(Turnover ~ trend(window = 7) +
season(window = "periodic"),
robust = TRUE))
components(stl_model) %>%
autoplot()
Turnover seems about to be the same, the trend in the decompositon
method seems smoother compared to the STL decomposition. The seasonal
components are almost the same in both methods, showing a strong and
consistent yearly seasonal pattern in retail turnover. The
remainder/random ones seem to differ a lot more, where they seem
entirely different. I believe this is because STL leaves a different
residual pattern of variation with LOESS to estimate seasonality trend,
which would allow it to capture more complex changes in the series.
Overall it appears that they both show similar trends/seasonal
components, but STL is better overall because it is able to provide a
better representation of the underlying structure of the data.
round(nrow(retail_one) * .7) #find what row to cut off the training
## [1] 309
#find cutoff date
retail_one %>%
mutate(row = row_number())
## # A tsibble: 441 x 6 [1M]
## # Key: State, Industry [1]
## State Industry `Series ID` Month Turnover row
## <chr> <chr> <chr> <mth> <dbl> <int>
## 1 New South Wales Cafes, restaurants and c… A3349709X 1982 Apr 61.8 1
## 2 New South Wales Cafes, restaurants and c… A3349709X 1982 May 60.8 2
## 3 New South Wales Cafes, restaurants and c… A3349709X 1982 Jun 58.7 3
## 4 New South Wales Cafes, restaurants and c… A3349709X 1982 Jul 60.3 4
## 5 New South Wales Cafes, restaurants and c… A3349709X 1982 Aug 56.1 5
## 6 New South Wales Cafes, restaurants and c… A3349709X 1982 Sep 58.1 6
## 7 New South Wales Cafes, restaurants and c… A3349709X 1982 Oct 53.9 7
## 8 New South Wales Cafes, restaurants and c… A3349709X 1982 Nov 61.2 8
## 9 New South Wales Cafes, restaurants and c… A3349709X 1982 Dec 75.7 9
## 10 New South Wales Cafes, restaurants and c… A3349709X 1983 Jan 54.2 10
## # ℹ 431 more rows
#train data setup
train_data <- retail_one %>%
filter(Month <= yearmonth("2007 Dec"))
#test data setup
test_data <- retail_one %>%
filter(Month > yearmonth("2007 Dec"))
#Model setup
models <- train_data %>%
model(
mean = MEAN(Turnover),
naive = NAIVE(Turnover),
snaive = SNAIVE(Turnover),
drift = RW(Turnover ~ drift())
)
#forecast for only the last 132 since we trained on the first 309
forecast <- models %>%
forecast(h = 132)
#Plotting
forecast %>%
autoplot(retail_one, level = NULL)
accuracy(forecast, test_data) %>% #get the entire tibble + MSE
mutate(MSE = RMSE^2)
## # A tibble: 4 × 13
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 drift New Sou… Cafes, … Test 43.6 105. 88.2 3.72 15.3 NaN NaN 0.866
## 2 mean New Sou… Cafes, … Test 355. 381. 355. 60.2 60.2 NaN NaN 0.922
## 3 naive New Sou… Cafes, … Test 126. 187. 152. 17.0 24.3 NaN NaN 0.922
## 4 snaive New Sou… Cafes, … Test 148. 202. 165. 21.3 26.1 NaN NaN 0.936
## # ℹ 1 more variable: MSE <dbl>
The drift model seems to be the most accurate of the bunch as it more closely aligns with how reality turned out. it had the lowest RMSE, MAE, MAPE and MSE. The mean model performed the worst out of all the models as from the graph it is nowhere near where the line comes out to be. The Naive and SNaive models were second and third best respectively. These results showcase that accounting for strong upward trend in retail turnover is important for producing results that resemble closely to reality.