Food & Baverage: It’s Friday Night

knitr::include_graphics("SB.jpg")

Dibuat oleh Reza Lutfi Ismail

Background

Proyek ini berisikan tentang perkiraan dan analisa untuk pengambilan keputuan bisnis pada outlet dengan menggunakan methode Time Series & Forecasting yang memanfaatkan deret waktu.

Pola kemusiman pada industri makanan dan minuman sangatlah mempengaruhi perilaku pembelian dari pelanggan. Pada proyek kali ini, digunakan dataset untuk menganalisa jumlah pengunjung sehingga dapat embuat penilaian yang lebih baik di tahun 2018.

Data Preprocess

Import Library

# data wrangling 
library(readr)
## Warning: package 'readr' was built under R version 4.0.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(purrr)
## Warning: package 'purrr' was built under R version 4.0.3
library(zoo)
## Warning: package 'zoo' was built under R version 4.0.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(recipes)
## Warning: package 'recipes' was built under R version 4.0.3
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(imputeTS)
## Warning: package 'imputeTS' was built under R version 4.0.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'imputeTS'
## The following object is masked from 'package:zoo':
## 
##     na.locf
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.3
library(padr)

# data Set
library(fpp2)
## Warning: package 'fpp2' was built under R version 4.0.3
## -- Attaching packages ------------------------- fpp2 2.4 --
## v ggplot2   3.3.2     v fma       2.4  
## v forecast  8.13      v expsmooth 2.3
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'forecast' was built under R version 4.0.3
## 
# model and evaluation
library(forecast)
library(TTR)
## Warning: package 'TTR' was built under R version 4.0.3
library(tseries)
## Warning: package 'tseries' was built under R version 4.0.3
## 
## Attaching package: 'tseries'
## The following object is masked from 'package:imputeTS':
## 
##     na.remove
# visualization
library(ggplot2)
library(TSstudio)
## Warning: package 'TSstudio' was built under R version 4.0.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
options(scipen = 999)

Import Data

train <- read.csv("data_input/data-train.csv")
test <- read.csv("data_input/data-test.csv")

EDA

Untuk membuat data set supaya menjadi terurut, digunakan fungsi arrange() pada kolom transaction_date, sehingga waktu menjadi terurut

train <- train %>%
   arrange(transaction_date)

Demonstrated how to properly do data aggregation

untuk mengubah tipe data pada kolom transaction_date digunakan fungsi ymd_hms pada library lubridate dan mengetahui jumlah pengunjung pada masing-masing jam menggunakan summarise pada kolom receipt_number

# aggregation

train <- train %>%
   select(transaction_date, receipt_number) %>%
   mutate(transaction_date = ymd_hms(transaction_date),
          transaction_date = floor_date(transaction_date, unit = "hour")) %>%
   group_by(transaction_date) %>%
   summarise(visitor = n_distinct(receipt_number)) %>%
      ungroup()
## `summarise()` ungrouping output (override with `.groups` argument)

Demonstrated how to properly do time series padding

Untuk mengetahui pada jam keberapa visitor melakukan transaksi, dilakukan padding dengan fungsi pad dari library(padr)

# padding

set.seed(100)
train <- train  %>%
   pad(start_val = ymd_hms("2017-12-01 00:00:00"),
       end_val = ymd_hms("2018-02-18 23:00:00")) %>%
   mutate(visitor = replace_na(visitor, 0)) %>%
   mutate(operational_hour = hour(transaction_date)) %>% 
   filter(operational_hour %in% c(10:22)) 
## pad applied on the interval: hour

Karena pada kasus Time Series, nilai sangatlah penting, untuk mengecek apakah ada NA atau tidak, kita dapat menggunakan fungsi colSums

# check NA 

colSums(is.na(train))
## transaction_date          visitor operational_hour 
##                0                0                0

Seasonality Analysis

Compared multipled time series decomposition approach

Pada step ini, saya akan menganalisa bentuk dari Time Series untuk melihat bagaimana bentuk dari Time Series Untuk menampilkan plot pada data train yang sudah di plot, saya menggunakan ggplot dan ggplotly

# cross validation

val <- tail(train, 13*7) # data validation 
train <- head(train, nrow(train) - 13*7) # data train
# Plot data Train & Validation 

plot_train <- train %>%
   ggplot(aes(x = transaction_date, y = visitor)) +
   geom_line() +
   scale_x_datetime(name = "Transaction Date", 
                    date_breaks = "2 weeks", 
                    expand = expansion(mult = 0.05, add = 0.05)) +
   scale_y_continuous(breaks = seq(0, 400, 50), expand = expansion(mult = 0.05, add = 0.05)) +
   geom_line(data = val, 
             aes(color = "red"), 
             show.legend = F)
  
ggplotly(plot_train)

Reported interpretable hourly and weekly seasonality

berdasarkan plot diatas, terlihat bahwa ada perbedaan peak pada pola seasonal yang mengartikan bahwa terdapat pola seasonal yang belum berhasil ditangkap. untuk mengetahui lebih lanjut, kita dapat membuat objek multi seasonal time series dnegan menggunakan fungsi msts

# buat object multi seasonal time series

multi <- msts(train$visitor, 
              seasonal.periods = c(13, # Jam operasi dalam sehari 
                                   13*7)) # jam oeprasi dalam seminggu 

untuk melihat decompose, menggunakan fungsi decomp untuk interpretasi perjam dan perminggu

# Decompose

multi_decomp <- multi %>%
   mstl()

multi_decomp %>%
   tail(13*7*4) %>%
   autoplot()

Model Fitting and Evaluation

Demnstrated how to prepare cross validation data for this case

Untuk dapat melakukan training model, maka perlu split data pada data train yaitu menjadi data train dan data validation

# cross validation

val <- tail(train, 13*7) # data validation 
train <- head(train, nrow(train) - 13*7) # data train

Demonstrated how to properly do model fitting and evaluation

stlf

Membuat model forecast dengan fungsi stlf()

# buat model menggunakan stlf 
stlf <- stlf(y = multi, h = 13*7)
# forecast 
stlf_forecast <- forecast(stlf, h = 13*7)
# test accuracy terhadap data validation
accuracy(stlf, val$visitor)
##                       ME     RMSE      MAE  MPE MAPE      MASE      ACF1
## Training set -0.01769208 5.331640 4.058445  NaN  Inf 0.4076074 0.1382671
## Test set     -4.40404904 6.815271 5.678190 -Inf  Inf 0.5702854        NA

ARIMA

Membuat model forecast dengan Multi-Seasonal ARIMA

# buat model arima menggunakan arima
arima <- stlm(multi, method = "arima")
# forecast
arima_forecast <- forecast(arima, h = 13*7)
# test accuracy terhadap validation 
accuracy(arima_forecast, val$visitor)
##                      ME     RMSE      MAE MPE MAPE      MASE         ACF1
## Training set -0.0162468 5.186077 3.928382 NaN  Inf 0.3945446 0.0007599934
## Test set     -0.4245888 5.427942 4.232354 NaN  Inf 0.4250738           NA

Holt-Winter

Membuat model dengan Multi-seasonal Holt-Winter

# buat model menggunakan ets
hw <- stlm(multi, method = "ets")
# forecast
hw_forecast <- forecast(hw, h = 13*7)
# test accuracy terhadap validation 
accuracy(hw_forecast, val$visitor)
##                       ME     RMSE      MAE  MPE MAPE      MASE      ACF1
## Training set -0.01769208 5.331640 4.058445  NaN  Inf 0.4076074 0.1382671
## Test set     -4.40404904 6.815271 5.678190 -Inf  Inf 0.5702854        NA

TBATS

Membuat model dengan TBATS model

# buat model menggunakan tbats
 tbats <- multi %>%
   tbats(use.box.cox = F,
         use.trend = T,
         use.damped.trend = T)
# forecast 
 tbats_forecast <- forecast(tbats, h = 13*7)
# test accuracy terhadap validation 
 accuracy(tbats_forecast, val$visitor)
##                      ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set -0.1608187 6.564820 5.037088  NaN  Inf 0.5058967 -0.00079954
## Test set     -4.4869188 8.251937 6.561895 -Inf  Inf 0.6590398          NA

Compared multiple model specifications

berdasarkan dari ke empat mdoel yang sudah dibuat, didapat bahwa model ARIMA menghasilkan nilai MAE yang paling kecil, yaitu pada training sebesar 3.9 dan tes sebesar 4.2

maka, saya akan menggunakan model ARIMA

Evaluate model

Uji Assumption

# residual autocorrelation
Box.test(arima$residuals, type = "Ljung-Box",
         lag = 13*7)
## 
##  Box-Ljung test
## 
## data:  arima$residuals
## X-squared = 183.45, df = 91, p-value = 0.0000000335

Uji Normality shapiro.test

# Normality

shapiro.test(arima$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  arima$residuals
## W = 0.99028, p-value = 0.00000643

dari hasil uji assumption dan shapiro, disimpulkan bahwa model belum bisa memenuhi dari uji tersebut. maka dari itu, untuk tetap dapat menggunakan model tersebut, dilakukan transformasi data pada data train dan data validation

# tranformasi data train dan val
rec <- recipe(visitor ~., data = train) %>%
   step_sqrt(all_outcomes()) %>% 
   step_center(all_outcomes()) %>%
   step_scale(all_outcomes()) %>%
   prep()

# ubah data train
train_rec <- juice(rec)

# ubah data val
val_rec <- bake(rec, val)
# Membuat fungsi revert

rec_rev <- function(x, rec) {
   
   means <- rec$steps[[2]]$means[["visitor"]]
   sds <- rec$steps[[3]]$sds[["visitor"]]
   
   x <- (x * sds + means)^2
   x <- round(x)
   x <- ifelse(x < 0, 0, x)
   x
   
}

tahap selanjutnya yaitu membuat objek multi time series pada data train rec

# Membuat object MTS based on data train and val rec

multi_rec <- msts(train_rec$visitor, 
                  seasonal.periods = c(13, # Jam buka per hari
                                       13*7)) # jam buka per minggu

selanjutnya yaitu melihat decompose pada multi time series rec

# Decompose
multi_decomp_rec <- multi_rec %>%
   mstl()

multi_decomp_rec %>%
   tail(13*7*4) %>%
   autoplot()

dari plot diatas, dapat dilihat bahwa error pada masing-masing plot decompose sangatlah kecil

tahap selanjutnya adalah membuat model multi seasonal pada masing-masing data rec

ARIMA

# buat mdoel arima pada data multi rec
arima_rec <- stlm(multi_rec, method = "arima", )
# forecast 
arima_rec_forecast <- forecast(arima_rec, h = 13*7)
# revert 
rec_rev_arima <- rec_rev(arima_rec_forecast$mean, rec = rec)
# test accuracy
accuracy(rec_rev_arima, val$visitor)
##                  ME     RMSE      MAE       MPE    MAPE
## Test set -0.1428571 7.268024 5.505495 -12.78146 27.6622

Hol-Winter

# buat model holt winter pada data multi rec
hw_rec <- stlm(multi_rec, method = "ets")
# forecast 
hw_rec_forecast <- forecast(hw_rec, h=13*7)
# revert
rec_rev_hw <- rec_rev(hw_rec_forecast$mean, rec = rec)
# tes accuracy 
accuracy(rec_rev_hw, val$visitor)
##                 ME     RMSE      MAE  MPE MAPE
## Test set -6.483516 9.748485 7.736264 -Inf  Inf

TBATS

Membuat model dengan TBATS method based on rec

# buat model tbats pada data rec 
tbats_mod_rec <- multi_rec %>%
   tbats(use.box.cox = F,
         use.trend = T,
         use.damped.trend = T)
# forecast 
tbats_rec_forecast <- forecast(tbats_mod_rec, h = 13*7)
# revert
rec_rev_tbats <- rec_rev(tbats_rec_forecast$mean, rec = rec)
# tes accuracy 
accuracy(rec_rev_tbats, val$visitor)
##                 ME     RMSE     MAE  MPE MAPE
## Test set -7.461538 10.85114 8.78022 -Inf  Inf

Prediction Performance

Validation dataset

Dari model hasil transformasi, didapat nilai MAE terkecil adalah pada model ARIMA

# buat mdoel arima pada data multi rec
arima_rec <- stlm(multi_rec, method = "arima", )
# forecast 
arima_rec_forecast <- forecast(arima_rec, h = 13*7)

rec_rev_arima <- rec_rev(arima_rec_forecast$mean, rec = rec)
# test accuracy
accuracy(rec_rev_arima, val$visitor)
##                  ME     RMSE      MAE       MPE    MAPE
## Test set -0.1428571 7.268024 5.505495 -12.78146 27.6622

Test Dataset

knitr::include_graphics("capstone.JPG")

Conclusion

Assumption

# Residual Autocorrelation

Box.test(arima_rec$residuals, type = "Ljung-Box",
         lag = 13*7)
## 
##  Box-Ljung test
## 
## data:  arima_rec$residuals
## X-squared = 196.72, df = 91, p-value = 0.0000000009301

terlihata bahwa dari uji assumption diatas bahwa model belum memenuhi uji residual autocorrelation yang menandakan bahwa adanya korelasi yang kuat antar prediktor

Normality

# Normality

shapiro.test(arima_rec$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  arima_rec$residuals
## W = 0.99679, p-value = 0.08174

dari uji normality diatas, terlihat bahwa model sudah memenuhi uji normalitas, yaitu p value > 0.05 yang mengartikan bahwa data terdistribusi dengan normal

Highest visitor

create final data frame

# Create submission data
submission <- test %>% 
  mutate(visitor = rec_rev_arima)

# save data
write.csv(submission, "submission-reza.csv", row.names = F)

# check first 3 data
head(submission, 3)
submission_weekday <- submission %>% 
  mutate(Date = date(datetime)) %>% 
  group_by(Date) %>% 
  summarise(Visitor = sum(visitor)) %>% 
  mutate(Days = wday(Date, label = T, abbr = F)) 
## `summarise()` ungrouping output (override with `.groups` argument)
plot_weekday <- submission_weekday %>% 
  ggplot(aes(x = Days,
             y = Visitor)) +
  geom_point()+
  theme_minimal() +
  scale_y_continuous(limits = c(100, 500))

ggplotly(plot_weekday)

dari data submission, terlihat bahwa visitor tertinggi terdapat pada hari sabtu