library(dplyr) # for data wrangling
library(lubridate) # to dea with date
library(padr) # for padding
library(forecast) # time series library
library(tseries) # for adf.test
library(MLmetrics) # calculate error
library(zoo) #imputation missing value
library(tseries) # adf.test
library(ggplot2) # for plotting
library(ggsci)train <- read.csv("data/data-train.csv")
test <- read.csv("data/data-test.csv")train %>% anyNA()## [1] FALSE
test %>% anyNA()## [1] TRUE
Data Process
head(train)## transaction_date receipt_number item_id item_group item_major_group
## 1 2017-12-01T13:32:46Z A0026694 I10100139 noodle_dish food
## 2 2017-12-01T13:32:46Z A0026694 I10500037 drinks beverages
## 3 2017-12-01T13:33:39Z A0026695 I10500044 drinks beverages
## 4 2017-12-01T13:33:39Z A0026695 I10400009 side_dish food
## 5 2017-12-01T13:33:39Z A0026695 I10500046 drinks beverages
## 6 2017-12-01T13:35:59Z A0026696 I10300002 pastry food
## quantity price_usd total_usd payment_type sales_type
## 1 1 7.33 7.33 cash dine_in
## 2 1 4.12 4.12 cash dine_in
## 3 1 2.02 2.02 cash dine_in
## 4 1 5.60 5.60 cash dine_in
## 5 1 3.01 3.01 cash dine_in
## 6 1 4.86 4.86 cash dine_in
The dataset includes information about:
transaction_date: The timestamp of a transaction receipt_number: The ID of a transaction item_id: The ID of an item in a transaction item_group: The group ID of an item in a transaction item_major_group: The major-group ID of an item in a transaction quantity: The quantity of purchased item price_usd: The price of purchased item total_usd: The total price of purchased item payment_type: The payment method sales_type: The sales method
Case
Membuat model forecasting pada data food and beverages untuk mengetahui seasonality dan visitors per tiap jam nya, dengan hasil prediksi yaitu untuk 7 hari ke depan dengan jam operasional 10.00 - 22.00 atau 13 jam satu hari
Ketika membuat time series, kita hanya membutuhkan kolom waktu sebagai target variabel dan kolom value sebagai prediktor, maka kita akan melakukan data wrangling dengan mengambil kolom transaction_date yang dikonversi kedalam bentuk hms_h atau membulatkan menjadi bentuk jam. lalu dimasukan ke kolom baru bernamadatetime dan juga mengambil kolom receipt_number sebagai target value atau bukti transaksi customer.
fnb_train <-train %>%
mutate(transaction_date = ymd_hms(transaction_date),
datetime = floor_date(transaction_date, unit = "hours")) %>%
select(transaction_date, receipt_number, datetime)Selanjutnya kita perlu melakukan data agregation dengan membuat group_by pada kolom datetime dan melakukan summarise pada kolom visitor, lalu menggunakan fungsi n_distinct yang membuat receipt_number menjadi unique value
fnb_train <- fnb_train %>%
group_by(datetime) %>%
summarise(visitor = n_distinct(receipt_number))
head(fnb_train,14)## # A tibble: 14 x 2
## datetime visitor
## <dttm> <int>
## 1 2017-12-01 13:00:00 16
## 2 2017-12-01 14:00:00 38
## 3 2017-12-01 15:00:00 27
## 4 2017-12-01 16:00:00 29
## 5 2017-12-01 17:00:00 44
## 6 2017-12-01 18:00:00 50
## 7 2017-12-01 19:00:00 66
## 8 2017-12-01 20:00:00 70
## 9 2017-12-01 21:00:00 63
## 10 2017-12-01 22:00:00 63
## 11 2017-12-01 23:00:00 51
## 12 2017-12-02 00:00:00 29
## 13 2017-12-02 01:00:00 17
## 14 2017-12-02 10:00:00 10
fnb_train %>%
pad() %>%
anyNA()## [1] TRUE
Saat melakukan padding terdapat NA pada data fnb tersebut, karena terdapat missing data dalam hitungan 24 jam, selanjutnya nilai NA akan di replace dengan nilai akhir dari visitor sebelumnya untuk memastikan data benar - benar terisi penuh sesuai dengan time series.
fnb_clean <- fnb_train %>%
arrange(datetime) %>%
pad() %>%
na.locf()%>%
as.data.frame()## pad applied on the interval: hour
fnb_clean %>% head()## datetime visitor
## 1 2017-12-01 13:00:00 16
## 2 2017-12-01 14:00:00 38
## 3 2017-12-01 15:00:00 27
## 4 2017-12-01 16:00:00 29
## 5 2017-12-01 17:00:00 44
## 6 2017-12-01 18:00:00 50
fnb_clean <- fnb_clean %>%
filter(hour(datetime) >=10 & hour(datetime) <=22)range data
range(fnb_clean$datetime)## [1] "2017-12-01 13:00:00 UTC" "2018-02-18 22:00:00 UTC"
#Seasonality Analysis
fnb_ts <- ts(fnb_clean$visitor,
frequency = 13)fnb_ts %>%
head(13*7*4)%>%
decompose() %>%
autoplot() Masih terihat sesional pada random, dan pola pada trend juga belum memiliki hasil yang smooth, sehingga dapat dikatakan multiseasonal
Multi Seasonality
Decompose time series tidak dapat digunakan bila kita ingin mendapatkan multiseasonal seperti hourly and weekly, oleh karena itu untuk mendapatkan seasonal hourly and weekly kita akan menggunakan fungsi mstl()
#daily and weekly seasonality
fnb_clean$visitor %>%
msts(seasonal.periods = c(13,13*7)) %>%
mstl() %>%
head(13*7*4) %>%
autoplot()#for data cross-validation
fnb_msts <- fnb_clean$visitor %>%
msts(seasonal.periods = c(13,13*7))
# for visualization hourly
fnb_viz_hour <- fnb_ts %>%
decompose()
# for visualization hourly & weekly
fnb_viz <- fnb_msts %>%
mstl()Model Fitting and Evaluation
Pada model ini saya akan menggunakan multiseasonality dikarenakan hasil dari plot memperlihatkan pola seasonal lebih dari satu
Cross-Validation
Pada case ini kita akan melakukan forecasting dengan data test selama 7 hari, sehingga kita akan membuat cross validaiton dengan data test yang diambil dari data train sebanyak 7 hari atau 7*13 data
fnb_train <- function(x){
train <- head(x, length(x) - 13*7)
}
fnb_test <- function(x){
test <- tail(x, 13*7)
}
train_1 <- fnb_train(fnb_msts)
test_1 <- fnb_test(fnb_msts)Modeling
Modeling akan membandingkan antara Arima, Holtwinters dan Tbats
# HoltWinters
fnb_Hw <- stlm(train_1, method = "ets", lambda = 0, s.window = 13)
# Arima
fnb_arima <- stlm(train_1, method = "arima", lambda = 0, s.window = 13)
fnb_tbats <- train_1 %>%
tbats(use.trend = T, use.damped.trend = T,use.box.cox = F)# function to combine the accuracy from forecast
compare_forecast <- function(x, test){
lapply(x, function(x) forecast::accuracy(x, test)["Test set", ]) %>%
lapply(., function(x) x %>% t() %>% as.data.frame) %>%
bind_rows() %>%
mutate(model = names(x)) %>%
select(model, everything())
}Forecasting
forecast_hw <- forecast(object = fnb_Hw, h = length(test_1))
forecast_arima <- forecast(object = fnb_arima, h = length(test_1))
forecast_tbats <- forecast(object = fnb_tbats, h = length(test_1))Evaluation
forecast_list <- list(
"HoltWinters" = forecast_hw,
"Arima" = forecast_arima,
"TBATS" = forecast_tbats
)
compare_forecast(forecast_list,test_1 )## model ME RMSE MAE MPE MAPE MASE
## 1 HoltWinters -4.368073 8.688166 6.782272 -32.60171 41.40435 1.0403377
## 2 Arima -1.289049 7.346271 5.801359 -19.90875 34.71612 0.8898747
## 3 TBATS -3.874787 8.248994 6.827708 -41.53810 49.34678 1.0473071
## ACF1 Theil's U
## 1 0.3152556 0.9127264
## 2 0.3115878 0.7706868
## 3 0.4732729 0.7469707
fnb_msts %>%
autoplot(series = "Actual")+
autolayer(forecast_arima$mean, series = "ARIMA")Prediction Performance
Conclusion
Auto Correlation
Autocorrelation assumption, using Ljung-box function
Box.test(fnb_arima$residuals, type = "Ljung-Box")##
## Box-Ljung test
##
## data: fnb_arima$residuals
## X-squared = 0.0027596, df = 1, p-value = 0.9581
Result : p-value=0.9578 > 0.05 (alpha), which can be concluded that the residuals has no-autocorrelation
shapiro.test(x = forecast_arima$residuals)##
## Shapiro-Wilk normality test
##
## data: forecast_arima$residuals
## W = 0.96767, p-value = 1.2e-13
hist(forecast_arima$residuals, breaks = 30)DATA TEST
fnb.test <- test
head(fnb.test)## datetime visitor
## 1 2018-02-19T10:00:00Z NA
## 2 2018-02-19T11:00:00Z NA
## 3 2018-02-19T12:00:00Z NA
## 4 2018-02-19T13:00:00Z NA
## 5 2018-02-19T14:00:00Z NA
## 6 2018-02-19T15:00:00Z NA
arima_test <- fnb_arimaarima_forecast <- forecast(arima_test, h = 91)arima_forecast$mean %>%
as.data.frame() %>% head(6)## x
## 1 7.187968
## 2 11.863034
## 3 19.273427
## 4 12.931562
## 5 14.777919
## 6 21.251455
fnb.test$visitor <- arima_forecast$mean
fnb.test %>% head()## datetime visitor
## 1 2018-02-19T10:00:00Z 7.187968
## 2 2018-02-19T11:00:00Z 11.863034
## 3 2018-02-19T12:00:00Z 19.273427
## 4 2018-02-19T13:00:00Z 12.931562
## 5 2018-02-19T14:00:00Z 14.777919
## 6 2018-02-19T15:00:00Z 21.251455
write.csv(fnb.test, file = "fnb_arima_tesT.csv")read.csv("fnb_arima_test.csv") %>% head()## X datetime visitor
## 1 1 2018-02-19T10:00:00Z 7.187968
## 2 2 2018-02-19T11:00:00Z 11.863034
## 3 3 2018-02-19T12:00:00Z 19.273427
## 4 4 2018-02-19T13:00:00Z 12.931562
## 5 5 2018-02-19T14:00:00Z 14.777919
## 6 6 2018-02-19T15:00:00Z 21.251455