R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

library(dplyr) # data wrangling
library(lubridate) # date manipulation
library(padr) # complete data frame
library(zoo) # Missing value imputation
library(forecast) # time series library
library(TTR) # for Simple moving average function
library(MLmetrics) # calculate error
library(tseries) # adf.test
library(fpp) # data for forecasting: principles and practice
library(TSstudio) # mempercantik visualisasi timeseries
library(ggplot2)
library(tidyr)

Data Preprocess

data_test <- read.csv("data/data-test.csv")
data_train <- read.csv("data/data-train.csv")

str(data_test)
## 'data.frame':    91 obs. of  2 variables:
##  $ datetime: chr  "2018-02-19T10:00:00Z" "2018-02-19T11:00:00Z" "2018-02-19T12:00:00Z" "2018-02-19T13:00:00Z" ...
##  $ visitor : logi  NA NA NA NA NA NA ...
#data_test EDA
# Convert datetime to proper datetime format
data_test$datetime <- ymd_hms(data_test$datetime)
# Convert transaction_date to datetime format
data_train$transaction_date <- ymd_hms(data_train$transaction_date)
data_train %>% head()
##      transaction_date receipt_number   item_id  item_group item_major_group
## 1 2017-12-01 13:32:46       A0026694 I10100139 noodle_dish             food
## 2 2017-12-01 13:32:46       A0026694 I10500037      drinks        beverages
## 3 2017-12-01 13:33:39       A0026695 I10500044      drinks        beverages
## 4 2017-12-01 13:33:39       A0026695 I10400009   side_dish             food
## 5 2017-12-01 13:33:39       A0026695 I10500046      drinks        beverages
## 6 2017-12-01 13:35:59       A0026696 I10300002      pastry             food
##   quantity price_usd total_usd payment_type sales_type
## 1        1      7.33      7.33         cash    dine_in
## 2        1      4.12      4.12         cash    dine_in
## 3        1      2.02      2.02         cash    dine_in
## 4        1      5.60      5.60         cash    dine_in
## 5        1      3.01      3.01         cash    dine_in
## 6        1      4.86      4.86         cash    dine_in
data_hourly <- data_train %>%
  mutate(hour = floor_date(transaction_date, unit = "hour")) %>%
  group_by(hour) %>%
  summarise(visitor = n_distinct(receipt_number))
data_hourly %>% head()
## # A tibble: 6 × 2
##   hour                visitor
##   <dttm>                <int>
## 1 2017-12-01 13:00:00      16
## 2 2017-12-01 14:00:00      38
## 3 2017-12-01 15:00:00      27
## 4 2017-12-01 16:00:00      29
## 5 2017-12-01 17:00:00      44
## 6 2017-12-01 18:00:00      50

Aggregated data

data_hourly %>% head()
## # A tibble: 6 × 2
##   hour                visitor
##   <dttm>                <int>
## 1 2017-12-01 13:00:00      16
## 2 2017-12-01 14:00:00      38
## 3 2017-12-01 15:00:00      27
## 4 2017-12-01 16:00:00      29
## 5 2017-12-01 17:00:00      44
## 6 2017-12-01 18:00:00      50
mean_visitor <- median(data_hourly$visitor, na.rm = TRUE)
new_hours <- seq(from = ymd_hms("2017-12-01 00:00:00"), to = ymd_hms("2017-12-01 12:00:00"), by = "1 hour")

new_rows <- tibble(
  hour = new_hours,
  visitor = rep(mean_visitor, length(new_hours))
)

data_hourly <- bind_rows(new_rows, data_hourly) 

data_hourly <- data_hourly %>%
  filter(hour(hour) >= 10 & hour(hour) <= 22)

data_hourly <- data_hourly %>%
  padr::pad() %>%
  fill(visitor, .direction = "downup")
## pad applied on the interval: hour

The data started from 13 hours so had to add the inital 3 hours of opening and had to convert missing values in visitors to mean.

Time series padding ensures continuous data by filling in any missing time intervals. This is crucial for time series analysis, especially when forecasting or visualizing data.

Seasonality Analysis

  1. Classical Decomposition
data_hourly_ts <- ts(data_hourly$visitor, start = c(2017, 12,01),frequency =  24)
decomp_classical <- decompose(data_hourly_ts, type = "additive")
autoplot(decomp_classical) + 
  ggtitle("Classical Decomposition")

Model Fitting and Evaluation

Cross-validation

train <- head(data_hourly_ts, length(data_hourly_ts) - 10)
valid <- tail(data_hourly_ts,10)

Preprocessing steps

adf.test(train)
## Warning in adf.test(train): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train
## Dickey-Fuller = -15.054, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary

Checking the p value we can see that the data is stationary

# ACF plot
acf(train, main = "ACF Plot")

# PACF plot
pacf(train, main = "PACF Plot")

Types of models

arima_model <- auto.arima(train)
arima_forecast <- forecast(arima_model, h = 91)
accuracy_arima <- accuracy(arima_forecast, valid)
print(accuracy_arima)
##                       ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.01419551 7.145289 4.634181 -9.073794 23.79750 0.5564935
## Test set     -1.26529392 6.889579 5.063668 -2.865970 13.19702 0.6080682
##                       ACF1 Theil's U
## Training set -0.0006250333        NA
## Test set      0.2175880048 0.4726814
holt_model <- holt(train,h = 91)
holt_forecast <- forecast(holt_model, h = 91)
accuracy_holt <- accuracy(holt_forecast, valid)
print(accuracy_holt)
##                       ME     RMSE       MAE       MPE     MAPE     MASE
## Training set  0.00176041 10.34769  5.276849 -23.44783 39.38291 0.633668
## Test set     17.60070991 22.49881 17.600710  39.57402 39.57402 2.113573
##                    ACF1 Theil's U
## Training set 0.01073758        NA
## Test set     0.62925087  1.742733
naive_forecast <- naive(train, h = 91)
accuracy_naive <- accuracy(naive_forecast, valid)
print(accuracy_naive)
##                        ME     RMSE       MAE       MPE     MAPE      MASE
## Training set -0.002634352 10.39735  5.313488 -22.48203 39.81005 0.6380677
## Test set     17.100000000 22.10656 17.100000  38.10264 38.10264 2.0534456
##                     ACF1 Theil's U
## Training set -0.08722958        NA
## Test set      0.62911508  1.705841
# Plot ARIMA forecast with training and validation data
autoplot(train, series = "Training Data") +
  autolayer(arima_forecast, series = "ARIMA Forecast", PI = FALSE) +
  autolayer(valid, series = "Validation Data") +
  ggtitle("ARIMA Forecast vs Actual") +
  xlab("Time") +
  ylab("Value") +
  guides(colour = guide_legend(title = "Series"))

autoplot(train, series = "Training Data") +
  autolayer(holt_forecast, series = "Holt's Forecast", PI = FALSE) +
  autolayer(valid, series = "Validation Data") +
  ggtitle("Holt's Forecast vs Actual") +
  xlab("Time") +
  ylab("Value") +
  guides(colour = guide_legend(title = "Series"))

autoplot(train, series = "Training Data") +
  autolayer(naive_forecast, series = "Naive Forecast", PI = FALSE) +
  autolayer(valid, series = "Validation Data") +
  ggtitle("Naive Forecast vs Actual") +
  xlab("Time") +
  ylab("Value") +
  guides(colour = guide_legend(title = "Series"))

Model Assessment and Fit

Evaluating Different Model Specifications Four forecasting models were employed : ARIMA, Naive, and Holt’s Linear Trend.

With the lowest RMSE, MAE, and MAPE in the test set, ARIMA models perform the best overall. While Holt’s and Naive models function similarly, ARIMA are more accurate. When it comes to MAE and MAPE, ARIMA performs the best.

Prediction Performance

forecast_mod = as.data.frame(arima_forecast)
submission <- data_test %>% mutate(visitor = round(forecast_mod$`Point Forecast`))
write.csv(submission, "submission.csv", row.names = FALSE)

Assumption check

Box.test(arima_forecast$residuals)
## 
##  Box-Pierce test
## 
## data:  arima_forecast$residuals
## X-squared = 0.00074188, df = 1, p-value = 0.9783

p-value is > 0.05 it means all our model have No autocorrelation in the forecast errors

Conclusion

Box.Test was done to check No autocorrelation and p-value is > 0.05 it means all our model have No autocorrelation in the forecast errors.

Visitors tend to prefer evening times.