This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
library(dplyr) # data wrangling
library(lubridate) # date manipulation
library(padr) # complete data frame
library(zoo) # Missing value imputation
library(forecast) # time series library
library(TTR) # for Simple moving average function
library(MLmetrics) # calculate error
library(tseries) # adf.test
library(fpp) # data for forecasting: principles and practice
library(TSstudio) # mempercantik visualisasi timeseries
library(ggplot2)
library(tidyr)
Data Preprocess
data_test <- read.csv("data/data-test.csv")
data_train <- read.csv("data/data-train.csv")
str(data_test)
## 'data.frame': 91 obs. of 2 variables:
## $ datetime: chr "2018-02-19T10:00:00Z" "2018-02-19T11:00:00Z" "2018-02-19T12:00:00Z" "2018-02-19T13:00:00Z" ...
## $ visitor : logi NA NA NA NA NA NA ...
#data_test EDA
# Convert datetime to proper datetime format
data_test$datetime <- ymd_hms(data_test$datetime)
# Convert transaction_date to datetime format
data_train$transaction_date <- ymd_hms(data_train$transaction_date)
data_train %>% head()
## transaction_date receipt_number item_id item_group item_major_group
## 1 2017-12-01 13:32:46 A0026694 I10100139 noodle_dish food
## 2 2017-12-01 13:32:46 A0026694 I10500037 drinks beverages
## 3 2017-12-01 13:33:39 A0026695 I10500044 drinks beverages
## 4 2017-12-01 13:33:39 A0026695 I10400009 side_dish food
## 5 2017-12-01 13:33:39 A0026695 I10500046 drinks beverages
## 6 2017-12-01 13:35:59 A0026696 I10300002 pastry food
## quantity price_usd total_usd payment_type sales_type
## 1 1 7.33 7.33 cash dine_in
## 2 1 4.12 4.12 cash dine_in
## 3 1 2.02 2.02 cash dine_in
## 4 1 5.60 5.60 cash dine_in
## 5 1 3.01 3.01 cash dine_in
## 6 1 4.86 4.86 cash dine_in
data_hourly <- data_train %>%
mutate(hour = floor_date(transaction_date, unit = "hour")) %>%
group_by(hour) %>%
summarise(visitor = n_distinct(receipt_number))
data_hourly %>% head()
## # A tibble: 6 × 2
## hour visitor
## <dttm> <int>
## 1 2017-12-01 13:00:00 16
## 2 2017-12-01 14:00:00 38
## 3 2017-12-01 15:00:00 27
## 4 2017-12-01 16:00:00 29
## 5 2017-12-01 17:00:00 44
## 6 2017-12-01 18:00:00 50
Aggregated data
data_hourly %>% head()
## # A tibble: 6 × 2
## hour visitor
## <dttm> <int>
## 1 2017-12-01 13:00:00 16
## 2 2017-12-01 14:00:00 38
## 3 2017-12-01 15:00:00 27
## 4 2017-12-01 16:00:00 29
## 5 2017-12-01 17:00:00 44
## 6 2017-12-01 18:00:00 50
mean_visitor <- median(data_hourly$visitor, na.rm = TRUE)
new_hours <- seq(from = ymd_hms("2017-12-01 00:00:00"), to = ymd_hms("2017-12-01 12:00:00"), by = "1 hour")
new_rows <- tibble(
hour = new_hours,
visitor = rep(mean_visitor, length(new_hours))
)
data_hourly <- bind_rows(new_rows, data_hourly)
data_hourly <- data_hourly %>%
filter(hour(hour) >= 10 & hour(hour) <= 22)
data_hourly <- data_hourly %>%
padr::pad() %>%
fill(visitor, .direction = "downup")
## pad applied on the interval: hour
The data started from 13 hours so had to add the inital 3 hours of opening and had to convert missing values in visitors to mean.
Time series padding ensures continuous data by filling in any missing time intervals. This is crucial for time series analysis, especially when forecasting or visualizing data.
Seasonality Analysis
data_hourly_ts <- ts(data_hourly$visitor, start = c(2017, 12,01),frequency = 24)
decomp_classical <- decompose(data_hourly_ts, type = "additive")
autoplot(decomp_classical) +
ggtitle("Classical Decomposition")
Model Fitting and Evaluation
Cross-validation
train <- head(data_hourly_ts, length(data_hourly_ts) - 10)
valid <- tail(data_hourly_ts,10)
Preprocessing steps
adf.test(train)
## Warning in adf.test(train): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: train
## Dickey-Fuller = -15.054, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
Checking the p value we can see that the data is stationary
# ACF plot
acf(train, main = "ACF Plot")
# PACF plot
pacf(train, main = "PACF Plot")
Types of models
arima_model <- auto.arima(train)
arima_forecast <- forecast(arima_model, h = 91)
accuracy_arima <- accuracy(arima_forecast, valid)
print(accuracy_arima)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.01419551 7.145289 4.634181 -9.073794 23.79750 0.5564935
## Test set -1.26529392 6.889579 5.063668 -2.865970 13.19702 0.6080682
## ACF1 Theil's U
## Training set -0.0006250333 NA
## Test set 0.2175880048 0.4726814
holt_model <- holt(train,h = 91)
holt_forecast <- forecast(holt_model, h = 91)
accuracy_holt <- accuracy(holt_forecast, valid)
print(accuracy_holt)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.00176041 10.34769 5.276849 -23.44783 39.38291 0.633668
## Test set 17.60070991 22.49881 17.600710 39.57402 39.57402 2.113573
## ACF1 Theil's U
## Training set 0.01073758 NA
## Test set 0.62925087 1.742733
naive_forecast <- naive(train, h = 91)
accuracy_naive <- accuracy(naive_forecast, valid)
print(accuracy_naive)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.002634352 10.39735 5.313488 -22.48203 39.81005 0.6380677
## Test set 17.100000000 22.10656 17.100000 38.10264 38.10264 2.0534456
## ACF1 Theil's U
## Training set -0.08722958 NA
## Test set 0.62911508 1.705841
# Plot ARIMA forecast with training and validation data
autoplot(train, series = "Training Data") +
autolayer(arima_forecast, series = "ARIMA Forecast", PI = FALSE) +
autolayer(valid, series = "Validation Data") +
ggtitle("ARIMA Forecast vs Actual") +
xlab("Time") +
ylab("Value") +
guides(colour = guide_legend(title = "Series"))
autoplot(train, series = "Training Data") +
autolayer(holt_forecast, series = "Holt's Forecast", PI = FALSE) +
autolayer(valid, series = "Validation Data") +
ggtitle("Holt's Forecast vs Actual") +
xlab("Time") +
ylab("Value") +
guides(colour = guide_legend(title = "Series"))
autoplot(train, series = "Training Data") +
autolayer(naive_forecast, series = "Naive Forecast", PI = FALSE) +
autolayer(valid, series = "Validation Data") +
ggtitle("Naive Forecast vs Actual") +
xlab("Time") +
ylab("Value") +
guides(colour = guide_legend(title = "Series"))
Model Assessment and Fit
Evaluating Different Model Specifications Four forecasting models were employed : ARIMA, Naive, and Holt’s Linear Trend.
With the lowest RMSE, MAE, and MAPE in the test set, ARIMA models perform the best overall. While Holt’s and Naive models function similarly, ARIMA are more accurate. When it comes to MAE and MAPE, ARIMA performs the best.
Prediction Performance
forecast_mod = as.data.frame(arima_forecast)
submission <- data_test %>% mutate(visitor = round(forecast_mod$`Point Forecast`))
write.csv(submission, "submission.csv", row.names = FALSE)
Assumption check
Box.test(arima_forecast$residuals)
##
## Box-Pierce test
##
## data: arima_forecast$residuals
## X-squared = 0.00074188, df = 1, p-value = 0.9783
p-value is > 0.05 it means all our model have No autocorrelation in the forecast errors
Conclusion
Box.Test was done to check No autocorrelation and p-value is > 0.05 it means all our model have No autocorrelation in the forecast errors.
Visitors tend to prefer evening times.