library(tidyverse)
library(forecast)
library(gridExtra)
library(fastDummies)
library(Metrics)
library(BETS)
library(lmtest)
df_raw <- read_csv("search.csv")
head(df_raw)
# define calendar variation variable
idul_adha_months <- c(
"2004-02","2005-01","2006-01","2007-12","2008-12","2009-11",
"2010-11","2011-11","2012-10","2013-10","2014-10","2015-09",
"2016-09","2017-09","2018-08","2019-08","2020-07","2021-07",
"2022-07","2023-06","2024-06", "2025-06"
)
df_raw$idul_adha <- as.integer(df_raw$Month %in% idul_adha_months)
df_raw
df_raw %>% filter(idul_adha==1)
df <- df_raw %>%
mutate(ts = ym(Month), search = Search,
month = month(ts), year = year(ts),
t = seq_along(ts)) %>%
dummy_cols(select_columns = "month") %>%
select(ts, t, idul_adha, year, month, starts_with("month_"), search)
df
ggplot(data = df) +
geom_line(aes(x = ts, y = search)) +
ggtitle("Search Volume Index for keyword sapi") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "2 year", date_labels = "%Y")
train <- df %>% filter(year != 2025)
test <- df %>% filter(year == 2025)
y_train <- ts(train$search, start = c(2004, 1), frequency = 12)
y_test <- ts(test$search, start = c(2025, 1), frequency = 12)
Y_t = \underbrace{\beta_0 + \beta_1 t}_{\text{deterministic trend}} + \underbrace{\sum_{i=1}^{s-1} \delta_i S_{i,t} + \delta_{s,t}}_{\text{deterministic seasonal with dummy variable}} + \underbrace{\alpha_1 V_t}_{\text{calendar variation effect}} + \underbrace{\frac{\theta_q(B)}{\phi_p(B)} a_t}_{\text{ARIMA error}}
xreg_train <- cbind(
trend = seq_along(y_train),
seasonal = train %>% select(starts_with("month_")) %>% select(-month_12),
idul_adha = train$idul_adha
) %>% as.matrix()
m1 <- auto.arima(y_train, xreg = xreg_train, seasonal = FALSE)
summary(m1)
Series: y_train
Regression with ARIMA(3,0,2) errors
Coefficients:
ar1 ar2 ar3 ma1 ma2 intercept trend seasonal.month_1 seasonal.month_2 seasonal.month_3
0.5669 0.3314 0.0616 -0.5636 -0.2336 8.7419 0.0741 -0.7479 0.6210 3.3676
s.e. 1.8309 1.5796 0.2083 1.8370 1.5587 4.1196 0.0265 2.3072 2.2057 2.1904
seasonal.month_4 seasonal.month_5 seasonal.month_6 seasonal.month_7 seasonal.month_8 seasonal.month_9 seasonal.month_10
3.7441 5.2152 4.1848 6.1902 8.2195 6.7434 3.4633
s.e. 2.2121 2.2085 2.2025 2.2008 2.2021 2.1795 2.2004
seasonal.month_11 idul_adha
0.5648 30.7546
s.e. 2.3110 1.7911
sigma^2 = 60.24: log likelihood = -864.45
AIC=1768.89 AICc=1772.53 BIC=1839.48
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.02499097 7.46309 4.977715 -3.971377 21.10869 1.037024 0.001747125
predict_train <- train %>%
mutate(actual = search,
m1_predict = m1$fitted)
predict_train
Y_t = \underbrace{\beta_0 + \beta_1 t}_{\text{deterministic trend}} + \underbrace{\left[ \sum_{k=1}^{K} \left(a_k \sin\left(\frac{2\pi k t}{s}\right) + b_k \cos\left(\frac{2\pi k t}{s}\right)\right) \right]}_{\text{deterministic seasonal with Fourier terms}} + \underbrace{\alpha_1 V_t }_{\text{calendar variation}} + \underbrace{\frac{\theta_q(B)}{\phi_p(B)} a_t}_{\text{ARIMA error}}
xreg_train2 <- cbind(
trend = seq_along(y_train),
seasonal = fourier(y_train, K = 5),
idul_adha = train$idul_adha
) %>% as.matrix()
m2 <- auto.arima(y_train, xreg = xreg_train2, seasonal = FALSE)
summary(m2)
Series: y_train
Regression with ARIMA(3,0,2) errors
Coefficients:
ar1 ar2 ar3 ma1 ma2 intercept trend S1-12 C1-12 S2-12 C2-12 S3-12 C3-12 S4-12 C4-12
0.5702 0.3283 0.0615 -0.5671 -0.2307 12.2098 0.0741 -1.8092 -3.0268 0.1869 -1.3899 0.1815 0.6159 -0.2845 0.1101
s.e. 1.8185 1.5678 0.2081 1.8245 1.5479 3.8051 0.0265 0.5928 0.5797 0.5901 0.5903 0.6338 0.6343 0.6776 0.6770
S5-12 C5-12 idul_adha
0.3030 0.3186 30.7610
s.e. 0.6891 0.6890 1.7908
sigma^2 = 59.99: log likelihood = -864.46
AIC=1766.93 AICc=1770.2 BIC=1833.99
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.0249246 7.463611 4.970092 -3.973825 21.07118 1.035436 0.001765115
Y_t = \beta_0 + \underbrace{\sum_{i=1}^{s-1} \delta_i S_{i,t} + \delta_{s,t}}_{\text{deterministic seasonal with dummy}} + \underbrace{\alpha_1 V_t}_{\text{calendar variation effect}} + \underbrace{\frac{\theta_q(B)}{\phi_p(B)(1-B)^d} a_t}_{\text{ARIMA error}}
xreg_train3 <- cbind(
seasonal = train %>% select(starts_with("month_")) %>% select(-month_12),
idul_adha = train$idul_adha
) %>% as.matrix()
m3 <- auto.arima(y_train, xreg = xreg_train3, seasonal = FALSE)
summary(m3)
Series: y_train
Regression with ARIMA(2,1,2) errors
Coefficients:
ar1 ar2 ma1 ma2 seasonal.month_1 seasonal.month_2 seasonal.month_3 seasonal.month_4 seasonal.month_5
-0.2974 -0.0536 -0.6848 -0.1423 -0.8380 0.5437 3.2965 3.6771 5.1538
s.e. 1.8398 0.2262 1.8444 1.5830 2.3062 2.2058 2.1877 2.2055 2.2045
seasonal.month_6 seasonal.month_7 seasonal.month_8 seasonal.month_9 seasonal.month_10 seasonal.month_11 idul_adha
4.1285 6.1401 8.1773 6.7114 3.4351 0.5506 30.7718
s.e. 2.1970 2.1969 2.1955 2.1772 2.2008 2.3109 1.7919
sigma^2 = 60.36: log likelihood = -863.25
AIC=1760.51 AICc=1763.14 BIC=1820.44
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.329 7.502455 5.050058 -1.862855 21.31278 1.052095 0.01021194
Y_t = \beta_0 + \underbrace{\left[ \sum_{k=1}^{K} \left(a_k \sin\left(\frac{2\pi k t}{s}\right) + b_k \cos\left(\frac{2\pi k t}{s}\right)\right) \right]}_{\text{deterministic seasonal with Fourier terms}} + \underbrace{\alpha_1 V_t}_{\text{calendar variation}} + \underbrace{\frac{\theta_q(B)}{\phi_p(B)(1-B)^d} a_t}_{\text{ARIMA error}}
xreg_train4 <- cbind(
seasonal = fourier(y_train, K = 5),
idul_adha = train$idul_adha
) %>% as.matrix()
m4 <- auto.arima(y_train, xreg = xreg_train4, seasonal = FALSE)
summary(m4)
Series: y_train
Regression with ARIMA(2,1,2) errors
Coefficients:
ar1 ar2 ma1 ma2 S1-12 C1-12 S2-12 C2-12 S3-12 C3-12 S4-12 C4-12 S5-12 C5-12
-0.4307 -0.0687 -0.5511 -0.2571 -1.8347 -3.0158 0.1746 -1.3822 0.1734 0.6235 -0.2890 0.1189 0.3009 0.3278
s.e. 1.9877 0.2219 1.9945 1.7161 0.5886 0.5755 0.5886 0.5890 0.6339 0.6345 0.6792 0.6787 0.6894 0.6892
idul_adha
30.7572
s.e. 1.7970
sigma^2 = 60.11: log likelihood = -863.27
AIC=1758.55 AICc=1760.87 BIC=1814.95
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.32892 7.503059 5.044598 -1.868757 21.28611 1.050958 0.01014251
Y_t = \underbrace{\beta_0 + \beta_1 t}_{\text{deterministic trend}} + \underbrace{\alpha_1 V_t}_{\text{calendar variation effect}} + \underbrace{\frac{\theta_q(B)\Theta_Q(B^s)}{\phi_p(B)\Phi_P(B^s)(1-B)^d(1-B^s)^D} a_t}_{\text{Seasonal ARIMA}}
xreg_train5 <- cbind(
trend = seq_along(y_train),
idul_adha = train$idul_adha
) %>% as.matrix()
m5 <- auto.arima(y_train, xreg = xreg_train5)
summary(m5)
Series: y_train
Regression with ARIMA(1,0,2)(2,0,0)[12] errors
Coefficients:
ar1 ma1 ma2 sar1 sar2 intercept trend idul_adha
0.8435 -0.8533 0.1479 0.9431 -0.3234 13.5693 0.0743 18.3191
s.e. 0.1268 0.1547 0.0735 0.0708 0.0759 3.3776 0.0231 1.8880
sigma^2 = 36.69: log likelihood = -813.21
AIC=1644.43 AICc=1645.17 BIC=1676.19
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set -0.01452727 5.960219 3.593978 -3.338806 14.54093 0.7487454 -0.006946276
Y_t = \beta_0 + \underbrace{\alpha_1 V_t}_{\text{calendar variation effect}} + \underbrace{\frac{\theta_q(B)\Theta_Q(B^s)}{\phi_p(B)\Phi_P(B^s)(1-B)^d(1-B^s)^D} a_t}_{\text{Seasonal ARIMA}}
xreg_train6 <- cbind(
idul_adha = train$idul_adha
) %>% as.matrix()
m6 <- auto.arima(y_train, xreg = xreg_train6)
summary(m6)
Series: y_train
Regression with ARIMA(1,1,1)(1,0,2)[12] errors
Coefficients:
ar1 ma1 sar1 sma1 sma2 idul_adha
-0.1260 -0.8738 0.3328 0.5422 0.2995 18.9039
s.e. 0.0872 0.0442 0.1322 0.1337 0.1077 1.9471
sigma^2 = 37.12: log likelihood = -812.64
AIC=1639.28 AICc=1639.74 BIC=1663.96
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.08039586 6.007218 3.694197 -2.04907 14.90472 0.7696244 0.0087753
predict_train <- train %>%
mutate(actual = search,
m1_predict = m1$fitted,
m2_predict = m2$fitted,
m3_predict = m3$fitted,
m4_predict = m4$fitted,
m5_predict = m5$fitted,
m6_predict = m6$fitted) %>%
select(ts, actual, m1_predict, m2_predict, m3_predict, m4_predict, m5_predict, m6_predict)
predict_train
my_theme <- theme(
legend.position = c(0.2, 0.75),
legend.text = element_text(size = 5),
legend.title = element_text(size = 10),
legend.key.size = unit(0.3, "lines"),
legend.box.margin = margin(0, 0, 0, 0),
legend.background= element_rect(fill = "white", colour = "grey80"),
plot.title = element_text(size = 8, face = "bold")
)
p1 <- ggplot(data = predict_train) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m1_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "red")
) +
ggtitle("Deterministic Trend & Seasonality (Dummy)") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p2 <- ggplot(data = predict_train) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m2_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "blue")
) +
ggtitle("Deterministic Trend & Seasonality (Fourier)") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p3 <- ggplot(data = predict_train) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m3_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "brown")
) +
ggtitle("Stochastic Trend & Deterministic Seasonality (Dummy)") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p4 <- ggplot(data = predict_train) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m4_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "purple")
) +
ggtitle("Stochastic Trend & Deterministic Seasonality (Fourier)") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p5 <- ggplot(data = predict_train) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m5_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "green4")
) +
ggtitle("Deterministic Trend & Stochastic Seasonality") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p6 <- ggplot(data = predict_train) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m6_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "orange")
) +
ggtitle("Stochastic Trend & Seasonality") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
grid.arrange(p1,p2, p3, p4, p5, p6, ncol=2)
xreg_future <- cbind(
trend = test$t,
seasonal = test %>% select(starts_with("month_")) %>% select(-month_12),
idul_adha = test$idul_adha
) %>% as.matrix()
xreg_future2 <- cbind(
trend = test$t,
seasonal = fourier(y_test, K = 5),
idul_adha = test$idul_adha
) %>% as.matrix()
xreg_future3 <- cbind(
seasonal = test %>% select(starts_with("month_")) %>% select(-month_12),
idul_adha = test$idul_adha
) %>% as.matrix()
xreg_future4 <- cbind(
seasonal = fourier(y_test, K = 5),
idul_adha = test$idul_adha
) %>% as.matrix()
xreg_future5 <- cbind(
trend = test$t,
idul_adha = test$idul_adha
) %>% as.matrix()
xreg_future6 <- cbind(
idul_adha = test$idul_adha
) %>% as.matrix()
predict_test <- test %>%
mutate(actual = search,
m1_predict = forecast(m1, xreg = xreg_future)$mean,
m2_predict = forecast(m2, xreg = xreg_future2)$mean,
m3_predict = forecast(m3, xreg = xreg_future3)$mean,
m4_predict = forecast(m4, xreg = xreg_future4)$mean,
m5_predict = forecast(m5, xreg = xreg_future5)$mean,
m6_predict = forecast(m6, xreg = xreg_future6)$mean) %>%
select(ts, actual, m1_predict, m2_predict, m3_predict, m4_predict, m5_predict, m6_predict)
predict_test
p1 <- ggplot(data = predict_test) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m1_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "red")
) +
ggtitle("Deterministic Trend & Seasonality (Dummy)") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p2 <- ggplot(data = predict_test) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m2_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "blue")
) +
ggtitle("Deterministic Trend & Seasonality (Fourier)") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p3 <- ggplot(data = predict_test) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m3_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "brown")
) +
ggtitle("Stochastic Trend & Deterministic Seasonality (Dummy)") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p4 <- ggplot(data = predict_test) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m4_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "purple")
) +
ggtitle("Stochastic Trend & Deterministic Seasonality (Fourier)") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p5 <- ggplot(data = predict_test) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m5_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "green4")
) +
ggtitle("Deterministic Trend & Stochastic Seasonality") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p6 <- ggplot(data = predict_test) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m6_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "orange")
) +
ggtitle("Stochastic Trend & Seasonality") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
grid.arrange(p1,p2, p3, p4, p5, p6, ncol=2)
predict_train %>%
select(ends_with("_predict")) %>%
imap_dfr(~ tibble(
Model = .y,
RMSE = rmse (predict_train$actual, .x),
MAPE = mape (predict_train$actual, .x),
sMAPE = smape(predict_train$actual, .x)
)) %>%
mutate(Model = recode(Model,
m1_predict = "Deterministic Trend & Seasonality (Dummy)",
m2_predict = "Deterministic Trend & Seasonality (Fourier)",
m3_predict = "Stochastic Trend & Deterministic Seasonality (Dummy)",
m4_predict = "Stochastic Trend & Deterministic Seasonality (Fourier)",
m5_predict = "Deterministic Trend & Stochastic Seasonality",
m6_predict = "Stochastic Trend & Seasonality",
))
predict_test %>%
select(ends_with("_predict")) %>%
imap_dfr(~ tibble(
Model = .y,
RMSE = rmse (predict_test$actual, .x),
MAPE = mape (predict_test$actual, .x),
sMAPE = smape(predict_test$actual, .x)
)) %>%
mutate(Model = recode(Model,
m1_predict = "Deterministic Trend & Seasonality (Dummy)",
m2_predict = "Deterministic Trend & Seasonality (Fourier)",
m3_predict = "Stochastic Trend & Deterministic Seasonality (Dummy)",
m4_predict = "Stochastic Trend & Deterministic Seasonality (Fourier)",
m5_predict = "Deterministic Trend & Stochastic Seasonality",
m6_predict = "Stochastic Trend & Seasonality",
))