library(tidyverse)
library(forecast)
library(gridExtra)
library(fastDummies)
library(Metrics)
df_raw <- read_csv("search.csv")
head(df_raw)
# define calendar variation variable
idul_adha_months <- c(
"2004-02","2005-01","2006-01","2007-12","2008-12","2009-11",
"2010-11","2011-11","2012-10","2013-10","2014-10","2015-09",
"2016-09","2017-09","2018-08","2019-08","2020-07","2021-07",
"2022-07","2023-06","2024-06", "2025-06"
)
df_raw$idul_adha <- as.integer(df_raw$Month %in% idul_adha_months)
df_raw
df_raw %>% filter(idul_adha==1)
df <- df_raw %>%
mutate(ts = ym(Month), search = Search,
month = month(ts), year = year(ts),
t = seq_along(ts)) %>%
dummy_cols(select_columns = "month") %>%
select(ts, t, idul_adha, year, month, starts_with("month_"), search)
df
ggplot(data = df) +
geom_line(aes(x = ts, y = search)) +
ggtitle("Search Volume Index for keyword sapi") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "2 year", date_labels = "%Y")
train <- df %>% filter(year != 2025)
test <- df %>% filter(year == 2025)
Y_t = \underbrace{\beta_0 + \beta_1 t}_{\text{deterministic trend}} + \underbrace{a_t}_{\text{white noise error}}
m1 <- lm(search ~ t, data = train)
summary(m1)
Call:
lm(formula = search ~ t, data = train)
Residuals:
Min 1Q Median 3Q Max
-16.276 -5.751 -2.496 1.284 66.739
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.74167 1.50713 8.454 2.36e-15 ***
t 0.09725 0.01033 9.416 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 11.93 on 250 degrees of freedom
Multiple R-squared: 0.2618, Adjusted R-squared: 0.2588
F-statistic: 88.66 on 1 and 250 DF, p-value: < 2.2e-16
Y_t = \underbrace{\beta t}_{\text{deterministic trend}} + \underbrace{\sum_{i=1}^{12} \delta_i S_{i,t}}_{\text{deterministic seasonal with dummy variables}} + \underbrace{a_t}_{\text{white noise error}}
m2 <- lm(search ~ -1 + t + month_1 + month_2 + month_3 + month_4 +
month_5 + month_6 + month_7 + month_8 +
month_9 + month_10 + month_11 + month_12,
data = train)
summary(m2)
Call:
lm(formula = search ~ -1 + t + month_1 + month_2 + month_3 +
month_4 + month_5 + month_6 + month_7 + month_8 + month_9 +
month_10 + month_11 + month_12, data = train)
Residuals:
Min 1Q Median 3Q Max
-20.850 -5.469 -2.340 2.026 62.286
Coefficients:
Estimate Std. Error t value Pr(>|t|)
t 0.09637 0.01017 9.472 < 2e-16 ***
month_1 9.24454 2.84139 3.254 0.001305 **
month_2 9.10056 2.84582 3.198 0.001572 **
month_3 10.33753 2.85027 3.627 0.000351 ***
month_4 10.66973 2.85475 3.738 0.000233 ***
month_5 12.09718 2.85926 4.231 3.32e-05 ***
month_6 13.95319 2.86380 4.872 2.01e-06 ***
month_7 17.38064 2.86837 6.059 5.28e-09 ***
month_8 17.90332 2.87296 6.232 2.07e-09 ***
month_9 17.85457 2.87759 6.205 2.40e-09 ***
month_10 14.52011 2.88224 5.038 9.29e-07 ***
month_11 11.61422 2.88693 4.023 7.71e-05 ***
month_12 9.56548 2.89164 3.308 0.001085 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 11.74 on 239 degrees of freedom
Multiple R-squared: 0.8404, Adjusted R-squared: 0.8317
F-statistic: 96.8 on 13 and 239 DF, p-value: < 2.2e-16
Y_t = \underbrace{\beta t}_{\text{deterministic trend}} + \underbrace{\sum_{i=1}^{12} \delta_i S_{i,t}}_{\text{deterministic seasonal with dummy variable}} + \underbrace{\alpha V_t}_{\text{calendar variation effect}} + \underbrace{a_t}_{\text{white noise error}}
m3 <- lm(search ~ -1 + t + month_1 + month_2 + month_3 + month_4 +
month_5 + month_6 + month_7 + month_8 +
month_9 + month_10 + month_11 + month_12 + idul_adha,
data = train)
summary(m3)
Call:
lm(formula = search ~ -1 + t + month_1 + month_2 + month_3 +
month_4 + month_5 + month_6 + month_7 + month_8 + month_9 +
month_10 + month_11 + month_12 + idul_adha, data = train)
Residuals:
Min 1Q Median 3Q Max
-23.808 -4.289 -0.115 3.154 44.682
Coefficients:
Estimate Std. Error t value Pr(>|t|)
t 0.096365 0.007168 13.443 < 2e-16 ***
month_1 6.386173 2.010382 3.177 0.001687 **
month_2 7.671374 2.007224 3.822 0.000169 ***
month_3 10.337527 2.008269 5.147 5.53e-07 ***
month_4 10.669733 2.011427 5.305 2.58e-07 ***
month_5 12.097177 2.014605 6.005 7.11e-09 ***
month_6 11.094822 2.026103 5.476 1.10e-07 ***
month_7 13.093081 2.039620 6.419 7.34e-10 ***
month_8 15.044949 2.032535 7.402 2.30e-12 ***
month_9 13.567017 2.046060 6.631 2.22e-10 ***
month_10 10.232557 2.049309 4.993 1.15e-06 ***
month_11 7.326667 2.052579 3.569 0.000433 ***
month_12 6.707106 2.045639 3.279 0.001199 **
idul_adha 30.012887 1.923661 15.602 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 8.269 on 238 degrees of freedom
Multiple R-squared: 0.9211, Adjusted R-squared: 0.9165
F-statistic: 198.4 on 14 and 238 DF, p-value: < 2.2e-16
Y_t = \underbrace{\beta t}_{\text{deterministic trend}} + \underbrace{\sum_{i=1}^{12} \delta_i S_{i,t}}_{\text{deterministic seasonal with dummy variable}} + \underbrace{\alpha V_t}_{\text{calendar variation effect}} + \underbrace{\frac{\theta_q(B)}{\phi_p(B)} a_t}_{\text{ARMA error}}
res <- ts(m3$residuals)
ggAcf(res) + ggtitle("Residual ACF")
res_arima <- auto.arima(res)
summary(res_arima)
Series: res
ARIMA(3,0,2) with zero mean
Coefficients:
ar1 ar2 ar3 ma1 ma2
0.5288 0.3650 0.0591 -0.5126 -0.2701
s.e. 2.1908 1.8904 0.2349 2.1935 1.8450
sigma^2 = 57.13: log likelihood = -865.07
AIC=1742.15 AICc=1742.49 BIC=1763.33
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set -0.2406057 7.482885 5.088429 110.5669 235.9901 0.8492696 -0.0004942394
predict_train <- train %>%
mutate(actual = search,
m1_predict = m1$fitted.values,
m2_predict = m2$fitted.values,
m3_predict = m3$fitted.values,
m4_predict = m3$fitted.values + res_arima$fitted) %>%
select(ts, actual, m1_predict, m2_predict, m3_predict, m4_predict)
predict_train
my_theme <- theme(
legend.position = c(0.2, 0.75),
legend.text = element_text(size = 7),
legend.title = element_text(size = 11),
legend.background= element_rect(fill = "white", colour = "grey80"),
plot.title = element_text(size = 10, face = "bold")
)
p1 <- ggplot(data = predict_train) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m1_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "red")
) +
ggtitle("Trend") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p2 <- ggplot(data = predict_train) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m2_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "blue")
) +
ggtitle("Trend + Seasonal") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p3 <- ggplot(data = predict_train) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m3_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "brown")
) +
ggtitle("Trend + Seasonal + Calendar Variation") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
p4 <- ggplot(data = predict_train) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m4_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "purple")
) +
ggtitle("Trend + Seasonal + Calendar Variation + ARMA") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 year", date_labels = "%Y") +
my_theme
grid.arrange(p1,p2, p3, p4, ncol=2)
test
predict_test <- test %>%
mutate(actual = search,
m1_predict = predict(m1, newdata = test %>% select(t)),
m2_predict = predict(m2, newdata = test %>% select(t, starts_with("month_"))),
m3_predict = predict(m3, newdata = test %>% select(t, starts_with("month_"), idul_adha)),
m4_predict = m3_predict + forecast(res_arima, h=11)$mean) %>%
select(ts, actual, m1_predict, m2_predict, m3_predict, m4_predict)
predict_test
my_theme <- theme(
legend.position = c(0.2, 0.75),
legend.text = element_text(size = 7),
legend.title = element_text(size = 11),
legend.background= element_rect(fill = "white", colour = "grey80"),
plot.title = element_text(size = 10, face = "bold")
)
p1 <- ggplot(data = predict_test) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m1_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "red")
) +
ggtitle("Trend") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 months", date_labels = "%b %Y") +
my_theme
p2 <- ggplot(data = predict_test) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m2_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "blue")
) +
ggtitle("Trend + Seasonal") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 months", date_labels = "%b %Y") +
my_theme
p3 <- ggplot(data = predict_test) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m3_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "brown")
) +
ggtitle("Trend + Seasonal + Calendar Variation") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 months", date_labels = "%b %Y") +
my_theme
p4 <- ggplot(data = predict_test) +
geom_line(aes(x = ts, y = actual, color = "Actual")) +
geom_line(aes(x = ts, y = m4_predict, color = "Predicted")) +
scale_color_manual(
name = NULL,
values = c("Actual" = "grey",
"Predicted" = "purple")
) +
ggtitle("Trend + Seasonal + Calendar Variation + ARMA") +
xlab("") + ylab("") +
scale_x_datetime(date_breaks = "3 months", date_labels = "%b %Y") +
my_theme
grid.arrange(p1,p2, p3, p4, ncol=2)
predict_train %>%
select(ends_with("_predict")) %>%
imap_dfr(~ tibble(
Model = .y,
RMSE = rmse (predict_train$actual, .x),
MAPE = mape (predict_train$actual, .x),
sMAPE = smape(predict_train$actual, .x)
)) %>%
mutate(Model = recode(Model,
m1_predict = "Trend",
m2_predict = "Trend + Seasonal",
m3_predict = "Trend + Seasonal + CV",
m4_predict = "Trend + Seasonal + CV + ARMA"
))
predict_test %>%
select(ends_with("_predict")) %>%
imap_dfr(~ tibble(
Model = .y,
RMSE = rmse (predict_test$actual, .x),
MAPE = mape (predict_test$actual, .x),
sMAPE = smape(predict_test$actual, .x)
)) %>%
mutate(Model = recode(Model,
m1_predict = "Trend",
m2_predict = "Trend + Seasonal",
m3_predict = "Trend + Seasonal + CV",
m4_predict = "Trend + Seasonal + CV + ARMA"
))