Vignesh Venkatesh
2025-11-03
This document provides a step-by-step reproducible analysis of Google search interest for ‘Cricket Australia’ from 2014 to 2024, using time series modeling in R. The analysis includes exploratory visualization, anomaly detection, model fitting, forecasting, and evaluation.
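You will learn to: visualize a monthly search-interest series; inspect its seasonality, autocorrelation, and anomalies; fit ARIMA, ETS, and TSLM models; evaluate them on a hold-out year; and forecast the following year.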
Ensure your working directory contains the CSV file:
cricket_australia.csv
This CSV must have at least two columns: Month (text in YYYY-MM format) and Search.Interests (a numeric search-interest index).
library(tidyverse)
library(lubridate)
library(timetk)
library(fpp3)
library(ggplot2)
# Load the search-interest export and reduce it to the two columns used
# downstream. `Month` is read as "YYYY-MM" text, so "-01" is appended to turn
# it into a full date before parsing; the interest index is renamed `value`.
cricket_df <- with(
  read.csv("cricket_australia.csv"),
  data.frame(
    Month = as.Date(paste0(Month, "-01")),
    value = Search.Interests
  )
)

# Preview the first rows to confirm the columns parsed as expected.
head(cricket_df)
## Month value
## 1 2015-01-01 46
## 2 2015-02-01 31
## 3 2015-03-01 38
## 4 2015-04-01 4
## 5 2015-05-01 3
## 6 2015-06-01 9
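Inference: The data is loaded as a data frame with one row per month: Month (a Date set to the first day of each month) and value (the search-interest index). It is converted to a time series object later, before modeling.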
# Interactive line chart of the monthly series with a smoother overlaid.
plot_time_series(
  cricket_df,
  .date_var = Month,
  .value = value,
  .smooth = TRUE,
  .interactive = TRUE,
  .title = "Google Search Interest Over Time: 'Cricket Australia'"
)

# Seasonal diagnostics for the same series.
plot_seasonal_diagnostics(
  cricket_df,
  .date_var = Month,
  .value = value,
  .title = "Seasonality Diagnostics: Monthly Cricket Interest"
)

# Anomaly diagnostics, with the detection threshold set via `.alpha`.
plot_anomaly_diagnostics(
  cricket_df,
  .date_var = Month,
  .value = value,
  .alpha = 0.05,
  .title = "Anomaly Detection in Cricket Interest Over Time"
)
Inference: Peaks correspond to major cricket events, particularly when visiting teams like India or England play in Australia.
# Autocorrelation and partial-autocorrelation diagnostics for the series.
plot_acf_diagnostics(
  cricket_df,
  .date_var = Month,
  .value = value,
  .title = "ACF & PACF of Cricket Interest"
)

# STL decomposition, showing the four requested components.
plot_stl_diagnostics(
  cricket_df,
  .date_var = Month,
  .value = value,
  .feature_set = c("observed", "trend", "season", "remainder"),
  .title = "STL Decomposition of Cricket Search Interest"
)
# Hold out the months of 2024 as the test set; every earlier month is used
# for training.
train_data <- filter(cricket_df, Month < as.Date("2024-01-01"))
test_data <- filter(
  cricket_df,
  Month >= as.Date("2024-01-01"),
  Month <= as.Date("2024-12-01")
)

# Convert both splits to tsibbles indexed by a yearmonth version of `Month`.
train_ts <- as_tsibble(
  mutate(train_data, Month = yearmonth(Month)),
  index = Month
)
test_ts <- as_tsibble(
  mutate(test_data, Month = yearmonth(Month)),
  index = Month
)
# Fit three candidate models on the training series, each using the model
# function's default (automatic) specification except TSLM, which is given
# an explicit trend + seasonal formula.
arima_fit <- model(train_ts, ARIMA(value))
ets_fit <- model(train_ts, ETS(value))
tslm_fit <- model(train_ts, TSLM(value ~ trend() + season()))

# Forecast each model over the months present in the test tsibble.
arima_fc <- arima_fit %>% forecast(new_data = test_ts)
ets_fc <- ets_fit %>% forecast(new_data = test_ts)
tslm_fc <- tslm_fit %>% forecast(new_data = test_ts)
# Accuracy computation
#
# Summarise point-forecast error against observed values.
#
# Args:
#   df: A data frame with numeric columns `pred` (forecast) and `value`
#       (actual), one row per forecast period.
#
# Returns:
#   A one-row data frame with columns RMSE, MAE and MAPE (in percent).
#
# Rows where `pred` or `value` is NA are ignored by all three metrics.
# Rows whose actual `value` is exactly 0 are additionally left out of MAPE:
# dividing by zero would produce Inf, which `na.rm = TRUE` does not remove,
# so a single zero month would otherwise turn the whole MAPE into Inf.
# If every actual is 0 (or NA), MAPE is NaN.
compute_accuracy <- function(df) {
  df %>%
    summarise(
      RMSE = sqrt(mean((pred - value)^2, na.rm = TRUE)),
      MAE = mean(abs(pred - value), na.rm = TRUE),
      # na_if() turns zero actuals into NA so they drop out of the mean.
      MAPE = mean(abs((pred - value) / na_if(value, 0)) * 100, na.rm = TRUE)
    )
}
# Pair each model's point forecast (`.mean`, renamed `pred`) with the observed
# value for the same month.
# NOTE(review): the forecasts carry `Month` as a yearmonth (from `test_ts`),
# while `test_data$Month` is a Date; the join relies on those two types being
# coercible to a common type — confirm when upgrading packages.
arima_aug <- arima_fc %>%
  as_tibble() %>%
  select(Month, pred = .mean) %>%
  left_join(select(test_data, Month, value), by = "Month")
ets_aug <- ets_fc %>%
  as_tibble() %>%
  select(Month, pred = .mean) %>%
  left_join(select(test_data, Month, value), by = "Month")
tslm_aug <- tslm_fc %>%
  as_tibble() %>%
  select(Month, pred = .mean) %>%
  left_join(select(test_data, Month, value), by = "Month")

# Score each model on the hold-out months and label the row with its name.
arima_test_acc <- mutate(compute_accuracy(arima_aug), Model = "ARIMA")
ets_test_acc <- mutate(compute_accuracy(ets_aug), Model = "ETS")
tslm_test_acc <- mutate(compute_accuracy(tslm_aug), Model = "TSLM")

# Stack the three one-row summaries into a single comparison table and print.
test_accuracy <- bind_rows(arima_test_acc, ets_test_acc, tslm_test_acc)
test_accuracy
## # A tibble: 3 × 4
## RMSE MAE MAPE Model
## <dbl> <dbl> <dbl> <chr>
## 1 6.53 4.98 96.2 ARIMA
## 2 8.74 7.65 123. ETS
## 3 5.32 4.26 68.0 TSLM
# Refit the TSLM (trend + seasonal dummies) specification on the training and
# test months combined, so the final forecast uses every observation.
train_plus_test_ts <- bind_rows(train_ts, test_ts)
tslm_fit_2025 <- train_plus_test_ts %>%
  model(TSLM(value ~ trend() + season()))

# Build the 12 months that follow the last observed month and forecast them.
future_2025 <- new_data(train_plus_test_ts, n = 12)
tslm_future_2025 <- forecast(tslm_fit_2025, new_data = future_2025) %>%
  as_tibble() %>%
  select(Month, .mean) %>%
  rename(pred = .mean) %>%
  # Convert `Month` back to Date so it stacks with the Date-based history.
  mutate(Model = "TSLM", Month = as.Date(Month))

# Observed series in the same (Month, pred, Model) layout as the forecast.
historical_data <- bind_rows(
  train_data %>%
    mutate(Model = "Actual", pred = value) %>%
    select(Month, pred, Model),
  test_data %>%
    mutate(Model = "Actual", pred = value) %>%
    select(Month, pred, Model)
)
all_viz_data <- bind_rows(historical_data, tslm_future_2025)

# Plot history and forecast as two colored lines; forecast months also get
# point markers.
# NOTE(review): the title says 2014-2024, but the printed head of the data
# starts at 2015-01 — confirm the actual date range.
ggplot(all_viz_data, aes(x = Month, y = pred, color = Model)) +
  # `linewidth` replaces `size` for lines, which is deprecated in
  # ggplot2 >= 3.4.0.
  geom_line(linewidth = 1.1) +
  geom_point(
    data = tslm_future_2025,
    aes(x = Month, y = pred, color = Model),
    size = 1.5
  ) +
  labs(
    title = "Google Search Interest: Actual (2014-2024) + Forecast (2025, TSLM)",
    x = "Month",
    y = "Search Interest Index"
  ) +
  scale_color_manual(values = c("Actual" = "black", "TSLM" = "#009E73")) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom")
This analysis examined Google search interest for ‘Cricket Australia’ from 2014 to 2024 and forecasted trends for 2025 using ARIMA, ETS, and TSLM models.
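Key Insights: On the 2024 hold-out months, TSLM had the lowest error on every metric (RMSE 5.32, MAE 4.26, MAPE 68.0), ahead of ARIMA (RMSE 6.53, MAE 4.98, MAPE 96.2) and ETS (RMSE 8.74, MAE 7.65, MAPE 123), which is why TSLM was used for the 2025 forecast.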
Applications:
Limitations:
Future Directions: