Data expectation: a CSV named
tennis_data.csvwith two columns:date(d/m/yyyy) andtennis(0–100 Google Trends index).
Run this block once. If a package shows an error later, re‑run this cell.
pkgs <- c(
"tidyverse", "tidymodels", "timetk", "kableExtra", "modeltime",
"xgboost", "lubridate", "fpp3"
# Optional: Prophet (commented out because it requires extra system deps)
# , "prophet"
)
install.packages(setdiff(pkgs, rownames(installed.packages())))
library(tidyverse)
library(tidymodels)
library(timetk)
library(kableExtra)
library(modeltime)
library(xgboost)
library(lubridate)
library(fpp3) # brings in tsibble, fable, fabletools, feasts
Before running the code below: 1. Download the Tennis Google Trends
CSV 2. Rename the file to tennis_data.csv
3. Save it in the same folder as this R Markdown
file
✅ If your file is not in the same folder, R will not find it and you’ll get a “file not found” error.
# Read the CSV file with proper date parsing for d/m/yyyy format
tennis_tbl <- readr::read_csv(
"tennis_data.csv",
col_types = readr::cols(
date = readr::col_character(), # Read as character first
tennis = readr::col_double()
)
) %>%
dplyr::mutate(
# Parse date in d/m/yyyy format
date = lubridate::dmy(date)
) %>%
dplyr::arrange(date) %>%
tidyr::drop_na()
# Quick sanity checks
stopifnot(all(c("date","tennis") %in% names(tennis_tbl)))
stopifnot(inherits(tennis_tbl$date, "Date"))
# Peek at the data
head(tennis_tbl)
cat("\nDate range:", as.character(min(tennis_tbl$date)), "to", as.character(max(tennis_tbl$date)), "\n")
##
## Date range: 2020-11-01 to 2025-11-02
cat("Number of observations:", nrow(tennis_tbl), "\n")
## Number of observations: 262
# The data is already weekly, so we just need to create proper time series objects
# Create a yearweek column for tsibble
tennis_weekly_tbl <- tennis_tbl %>%
dplyr::mutate(week = tsibble::yearweek(date)) %>%
dplyr::group_by(week) %>%
dplyr::summarise(
date = min(date), # Keep the first date in each week
tennis = mean(tennis, na.rm = TRUE),
.groups = "drop"
) %>%
dplyr::select(date, tennis)
# Create tsibble version for fpp3 functions
tennis_weekly_ts <- tennis_weekly_tbl %>%
dplyr::mutate(week = tsibble::yearweek(date)) %>%
dplyr::select(week, tennis) %>%
tsibble::as_tsibble(index = week)
# Check for gaps
if (any(tsibble::has_gaps(tennis_weekly_ts)$.gaps)) {
message("Filling gaps in time series...")
tennis_weekly_ts <- tennis_weekly_ts %>%
tsibble::fill_gaps(tennis = mean(tennis, na.rm = TRUE))
}
# Verify structure
cat("\nWeekly data structure:\n")
##
## Weekly data structure:
print(head(tennis_weekly_tbl))
## # A tibble: 6 × 2
## date tennis
## <date> <dbl>
## 1 2020-11-01 7
## 2 2020-11-08 7
## 3 2020-11-15 9
## 4 2020-11-22 8
## 5 2020-11-29 7
## 6 2020-12-06 7
cat("\nWeek column class:", class(tennis_weekly_ts$week), "\n")
##
## Week column class: yearweek vctrs_vctr
# Basic time series plot
tennis_weekly_tbl %>%
timetk::plot_time_series(
.date_var = date,
.value = tennis,
.interactive = FALSE,
.title = "Tennis Google Trends: Weekly Interest Over Time"
)
# Seasonal plot using tsibble
tennis_weekly_ts %>%
feasts::gg_season(tennis, period = "year") +
labs(title = "Seasonal Pattern of Tennis Interest",
y = "Tennis Interest (0-100)",
x = "Week of Year")
# Focus window: July 2022 to most recent data
filter_start <- lubridate::ymd("2022-07-01")
filter_end <- max(tennis_weekly_tbl$date)
cat("Filtering data from", as.character(filter_start), "to", as.character(filter_end), "\n")
## Filtering data from 2022-07-01 to 2025-11-02
tennis_weekly_tbl <- tennis_weekly_tbl %>%
dplyr::filter(date >= filter_start & date <= filter_end)
# Rebuild tsibble from filtered data
tennis_weekly_ts <- tennis_weekly_tbl %>%
dplyr::mutate(week = tsibble::yearweek(date)) %>%
dplyr::select(week, tennis) %>%
tsibble::as_tsibble(index = week)
cat("Filtered to", nrow(tennis_weekly_tbl), "observations\n")
## Filtered to 175 observations
# Re-plot in the focused window
p1 <- tennis_weekly_tbl %>%
timetk::plot_time_series(
.date_var = date,
.value = tennis,
.interactive = FALSE,
.title = "Tennis Interest (Filtered Period)"
)
p2 <- tennis_weekly_ts %>%
feasts::gg_season(tennis, period = "year") +
labs(title = "Seasonal Pattern (Filtered)")
print(p1)
print(p2)
# Anomaly diagnostics
tennis_weekly_tbl %>%
timetk::plot_anomaly_diagnostics(
.date_var = date,
.value = tennis,
.alpha = 0.1,
.title = "Anomaly Detection in Tennis Interest"
)
# ACF/PACF diagnostics
tennis_weekly_tbl %>%
timetk::plot_acf_diagnostics(
.date_var = date,
.value = tennis,
.title = "ACF and PACF Diagnostics"
)
# STL decomposition
tennis_weekly_ts %>%
fabletools::model(
STL = feasts::STL(tennis ~ season(window = 13))
) %>%
fabletools::components() %>%
autoplot() +
labs(title = "STL Decomposition of Tennis Interest")
# Create 80/20 train/test split (chronological)
splits <- tennis_weekly_tbl %>%
rsample::initial_time_split(prop = 0.8)
# Visualize the split
splits %>%
timetk::tk_time_series_cv_plan() %>%
timetk::plot_time_series_cv_plan(
.date_var = date,
.value = tennis,
.title = "Train/Test Split Visualization"
)
# Extract train and test sets
train <- rsample::training(splits)
test <- rsample::testing(splits)
# Create tsibble versions for fpp3
train_ts <- train %>%
dplyr::mutate(week = tsibble::yearweek(date)) %>%
dplyr::select(week, tennis) %>%
tsibble::as_tsibble(index = week)
test_ts <- test %>%
dplyr::mutate(week = tsibble::yearweek(date)) %>%
dplyr::select(week, tennis) %>%
tsibble::as_tsibble(index = week)
cat("Training set:", nrow(train), "observations\n")
## Training set: 140 observations
cat("Test set:", nrow(test), "observations\n")
## Test set: 35 observations
# Fit simple benchmark models
fpp3_fit <- train_ts %>%
fabletools::model(
Mean = fable::MEAN(tennis),
Naive = fable::NAIVE(tennis),
SNaive = fable::SNAIVE(tennis ~ lag("year")),
Drift = fable::RW(tennis ~ drift())
)
# Forecast for the length of the test set
h_test <- nrow(test_ts)
fpp3_fc <- fpp3_fit %>% fabletools::forecast(h = h_test)
# Plot forecasts
fpp3_fc %>%
autoplot(train_ts, level = NULL) +
autolayer(test_ts, colour = "blue") +
labs(title = "Benchmark Model Forecasts vs Actual",
y = "Tennis Interest") +
theme_minimal()
# Accuracy on test set
fpp3_acc_tbl <- fabletools::accuracy(fpp3_fc, test_ts) %>%
dplyr::arrange(RMSE) %>%
dplyr::select(.model, RMSE, MAE, MAPE, MASE)
fpp3_acc_tbl %>%
kableExtra::kbl(digits = 2, caption = "Benchmark Model Accuracy (fpp3)") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
kableExtra::row_spec(1, bold = TRUE, color = "white", background = "#4CAF50")
| .model | RMSE | MAE | MAPE | MASE |
|---|---|---|---|---|
| Naive | 6.23 | 2.71 | 14.78 | NaN |
| Mean | 6.53 | 5.43 | 49.68 | NaN |
| SNaive | 7.36 | 2.29 | 13.95 | NaN |
| Drift | 7.63 | 4.73 | 33.24 | NaN |
# ARIMA
arima_fit <-
modeltime::arima_reg() %>%
parsnip::set_engine("auto_arima") %>%
parsnip::fit(tennis ~ date, data = train)
# ARIMA + XGBoost (Boosted ARIMA)
arima_boost_fit <-
modeltime::arima_boost(
min_n = 2,
learn_rate = 0.015
) %>%
parsnip::set_engine("auto_arima_xgboost") %>%
parsnip::fit(tennis ~ date, data = train)
# Exponential Smoothing (ETS)
ets_fit <-
modeltime::exp_smoothing() %>%
parsnip::set_engine("ets") %>%
parsnip::fit(tennis ~ date, data = train)
# Linear Regression with time trend
lm_fit <-
parsnip::linear_reg() %>%
parsnip::set_engine("lm") %>%
parsnip::fit(tennis ~ date, data = train)
# Create model table
models_tbl <- modeltime::modeltime_table(
arima_fit,
arima_boost_fit,
ets_fit,
lm_fit
# , prophet_fit
)
# Calibrate on test set
calibrate_tbl <- models_tbl %>%
modeltime::modeltime_calibrate(new_data = test)
# Forecast visualization
calibrate_tbl %>%
modeltime::modeltime_forecast(
actual_data = tennis_weekly_tbl,
new_data = test
) %>%
modeltime::plot_modeltime_forecast(
.title = "Modeltime Forecasts vs Actual"
)
# Accuracy table
mt_acc_tbl <- calibrate_tbl %>%
modeltime::modeltime_accuracy()
mt_acc_tbl %>%
dplyr::arrange(rmse) %>%
dplyr::select(.model_desc, mae, rmse, mape, mase) %>%
kableExtra::kbl(digits = 2, caption = "Modeltime Accuracy (ML-style)") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
kableExtra::row_spec(1, bold = TRUE, color = "white", background = "#4CAF50")
| .model_desc | mae | rmse | mape | mase |
|---|---|---|---|---|
| ARIMA(1,0,1) WITH NON-ZERO MEAN | 5.31 | 6.45 | 48.37 | 1.88 |
| ARIMA(1,0,1) WITH NON-ZERO MEAN | 5.31 | 6.45 | 48.37 | 1.88 |
| ETS(M,N,M) | 4.18 | 6.81 | 29.84 | 1.48 |
| LM | 8.27 | 8.70 | 79.74 | 2.93 |
# Refit all models on full dataset
refit_tbl <- calibrate_tbl %>%
modeltime::modeltime_refit(data = tennis_weekly_tbl)
# Forecast 156 weeks ahead (≈ 3 years)
future_forecast <- refit_tbl %>%
modeltime::modeltime_forecast(
h = 156,
actual_data = tennis_weekly_tbl
)
# Plot with confidence intervals
future_forecast %>%
modeltime::plot_modeltime_forecast(
.title = "3-Year Forward Forecast (with 95% CI)"
)
# Plot without confidence intervals for clarity
future_forecast %>%
modeltime::plot_modeltime_forecast(
.conf_interval_show = FALSE,
.title = "3-Year Forward Forecast (Point Estimates)"
)
# Prepare fpp3 results
fpp3_accuracy <- fpp3_acc_tbl %>%
dplyr::select(Model = .model, RMSE, MAE, MAPE) %>%
dplyr::mutate(Framework = "fpp3 (Benchmark)")
# Prepare modeltime results
modeltime_accuracy <- calibrate_tbl %>%
modeltime::modeltime_accuracy() %>%
dplyr::select(Model = .model_desc, RMSE = rmse, MAE = mae, MAPE = mape) %>%
dplyr::mutate(Framework = "modeltime (ML)")
# Combine and sort
combined_accuracy <- dplyr::bind_rows(fpp3_accuracy, modeltime_accuracy) %>%
dplyr::arrange(RMSE)
# Display table
combined_accuracy %>%
kableExtra::kbl(digits = 2, caption = "All Models – Accuracy Comparison (Sorted by RMSE)") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
kableExtra::row_spec(1, bold = TRUE, color = "white", background = "#4CAF50")
| Model | RMSE | MAE | MAPE | Framework |
|---|---|---|---|---|
| Naive | 6.23 | 2.71 | 14.78 | fpp3 (Benchmark) |
| ARIMA(1,0,1) WITH NON-ZERO MEAN | 6.45 | 5.31 | 48.37 | modeltime (ML) |
| ARIMA(1,0,1) WITH NON-ZERO MEAN | 6.45 | 5.31 | 48.37 | modeltime (ML) |
| Mean | 6.53 | 5.43 | 49.68 | fpp3 (Benchmark) |
| ETS(M,N,M) | 6.81 | 4.18 | 29.84 | modeltime (ML) |
| SNaive | 7.36 | 2.29 | 13.95 | fpp3 (Benchmark) |
| Drift | 7.63 | 4.73 | 33.24 | fpp3 (Benchmark) |
| LM | 8.70 | 8.27 | 79.74 | modeltime (ML) |
# Visual comparison
combined_accuracy %>%
dplyr::mutate(Model = forcats::fct_reorder(Model, RMSE)) %>%
ggplot2::ggplot(aes(x = Model, y = RMSE, fill = Framework)) +
geom_col() +
coord_flip() +
labs(
title = "Model Performance Comparison",
subtitle = "Lower RMSE is better – Best models at top",
x = "Model",
y = "RMSE"
) +
theme_minimal() +
theme(legend.position = "bottom")
# Overall data summary
summary_stats <- tennis_weekly_tbl %>%
summarise(
Min = min(tennis),
Q1 = quantile(tennis, 0.25),
Median = median(tennis),
Mean = mean(tennis),
Q3 = quantile(tennis, 0.75),
Max = max(tennis),
SD = sd(tennis)
)
summary_stats %>%
kableExtra::kbl(digits = 2, caption = "Summary Statistics: Tennis Interest") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Min | Q1 | Median | Mean | Q3 | Max | SD |
|---|---|---|---|---|---|---|
| 7 | 9 | 10 | 14.3 | 12.5 | 77 | 12.57 |
# Best model summary
best_model <- combined_accuracy %>% dplyr::slice(1)
cat("\n Best Performing Model:", best_model$Model, "\n")
##
## Best Performing Model: Naive
cat(" Framework:", best_model$Framework, "\n")
## Framework: fpp3 (Benchmark)
cat(" RMSE:", round(best_model$RMSE, 2), "\n")
## RMSE: 6.23
cat(" MAE:", round(best_model$MAE, 2), "\n")
## MAE: 2.71
cat(" MAPE:", round(best_model$MAPE, 2), "%\n")
## MAPE: 14.78 %