1 Steps

Get the data (simple step‑by‑step).
Explore the series (plots, seasonality, anomalies, ACF/PACF, STL).
Fit baseline statistical models with fpp3 (MEAN, NAIVE, SNAIVE, DRIFT).
Fit automated ML‑style models with modeltime (ARIMA, ARIMA Boost, ETS, Linear Reg; Prophet optional).
Compare accuracy and generate forward forecasts.

Data expectation: a CSV named tennis_data.csv with two columns: date (d/m/yyyy) and tennis (0–100 Google Trends index).

2 1) Install packages (one‑time)

Run this block once. If a package shows an error later, re‑run this cell.

pkgs <- c(
  "tidyverse", "tidymodels", "timetk", "kableExtra", "modeltime",
  "xgboost", "lubridate", "fpp3"
  # Optional: Prophet (commented out because it requires extra system deps)
  # , "prophet"
)
install.packages(setdiff(pkgs, rownames(installed.packages())))

3 2) Load libraries

library(tidyverse)
library(tidymodels)
library(timetk)
library(kableExtra)
library(modeltime)
library(xgboost)
library(lubridate)
library(fpp3)  # brings in tsibble, fable, fabletools, feasts

4 3) Get the data

4.1 Save your data file first

Before running the code below: 1. Download the Tennis Google Trends CSV 2. Rename the file to tennis_data.csv 3. Save it in the same folder as this R Markdown file

✅ If your file is not in the same folder, R will not find it and you’ll get a “file not found” error.

5 4) Import & prepare the time series

# Read the CSV file with proper date parsing for d/m/yyyy format
tennis_tbl <- readr::read_csv(
  "tennis_data.csv",
  col_types = readr::cols(
    date   = readr::col_character(),  # Read as character first
    tennis = readr::col_double()
  )
) %>%
  dplyr::mutate(
    # Parse date in d/m/yyyy format
    date = lubridate::dmy(date)
  ) %>%
  dplyr::arrange(date) %>%
  tidyr::drop_na()

# Quick sanity checks
stopifnot(all(c("date","tennis") %in% names(tennis_tbl)))
stopifnot(inherits(tennis_tbl$date, "Date"))

# Peek at the data
head(tennis_tbl)

cat("\nDate range:", as.character(min(tennis_tbl$date)), "to", as.character(max(tennis_tbl$date)), "\n")

## 
## Date range: 2020-11-01 to 2025-11-02

cat("Number of observations:", nrow(tennis_tbl), "\n")

## Number of observations: 262

# The data is already weekly, so we just need to create proper time series objects
# Create a yearweek column for tsibble
tennis_weekly_tbl <- tennis_tbl %>%
  dplyr::mutate(week = tsibble::yearweek(date)) %>%
  dplyr::group_by(week) %>%
  dplyr::summarise(
    date = min(date),  # Keep the first date in each week
    tennis = mean(tennis, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  dplyr::select(date, tennis)

# Create tsibble version for fpp3 functions
tennis_weekly_ts <- tennis_weekly_tbl %>%
  dplyr::mutate(week = tsibble::yearweek(date)) %>%
  dplyr::select(week, tennis) %>%
  tsibble::as_tsibble(index = week)

# Check for gaps
if (any(tsibble::has_gaps(tennis_weekly_ts)$.gaps)) {
  message("Filling gaps in time series...")
  tennis_weekly_ts <- tennis_weekly_ts %>% 
    tsibble::fill_gaps(tennis = mean(tennis, na.rm = TRUE))
}

# Verify structure
cat("\nWeekly data structure:\n")

## 
## Weekly data structure:

print(head(tennis_weekly_tbl))

## # A tibble: 6 × 2
##   date       tennis
##   <date>      <dbl>
## 1 2020-11-01      7
## 2 2020-11-08      7
## 3 2020-11-15      9
## 4 2020-11-22      8
## 5 2020-11-29      7
## 6 2020-12-06      7

cat("\nWeek column class:", class(tennis_weekly_ts$week), "\n")

## 
## Week column class: yearweek vctrs_vctr

6 5) Exploratory Data Analysis

# Basic time series plot
tennis_weekly_tbl %>%  
  timetk::plot_time_series(
    .date_var = date,
    .value    = tennis,
    .interactive = FALSE,
    .title = "Tennis Google Trends: Weekly Interest Over Time"
  )

# Seasonal plot using tsibble
tennis_weekly_ts %>% 
  feasts::gg_season(tennis, period = "year") +
  labs(title = "Seasonal Pattern of Tennis Interest",
       y = "Tennis Interest (0-100)",
       x = "Week of Year")

# Focus window: July 2022 to most recent data
filter_start <- lubridate::ymd("2022-07-01")
filter_end <- max(tennis_weekly_tbl$date)

cat("Filtering data from", as.character(filter_start), "to", as.character(filter_end), "\n")

## Filtering data from 2022-07-01 to 2025-11-02

tennis_weekly_tbl <- tennis_weekly_tbl %>%
  dplyr::filter(date >= filter_start & date <= filter_end)

# Rebuild tsibble from filtered data
tennis_weekly_ts <- tennis_weekly_tbl %>%
  dplyr::mutate(week = tsibble::yearweek(date)) %>%
  dplyr::select(week, tennis) %>%
  tsibble::as_tsibble(index = week)

cat("Filtered to", nrow(tennis_weekly_tbl), "observations\n")

## Filtered to 175 observations

# Re-plot in the focused window
p1 <- tennis_weekly_tbl %>%  
  timetk::plot_time_series(
    .date_var = date,
    .value    = tennis,
    .interactive = FALSE,
    .title = "Tennis Interest (Filtered Period)"
  )

p2 <- tennis_weekly_ts %>% 
  feasts::gg_season(tennis, period = "year") +
  labs(title = "Seasonal Pattern (Filtered)")

print(p1)

print(p2)

# Anomaly diagnostics
tennis_weekly_tbl %>% 
  timetk::plot_anomaly_diagnostics(
    .date_var = date,
    .value    = tennis,
    .alpha    = 0.1,
    .title = "Anomaly Detection in Tennis Interest"
  )

# ACF/PACF diagnostics
tennis_weekly_tbl %>% 
  timetk::plot_acf_diagnostics(
    .date_var = date,
    .value    = tennis,
    .title = "ACF and PACF Diagnostics"
  )

# STL decomposition
tennis_weekly_ts %>% 
  fabletools::model(
    STL = feasts::STL(tennis ~ season(window = 13))
  ) %>% 
  fabletools::components() %>% 
  autoplot() +
  labs(title = "STL Decomposition of Tennis Interest")

7 6) Train / Test split

# Create 80/20 train/test split (chronological)
splits <- tennis_weekly_tbl %>% 
  rsample::initial_time_split(prop = 0.8)

# Visualize the split
splits %>% 
  timetk::tk_time_series_cv_plan() %>% 
  timetk::plot_time_series_cv_plan(
    .date_var = date, 
    .value = tennis,
    .title = "Train/Test Split Visualization"
  )

# Extract train and test sets
train <- rsample::training(splits)
test  <- rsample::testing(splits)

# Create tsibble versions for fpp3
train_ts <- train %>% 
  dplyr::mutate(week = tsibble::yearweek(date)) %>%
  dplyr::select(week, tennis) %>%
  tsibble::as_tsibble(index = week)

test_ts <- test %>% 
  dplyr::mutate(week = tsibble::yearweek(date)) %>%
  dplyr::select(week, tennis) %>%
  tsibble::as_tsibble(index = week)

cat("Training set:", nrow(train), "observations\n")

## Training set: 140 observations

cat("Test set:", nrow(test), "observations\n")

## Test set: 35 observations

8 7) Baseline benchmark models (fpp3)

# Fit simple benchmark models
fpp3_fit <- train_ts %>% 
  fabletools::model(
    Mean   = fable::MEAN(tennis),
    Naive  = fable::NAIVE(tennis),
    SNaive = fable::SNAIVE(tennis ~ lag("year")),
    Drift  = fable::RW(tennis ~ drift())
  )

# Forecast for the length of the test set
h_test <- nrow(test_ts)
fpp3_fc <- fpp3_fit %>% fabletools::forecast(h = h_test)

# Plot forecasts
fpp3_fc %>% 
  autoplot(train_ts, level = NULL) +
  autolayer(test_ts, colour = "blue") +
  labs(title = "Benchmark Model Forecasts vs Actual",
       y = "Tennis Interest") +
  theme_minimal()

# Accuracy on test set
fpp3_acc_tbl <- fabletools::accuracy(fpp3_fc, test_ts) %>% 
  dplyr::arrange(RMSE) %>% 
  dplyr::select(.model, RMSE, MAE, MAPE, MASE)

fpp3_acc_tbl %>%
  kableExtra::kbl(digits = 2, caption = "Benchmark Model Accuracy (fpp3)") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>% 
  kableExtra::row_spec(1, bold = TRUE, color = "white", background = "#4CAF50")

Benchmark Model Accuracy (fpp3)
.model	RMSE	MAE	MAPE	MASE
Naive	6.23	2.71	14.78	NaN
Mean	6.53	5.43	49.68	NaN
SNaive	7.36	2.29	13.95	NaN
Drift	7.63	4.73	33.24	NaN

9 8) Automated models (modeltime)

# ARIMA
arima_fit <- 
  modeltime::arima_reg() %>% 
  parsnip::set_engine("auto_arima") %>% 
  parsnip::fit(tennis ~ date, data = train)

# ARIMA + XGBoost (Boosted ARIMA)
arima_boost_fit <- 
  modeltime::arima_boost(
    min_n = 2,
    learn_rate = 0.015
  ) %>% 
  parsnip::set_engine("auto_arima_xgboost") %>% 
  parsnip::fit(tennis ~ date, data = train)

# Exponential Smoothing (ETS)
ets_fit <- 
  modeltime::exp_smoothing() %>% 
  parsnip::set_engine("ets") %>% 
  parsnip::fit(tennis ~ date, data = train)

# Linear Regression with time trend
lm_fit <- 
  parsnip::linear_reg() %>% 
  parsnip::set_engine("lm") %>% 
  parsnip::fit(tennis ~ date, data = train)

# Create model table
models_tbl <- modeltime::modeltime_table(
  arima_fit, 
  arima_boost_fit, 
  ets_fit, 
  lm_fit
  # , prophet_fit
)

# Calibrate on test set
calibrate_tbl <- models_tbl %>% 
  modeltime::modeltime_calibrate(new_data = test)

# Forecast visualization
calibrate_tbl %>%
  modeltime::modeltime_forecast(
    actual_data = tennis_weekly_tbl,
    new_data    = test
  ) %>% 
  modeltime::plot_modeltime_forecast(
    .title = "Modeltime Forecasts vs Actual"
  )

# Accuracy table
mt_acc_tbl <- calibrate_tbl %>% 
  modeltime::modeltime_accuracy()

mt_acc_tbl %>%
  dplyr::arrange(rmse) %>%
  dplyr::select(.model_desc, mae, rmse, mape, mase) %>%
  kableExtra::kbl(digits = 2, caption = "Modeltime Accuracy (ML-style)") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>% 
  kableExtra::row_spec(1, bold = TRUE, color = "white", background = "#4CAF50")

Modeltime Accuracy (ML-style)
.model_desc	mae	rmse	mape	mase
ARIMA(1,0,1) WITH NON-ZERO MEAN	5.31	6.45	48.37	1.88
ARIMA(1,0,1) WITH NON-ZERO MEAN	5.31	6.45	48.37	1.88
ETS(M,N,M)	4.18	6.81	29.84	1.48
LM	8.27	8.70	79.74	2.93

10 9) Refit best models on full data & forecast forward

# Refit all models on full dataset
refit_tbl <- calibrate_tbl %>% 
  modeltime::modeltime_refit(data = tennis_weekly_tbl)

# Forecast 156 weeks ahead (≈ 3 years)
future_forecast <- refit_tbl %>% 
  modeltime::modeltime_forecast(
    h = 156, 
    actual_data = tennis_weekly_tbl
  )

# Plot with confidence intervals
future_forecast %>% 
  modeltime::plot_modeltime_forecast(
    .title = "3-Year Forward Forecast (with 95% CI)"
  )

# Plot without confidence intervals for clarity
future_forecast %>% 
  modeltime::plot_modeltime_forecast(
    .conf_interval_show = FALSE,
    .title = "3-Year Forward Forecast (Point Estimates)"
  )

11 10) Combine accuracies: fpp3 vs modeltime

# Prepare fpp3 results
fpp3_accuracy <- fpp3_acc_tbl %>% 
  dplyr::select(Model = .model, RMSE, MAE, MAPE) %>% 
  dplyr::mutate(Framework = "fpp3 (Benchmark)")

# Prepare modeltime results
modeltime_accuracy <- calibrate_tbl %>% 
  modeltime::modeltime_accuracy() %>% 
  dplyr::select(Model = .model_desc, RMSE = rmse, MAE = mae, MAPE = mape) %>% 
  dplyr::mutate(Framework = "modeltime (ML)")

# Combine and sort
combined_accuracy <- dplyr::bind_rows(fpp3_accuracy, modeltime_accuracy) %>% 
  dplyr::arrange(RMSE)

# Display table
combined_accuracy %>% 
  kableExtra::kbl(digits = 2, caption = "All Models – Accuracy Comparison (Sorted by RMSE)") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>% 
  kableExtra::row_spec(1, bold = TRUE, color = "white", background = "#4CAF50")

All Models – Accuracy Comparison (Sorted by RMSE)
Model	RMSE	MAE	MAPE	Framework
Naive	6.23	2.71	14.78	fpp3 (Benchmark)
ARIMA(1,0,1) WITH NON-ZERO MEAN	6.45	5.31	48.37	modeltime (ML)
ARIMA(1,0,1) WITH NON-ZERO MEAN	6.45	5.31	48.37	modeltime (ML)
Mean	6.53	5.43	49.68	fpp3 (Benchmark)
ETS(M,N,M)	6.81	4.18	29.84	modeltime (ML)
SNaive	7.36	2.29	13.95	fpp3 (Benchmark)
Drift	7.63	4.73	33.24	fpp3 (Benchmark)
LM	8.70	8.27	79.74	modeltime (ML)

# Visual comparison
combined_accuracy %>% 
  dplyr::mutate(Model = forcats::fct_reorder(Model, RMSE)) %>%
  ggplot2::ggplot(aes(x = Model, y = RMSE, fill = Framework)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Model Performance Comparison",
    subtitle = "Lower RMSE is better – Best models at top",
    x = "Model",
    y = "RMSE"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

12 11) Summary Statistics

# Overall data summary
summary_stats <- tennis_weekly_tbl %>%
  summarise(
    Min = min(tennis),
    Q1 = quantile(tennis, 0.25),
    Median = median(tennis),
    Mean = mean(tennis),
    Q3 = quantile(tennis, 0.75),
    Max = max(tennis),
    SD = sd(tennis)
  )

summary_stats %>%
  kableExtra::kbl(digits = 2, caption = "Summary Statistics: Tennis Interest") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Summary Statistics: Tennis Interest
Min	Q1	Median	Mean	Q3	Max	SD
7	9	10	14.3	12.5	77	12.57

# Best model summary
best_model <- combined_accuracy %>% dplyr::slice(1)
cat("\n Best Performing Model:", best_model$Model, "\n")

## 
##  Best Performing Model: Naive

cat("   Framework:", best_model$Framework, "\n")

##    Framework: fpp3 (Benchmark)

cat("   RMSE:", round(best_model$RMSE, 2), "\n")

##    RMSE: 6.23

cat("   MAE:", round(best_model$MAE, 2), "\n")

##    MAE: 2.71

cat("   MAPE:", round(best_model$MAPE, 2), "%\n")

##    MAPE: 14.78 %

Tennis Popularity Timeseries

Eliza Morrison

2025-11-03

1 Steps

2 1) Install packages (one‑time)

3 2) Load libraries

4 3) Get the data

4.1 Save your data file first

5 4) Import & prepare the time series

6 5) Exploratory Data Analysis

7 6) Train / Test split

8 7) Baseline benchmark models (fpp3)

9 8) Automated models (modeltime)

10 9) Refit best models on full data & forecast forward

11 10) Combine accuracies: fpp3 vs modeltime

12 11) Summary Statistics