MCMC for ‘Big Data’ with Stan

Faster sampling with CmdStan using within-chain parallelization

Statistics
ML
Bayesian Modeling
Big Data
Stan
R
Author

Marc-Aurèle Rivière

Published

June 5, 2022

Abstract

This is an extension (and a translation into R) of PyMC-Labs’ benchmark of MCMC for Big Data. The Stan code was updated to use within-chain parallelization and compiler optimizations for faster CPU sampling.

This document is no longer updated

Please visit this page for a more up-to-date version of this post.

Note

You can check the source code by clicking on the </> Code button at the top-right.

1 Setup


library(here)        # File path management
library(pipebind)    # Piping goodies

library(data.table)  # Data wrangling (fast)
library(dplyr)       # Data wrangling
library(tidyr)       # Data wrangling (misc)
library(purrr)       # Manipulating lists
library(stringr)     # Manipulating strings
library(lubridate)   # Manipulating dates

library(cmdstanr)    # R interface with Stan
library(posterior)   # Wrangling Stan model outputs

library(ggplot2)     # Plots
library(patchwork)   # Combining plots
library(ggridges)    # Ridgeline plots
library(bayesplot)   # Plots for Stan models

options(
  mc.cores = max(1L, parallel::detectCores(logical = TRUE)),
  scipen = 999L, 
  digits = 4L,
  ggplot2.discrete.colour = \() scale_color_viridis_d(),
  ggplot2.discrete.fill = \() scale_fill_viridis_d()
)

nrows_print <- 10

data.table::setDTthreads(getOption("mc.cores"))

1.1 Stan setup

Installing CmdStan
cmdstanr::check_cmdstan_toolchain(fix = TRUE, quiet = TRUE)

cpp_opts <- list(
  stan_threads = TRUE
  , STAN_CPP_OPTIMS = TRUE
  , STAN_NO_RANGE_CHECKS = TRUE # WARN: remove this if you haven't tested the model
  , PRECOMPILED_HEADERS = TRUE
  , CXXFLAGS_OPTIM = "-march=native -mtune=native"
  , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
  , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
)

cmdstanr::install_cmdstan(cpp_options = cpp_opts, quiet = TRUE)
Loading CmdStan (if already installed)
highest_cmdstan_version <- fs::dir_ls(config$cmdstan_path) |> fs::path_file() |> 
  keep(\(e) str_detect(e, "cmdstan-")) |> 
  bind(x, str_split(x, '-', simplify = TRUE)[,2]) |> 
  reduce(\(x, y) ifelse(utils::compareVersion(x, y) == 1, x, y))

set_cmdstan_path(glue::glue("{config$cmdstan_path}cmdstan-{highest_cmdstan_version}"))
Setting up knitr’s engine for CmdStan
## Inspired by: https://mpopov.com/blog/2020/07/30/replacing-the-knitr-engine-for-stan/

## Note: We could have used cmdstanr::register_knitr_engine(), 
##       but it wouldn't include compiler optimizations & multi-threading by default

knitr::knit_engines$set(
  cmdstan = function(options) {
    output_var <- options$output.var
    if (!is.character(output_var) || length(output_var) != 1L) {
      stop(
        "The chunk option output.var must be a character string ",
        "providing a name for the returned `CmdStanModel` object."
      )
    }
    if (options$eval) {
      if (options$cache) {
        cache_path <- options$cache.path
        if (length(cache_path) == 0L || is.na(cache_path) || cache_path == "NA") 
          cache_path <- ""
        dir <- paste0(cache_path, options$label)
      } else {
        dir <- tempdir()
      }
      file <- write_stan_file(options$code, dir = dir, force_overwrite = TRUE)
      mod <- cmdstan_model(
        file, 
        cpp_options = list(
          stan_threads = TRUE
          , STAN_CPP_OPTIMS = TRUE
          , STAN_NO_RANGE_CHECKS = TRUE # The model was already tested
          , PRECOMPILED_HEADERS = TRUE
          # , CXXFLAGS_OPTIM = "-march=native -mtune=native"
          , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
          , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
        ),
        stanc_options = list("Oexperimental")
      )
      assign(output_var, mod, envir = knitr::knit_global())
    }
    options$engine <- "stan"
    code <- paste(options$code, collapse = "\n")
    knitr::engine_output(options, code, '')
  }
)
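
With this engine registered, a model chunk in the Quarto source only needs to declare the cmdstan engine and an output.var chunk option; the compiled CmdStanModel object is then available to later R chunks under that name. An illustrative chunk header (this is how the tennis_mod_exe object used in section 3.3 is produced):

```{cmdstan, output.var = "tennis_mod_exe"}
// Stan program for the model (see section 3.1)
```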

─ Session info ───────────────────────────────────────────────────────────────
 setting        value
 version        R version 4.2.1 (2022-06-23)
 os             Ubuntu 20.04.4 LTS
 system         x86_64, linux-gnu
 ui             X11
 language       (EN)
 collate        C.UTF-8
 ctype          C.UTF-8
 tz             Europe/Paris
 date           2022-09-24
 pandoc         2.19.2 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
 Quarto         1.1.251
 Stan (CmdStan) 2.30.1

─ Packages ───────────────────────────────────────────────────────────────────
 ! package    * version date (UTC) lib source
   bayesplot  * 1.9.0   2022-03-10 [1] CRAN (R 4.2.0)
   cmdstanr   * 0.5.3   2022-08-03 [1] local
   data.table * 1.14.2  2021-09-27 [1] CRAN (R 4.2.0)
   dplyr      * 1.0.10  2022-09-01 [1] CRAN (R 4.2.1)
   ggplot2    * 3.3.6   2022-05-03 [1] CRAN (R 4.2.0)
   ggridges   * 0.5.3   2021-01-08 [1] CRAN (R 4.2.0)
 P here       * 1.0.1   2020-12-13 [?] CRAN (R 4.2.0)
   lubridate  * 1.8.0   2021-10-07 [1] CRAN (R 4.2.0)
   patchwork  * 1.1.2   2022-08-19 [1] CRAN (R 4.2.1)
   pipebind   * 0.1.1   2022-08-10 [1] CRAN (R 4.2.0)
   posterior  * 1.3.1   2022-09-06 [1] CRAN (R 4.2.1)
   purrr      * 0.3.4   2020-04-17 [1] CRAN (R 4.2.0)
 P stringr    * 1.4.1   2022-08-20 [?] CRAN (R 4.2.1)
   tidyr      * 1.2.1   2022-09-08 [1] CRAN (R 4.2.1)

 [1] /home/mar/Dev/Projects/R/Bayes/renv/library/R-4.2/x86_64-pc-linux-gnu
 [2] /home/mar/.cache/R/renv/library/Bayes-9578a481/R-4.2/x86_64-pc-linux-gnu
 [3] /usr/lib/R/library
 [4] /usr/local/lib/R/site-library
 [5] /usr/lib/R/site-library

 P ── Loaded and on-disk path mismatch.

──────────────────────────────────────────────────────────────────────────────

2 Data


2.1 Matches data

Loading the raw matches data:

matches_data_raw <- purrr::map_dfr(
  fs::dir_ls(matches_data_path, regexp = "atp_matches_(.*).csv"),
  \(f) readr::read_csv(f, num_threads = 32, show_col_types = FALSE) |> 
    select(tourney_date, tourney_level, round, winner_id, winner_name, loser_id, loser_name, score)
) |> mutate(tourney_date = lubridate::ymd(tourney_date))

Filtering matches based on the original post’s data processing:

round_numbers = list(
  "R128" = 1,
  "RR" = 1,
  "R64" = 2,
  "R32" = 3,
  "R16" = 4,
  "QF" = 5,
  "SF" = 6,
  "F" = 7
)

(matches_data_clean <- matches_data_raw 
  |> filter(
    tourney_date %between% c("1968-01-01", "2021-06-20"),
    str_detect(score, "RET|W/O|DEF|nbsp|Def.", negate = TRUE),
    str_length(score) > 4,
    tourney_level != "D",
    round %in% names(round_numbers)
  )
  |> mutate(
    round_number = sapply(round, \(r) round_numbers[[r]]),
    label = 1
  )
  |> arrange(tourney_date, round_number)
  |> select(-round, -tourney_level)
)
data.frame [160,399 x 8]
tourney_date winner_id winner_name loser_id loser_name score round_number label
1968-01-19 110 023 Richard Coulthard 107 760 Max Senior 12-10 7-5 4-6 7-5 2 1
1968-01-19 109 803 John Brown 106 964 Ernie Mccabe 6-3 6-2 6-4 2 1
1968-01-19 100 257 Ross Case 110 024 Gondo Widjojo 6-4 3-6 6-3 7-5 2 1
1968-01-19 100 105 Allan Stone 110 025 Robert Layton 6-4 6-2 6-1 2 1
1968-01-19 109 966 Warren Jacques 110 026 Bert Kearney 6-4 6-1 7-5 2 1
1968-01-19 107 759 Max Pettman 110 027 Takesji Tsujimoto 6-4 6-1 6-2 2 1
1968-01-19 100 101 Mike Belkin 110 028 M Marchment 6-2 3-6 6-4 9-7 2 1
1968-01-19 100 025 Barry Phillips Moore 108 430 Tony Dawson 6-3 6-0 6-3 2 1
1968-01-19 108 519 William Coghlan 110 029 Peter Oatey 6-0 6-2 9-11 6-3 2 1
1968-01-19 109 799 Geoff Pollard 110 030 Christian Janssens 6-4 6-2 6-4 2 1
1968-01-19 100 146 Jun Kamiwazumi 110 031 Brian Connor 8-6 6-4 6-2 2 1
1968-01-19 100 180 Bill Lloyd 110 032 H Nielson 6-2 7-5 6-4 2 1
1968-01-19 100 013 Neale Fraser 110 033 R Harvey 6-1 6-1 6-0 2 1
1968-01-19 110 034 Merv Guse 110 035 J May 6-1 6-2 6-2 2 1
1968-01-19 100 174 Manuel Orantes 110 036 L Weatherhog 6-4 6-0 6-2 2 1
[ omitted 160,384 entries ]

2.2 Player data

Loading the raw player data:

(player_data_raw <- readr::read_csv(player_data_path) 
 |> mutate(player_name = str_c(name_first, name_last, sep = " "))
 |> select(player_id, player_name)
)
data.frame [55,649 x 2]
player_id player_name
100 001 Gardnar Mulloy
100 002 Pancho Segura
100 003 Frank Sedgman
100 004 Giuseppe Merlo
100 005 Richard Gonzalez
100 006 Grant Golden
100 007 Abe Segal
100 008 Kurt Nielsen
100 009 Istvan Gulyas
100 010 Luis Ayala
100 011 Torben Ulrich
100 012 Nicola Pietrangeli
100 013 Neale Fraser
100 014 Trevor Fancutt
100 015 Sammy Giammalva
[ omitted 55,634 entries ]

Filtering player_data to only keep the players actually present in our data, and updating their IDs:

(player_data <- with(matches_data_clean, tibble(player_id = union(winner_id, loser_id)))
 |> group_by(player_id) 
 |> summarize(player_idx = cur_group_id()) 
 |> left_join(player_data_raw)
)
data.frame [4,830 x 3]
player_id player_idx player_name
100 001 1 Gardnar Mulloy
100 002 2 Pancho Segura
100 003 3 Frank Sedgman
100 004 4 Giuseppe Merlo
100 005 5 Richard Gonzalez
100 006 6 Grant Golden
100 007 7 Abe Segal
100 009 8 Istvan Gulyas
100 010 9 Luis Ayala
100 011 10 Torben Ulrich
100 012 11 Nicola Pietrangeli
100 013 12 Neale Fraser
100 014 13 Trevor Fancutt
100 015 14 Sammy Giammalva
100 016 15 Ken Rosewall
[ omitted 4,815 entries ]

2.3 Matches + Player data

Adding the new player IDs (player_idx) corresponding to each match’s winner_id and loser_id in matches_data:

(matches_data <- left_join(matches_data_clean, player_data, by = c("winner_id" = "player_id")) 
 |> rename(winner_idx = player_idx) 
 |> relocate(winner_idx, .after = winner_id) 
 |> left_join(player_data, by = c("loser_id" = "player_id")) 
 |> rename(loser_idx = player_idx) 
 |> relocate(loser_idx, .after = loser_id)
 |> drop_na(winner_idx, loser_idx)
 |> select(-matches("player_name"))
)
data.frame [160,399 x 10]
tourney_date winner_id winner_idx winner_name loser_id loser_idx loser_name score round_number label
1968-01-19 110 023 3 655 Richard Coulthard 107 760 3 129 Max Senior 12-10 7-5 4-6 7-5 2 1
1968-01-19 109 803 3 440 John Brown 106 964 2 909 Ernie Mccabe 6-3 6-2 6-4 2 1
1968-01-19 100 257 253 Ross Case 110 024 3 656 Gondo Widjojo 6-4 3-6 6-3 7-5 2 1
1968-01-19 100 105 103 Allan Stone 110 025 3 657 Robert Layton 6-4 6-2 6-1 2 1
1968-01-19 109 966 3 600 Warren Jacques 110 026 3 658 Bert Kearney 6-4 6-1 7-5 2 1
1968-01-19 107 759 3 128 Max Pettman 110 027 3 659 Takesji Tsujimoto 6-4 6-1 6-2 2 1
1968-01-19 100 101 99 Mike Belkin 110 028 3 660 M Marchment 6-2 3-6 6-4 9-7 2 1
1968-01-19 100 025 24 Barry Phillips Moore 108 430 3 325 Tony Dawson 6-3 6-0 6-3 2 1
1968-01-19 108 519 3 349 William Coghlan 110 029 3 661 Peter Oatey 6-0 6-2 9-11 6-3 2 1
1968-01-19 109 799 3 436 Geoff Pollard 110 030 3 662 Christian Janssens 6-4 6-2 6-4 2 1
1968-01-19 100 146 144 Jun Kamiwazumi 110 031 3 663 Brian Connor 8-6 6-4 6-2 2 1
1968-01-19 100 180 178 Bill Lloyd 110 032 3 664 H Nielson 6-2 7-5 6-4 2 1
1968-01-19 100 013 12 Neale Fraser 110 033 3 665 R Harvey 6-1 6-1 6-0 2 1
1968-01-19 110 034 3 666 Merv Guse 110 035 3 667 J May 6-1 6-2 6-2 2 1
1968-01-19 100 174 172 Manuel Orantes 110 036 3 668 L Weatherhog 6-4 6-0 6-2 2 1
[ omitted 160,384 entries ]

3 Model


3.1 Stan code

Note

Updated Stan code with within-chain parallelization

functions {
  array[] int sequence(int start, int end) {
    array[end - start + 1] int seq;
    for (n in 1 : num_elements(seq)) {
      seq[n] = n + start - 1;
    }
    return seq;
  }

  // Compute partial sums of the log-likelihood
  real partial_log_lik_lpmf(array[] int seq, int start, int end,
                            data array[] int labels, 
                            data array[] int winner_ids, 
                            data array[] int loser_ids, 
                            vector player_skills) {
    real ptarget = 0;
    int N = end - start + 1;

    vector[N] mu = rep_vector(0.0, N);
    for (n in 1 : N) {
      int nn = n + start - 1;
      mu[n] += player_skills[winner_ids[nn]] - player_skills[loser_ids[nn]];
    }
    ptarget += bernoulli_logit_lpmf(labels[start : end] | mu);
    return ptarget;
  }
}
data {
    int n_players;
    int n_matches;
    
    array[n_matches] int<lower=1, upper=n_players> winner_ids; // Winner of game n
    array[n_matches] int<lower=1, upper=n_players> loser_ids;  // Loser of game n
    array[n_matches] int<lower=0, upper=1> labels;             // Always 1 in this model
    
    int grainsize;
}
transformed data {
    array[n_matches] int seq = sequence(1, n_matches);
}
parameters {
    real<lower=0> player_sd;          // Scale of ability variation (hierarchical prior)
    vector[n_players] player_skills;  // Ability of player k
}
model {   
  player_sd ~ std_normal();
  player_skills ~ normal(0, player_sd);
    
  target += reduce_sum(
    partial_log_lik_lpmf, seq, grainsize, 
    labels, winner_ids, loser_ids, player_skills
  );
}
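
Compiling this program outside of the custom knitr engine would look roughly like the sketch below (the file name is hypothetical; the options mirror those set in the engine above, and stan_threads = TRUE is what allows reduce_sum to actually use multiple threads):

tennis_mod_exe <- cmdstan_model(
  "tennis_model.stan",                      # hypothetical path to the Stan program above
  cpp_options = list(stan_threads = TRUE),  # required for within-chain parallelization
  stanc_options = list("Oexperimental")
)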

3.2 Stan data

tennis_data_stan <- list(
  n_matches = nrow(matches_data),
  n_players = with(matches_data, length(union(winner_id, loser_id))),
  winner_ids = matches_data$winner_idx,
  loser_ids = matches_data$loser_idx,
  labels = matches_data$label,
  grainsize = max(100, nrow(matches_data) / 60)
)
List of 6
 $ n_matches : int 160399
 $ n_players : int 4830
 $ winner_ids: int [1:160399] 3655 3440 253 103 3600 3128 99 24 3349 3436 ...
 $ loser_ids : int [1:160399] 3129 2909 3656 3657 3658 3659 3660 3325 3661 3662 ...
 $ labels    : num [1:160399] 1 1 1 1 1 1 1 1 1 1 ...
 $ grainsize : num 2673
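
As a quick sanity check on the grainsize choice (illustrative, not part of the original post): reduce_sum treats grainsize as the suggested number of observations per partial sum, so dividing by 60 yields on the order of 60 chunks per chain for the scheduler to spread across its worker threads.

nrow(matches_data) / 60  # ~2,673 matches per partial sum, i.e. roughly 60 chunks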

3.3 Model fit

tennis_mod_fit <- tennis_mod_exe$sample(
  data = tennis_data_stan, seed = 256,
  iter_warmup = 1000, iter_sampling = 1000, refresh = 0,
  chains = 4, parallel_chains = 4, threads_per_chain = 7
)
Note

Sampling takes ~2.67 minutes on my CPU (Ryzen 5950X, 16 Cores/32 Threads), on WSL2 (Ubuntu 20.04)

data.table [4 x 2]
Chain Time
1 161.834s (~2.7 minutes)
2 160.699s (~2.68 minutes)
3 159.521s (~2.66 minutes)
4 159.712s (~2.66 minutes)
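
The per-chain times shown above can be retrieved directly from the fitted object (a minimal sketch using cmdstanr’s $time() method; the table formatting is done separately):

tennis_mod_fit$time()$chains  # warmup, sampling, and total time (in seconds) per chain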

4 Model diagnostics


bayesplot::mcmc_neff_hist(bayesplot::neff_ratio(tennis_mod_fit))

bayesplot::mcmc_rhat_hist(bayesplot::rhat(tennis_mod_fit))
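
As a numerical complement to these plots (a sketch, not part of the original post; both calls are standard cmdstanr methods):

tennis_mod_fit$diagnostic_summary()  # divergences, max treedepth hits, and E-BFMI per chain
tennis_mod_fit$summary("player_sd")  # mean, quantiles, rhat, and ESS for the hierarchical scale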

Plotting random subsets of the traces:

hist_trace_plot <- function(mod, vars) {
  draws <- mod$draws(variables = vars, format = "draws_list")
  patchwork::wrap_plots(
    bayesplot::mcmc_hist(draws, facet_args = list(nrow = length(vars))),
    bayesplot::mcmc_trace(draws, facet_args = list(nrow = length(vars))),
    widths = c(1, 1.5)
  )
}
hist_trace_plot(
  tennis_mod_fit, 
  paste0("player_skills[", sample(player_data$player_idx, 5), "]")
)

5 Posterior Predictions


5.1 Posterior data

Getting our Posterior Predictions into long format and joining the result with player_data:

(player_skills <- as.data.table(tennis_mod_fit$draws(variables = "player_skills") |> 
    bind(x, subset_draws(x, "player_skills", regex = T, draw = sample.int(ndraws(x), size = 500))))
   [, .(player_skills = list(value)), by = variable
  ][, `:=`(player_idx = as.integer(str_extract(variable, "\\d{1,4}")), variable = NULL)
  ][, `:=`(skill_mean = sapply(player_skills, mean), skill_sd = sapply(player_skills, sd))
  ][as.data.table(player_data), on = "player_idx", nomatch = NULL
  ][order(-skill_mean), .(player_name, player_id, player_idx, skill_mean, skill_sd, player_skills)]
)
data.table [4,830 x 6]
player_name player_id player_idx skill_mean skill_sd player_skills
Novak Djokovic 104 925 2 415 3.533 0.092 <numeric [500]>
Rafael Nadal 104 745 2 367 3.425 0.085 <numeric [500]>
Roger Federer 103 819 2 082 3.31 0.075 <numeric [500]>
Bjorn Borg 100 437 423 3.249 0.108 <numeric [500]>
Ivan Lendl 100 656 613 3.228 0.084 <numeric [500]>
John McEnroe 100 581 553 3.188 0.094 <numeric [500]>
Jimmy Connors 100 284 280 3.169 0.075 <numeric [500]>
Rod Laver 100 029 28 3.014 0.108 <numeric [500]>
Andy Murray 104 918 2 413 2.926 0.094 <numeric [500]>
Pete Sampras 101 948 1 408 2.922 0.087 <numeric [500]>
Boris Becker 101 414 1 127 2.834 0.085 <numeric [500]>
Andre Agassi 101 736 1 310 2.77 0.082 <numeric [500]>
Stefan Edberg 101 222 993 2.74 0.082 <numeric [500]>
Andy Roddick 104 053 2 157 2.703 0.094 <numeric [500]>
Juan Martin del Potro 105 223 2 503 2.68 0.105 <numeric [500]>
[ omitted 4,815 entries ]
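
For readers less at home with data.table, the mean/sd part of this summary (without the list-column of raw draws needed for the ridgeline plots below) could also be obtained with posterior and dplyr; a rough sketch, not an exact equivalent of the code above:

(tennis_mod_fit$draws(variables = "player_skills")
 |> posterior::summarise_draws(skill_mean = mean, skill_sd = sd)
 |> mutate(player_idx = as.integer(str_extract(variable, "\\d{1,4}")))
 |> inner_join(player_data, by = "player_idx")
 |> arrange(desc(skill_mean))
)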

5.2 Posterior plots

Plot code
ridgeline_plot <- function(dat, var) {
  dat[, .(player_skills = unlist(player_skills)), by = setdiff(names(dat), 'player_skills')
     ][, player_name := factor(player_name, levels = unique(player_name))] -> dat
  
  ggplot(dat, aes_string(y = var)) +
    ggridges::geom_density_ridges(
      aes_string(x = "player_skills", fill = var), 
      alpha = 0.5, scale = 2, color = "grey30"
    ) +
    labs(x = "Player Skills", y = "") +
    scale_y_discrete(
      position = "right", 
      limits = \(x) rev(x), 
      labels = \(x) str_replace_all(x, "\\s", "\n")
    ) +
    theme(legend.position = "none", axis.line.y = element_blank())
}

Plotting the player_skills posteriors of the top 10 players:

ridgeline_plot(head(player_skills, 10), "player_name")

Plotting the player_skills posteriors of the bottom 10 players:

ridgeline_plot(tail(player_skills, 10), "player_name")