Bayesian Rock Climbing Rankings

With R and Stan

Statistics
ML
Bayesian Modeling
Stan
R
Author

Marc-Aurèle Rivière

Published

April 19, 2022

Abstract

This is simply a transposition of Ethan Rosenthal’s article on Bayesian Rock Climbing to R. The Stan code was updated to use within-chain parallelization and compiler optimization for faster CPU sampling.

This document is no longer updated

Please visit this page for a more up-to-date version of this post.

Note

You can check the source code by clicking on the </> Code button at the top-right.

1 Setup


library(here)        # File path management
library(pipebind)    # Piping goodies

library(data.table)  # Data wrangling (fast)
library(dplyr)       # Data wrangling
library(dtplyr)      # data.table backend for dplyr
library(dbplyr)      # SQL backend for dplyr
library(DBI)         # Database connection
library(RSQLite)     # SQLite interface
library(purrr)       # Manipulating lists
library(stringr)     # Manipulating strings
library(lubridate)   # Manipulating dates

library(cmdstanr)    # R interface with Stan
library(posterior)   # Wrangling Stan model ouputs

library(ggplot2)     # Plots
library(ggridges)    # Ridgeline plots
library(bayesplot)   # Plots for Stan models
library(patchwork)   # Combining plots

options(
  mc.cores = max(1L, parallel::detectCores(logical = TRUE)),
  scipen = 999L, 
  digits = 4L,
  ggplot2.discrete.colour = \() scale_color_viridis_d(),
  ggplot2.discrete.fill = \() scale_fill_viridis_d()
)

nrows_print <- 10

data.table::setDTthreads(getOption("mc.cores"))

1.1 Stan setup

Installing CmdStan
cmdstanr::check_cmdstan_toolchain(fix = TRUE, quiet = TRUE)

cpp_opts <- list(
  stan_threads = TRUE
  , STAN_CPP_OPTIMS = TRUE
  , STAN_NO_RANGE_CHECKS = TRUE # WARN: remove this if you haven't tested the model
  , PRECOMPILED_HEADERS = TRUE
  , CXXFLAGS_OPTIM = "-march=native -mtune=native"
  , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
  , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
)

cmdstanr::install_cmdstan(cpp_options = cpp_opts, quiet = TRUE)
Loading CmdStan (if already installed)
highest_cmdstan_version <- fs::dir_ls(config$cmdstan_path) |> fs::path_file() |> 
  keep(\(e) str_detect(e, "cmdstan-")) |> 
  bind(x, str_split(x, '-', simplify = TRUE)[,2]) |> 
  reduce(\(x, y) ifelse(utils::compareVersion(x, y) == 1, x, y))

set_cmdstan_path(glue::glue("{config$cmdstan_path}cmdstan-{highest_cmdstan_version}"))
Setting up knitr’s engine for CmdStan
## Inspired by: https://mpopov.com/blog/2020/07/30/replacing-the-knitr-engine-for-stan/

## Note: We could haved use cmdstanr::register_knitr_engine(), 
##       but it wouldn't include compiler optimizations & multi-threading by default

knitr::knit_engines$set(
  cmdstan = function(options) {
    output_var <- options$output.var
    if (!is.character(output_var) || length(output_var) != 1L) {
      stop(
        "The chunk option output.var must be a character string ",
        "providing a name for the returned `CmdStanModel` object."
      )
    }
    if (options$eval) {
      if (options$cache) {
        cache_path <- options$cache.path
        if (length(cache_path) == 0L || is.na(cache_path) || cache_path == "NA") 
          cache_path <- ""
        dir <- paste0(cache_path, options$label)
      } else {
        dir <- tempdir()
      }
      file <- write_stan_file(options$code, dir = dir, force_overwrite = TRUE)
      mod <- cmdstan_model(
        file, 
        cpp_opts <- list(
          stan_threads = TRUE
          , STAN_CPP_OPTIMS = TRUE
          , STAN_NO_RANGE_CHECKS = TRUE # The model was already tested
          , PRECOMPILED_HEADERS = TRUE
          # , CXXFLAGS_OPTIM = "-march=native -mtune=native"
          , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
          , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
        ),
        stanc_options = list("Oexperimental")
      )
      assign(output_var, mod, envir = knitr::knit_global())
    }
    options$engine <- "stan"
    code <- paste(options$code, collapse = "\n")
    knitr::engine_output(options, code, '')
  }
)

─ Session info ───────────────────────────────────────────────────────────────
 setting        value
 version        R version 4.2.1 (2022-06-23)
 os             Ubuntu 20.04.4 LTS
 system         x86_64, linux-gnu
 ui             X11
 language       (EN)
 collate        C.UTF-8
 ctype          C.UTF-8
 tz             Europe/Paris
 date           2022-09-24
 pandoc         2.19.2 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
 Quarto         1.1.251
 Stan (CmdStan) 2.30.1

─ Packages ───────────────────────────────────────────────────────────────────
 ! package    * version date (UTC) lib source
   bayesplot  * 1.9.0   2022-03-10 [1] CRAN (R 4.2.0)
   cmdstanr   * 0.5.3   2022-08-03 [1] local
   data.table * 1.14.2  2021-09-27 [1] CRAN (R 4.2.0)
   DBI        * 1.1.3   2022-06-18 [1] CRAN (R 4.2.0)
   dbplyr     * 2.2.1   2022-06-27 [1] CRAN (R 4.2.0)
   dplyr      * 1.0.10  2022-09-01 [1] CRAN (R 4.2.1)
   dtplyr     * 1.2.2   2022-08-20 [1] CRAN (R 4.2.1)
   ggplot2    * 3.3.6   2022-05-03 [1] CRAN (R 4.2.0)
   ggridges   * 0.5.3   2021-01-08 [1] CRAN (R 4.2.0)
 P here       * 1.0.1   2020-12-13 [?] CRAN (R 4.2.0)
   lubridate  * 1.8.0   2021-10-07 [1] CRAN (R 4.2.0)
   patchwork  * 1.1.2   2022-08-19 [1] CRAN (R 4.2.1)
   pipebind   * 0.1.1   2022-08-10 [1] CRAN (R 4.2.0)
   posterior  * 1.3.1   2022-09-06 [1] CRAN (R 4.2.1)
   purrr      * 0.3.4   2020-04-17 [1] CRAN (R 4.2.0)
   RSQLite    * 2.2.17  2022-09-10 [1] CRAN (R 4.2.1)
 P stringr    * 1.4.1   2022-08-20 [?] CRAN (R 4.2.1)

 [1] /home/mar/Dev/Projects/R/Bayes/renv/library/R-4.2/x86_64-pc-linux-gnu
 [2] /home/mar/.cache/R/renv/library/Bayes-9578a481/R-4.2/x86_64-pc-linux-gnu
 [3] /usr/lib/R/library
 [4] /usr/local/lib/R/site-library
 [5] /usr/lib/R/site-library

 P ── Loaded and on-disk path mismatch.

──────────────────────────────────────────────────────────────────────────────

2 Data


2.1 Loading from SQL

con <- DBI::dbConnect(RSQLite::SQLite(), dbname = climb_path)
Note

Comparing raw SQL and dbplyr:

Note

dbplyr automatically translates dplyr code into SQL

climb_dbp <- (reduce(
    list(
      tbl(con, "ascent") |> filter(country %like% "USA") |> 
        select(user_id, grade_id, method_id, crag, climb_type, route_name = name, ascent_date = date),
      tbl(con, "grade") |> select(grade_id = id, usa_routes, usa_boulders), 
      tbl(con, "method") |> select(method_id = id, method_name = name)
    ),
    .f = \(acc, i) left_join(acc, i)
  ) 
  |> select(-grade_id, -method_id)
  |> collect()
)
data.frame [658,822 x 8]
user_id crag climb_type route_name ascent_date usa_routes usa_boulders method_name
25 Hueco Tanks 1 Frogger 917 823 600 5.12d V9 Redpoint
25 Hueco Tanks 1 Sex After Death 917 823 600 5.12d V9 Redpoint
25 Hueco Tanks 1 Red 915 145 200 5.12d V9 Redpoint
25 Hueco Tanks 1 Better Eat Your Wheaties 949 359 600 5.12c V8/9 Redpoint
25 Horse Flats 1 Blank Generation 957 132 000 5.12d V9 Redpoint
25 Bishop 1 Shazam 933 458 400 5.12d V9 Redpoint
25 Bishop 1 It’s All About Love 938 728 800 5.12c V8/9 Redpoint
25 Joshua Tree 1 Crypic Tips 957 132 000 5.12d V9 Redpoint
25 Hueco Tanks 1 Mushroom Roof 946 681 200 5.12c V8/9 Redpoint
26 The Pit 0 Castillo 969 660 000 5.13c V12 Redpoint
26 The Pit 0 Sister of Mercy 969 660 000 5.12c V8/9 Flash
26 The Pit 0 Rage 965 858 400 5.13b V11 Redpoint
26 The Pit 0 Fresh Squeezed 949 359 600 5.13a V10 Redpoint
26 The Pit 0 The Specialist 957 132 000 5.13a V10 Redpoint
26 The Pit 0 No Joke 965 080 800 5.13a V10 Redpoint
[ omitted 658,807 entries ]

Time difference of 1.274 secs

SELECT
  ascent.user_id
  , ascent.crag
  , ascent.climb_type
  , ascent.name AS route_name
  , ascent.date AS ascent_date
  , grade.usa_routes
  , grade.usa_boulders
  , method.name AS method_name
FROM ascent
JOIN grade ON grade.id = ascent.grade_id
JOIN method ON method.id = ascent.method_id
WHERE ascent.country = 'USA'
data.frame [658,822 x 8]
user_id crag climb_type route_name ascent_date usa_routes usa_boulders method_name
25 Hueco Tanks 1 Frogger 917 823 600 5.12d V9 Redpoint
25 Hueco Tanks 1 Sex After Death 917 823 600 5.12d V9 Redpoint
25 Hueco Tanks 1 Red 915 145 200 5.12d V9 Redpoint
25 Hueco Tanks 1 Better Eat Your Wheaties 949 359 600 5.12c V8/9 Redpoint
25 Horse Flats 1 Blank Generation 957 132 000 5.12d V9 Redpoint
25 Bishop 1 Shazam 933 458 400 5.12d V9 Redpoint
25 Bishop 1 It’s All About Love 938 728 800 5.12c V8/9 Redpoint
25 Joshua Tree 1 Crypic Tips 957 132 000 5.12d V9 Redpoint
25 Hueco Tanks 1 Mushroom Roof 946 681 200 5.12c V8/9 Redpoint
26 The Pit 0 Castillo 969 660 000 5.13c V12 Redpoint
26 The Pit 0 Sister of Mercy 969 660 000 5.12c V8/9 Flash
26 The Pit 0 Rage 965 858 400 5.13b V11 Redpoint
26 The Pit 0 Fresh Squeezed 949 359 600 5.13a V10 Redpoint
26 The Pit 0 The Specialist 957 132 000 5.13a V10 Redpoint
26 The Pit 0 No Joke 965 080 800 5.13a V10 Redpoint
[ omitted 658,807 entries ]

Time difference of 0.9819 secs

2.2 Processing

route_ratings <- c(
  str_c("5.", 1:9), 
  map(str_c("5.", 10:15), \(x) str_c(x, letters[1:4])) |> unlist()
)

bouldering_grades <- c(paste0("V", 0:20))

## Mode for non-numerical data
mode <- \(x) levels(x)[which.max(tabulate(match(x, levels(x))))]
Note

Comparing data.table, dplyr, and dtplyr:

climb_dt <- as.data.table(climb_dbp)

threshold_ascents_dt <- function(old_dt, limit = 20) {
  new_dt <- old_dt[, if(.N >= limit) .SD, by = user_id
                 ][, if(.N >= limit) .SD, by = route_id]
  
  if (!identical(dim(old_dt), dim(new_dt))) 
    threshold_ascents_dt(new_dt, limit)
  else return(new_dt)
}
climb_dt <- climb_dt[,
    `:=`(
      route_id = str_c(
        str_replace_all(crag, ' ', '_'), "__", 
        str_replace_all(route_name, ' ', '_'), "__", 
        fifelse(climb_type == 1, 'boulder', 'rope')
      ),
      ascent_date = lubridate::as_datetime(ascent_date),
      usa_boulders = factor(usa_boulders, levels = bouldering_grades),
      usa_routes = factor(usa_routes, levels = route_ratings),
      label = as.integer(method_name %chin% c("Onsight", "Flash"))
    )
  ][climb_dt[, .I[which.min(ascent_date)], by = .(user_id, route_id)]$V1
  ][, `:=`(route_rating = mode(usa_routes), bouldering_grade = mode(usa_boulders)),
      by = route_id
  ]

(dt_clean <- threshold_ascents_dt(climb_dt)
   [, route_idx := .GRP, keyby = route_id
  ][, user_idx := .GRP, keyby = user_id
  ][, -c("usa_routes", "usa_boulders")]
)
data.table [232,887 x 12]
route_id user_id crag climb_type route_name ascent_date method_name label route_rating bouldering_grade route_idx user_idx
Mt_Potasi__Moment_of_Clarity__rope 4 Mt Potasi 0 Moment of Clarity 2006-12-19 23:00:00 Onsight 1 5.12a V7 2 258 1
Mt_Potasi__The_Natural__rope 4 Mt Potasi 0 The Natural 2006-12-23 23:00:00 Redpoint 0 5.12d V9 2 259 1
Obed__Deathblow__rope 4 Obed 0 Deathblow 2006-02-28 23:00:00 Flash 1 5.11b V6 2 443 1
Obed__Jungle_Jane__rope 4 Obed 0 Jungle Jane 2006-11-18 23:00:00 Onsight 1 5.12a V7 2 459 1
Obed__Maximum_Overdrive_Face__rope 4 Obed 0 Maximum Overdrive Face 2006-02-28 23:00:00 Redpoint 0 5.12b V8 2 464 1
Obed__Pet_Cemetary__rope 4 Obed 0 Pet Cemetary 2006-02-28 23:00:00 Flash 1 5.11a V5 2 471 1
Obed__Rage__rope 4 Obed 0 Rage 2006-02-28 23:00:00 Redpoint 0 5.12c V9 2 475 1
Obed__Solstice__rope 4 Obed 0 Solstice 2006-02-28 23:00:00 Onsight 1 5.12a V7 2 482 1
Obed__Tierrany__rope 4 Obed 0 Tierrany 2006-11-18 23:00:00 Onsight 1 5.12a V7 2 495 1
Red_River_Gorge__Ale-8-One__rope 4 Red River Gorge 0 Ale-8-One 2006-11-23 23:00:00 Onsight 1 5.12b V8 2 715 1
Red_River_Gorge__Blue_Eyed_Honkey_Jesus__rope 4 Red River Gorge 0 Blue Eyed Honkey Jesus 2006-11-28 23:00:00 Redpoint 0 5.12c V8 2 759 1
Red_River_Gorge__Chainsaw_Massacre__rope 4 Red River Gorge 0 Chainsaw Massacre 2006-11-23 23:00:00 Onsight 1 5.12a V7 2 787 1
Red_River_Gorge__Fuzzy_Undercling__rope 4 Red River Gorge 0 Fuzzy Undercling 2006-11-20 23:00:00 Onsight 1 5.11b V5 2 850 1
Red_River_Gorge__Gung_Ho__rope 4 Red River Gorge 0 Gung Ho 2006-11-20 23:00:00 Onsight 1 5.12b V8 2 870 1
Red_River_Gorge__Kick_Me_In_The_Jimmie__rope 4 Red River Gorge 0 Kick Me In The Jimmie 2006-11-23 23:00:00 Onsight 1 5.12a V7 2 910 1
[ omitted 232,872 entries ]
Time difference of 6.866 secs
threshold_ascents_df <- function(old_df, limit = 20) {
  new_df <- old_df |> 
    group_by(user_id) |> filter(n() >= limit) |> 
    group_by(route_id) |> filter(n() >= limit) |> 
    ungroup()
  
  if (!identical(dim(old_df), dim(new_df))) 
    threshold_ascents_df(new_df, limit)
  else return(new_df)
}
(df_clean <- climb_dbp
  |> mutate(
    route_id = str_c(
      str_replace_all(crag, ' ', '_'), "__", 
      str_replace_all(route_name, ' ', '_'), "__", 
      if_else(climb_type == 1, 'boulder', 'rope')
    ),
    ascent_date = lubridate::as_datetime(ascent_date),
    usa_boulders = factor(usa_boulders, levels = bouldering_grades),
    usa_routes = factor(usa_routes, levels = route_ratings)
  )
  |> group_by(route_id) 
  |> mutate(route_rating = mode(usa_routes), bouldering_grade = mode(usa_boulders)) 
  |> ungroup()
  |> select(-c(usa_routes, usa_boulders))
  |> mutate(label = as.integer(method_name %in% c("Onsight", "Flash")))
  |> group_by(user_id, route_id) |> slice(which.min(ascent_date)) |> ungroup()
  |> threshold_ascents_df(limit = 20) |> ungroup()
  |> group_by(route_id) |> mutate(route_idx = cur_group_id()) |> ungroup()
  |> group_by(user_id) |> mutate(user_idx = cur_group_id()) |> ungroup()
)
data.frame [232,887 x 12]
user_id crag climb_type route_name ascent_date method_name route_id route_rating bouldering_grade label route_idx user_idx
4 Mt Potasi 0 Moment of Clarity 2006-12-19 23:00:00 Onsight Mt_Potasi__Moment_of_Clarity__rope 5.12a V7 1 2 260 1
4 Mt Potasi 0 The Natural 2006-12-23 23:00:00 Redpoint Mt_Potasi__The_Natural__rope 5.12d V9 0 2 261 1
4 Obed 0 Deathblow 2006-02-28 23:00:00 Flash Obed__Deathblow__rope 5.11b V5 1 2 445 1
4 Obed 0 Jungle Jane 2006-11-18 23:00:00 Onsight Obed__Jungle_Jane__rope 5.12a V7 1 2 461 1
4 Obed 0 Maximum Overdrive Face 2006-02-28 23:00:00 Redpoint Obed__Maximum_Overdrive_Face__rope 5.12b V8 0 2 467 1
4 Obed 0 Pet Cemetary 2006-02-28 23:00:00 Flash Obed__Pet_Cemetary__rope 5.11a V5 1 2 473 1
4 Obed 0 Rage 2006-02-28 23:00:00 Redpoint Obed__Rage__rope 5.12c V9 0 2 477 1
4 Obed 0 Solstice 2006-02-28 23:00:00 Onsight Obed__Solstice__rope 5.12a V7 1 2 484 1
4 Obed 0 Tierrany 2006-11-18 23:00:00 Onsight Obed__Tierrany__rope 5.12a V7 1 2 497 1
4 Red River Gorge 0 Ale-8-One 2006-11-23 23:00:00 Onsight Red_River_Gorge__Ale-8-One__rope 5.12b V8 1 2 619 1
4 Red River Gorge 0 Blue Eyed Honkey Jesus 2006-11-28 23:00:00 Redpoint Red_River_Gorge__Blue_Eyed_Honkey_Jesus__rope 5.12c V8 0 2 663 1
4 Red River Gorge 0 Chainsaw Massacre 2006-11-23 23:00:00 Onsight Red_River_Gorge__Chainsaw_Massacre__rope 5.12a V7 1 2 691 1
4 Red River Gorge 0 Fuzzy Undercling 2006-11-20 23:00:00 Onsight Red_River_Gorge__Fuzzy_Undercling__rope 5.11b V5 1 2 756 1
4 Red River Gorge 0 Gung Ho 2006-11-20 23:00:00 Onsight Red_River_Gorge__Gung_Ho__rope 5.12b V8 1 2 776 1
4 Red River Gorge 0 Kick Me In The Jimmie 2006-11-23 23:00:00 Onsight Red_River_Gorge__Kick_Me_In_The_Jimmie__rope 5.12a V7 1 2 817 1
[ omitted 232,872 entries ]
Time difference of 37.63 secs
Note

dtplyr automatically translates dplyr code into data.table

threshold_ascents_dtp <- function(old_df, limit = 20) {
  new_df <- lazy_dt(old_df) |> 
    group_by(user_id) |> filter(n() >= limit) |> 
    group_by(route_id) |> filter(n() >= limit) |> 
    ungroup() |> collect()
  
  if (!identical(dim(old_df), dim(new_df)))
    threshold_ascents_dtp(new_df, limit)
  else return(new_df)
}
(dtp_clean <- climb_dbp
  |> lazy_dt()
  |> mutate(
    route_id = str_c(
      str_replace_all(crag, ' ', '_'), "__", 
      str_replace_all(route_name, ' ', '_'), "__", 
      if_else(climb_type == 1, 'boulder', 'rope')
    ),
    ascent_date = lubridate::as_datetime(ascent_date),
    usa_boulders = factor(usa_boulders, levels = bouldering_grades),
    usa_routes = factor(usa_routes, levels = route_ratings)
  )
  |> group_by(route_id) 
  |> mutate(route_rating = mode(usa_routes), bouldering_grade = mode(usa_boulders)) 
  |> ungroup()
  |> select(-c(usa_routes, usa_boulders))
  |> mutate(label = as.integer(method_name %in% c("Onsight", "Flash")))
  |> group_by(user_id, route_id) |> slice(which.min(ascent_date)) |> ungroup()
  |> threshold_ascents_dtp(limit = 20) |> ungroup()
  |> group_by(route_id) |> mutate(route_idx = cur_group_id()) |> ungroup()
  |> group_by(user_id) |> mutate(user_idx = cur_group_id()) |> ungroup()
  |> collect()
)
data.frame [232,887 x 12]
user_id crag climb_type route_name ascent_date method_name route_id route_rating bouldering_grade label route_idx user_idx
312 Red Rocks 0 Yaak Crack 1998-03-31 22:00:00 Redpoint Red_Rocks__Yaak_Crack__rope 5.11d V6 0 3 263 19
630 Red Rocks 0 Yaak Crack 2003-12-31 23:00:00 Redpoint Red_Rocks__Yaak_Crack__rope 5.11d V6 0 3 263 47
790 Red Rocks 0 Yaak Crack 2001-01-01 23:00:00 Flash Red_Rocks__Yaak_Crack__rope 5.11d V6 1 3 263 66
932 Red Rocks 0 Yaak Crack 2008-12-20 23:00:00 Onsight Red_Rocks__Yaak_Crack__rope 5.11d V6 1 3 263 74
970 Red Rocks 0 Yaak Crack 2001-11-03 23:00:00 Onsight Red_Rocks__Yaak_Crack__rope 5.11d V6 1 3 263 75
1 145 Red Rocks 0 Yaak Crack 2003-12-20 23:00:00 Toprope Red_Rocks__Yaak_Crack__rope 5.11d V6 0 3 263 87
1 146 Red Rocks 0 Yaak Crack 2002-03-26 23:00:00 Redpoint Red_Rocks__Yaak_Crack__rope 5.11d V6 0 3 263 88
1 266 Red Rocks 0 Yaak Crack 2004-10-25 22:00:00 Onsight Red_Rocks__Yaak_Crack__rope 5.11d V6 1 3 263 93
1 315 Red Rocks 0 Yaak Crack 2005-02-03 23:00:00 Flash Red_Rocks__Yaak_Crack__rope 5.11d V6 1 3 263 96
1 455 Red Rocks 0 Yaak Crack 2014-09-27 22:00:00 Redpoint Red_Rocks__Yaak_Crack__rope 5.11d V6 0 3 263 103
1 603 Red Rocks 0 Yaak Crack 2004-05-29 22:00:00 Redpoint Red_Rocks__Yaak_Crack__rope 5.11d V6 0 3 263 113
2 011 Red Rocks 0 Yaak Crack 2009-04-02 22:00:00 Onsight Red_Rocks__Yaak_Crack__rope 5.11d V6 1 3 263 142
2 034 Red Rocks 0 Yaak Crack 1998-12-13 23:00:00 Onsight Red_Rocks__Yaak_Crack__rope 5.11d V6 1 3 263 144
1 140 Red Rocks 0 Yaak Crack 2009-01-02 23:00:00 Redpoint Red_Rocks__Yaak_Crack__rope 5.11d V6 0 3 263 86
2 055 Red Rocks 0 Yaak Crack 2013-03-04 23:00:00 Flash Red_Rocks__Yaak_Crack__rope 5.11d V6 1 3 263 145
[ omitted 232,872 entries ]
Time difference of 8.351 secs

3 Model


3.1 Stan code

Note

Updated Stan code using within-chain parallelization

functions {
  array[] int sequence(int start, int end) {
    array[end - start + 1] int seq;
    for (n in 1 : num_elements(seq)) {
      seq[n] = n + start - 1;
    }
    return seq;
  }

  // Compute partial sums of the log-likelihood
  real partial_log_lik_lpmf(array[] int seq, int start, int end,
                            data array[] int labels, real mean_ability,
                            data array[] int users, vector user_ability,
                            data array[] int routes, vector route_difficulty) {
    real ptarget = 0;
    int N = end - start + 1;

    vector[N] mu = mean_ability + rep_vector(0.0, N);
    for (n in 1 : N) {
      int nn = n + start - 1;
      mu[n] += user_ability[users[nn]] - route_difficulty[routes[nn]];
    }
    ptarget += bernoulli_logit_lpmf(labels[start : end] | mu);
    return ptarget;
  }
}
data {
  int<lower=1> num_ascents;
  int<lower=1> num_users;
  int<lower=1> num_routes;
  array[num_ascents] int<lower=1, upper=num_users> users;
  array[num_ascents] int<lower=1, upper=num_routes> routes;
  array[num_ascents] int<lower=0, upper=1> labels;

  int grainsize;
}
transformed data {
  array[num_ascents] int seq = sequence(1, num_ascents);
}
parameters {
  real mean_ability;
  vector[num_users] user_ability;
  vector[num_routes] route_difficulty;
}
model {
  user_ability ~ std_normal();
  route_difficulty ~ std_normal();
  mean_ability ~ std_normal();

  target += reduce_sum(
    partial_log_lik_lpmf, seq, grainsize, 
    labels, mean_ability, users, user_ability, routes, route_difficulty
  );
}

3.2 Stan data

stan_data <- list(
  num_ascents = nrow(dt_clean),
  num_users = n_distinct(dt_clean$user_id),
  num_routes = n_distinct(dt_clean$route_id),
  routes = pull(dt_clean, route_idx),
  users = pull(dt_clean, user_idx),
  labels = pull(dt_clean, label) |> as.integer(),
  grainsize = max(100, nrow(dt_clean) / 50)
)
List of 7
 $ num_ascents: int 232887
 $ num_users  : int 2977
 $ num_routes : int 4288
 $ routes     : int [1:232887] 2258 2259 2443 2459 2464 2471 2475 2482 2495 2715 ...
 $ users      : int [1:232887] 1 1 1 1 1 1 1 1 1 1 ...
 $ labels     : int [1:232887] 1 0 1 1 0 1 0 1 1 1 ...
 $ grainsize  : num 4658

3.3 Model fit

mod_stan <- mod_stan_exe$sample(
  data = stan_data, seed = 666,
  iter_warmup = 500, iter_sampling = 1000, refresh = 0,
  chains = 6, parallel_chains = 6, threads_per_chain = 5
)
Note

Sampling takes ~4.89 minutes on my CPU (Ryzen 5950X, 16 Cores/32 Threads), on WSL2 (Ubuntu 20.04)

data.table [6 x 2]
Chain Time
1 284.245s (~4.74 minutes)
2 297.297s (~4.95 minutes)
3 294.22s (~4.9 minutes)
4 289.123s (~4.82 minutes)
5 295.178s (~4.92 minutes)
6 299.366s (~4.99 minutes)

4 Model diagnostics


bayesplot::mcmc_neff_hist(bayesplot::neff_ratio(mod_stan))

bayesplot::mcmc_rhat_hist(bayesplot::rhat(mod_stan))

Plotting random subsets of the traces

hist_trace_plot <- function(mod, vars) {
  draws <- mod$draws(variables = vars, format = "draws_list")
  patchwork::wrap_plots(
    bayesplot::mcmc_hist(draws, facet_args = list(nrow = length(vars))),
    bayesplot::mcmc_trace(draws, facet_args = list(nrow = length(vars))),
    widths = c(1, 1.5)
  )
}
hist_trace_plot(
  mod_stan, 
  paste0("route_difficulty[", unique(dt_clean, by = "route_idx")[, route_idx] |> sample(5), "]")
)

hist_trace_plot(
  mod_stan, 
  paste0("user_ability[", unique(dt_clean, by = "user_idx")[, user_idx] |> sample(5), "]")
)

5 Posterior Predictions


5.1 Posterior data

Getting our Posterior Predictions (subset of 500 draws per route) into long format:

Note

Comparing data.table and dplyr (using the rvar format from posterior):

unique(dt_clean[, .(route_idx, bouldering_grade, route_rating, climb_type)], by = "route_idx")[
  as.data.table(mod_stan$draws(variables = "route_difficulty") |> 
   bind(x, subset_draws(x, "route_difficulty", regex = T, draw = sample.int(ndraws(x), size = 500))))
   [, .(route_difficulty = list(value)), by = variable
  ][, `:=`(route_idx = as.integer(str_extract(variable, "\\d{1,4}")), variable = NULL)],
  on = "route_idx", nomatch = NULL
][, `:=`(
  bouldering_grade = factor(bouldering_grade, levels = bouldering_grades), 
  route_rating = factor(route_rating, levels = route_ratings)
)][order(route_idx)] -> pp
data.table [4,288 x 5]
route_idx bouldering_grade route_rating climb_type route_difficulty
1 V7 5.12a 1 <numeric [500]>
2 V7 5.12a 1 <numeric [500]>
3 V5 5.11a 1 <numeric [500]>
4 V7 5.12a 1 <numeric [500]>
5 V4 5.10c 1 <numeric [500]>
6 V5 5.11a 1 <numeric [500]>
7 V9 5.12d 1 <numeric [500]>
8 V3 5.10a 1 <numeric [500]>
9 V9 5.12d 1 <numeric [500]>
10 V8 5.12c 1 <numeric [500]>
11 V5 5.11a 1 <numeric [500]>
12 V5 5.11a 1 <numeric [500]>
13 V3 5.10a 1 <numeric [500]>
14 V6 5.11d 1 <numeric [500]>
15 V5 5.11b 0 <numeric [500]>
[ omitted 4,273 entries ]

Time difference of 1.727 secs

With dplyr, we can use the rvar format to encapsulate the samples from the model, which drastically reduces the size of the samples’ data.frame

pp_df <- (inner_join(
    select(df_clean, route_idx, bouldering_grade, route_rating, climb_type) |> 
      distinct(route_idx, .keep_all = TRUE),
    tidybayes::spread_rvars(mod_stan, route_difficulty[route_idx], ndraws = 500),
    by = "route_idx"
  )
  |> mutate(
    bouldering_grade = factor(bouldering_grade, levels = bouldering_grades), 
    route_rating = factor(route_rating, levels = route_ratings)
  )
  |> arrange(route_idx)
)
data.frame [4,288 x 5]
route_idx bouldering_grade route_rating climb_type route_difficulty
1 V7 5.12a 1 <rvar [1 x NA]>
2 V7 5.12a 1 <rvar [1 x NA]>
3 V5 5.11a 1 <rvar [1 x NA]>
4 V7 5.12a 1 <rvar [1 x NA]>
5 V4 5.10c 1 <rvar [1 x NA]>
6 V5 5.11a 1 <rvar [1 x NA]>
7 V9 5.12d 1 <rvar [1 x NA]>
8 V3 5.10a 1 <rvar [1 x NA]>
9 V9 5.12d 1 <rvar [1 x NA]>
10 V8 5.12c 1 <rvar [1 x NA]>
11 V5 5.11a 1 <rvar [1 x NA]>
12 V5 5.11a 1 <rvar [1 x NA]>
13 V3 5.10a 1 <rvar [1 x NA]>
14 V6 5.11d 1 <rvar [1 x NA]>
15 V7 5.12a 0 <rvar [1 x NA]>
[ omitted 4,273 entries ]

Time difference of 1.032 secs

5.2 Posterior plots:

Plot code
ridgeline_plot <- function(dat, var) {
  if (class(dat[, route_difficulty]) == "list")
    dat <- dat[, .(route_difficulty = unlist(route_difficulty)), by = setdiff(names(dat), 'route_difficulty')]
  
  ggplot(dat, aes_string(y = var)) +
  ggridges::geom_density_ridges(
    aes_string(x = "route_difficulty", fill = var), 
    alpha = 0.5, scale = 2.5, color = "grey30"
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  labs(x = str_to_title(str_replace_all(var, "_", " ")), y = "") +
  scale_y_discrete(position = "right") +
  theme(legend.position = "none", axis.line.y = element_blank())
}

Route Rating:

ridgeline_plot(pp[climb_type == 0], "route_rating")

Bouldering Grade:

ridgeline_plot(pp[climb_type == 1 & bouldering_grade != "V0"], "bouldering_grade")