1 Setup

library(here)        # File path management
library(pipebind)    # Piping goodies

library(data.table)  # Data wrangling (fast)
library(dplyr)       # Data wrangling
library(dtplyr)      # data.table backend for dplyr
library(dbplyr)      # SQL backend for dplyr
library(DBI)         # Database connection
library(RSQLite)     # SQLite interface
library(purrr)       # Manipulating lists
library(stringr)     # Manipulating strings
library(lubridate)   # Manipulating dates

library(cmdstanr)    # R interface with Stan
library(posterior)   # Wrangling Stan model ouputs

library(ggplot2)     # Plots
library(ggridges)    # Ridgeline plots
library(bayesplot)   # Plots for Stan models
library(patchwork)   # Combining plots

options(
  mc.cores = max(1L, parallel::detectCores(logical = TRUE)),
  scipen = 999L, 
  digits = 4L,
  ggplot2.discrete.colour = \() scale_color_viridis_d(),
  ggplot2.discrete.fill = \() scale_fill_viridis_d()
)

nrows_print <- 10

data.table::setDTthreads(getOption("mc.cores"))

1.1 Stan setup

Installing CmdStan

cmdstanr::check_cmdstan_toolchain(fix = TRUE, quiet = TRUE)

cpp_opts <- list(
  stan_threads = TRUE
  , STAN_CPP_OPTIMS = TRUE
  , STAN_NO_RANGE_CHECKS = TRUE # WARN: remove this if you haven't tested the model
  , PRECOMPILED_HEADERS = TRUE
  , CXXFLAGS_OPTIM = "-march=native -mtune=native"
  , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
  , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
)

cmdstanr::install_cmdstan(cpp_options = cpp_opts, quiet = TRUE)

Loading CmdStan (if already installed)

highest_cmdstan_version <- fs::dir_ls(config$cmdstan_path) |> fs::path_file() |> 
  keep(\(e) str_detect(e, "cmdstan-")) |> 
  bind(x, str_split(x, '-', simplify = TRUE)[,2]) |> 
  reduce(\(x, y) ifelse(utils::compareVersion(x, y) == 1, x, y))

set_cmdstan_path(glue::glue("{config$cmdstan_path}cmdstan-{highest_cmdstan_version}"))

Setting up knitr’s engine for CmdStan

## Inspired by: https://mpopov.com/blog/2020/07/30/replacing-the-knitr-engine-for-stan/

## Note: We could haved use cmdstanr::register_knitr_engine(), 
##       but it wouldn't include compiler optimizations & multi-threading by default

knitr::knit_engines$set(
  cmdstan = function(options) {
    output_var <- options$output.var
    if (!is.character(output_var) || length(output_var) != 1L) {
      stop(
        "The chunk option output.var must be a character string ",
        "providing a name for the returned `CmdStanModel` object."
      )
    }
    if (options$eval) {
      if (options$cache) {
        cache_path <- options$cache.path
        if (length(cache_path) == 0L || is.na(cache_path) || cache_path == "NA") 
          cache_path <- ""
        dir <- paste0(cache_path, options$label)
      } else {
        dir <- tempdir()
      }
      file <- write_stan_file(options$code, dir = dir, force_overwrite = TRUE)
      mod <- cmdstan_model(
        file, 
        cpp_opts <- list(
          stan_threads = TRUE
          , STAN_CPP_OPTIMS = TRUE
          , STAN_NO_RANGE_CHECKS = TRUE # The model was already tested
          , PRECOMPILED_HEADERS = TRUE
          # , CXXFLAGS_OPTIM = "-march=native -mtune=native"
          , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
          , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
        ),
        stanc_options = list("Oexperimental")
      )
      assign(output_var, mod, envir = knitr::knit_global())
    }
    options$engine <- "stan"
    code <- paste(options$code, collapse = "\n")
    knitr::engine_output(options, code, '')
  }
)

Expand for Session Info

─ Session info ───────────────────────────────────────────────────────────────
 setting        value
 version        R version 4.2.1 (2022-06-23)
 os             Ubuntu 20.04.4 LTS
 system         x86_64, linux-gnu
 ui             X11
 language       (EN)
 collate        C.UTF-8
 ctype          C.UTF-8
 tz             Europe/Paris
 date           2022-09-24
 pandoc         2.19.2 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
 Quarto         1.1.251
 Stan (CmdStan) 2.30.1

─ Packages ───────────────────────────────────────────────────────────────────
 ! package    * version date (UTC) lib source
   bayesplot  * 1.9.0   2022-03-10 [1] CRAN (R 4.2.0)
   cmdstanr   * 0.5.3   2022-08-03 [1] local
   data.table * 1.14.2  2021-09-27 [1] CRAN (R 4.2.0)
   DBI        * 1.1.3   2022-06-18 [1] CRAN (R 4.2.0)
   dbplyr     * 2.2.1   2022-06-27 [1] CRAN (R 4.2.0)
   dplyr      * 1.0.10  2022-09-01 [1] CRAN (R 4.2.1)
   dtplyr     * 1.2.2   2022-08-20 [1] CRAN (R 4.2.1)
   ggplot2    * 3.3.6   2022-05-03 [1] CRAN (R 4.2.0)
   ggridges   * 0.5.3   2021-01-08 [1] CRAN (R 4.2.0)
 P here       * 1.0.1   2020-12-13 [?] CRAN (R 4.2.0)
   lubridate  * 1.8.0   2021-10-07 [1] CRAN (R 4.2.0)
   patchwork  * 1.1.2   2022-08-19 [1] CRAN (R 4.2.1)
   pipebind   * 0.1.1   2022-08-10 [1] CRAN (R 4.2.0)
   posterior  * 1.3.1   2022-09-06 [1] CRAN (R 4.2.1)
   purrr      * 0.3.4   2020-04-17 [1] CRAN (R 4.2.0)
   RSQLite    * 2.2.17  2022-09-10 [1] CRAN (R 4.2.1)
 P stringr    * 1.4.1   2022-08-20 [?] CRAN (R 4.2.1)

 [1] /home/mar/Dev/Projects/R/Bayes/renv/library/R-4.2/x86_64-pc-linux-gnu
 [2] /home/mar/.cache/R/renv/library/Bayes-9578a481/R-4.2/x86_64-pc-linux-gnu
 [3] /usr/lib/R/library
 [4] /usr/local/lib/R/site-library
 [5] /usr/lib/R/site-library

 P ── Loaded and on-disk path mismatch.

──────────────────────────────────────────────────────────────────────────────

2 Data

2.1 Loading from SQL

con <- DBI::dbConnect(RSQLite::SQLite(), dbname = climb_path)

Note

Comparing raw SQL and dbplyr:

dbplyr
SQL

Note

dbplyr automatically translates dplyr code into SQL

climb_dbp <- (reduce(
    list(
      tbl(con, "ascent") |> filter(country %like% "USA") |> 
        select(user_id, grade_id, method_id, crag, climb_type, route_name = name, ascent_date = date),
      tbl(con, "grade") |> select(grade_id = id, usa_routes, usa_boulders), 
      tbl(con, "method") |> select(method_id = id, method_name = name)
    ),
    .f = \(acc, i) left_join(acc, i)
  ) 
  |> select(-grade_id, -method_id)
  |> collect()
)

data.frame [658,822 x 8]

user_id	crag	climb_type	route_name	ascent_date	usa_routes	usa_boulders	method_name
25	Hueco Tanks	1	Frogger	917 823 600	5.12d	V9	Redpoint
25	Hueco Tanks	1	Sex After Death	917 823 600	5.12d	V9	Redpoint
25	Hueco Tanks	1	Red	915 145 200	5.12d	V9	Redpoint
25	Hueco Tanks	1	Better Eat Your Wheaties	949 359 600	5.12c	V8/9	Redpoint
25	Horse Flats	1	Blank Generation	957 132 000	5.12d	V9	Redpoint
25	Bishop	1	Shazam	933 458 400	5.12d	V9	Redpoint
25	Bishop	1	It’s All About Love	938 728 800	5.12c	V8/9	Redpoint
25	Joshua Tree	1	Crypic Tips	957 132 000	5.12d	V9	Redpoint
25	Hueco Tanks	1	Mushroom Roof	946 681 200	5.12c	V8/9	Redpoint
26	The Pit	0	Castillo	969 660 000	5.13c	V12	Redpoint
26	The Pit	0	Sister of Mercy	969 660 000	5.12c	V8/9	Flash
26	The Pit	0	Rage	965 858 400	5.13b	V11	Redpoint
26	The Pit	0	Fresh Squeezed	949 359 600	5.13a	V10	Redpoint
26	The Pit	0	The Specialist	957 132 000	5.13a	V10	Redpoint
26	The Pit	0	No Joke	965 080 800	5.13a	V10	Redpoint
[ omitted 658,807 entries ]

Time difference of 1.274 secs

SELECT
  ascent.user_id
  , ascent.crag
  , ascent.climb_type
  , ascent.name AS route_name
  , ascent.date AS ascent_date
  , grade.usa_routes
  , grade.usa_boulders
  , method.name AS method_name
FROM ascent
JOIN grade ON grade.id = ascent.grade_id
JOIN method ON method.id = ascent.method_id
WHERE ascent.country = 'USA'

data.frame [658,822 x 8]

user_id	crag	climb_type	route_name	ascent_date	usa_routes	usa_boulders	method_name
25	Hueco Tanks	1	Frogger	917 823 600	5.12d	V9	Redpoint
25	Hueco Tanks	1	Sex After Death	917 823 600	5.12d	V9	Redpoint
25	Hueco Tanks	1	Red	915 145 200	5.12d	V9	Redpoint
25	Hueco Tanks	1	Better Eat Your Wheaties	949 359 600	5.12c	V8/9	Redpoint
25	Horse Flats	1	Blank Generation	957 132 000	5.12d	V9	Redpoint
25	Bishop	1	Shazam	933 458 400	5.12d	V9	Redpoint
25	Bishop	1	It’s All About Love	938 728 800	5.12c	V8/9	Redpoint
25	Joshua Tree	1	Crypic Tips	957 132 000	5.12d	V9	Redpoint
25	Hueco Tanks	1	Mushroom Roof	946 681 200	5.12c	V8/9	Redpoint
26	The Pit	0	Castillo	969 660 000	5.13c	V12	Redpoint
26	The Pit	0	Sister of Mercy	969 660 000	5.12c	V8/9	Flash
26	The Pit	0	Rage	965 858 400	5.13b	V11	Redpoint
26	The Pit	0	Fresh Squeezed	949 359 600	5.13a	V10	Redpoint
26	The Pit	0	The Specialist	957 132 000	5.13a	V10	Redpoint
26	The Pit	0	No Joke	965 080 800	5.13a	V10	Redpoint
[ omitted 658,807 entries ]

Time difference of 0.9819 secs

2.2 Processing

route_ratings <- c(
  str_c("5.", 1:9), 
  map(str_c("5.", 10:15), \(x) str_c(x, letters[1:4])) |> unlist()
)

bouldering_grades <- c(paste0("V", 0:20))

## Mode for non-numerical data
mode <- \(x) levels(x)[which.max(tabulate(match(x, levels(x))))]

Note

Comparing data.table, dplyr, and dtplyr:

climb_dt <- as.data.table(climb_dbp)

threshold_ascents_dt <- function(old_dt, limit = 20) {
  new_dt <- old_dt[, if(.N >= limit) .SD, by = user_id
                 ][, if(.N >= limit) .SD, by = route_id]
  
  if (!identical(dim(old_dt), dim(new_dt))) 
    threshold_ascents_dt(new_dt, limit)
  else return(new_dt)
}

climb_dt <- climb_dt[,
    `:=`(
      route_id = str_c(
        str_replace_all(crag, ' ', '_'), "__", 
        str_replace_all(route_name, ' ', '_'), "__", 
        fifelse(climb_type == 1, 'boulder', 'rope')
      ),
      ascent_date = lubridate::as_datetime(ascent_date),
      usa_boulders = factor(usa_boulders, levels = bouldering_grades),
      usa_routes = factor(usa_routes, levels = route_ratings),
      label = as.integer(method_name %chin% c("Onsight", "Flash"))
    )
  ][climb_dt[, .I[which.min(ascent_date)], by = .(user_id, route_id)]$V1
  ][, `:=`(route_rating = mode(usa_routes), bouldering_grade = mode(usa_boulders)),
      by = route_id
  ]

(dt_clean <- threshold_ascents_dt(climb_dt)
   [, route_idx := .GRP, keyby = route_id
  ][, user_idx := .GRP, keyby = user_id
  ][, -c("usa_routes", "usa_boulders")]
)

data.table [232,887 x 12]

route_id	user_id	crag	climb_type	route_name	ascent_date	method_name	label	route_rating	bouldering_grade	route_idx	user_idx
Mt_Potasi__Moment_of_Clarity__rope	4	Mt Potasi	0	Moment of Clarity	2006-12-19 23:00:00	Onsight	1	5.12a	V7	2 258	1
Mt_Potasi__The_Natural__rope	4	Mt Potasi	0	The Natural	2006-12-23 23:00:00	Redpoint	0	5.12d	V9	2 259	1
Obed__Deathblow__rope	4	Obed	0	Deathblow	2006-02-28 23:00:00	Flash	1	5.11b	V6	2 443	1
Obed__Jungle_Jane__rope	4	Obed	0	Jungle Jane	2006-11-18 23:00:00	Onsight	1	5.12a	V7	2 459	1
Obed__Maximum_Overdrive_Face__rope	4	Obed	0	Maximum Overdrive Face	2006-02-28 23:00:00	Redpoint	0	5.12b	V8	2 464	1
Obed__Pet_Cemetary__rope	4	Obed	0	Pet Cemetary	2006-02-28 23:00:00	Flash	1	5.11a	V5	2 471	1
Obed__Rage__rope	4	Obed	0	Rage	2006-02-28 23:00:00	Redpoint	0	5.12c	V9	2 475	1
Obed__Solstice__rope	4	Obed	0	Solstice	2006-02-28 23:00:00	Onsight	1	5.12a	V7	2 482	1
Obed__Tierrany__rope	4	Obed	0	Tierrany	2006-11-18 23:00:00	Onsight	1	5.12a	V7	2 495	1
Red_River_Gorge__Ale-8-One__rope	4	Red River Gorge	0	Ale-8-One	2006-11-23 23:00:00	Onsight	1	5.12b	V8	2 715	1
Red_River_Gorge__Blue_Eyed_Honkey_Jesus__rope	4	Red River Gorge	0	Blue Eyed Honkey Jesus	2006-11-28 23:00:00	Redpoint	0	5.12c	V8	2 759	1
Red_River_Gorge__Chainsaw_Massacre__rope	4	Red River Gorge	0	Chainsaw Massacre	2006-11-23 23:00:00	Onsight	1	5.12a	V7	2 787	1
Red_River_Gorge__Fuzzy_Undercling__rope	4	Red River Gorge	0	Fuzzy Undercling	2006-11-20 23:00:00	Onsight	1	5.11b	V5	2 850	1
Red_River_Gorge__Gung_Ho__rope	4	Red River Gorge	0	Gung Ho	2006-11-20 23:00:00	Onsight	1	5.12b	V8	2 870	1
Red_River_Gorge__Kick_Me_In_The_Jimmie__rope	4	Red River Gorge	0	Kick Me In The Jimmie	2006-11-23 23:00:00	Onsight	1	5.12a	V7	2 910	1
[ omitted 232,872 entries ]

Time difference of 6.866 secs

threshold_ascents_df <- function(old_df, limit = 20) {
  new_df <- old_df |> 
    group_by(user_id) |> filter(n() >= limit) |> 
    group_by(route_id) |> filter(n() >= limit) |> 
    ungroup()
  
  if (!identical(dim(old_df), dim(new_df))) 
    threshold_ascents_df(new_df, limit)
  else return(new_df)
}

(df_clean <- climb_dbp
  |> mutate(
    route_id = str_c(
      str_replace_all(crag, ' ', '_'), "__", 
      str_replace_all(route_name, ' ', '_'), "__", 
      if_else(climb_type == 1, 'boulder', 'rope')
    ),
    ascent_date = lubridate::as_datetime(ascent_date),
    usa_boulders = factor(usa_boulders, levels = bouldering_grades),
    usa_routes = factor(usa_routes, levels = route_ratings)
  )
  |> group_by(route_id) 
  |> mutate(route_rating = mode(usa_routes), bouldering_grade = mode(usa_boulders)) 
  |> ungroup()
  |> select(-c(usa_routes, usa_boulders))
  |> mutate(label = as.integer(method_name %in% c("Onsight", "Flash")))
  |> group_by(user_id, route_id) |> slice(which.min(ascent_date)) |> ungroup()
  |> threshold_ascents_df(limit = 20) |> ungroup()
  |> group_by(route_id) |> mutate(route_idx = cur_group_id()) |> ungroup()
  |> group_by(user_id) |> mutate(user_idx = cur_group_id()) |> ungroup()
)

data.frame [232,887 x 12]

user_id	crag	climb_type	route_name	ascent_date	method_name	route_id	route_rating	bouldering_grade	label	route_idx	user_idx
4	Mt Potasi	0	Moment of Clarity	2006-12-19 23:00:00	Onsight	Mt_Potasi__Moment_of_Clarity__rope	5.12a	V7	1	2 260	1
4	Mt Potasi	0	The Natural	2006-12-23 23:00:00	Redpoint	Mt_Potasi__The_Natural__rope	5.12d	V9	0	2 261	1
4	Obed	0	Deathblow	2006-02-28 23:00:00	Flash	Obed__Deathblow__rope	5.11b	V5	1	2 445	1
4	Obed	0	Jungle Jane	2006-11-18 23:00:00	Onsight	Obed__Jungle_Jane__rope	5.12a	V7	1	2 461	1
4	Obed	0	Maximum Overdrive Face	2006-02-28 23:00:00	Redpoint	Obed__Maximum_Overdrive_Face__rope	5.12b	V8	0	2 467	1
4	Obed	0	Pet Cemetary	2006-02-28 23:00:00	Flash	Obed__Pet_Cemetary__rope	5.11a	V5	1	2 473	1
4	Obed	0	Rage	2006-02-28 23:00:00	Redpoint	Obed__Rage__rope	5.12c	V9	0	2 477	1
4	Obed	0	Solstice	2006-02-28 23:00:00	Onsight	Obed__Solstice__rope	5.12a	V7	1	2 484	1
4	Obed	0	Tierrany	2006-11-18 23:00:00	Onsight	Obed__Tierrany__rope	5.12a	V7	1	2 497	1
4	Red River Gorge	0	Ale-8-One	2006-11-23 23:00:00	Onsight	Red_River_Gorge__Ale-8-One__rope	5.12b	V8	1	2 619	1
4	Red River Gorge	0	Blue Eyed Honkey Jesus	2006-11-28 23:00:00	Redpoint	Red_River_Gorge__Blue_Eyed_Honkey_Jesus__rope	5.12c	V8	0	2 663	1
4	Red River Gorge	0	Chainsaw Massacre	2006-11-23 23:00:00	Onsight	Red_River_Gorge__Chainsaw_Massacre__rope	5.12a	V7	1	2 691	1
4	Red River Gorge	0	Fuzzy Undercling	2006-11-20 23:00:00	Onsight	Red_River_Gorge__Fuzzy_Undercling__rope	5.11b	V5	1	2 756	1
4	Red River Gorge	0	Gung Ho	2006-11-20 23:00:00	Onsight	Red_River_Gorge__Gung_Ho__rope	5.12b	V8	1	2 776	1
4	Red River Gorge	0	Kick Me In The Jimmie	2006-11-23 23:00:00	Onsight	Red_River_Gorge__Kick_Me_In_The_Jimmie__rope	5.12a	V7	1	2 817	1
[ omitted 232,872 entries ]

Time difference of 37.63 secs

Note

dtplyr automatically translates dplyr code into data.table

threshold_ascents_dtp <- function(old_df, limit = 20) {
  new_df <- lazy_dt(old_df) |> 
    group_by(user_id) |> filter(n() >= limit) |> 
    group_by(route_id) |> filter(n() >= limit) |> 
    ungroup() |> collect()
  
  if (!identical(dim(old_df), dim(new_df)))
    threshold_ascents_dtp(new_df, limit)
  else return(new_df)
}

(dtp_clean <- climb_dbp
  |> lazy_dt()
  |> mutate(
    route_id = str_c(
      str_replace_all(crag, ' ', '_'), "__", 
      str_replace_all(route_name, ' ', '_'), "__", 
      if_else(climb_type == 1, 'boulder', 'rope')
    ),
    ascent_date = lubridate::as_datetime(ascent_date),
    usa_boulders = factor(usa_boulders, levels = bouldering_grades),
    usa_routes = factor(usa_routes, levels = route_ratings)
  )
  |> group_by(route_id) 
  |> mutate(route_rating = mode(usa_routes), bouldering_grade = mode(usa_boulders)) 
  |> ungroup()
  |> select(-c(usa_routes, usa_boulders))
  |> mutate(label = as.integer(method_name %in% c("Onsight", "Flash")))
  |> group_by(user_id, route_id) |> slice(which.min(ascent_date)) |> ungroup()
  |> threshold_ascents_dtp(limit = 20) |> ungroup()
  |> group_by(route_id) |> mutate(route_idx = cur_group_id()) |> ungroup()
  |> group_by(user_id) |> mutate(user_idx = cur_group_id()) |> ungroup()
  |> collect()
)

data.frame [232,887 x 12]

user_id	crag	climb_type	route_name	ascent_date	method_name	route_id	route_rating	bouldering_grade	label	route_idx	user_idx
312	Red Rocks	0	Yaak Crack	1998-03-31 22:00:00	Redpoint	Red_Rocks__Yaak_Crack__rope	5.11d	V6	0	3 263	19
630	Red Rocks	0	Yaak Crack	2003-12-31 23:00:00	Redpoint	Red_Rocks__Yaak_Crack__rope	5.11d	V6	0	3 263	47
790	Red Rocks	0	Yaak Crack	2001-01-01 23:00:00	Flash	Red_Rocks__Yaak_Crack__rope	5.11d	V6	1	3 263	66
932	Red Rocks	0	Yaak Crack	2008-12-20 23:00:00	Onsight	Red_Rocks__Yaak_Crack__rope	5.11d	V6	1	3 263	74
970	Red Rocks	0	Yaak Crack	2001-11-03 23:00:00	Onsight	Red_Rocks__Yaak_Crack__rope	5.11d	V6	1	3 263	75
1 145	Red Rocks	0	Yaak Crack	2003-12-20 23:00:00	Toprope	Red_Rocks__Yaak_Crack__rope	5.11d	V6	0	3 263	87
1 146	Red Rocks	0	Yaak Crack	2002-03-26 23:00:00	Redpoint	Red_Rocks__Yaak_Crack__rope	5.11d	V6	0	3 263	88
1 266	Red Rocks	0	Yaak Crack	2004-10-25 22:00:00	Onsight	Red_Rocks__Yaak_Crack__rope	5.11d	V6	1	3 263	93
1 315	Red Rocks	0	Yaak Crack	2005-02-03 23:00:00	Flash	Red_Rocks__Yaak_Crack__rope	5.11d	V6	1	3 263	96
1 455	Red Rocks	0	Yaak Crack	2014-09-27 22:00:00	Redpoint	Red_Rocks__Yaak_Crack__rope	5.11d	V6	0	3 263	103
1 603	Red Rocks	0	Yaak Crack	2004-05-29 22:00:00	Redpoint	Red_Rocks__Yaak_Crack__rope	5.11d	V6	0	3 263	113
2 011	Red Rocks	0	Yaak Crack	2009-04-02 22:00:00	Onsight	Red_Rocks__Yaak_Crack__rope	5.11d	V6	1	3 263	142
2 034	Red Rocks	0	Yaak Crack	1998-12-13 23:00:00	Onsight	Red_Rocks__Yaak_Crack__rope	5.11d	V6	1	3 263	144
1 140	Red Rocks	0	Yaak Crack	2009-01-02 23:00:00	Redpoint	Red_Rocks__Yaak_Crack__rope	5.11d	V6	0	3 263	86
2 055	Red Rocks	0	Yaak Crack	2013-03-04 23:00:00	Flash	Red_Rocks__Yaak_Crack__rope	5.11d	V6	1	3 263	145
[ omitted 232,872 entries ]

Time difference of 8.351 secs

3 Model

3.1 Stan code

Note

Updated Stan code using within-chain parallelization

functions {
  array[] int sequence(int start, int end) {
    array[end - start + 1] int seq;
    for (n in 1 : num_elements(seq)) {
      seq[n] = n + start - 1;
    }
    return seq;
  }

  // Compute partial sums of the log-likelihood
  real partial_log_lik_lpmf(array[] int seq, int start, int end,
                            data array[] int labels, real mean_ability,
                            data array[] int users, vector user_ability,
                            data array[] int routes, vector route_difficulty) {
    real ptarget = 0;
    int N = end - start + 1;

    vector[N] mu = mean_ability + rep_vector(0.0, N);
    for (n in 1 : N) {
      int nn = n + start - 1;
      mu[n] += user_ability[users[nn]] - route_difficulty[routes[nn]];
    }
    ptarget += bernoulli_logit_lpmf(labels[start : end] | mu);
    return ptarget;
  }
}
data {
  int<lower=1> num_ascents;
  int<lower=1> num_users;
  int<lower=1> num_routes;
  array[num_ascents] int<lower=1, upper=num_users> users;
  array[num_ascents] int<lower=1, upper=num_routes> routes;
  array[num_ascents] int<lower=0, upper=1> labels;

  int grainsize;
}
transformed data {
  array[num_ascents] int seq = sequence(1, num_ascents);
}
parameters {
  real mean_ability;
  vector[num_users] user_ability;
  vector[num_routes] route_difficulty;
}
model {
  user_ability ~ std_normal();
  route_difficulty ~ std_normal();
  mean_ability ~ std_normal();

  target += reduce_sum(
    partial_log_lik_lpmf, seq, grainsize, 
    labels, mean_ability, users, user_ability, routes, route_difficulty
  );
}

3.2 Stan data

stan_data <- list(
  num_ascents = nrow(dt_clean),
  num_users = n_distinct(dt_clean$user_id),
  num_routes = n_distinct(dt_clean$route_id),
  routes = pull(dt_clean, route_idx),
  users = pull(dt_clean, user_idx),
  labels = pull(dt_clean, label) |> as.integer(),
  grainsize = max(100, nrow(dt_clean) / 50)
)

List of 7
 $ num_ascents: int 232887
 $ num_users  : int 2977
 $ num_routes : int 4288
 $ routes     : int [1:232887] 2258 2259 2443 2459 2464 2471 2475 2482 2495 2715 ...
 $ users      : int [1:232887] 1 1 1 1 1 1 1 1 1 1 ...
 $ labels     : int [1:232887] 1 0 1 1 0 1 0 1 1 1 ...
 $ grainsize  : num 4658

3.3 Model fit

mod_stan <- mod_stan_exe$sample(
  data = stan_data, seed = 666,
  iter_warmup = 500, iter_sampling = 1000, refresh = 0,
  chains = 6, parallel_chains = 6, threads_per_chain = 5
)

Note

Sampling takes ~4.89 minutes on my CPU (Ryzen 5950X, 16 Cores/32 Threads), on WSL2 (Ubuntu 20.04)

data.table [6 x 2]

Chain	Time
1	284.245s (~4.74 minutes)
2	297.297s (~4.95 minutes)
3	294.22s (~4.9 minutes)
4	289.123s (~4.82 minutes)
5	295.178s (~4.92 minutes)
6	299.366s (~4.99 minutes)

4 Model diagnostics

bayesplot::mcmc_neff_hist(bayesplot::neff_ratio(mod_stan))

bayesplot::mcmc_rhat_hist(bayesplot::rhat(mod_stan))

Plotting random subsets of the traces

hist_trace_plot <- function(mod, vars) {
  draws <- mod$draws(variables = vars, format = "draws_list")
  patchwork::wrap_plots(
    bayesplot::mcmc_hist(draws, facet_args = list(nrow = length(vars))),
    bayesplot::mcmc_trace(draws, facet_args = list(nrow = length(vars))),
    widths = c(1, 1.5)
  )
}

hist_trace_plot(
  mod_stan, 
  paste0("route_difficulty[", unique(dt_clean, by = "route_idx")[, route_idx] |> sample(5), "]")
)

hist_trace_plot(
  mod_stan, 
  paste0("user_ability[", unique(dt_clean, by = "user_idx")[, user_idx] |> sample(5), "]")
)

5 Posterior Predictions

5.1 Posterior data

Getting our Posterior Predictions (subset of 500 draws per route) into long format:

Note

Comparing data.table and dplyr (using the rvar format from posterior):

unique(dt_clean[, .(route_idx, bouldering_grade, route_rating, climb_type)], by = "route_idx")[
  as.data.table(mod_stan$draws(variables = "route_difficulty") |> 
   bind(x, subset_draws(x, "route_difficulty", regex = T, draw = sample.int(ndraws(x), size = 500))))
   [, .(route_difficulty = list(value)), by = variable
  ][, `:=`(route_idx = as.integer(str_extract(variable, "\\d{1,4}")), variable = NULL)],
  on = "route_idx", nomatch = NULL
][, `:=`(
  bouldering_grade = factor(bouldering_grade, levels = bouldering_grades), 
  route_rating = factor(route_rating, levels = route_ratings)
)][order(route_idx)] -> pp

data.table [4,288 x 5]

route_idx	bouldering_grade	route_rating	climb_type	route_difficulty
1	V7	5.12a	1	<numeric [500]>
2	V7	5.12a	1	<numeric [500]>
3	V5	5.11a	1	<numeric [500]>
4	V7	5.12a	1	<numeric [500]>
5	V4	5.10c	1	<numeric [500]>
6	V5	5.11a	1	<numeric [500]>
7	V9	5.12d	1	<numeric [500]>
8	V3	5.10a	1	<numeric [500]>
9	V9	5.12d	1	<numeric [500]>
10	V8	5.12c	1	<numeric [500]>
11	V5	5.11a	1	<numeric [500]>
12	V5	5.11a	1	<numeric [500]>
13	V3	5.10a	1	<numeric [500]>
14	V6	5.11d	1	<numeric [500]>
15	V5	5.11b	0	<numeric [500]>
[ omitted 4,273 entries ]

Time difference of 1.727 secs

With dplyr, we can use the rvar format to encapsulate the samples from the model, which drastically reduces the size of the samples’ data.frame

pp_df <- (inner_join(
    select(df_clean, route_idx, bouldering_grade, route_rating, climb_type) |> 
      distinct(route_idx, .keep_all = TRUE),
    tidybayes::spread_rvars(mod_stan, route_difficulty[route_idx], ndraws = 500),
    by = "route_idx"
  )
  |> mutate(
    bouldering_grade = factor(bouldering_grade, levels = bouldering_grades), 
    route_rating = factor(route_rating, levels = route_ratings)
  )
  |> arrange(route_idx)
)

data.frame [4,288 x 5]

route_idx	bouldering_grade	route_rating	climb_type	route_difficulty
1	V7	5.12a	1	<rvar [1 x NA]>
2	V7	5.12a	1	<rvar [1 x NA]>
3	V5	5.11a	1	<rvar [1 x NA]>
4	V7	5.12a	1	<rvar [1 x NA]>
5	V4	5.10c	1	<rvar [1 x NA]>
6	V5	5.11a	1	<rvar [1 x NA]>
7	V9	5.12d	1	<rvar [1 x NA]>
8	V3	5.10a	1	<rvar [1 x NA]>
9	V9	5.12d	1	<rvar [1 x NA]>
10	V8	5.12c	1	<rvar [1 x NA]>
11	V5	5.11a	1	<rvar [1 x NA]>
12	V5	5.11a	1	<rvar [1 x NA]>
13	V3	5.10a	1	<rvar [1 x NA]>
14	V6	5.11d	1	<rvar [1 x NA]>
15	V7	5.12a	0	<rvar [1 x NA]>
[ omitted 4,273 entries ]

Time difference of 1.032 secs

5.2 Posterior plots:

Plot code

ridgeline_plot <- function(dat, var) {
  if (class(dat[, route_difficulty]) == "list")
    dat <- dat[, .(route_difficulty = unlist(route_difficulty)), by = setdiff(names(dat), 'route_difficulty')]
  
  ggplot(dat, aes_string(y = var)) +
  ggridges::geom_density_ridges(
    aes_string(x = "route_difficulty", fill = var), 
    alpha = 0.5, scale = 2.5, color = "grey30"
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  labs(x = str_to_title(str_replace_all(var, "_", " ")), y = "") +
  scale_y_discrete(position = "right") +
  theme(legend.position = "none", axis.line.y = element_blank())
}

Route Rating:

ridgeline_plot(pp[climb_type == 0], "route_rating")

Bouldering Grade:

ridgeline_plot(pp[climb_type == 1 & bouldering_grade != "V0"], "bouldering_grade")