title: "Speed-up tidySummarizedExperiment with plyxp"

author: "Stefano Mangiola"

contributors:
  - Michael Love
  - Justin Landis

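output:
  prettydoc::html_pretty:
    theme: cayman
    toc: yes
    toc_depth: 2
    number_sections: yes
    fig_caption: yes
    df_print: paged
vignette: >
  %\VignetteIndexEntry{Speed-up tidySummarizedExperiment with plyxp}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---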

# Chunk defaults: collapse each chunk's source and output into a single block
# and prefix printed output lines with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

1 Motivation and design principles

This benchmark supports ongoing work to improve mutate() execution paths, including exploring plyxp-backed execution for mixed-slot operations (see issue: Attempt using plyxp for some cases in tidySummarizedExperiment). The current proposal is grounded in three principles:

These design choices aim to preserve dimnames, avoid unnecessary tibble round-trips, and provide predictable performance across simple and mixed-slot scenarios.

2 Overview

This vignette benchmarks a set of mutate(), filter(), and select() scenarios on two branches of the repository. It explicitly checks out each branch via git worktree, loads that branch’s code with devtools::load_all(), runs the same scenarios multiple times, and compares the runtimes with ggplot boxplots.

Requirements: git, devtools, airway, dplyr, rlang, SummarizedExperiment, ggplot2, microbenchmark, future, future.apply.

3 Setup helper functions

suppressPackageStartupMessages({
  library(ggplot2)
  library(dplyr)
  library(SummarizedExperiment)
  library(rlang)
  library(devtools)
  library(airway)
  library(microbenchmark)
  library(future.apply)
})
#> Warning: package 'GenomeInfoDb' was built under R version 4.5.1
#> Warning: package 'future' was built under R version 4.5.1

# Create (or re-create) a detached git worktree for a branch.
#
# branch_ref:   ref to check out, e.g. "origin/master".
# worktree_dir: directory in which the worktree is created; an existing
#               worktree registered at that path is force-removed first.
#
# Returns NULL invisibly; stops with an error if `git worktree add` fails.
ensure_worktree <- function(branch_ref, worktree_dir) {
  # Each git step is its own system2() call. Arguments are quoted once, right
  # where they are handed to the shell, so paths containing spaces or shell
  # metacharacters survive (nesting shQuote() output inside an outer
  # single-quoted `sh -c '...'` string cancels the quoting).
  run_git <- function(args, quiet = FALSE) {
    sink_to <- if (quiet) FALSE else ""
    system2("git", args, stdout = sink_to, stderr = sink_to)
  }

  # Best effort: a failed fetch (e.g. offline) must not abort the run, but it
  # is reported because the worktree may then be built from stale refs.
  fetch_status <- run_git(c("fetch", "origin", "--prune"))
  if (fetch_status != 0) {
    warning("git fetch failed; using locally available refs", call. = FALSE)
  }

  # Removing the previous worktree is expected to fail when none exists yet,
  # so its output and exit status are deliberately ignored.
  run_git(c("worktree", "remove", "-f", shQuote(worktree_dir)), quiet = TRUE)

  status <- run_git(
    c("worktree", "add", "--detach", shQuote(worktree_dir), shQuote(branch_ref))
  )
  if (status != 0) stop(sprintf("Failed to create worktree for %s", branch_ref))
  invisible(NULL)
}

# Load the package source found in `worktree_dir` into the current session
# via devtools::load_all(), silencing its messages.
# Stops with an installation hint when devtools is not available.
load_branch_code <- function(worktree_dir) {
  has_devtools <- requireNamespace("devtools", quietly = TRUE)
  if (!has_devtools) {
    stop("Please install devtools to run this vignette.")
  }
  suppressMessages(
    devtools::load_all(worktree_dir, quiet = TRUE)
  )
}

# Build the benchmark input: the `airway` dataset restricted to its first
# 200 rows.
create_airway_test_se <- function() {
  # Attach airway quietly so the dataset can be loaded by name.
  suppressPackageStartupMessages(library(airway))
  data(airway)
  n_rows <- 200L
  airway[seq_len(n_rows), ]
}

# Return the named list of benchmark scenarios.
# Each element is a quosure wrapping one pipeline; every pipeline refers to
# its input object by the name `se`.
benchmark_scenarios <- function() {
  list(
    # Mutate benchmarks
    coldata_simple_assignment = quo({
      se %>% mutate(new_dex = dex)
    }),
    coldata_arithmetic = quo({
      se %>% mutate(avgLength_plus_5 = avgLength + 5)
    }),
    coldata_concat = quo({
      se %>% mutate(sample_info = paste(cell, dex, SampleName, sep = "_"))
    }),
    coldata_grouped_mean = quo({
      se %>%
        group_by(dex) %>%
        mutate(avgLength_group_mean = mean(avgLength)) %>%
        ungroup()
    }),
    assay_simple_assignment = quo({
      se %>% mutate(counts_copy = counts)
    }),
    assay_plus_one = quo({
      se %>% mutate(counts_plus_1 = counts + 1)
    }),
    assay_log = quo({
      se %>% mutate(log_counts_manual = log2(counts + 1))
    }),
    complex_conditional_coldata = quo({
      se %>%
        mutate(
          length_group = ifelse(avgLength > mean(avgLength), "longer", "shorter")
        )
    }),
    complex_nested = quo({
      se %>%
        mutate(
          complex_category = ifelse(
            dex == "trt" & avgLength > mean(avgLength),
            "treated_long",
            ifelse(dex == "untrt", "untreated", "other")
          )
        )
    }),
    mixed_assay_coldata = quo({
      se %>% mutate(new_counts = counts * avgLength)
    }),
    multiple_simple_assay = quo({
      se %>%
        mutate(
          normalized_counts = counts / 1000,
          sqrt_counts = sqrt(counts)
        )
    }),
    chained_mutates = quo({
      se %>%
        mutate(tmp = avgLength * 2) %>%
        mutate(flag = ifelse(tmp > mean(tmp), 1, 0))
    }),

    # Filter benchmarks (scoped and non-rectangular)
    filter_coldata_simple = quo({
      se %>% filter(dex == "trt")
    }),
    filter_coldata_numeric = quo({
      se %>% filter(avgLength > median(avgLength))
    }),
    filter_assay_nonrect = quo({
      se %>% filter(counts > 0)
    }),

    # Select benchmarks (covering colData-only, rowData-only, assays-only, mixed)
    select_coldata_simple = quo({
      se %>% select(.sample, dex)
    }),
    select_rowdata_simple = quo({
      se %>% select(.feature)
    }),
    select_assay_only = quo({
      se %>% select(counts)
    }),
    select_mixed_keys_counts = quo({
      se %>% select(.sample, .feature, counts)
    }),
    select_coldata_wide = quo({
      se %>% select(.sample, dex, avgLength, SampleName)
    })
  )
}

# Time a single scenario and return its per-repetition runtimes in
# milliseconds.
#
# expr_quo: a quosure (as produced by benchmark_scenarios()) whose code refers
#           to the input object as `se`.
# reps:     number of timed evaluations (passed to microbenchmark as `times`).
run_one <- function(expr_quo, reps = 5L) {
  # Build the input once, outside the timed expression.
  se_base <- create_airway_test_se()

  # A quosure is evaluated in the environment it was created in, so an `se`
  # assigned in this function's frame is invisible to it. Bind `se` in a child
  # of the quosure's own environment instead, so every scenario is timed
  # against the airway subset rather than against whatever `se` happens to be
  # reachable from the quosure's environment (or failing when there is none).
  scenario_env <- rlang::new_environment(
    data = list(se = se_base),
    parent = rlang::quo_get_env(expr_quo)
  )
  scenario <- rlang::quo_set_env(expr_quo, scenario_env)

  mb <- microbenchmark::microbenchmark(
    rlang::eval_tidy(scenario),
    times = reps,
    control = list(warmup = 2L)
  )
  # microbenchmark returns nanoseconds; convert to milliseconds
  as.numeric(mb$time) / 1e6
}

# Run every benchmark scenario in the current session.
#
# branch_label: value stored in the `branch` column of the result.
# reps:         number of timed repetitions per scenario.
#
# Returns one data frame with a row per (scenario, replicate) and the columns
# branch, scenario, replicate and elapsed_ms.
run_all_scenarios <- function(branch_label, reps = 7L) {
  scenarios <- benchmark_scenarios()
  per_scenario <- lapply(names(scenarios), function(scenario_name) {
    elapsed <- run_one(scenarios[[scenario_name]], reps = reps)
    data.frame(
      branch = branch_label,
      scenario = scenario_name,
      replicate = seq_along(elapsed),
      elapsed_ms = elapsed,
      stringsAsFactors = FALSE
    )
  })
  bind_rows(per_scenario)
}

# Parallel version: scenarios are dispatched through future_lapply() under a
# multisession plan with `workers` workers.
#
# branch_label: value stored in the `branch` column of the result.
# reps:         number of timed repetitions per scenario.
# workers:      worker count handed to future::plan().
# initializer:  optional zero-argument function called before each scenario
#               is timed (e.g. to load the branch's code).
#
# Returns one data frame with a row per (scenario, replicate). The future plan
# that was active on entry is restored on exit.
run_all_scenarios_parallel <- function(branch_label, reps = 20L, workers = 1L, initializer = NULL) {
  scenarios <- benchmark_scenarios()

  previous_plan <- future::plan()
  on.exit(future::plan(previous_plan), add = TRUE)
  future::plan(future::multisession, workers = workers)

  time_scenario <- function(scenario_name) {
    if (!is.null(initializer)) {
      initializer()
    }
    elapsed <- run_one(scenarios[[scenario_name]], reps = reps)
    data.frame(
      branch = branch_label,
      scenario = scenario_name,
      replicate = seq_along(elapsed),
      elapsed_ms = elapsed,
      stringsAsFactors = FALSE
    )
  }

  timings <- future.apply::future_lapply(
    names(scenarios),
    time_scenario,
    future.seed = TRUE
  )
  dplyr::bind_rows(timings)
}

4 Prepare explicit worktrees for both branches

# Branch A: ref and the sibling directory that will hold its worktree.
branch_a_ref <- "origin/master"
wt_a <- normalizePath(
  file.path("..", "tidySummarizedExperiment.__bench_master__"),
  mustWork = FALSE
)

# Branch B: ref and the sibling directory that will hold its worktree.
branch_b_ref <- "origin/query-to-slot-routines"
wt_b <- normalizePath(
  file.path("..", "tidySummarizedExperiment.__bench_query_to_slot__"),
  mustWork = FALSE
)

# Check out both branches side by side.
ensure_worktree(branch_a_ref, wt_a)
ensure_worktree(branch_b_ref, wt_b)

5 Run benchmarks on both branches

# Branch A
load_branch_code(wt_a)
#> Warning: package 'ttservice' was built under R version 4.5.1
# run_all_scenarios_parallel() calls this before timing each scenario so the
# session doing the timing has this branch's code loaded.
init_a <- function() suppressMessages(devtools::load_all(wt_a, quiet = TRUE))
# Use the same worker count as the Branch B run (1). Timing this branch on 8
# concurrent workers while the other runs on 1 measures the two under different
# CPU contention and makes the comparison unfair.
res_a <- run_all_scenarios_parallel(branch_label = "master", reps = 20L, workers = 1L, initializer = init_a)
#> Warning: package ‘future’ was built under R version 4.5.1
#> Warning: package ‘GenomeInfoDb’ was built under R version 4.5.1
#> Warning: package ‘ttservice’ was built under R version 4.5.1
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> Warning: package ‘future’ was built under R version 4.5.1
#> Warning: package ‘GenomeInfoDb’ was built under R version 4.5.1
#> Warning: package ‘ttservice’ was built under R version 4.5.1
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> Warning: package ‘future’ was built under R version 4.5.1
#> Warning: package ‘GenomeInfoDb’ was built under R version 4.5.1
#> Warning: package ‘ttservice’ was built under R version 4.5.1
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> Warning: package ‘future’ was built under R version 4.5.1
#> Warning: package ‘GenomeInfoDb’ was built under R version 4.5.1
#> Warning: package ‘ttservice’ was built under R version 4.5.1
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> Warning: package ‘future’ was built under R version 4.5.1
#> Warning: package ‘GenomeInfoDb’ was built under R version 4.5.1
#> Warning: package ‘ttservice’ was built under R version 4.5.1
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> Warning: package ‘future’ was built under R version 4.5.1
#> Warning: package ‘GenomeInfoDb’ was built under R version 4.5.1
#> Warning: package ‘ttservice’ was built under R version 4.5.1
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> Warning: package ‘future’ was built under R version 4.5.1
#> Warning: package ‘GenomeInfoDb’ was built under R version 4.5.1
#> Warning: package ‘ttservice’ was built under R version 4.5.1
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> Warning: package ‘future’ was built under R version 4.5.1
#> Warning: package ‘GenomeInfoDb’ was built under R version 4.5.1
#> Warning: package ‘ttservice’ was built under R version 4.5.1
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.

# Branch B
load_branch_code(wt_b)
# Passed as `initializer`, which run_all_scenarios_parallel() calls before
# timing each scenario.
init_b <- function() {
  suppressMessages(devtools::load_all(wt_b, quiet = TRUE))
}
res_b <- run_all_scenarios_parallel(
  branch_label = "query-to-slot-routines",
  reps = 20L,
  workers = 1L,
  initializer = init_b
)
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: A data frame is returned for independent data analysis.
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> tidySummarizedExperiment says: The resulting data frame is not rectangular (all genes for all samples), a tibble is returned for independent data analysis.
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> Warning in microbenchmark::microbenchmark(eval_tidy(expr_quo), times = reps, :
#> Could not measure overhead. Your clock might lack precision.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.
#> tidySummarizedExperiment says: Key columns are missing. A data frame is returned for independent data analysis.

# Stack the timings collected for both branches into a single table and tag
# each row with the verb family it exercises. The family is read from the
# scenario name prefix; anything that is neither a filter nor a select
# scenario is counted as a mutate scenario.
results <- dplyr::mutate(
  dplyr::bind_rows(res_a, res_b),
  operation = dplyr::case_when(
    grepl("^filter", scenario) ~ "filter",
    grepl("^select", scenario) ~ "select",
    .default = "mutate"
  )
)

# Median elapsed time (ms) for every branch/scenario pair, reshaped so that
# each scenario is one row and each branch contributes one column.
summary_table <- results %>%
  dplyr::group_by(branch, scenario) %>%
  dplyr::summarise(
    median_ms = stats::median(elapsed_ms),
    .groups = "drop"
  ) %>%
  tidyr::pivot_wider(
    names_from = branch,
    values_from = median_ms
  )

# Print the wide table in the rendered document.
summary_table

6 Visualise with ggplot boxplots

# Dodge width shared by the boxplot and point layers so that the points sit
# on top of the box belonging to the same branch.
dodge_w <- 0.7

ggplot(
  data = results,
  mapping = aes(x = scenario, y = elapsed_ms, fill = branch)
) +
  # One box per branch within each scenario. Outlier markers are suppressed;
  # the point layer below draws every observation.
  geom_boxplot(
    width = 0.7,
    outlier.shape = NA,
    position = position_dodge(width = dodge_w)
  ) +
  # Individual timings, dodged like the boxes and jittered only along the
  # scenario axis (jitter.height = 0 leaves elapsed_ms untouched).
  geom_point(
    size = 0.5,
    alpha = 0.6,
    position = position_jitterdodge(
      jitter.width = 0.1,
      jitter.height = 0,
      dodge.width = dodge_w
    )
  ) +
  annotation_logticks(sides = "b") +
  # Log10 timing scale; the flip puts scenarios on the vertical axis.
  scale_y_log10() +
  coord_flip() +
  # One panel row per operation, each with its own scenario axis and a
  # height proportional to the number of scenarios it holds.
  facet_grid(operation ~ ., scales = "free_y", space = "free_y") +
  labs(
    title = "Mutate, Filter, and Select benchmark across branches",
    x = "Scenario",
    y = "Elapsed (ms)"
  ) +
  theme_bw() +
  # Legend above the panels; tilt the x-axis tick labels by 45 degrees.
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Record the R version, platform, locale and package versions used for this
# benchmark run so the timings can be interpreted and reproduced.
sessionInfo()
#> R version 4.5.0 (2025-04-11)
#> Platform: x86_64-apple-darwin20
#> Running under: macOS Sonoma 14.6.1
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> time zone: Australia/Adelaide
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] tidySummarizedExperiment_1.19.7 tidyr_1.3.1                    
#>  [3] testthat_3.2.3                  ttservice_0.5.3                
#>  [5] future.apply_1.20.0             future_1.67.0                  
#>  [7] microbenchmark_1.5.0            airway_1.28.0                  
#>  [9] devtools_2.4.5                  usethis_3.1.0                  
#> [11] rlang_1.1.6                     SummarizedExperiment_1.38.1    
#> [13] Biobase_2.68.0                  GenomicRanges_1.60.0           
#> [15] GenomeInfoDb_1.44.2             IRanges_2.42.0                 
#> [17] S4Vectors_0.46.0                BiocGenerics_0.54.0            
#> [19] generics_0.1.4                  MatrixGenerics_1.20.0          
#> [21] matrixStats_1.5.0               dplyr_1.1.4                    
#> [23] ggplot2_3.5.2                  
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.2.1        viridisLite_0.4.2       farver_2.1.2           
#>  [4] S7_0.2.0                lazyeval_0.2.2          fastmap_1.2.0          
#>  [7] promises_1.3.3          plyxp_1.2.7             digest_0.6.37          
#> [10] mime_0.13               lifecycle_1.0.4         ellipsis_0.3.2         
#> [13] magrittr_2.0.3          compiler_4.5.0          sass_0.4.10            
#> [16] tools_4.5.0             yaml_2.3.10             data.table_1.17.8      
#> [19] knitr_1.50              S4Arrays_1.8.1          htmlwidgets_1.6.4      
#> [22] pkgbuild_1.4.8          DelayedArray_0.34.1     RColorBrewer_1.1-3     
#> [25] pkgload_1.4.0           abind_1.4-8             miniUI_0.1.2           
#> [28] withr_3.0.2             purrr_1.1.0             desc_1.4.3             
#> [31] grid_4.5.0              fansi_1.0.6             urlchecker_1.0.1       
#> [34] profvis_0.4.0           xtable_1.8-4            globals_0.18.0         
#> [37] scales_1.4.0            cli_3.6.5               rmarkdown_2.29         
#> [40] crayon_1.5.3            remotes_2.5.0           rstudioapi_0.17.1      
#> [43] httr_1.4.7              sessioninfo_1.2.3       cachem_1.1.0           
#> [46] stringr_1.5.1           parallel_4.5.0          XVector_0.48.0         
#> [49] vctrs_0.6.5             Matrix_1.7-3            jsonlite_2.0.0         
#> [52] listenv_0.9.1           plotly_4.11.0           jquerylib_0.1.4        
#> [55] glue_1.8.0              parallelly_1.45.1       codetools_0.2-20       
#> [58] stringi_1.8.7           gtable_0.3.6            later_1.4.4            
#> [61] UCSC.utils_1.4.0        tibble_3.3.0            pillar_1.11.0          
#> [64] brio_1.1.5              htmltools_0.5.8.1       GenomeInfoDbData_1.2.14
#> [67] R6_2.6.1                rprojroot_2.1.0         evaluate_1.0.5         
#> [70] shiny_1.11.1            lattice_0.22-7          memoise_2.0.1          
#> [73] httpuv_1.6.16           bslib_0.9.0             Rcpp_1.1.0             
#> [76] SparseArray_1.8.1       xfun_0.53               fs_1.6.6               
#> [79] prettydoc_0.4.1         pkgconfig_2.0.3