Computing on X using online (incremental) algorithms

This tutorial showcases computing per-gene and per-cell statistics for a user-defined query using out-of-core operations, where incremental (online) algorithms are needed.

NOTE: when query results are small enough to fit in memory, it’s usually easier to use get_seurat() or get_single_cell_experiment().

Incremental mean calculation

Many statistics, such as marginal means, are easy to calculate incrementally. Let’s begin with a query on the X$raw sparse matrix of unnormalized read counts, which will return results in shards Incrementally accumulate the read count for each gene, then divide by the cell count to get the mean reads per cell for each gene.

First define a query - in this case a slice over the obs axis for cells with a specific tissue & sex value, and all genes on the var axis. The query$X() method returns an iterator of results, each as an Arrow Table. Each table will contain the sparse X data and obs/var coordinates, using standard SOMA names:

soma_data - the X values (float32)
soma_dim_0 - the obs coordinate (int64)
soma_dim_1 - the var coordinate (int64)

Important: the X matrices are joined to var/obs axis DataFrames by an integer join “id” (aka soma_joinid). They are NOT positionally indexed, and any given cell or gene may have a soma_joinid of any value (e.g., a large integer). In other words, for any given X value, the soma_dim_0 corresponds to the soma_joinid in the obs dataframe, and the soma_dim_1 coordinate corresponds to the soma_joinid in the var dataframe.

For convenience, the query class includes a utility to simplify operations on query slices. query$indexer is an indexer used to wrap the output of query$X(), converting from soma_joinids to positional indexing in the query results. Positions are [0, N), where N are the number of results on the query for any given axis.

Key points:

it is expensive to query and read the results - so rather than make multiple passes over the data, read it once and perform multiple computations.
by default, data in the census is indexed by soma_joinid and not positionally.

library(tiledbsoma)
census <- cellxgene.census::open_soma()

query <- census$get("census_data")$get("mus_musculus")$axis_query(
  measurement_name = "RNA",
  obs_query = SOMAAxisQuery$new(value_filter = "tissue=='brain' && sex=='male'")
)

genes_df <- query$var(column_names = c("feature_id", "feature_name"))$concat()
genes_df <- as.data.frame(genes_df)
n_genes <- nrow(genes_df)

# accumulator vector (for each gene) for the total count over all cells in X("raw")
raw_sum_by_gene <- numeric(n_genes)

# iterate through in-memory shards of query results
tables <- query$X("raw")$tables()
while (!tables$read_complete()) {
  table_part <- tables$read_next()
  # table_part is an Arrow table with the columns mentioned above. The result
  # order is not guaranteed!

  # table_part$soma_dim_1 is the var/gene soma_joinid. But note that these are
  # arbitrary int64 id's, and moreover each table_part may exhibit only a subset
  # of the values we'll see over all query results. query$indexer helps us map
  # any given soma_dim_1 values onto positions in query$var() (genes_df), that is
  # the union of all values we'll see.
  gene_indexes <- query$indexer$by_var(table_part$soma_dim_1)$as_vector()
  stopifnot(sum(gene_indexes >= n_genes) == 0)
  # sum(table_part) group by gene, yielding a numeric vector with the gene_index
  # in its names
  sum_part <- tapply(as.vector(table_part$soma_data), gene_indexes, sum)
  # update the accumulator vector
  which_genes <- as.integer(names(sum_part)) + 1  # nb: gene_indexes is zero-based
  stopifnot(sum(which_genes > n_genes) == 0)
  raw_sum_by_gene[which_genes] <- raw_sum_by_gene[which_genes] + sum_part
}

# Divide each sum by cell count to get mean reads per cell (for each gene),
# implicitly averaging in all zero entries in X even though they weren't included
# in the sparse query results.
genes_df$raw_mean <- raw_sum_by_gene / query$n_obs
genes_df
#> # A tibble: 52,392 × 3
#>    feature_id         feature_name   raw_mean
#>    <chr>              <chr>             <dbl>
#>  1 ENSMUSG00000051951 Xkr4          1.28     
#>  2 ENSMUSG00000089699 Gm1992        0        
#>  3 ENSMUSG00000102343 Gm37381       0        
#>  4 ENSMUSG00000025900 Rp1           0.291    
#>  5 ENSMUSG00000025902 Sox17        60.7      
#>  6 ENSMUSG00000104328 Gm37323       0.0000570
#>  7 ENSMUSG00000033845 Mrpl15       36.2      
#>  8 ENSMUSG00000025903 Lypla1       18.3      
#>  9 ENSMUSG00000104217 Gm37988       0        
#> 10 ENSMUSG00000033813 Tcea1        39.6      
#> # ℹ 52,382 more rows

Counting cells grouped by dataset and gene

The goal of this example is to count the number of cells with nonzero reads, grouped by gene and Census dataset_id. The result is a data frame with dataset, gene, and the number of cells with nonzero reads for that dataset and gene.

For this multi-factor aggregation, we’ll take advantage of dplyr routines instead of the lower-level vector indexer shown above. And for presentation purposes, we’ll limit the query to four genes, but this can be expanded to all genes easily.

library(dplyr)

query <- census$get("census_data")$get("mus_musculus")$axis_query(
  measurement_name = "RNA",
  obs_query = SOMAAxisQuery$new(value_filter = "tissue=='brain'"),
  var_query = SOMAAxisQuery$new(value_filter = "feature_name %in% c('Malat1', 'Ptprd', 'Dlg2', 'Pcdh9')")
)

obs_tbl <- query$obs(column_names=c("soma_joinid", "dataset_id"))$concat()
obs_df <- data.frame(
  # materialize soma_joinid as character to avoid overflowing R 32-bit integer
  cell_id = as.character(obs_tbl$soma_joinid),
  dataset_id = obs_tbl$dataset_id$as_vector()
)
var_tbl <- query$var(column_names=c("soma_joinid", "feature_name"))$concat()
var_df <- data.frame(
  gene_id = as.character(var_tbl$soma_joinid),
  feature_name = var_tbl$feature_name$as_vector()
)

# accumulator for # cells by dataset & gene
n_cells_grouped <- data.frame(
  "dataset_id" = character(0),
  "gene_id" = character(0),
  "n_cells" = numeric(0)
)

# iterate through in-memory shards of query results
tables <- query$X("raw")$tables()
while (!tables$read_complete()) {
  table_part <- tables$read_next()

  # prepare a (dataset,gene,1) tuple for each entry in table_part
  n_cells_part <- data.frame(
    "cell_id" = as.character(table_part$soma_dim_0),
    "gene_id" = as.character(table_part$soma_dim_1),
    "n_cells" = 1
  )
  n_cells_part <- left_join(n_cells_part, obs_df, by = "cell_id")
  stopifnot(sum(is.null(n_cells_part$dataset_id)) == 0)
  n_cells_part$cell_id <- NULL

  # fold those into n_cells_grouped
  n_cells_grouped <- bind_rows(n_cells_grouped, n_cells_part) %>%
                      group_by(dataset_id, gene_id) %>%
                      summarise(n_cells = sum(n_cells)) %>%
                      ungroup()
}

# add gene names for display
n_cells_grouped <- left_join(n_cells_grouped, var_df, by = "gene_id")
stopifnot(sum(is.null(n_cells_grouped$feature_name)) == 0)
n_cells_grouped[c("dataset_id", "feature_name", "n_cells")]
#> # A tibble: 17 × 3
#>    dataset_id                           feature_name n_cells
#>    <chr>                                <chr>          <dbl>
#>  1 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Dlg2           79513
#>  2 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Pcdh9          79476
#>  3 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Malat1         79667
#>  4 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Ptprd          79578
#>  5 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Dlg2              81
#>  6 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Pcdh9            125
#>  7 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Malat1         12622
#>  8 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Ptprd            474
#>  9 66ff82b4-9380-469c-bc4b-cfa08eacd325 Dlg2             856
#> 10 66ff82b4-9380-469c-bc4b-cfa08eacd325 Pcdh9           2910
#> 11 66ff82b4-9380-469c-bc4b-cfa08eacd325 Malat1          7102
#> 12 98e5ea9f-16d6-47ec-a529-686e76515e39 Dlg2             908
#> 13 98e5ea9f-16d6-47ec-a529-686e76515e39 Pcdh9           3027
#> 14 98e5ea9f-16d6-47ec-a529-686e76515e39 Malat1         20094
#> 15 c08f8441-4a10-4748-872a-e70c0bcccdba Dlg2              52
#> 16 c08f8441-4a10-4748-872a-e70c0bcccdba Pcdh9            117
#> 17 c08f8441-4a10-4748-872a-e70c0bcccdba Malat1         12992

Don’t forget to close the census.

census$close()