This tutorial showcases computing per-gene and per-cell statistics for a user-defined query using out-of-core operations, where incremental (online) algorithms are needed.
NOTE: when query results are small enough to fit in memory,
it’s usually easier to use get_seurat() or
get_single_cell_experiment().
Contents
Many statistics, such as marginal means, are easy to calculate
incrementally. Let’s begin with a query on the X$raw sparse
matrix of unnormalized read counts, which will return results in shards
Incrementally accumulate the read count for each gene, then divide by
the cell count to get the mean reads per cell for each gene.
First define a query - in this case a slice over the obs axis for
cells with a specific tissue & sex value, and all genes on the var
axis. The query$X() method returns an iterator of results,
each as an Arrow Table. Each table will contain the sparse X data and
obs/var coordinates, using standard SOMA names:
soma_data - the X values (float32)soma_dim_0 - the obs coordinate (int64)soma_dim_1 - the var coordinate (int64)Important: the X matrices are joined to var/obs axis
DataFrames by an integer join “id” (aka soma_joinid). They
are NOT positionally indexed, and any given cell or gene may have a
soma_joinid of any value (e.g., a large integer). In other
words, for any given X value, the soma_dim_0 corresponds to
the soma_joinid in the obs dataframe, and the
soma_dim_1 coordinate corresponds to the
soma_joinid in the var dataframe.
For convenience, the query class includes a utility to simplify
operations on query slices. query$indexer is an indexer
used to wrap the output of query$X(), converting from
soma_joinids to positional indexing in the query results.
Positions are [0, N), where N are the number of results on
the query for any given axis.
Key points:
soma_joinid and not positionally.library(tiledbsoma)
census <- cellxgene.census::open_soma()
query <- census$get("census_data")$get("mus_musculus")$axis_query(
measurement_name = "RNA",
obs_query = SOMAAxisQuery$new(value_filter = "tissue=='brain' && sex=='male'")
)
genes_df <- query$var(column_names = c("feature_id", "feature_name"))$concat()
genes_df <- as.data.frame(genes_df)
n_genes <- nrow(genes_df)
# accumulator vector (for each gene) for the total count over all cells in X("raw")
raw_sum_by_gene <- numeric(n_genes)
# iterate through in-memory shards of query results
tables <- query$X("raw")$tables()
while (!tables$read_complete()) {
table_part <- tables$read_next()
# table_part is an Arrow table with the columns mentioned above. The result
# order is not guaranteed!
# table_part$soma_dim_1 is the var/gene soma_joinid. But note that these are
# arbitrary int64 id's, and moreover each table_part may exhibit only a subset
# of the values we'll see over all query results. query$indexer helps us map
# any given soma_dim_1 values onto positions in query$var() (genes_df), that is
# the union of all values we'll see.
gene_indexes <- query$indexer$by_var(table_part$soma_dim_1)$as_vector()
stopifnot(sum(gene_indexes >= n_genes) == 0)
# sum(table_part) group by gene, yielding a numeric vector with the gene_index
# in its names
sum_part <- tapply(as.vector(table_part$soma_data), gene_indexes, sum)
# update the accumulator vector
which_genes <- as.integer(names(sum_part)) + 1 # nb: gene_indexes is zero-based
stopifnot(sum(which_genes > n_genes) == 0)
raw_sum_by_gene[which_genes] <- raw_sum_by_gene[which_genes] + sum_part
}
# Divide each sum by cell count to get mean reads per cell (for each gene),
# implicitly averaging in all zero entries in X even though they weren't included
# in the sparse query results.
genes_df$raw_mean <- raw_sum_by_gene / query$n_obs
genes_df
#> # A tibble: 52,392 × 3
#> feature_id feature_name raw_mean
#> <chr> <chr> <dbl>
#> 1 ENSMUSG00000051951 Xkr4 1.28
#> 2 ENSMUSG00000089699 Gm1992 0
#> 3 ENSMUSG00000102343 Gm37381 0
#> 4 ENSMUSG00000025900 Rp1 0.291
#> 5 ENSMUSG00000025902 Sox17 60.7
#> 6 ENSMUSG00000104328 Gm37323 0.0000570
#> 7 ENSMUSG00000033845 Mrpl15 36.2
#> 8 ENSMUSG00000025903 Lypla1 18.3
#> 9 ENSMUSG00000104217 Gm37988 0
#> 10 ENSMUSG00000033813 Tcea1 39.6
#> # ℹ 52,382 more rowsThe goal of this example is to count the number of cells with nonzero
reads, grouped by gene and Census dataset_id. The result is
a data frame with dataset, gene, and the number of cells with nonzero
reads for that dataset and gene.
For this multi-factor aggregation, we’ll take advantage of
dplyr routines instead of the lower-level vector indexer
shown above. And for presentation purposes, we’ll limit the query to
four genes, but this can be expanded to all genes easily.
library(dplyr)
query <- census$get("census_data")$get("mus_musculus")$axis_query(
measurement_name = "RNA",
obs_query = SOMAAxisQuery$new(value_filter = "tissue=='brain'"),
var_query = SOMAAxisQuery$new(value_filter = "feature_name %in% c('Malat1', 'Ptprd', 'Dlg2', 'Pcdh9')")
)
obs_tbl <- query$obs(column_names=c("soma_joinid", "dataset_id"))$concat()
obs_df <- data.frame(
# materialize soma_joinid as character to avoid overflowing R 32-bit integer
cell_id = as.character(obs_tbl$soma_joinid),
dataset_id = obs_tbl$dataset_id$as_vector()
)
var_tbl <- query$var(column_names=c("soma_joinid", "feature_name"))$concat()
var_df <- data.frame(
gene_id = as.character(var_tbl$soma_joinid),
feature_name = var_tbl$feature_name$as_vector()
)
# accumulator for # cells by dataset & gene
n_cells_grouped <- data.frame(
"dataset_id" = character(0),
"gene_id" = character(0),
"n_cells" = numeric(0)
)
# iterate through in-memory shards of query results
tables <- query$X("raw")$tables()
while (!tables$read_complete()) {
table_part <- tables$read_next()
# prepare a (dataset,gene,1) tuple for each entry in table_part
n_cells_part <- data.frame(
"cell_id" = as.character(table_part$soma_dim_0),
"gene_id" = as.character(table_part$soma_dim_1),
"n_cells" = 1
)
n_cells_part <- left_join(n_cells_part, obs_df, by = "cell_id")
stopifnot(sum(is.null(n_cells_part$dataset_id)) == 0)
n_cells_part$cell_id <- NULL
# fold those into n_cells_grouped
n_cells_grouped <- bind_rows(n_cells_grouped, n_cells_part) %>%
group_by(dataset_id, gene_id) %>%
summarise(n_cells = sum(n_cells)) %>%
ungroup()
}
# add gene names for display
n_cells_grouped <- left_join(n_cells_grouped, var_df, by = "gene_id")
stopifnot(sum(is.null(n_cells_grouped$feature_name)) == 0)
n_cells_grouped[c("dataset_id", "feature_name", "n_cells")]
#> # A tibble: 17 × 3
#> dataset_id feature_name n_cells
#> <chr> <chr> <dbl>
#> 1 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Dlg2 79513
#> 2 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Pcdh9 79476
#> 3 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Malat1 79667
#> 4 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Ptprd 79578
#> 5 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Dlg2 81
#> 6 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Pcdh9 125
#> 7 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Malat1 12622
#> 8 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Ptprd 474
#> 9 66ff82b4-9380-469c-bc4b-cfa08eacd325 Dlg2 856
#> 10 66ff82b4-9380-469c-bc4b-cfa08eacd325 Pcdh9 2910
#> 11 66ff82b4-9380-469c-bc4b-cfa08eacd325 Malat1 7102
#> 12 98e5ea9f-16d6-47ec-a529-686e76515e39 Dlg2 908
#> 13 98e5ea9f-16d6-47ec-a529-686e76515e39 Pcdh9 3027
#> 14 98e5ea9f-16d6-47ec-a529-686e76515e39 Malat1 20094
#> 15 c08f8441-4a10-4748-872a-e70c0bcccdba Dlg2 52
#> 16 c08f8441-4a10-4748-872a-e70c0bcccdba Pcdh9 117
#> 17 c08f8441-4a10-4748-872a-e70c0bcccdba Malat1 12992Don’t forget to close the census.