library("cellxgene.census")
library("ggplot2")
# Open a handle on the Census via cellxgene.census; "stable" asks for whichever
# release is tagged stable at run time (see the message captured below).
# NOTE(review): no matching close of this handle is visible in this section --
# confirm it is released once all reads are done.
census <- open_soma(census_version = "stable")
## The stable Census release is currently 2023-07-25. Specify census_version = "2023-07-25" in future calls to open_soma() to ensure data consistency.

# Define data selection and filter ----

# Value substituted into the `is_primary_data == ...` obs filter.
primary_data_only <- TRUE

# Each element names the obs columns whose combined values define one
# candidate notion of "batch"; every definition is tallied separately.
batches <- list(
  c("dataset_id", "donor_id", "assay", "suspension_type"),
  c("dataset_id", "donor_id"),
  c("donor_id"),
  c("dataset_id", "donor_id", "suspension_type"),
  c("dataset_id", "assay", "suspension_type"),
  c("dataset_id", "suspension_type"),
  c("dataset_id", "assay")
)

# obs columns to fetch: every column used by at least one batch definition,
# in order of first appearance.
columns <- unique(unlist(batches))

# Only batches with fewer cells than this are drawn in the histograms.
histogram_cutoff <- 5000

# Human ----

## Read cell metadata ----

# Fetch the selected obs columns from the "homo_sapiens" experiment, keeping
# only rows that satisfy the is_primary_data filter, and convert the result
# to a data frame. Intermediate handles stay local to this expression.
obs_df <- local({
  human_experiment <- census$get("census_data")$get("homo_sapiens")
  obs_filter <- paste0("is_primary_data == ", primary_data_only)
  obs_reader <- human_experiment$obs$read(
    value_filter = obs_filter,
    column_names = columns
  )
  obs_table <- obs_reader$concat()
  obs_table$to_data_frame()
})

## Tally cells per combination of batch variables ----

# For every batch definition in `batches`, count the cells in each batch (a
# unique combination of the listed obs columns) and build a histogram of the
# batch sizes. One plot per definition is stored in `plots`, then printed.
plots <- vector("list", length(batches))
for (i in seq_along(batches)) {
  variables <- batches[[i]]

  # Collapse the selected columns into a single key per cell; drop = FALSE
  # keeps a data frame even when only one column is selected.
  obs_df$values <- do.call(
    paste,
    c(obs_df[, variables, drop = FALSE], sep = "...")
  )
  tallies <- as.data.frame(sort(table(obs_df$values)))

  # Summary statistics cover ALL batches, including those at or above
  # `histogram_cutoff` that are excluded from the histogram itself.
  text <- paste0(
    "min = ", min(tallies$Freq), "\n",
    "max = ", max(tallies$Freq), "\n",
    "avg = ", mean(tallies$Freq), "\n",
    "median = ", median(tallies$Freq), "\n",
    "n = ", sum(tallies$Freq)
  )

  plots[[i]] <- ggplot(
    tallies[tallies$Freq < histogram_cutoff, ],
    aes(x = Freq)
  ) +
    geom_histogram(bins = 100, color = "gray50", fill = "gray90") +
    # annotate() draws the label exactly once; geom_text() would redraw it
    # for every row of the plotted data.
    annotate("text", label = text, x = Inf, y = Inf, hjust = 1, vjust = 1) +
    labs(subtitle = paste(variables, collapse = "--")) +
    theme_bw()
}
for (p in plots) {
  print(p)
}

# Mouse ----

## Read cell metadata ----

# Fetch the selected obs columns from the "mus_musculus" experiment, keeping
# only rows that satisfy the is_primary_data filter, and convert the result
# to a data frame. Intermediate handles stay local to this expression.
obs_df <- local({
  mouse_experiment <- census$get("census_data")$get("mus_musculus")
  obs_filter <- paste0("is_primary_data == ", primary_data_only)
  obs_reader <- mouse_experiment$obs$read(
    value_filter = obs_filter,
    column_names = columns
  )
  obs_table <- obs_reader$concat()
  obs_table$to_data_frame()
})

## Tally cells per combination of batch variables ----

# For every batch definition in `batches`, count the cells in each batch (a
# unique combination of the listed obs columns) and build a histogram of the
# batch sizes. One plot per definition is stored in `plots`, then printed.
plots <- vector("list", length(batches))
for (i in seq_along(batches)) {
  variables <- batches[[i]]

  # Collapse the selected columns into a single key per cell; drop = FALSE
  # keeps a data frame even when only one column is selected.
  obs_df$values <- do.call(
    paste,
    c(obs_df[, variables, drop = FALSE], sep = "...")
  )
  tallies <- as.data.frame(sort(table(obs_df$values)))

  # Summary statistics cover ALL batches, including those at or above
  # `histogram_cutoff` that are excluded from the histogram itself.
  text <- paste0(
    "min = ", min(tallies$Freq), "\n",
    "max = ", max(tallies$Freq), "\n",
    "avg = ", mean(tallies$Freq), "\n",
    "median = ", median(tallies$Freq), "\n",
    "n = ", sum(tallies$Freq)
  )

  plots[[i]] <- ggplot(
    tallies[tallies$Freq < histogram_cutoff, ],
    aes(x = Freq)
  ) +
    geom_histogram(bins = 100, color = "gray50", fill = "gray90") +
    # annotate() draws the label exactly once; geom_text() would redraw it
    # for every row of the plotted data.
    annotate("text", label = text, x = Inf, y = Inf, hjust = 1, vjust = 1) +
    labs(subtitle = paste(variables, collapse = "--")) +
    theme_bw()
}
for (p in plots) {
  print(p)
}