library("cellxgene.census")
library("ggplot2")
# Open a handle to the Census, requesting the release tagged "stable".
census <- open_soma(census_version = "stable")
## NOTE(review): the line below looks like output captured from the call above
## when this script was last run; the release date it names may be outdated.
## The stable Census release is currently 2023-07-25. Specify census_version = "2023-07-25" in future calls to open_soma() to ensure data consistency.
# Define data selection and filter
# Upper bound (exclusive) on the tallies that are drawn in each histogram.
histogram_cutoff <- 5000

# Value interpolated into the `is_primary_data` filter when reading metadata.
primary_data_only <- TRUE

# Cell metadata columns requested from the Census.
columns <- c(
  "dataset_id",
  "donor_id",
  "assay",
  "suspension_type"
)

# Candidate batch definitions: each element is the set of metadata columns
# whose combined values identify one batch. Written one definition per
# string and split on spaces into a list of character vectors.
batches <- strsplit(
  c(
    "dataset_id donor_id assay suspension_type",
    "dataset_id donor_id",
    "donor_id",
    "dataset_id donor_id suspension_type",
    "dataset_id assay suspension_type",
    "dataset_id suspension_type",
    "dataset_id assay"
  ),
  split = " ",
  fixed = TRUE
)
# Read cell metadata for human (homo_sapiens)
# Read the requested obs columns for "homo_sapiens", restricted by the
# `is_primary_data` filter, and materialize the result as a data frame.
# The intermediate handles live inside local() so only `obs_df` is created.
obs_df <- local({
  organism_obs <- census$get("census_data")$get("homo_sapiens")$obs
  obs_query <- organism_obs$read(
    value_filter = sprintf("is_primary_data == %s", primary_data_only),
    column_names = columns
  )
  obs_query$concat()$to_data_frame()
})
# Compute tallies per combination of batch variables
# Tally cells per batch definition and plot the distribution of the tallies.
#
# For each set of metadata columns in `batches`, the selected columns of
# `obs_df` are pasted into one key per cell, cells are counted per key, and a
# histogram of the counts below `histogram_cutoff` is stored in `plots`.
# The summary statistics printed on each plot are computed from all counts,
# including those at or above the cutoff.
plots <- vector("list", length(batches))
for (i in seq_along(batches)) {
  variables <- batches[[i]]

  # One key per cell: the selected columns joined with "...".
  # drop = FALSE keeps a data frame even when a single column is selected.
  obs_df$values <- do.call(
    paste,
    c(obs_df[, variables, drop = FALSE], sep = "...")
  )

  # Number of cells per key, sorted ascending, with the counts in `Freq`.
  tallies <- as.data.frame(sort(table(obs_df$values)))

  # Summary label; "n" is the sum of all tallies.
  text <- paste0(
    "min = ", min(tallies$Freq), "\n",
    "max = ", max(tallies$Freq), "\n",
    "avg = ", mean(tallies$Freq), "\n",
    "median = ", median(tallies$Freq), "\n",
    "n = ", sum(tallies$Freq)
  )

  plots[[i]] <- ggplot(
    tallies[tallies$Freq < histogram_cutoff, ],
    aes(x = Freq)
  ) +
    geom_histogram(bins = 100, color = "gray50", fill = "gray90") +
    # annotate() draws the label once in the top-right corner; geom_text()
    # would inherit the layer data and redraw the label for every row.
    annotate("text", label = text, x = Inf, y = Inf, hjust = 1, vjust = 1) +
    labs(subtitle = paste(variables, collapse = "--")) +
    theme_bw()
}

# ggplot objects created inside a loop must be printed explicitly.
for (p in plots) {
  print(p)
}
# Read cell metadata for mouse (mus_musculus)
# Read the requested obs columns for "mus_musculus", restricted by the
# `is_primary_data` filter, and materialize the result as a data frame.
# The intermediate handles live inside local() so only `obs_df` is created.
obs_df <- local({
  organism_obs <- census$get("census_data")$get("mus_musculus")$obs
  obs_query <- organism_obs$read(
    value_filter = sprintf("is_primary_data == %s", primary_data_only),
    column_names = columns
  )
  obs_query$concat()$to_data_frame()
})
# Compute tallies per combination of batch variables
# Tally cells per batch definition and plot the distribution of the tallies.
#
# For each set of metadata columns in `batches`, the selected columns of
# `obs_df` are pasted into one key per cell, cells are counted per key, and a
# histogram of the counts below `histogram_cutoff` is stored in `plots`.
# The summary statistics printed on each plot are computed from all counts,
# including those at or above the cutoff.
plots <- vector("list", length(batches))
for (i in seq_along(batches)) {
  variables <- batches[[i]]

  # One key per cell: the selected columns joined with "...".
  # drop = FALSE keeps a data frame even when a single column is selected.
  obs_df$values <- do.call(
    paste,
    c(obs_df[, variables, drop = FALSE], sep = "...")
  )

  # Number of cells per key, sorted ascending, with the counts in `Freq`.
  tallies <- as.data.frame(sort(table(obs_df$values)))

  # Summary label; "n" is the sum of all tallies.
  text <- paste0(
    "min = ", min(tallies$Freq), "\n",
    "max = ", max(tallies$Freq), "\n",
    "avg = ", mean(tallies$Freq), "\n",
    "median = ", median(tallies$Freq), "\n",
    "n = ", sum(tallies$Freq)
  )

  plots[[i]] <- ggplot(
    tallies[tallies$Freq < histogram_cutoff, ],
    aes(x = Freq)
  ) +
    geom_histogram(bins = 100, color = "gray50", fill = "gray90") +
    # annotate() draws the label once in the top-right corner; geom_text()
    # would inherit the layer data and redraw the label for every row.
    annotate("text", label = text, x = Inf, y = Inf, hjust = 1, vjust = 1) +
    labs(subtitle = paste(variables, collapse = "--")) +
    theme_bw()
}

# ggplot objects created inside a loop must be printed explicitly.
for (p in plots) {
  print(p)
}