This notebook is only used to create the figures and tables. Results are discussed here.

Reading data

Load the data generated by “TODO insert link to code that generated data”

# Per-dataset category summary, ordered so that the datasets with the most
# multi-category metadata columns come first.
n_categories <- arrange(
  read_tsv("./n_categories.tsv"),
  desc(n_col_two_or_more_category)
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   exp_url = col_character(),
##   n_col_one_category = col_double(),
##   n_col_two_or_more_category = col_double(),
##   largest_category_n = col_double(),
##   largest_category_label = col_character()
## )
# Rewrite the cellxgene.dev.single-cell.czi.technology host in every
# explorer URL to cellxgene.cziscience.com.
# fixed = TRUE makes gsub() match the host name literally; when the pattern is
# interpreted as a regular expression, each "." matches any character.
n_categories$exp_url <- gsub(
  "cellxgene.dev.single-cell.czi.technology",
  "cellxgene.cziscience.com",
  n_categories$exp_url,
  fixed = TRUE
)

# Table of (n_categories, counts) pairs, ordered by ascending n_categories.
category_counts <- arrange(read_tsv("./category_counts.tsv"), n_categories)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   n_categories = col_double(),
##   counts = col_double()
## )
# Keep only the rows whose n_categories value is greater than 1.
category_counts <- category_counts[category_counts[["n_categories"]] > 1, ]

Analysis

Histogram of the number of columns that have a given number of categories. A column is a cell metadata column (e.g. “Sex”) and a category is one of the distinct values that occur in that column (e.g. “Male”, “Female”, “Unknown”).

# Bar chart of how many cell metadata columns (y) have each number of
# categories (x). Only the first 80 rows of category_counts are drawn, while
# the subtitle reports the maximum n_categories over all rows.
ggplot(
  head(category_counts, 80),
  aes(x = as.factor(n_categories), y = counts)
) +
  # geom_col() takes bar heights directly from the y aesthetic.
  geom_col() +
  labs(
    title = "Histogram of category counts per cell metadata column",
    # The literal already ends with a space, so paste0() is used here;
    # paste() would insert a second space before the number.
    subtitle = paste0(
      "Max number of categories is ",
      max(category_counts$n_categories)
    ),
    x = "Number of categories in column",
    y = "Cell metadata column counts"
  ) +
  theme_bw()

Table, each row is a dataset. The columns are:

  1. Number of columns w/ 1 cat: number of cell metadata columns that only have 1 category.
  2. Number of columns w/ > 1 cat: number of cell metadata columns that have more than 1 category.
  3. Column with highest cats: the column that has the highest number of categories.
  4. N cats for column with highest cat n: the number of categories in the column named under “Column with highest cats”.
# Render the first 40 rows of n_categories, in its current row order, as an
# HTML table with human-readable column headers.
n_categories %>%
  dplyr::rename(
    `Number of columns w/ 1 cat` = n_col_one_category,
    `Number of columns w/ > 1 cat` = n_col_two_or_more_category,
    `Column with highest cats` = largest_category_label,
    `N cats for column with highest cat n` = largest_category_n
  ) %>%
  head(40) %>%
  kbl() %>%
  # Spell out TRUE: T is an ordinary variable that can be reassigned.
  kable_paper("hover", full_width = TRUE)
exp_url Number of columns w/ 1 cat Number of columns w/ > 1 cat N cats for column with highest cat n Column with highest cats
https://cellxgene.cziscience.com/e/d622cee4-56e1-44ba-8b05-fd2f0f2032e6.cxg/ 10 49 6288 exp_component_vendor_name
https://cellxgene.cziscience.com/e/1304e107-0f06-4d33-b634-d95ed986d02b.cxg/ 18 44 6171 exp_component_vendor_name
https://cellxgene.cziscience.com/e/066943a2-fdac-4b29-b348-40cede398e4e.cxg/ 2 41 354 original_ann_nonharmonized
https://cellxgene.cziscience.com/e/01ad3cd7-3929-4654-84c0-6db05bd5fd59.cxg/ 6 39 199 pool_clust
https://cellxgene.cziscience.com/e/28c696bb-9549-434b-9340-dc745a846f9a.cxg/ 4 31 22375 seq_name
https://cellxgene.cziscience.com/e/0b75c598-0893-4216-afe8-5414cab7739d.cxg/ 2 30 93 library
https://cellxgene.cziscience.com/e/de2c780c-1747-40bd-9ccf-9588ec186cee.cxg/ 4 26 20 Sample ID
https://cellxgene.cziscience.com/e/fa8605cf-f27e-44af-ac2a-476bee4410d3.cxg/ 5 24 32 rank
https://cellxgene.cziscience.com/e/9dbab10c-118d-496b-966a-67f1763a6b7d.cxg/ 2 24 284 sampleID
https://cellxgene.cziscience.com/e/edc8d3fe-153c-4e3d-8be0-2108d30f8d70.cxg/ 1 23 97 sample_id
https://cellxgene.cziscience.com/e/4ed927e9-c099-49af-b8ce-a2652d069333.cxg/ 4 23 95 Sample ID_prep
https://cellxgene.cziscience.com/e/Single_cell_drug_screening_a549-42-remixed.cxg/ 10 22 1536 Combo
https://cellxgene.cziscience.com/e/Single_cell_drug_screening_mcf7-44-remixed.cxg/ 10 22 1535 Combo
https://cellxgene.cziscience.com/e/32b9bdce-2481-4c85-ba1b-6ad5fcea844c.cxg/ 3 21 58 ClusterNumber
https://cellxgene.cziscience.com/e/Single_cell_drug_screening_k562-43-remixed.cxg/ 11 21 1536 Combo
https://cellxgene.cziscience.com/e/42ff5b55-b848-4f4c-b7cb-b8aac107841c.cxg/ 4 21 531 Slice
https://cellxgene.cziscience.com/e/cb5efdb0-f91c-4cbd-9ad4-9d4fa41c572d.cxg/ 4 21 410 Slice
https://cellxgene.cziscience.com/e/574e9f9e-f8b4-41ef-bf19-89a9964fd9c7.cxg/ 2 20 82 resolution_5
https://cellxgene.cziscience.com/e/5ba85070-a41c-4184-9c18-cf34c3fd0f62.cxg/ 2 20 51 resolution_5
https://cellxgene.cziscience.com/e/07854d9c-5375-4a9b-ac34-fa919d3c3686.cxg/ 2 20 100 cluster_id
https://cellxgene.cziscience.com/e/c7775e88-49bf-4ba2-a03b-93f00447c958.cxg/ 4 20 143 sample_id
https://cellxgene.cziscience.com/e/d3a83885-5198-4b04-8314-b753b66ef9a8.cxg/ 2 20 60 resolution_5
https://cellxgene.cziscience.com/e/75e6eee5-d0e3-4291-9360-f288ffe6c7c4.cxg/ 6 20 225 Specimen ID
https://cellxgene.cziscience.com/e/07428d73-fdea-4bd4-a801-94b00c4d961c.cxg/ 6 20 216 Specimen ID
https://cellxgene.cziscience.com/e/c19275f5-739e-4796-ad5d-b0830b760db1.cxg/ 6 20 225 Specimen ID
https://cellxgene.cziscience.com/e/c76098ba-eed3-45b1-98f2-96fcac55ed18.cxg/ 6 20 227 Specimen ID
https://cellxgene.cziscience.com/e/9bb9596d-f23f-4558-912f-d4dc7d52721b.cxg/ 5 20 241 Slice
https://cellxgene.cziscience.com/e/236baeda-5fdd-41bf-8e32-bae21ac7d435.cxg/ 6 20 220 Specimen ID
https://cellxgene.cziscience.com/e/8a8aedcb-5bb3-453d-a9f0-f37951ae1515.cxg/ 6 20 224 Specimen ID
https://cellxgene.cziscience.com/e/8c6ed88f-11bf-4159-bad7-ff41fb1e1eca.cxg/ 6 20 226 Specimen ID
https://cellxgene.cziscience.com/e/a0d173dd-bb10-4d13-8b9a-e13a4ee83c8c.cxg/ 6 20 229 Specimen ID
https://cellxgene.cziscience.com/e/d87f3f91-dca4-494b-8993-c4e3008a8fa5.cxg/ 6 20 228 Specimen ID
https://cellxgene.cziscience.com/e/eec3e37d-ed41-4881-bc6e-aaf39a2c6eb0.cxg/ 6 20 228 Specimen ID
https://cellxgene.cziscience.com/e/d2fc9880-e6d3-4922-af5c-61f4f517adfa.cxg/ 6 20 227 Specimen ID
https://cellxgene.cziscience.com/e/605b89b1-c474-4180-8c0b-88afb5920991.cxg/ 6 20 228 Specimen ID
https://cellxgene.cziscience.com/e/5ae6ab72-1927-4d5b-9826-86be65791293.cxg/ 6 20 224 Specimen ID
https://cellxgene.cziscience.com/e/de104f7e-14fa-4795-bd19-b5ee2c1563e0.cxg/ 6 20 226 Specimen ID
https://cellxgene.cziscience.com/e/79d485a8-b8b1-49f2-85aa-c44e5206aa53.cxg/ 6 20 226 Specimen ID
https://cellxgene.cziscience.com/e/5097d77d-08fa-4105-a18f-4072d61522a4.cxg/ 6 20 227 Specimen ID
https://cellxgene.cziscience.com/e/ac0fee7e-0999-4319-b244-20278e1ff2fb.cxg/ 6 20 228 Specimen ID
# Same table layout, but sorted so that the datasets whose largest column has
# the most categories come first; only the top 40 rows are rendered.
n_categories %>%
  arrange(desc(largest_category_n)) %>%
  dplyr::rename(
    `Number of columns w/ 1 cat` = n_col_one_category,
    `Number of columns w/ > 1 cat` = n_col_two_or_more_category,
    `Column with highest cats` = largest_category_label,
    `N cats for column with highest cat n` = largest_category_n
  ) %>%
  head(40) %>%
  kbl() %>%
  # Spell out TRUE: T is an ordinary variable that can be reassigned.
  kable_paper("hover", full_width = TRUE)
exp_url Number of columns w/ 1 cat Number of columns w/ > 1 cat N cats for column with highest cat n Column with highest cats
https://cellxgene.cziscience.com/e/f7c1c579-2dc0-47e2-ba19-8165c5a0e353.cxg/ 3 18 4062980 sample
https://cellxgene.cziscience.com/e/fa27492b-82ff-4ab7-ac61-0e2b184eee67.cxg/ 3 18 1001288 sample
https://cellxgene.cziscience.com/e/d33814fc-31e8-4f39-a200-a0ca21a9b134.cxg/ 8 9 387060 cellID
https://cellxgene.cziscience.com/e/b22a3a55-9a35-434e-93e1-f2483037f33c.cxg/ 7 10 259558 cellID
https://cellxgene.cziscience.com/e/48b37086-25f7-4ecd-be66-f5bb378e3aea.cxg/ 5 12 245389 cell
https://cellxgene.cziscience.com/e/ca3790f0-c6a6-41b1-b532-61f9b86d346a.cxg/ 8 9 167181 cellID
https://cellxgene.cziscience.com/e/35081d47-99bf-4507-9541-735428df9a9f.cxg/ 5 11 128930 QC
https://cellxgene.cziscience.com/e/c88e2a9c-72b8-4a88-a2f6-e428eada0c86.cxg/ 11 14 122641 exp_component_name
https://cellxgene.cziscience.com/e/98e5ea9f-16d6-47ec-a529-686e76515e39.cxg/ 5 13 110824 cell
https://cellxgene.cziscience.com/e/5e765f97-1cf1-407e-a86c-e28701f4749d.cxg/ 9 16 71183 exp_component_name
https://cellxgene.cziscience.com/e/31dd355c-3140-4558-a648-7e1e5f00480b.cxg/ 9 9 68394 cellID
https://cellxgene.cziscience.com/e/047d57f2-4d14-45de-aa98-336c6f583750.cxg/ 6 8 67794 CellID
https://cellxgene.cziscience.com/e/01209dce-3575-4bed-b1df-129f57fbc031.cxg/ 4 9 50186 barcode
https://cellxgene.cziscience.com/e/0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.cxg/ 6 10 40220 cell
https://cellxgene.cziscience.com/e/50d79de5-bd17-4d14-a295-199d71ff56be.cxg/ 12 12 40166 exp_component_name
https://cellxgene.cziscience.com/e/e2b469d4-b5c3-4a35-9d19-ee71ce61cae0.cxg/ 6 9 35718 cell
https://cellxgene.cziscience.com/e/f7a068f1-0fdb-48e8-8029-db870ff11d9e.cxg/ 4 12 29486 sample_id
https://cellxgene.cziscience.com/e/2872f4b0-b171-46e2-abc6-befcf6de6306.cxg/ 3 18 29467 sample
https://cellxgene.cziscience.com/e/6acb6637-ac08-4a65-b2d1-581e51dc7ccf.cxg/ 4 12 29050 sample_id
https://cellxgene.cziscience.com/e/a7ace090-1ba1-47f2-8def-6e11298b7816.cxg/ 6 10 28867 cell
https://cellxgene.cziscience.com/e/e0ed3c55-aff6-4bb7-b6ff-98a2d90b890c.cxg/ 12 12 25001 nUMI
https://cellxgene.cziscience.com/e/0fb7916e-7a68-4a4c-a441-3ab3989f29a7.cxg/ 6 10 24540 cell
https://cellxgene.cziscience.com/e/b6203114-e133-458a-aed5-eed1028378b4.cxg/ 4 12 24213 sample_id
https://cellxgene.cziscience.com/e/28c696bb-9549-434b-9340-dc745a846f9a.cxg/ 4 31 22375 seq_name
https://cellxgene.cziscience.com/e/b8c618e5-4b3d-4566-8a3f-7e40047f5c54.cxg/ 6 10 21647 cell
https://cellxgene.cziscience.com/e/98ad5247-68f8-42f8-b8e5-7938cb373a91.cxg/ 6 10 20680 cell
https://cellxgene.cziscience.com/e/7bdddd90-9428-47e2-bb80-e77d8b1cc96e.cxg/ 6 18 16204 uid
https://cellxgene.cziscience.com/e/ccda558a-27d5-4c58-bb79-2c079abe059f.cxg/ 6 18 16204 uid
https://cellxgene.cziscience.com/e/db55b719-6102-493a-9251-404bc501d0de.cxg/ 6 11 14517 cell
https://cellxgene.cziscience.com/e/c08f8441-4a10-4748-872a-e70c0bcccdba.cxg/ 6 10 13417 cell
https://cellxgene.cziscience.com/e/7c6091da-4606-44c7-a2c4-ef896de09e28.cxg/ 7 8 12295 cell
https://cellxgene.cziscience.com/e/9b686bb6-1427-4e13-b451-7ee961115cf9.cxg/ 3 13 10739 sample_id
https://cellxgene.cziscience.com/e/34575f91-6990-4df9-9d1f-c175deba676b.cxg/ 7 12 9876 FullCellID
https://cellxgene.cziscience.com/e/15d7a3cf-bb3a-4169-a6be-03353be4f680.cxg/ 7 12 9876 FullCellID
https://cellxgene.cziscience.com/e/1efd4700-87dd-4b45-8762-11ba3fea7a65.cxg/ 6 11 9669 cell
https://cellxgene.cziscience.com/e/krasnow_lab_human_lung_cell_atlas_smartseq2-2-remixed.cxg/ 5 15 9409 cell.id
https://cellxgene.cziscience.com/e/6e4f871d-fd7c-4909-8c14-e4c9957c2e8f.cxg/ 6 9 9275 cell
https://cellxgene.cziscience.com/e/93966790-bbfa-420f-aa85-bc5ca51d9c96.cxg/ 6 10 8945 cell
https://cellxgene.cziscience.com/e/1fe63353-9e75-4824-aa30-ed8d84be748c.cxg/ 6 11 8613 cell
https://cellxgene.cziscience.com/e/2491629a-bde0-46ad-a073-e34fcb516857.cxg/ 6 11 8311 cell