source("./utils/dot_plot_functions.R", chdir = T)
source("./utils/make_ontology_ordering.R", chdir = T)
library("gridExtra")
library("kableExtra")
Let’s load CL onotlogy
CL <- get_ontology("../data/CL.obo")
Let’s define gene sets
genes_immune <- c("CD68", "CD79A", "CD8A", "FOXJ1", "GNLY", "CD4",
"GNLY", "CD79A", "JCHAIN", "MNDA", "MUC5B", "KTR18")
genes_housekeeping <- c("MALAT1")
genes_lungmap_paper <- c("TP63", "MYB1", "RIMS2", "FCN1", "FLT3", "PRF1", "PAX5",
"MS4A2", "AGER", "STPA2", "BMX", "ACKRT1")
genes_krasnow <- c("CCL5", "CCL21", "CCL23")
genes_zemin <- c("CD3D", "GNLY", "JCHAIN", "CSF3R", "CD68", "FOXJ1", "MUC5B", "KRT5", "SPPR3")
genes_marker_rankit <- c("AOAH", "PARAL1", "SKAP1", "LGI3", "CSRP3-AS1", "MS4A1", "GNLY", "LGALS7B", "LAMC3", "ACKR1", "CD3D", "CD8B", "CCL7", "LYPD2")
genes_marker_scvi <- c("AOAH", "PARAL1", "SKAP1", "PLA2G1B", "CSRP3-AS1", "MS4A1", "GNLY", "KRT5", "LAMC3", "ACKR1", "IL7R", "CD8A", "CCL2", "MUC16")
names(genes_marker_scvi) <- names(genes_marker_rankit) <- c("monocyte", "macrophage", "T cell", "type II pneumocyte", "ciliated cell", "B cell", "natural killer cell", "basal cell", "pericyte cell", "vein endothelial cell", "CD4-positive, alpha-beta T cell", "CD8-positive, alpha-beta T cell", "CD14-positive, CD16-negative classical monocyte", "secretory cell")
genes_all_marker_rankit <- c("STAC","TPRG1","AOAH","TBXAS1","TREM2","MCEMP1","SKAP1","ARHGAP15","MAGI1","MALAT1","RPL12","RPL18A","PLA2G1B","GKN2","B2M","RPLP1","GPC5","SFTA3_ENSG00000229415","LDB2","GALNT18","RTKN2","ST6GALNAC5","CSRP3-AS1","LMNTD1","MS4A1","BLK","GNLY","SAMD3","KRT5","TP63","TMBIM4","GCHFR","CALD1","SH3BGRL3","HLA-DQA2","GPR183","GRHL1","CP","IFITM3","RPL39","ACTB","FTL","LAMC3","PDGFRB","EEF1A1","RPL10","TMSB10","FTL","ACKR1","RAMP3","PTPRB","VWF","CCL3","FMN1","ADH1B","PDZRN3","PLEK","S100A8","STARD13","NRP1","MTRNR2L12","MTRNR2L8","CCL2","TNFSF13B","IL1RN","CLMN","FAM3D","TSPAN8","CPA3","TPSAB1","RP11-1143G9.4","S100A12","LRRC10B","CFAP126","IL7R","BTG1","DERL3","MZB1","CD8B","CD8A","MTRNR2L12","DPYD","FGL2","MS4A6A","CA4","ADRB1","PTPRC","BTBD9","TPM2","TAGLN","LGALS3","ARID5B","PLVAP","ACKR1","MTRNR2L12","MTRNR2L8","LYZ","AIF1","MTRNR2L12","TNFSF13B","SFTPC","TIMP2","DEFB1","CCL7","LTB","TRBC2","MUC4","TACSTD2","HLA-DRB5","CD68","KRT23","GABRP","RSAD2","TNFSF10")
genes_all_marker_rankit <- c("STAC" , "TPRG1" , "AOAH" , "TBXAS1" , "TREM2" , "MCEMP1" , "SKAP1" , "ARHGAP15" , "MAGI1" , "PLA2G1B" , "GKN2" , "GPC5" , "SFTA3_ENSG00000229415" , "LDB2" , "GALNT18" , "RTKN2" , "ST6GALNAC5" , "CSRP3-AS1" , "LMNTD1" , "MS4A1" , "BLK" , "GNLY" , "SAMD3" , "KRT5" , "TP63" , "CALD1" , "HLA-DQA2" , "GPR183" , "GRHL1" , "CP" , "LAMC3" , "PDGFRB" , "ACKR1" , "RAMP3" , "PTPRB" , "VWF" , "ADH1B" , "PDZRN3" , "PLEK" , "S100A8" , "FAM3D" , "TSPAN8" , "CPA3" , "TPSAB1" , "RP11-1143G9.4" , "S100A12" , "LRRC10B" , "CFAP126" , "IL7R" , "BTG1" , "DERL3" , "MZB1" , "CD8B" , "CD8A" , "FGL2" , "MS4A6A" , "CA4" , "ADRB1" , "TPM2" , "TAGLN" , "PLVAP" , "ACKR1" , "LYZ" , "AIF1" , "DEFB1" , "CCL7" , "LTB" , "TRBC2" , "MUC4" , "TACSTD2" , "HLA-DRB5" , "CD68" , "KRT23" , "GABRP")
names(genes_all_marker_rankit) <- c("alveolar macrophage" , "alveolar macrophage" , "monocyte" , "monocyte" , "macrophage" , "macrophage" , "T cell" , "T cell" , "native cell" , "type II pneumocyte" , "type II pneumocyte" , "epithelial cell" , "epithelial cell" , "endothelial cell" , "endothelial cell" , "type I pneumocyte" , "type I pneumocyte" , "ciliated cell" , "ciliated cell" , "B cell" , "B cell" , "natural killer cell" , "natural killer cell" , "basal cell" , "basal cell" , "vascular associated smooth muscle cell" , "dendritic cell" , "dendritic cell" , "club cell" , "club cell" , "pericyte cell" , "pericyte cell" , "vein endothelial cell" , "vein endothelial cell" , "endothelial cell of artery" , "endothelial cell of artery" , "fibroblast" , "fibroblast" , "neutrophil" , "neutrophil" , "respiratory goblet cell" , "respiratory goblet cell" , "basophil" , "basophil" , "classical monocyte" , "classical monocyte" , "lung ciliated cell" , "lung ciliated cell" , "CD4-positive, alpha-beta T cell" , "CD4-positive, alpha-beta T cell" , "plasma cell" , "plasma cell" , "CD8-positive, alpha-beta T cell" , "CD8-positive, alpha-beta T cell" , "non-classical monocyte" , "non-classical monocyte" , "capillary endothelial cell" , "capillary endothelial cell" , "bronchial smooth muscle cell" , "bronchial smooth muscle cell" , "lung microvascular endothelial cell" , "lung microvascular endothelial cell" , "intermediate monocyte" , "intermediate monocyte" , "CD14-positive, CD16-negative classical monocyte" , "CD14-positive, CD16-negative classical monocyte" , "mature alpha-beta T cell" , "mature alpha-beta T cell" , "squamous epithelial cell" , "squamous epithelial cell" , "CD14-positive, CD16-positive monocyte" , "CD14-positive, CD16-positive monocyte" , "secretory cell" , "secretory cell")
genes <- unique(c(genes_all_marker_rankit, genes_marker_rankit, genes_marker_scvi, genes_immune, genes_housekeeping, genes_lungmap_paper, genes_krasnow, genes_zemin))
Now we load the expression matrices, gene and cell metadata.
data_dir <- "../data/normalized/for_notebook/"
layer_files <- file.path(data_dir,
c("all_datasets_rankit_mtx.gz", "all_datasets_scvi_mtx.gz",
"all_datasets_inmf_mtx.gz", "all_datasets_z_score_mtx.gz",
"all_datasets_X_mtx.gz")
)
exp_obj_all <- read_expression(layer_files = layer_files,
layer_names = c("rankit", "scvi", "inmf", "z_score", "raw"),
obs_file = file.path(data_dir, "all_datasets_obs.gz"),
var_file = file.path(data_dir, "all_datasets_var.gz"),
genes=genes)
And we make genes with less or equal than 2 reads be zero across all layers. This helps removing spurious expression and avoids creating bimodal distributions of expression (lower tail towards low expression)
exp_obj <- filter_genes_expression(exp_obj_all, layer = "raw", min_expression = 3)
Finally we get the cell type ordering based on the CL ontology tree and the cell types available in the data
ordered_cell_types_ontology <- get_cell_type_order(CL, unique(exp_obj$obs$cell_type_ontology_term_id))
cell_types_all <- setNames(exp_obj$obs$cell_type, exp_obj$obs$cell_type_ontology_term_id)
ordered_cell_types <- rev(cell_types_all[ordered_cell_types_ontology$ordered_nodes$name])
Let’s take a look at the tree
ordered_cell_types_ontology$onto_plot
print(ordered_cell_types_ontology$ordered_nodes)
## name label x y
## CL:0000003 CL:0000003 Native Cell 1389 1580
## CL:0000084 CL:0000084 T Cell 207 601
## CL:0000791 CL:0000791 Mature Alpha beta 99 322
## CL:0000624 CL:0000624 CD4 positive, 27 182
## CL:0001044 CL:0001044 Effector CD4 99 42
## CL:0000814 CL:0000814 Mature NK T Cell 99 182
## CL:0000625 CL:0000625 CD8 positive, 243 182
## CL:0001050 CL:0001050 Effector CD8 207 42
## CL:0000800 CL:0000800 Mature Gamma 234 322
## CL:0000236 CL:0000236 B Cell 315 461
## CL:0000786 CL:0000786 Plasma Cell 387 322
## CL:0000623 CL:0000623 Natural Killer 459 322
## CL:0000451 CL:0000451 Dendritic Cell 547 741
## CL:0000784 CL:0000784 Plasmacytoid 477 601
## CL:0000576 CL:0000576 Monocyte 767 741
## CL:0000860 CL:0000860 Classical 693 601
## CL:0002057 CL:0002057 CD14 positive, 693 461
## CL:0002397 CL:0002397 CD14 positive, 765 461
## CL:0000875 CL:0000875 Non classical 837 601
## CL:0002393 CL:0002393 Intermediate 909 601
## CL:0000235 CL:0000235 Macrophage 621 741
## CL:0000583 CL:0000583 Alveolar 567 461
## CL:0000767 CL:0000767 Basophil 981 601
## CL:0000775 CL:0000775 Neutrophil 1053 601
## CL:0000097 CL:0000097 Mast Cell 1125 601
## CL:0000595 CL:0000595 Enucleate 1197 601
## CL:0000646 CL:0000646 Basal Cell 1207 741
## CL:0000077 CL:0000077 Mesothelial Cell 1317 881
## CL:0000115 CL:0000115 Endothelial Cell 1389 881
## CL:0000071 CL:0000071 Blood Vessel 1292 601
## CL:0002543 CL:0002543 Vein Endothelial 1220 461
## CL:1000413 CL:1000413 Endothelial Cell 1292 461
## CL:0002144 CL:0002144 Capillary 1364 461
## CL:0002138 CL:0002138 Endothelial Cell 1389 601
## CL:2000016 CL:2000016 Lung 1489 461
## CL:0000556 CL:0000556 Megakaryocyte 1108 741
## CL:0000066 CL:0000066 Epithelial Cell 1677 1160
## CL:0000076 CL:0000076 Squamous 1317 1021
## CL:0017000 CL:0017000 Pulmonary 1461 881
## CL:0002062 CL:0002062 Type I Pneumocyte 1681 601
## CL:0002063 CL:0002063 Type II 2007 601
## CL:1000223 CL:1000223 Lung 2125 601
## CL:0000158 CL:0000158 Club Cell 1827 322
## CL:0000160 CL:0000160 Goblet Cell 1885 601
## CL:0002370 CL:0002370 Respiratory 1885 461
## CL:0000138 CL:0000138 Chondrocyte 1796 601
## CL:0000057 CL:0000057 Fibroblast 2041 1021
## CL:0002503 CL:0002503 Adventitial Cell 2223 1021
## CL:0000669 CL:0000669 Pericyte Cell 2439 1021
## CL:0000186 CL:0000186 Myofibroblast 2011 1160
## CL:0000192 CL:0000192 Smooth Muscle 2371 881
## CL:0000359 CL:0000359 Vascular 2352 741
## CL:0002598 CL:0002598 Bronchial Smooth 2424 601
## CL:0000064 CL:0000064 Ciliated Cell 1856 1440
## CL:1000271 CL:1000271 Lung Ciliated 1865 1300
## CL:0000151 CL:0000151 Secretory Cell 1897 1021
## CL:1000331 CL:1000331 Serous Cell Of 2197 461
The following is a view of a marker gene for natural killer cells: GNLY. Distributions in invididual datasets across different methods is shown.
Let’s take a look at how different normalization methods look in a dotplot using cell type ordering based on CL
Cell type hierarchical clustering
Gene hierarchical clustering
Cell type and gene hierarchical clustering
scanpy.tl.rank_genes_groupsThese marker genes were obtained with scanpy, as follows. After, the top gene with a positive fold change was used.
layer = "rankit"
# Create a mask for the cell type
adata.obs["_mask"] = adata.obs.cell_type == cell_type
adata.obs["_mask"] = adata.obs["_mask"].astype("category")
# Find top 1000 differentially expressed genes
sc.tl.rank_genes_groups(adata, groupby="_mask", n_genes=1000, layer=layer, rankby_abs=True)
Now doing hierarchical clustering of genes
Now doing hierarchical clustering of genes and cell types
Clips absolute mean to a quantile, currently set at 0.95.
Cell type hierarchical clustering
Gene hierarchical clustering
Cell type and gene hierarchical clustering
## [1] "AGER, FCN1, CCL5, PRF1, BMX, CCL23, MS4A2, RIMS2, TP63, FLT3, PAX5"
scanpy.tl.rank_genes_groupsThese marker genes were obtained with scanpy, as follows. After, the top gene with a positive fold change was used.
layer = "rankit"
# Create a mask for the cell type
adata.obs["_mask"] = adata.obs.cell_type == cell_type
adata.obs["_mask"] = adata.obs["_mask"].astype("category")
# Find top 1000 differentially expressed genes
sc.tl.rank_genes_groups(adata, groupby="_mask", n_genes=1000, layer=layer, rankby_abs=True)
Now doing hierarchical clustering of genes
Now doing hierarchical clustering of cell types
Now doing hierarchical clustering of cell types and genes
Clustering both genes and cell types with rankit data.
Clustering both genes and cell types with rankit data. And clipping colors to 0.98 quantile. All data shown further will be clipped.
Now clustering with scvi data.