source("./utils/dot_plot_functions.R", chdir = T)
source("./utils/make_ontology_ordering.R", chdir = T)
library("gridExtra")
library("kableExtra")
Let’s load CL onotlogy
CL <- get_ontology("../data/CL.obo")
Let’s define gene sets
genes_immune <- c("CD68", "CD79A", "CD8A", "FOXJ1", "GNLY", "CD4",
"GNLY", "CD79A", "JCHAIN", "MNDA", "MUC5B", "KTR18")
genes_housekeeping <- c("MALAT1")
genes_lungmap_paper <- c("TP63", "MYB1", "RIMS2", "FCN1", "FLT3", "PRF1", "PAX5",
"MS4A2", "AGER", "STPA2", "BMX", "ACKRT1")
genes_krasnow <- c("CCL5", "CCL21", "CCL23")
genes_zemin <- c("CD3D", "GNLY", "JCHAIN", "CSF3R", "CD68", "FOXJ1", "MUC5B", "KRT5", "SPPR3")
genes_marker_rankit <- c("AOAH", "PARAL1", "SKAP1", "LGI3", "CSRP3-AS1", "MS4A1", "GNLY", "LGALS7B", "LAMC3", "ACKR1", "CD3D", "CD8B", "CCL7", "LYPD2")
genes_marker_scvi <- c("AOAH", "PARAL1", "SKAP1", "PLA2G1B", "CSRP3-AS1", "MS4A1", "GNLY", "KRT5", "LAMC3", "ACKR1", "IL7R", "CD8A", "CCL2", "MUC16")
names(genes_marker_scvi) <- names(genes_marker_rankit) <- c("monocyte", "macrophage", "T cell", "type II pneumocyte", "ciliated cell", "B cell", "natural killer cell", "basal cell", "pericyte cell", "vein endothelial cell", "CD4-positive, alpha-beta T cell", "CD8-positive, alpha-beta T cell", "CD14-positive, CD16-negative classical monocyte", "secretory cell")
genes_all_marker_rankit <- c("STAC","TPRG1","AOAH","TBXAS1","TREM2","MCEMP1","SKAP1","ARHGAP15","MAGI1","MALAT1","RPL12","RPL18A","PLA2G1B","GKN2","B2M","RPLP1","GPC5","SFTA3_ENSG00000229415","LDB2","GALNT18","RTKN2","ST6GALNAC5","CSRP3-AS1","LMNTD1","MS4A1","BLK","GNLY","SAMD3","KRT5","TP63","TMBIM4","GCHFR","CALD1","SH3BGRL3","HLA-DQA2","GPR183","GRHL1","CP","IFITM3","RPL39","ACTB","FTL","LAMC3","PDGFRB","EEF1A1","RPL10","TMSB10","FTL","ACKR1","RAMP3","PTPRB","VWF","CCL3","FMN1","ADH1B","PDZRN3","PLEK","S100A8","STARD13","NRP1","MTRNR2L12","MTRNR2L8","CCL2","TNFSF13B","IL1RN","CLMN","FAM3D","TSPAN8","CPA3","TPSAB1","RP11-1143G9.4","S100A12","LRRC10B","CFAP126","IL7R","BTG1","DERL3","MZB1","CD8B","CD8A","MTRNR2L12","DPYD","FGL2","MS4A6A","CA4","ADRB1","PTPRC","BTBD9","TPM2","TAGLN","LGALS3","ARID5B","PLVAP","ACKR1","MTRNR2L12","MTRNR2L8","LYZ","AIF1","MTRNR2L12","TNFSF13B","SFTPC","TIMP2","DEFB1","CCL7","LTB","TRBC2","MUC4","TACSTD2","HLA-DRB5","CD68","KRT23","GABRP","RSAD2","TNFSF10")
genes_all_marker_rankit <- c("STAC" , "TPRG1" , "AOAH" , "TBXAS1" , "TREM2" , "MCEMP1" , "SKAP1" , "ARHGAP15" , "MAGI1" , "PLA2G1B" , "GKN2" , "GPC5" , "SFTA3_ENSG00000229415" , "LDB2" , "GALNT18" , "RTKN2" , "ST6GALNAC5" , "CSRP3-AS1" , "LMNTD1" , "MS4A1" , "BLK" , "GNLY" , "SAMD3" , "KRT5" , "TP63" , "CALD1" , "HLA-DQA2" , "GPR183" , "GRHL1" , "CP" , "LAMC3" , "PDGFRB" , "ACKR1" , "RAMP3" , "PTPRB" , "VWF" , "ADH1B" , "PDZRN3" , "PLEK" , "S100A8" , "FAM3D" , "TSPAN8" , "CPA3" , "TPSAB1" , "RP11-1143G9.4" , "S100A12" , "LRRC10B" , "CFAP126" , "IL7R" , "BTG1" , "DERL3" , "MZB1" , "CD8B" , "CD8A" , "FGL2" , "MS4A6A" , "CA4" , "ADRB1" , "TPM2" , "TAGLN" , "PLVAP" , "ACKR1" , "LYZ" , "AIF1" , "DEFB1" , "CCL7" , "LTB" , "TRBC2" , "MUC4" , "TACSTD2" , "HLA-DRB5" , "CD68" , "KRT23" , "GABRP")
names(genes_all_marker_rankit) <- c("alveolar macrophage" , "alveolar macrophage" , "monocyte" , "monocyte" , "macrophage" , "macrophage" , "T cell" , "T cell" , "native cell" , "type II pneumocyte" , "type II pneumocyte" , "epithelial cell" , "epithelial cell" , "endothelial cell" , "endothelial cell" , "type I pneumocyte" , "type I pneumocyte" , "ciliated cell" , "ciliated cell" , "B cell" , "B cell" , "natural killer cell" , "natural killer cell" , "basal cell" , "basal cell" , "vascular associated smooth muscle cell" , "dendritic cell" , "dendritic cell" , "club cell" , "club cell" , "pericyte cell" , "pericyte cell" , "vein endothelial cell" , "vein endothelial cell" , "endothelial cell of artery" , "endothelial cell of artery" , "fibroblast" , "fibroblast" , "neutrophil" , "neutrophil" , "respiratory goblet cell" , "respiratory goblet cell" , "basophil" , "basophil" , "classical monocyte" , "classical monocyte" , "lung ciliated cell" , "lung ciliated cell" , "CD4-positive, alpha-beta T cell" , "CD4-positive, alpha-beta T cell" , "plasma cell" , "plasma cell" , "CD8-positive, alpha-beta T cell" , "CD8-positive, alpha-beta T cell" , "non-classical monocyte" , "non-classical monocyte" , "capillary endothelial cell" , "capillary endothelial cell" , "bronchial smooth muscle cell" , "bronchial smooth muscle cell" , "lung microvascular endothelial cell" , "lung microvascular endothelial cell" , "intermediate monocyte" , "intermediate monocyte" , "CD14-positive, CD16-negative classical monocyte" , "CD14-positive, CD16-negative classical monocyte" , "mature alpha-beta T cell" , "mature alpha-beta T cell" , "squamous epithelial cell" , "squamous epithelial cell" , "CD14-positive, CD16-positive monocyte" , "CD14-positive, CD16-positive monocyte" , "secretory cell" , "secretory cell")
genes <- unique(c(genes_all_marker_rankit, genes_marker_rankit, genes_marker_scvi, genes_immune, genes_housekeeping, genes_lungmap_paper, genes_krasnow, genes_zemin))
Now we load the expression matrices, gene and cell metadata.
data_dir <- "../data/normalized/for_notebook/"
layer_files <- file.path(data_dir,
c("all_datasets_rankit_mtx.gz", "all_datasets_scvi_mtx.gz",
"all_datasets_inmf_mtx.gz", "all_datasets_z_score_mtx.gz",
"all_datasets_X_mtx.gz")
)
exp_obj_all <- read_expression(layer_files = layer_files,
layer_names = c("rankit", "scvi", "inmf", "z_score", "raw"),
obs_file = file.path(data_dir, "all_datasets_obs.gz"),
var_file = file.path(data_dir, "all_datasets_var.gz"),
genes=genes)
And we make genes with less or equal than 2 reads be zero across all layers. This helps removing spurious expression and avoids creating bimodal distributions of expression (lower tail towards low expression)
exp_obj <- filter_genes_expression(exp_obj_all, layer = "raw", min_expression = 3)
Finally we get the cell type ordering based on the CL ontology tree and the cell types available in the data
ordered_cell_types_ontology <- get_cell_type_order(CL, unique(exp_obj$obs$cell_type_ontology_term_id))
cell_types_all <- setNames(exp_obj$obs$cell_type, exp_obj$obs$cell_type_ontology_term_id)
ordered_cell_types <- rev(cell_types_all[ordered_cell_types_ontology$ordered_nodes$name])
Let’s take a look at the tree
ordered_cell_types_ontology$onto_plot
The following is a view of a marker gene for natural killer cells: GNLY. Distributions in invididual datasets across different methods is shown.
Let’s take a look at how different normalization methods look in a dotplot using cell type ordering based on CL
Cell type hierarchical clustering
Gene hierarchical clustering
Cell type and gene hierarchical clustering
scanpy.tl.rank_genes_groupsThese marker genes were obtained with scanpy, as follows. After, the top gene with a positive fold change was used.
layer = "rankit"
# Create a mask for the cell type
adata.obs["_mask"] = adata.obs.cell_type == cell_type
adata.obs["_mask"] = adata.obs["_mask"].astype("category")
# Find top 1000 differentially expressed genes
sc.tl.rank_genes_groups(adata, groupby="_mask", n_genes=1000, layer=layer, rankby_abs=True)
Now doing hierarchical clustering of genes
Now doing hierarchical clustering of genes and cell types
Clips absolute mean to a quantile, currently set at 0.95.
Cell type hierarchical clustering
Gene hierarchical clustering
Cell type and gene hierarchical clustering
## [1] "AGER, FCN1, CCL5, PRF1, BMX, CCL23, MS4A2, RIMS2, TP63, FLT3, PAX5"
scanpy.tl.rank_genes_groupsThese marker genes were obtained with scanpy, as follows. After, the top gene with a positive fold change was used.
layer = "rankit"
# Create a mask for the cell type
adata.obs["_mask"] = adata.obs.cell_type == cell_type
adata.obs["_mask"] = adata.obs["_mask"].astype("category")
# Find top 1000 differentially expressed genes
sc.tl.rank_genes_groups(adata, groupby="_mask", n_genes=1000, layer=layer, rankby_abs=True)
Now doing hierarchical clustering of genes
Now doing hierarchical clustering of cell types
Now doing hierarchical clustering of cell types and genes
Clustering both genes and cell types with rankit data.
Clustering both genes and cell types with rankit data. And clipping colors to 0.98 quantile. All data shown further will be clipped.
Now clustering with scvi data.
Clustering only genes with rankit data, cell types are ordered based on ontology tree.
## [1] "LDB2, VWF, CA4, PTPRB, ADH1B, GALNT18, MAGI1, KRT5, TSPAN8, ADRB1, PLVAP, PDGFRB, MS4A1, ACKR1, RAMP3, GNLY, LTB, GPR183, PLEK, TBXAS1, AOAH, ARHGAP15, TPM2, CALD1, TACSTD2, S100A8, AIF1, MS4A6A, FGL2, LYZ, BTG1, IL7R, TAGLN, ST6GALNAC5, GPC5, PDZRN3, SFTA3_ENSG00000229415, TP63, LAMC3, CSRP3-AS1, LMNTD1, GKN2, PLA2G1B, KRT23, GABRP, CFAP126, LRRC10B, FAM3D, MUC4, RTKN2, SKAP1, BLK, SAMD3, TPRG1, DERL3, MZB1, CD68, DEFB1, HLA-DRB5, CCL7, HLA-DQA2, STAC, CP, GRHL1, MCEMP1, TREM2, RP11-1143G9.4, S100A12, CPA3, TPSAB1, TRBC2, CD8A, CD8B"