Normalization method comparison

source("./utils/dot_plot_functions.R", chdir = T)
source("./utils/make_ontology_ordering.R", chdir = T)
library("gridExtra")
library("kableExtra")

Load data

Let’s load CL onotlogy

CL <- get_ontology("../data/CL.obo")

Let’s define gene sets

genes_immune <- c("CD68", "CD79A", "CD8A", "FOXJ1", "GNLY", "CD4", 
           "GNLY", "CD79A", "JCHAIN", "MNDA", "MUC5B", "KTR18")
genes_housekeeping <- c("MALAT1")
genes_lungmap_paper <- c("TP63", "MYB1", "RIMS2", "FCN1", "FLT3", "PRF1", "PAX5", 
                         "MS4A2", "AGER", "STPA2", "BMX", "ACKRT1")
genes_krasnow <- c("CCL5", "CCL21", "CCL23")
genes_zemin <- c("CD3D", "GNLY", "JCHAIN", "CSF3R", "CD68", "FOXJ1", "MUC5B", "KRT5", "SPPR3")

genes_marker_rankit <- c("AOAH", "PARAL1", "SKAP1", "LGI3", "CSRP3-AS1", "MS4A1", "GNLY", "LGALS7B", "LAMC3", "ACKR1", "CD3D", "CD8B", "CCL7", "LYPD2")
genes_marker_scvi <- c("AOAH", "PARAL1", "SKAP1", "PLA2G1B", "CSRP3-AS1", "MS4A1", "GNLY", "KRT5", "LAMC3", "ACKR1", "IL7R", "CD8A", "CCL2", "MUC16")
names(genes_marker_scvi) <- names(genes_marker_rankit) <- c("monocyte", "macrophage", "T cell", "type II pneumocyte", "ciliated cell", "B cell", "natural killer cell", "basal cell", "pericyte cell", "vein endothelial cell", "CD4-positive, alpha-beta T cell", "CD8-positive, alpha-beta T cell", "CD14-positive, CD16-negative classical monocyte", "secretory cell")

genes_all_marker_rankit <- c("STAC","TPRG1","AOAH","TBXAS1","TREM2","MCEMP1","SKAP1","ARHGAP15","MAGI1","MALAT1","RPL12","RPL18A","PLA2G1B","GKN2","B2M","RPLP1","GPC5","SFTA3_ENSG00000229415","LDB2","GALNT18","RTKN2","ST6GALNAC5","CSRP3-AS1","LMNTD1","MS4A1","BLK","GNLY","SAMD3","KRT5","TP63","TMBIM4","GCHFR","CALD1","SH3BGRL3","HLA-DQA2","GPR183","GRHL1","CP","IFITM3","RPL39","ACTB","FTL","LAMC3","PDGFRB","EEF1A1","RPL10","TMSB10","FTL","ACKR1","RAMP3","PTPRB","VWF","CCL3","FMN1","ADH1B","PDZRN3","PLEK","S100A8","STARD13","NRP1","MTRNR2L12","MTRNR2L8","CCL2","TNFSF13B","IL1RN","CLMN","FAM3D","TSPAN8","CPA3","TPSAB1","RP11-1143G9.4","S100A12","LRRC10B","CFAP126","IL7R","BTG1","DERL3","MZB1","CD8B","CD8A","MTRNR2L12","DPYD","FGL2","MS4A6A","CA4","ADRB1","PTPRC","BTBD9","TPM2","TAGLN","LGALS3","ARID5B","PLVAP","ACKR1","MTRNR2L12","MTRNR2L8","LYZ","AIF1","MTRNR2L12","TNFSF13B","SFTPC","TIMP2","DEFB1","CCL7","LTB","TRBC2","MUC4","TACSTD2","HLA-DRB5","CD68","KRT23","GABRP","RSAD2","TNFSF10")

genes_all_marker_rankit <- c("STAC" , "TPRG1" , "AOAH" , "TBXAS1" , "TREM2" , "MCEMP1" , "SKAP1" , "ARHGAP15" , "MAGI1" , "PLA2G1B" , "GKN2" , "GPC5" , "SFTA3_ENSG00000229415" , "LDB2" , "GALNT18" , "RTKN2" , "ST6GALNAC5" , "CSRP3-AS1" , "LMNTD1" , "MS4A1" , "BLK" , "GNLY" , "SAMD3" , "KRT5" , "TP63" , "CALD1" , "HLA-DQA2" , "GPR183" , "GRHL1" , "CP" , "LAMC3" , "PDGFRB" , "ACKR1" , "RAMP3" , "PTPRB" , "VWF" , "ADH1B" , "PDZRN3" , "PLEK" , "S100A8" , "FAM3D" , "TSPAN8" , "CPA3" , "TPSAB1" , "RP11-1143G9.4" , "S100A12" , "LRRC10B" , "CFAP126" , "IL7R" , "BTG1" , "DERL3" , "MZB1" , "CD8B" , "CD8A" , "FGL2" , "MS4A6A" , "CA4" , "ADRB1" , "TPM2" , "TAGLN" , "PLVAP" , "ACKR1" , "LYZ" , "AIF1" , "DEFB1" , "CCL7" , "LTB" , "TRBC2" , "MUC4" , "TACSTD2" , "HLA-DRB5" , "CD68" , "KRT23" , "GABRP")

names(genes_all_marker_rankit) <- c("alveolar macrophage" , "alveolar macrophage" , "monocyte" , "monocyte" , "macrophage" , "macrophage" , "T cell" , "T cell" , "native cell" , "type II pneumocyte" , "type II pneumocyte" , "epithelial cell" , "epithelial cell" , "endothelial cell" , "endothelial cell" , "type I pneumocyte" , "type I pneumocyte" , "ciliated cell" , "ciliated cell" , "B cell" , "B cell" , "natural killer cell" , "natural killer cell" , "basal cell" , "basal cell" , "vascular associated smooth muscle cell" , "dendritic cell" , "dendritic cell" , "club cell" , "club cell" , "pericyte cell" , "pericyte cell" , "vein endothelial cell" , "vein endothelial cell" , "endothelial cell of artery" , "endothelial cell of artery" , "fibroblast" , "fibroblast" , "neutrophil" , "neutrophil" , "respiratory goblet cell" , "respiratory goblet cell" , "basophil" , "basophil" , "classical monocyte" , "classical monocyte" , "lung ciliated cell" , "lung ciliated cell" , "CD4-positive, alpha-beta T cell" , "CD4-positive, alpha-beta T cell" , "plasma cell" , "plasma cell" , "CD8-positive, alpha-beta T cell" , "CD8-positive, alpha-beta T cell" , "non-classical monocyte" , "non-classical monocyte" , "capillary endothelial cell" , "capillary endothelial cell" , "bronchial smooth muscle cell" , "bronchial smooth muscle cell" , "lung microvascular endothelial cell" , "lung microvascular endothelial cell" , "intermediate monocyte" , "intermediate monocyte" , "CD14-positive, CD16-negative classical monocyte" , "CD14-positive, CD16-negative classical monocyte" , "mature alpha-beta T cell" , "mature alpha-beta T cell" , "squamous epithelial cell" , "squamous epithelial cell" , "CD14-positive, CD16-positive monocyte" , "CD14-positive, CD16-positive monocyte" , "secretory cell" , "secretory cell")

genes <- unique(c(genes_all_marker_rankit, genes_marker_rankit, genes_marker_scvi, genes_immune, genes_housekeeping, genes_lungmap_paper, genes_krasnow, genes_zemin))

Now we load the expression matrices, gene and cell metadata.

data_dir <- "../data/normalized/for_notebook/"
layer_files <- file.path(data_dir,
                         c("all_datasets_rankit_mtx.gz", "all_datasets_scvi_mtx.gz",
                           "all_datasets_inmf_mtx.gz", "all_datasets_z_score_mtx.gz",
                           "all_datasets_X_mtx.gz")
)

exp_obj_all <- read_expression(layer_files = layer_files, 
                           layer_names =  c("rankit", "scvi", "inmf", "z_score", "raw"), 
                           obs_file = file.path(data_dir, "all_datasets_obs.gz"), 
                           var_file = file.path(data_dir, "all_datasets_var.gz"), 
                           genes=genes)

And we make genes with less or equal than 2 reads be zero across all layers. This helps removing spurious expression and avoids creating bimodal distributions of expression (lower tail towards low expression)

exp_obj <- filter_genes_expression(exp_obj_all, layer = "raw", min_expression = 3)

Finally we get the cell type ordering based on the CL ontology tree and the cell types available in the data

ordered_cell_types_ontology <- get_cell_type_order(CL, unique(exp_obj$obs$cell_type_ontology_term_id))
cell_types_all <- setNames(exp_obj$obs$cell_type, exp_obj$obs$cell_type_ontology_term_id)
ordered_cell_types <- rev(cell_types_all[ordered_cell_types_ontology$ordered_nodes$name])

Let’s take a look at the tree

ordered_cell_types_ontology$onto_plot

Method comparison

What happens at the signle gene/cell type level?

The following is a view of a marker gene for natural killer cells: GNLY. Distributions in invididual datasets across different methods is shown.

Dot plot views

Marker genes from papers

Out-of-the-box visual results

Let’s take a look at how different normalization methods look in a dotplot using cell type ordering based on CL

Out-of-the-box visual results (hierarchical clustering)

Cell type hierarchical clustering

Gene hierarchical clustering

Cell type and gene hierarchical clustering

Marker genes from `scanpy.tl.rank_genes_groups`

These marker genes were obtained with scanpy, as follows. After, the top gene with a positive fold change was used.

layer = "rankit"
# Create a mask for the cell type    
adata.obs["_mask"] = adata.obs.cell_type == cell_type    
adata.obs["_mask"] = adata.obs["_mask"].astype("category")    
    
# Find top 1000 differentially expressed genes    
sc.tl.rank_genes_groups(adata, groupby="_mask", n_genes=1000, layer=layer, rankby_abs=True)

Now doing hierarchical clustering of genes

Now doing hierarchical clustering of genes and cell types

Manual visual optimization to increase color intesity

Clips absolute mean to a quantile, currently set at 0.95.

Cell type hierarchical clustering

Gene hierarchical clustering

Cell type and gene hierarchical clustering

## [1] "AGER, FCN1, CCL5, PRF1, BMX, CCL23, MS4A2, RIMS2, TP63, FLT3, PAX5"

Subset of marker genes from `scanpy.tl.rank_genes_groups`

These marker genes were obtained with scanpy, as follows. After, the top gene with a positive fold change was used.

layer = "rankit"
# Create a mask for the cell type    
adata.obs["_mask"] = adata.obs.cell_type == cell_type    
adata.obs["_mask"] = adata.obs["_mask"].astype("category")    
    
# Find top 1000 differentially expressed genes    
sc.tl.rank_genes_groups(adata, groupby="_mask", n_genes=1000, layer=layer, rankby_abs=True)

Now doing hierarchical clustering of genes

Now doing hierarchical clustering of cell types

Now doing hierarchical clustering of cell types and genes

All markers

Clustering both genes and cell types with rankit data.

Clustering both genes and cell types with rankit data. And clipping colors to 0.98 quantile. All data shown further will be clipped.

Now clustering with scvi data.

Clustering only genes with rankit data, cell types are ordered based on ontology tree.

## [1] "LDB2, VWF, CA4, PTPRB, ADH1B, GALNT18, MAGI1, KRT5, TSPAN8, ADRB1, PLVAP, PDGFRB, MS4A1, ACKR1, RAMP3, GNLY, LTB, GPR183, PLEK, TBXAS1, AOAH, ARHGAP15, TPM2, CALD1, TACSTD2, S100A8, AIF1, MS4A6A, FGL2, LYZ, BTG1, IL7R, TAGLN, ST6GALNAC5, GPC5, PDZRN3, SFTA3_ENSG00000229415, TP63, LAMC3, CSRP3-AS1, LMNTD1, GKN2, PLA2G1B, KRT23, GABRP, CFAP126, LRRC10B, FAM3D, MUC4, RTKN2, SKAP1, BLK, SAMD3, TPRG1, DERL3, MZB1, CD68, DEFB1, HLA-DRB5, CCL7, HLA-DQA2, STAC, CP, GRHL1, MCEMP1, TREM2, RP11-1143G9.4, S100A12, CPA3, TPSAB1, TRBC2, CD8A, CD8B"

Clustering only genes with scvi data, cell types are oredered based on ontology tree.

Including a house-keeping gene

With 0.95 quantile clipping as shown above.

Agreement between SCVI and rankit

As SCVI and rankit seem to provide the best visual results it would be helpful to know whether they agree on their signal.

Performing correlations for this analysis is not desirable because most genes are lowly expressed and correlating them between any method would translate to correlating noise.

In addition, “where’s my gene” is concerned with finding highly expressed genes. Thus an agreement comparison should only consider highly expressed genes per cell type.

One way to perform such comparisons is by looking at the overlap of the top N cell types per gene between the two methods.

Below there a table is shown for the number of shared cell types per gene among the top 5 cell types between rankit and scvi.

intersection among top 5 cell types	counts
0	1
1	1
2	5
3	17
4	11
5	3

intersection among top 5 genes	counts
1	1
2	14
3	17
4	18
5	7

Normalization method comparison – Multidataset

Pablo Garcia-Nieto

11/23/2021

Load data

Method comparison

What happens at the signle gene/cell type level?

Dot plot views

Marker genes from papers

Out-of-the-box visual results

Out-of-the-box visual results (hierarchical clustering)

Marker genes from `scanpy.tl.rank_genes_groups`

Manual visual optimization to increase color intesity

Subset of marker genes from `scanpy.tl.rank_genes_groups`

All markers

Including a house-keeping gene

Agreement between SCVI and rankit

Normalization method comparison – Multidataset

Pablo Garcia-Nieto

11/23/2021

Load data

Method comparison

What happens at the signle gene/cell type level?

Dot plot views

Marker genes from papers

Out-of-the-box visual results

Out-of-the-box visual results (hierarchical clustering)

Marker genes from scanpy.tl.rank_genes_groups

Manual visual optimization to increase color intesity

Subset of marker genes from scanpy.tl.rank_genes_groups

All markers

Including a house-keeping gene

Agreement between SCVI and rankit

Marker genes from `scanpy.tl.rank_genes_groups`

Subset of marker genes from `scanpy.tl.rank_genes_groups`