Normalization method comparison

source("./utils/dot_plot_functions.R", chdir = T)
source("./utils/make_ontology_ordering.R", chdir = T)
library("gridExtra")
library("kableExtra")

Load data

Let’s load CL onotlogy

CL <- get_ontology("../data/CL.obo")

Let’s define gene sets

genes_immune <- c("CD68", "CD79A", "CD8A", "FOXJ1", "GNLY", "CD4", 
           "GNLY", "CD79A", "JCHAIN", "MNDA", "MUC5B", "KTR18")
genes_housekeeping <- c("MALAT1")
genes_lungmap_paper <- c("TP63", "MYB1", "RIMS2", "FCN1", "FLT3", "PRF1", "PAX5", 
                         "MS4A2", "AGER", "STPA2", "BMX", "ACKRT1")
genes_krasnow <- c("CCL5", "CCL21", "CCL23")
genes_zemin <- c("CD3D", "GNLY", "JCHAIN", "CSF3R", "CD68", "FOXJ1", "MUC5B", "KRT5", "SPPR3")

genes_marker_rankit <- c("AOAH", "PARAL1", "SKAP1", "LGI3", "CSRP3-AS1", "MS4A1", "GNLY", "LGALS7B", "LAMC3", "ACKR1", "CD3D", "CD8B", "CCL7", "LYPD2")
genes_marker_scvi <- c("AOAH", "PARAL1", "SKAP1", "PLA2G1B", "CSRP3-AS1", "MS4A1", "GNLY", "KRT5", "LAMC3", "ACKR1", "IL7R", "CD8A", "CCL2", "MUC16")
names(genes_marker_scvi) <- names(genes_marker_rankit) <- c("monocyte", "macrophage", "T cell", "type II pneumocyte", "ciliated cell", "B cell", "natural killer cell", "basal cell", "pericyte cell", "vein endothelial cell", "CD4-positive, alpha-beta T cell", "CD8-positive, alpha-beta T cell", "CD14-positive, CD16-negative classical monocyte", "secretory cell")

genes_all_marker_rankit <- c("STAC","TPRG1","AOAH","TBXAS1","TREM2","MCEMP1","SKAP1","ARHGAP15","MAGI1","MALAT1","RPL12","RPL18A","PLA2G1B","GKN2","B2M","RPLP1","GPC5","SFTA3_ENSG00000229415","LDB2","GALNT18","RTKN2","ST6GALNAC5","CSRP3-AS1","LMNTD1","MS4A1","BLK","GNLY","SAMD3","KRT5","TP63","TMBIM4","GCHFR","CALD1","SH3BGRL3","HLA-DQA2","GPR183","GRHL1","CP","IFITM3","RPL39","ACTB","FTL","LAMC3","PDGFRB","EEF1A1","RPL10","TMSB10","FTL","ACKR1","RAMP3","PTPRB","VWF","CCL3","FMN1","ADH1B","PDZRN3","PLEK","S100A8","STARD13","NRP1","MTRNR2L12","MTRNR2L8","CCL2","TNFSF13B","IL1RN","CLMN","FAM3D","TSPAN8","CPA3","TPSAB1","RP11-1143G9.4","S100A12","LRRC10B","CFAP126","IL7R","BTG1","DERL3","MZB1","CD8B","CD8A","MTRNR2L12","DPYD","FGL2","MS4A6A","CA4","ADRB1","PTPRC","BTBD9","TPM2","TAGLN","LGALS3","ARID5B","PLVAP","ACKR1","MTRNR2L12","MTRNR2L8","LYZ","AIF1","MTRNR2L12","TNFSF13B","SFTPC","TIMP2","DEFB1","CCL7","LTB","TRBC2","MUC4","TACSTD2","HLA-DRB5","CD68","KRT23","GABRP","RSAD2","TNFSF10")

genes_all_marker_rankit <- c("STAC" , "TPRG1" , "AOAH" , "TBXAS1" , "TREM2" , "MCEMP1" , "SKAP1" , "ARHGAP15" , "MAGI1" , "PLA2G1B" , "GKN2" , "GPC5" , "SFTA3_ENSG00000229415" , "LDB2" , "GALNT18" , "RTKN2" , "ST6GALNAC5" , "CSRP3-AS1" , "LMNTD1" , "MS4A1" , "BLK" , "GNLY" , "SAMD3" , "KRT5" , "TP63" , "CALD1" , "HLA-DQA2" , "GPR183" , "GRHL1" , "CP" , "LAMC3" , "PDGFRB" , "ACKR1" , "RAMP3" , "PTPRB" , "VWF" , "ADH1B" , "PDZRN3" , "PLEK" , "S100A8" , "FAM3D" , "TSPAN8" , "CPA3" , "TPSAB1" , "RP11-1143G9.4" , "S100A12" , "LRRC10B" , "CFAP126" , "IL7R" , "BTG1" , "DERL3" , "MZB1" , "CD8B" , "CD8A" , "FGL2" , "MS4A6A" , "CA4" , "ADRB1" , "TPM2" , "TAGLN" , "PLVAP" , "ACKR1" , "LYZ" , "AIF1" , "DEFB1" , "CCL7" , "LTB" , "TRBC2" , "MUC4" , "TACSTD2" , "HLA-DRB5" , "CD68" , "KRT23" , "GABRP")

names(genes_all_marker_rankit) <- c("alveolar macrophage" , "alveolar macrophage" , "monocyte" , "monocyte" , "macrophage" , "macrophage" , "T cell" , "T cell" , "native cell" , "type II pneumocyte" , "type II pneumocyte" , "epithelial cell" , "epithelial cell" , "endothelial cell" , "endothelial cell" , "type I pneumocyte" , "type I pneumocyte" , "ciliated cell" , "ciliated cell" , "B cell" , "B cell" , "natural killer cell" , "natural killer cell" , "basal cell" , "basal cell" , "vascular associated smooth muscle cell" , "dendritic cell" , "dendritic cell" , "club cell" , "club cell" , "pericyte cell" , "pericyte cell" , "vein endothelial cell" , "vein endothelial cell" , "endothelial cell of artery" , "endothelial cell of artery" , "fibroblast" , "fibroblast" , "neutrophil" , "neutrophil" , "respiratory goblet cell" , "respiratory goblet cell" , "basophil" , "basophil" , "classical monocyte" , "classical monocyte" , "lung ciliated cell" , "lung ciliated cell" , "CD4-positive, alpha-beta T cell" , "CD4-positive, alpha-beta T cell" , "plasma cell" , "plasma cell" , "CD8-positive, alpha-beta T cell" , "CD8-positive, alpha-beta T cell" , "non-classical monocyte" , "non-classical monocyte" , "capillary endothelial cell" , "capillary endothelial cell" , "bronchial smooth muscle cell" , "bronchial smooth muscle cell" , "lung microvascular endothelial cell" , "lung microvascular endothelial cell" , "intermediate monocyte" , "intermediate monocyte" , "CD14-positive, CD16-negative classical monocyte" , "CD14-positive, CD16-negative classical monocyte" , "mature alpha-beta T cell" , "mature alpha-beta T cell" , "squamous epithelial cell" , "squamous epithelial cell" , "CD14-positive, CD16-positive monocyte" , "CD14-positive, CD16-positive monocyte" , "secretory cell" , "secretory cell")

genes <- unique(c(genes_all_marker_rankit, genes_marker_rankit, genes_marker_scvi, genes_immune, genes_housekeeping, genes_lungmap_paper, genes_krasnow, genes_zemin))

Now we load the expression matrices, gene and cell metadata.

data_dir <- "../data/normalized/for_notebook/"
layer_files <- file.path(data_dir,
                         c("all_datasets_rankit_mtx.gz", "all_datasets_scvi_mtx.gz",
                           "all_datasets_inmf_mtx.gz", "all_datasets_z_score_mtx.gz",
                           "all_datasets_X_mtx.gz")
)

exp_obj_all <- read_expression(layer_files = layer_files, 
                           layer_names =  c("rankit", "scvi", "inmf", "z_score", "raw"), 
                           obs_file = file.path(data_dir, "all_datasets_obs.gz"), 
                           var_file = file.path(data_dir, "all_datasets_var.gz"), 
                           genes=genes)

And we make genes with less or equal than 2 reads be zero across all layers. This helps removing spurious expression and avoids creating bimodal distributions of expression (lower tail towards low expression)

exp_obj <- filter_genes_expression(exp_obj_all, layer = "raw", min_expression = 3)

Finally we get the cell type ordering based on the CL ontology tree and the cell types available in the data

ordered_cell_types_ontology <- get_cell_type_order(CL, unique(exp_obj$obs$cell_type_ontology_term_id))
cell_types_all <- setNames(exp_obj$obs$cell_type, exp_obj$obs$cell_type_ontology_term_id)
ordered_cell_types <- rev(cell_types_all[ordered_cell_types_ontology$ordered_nodes$name])

Let’s take a look at the tree

ordered_cell_types_ontology$onto_plot

print(ordered_cell_types_ontology$ordered_nodes)

##                  name             label    x    y
## CL:0000003 CL:0000003       Native Cell 1389 1580
## CL:0000084 CL:0000084            T Cell  207  601
## CL:0000791 CL:0000791 Mature Alpha beta   99  322
## CL:0000624 CL:0000624     CD4 positive,   27  182
## CL:0001044 CL:0001044      Effector CD4   99   42
## CL:0000814 CL:0000814  Mature NK T Cell   99  182
## CL:0000625 CL:0000625     CD8 positive,  243  182
## CL:0001050 CL:0001050      Effector CD8  207   42
## CL:0000800 CL:0000800      Mature Gamma  234  322
## CL:0000236 CL:0000236            B Cell  315  461
## CL:0000786 CL:0000786       Plasma Cell  387  322
## CL:0000623 CL:0000623    Natural Killer  459  322
## CL:0000451 CL:0000451    Dendritic Cell  547  741
## CL:0000784 CL:0000784      Plasmacytoid  477  601
## CL:0000576 CL:0000576          Monocyte  767  741
## CL:0000860 CL:0000860         Classical  693  601
## CL:0002057 CL:0002057    CD14 positive,  693  461
## CL:0002397 CL:0002397    CD14 positive,  765  461
## CL:0000875 CL:0000875     Non classical  837  601
## CL:0002393 CL:0002393      Intermediate  909  601
## CL:0000235 CL:0000235        Macrophage  621  741
## CL:0000583 CL:0000583          Alveolar  567  461
## CL:0000767 CL:0000767          Basophil  981  601
## CL:0000775 CL:0000775        Neutrophil 1053  601
## CL:0000097 CL:0000097         Mast Cell 1125  601
## CL:0000595 CL:0000595         Enucleate 1197  601
## CL:0000646 CL:0000646        Basal Cell 1207  741
## CL:0000077 CL:0000077  Mesothelial Cell 1317  881
## CL:0000115 CL:0000115  Endothelial Cell 1389  881
## CL:0000071 CL:0000071      Blood Vessel 1292  601
## CL:0002543 CL:0002543  Vein Endothelial 1220  461
## CL:1000413 CL:1000413  Endothelial Cell 1292  461
## CL:0002144 CL:0002144         Capillary 1364  461
## CL:0002138 CL:0002138  Endothelial Cell 1389  601
## CL:2000016 CL:2000016              Lung 1489  461
## CL:0000556 CL:0000556     Megakaryocyte 1108  741
## CL:0000066 CL:0000066   Epithelial Cell 1677 1160
## CL:0000076 CL:0000076          Squamous 1317 1021
## CL:0017000 CL:0017000         Pulmonary 1461  881
## CL:0002062 CL:0002062 Type I Pneumocyte 1681  601
## CL:0002063 CL:0002063           Type II 2007  601
## CL:1000223 CL:1000223              Lung 2125  601
## CL:0000158 CL:0000158         Club Cell 1827  322
## CL:0000160 CL:0000160       Goblet Cell 1885  601
## CL:0002370 CL:0002370       Respiratory 1885  461
## CL:0000138 CL:0000138       Chondrocyte 1796  601
## CL:0000057 CL:0000057        Fibroblast 2041 1021
## CL:0002503 CL:0002503  Adventitial Cell 2223 1021
## CL:0000669 CL:0000669     Pericyte Cell 2439 1021
## CL:0000186 CL:0000186     Myofibroblast 2011 1160
## CL:0000192 CL:0000192     Smooth Muscle 2371  881
## CL:0000359 CL:0000359          Vascular 2352  741
## CL:0002598 CL:0002598  Bronchial Smooth 2424  601
## CL:0000064 CL:0000064     Ciliated Cell 1856 1440
## CL:1000271 CL:1000271     Lung Ciliated 1865 1300
## CL:0000151 CL:0000151    Secretory Cell 1897 1021
## CL:1000331 CL:1000331    Serous Cell Of 2197  461

Method comparison

What happens at the signle gene/cell type level?

The following is a view of a marker gene for natural killer cells: GNLY. Distributions in invididual datasets across different methods is shown.

Dot plot views

Marker genes from papers

Out-of-the-box visual results

Let’s take a look at how different normalization methods look in a dotplot using cell type ordering based on CL

Out-of-the-box visual results (hierarchical clustering)

Cell type hierarchical clustering

Gene hierarchical clustering

Cell type and gene hierarchical clustering

Marker genes from `scanpy.tl.rank_genes_groups`

These marker genes were obtained with scanpy, as follows. After, the top gene with a positive fold change was used.

layer = "rankit"
# Create a mask for the cell type    
adata.obs["_mask"] = adata.obs.cell_type == cell_type    
adata.obs["_mask"] = adata.obs["_mask"].astype("category")    
    
# Find top 1000 differentially expressed genes    
sc.tl.rank_genes_groups(adata, groupby="_mask", n_genes=1000, layer=layer, rankby_abs=True)

Now doing hierarchical clustering of genes

Now doing hierarchical clustering of genes and cell types

Manual visual optimization to increase color intesity

Clips absolute mean to a quantile, currently set at 0.95.

Cell type hierarchical clustering

Gene hierarchical clustering

Cell type and gene hierarchical clustering

## [1] "AGER, FCN1, CCL5, PRF1, BMX, CCL23, MS4A2, RIMS2, TP63, FLT3, PAX5"

Subset of marker genes from `scanpy.tl.rank_genes_groups`

These marker genes were obtained with scanpy, as follows. After, the top gene with a positive fold change was used.

layer = "rankit"
# Create a mask for the cell type    
adata.obs["_mask"] = adata.obs.cell_type == cell_type    
adata.obs["_mask"] = adata.obs["_mask"].astype("category")    
    
# Find top 1000 differentially expressed genes    
sc.tl.rank_genes_groups(adata, groupby="_mask", n_genes=1000, layer=layer, rankby_abs=True)

Now doing hierarchical clustering of genes

Now doing hierarchical clustering of cell types

Now doing hierarchical clustering of cell types and genes

All markers

Clustering both genes and cell types with rankit data.

Clustering both genes and cell types with rankit data. And clipping colors to 0.98 quantile. All data shown further will be clipped.

Now clustering with scvi data.

Clustering only genes with rankit data, cell types are ordered based on ontology tree.

## [1] "LDB2, VWF, CA4, PTPRB, ADH1B, GALNT18, MAGI1, KRT5, TSPAN8, ADRB1, PLVAP, PDGFRB, MS4A1, ACKR1, RAMP3, GNLY, LTB, GPR183, PLEK, TBXAS1, AOAH, ARHGAP15, TPM2, CALD1, TACSTD2, S100A8, AIF1, MS4A6A, FGL2, LYZ, BTG1, IL7R, TAGLN, ST6GALNAC5, GPC5, PDZRN3, SFTA3_ENSG00000229415, TP63, LAMC3, CSRP3-AS1, LMNTD1, GKN2, PLA2G1B, KRT23, GABRP, CFAP126, LRRC10B, FAM3D, MUC4, RTKN2, SKAP1, BLK, SAMD3, TPRG1, DERL3, MZB1, CD68, DEFB1, HLA-DRB5, CCL7, HLA-DQA2, STAC, CP, GRHL1, MCEMP1, TREM2, RP11-1143G9.4, S100A12, CPA3, TPSAB1, TRBC2, CD8A, CD8B"

Clustering only genes with scvi data, cell types are oredered based on ontology tree.

Including a house-keeping gene

With 0.95 quantile clipping as shown above.

Agreement between SCVI and rankit

As SCVI and rankit seem to provide the best visual results it would be helpful to know whether they agree on their signal.

Performing correlations for this analysis is not desirable because most genes are lowly expressed and correlating them between any method would translate to correlating noise.

In addition, “where’s my gene” is concerned with finding highly expressed genes. Thus an agreement comparison should only consider highly expressed genes per cell type.

One way to perform such comparisons is by looking at the overlap of the top N cell types per gene between the two methods.

Below there a table is shown for the number of shared cell types per gene among the top 5 cell types between rankit and scvi.

intersection among top 5 cell types	counts
0	1
1	1
2	5
3	17
4	11
5	3

intersection among top 5 genes	counts
1	1
2	14
3	17
4	18
5	7

Normalization method comparison – Multidataset

Pablo Garcia-Nieto

11/23/2021

Load data

Method comparison

What happens at the signle gene/cell type level?

Dot plot views

Marker genes from papers

Out-of-the-box visual results

Out-of-the-box visual results (hierarchical clustering)

Marker genes from `scanpy.tl.rank_genes_groups`

Manual visual optimization to increase color intesity

Subset of marker genes from `scanpy.tl.rank_genes_groups`

All markers

Including a house-keeping gene

Agreement between SCVI and rankit

Normalization method comparison – Multidataset

Pablo Garcia-Nieto

11/23/2021

Load data

Method comparison

What happens at the signle gene/cell type level?

Dot plot views

Marker genes from papers

Out-of-the-box visual results

Out-of-the-box visual results (hierarchical clustering)

Marker genes from scanpy.tl.rank_genes_groups

Manual visual optimization to increase color intesity

Subset of marker genes from scanpy.tl.rank_genes_groups

All markers

Including a house-keeping gene

Agreement between SCVI and rankit

Marker genes from `scanpy.tl.rank_genes_groups`

Subset of marker genes from `scanpy.tl.rank_genes_groups`