1 Introduction

This markdown is for individual case-control study based on Barycenter and Wasserstein distance estimation on the gene differential expression (DE) analysis of certain cell type based on the idiopathic pulmonary fibrosis.

2 Initial Setup

R packages required in this pipeline:

library("MASS")
library("emdbook")
library("moments")
library("MAST")
library("lme4")
library("DESeq2")
library("abind")
library("doRNG")
library("reticulate")
library("BSDE")

3 Computation

Here we present a demo version of our real data analysis,We take the basic parameters, i.e., the mean, dispersion and dropout parameter from the distribution of the real reference scRNAseq database from paper Single-cell RNA sequencing identifies diverse roles of epithelial cells in idiopathic pulmonary fibrosis.

We focus on the cell type SCGB3A2+ and limit our analysis in the first 1000 genes.The trimmed file are at the data/IPF folder.

3.1 Basic Settings

Senario settings in our analysis are as follows:

#senarios
covariate_flag=NA #c(NA, "quantile99")
perm_label=0 #perm_label =0 means calculate the observed data other wise, permutated data
merge_method="python" # c("R","python","python_exp")
lambda_fold=2


dir = "~/Dropbox/Barycenter/BSDEpackage/BSDE/data/"
setwd(dir)

3.2 Data Preparation

###########input###############
meta=readRDS(paste0(dir,"IPF/meta.rds"))

#input phenotype
cur_individual=unique(meta$individual)

#input bulk
rawM_bulk=readRDS(paste0(dir,"IPF/matrix_bulk.rds"))
rawM_cell=readRDS(paste0(dir,"IPF/ind_cellnum.rds"))
rawM_bulk=t(apply(rawM_bulk,1,function(x) x*100/rawM_cell)) 
colnames(rawM_bulk)=names(rawM_cell)
logcount_data_bulk=log2(rawM_bulk+1)

#input counts
rawM=readRDS(paste0(dir,"IPF/rawM.rds"))
logcount_data=log2(rawM+1)
dim(logcount_data)

## [1] 1000 3220

dim(meta)

## [1] 3220   15

meta_ind=meta[match(unique(meta$individual),meta$individual),c("individual","diagnosis")]

rdp=readRDS(paste0(dir,"IPF/ind_readdepth.rds"))

meta_ind$rdp=scale(rdp)
meta_ind$individual=scale(as.numeric(meta_ind$individual))
meta_ind$diagnosis=meta_ind$diagnosis

After read in the data, we apply inverse probability weight to remove the covariance effects.

Then we run our method for that.

dim(logcount_data)
logcount_data[1:10,1:10]
cell_id=meta$cell
gene_id=dimnames(logcount_data)[[1]]
rownames(logcount_data)=gene_id
colnames(logcount_data)=cell_id

op_pvalw=matrix(ncol=1,nrow=nrow(logcount_data))

bc_case_ob=list()
bc_ctrl_ob=list()
bcw_case_ob=list()
bcw_ctrl_ob=list()
#for(ig in 1:nrow(logcount_data)){
for(ig in 1:10){
  
  bc_case_ob[[ig]]=list()
  bc_ctrl_ob[[ig]]=list()
  bcw_case_ob[[ig]]=list()
  bcw_ctrl_ob[[ig]]=list()
  
  cur_sim=logcount_data[ig,]
  cur_ind=meta$individual
  cur_pheno=meta$diagnosis
  
  tempa=tryCatch({cal_w2_pval(count_per_gene=cur_sim,meta_individual=cur_ind,meta_phenotype=cur_pheno,perm_num=200,unif_round_unit=0.5,l_fold=lambda_fold,merge_method=merge_method,weight=ipww,shrink=FALSE)}, error = function(e) {NA} )
  op_pvalw[ig]=tryCatch({tempa[[1]]}, error = function(e) {NA} )
  bcw_case_ob[[ig]]=tryCatch({tempa[[2]]}, error = function(e) {NA} )
  bcw_ctrl_ob[[ig]]=tryCatch({tempa[[3]]}, error = function(e) {NA} )
  if(!(is.na(op_pvalw[ig])) && op_pvalw[ig]<2/200){
    op_pvalw[ig]=tryCatch({cal_w2_pval(count_per_gene=cur_sim,meta_individual=cur_ind,meta_phenotype=cur_pheno,perm_num=1000,unif_round_unit=0.5,l_fold=lambda_fold,merge_method=merge_method,weight=ipww,shrink=FALSE)[[1]]}, error = function(e) {NA} )}
  if(!(is.na(op_pvalw[ig])) && op_pvalw[ig]<2/1000){
    op_pvalw[ig]=tryCatch({cal_w2_pval(count_per_gene=cur_sim,meta_individual=cur_ind,meta_phenotype=cur_pheno,perm_num=5000,unif_round_unit=0.5,l_fold=lambda_fold,merge_method=merge_method,weight=ipww,shrink=FALSE)[[1]]}, error = function(e) {NA} )}
}

names(op_pvalw)=gene_id

4 Result Visualization

Once got the p-values, we can present our results as follows.

4.1 Data Preparation

We’ve done our the analysis based on the top 3000 expressed genes among all clusters, and use FDR method multi-effect correction. Then we will do some data analysis and visualization.

n_cluster=31
case_col="#00539B"
ctrl_col="#a3d6f0" #"#4B9CD3"

set.seed(1)
library(ggplot2)
library(stringr)
library(scales)

#geom_split_violin function
GeomSplitViolin <- ggproto("GeomSplitViolin", GeomViolin, 
                           draw_group = function(self, data, ..., draw_quantiles = NULL) {
                             data <- transform(data, xminv = x - violinwidth * (x - xmin), xmaxv = x + violinwidth * (xmax - x))
                             grp <- data[1, "group"]
                             newdata <- plyr::arrange(transform(data, x = if (grp %% 2 == 1) xminv else xmaxv), if (grp %% 2 == 1) y else -y)
                             newdata <- rbind(newdata[1, ], newdata, newdata[nrow(newdata), ], newdata[1, ])
                             newdata[c(1, nrow(newdata) - 1, nrow(newdata)), "x"] <- round(newdata[1, "x"])
                             
                             if (length(draw_quantiles) > 0 & !scales::zero_range(range(data$y))) {
                               stopifnot(all(draw_quantiles >= 0), all(draw_quantiles <=
                                                                         1))
                               quantiles <- ggplot2:::create_quantile_segment_frame(data, draw_quantiles)
                               aesthetics <- data[rep(1, nrow(quantiles)), setdiff(names(data), c("x", "y")), drop = FALSE]
                               aesthetics$alpha <- rep(1, nrow(quantiles))
                               both <- cbind(quantiles, aesthetics)
                               quantile_grob <- GeomPath$draw_panel(both, ...)
                               ggplot2:::ggname("geom_split_violin", grid::grobTree(GeomPolygon$draw_panel(newdata, ...), quantile_grob))
                             }
                             else {
                               ggplot2:::ggname("geom_split_violin", GeomPolygon$draw_panel(newdata, ...))
                             }
                           })

geom_split_violin <- function(mapping = NULL, data = NULL, stat = "ydensity", position = "identity", ..., 
                              draw_quantiles = NULL, trim = TRUE, scale = "area", na.rm = FALSE, 
                              show.legend = NA, inherit.aes = TRUE) {
  layer(data = data, mapping = mapping, stat = stat, geom = GeomSplitViolin, 
        position = position, show.legend = show.legend, inherit.aes = inherit.aes, 
        params = list(trim = trim, scale = scale, draw_quantiles = draw_quantiles, na.rm = na.rm, ...))
}

t_col = function(color, percent = 60, name = NULL) {
  #      color = color name
  #    percent = % transparency
  #       name = an optional name for the color
  
  ## Get RGB values for named color
  rgb.val <- col2rgb(color)
  
  ## Make new color using input color as base and alpha set by transparency
  t.col <- rgb(rgb.val[1], rgb.val[2], rgb.val[3],
               max = 255,
               alpha = (100 - percent) * 255 / 100,
               names = name)
  
  ## Save the color
  invisible(t.col)
}

#read_in
opw_padj=readRDS(paste0(dir,"IPF/opw_padj.rds"))
gene_id=names(opw_padj[[1]])
tmeta=readRDS(paste0(dir,"IPF/tmeta.rds"))
cell_num_table=readRDS(paste0(dir,"IPF/cell_num_table.rds"))
cluster=unique(tmeta$cluster)
total_individual=unique(tmeta$individual)

cur_cluster="Proliferating Epithelial Cells" #Proliferating Epithelial Cells  #Plasma Cells #Endothelial Cells
cluster=unique(tmeta$celltype)
cluster_plot=gsub("Proliferating","P.",cluster)
cluster_plot=gsub("Differentiating","Diff.",cluster_plot)
cluster_plot=gsub("Transitional","Trans.",cluster_plot)
cluster_plot=gsub("+ SCGB1A","/",cluster_plot)

4.2 Boxplot of the number of cells per subjects.

pval_num=sapply(opw_padj,function(x){sum(x<=0.1,na.rm=TRUE)}) #adjusted FDR <0.1

population=tmeta$population[match(cluster,tmeta$cluster)]
sub_cluster_index=which(population=="Epithelial")

diagnose=tmeta$Status[match(total_individual,tmeta$individual)]
cell_num_table=cell_num_table[sub_cluster_index,]
pval_num=pval_num[sub_cluster_index]
porder=order(pval_num,decreasing = TRUE)
this_table=data.frame(cell_num=as.numeric(cell_num_table),
                      cluster=factor(rep(cluster_plot[sub_cluster_index],times=ncol(cell_num_table))
                      ,levels=cluster_plot[sub_cluster_index][porder]),
                      Diagnose=factor(rep(diagnose,each=nrow(cell_num_table)),levels=c("ILD","Control")))
this_table$cur_col=rep(case_col,nrow(this_table))
this_table$cur_col[this_table$diagnose=="Control"]=ctrl_col

this_table=this_table[this_table$cell_num>0,]
#violin plot cell number
cols = hue_pal()(length(unique(this_table$cluster)))
cols2=sapply(cols,t_col)

p1=ggplot(this_table, aes(x=cluster, y=cell_num, fill=Diagnose)) +
  #scale_fill_manual(values=hue_pal()(2)) +
  scale_fill_manual(values=c(case_col,ctrl_col)) +
  labs(title="Number of Epithelial Cells per Subject",x=" ", y = "Number of Cells", tag = " ")+
  #geom_split_violin(trim=FALSE)+#geom_jitter(shape=16, position=position_jitter(0.2))+
  geom_boxplot()+ scale_y_continuous(trans='log10') +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        #legend.position = "none",
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(p1)

4.3 bar plot of number of differentially expressed genes per subjects.

this_table=data.frame(pval_num=pval_num,
                      cluster=factor(cluster_plot[sub_cluster_index],
                      levels=cluster_plot[sub_cluster_index][porder]))

p2=ggplot(this_table, aes(x=cluster, y=pval_num,fill=1)) +
  labs(title="Number of DEGs in Epithelial Cells",x=" ", y = "Number of DEGs", tag = " ")+
  geom_bar(stat="identity")+ #geom_jitter(shape=16, position=position_jitter(0.2))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        legend.position = "none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(p2)

##Gene set enrichment analysis

We also perform gene set enrichment analysis using the GOSeq package.For manually annotation, we download gencode.v37.annotation.gtf from here.

p3=ggplot(go_table, aes(x=label, y=nlog10pval,fill=cluster)) +
  labs(title="GO: DEGs Enriched Pathways",x=" ", y = "-log10 Enrichment q-values", tag = " ")+
  geom_bar(stat="identity")+ #geom_jitter(shape=16, position=position_jitter(0.2))+
  coord_flip()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        legend.position = "none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(p3)

4.4 Illustration of the differentially expressed genes

op=par(mfrow=c(5,4),oma=c(0,0,0,0),mar=c(3,3,3,0),cex.lab=2,cex.axis=2,cex.main=2)
#focus on "SCGB3A2+","AT2","Basal","MUC5B+","AT1"
for(i_cluster in c(7,11,2,8,12)){ 
  count=0 #initiate
  sig_names=matrix(ncol=1,nrow=0)
  
  cur_cluster=as.character(unique(tmeta$cluster)[i_cluster])
  meta=tmeta[tmeta$cluster==cur_cluster,]
  meta$diagnosis=as.numeric(meta$diagnosis=="Control")
  cur_individual=unique(meta$individual)
  
  cur_cluster=cluster[i_cluster]
  cur_pval=NA
  cur_padj=NA
  
  cur_pval=readRDS(paste0(dir,"IPF/op_pval/op_pvalw.rds"))[[i_cluster]]
  names(cur_pval)=gene_id
  names(cur_pval)=gsub("ABR_ENSG00000159842","ABR",names(cur_pval))
  cur_padj=p.adjust(cur_pval,method="fdr")
  
  bcw_case=readRDS(paste0(dir,"IPF/op_pval/bcw_case.rds"))[[i_cluster]]
  bcw_ctrl=readRDS(paste0(dir,"IPF/op_pval/bcw_ctrl.rds"))[[i_cluster]]
  
  cur_n=100
  
  sig_index_opw=c(order(cur_pval,decreasing = FALSE))[1:sum(cur_padj<=0.1,na.rm = TRUE)]
  
  mt_index=grep("^MT",names(cur_pval)) #remove MT genes
  sig_index_opw=setdiff(sig_index_opw,mt_index)
  
  ###plot 
  for(i_sig in c(sig_index_opw)){
    
    cur_case=bcw_case[[i_sig]]
    cur_ctrl=bcw_ctrl[[i_sig]]
    
    case_dx=as.numeric(names(cur_case))
    case_dy=as.numeric(cur_case)
    ctrl_dx=as.numeric(names(cur_ctrl))
    ctrl_dy=as.numeric(cur_ctrl)
    
    if(is.na(match(names(cur_pval)[i_sig],sig_names)) &&
       (sum(case_dy[case_dx>=1]) + sum(ctrl_dy[ctrl_dx>=1])) >=0.5){ #total proportion >=1 more than 40%
      
      plot(case_dx, case_dy, type = "l",xlab="",ylab="",
           xlim=c(0,max(case_dx,ctrl_dx)),ylim=c(0,max(case_dy,ctrl_dy)),
           main=gene_id[i_sig],frame.plot = FALSE, axes.y=FALSE)#, axes=FALSE)
      lines(ctrl_dx, ctrl_dy, type = "l")
      polygon(c(case_dx,rev(case_dx)), c(case_dy,rep(0,length(case_dy))), col=t_col(case_col,percent = 20), border=NA)
      polygon(c(ctrl_dx,rev(ctrl_dx)), c(ctrl_dy,rep(0,length(ctrl_dy))), col=t_col(ctrl_col,percent = 50), border=NA)
      
      count=count+1
      sig_names=c(sig_names,names(cur_pval)[i_sig])
      #print(c(names(cur_pval)[i_sig],cluster[i_cluster],count))
    }
    if(count==4){break}
  }
}

par(op)

5 Session Information

sessionInfo()

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets
## [8] methods   base
##
## other attached packages:
##  [1] GO.db_3.12.1                org.Hs.eg.db_3.12.0
##  [3] AnnotationDbi_1.52.0        biomaRt_2.46.3
##  [5] goseq_1.42.0                geneLenDataBase_1.26.0
##  [7] BiasedUrn_1.07              scales_1.1.1
##  [9] stringr_1.4.0               ggplot2_3.3.3
## [11] ipw_1.0-11                  reticulate_1.18
## [13] doRNG_1.8.2                 rngtools_1.5
## [15] foreach_1.5.1               abind_1.4-5
## [17] DESeq2_1.30.1               lme4_1.1-26
## [19] Matrix_1.3-2                MAST_1.16.0
## [21] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [23] Biobase_2.50.0              GenomicRanges_1.42.0
## [25] GenomeInfoDb_1.26.2         IRanges_2.24.1
## [27] S4Vectors_0.28.1            BiocGenerics_0.36.0
## [29] MatrixGenerics_1.2.1        matrixStats_0.58.0
## [31] moments_0.14                emdbook_1.3.12
## [33] MASS_7.3-53.1               knitr_1.31
##
## loaded via a namespace (and not attached):
##   [1] backports_1.2.1          BiocFileCache_1.14.0     plyr_1.8.6
##   [4] splines_4.0.3            BiocParallel_1.24.1      digest_0.6.27
##   [7] htmltools_0.5.1.1        fansi_0.4.2              magrittr_2.0.1
##  [10] memoise_2.0.0            Biostrings_2.58.0        annotate_1.68.0
##  [13] askpass_1.1              bdsmatrix_1.3-4          prettyunits_1.1.1
##  [16] colorspace_2.0-0         blob_1.2.1               rappdirs_0.3.3
##  [19] xfun_0.21                dplyr_1.0.5              crayon_1.4.1
##  [22] RCurl_1.98-1.2           jsonlite_1.7.2           genefilter_1.72.1
##  [25] survival_3.2-7           iterators_1.0.13         glue_1.4.2
##  [28] geepack_1.3-2            gtable_0.3.0             zlibbioc_1.36.0
##  [31] XVector_0.30.0           DelayedArray_0.16.2      mvtnorm_1.1-1
##  [34] DBI_1.1.1                Rcpp_1.0.6               xtable_1.8-4
##  [37] progress_1.2.2           bit_4.0.4                httr_1.4.2
##  [40] RColorBrewer_1.1-2       ellipsis_0.3.1           pkgconfig_2.0.3
##  [43] XML_3.99-0.5             farver_2.1.0             nnet_7.3-15
##  [46] sass_0.3.1               dbplyr_2.1.0             locfit_1.5-9.4
##  [49] utf8_1.1.4               tidyselect_1.1.0         labeling_0.4.2
##  [52] rlang_0.4.10             reshape2_1.4.4           munsell_0.5.0
##  [55] tools_4.0.3              cachem_1.0.4             generics_0.1.0
##  [58] RSQLite_2.2.3            broom_0.7.5              evaluate_0.14
##  [61] fastmap_1.1.0            yaml_2.2.1               bit64_4.0.5
##  [64] purrr_0.3.4              nlme_3.1-152             xml2_1.3.2
##  [67] compiler_4.0.3           curl_4.3                 tibble_3.1.0
##  [70] statmod_1.4.35           geneplotter_1.68.0       bslib_0.2.4
##  [73] stringi_1.5.3            highr_0.8                GenomicFeatures_1.42.1
##  [76] lattice_0.20-41          nloptr_1.2.2.2           vctrs_0.3.6
##  [79] pillar_1.5.1             lifecycle_1.0.0          jquerylib_0.1.3
##  [82] data.table_1.14.0        bitops_1.0-6             rtracklayer_1.50.0
##  [85] R6_2.5.0                 codetools_0.2-18         boot_1.3-27
##  [88] assertthat_0.2.1         openssl_1.4.3            withr_2.4.1
##  [91] GenomicAlignments_1.26.0 Rsamtools_2.6.0          GenomeInfoDbData_1.2.4
##  [94] mgcv_1.8-34              hms_1.0.0                grid_4.0.3
##  [97] tidyr_1.1.3              coda_0.19-4              minqa_1.2.4
## [100] rmarkdown_2.7            bbmle_1.0.23.1           numDeriv_2016.8-1.1

Real Data Analysis in BSDE Projects

Mengqi Zhang, Richard Guo

2021-03-25