This markdown is for individual case-control study based on Barycenter and Wasserstein distance estimation on the gene differential expression (DE) analysis of certain cell type based on the idiopathic pulmonary fibrosis.
R packages required in this pipeline:
Here we present a demo version of our real data analysis,We take the basic parameters, i.e., the mean, dispersion and dropout parameter from the distribution of the real reference scRNAseq database from paper Single-cell RNA sequencing identifies diverse roles of epithelial cells in idiopathic pulmonary fibrosis.
We focus on the cell type SCGB3A2+ and limit our analysis in the first 1000 genes.The trimmed file are at the data/IPF folder.
Senario settings in our analysis are as follows:
###########input###############
meta=readRDS(paste0(dir,"IPF/meta.rds"))
#input phenotype
cur_individual=unique(meta$individual)
#input bulk
rawM_bulk=readRDS(paste0(dir,"IPF/matrix_bulk.rds"))
rawM_cell=readRDS(paste0(dir,"IPF/ind_cellnum.rds"))
rawM_bulk=t(apply(rawM_bulk,1,function(x) x*100/rawM_cell))
colnames(rawM_bulk)=names(rawM_cell)
logcount_data_bulk=log2(rawM_bulk+1)
#input counts
rawM=readRDS(paste0(dir,"IPF/rawM.rds"))
logcount_data=log2(rawM+1)
dim(logcount_data)## [1] 1000 3220
## [1] 3220 15
meta_ind=meta[match(unique(meta$individual),meta$individual),c("individual","diagnosis")]
rdp=readRDS(paste0(dir,"IPF/ind_readdepth.rds"))
meta_ind$rdp=scale(rdp)
meta_ind$individual=scale(as.numeric(meta_ind$individual))
meta_ind$diagnosis=meta_ind$diagnosisAfter read in the data, we apply inverse probability weight to remove the covariance effects.
Then we run our method for that.
dim(logcount_data)
logcount_data[1:10,1:10]
cell_id=meta$cell
gene_id=dimnames(logcount_data)[[1]]
rownames(logcount_data)=gene_id
colnames(logcount_data)=cell_id
op_pvalw=matrix(ncol=1,nrow=nrow(logcount_data))
bc_case_ob=list()
bc_ctrl_ob=list()
bcw_case_ob=list()
bcw_ctrl_ob=list()
#for(ig in 1:nrow(logcount_data)){
for(ig in 1:10){
bc_case_ob[[ig]]=list()
bc_ctrl_ob[[ig]]=list()
bcw_case_ob[[ig]]=list()
bcw_ctrl_ob[[ig]]=list()
cur_sim=logcount_data[ig,]
cur_ind=meta$individual
cur_pheno=meta$diagnosis
tempa=tryCatch({cal_w2_pval(count_per_gene=cur_sim,meta_individual=cur_ind,meta_phenotype=cur_pheno,perm_num=200,unif_round_unit=0.5,l_fold=lambda_fold,merge_method=merge_method,weight=ipww,shrink=FALSE)}, error = function(e) {NA} )
op_pvalw[ig]=tryCatch({tempa[[1]]}, error = function(e) {NA} )
bcw_case_ob[[ig]]=tryCatch({tempa[[2]]}, error = function(e) {NA} )
bcw_ctrl_ob[[ig]]=tryCatch({tempa[[3]]}, error = function(e) {NA} )
if(!(is.na(op_pvalw[ig])) && op_pvalw[ig]<2/200){
op_pvalw[ig]=tryCatch({cal_w2_pval(count_per_gene=cur_sim,meta_individual=cur_ind,meta_phenotype=cur_pheno,perm_num=1000,unif_round_unit=0.5,l_fold=lambda_fold,merge_method=merge_method,weight=ipww,shrink=FALSE)[[1]]}, error = function(e) {NA} )}
if(!(is.na(op_pvalw[ig])) && op_pvalw[ig]<2/1000){
op_pvalw[ig]=tryCatch({cal_w2_pval(count_per_gene=cur_sim,meta_individual=cur_ind,meta_phenotype=cur_pheno,perm_num=5000,unif_round_unit=0.5,l_fold=lambda_fold,merge_method=merge_method,weight=ipww,shrink=FALSE)[[1]]}, error = function(e) {NA} )}
}
names(op_pvalw)=gene_idOnce got the p-values, we can present our results as follows.
We’ve done our the analysis based on the top 3000 expressed genes among all clusters, and use FDR method multi-effect correction. Then we will do some data analysis and visualization.
n_cluster=31
case_col="#00539B"
ctrl_col="#a3d6f0" #"#4B9CD3"
set.seed(1)
library(ggplot2)
library(stringr)
library(scales)
#geom_split_violin function
GeomSplitViolin <- ggproto("GeomSplitViolin", GeomViolin,
draw_group = function(self, data, ..., draw_quantiles = NULL) {
data <- transform(data, xminv = x - violinwidth * (x - xmin), xmaxv = x + violinwidth * (xmax - x))
grp <- data[1, "group"]
newdata <- plyr::arrange(transform(data, x = if (grp %% 2 == 1) xminv else xmaxv), if (grp %% 2 == 1) y else -y)
newdata <- rbind(newdata[1, ], newdata, newdata[nrow(newdata), ], newdata[1, ])
newdata[c(1, nrow(newdata) - 1, nrow(newdata)), "x"] <- round(newdata[1, "x"])
if (length(draw_quantiles) > 0 & !scales::zero_range(range(data$y))) {
stopifnot(all(draw_quantiles >= 0), all(draw_quantiles <=
1))
quantiles <- ggplot2:::create_quantile_segment_frame(data, draw_quantiles)
aesthetics <- data[rep(1, nrow(quantiles)), setdiff(names(data), c("x", "y")), drop = FALSE]
aesthetics$alpha <- rep(1, nrow(quantiles))
both <- cbind(quantiles, aesthetics)
quantile_grob <- GeomPath$draw_panel(both, ...)
ggplot2:::ggname("geom_split_violin", grid::grobTree(GeomPolygon$draw_panel(newdata, ...), quantile_grob))
}
else {
ggplot2:::ggname("geom_split_violin", GeomPolygon$draw_panel(newdata, ...))
}
})
geom_split_violin <- function(mapping = NULL, data = NULL, stat = "ydensity", position = "identity", ...,
draw_quantiles = NULL, trim = TRUE, scale = "area", na.rm = FALSE,
show.legend = NA, inherit.aes = TRUE) {
layer(data = data, mapping = mapping, stat = stat, geom = GeomSplitViolin,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(trim = trim, scale = scale, draw_quantiles = draw_quantiles, na.rm = na.rm, ...))
}
t_col = function(color, percent = 60, name = NULL) {
# color = color name
# percent = % transparency
# name = an optional name for the color
## Get RGB values for named color
rgb.val <- col2rgb(color)
## Make new color using input color as base and alpha set by transparency
t.col <- rgb(rgb.val[1], rgb.val[2], rgb.val[3],
max = 255,
alpha = (100 - percent) * 255 / 100,
names = name)
## Save the color
invisible(t.col)
}#read_in
opw_padj=readRDS(paste0(dir,"IPF/opw_padj.rds"))
gene_id=names(opw_padj[[1]])
tmeta=readRDS(paste0(dir,"IPF/tmeta.rds"))
cell_num_table=readRDS(paste0(dir,"IPF/cell_num_table.rds"))
cluster=unique(tmeta$cluster)
total_individual=unique(tmeta$individual)
cur_cluster="Proliferating Epithelial Cells" #Proliferating Epithelial Cells #Plasma Cells #Endothelial Cells
cluster=unique(tmeta$celltype)
cluster_plot=gsub("Proliferating","P.",cluster)
cluster_plot=gsub("Differentiating","Diff.",cluster_plot)
cluster_plot=gsub("Transitional","Trans.",cluster_plot)
cluster_plot=gsub("+ SCGB1A","/",cluster_plot)pval_num=sapply(opw_padj,function(x){sum(x<=0.1,na.rm=TRUE)}) #adjusted FDR <0.1
population=tmeta$population[match(cluster,tmeta$cluster)]
sub_cluster_index=which(population=="Epithelial")
diagnose=tmeta$Status[match(total_individual,tmeta$individual)]
cell_num_table=cell_num_table[sub_cluster_index,]
pval_num=pval_num[sub_cluster_index]
porder=order(pval_num,decreasing = TRUE)
this_table=data.frame(cell_num=as.numeric(cell_num_table),
cluster=factor(rep(cluster_plot[sub_cluster_index],times=ncol(cell_num_table))
,levels=cluster_plot[sub_cluster_index][porder]),
Diagnose=factor(rep(diagnose,each=nrow(cell_num_table)),levels=c("ILD","Control")))
this_table$cur_col=rep(case_col,nrow(this_table))
this_table$cur_col[this_table$diagnose=="Control"]=ctrl_col
this_table=this_table[this_table$cell_num>0,]
#violin plot cell number
cols = hue_pal()(length(unique(this_table$cluster)))
cols2=sapply(cols,t_col)
p1=ggplot(this_table, aes(x=cluster, y=cell_num, fill=Diagnose)) +
#scale_fill_manual(values=hue_pal()(2)) +
scale_fill_manual(values=c(case_col,ctrl_col)) +
labs(title="Number of Epithelial Cells per Subject",x=" ", y = "Number of Cells", tag = " ")+
#geom_split_violin(trim=FALSE)+#geom_jitter(shape=16, position=position_jitter(0.2))+
geom_boxplot()+ scale_y_continuous(trans='log10') +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
#legend.position = "none",
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(p1)this_table=data.frame(pval_num=pval_num,
cluster=factor(cluster_plot[sub_cluster_index],
levels=cluster_plot[sub_cluster_index][porder]))
p2=ggplot(this_table, aes(x=cluster, y=pval_num,fill=1)) +
labs(title="Number of DEGs in Epithelial Cells",x=" ", y = "Number of DEGs", tag = " ")+
geom_bar(stat="identity")+ #geom_jitter(shape=16, position=position_jitter(0.2))+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
legend.position = "none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(p2)##Gene set enrichment analysis
We also perform gene set enrichment analysis using the GOSeq package.For manually annotation, we download gencode.v37.annotation.gtf from here.
p3=ggplot(go_table, aes(x=label, y=nlog10pval,fill=cluster)) +
labs(title="GO: DEGs Enriched Pathways",x=" ", y = "-log10 Enrichment q-values", tag = " ")+
geom_bar(stat="identity")+ #geom_jitter(shape=16, position=position_jitter(0.2))+
coord_flip()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
legend.position = "none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(p3)op=par(mfrow=c(5,4),oma=c(0,0,0,0),mar=c(3,3,3,0),cex.lab=2,cex.axis=2,cex.main=2)
#focus on "SCGB3A2+","AT2","Basal","MUC5B+","AT1"
for(i_cluster in c(7,11,2,8,12)){
count=0 #initiate
sig_names=matrix(ncol=1,nrow=0)
cur_cluster=as.character(unique(tmeta$cluster)[i_cluster])
meta=tmeta[tmeta$cluster==cur_cluster,]
meta$diagnosis=as.numeric(meta$diagnosis=="Control")
cur_individual=unique(meta$individual)
cur_cluster=cluster[i_cluster]
cur_pval=NA
cur_padj=NA
cur_pval=readRDS(paste0(dir,"IPF/op_pval/op_pvalw.rds"))[[i_cluster]]
names(cur_pval)=gene_id
names(cur_pval)=gsub("ABR_ENSG00000159842","ABR",names(cur_pval))
cur_padj=p.adjust(cur_pval,method="fdr")
bcw_case=readRDS(paste0(dir,"IPF/op_pval/bcw_case.rds"))[[i_cluster]]
bcw_ctrl=readRDS(paste0(dir,"IPF/op_pval/bcw_ctrl.rds"))[[i_cluster]]
cur_n=100
sig_index_opw=c(order(cur_pval,decreasing = FALSE))[1:sum(cur_padj<=0.1,na.rm = TRUE)]
mt_index=grep("^MT",names(cur_pval)) #remove MT genes
sig_index_opw=setdiff(sig_index_opw,mt_index)
###plot
for(i_sig in c(sig_index_opw)){
cur_case=bcw_case[[i_sig]]
cur_ctrl=bcw_ctrl[[i_sig]]
case_dx=as.numeric(names(cur_case))
case_dy=as.numeric(cur_case)
ctrl_dx=as.numeric(names(cur_ctrl))
ctrl_dy=as.numeric(cur_ctrl)
if(is.na(match(names(cur_pval)[i_sig],sig_names)) &&
(sum(case_dy[case_dx>=1]) + sum(ctrl_dy[ctrl_dx>=1])) >=0.5){ #total proportion >=1 more than 40%
plot(case_dx, case_dy, type = "l",xlab="",ylab="",
xlim=c(0,max(case_dx,ctrl_dx)),ylim=c(0,max(case_dy,ctrl_dy)),
main=gene_id[i_sig],frame.plot = FALSE, axes.y=FALSE)#, axes=FALSE)
lines(ctrl_dx, ctrl_dy, type = "l")
polygon(c(case_dx,rev(case_dx)), c(case_dy,rep(0,length(case_dy))), col=t_col(case_col,percent = 20), border=NA)
polygon(c(ctrl_dx,rev(ctrl_dx)), c(ctrl_dy,rep(0,length(ctrl_dy))), col=t_col(ctrl_col,percent = 50), border=NA)
count=count+1
sig_names=c(sig_names,names(cur_pval)[i_sig])
#print(c(names(cur_pval)[i_sig],cluster[i_cluster],count))
}
if(count==4){break}
}
}## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] GO.db_3.12.1 org.Hs.eg.db_3.12.0
## [3] AnnotationDbi_1.52.0 biomaRt_2.46.3
## [5] goseq_1.42.0 geneLenDataBase_1.26.0
## [7] BiasedUrn_1.07 scales_1.1.1
## [9] stringr_1.4.0 ggplot2_3.3.3
## [11] ipw_1.0-11 reticulate_1.18
## [13] doRNG_1.8.2 rngtools_1.5
## [15] foreach_1.5.1 abind_1.4-5
## [17] DESeq2_1.30.1 lme4_1.1-26
## [19] Matrix_1.3-2 MAST_1.16.0
## [21] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [23] Biobase_2.50.0 GenomicRanges_1.42.0
## [25] GenomeInfoDb_1.26.2 IRanges_2.24.1
## [27] S4Vectors_0.28.1 BiocGenerics_0.36.0
## [29] MatrixGenerics_1.2.1 matrixStats_0.58.0
## [31] moments_0.14 emdbook_1.3.12
## [33] MASS_7.3-53.1 knitr_1.31
##
## loaded via a namespace (and not attached):
## [1] backports_1.2.1 BiocFileCache_1.14.0 plyr_1.8.6
## [4] splines_4.0.3 BiocParallel_1.24.1 digest_0.6.27
## [7] htmltools_0.5.1.1 fansi_0.4.2 magrittr_2.0.1
## [10] memoise_2.0.0 Biostrings_2.58.0 annotate_1.68.0
## [13] askpass_1.1 bdsmatrix_1.3-4 prettyunits_1.1.1
## [16] colorspace_2.0-0 blob_1.2.1 rappdirs_0.3.3
## [19] xfun_0.21 dplyr_1.0.5 crayon_1.4.1
## [22] RCurl_1.98-1.2 jsonlite_1.7.2 genefilter_1.72.1
## [25] survival_3.2-7 iterators_1.0.13 glue_1.4.2
## [28] geepack_1.3-2 gtable_0.3.0 zlibbioc_1.36.0
## [31] XVector_0.30.0 DelayedArray_0.16.2 mvtnorm_1.1-1
## [34] DBI_1.1.1 Rcpp_1.0.6 xtable_1.8-4
## [37] progress_1.2.2 bit_4.0.4 httr_1.4.2
## [40] RColorBrewer_1.1-2 ellipsis_0.3.1 pkgconfig_2.0.3
## [43] XML_3.99-0.5 farver_2.1.0 nnet_7.3-15
## [46] sass_0.3.1 dbplyr_2.1.0 locfit_1.5-9.4
## [49] utf8_1.1.4 tidyselect_1.1.0 labeling_0.4.2
## [52] rlang_0.4.10 reshape2_1.4.4 munsell_0.5.0
## [55] tools_4.0.3 cachem_1.0.4 generics_0.1.0
## [58] RSQLite_2.2.3 broom_0.7.5 evaluate_0.14
## [61] fastmap_1.1.0 yaml_2.2.1 bit64_4.0.5
## [64] purrr_0.3.4 nlme_3.1-152 xml2_1.3.2
## [67] compiler_4.0.3 curl_4.3 tibble_3.1.0
## [70] statmod_1.4.35 geneplotter_1.68.0 bslib_0.2.4
## [73] stringi_1.5.3 highr_0.8 GenomicFeatures_1.42.1
## [76] lattice_0.20-41 nloptr_1.2.2.2 vctrs_0.3.6
## [79] pillar_1.5.1 lifecycle_1.0.0 jquerylib_0.1.3
## [82] data.table_1.14.0 bitops_1.0-6 rtracklayer_1.50.0
## [85] R6_2.5.0 codetools_0.2-18 boot_1.3-27
## [88] assertthat_0.2.1 openssl_1.4.3 withr_2.4.1
## [91] GenomicAlignments_1.26.0 Rsamtools_2.6.0 GenomeInfoDbData_1.2.4
## [94] mgcv_1.8-34 hms_1.0.0 grid_4.0.3
## [97] tidyr_1.1.3 coda_0.19-4 minqa_1.2.4
## [100] rmarkdown_2.7 bbmle_1.0.23.1 numDeriv_2016.8-1.1