suppressPackageStartupMessages({
library(matrixStats)
library(clusterProfiler)
library(org.Hs.eg.db)
library(magrittr)
})
recount2PCAmodel is a matrix of avgLoading vectors built from recount2 data
(studies with >= 50 samples). Top 20 PCs from each study were clustered and averaged.
recount2PLIERmodel is PLIER model output from PLIER::PLIER fucntion. All the
recount2 data (including even a few samples per study) were used with the prior
knowledge from KEGG_2019_Human data.
recount2PCAmodel = readRDS("/data2/Genomic_Super_Signature/multiPCA/data/avgLoadingRecount2.rds")
recound_KEGG_model = readRDS("/data2/Genomic_Super_Signature/PLIER/data/recount_KEGG_PLIER_model.RDS")
recount2PLIERmodel = recound_KEGG_model$Z
dim(recount2PCAmodel)
## [1] 6750 118
dim(recount2PLIERmodel)
## [1] 7619 394
The Pearson correlation between these two Z matrices were calculated.
cm = intersect(rownames(recount2PCAmodel), rownames(recount2PLIERmodel))
recount2.cor = cor(recount2PCAmodel[cm,], recount2PLIERmodel[cm,], method = c("pearson"))
colnames(recount2.cor) = paste0("LV_", c(1:ncol(recount2.cor)))
cut_off = 0.5
recount2.cor.subset = recount2.cor[rowMaxs(abs(recount2.cor), na.rm = TRUE) > cut_off,
colMaxs(abs(recount2.cor), na.rm = TRUE) > cut_off,
drop = FALSE]
corLoadingPCA = rownames(recount2.cor.subset)
corLoadingPLIER = colnames(recount2.cor.subset)
We found that 42 loadings from recount2PCAmodel
and 31 loadings from recount2PLIERmodel are
correlatd with each other with the correlation coefficiency >= 0.5.
This matching of loding vectors between Z matrices are not happening 1:1 ratio.
corrplot::corrplot(recount2.cor.subset, order = "original",
# method = "number",
tl.col = "black", tl.cex = 0.7,
mar = c(0,0,1,0))
Venn diagram of LVs and PCclusters. Overlapping part is defined as Pearson correlation
coefficient > 0.5. Because the correlation pairing is not 1:1 between
Z matrices, in the below Venn diagram, the size of the overlappting part is based on recount2PCAmodel.
library(VennDiagram)
grid.newpage()
draw.pairwise.venn(area1 = ncol(recount2PCAmodel),
area2 = ncol(recount2PLIERmodel),
cross.area = nrow(recount2.cor.subset),
category = c("perPCA_model", "megaPLIER_model"),
lty = rep("blank", 2),
fill = c("light blue", "pink"),
alpha = rep(0.5, 2),
cat.pos = c(0, 0),
cat.dist = rep(0.025, 2))
## (polygon[GRID.polygon.11], polygon[GRID.polygon.12], polygon[GRID.polygon.13], polygon[GRID.polygon.14], text[GRID.text.15], text[GRID.text.16], text[GRID.text.17], text[GRID.text.18], text[GRID.text.19])
Summary table of the most correlated pairs of loading vectors based on Pearson correlation.
loadingPair = data.frame(matrix(ncol = 2, nrow = 0))
colnames(loadingPair) = c("PCA model", "PLIER model")
for (i in 1:nrow(recount2.cor.subset)) {
j = which.max(recount2.cor.subset[i,])
loadingPair[i,1] = rownames(recount2.cor.subset)[i]
loadingPair[i,2] = colnames(recount2.cor.subset)[j]
}
head(loadingPair)
## PCA model PLIER model
## 1 Cl118_01 (15/15) LV_36
## 2 Cl118_03 (9/9) LV_65
## 3 Cl118_06 (31/31) LV_339
## 4 Cl118_07 (35/35) LV_1
## 5 Cl118_08 (12/12) LV_90
## 6 Cl118_13 (11/11) LV_292
42 PCclusters are associated with 24 LVs, and 31 LVs are associated with 23 PCclusters.
KEGG_2019_Human geneset was used earlier as a prior knowledge of PLIER::PLIER,
so PCclusters from perData loadings are ranked with the same geneset.
gmt.file = "/data2/Genomic_Super_Signature/raw_data/KEGG_2019_Human.txt"
# KEGG_2019_Human
set2gene = read.gmt(gmt.file)
colnames(set2gene)[1] = c("name")
# Assign 'id' for each pathway
pathwayId = data.frame(name = unique(set2gene$name), id = paste0("KEGG2019_", seq_along(unique(set2gene$name))))
set2gene = dplyr::full_join(set2gene, pathwayId, by = "name")
# Inputs for `clusterProfiler::GSEA`
TERM2GENE = set2gene %>% dplyr::select(id, gene)
TERM2NAME = set2gene %>% dplyr::select(id, name)
numPathways = 10
z = recount2PCAmodel
set.seed(1234)
topLoadings_recount2PCAmodel = makeGeneList(corLoadingPCA, z) %>% gseaLoading(., TERM2GENE, TERM2NAME, pvalueCutoff = 0.5)
topPathways_recount2PCAmodel = topPathways(topLoadings_recount2PCAmodel, numPathways)
topPathways_recount2PCAmodel[,1:2]
## Cl118_01 (15/15)
## Up_1 Influenza A
## Up_2 Hepatitis C
## Up_3 Measles
## Up_4 NOD-like receptor signaling pathway
## Up_5 Epstein-Barr virus infection
## Up_6 Herpes simplex virus 1 infection
## Up_7 Cytosolic DNA-sensing pathway
## Up_8 Systemic lupus erythematosus
## Up_9 Pyrimidine metabolism
## Up_10 Proteasome
## Down_1 Parathyroid hormone synthesis, secretion and action
## Down_2 Regulation of actin cytoskeleton
## Down_3 Ribosome
## Down_4 T cell receptor signaling pathway
## Down_5 Adrenergic signaling in cardiomyocytes
## Down_6 Vascular smooth muscle contraction
## Down_7 GnRH signaling pathway
## Down_8 Platelet activation
## Down_9 Choline metabolism in cancer
## Down_10 Rap1 signaling pathway
## Cl118_03 (9/9)
## Up_1 Lysosome
## Up_2 Legionellosis
## Up_3 Pertussis
## Up_4 Rheumatoid arthritis
## Up_5 Phagosome
## Up_6 Drug metabolism
## Up_7 Salmonella infection
## Up_8 Oxidative phosphorylation
## Up_9 Amoebiasis
## Up_10 Tuberculosis
## Down_1 Ribosome
## Down_2 RIG-I-like receptor signaling pathway
## Down_3 Hepatitis C
## Down_4 Measles
## Down_5 ABC transporters
## Down_6 Cellular senescence
## Down_7 VEGF signaling pathway
## Down_8 FoxO signaling pathway
## Down_9 T cell receptor signaling pathway
## Down_10 Epstein-Barr virus infection
multiPLIER.summary = recound_KEGG_model$summary
topPathways_recount2PLIERmodel = list()
for (LV in unique(multiPLIER.summary$`LV index`)) {
LVname = paste0("LV_", LV)
pathwayInd = which(multiPLIER.summary$`LV index` == LV)
topPathways_recount2PLIERmodel[[LVname]] = multiPLIER.summary$pathway[pathwayInd]
}
# the number of pathways for each LV
LVPathwayNum = sapply(topPathways_recount2PLIERmodel, function(x) {length(x)})
head(LVPathwayNum)
## LV_1 LV_2 LV_3 LV_4 LV_5 LV_6
## 91 35 35 30 28 18
LVPathwayList = sapply(topPathways_recount2PLIERmodel, function(x) {head(x)})
head(LVPathwayList, 3)
## $LV_1
## [1] "ABC transporters"
## [2] "AMPK signaling pathway"
## [3] "Alanine, aspartate and glutamate metabolism"
## [4] "Alzheimer disease"
## [5] "Amino sugar and nucleotide sugar metabolism"
## [6] "Aminoacyl-tRNA biosynthesis"
##
## $LV_2
## [1] "Aminoacyl-tRNA biosynthesis" "Basal transcription factors"
## [3] "Cell cycle" "Citrate cycle (TCA cycle)"
## [5] "Cysteine and methionine metabolism" "DNA replication"
##
## $LV_3
## [1] "Adrenergic signaling in cardiomyocytes"
## [2] "Alanine, aspartate and glutamate metabolism"
## [3] "Alcoholism"
## [4] "Aldosterone synthesis and secretion"
## [5] "Alzheimer disease"
## [6] "Amyotrophic lateral sclerosis (ALS)"
Here, I printed out some of the pathways of highly correlated loading pairs from perData PCclusters and multiPLIER LVs.
k = nrow(loadingPair) # for all pairs
k = 2 # Let's check only the two pairs
for (i in 1:k) {
PCclusterName = loadingPair$`PCA model`[i]
LVName = loadingPair$`PLIER model`[i]
# PCcluster
perData_pathways = topPathways_recount2PCAmodel[[PCclusterName]] %>%
as.data.frame(., row.names = c(paste0("Up_", 1:10), paste0("Down_", 1:10)))
# remove the loading that's corrleated but doesn't have enriched pathway
if (nrow(perData_pathways) == 0) {
print(paste("No enrichment result from", PCclusterName))
break
}
colnames(perData_pathways) = PCclusterName
# LV that's highly correlated to PCcluster
LVind = gsub("LV_", "", LVName)
multiPLIER_pathways = multiPLIER.summary[which(multiPLIER.summary$`LV index` == LVind),]
print(perData_pathways)
print(multiPLIER_pathways)
}
## Cl118_01 (15/15)
## Up_1 Influenza A
## Up_2 Hepatitis C
## Up_3 Measles
## Up_4 NOD-like receptor signaling pathway
## Up_5 Epstein-Barr virus infection
## Up_6 Herpes simplex virus 1 infection
## Up_7 Cytosolic DNA-sensing pathway
## Up_8 Systemic lupus erythematosus
## Up_9 Pyrimidine metabolism
## Up_10 Proteasome
## Down_1 Parathyroid hormone synthesis, secretion and action
## Down_2 Regulation of actin cytoskeleton
## Down_3 Ribosome
## Down_4 T cell receptor signaling pathway
## Down_5 Adrenergic signaling in cardiomyocytes
## Down_6 Vascular smooth muscle contraction
## Down_7 GnRH signaling pathway
## Down_8 Platelet activation
## Down_9 Choline metabolism in cancer
## Down_10 Rap1 signaling pathway
## pathway LV index AUC p-value
## 748 Cytokine-cytokine receptor interaction 36 0.5733501 1.868229e-02
## 749 Epstein-Barr virus infection 36 0.7545356 6.049556e-09
## 750 Hepatitis C 36 0.7401197 3.011455e-07
## 751 Herpes simplex virus 1 infection 36 0.5376563 8.738719e-02
## 752 Influenza A 36 0.7245609 1.524735e-06
## 753 NOD-like receptor signaling pathway 36 0.7192433 6.487636e-07
## 754 Phagosome 36 0.5232982 3.285635e-01
## 755 RIG-I-like receptor signaling pathway 36 0.9181102 2.485356e-09
## FDR
## 748 3.667638e-02
## 749 9.719324e-08
## 750 2.950686e-06
## 751 1.320477e-01
## 752 1.238493e-05
## 753 5.931165e-06
## 754 4.004518e-01
## 755 4.525419e-08
## Cl118_03 (9/9)
## Up_1 Lysosome
## Up_2 Legionellosis
## Up_3 Pertussis
## Up_4 Rheumatoid arthritis
## Up_5 Phagosome
## Up_6 Drug metabolism
## Up_7 Salmonella infection
## Up_8 Oxidative phosphorylation
## Up_9 Amoebiasis
## Up_10 Tuberculosis
## Down_1 Ribosome
## Down_2 RIG-I-like receptor signaling pathway
## Down_3 Hepatitis C
## Down_4 Measles
## Down_5 ABC transporters
## Down_6 Cellular senescence
## Down_7 VEGF signaling pathway
## Down_8 FoxO signaling pathway
## Down_9 T cell receptor signaling pathway
## Down_10 Epstein-Barr virus infection
## pathway LV index AUC
## 1022 Antigen processing and presentation 65 0.7191015
## 1023 Chemokine signaling pathway 65 0.6842200
## 1024 Cytokine-cytokine receptor interaction 65 0.5138591
## 1025 Endocytosis 65 0.7160690
## 1026 Fc gamma R-mediated phagocytosis 65 0.7371418
## 1027 Hematopoietic cell lineage 65 0.6681468
## 1028 Legionellosis 65 0.5870519
## 1029 Leishmaniasis 65 0.8031311
## 1030 Leukocyte transendothelial migration 65 0.5879973
## 1031 Lysosome 65 0.8221714
## 1032 NOD-like receptor signaling pathway 65 0.8027526
## 1033 Necroptosis 65 0.8088238
## 1034 Osteoclast differentiation 65 0.8238136
## 1035 Parathyroid hormone synthesis, secretion and action 65 0.6368513
## 1036 Phagosome 65 0.5743568
## 1037 Platelet activation 65 0.5974071
## 1038 Staphylococcus aureus infection 65 0.6913810
## 1039 Transcriptional misregulation in cancer 65 0.5464550
## 1040 Tuberculosis 65 0.8250780
## p-value FDR
## 1022 2.535200e-03 6.796825e-03
## 1023 3.105191e-06 2.355848e-05
## 1024 3.352764e-01 4.069883e-01
## 1025 1.246358e-09 2.453416e-08
## 1026 2.400230e-05 1.331092e-04
## 1027 3.271577e-03 8.363897e-03
## 1028 1.213906e-01 1.739495e-01
## 1029 2.397827e-05 1.331092e-04
## 1030 4.764568e-02 7.977457e-02
## 1031 9.474290e-11 2.688484e-09
## 1032 2.662381e-13 1.817907e-11
## 1033 9.572196e-13 5.652770e-11
## 1034 3.203878e-11 1.060678e-09
## 1035 5.638720e-03 1.337742e-02
## 1036 6.277074e-02 9.982102e-02
## 1037 2.691374e-02 4.954213e-02
## 1038 7.176856e-03 1.642035e-02
## 1039 1.302222e-01 1.834530e-01
## 1040 6.212076e-14 4.680478e-12
Next, I compared Z matrices using their annotation.
x = topPathways_recount2PCAmodel
y = topPathways_recount2PLIERmodel
res = data.frame(row.names = colnames(x))
for (i in seq_along(x)) {
for (j in seq_along(y)) {
res[i,j] = length(intersect(x[[i]], y[[j]]))
colnames(res)[j] = names(y)[j]
}
}
This is the heatmap of all the loading pairs.
This is the subset of the loading pairs with more than 80% shared pathways based on loadings with a fewer number of pathways assigned.
# new empty data frame with row and column as same as `res`
ind = data.frame(matrix(NA, ncol = ncol(res), nrow = nrow(res)),
row.names = rownames(res))
colnames(ind) = colnames(res)
# logical of each pair with more than 80% of shared pathways
for (i in seq_along(LVPathwayNum)) {
if (LVPathwayNum[i] >= 10) {
k = round(numPathways * 0.8) # more than 80% of the PCcluster pathways are overlap
ind[,i] = res[,i] > k
} else if (LVPathwayNum[i] < 10) { # in case, fewer pathways are assigned to LV (than PCcluster)
k = round(LVPathwayNum[i] * 0.8) # more than 80% of the LV pathways are overlap
ind[,i] = res[,i] > k
}
}
subset = res[which(rowSums(ind) != 0),
which(colSums(ind) != 0)]
heatmap(as.matrix(subset))
Here is the actual number of shared pathways between loadings.
corrplot::corrplot(as.matrix(subset),
method = "color",
addCoef.col = "white",
tl.srt = 45, tl.cex = 0.8, tl.col = "black",
col=colorRampPalette(c("blue","light yellow","red"))(200),
# col=hcl.colors(12, "YlOrRd", rev = TRUE),
is.corr = FALSE,
mar = c(0,0,1,0))