suppressPackageStartupMessages({
  library(matrixStats)
  library(clusterProfiler)
  library(org.Hs.eg.db)
  library(magrittr)
})

1 Load Z matrices

recount2PCAmodel is a matrix of avgLoading vectors built from recount2 data (studies with >= 50 samples). Top 20 PCs from each study were clustered and averaged.

recount2PLIERmodel is PLIER model output from PLIER::PLIER fucntion. All the recount2 data (including even a few samples per study) were used with the prior knowledge from KEGG_2019_Human data.

recount2PCAmodel = readRDS("/data2/Genomic_Super_Signature/multiPCA/data/avgLoadingRecount2.rds")
recound_KEGG_model = readRDS("/data2/Genomic_Super_Signature/PLIER/data/recount_KEGG_PLIER_model.RDS")
recount2PLIERmodel = recound_KEGG_model$Z

dim(recount2PCAmodel)

## [1] 6750  118

dim(recount2PLIERmodel)

## [1] 7619  394

2 Pearson Correlation

The Pearson correlation between these two Z matrices were calculated.

cm = intersect(rownames(recount2PCAmodel), rownames(recount2PLIERmodel))
recount2.cor = cor(recount2PCAmodel[cm,], recount2PLIERmodel[cm,], method = c("pearson"))
colnames(recount2.cor) = paste0("LV_", c(1:ncol(recount2.cor)))

cut_off = 0.5
recount2.cor.subset = recount2.cor[rowMaxs(abs(recount2.cor), na.rm = TRUE) > cut_off, 
                                   colMaxs(abs(recount2.cor), na.rm = TRUE) > cut_off,
                                   drop = FALSE] 

corLoadingPCA = rownames(recount2.cor.subset)
corLoadingPLIER = colnames(recount2.cor.subset)

We found that 42 loadings from recount2PCAmodel and 31 loadings from recount2PLIERmodel are correlatd with each other with the correlation coefficiency >= 0.5. This matching of loding vectors between Z matrices are not happening 1:1 ratio.

2.1 Correlation plot

corrplot::corrplot(recount2.cor.subset, order = "original", 
                   # method = "number",
                   tl.col = "black", tl.cex = 0.7,
                   mar = c(0,0,1,0))

2.2 Venn diagram

Venn diagram of LVs and PCclusters. Overlapping part is defined as Pearson correlation coefficient > 0.5. Because the correlation pairing is not 1:1 between Z matrices, in the below Venn diagram, the size of the overlappting part is based on recount2PCAmodel.

library(VennDiagram)

grid.newpage()
draw.pairwise.venn(area1 = ncol(recount2PCAmodel),
                   area2 = ncol(recount2PLIERmodel),
                   cross.area = nrow(recount2.cor.subset),
                   category = c("perPCA_model", "megaPLIER_model"),
                   lty = rep("blank", 2), 
                   fill = c("light blue", "pink"), 
                   alpha = rep(0.5, 2), 
                   cat.pos = c(0, 0), 
                   cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.11], polygon[GRID.polygon.12], polygon[GRID.polygon.13], polygon[GRID.polygon.14], text[GRID.text.15], text[GRID.text.16], text[GRID.text.17], text[GRID.text.18], text[GRID.text.19])

2.3 Summary Table

Summary table of the most correlated pairs of loading vectors based on Pearson correlation.

loadingPair = data.frame(matrix(ncol = 2, nrow = 0))
colnames(loadingPair) = c("PCA model", "PLIER model")

for (i in 1:nrow(recount2.cor.subset)) {
  j = which.max(recount2.cor.subset[i,])
  loadingPair[i,1] = rownames(recount2.cor.subset)[i]
  loadingPair[i,2] = colnames(recount2.cor.subset)[j]
}

head(loadingPair)

##          PCA model PLIER model
## 1 Cl118_01 (15/15)       LV_36
## 2   Cl118_03 (9/9)       LV_65
## 3 Cl118_06 (31/31)      LV_339
## 4 Cl118_07 (35/35)        LV_1
## 5 Cl118_08 (12/12)       LV_90
## 6 Cl118_13 (11/11)      LV_292

42 PCclusters are associated with 24 LVs, and 31 LVs are associated with 23 PCclusters.

3 Annotating Loadings

3.1 Annotate PCclusters with GSEA

KEGG_2019_Human geneset was used earlier as a prior knowledge of PLIER::PLIER, so PCclusters from perData loadings are ranked with the same geneset.

3.1.1 Prepare GSEA inputs

gmt.file = "/data2/Genomic_Super_Signature/raw_data/KEGG_2019_Human.txt"

# KEGG_2019_Human
set2gene = read.gmt(gmt.file)
colnames(set2gene)[1] = c("name")

# Assign 'id' for each pathway
pathwayId = data.frame(name = unique(set2gene$name), id = paste0("KEGG2019_", seq_along(unique(set2gene$name))))
set2gene = dplyr::full_join(set2gene, pathwayId, by = "name")

# Inputs for `clusterProfiler::GSEA`
TERM2GENE = set2gene %>% dplyr::select(id, gene) 
TERM2NAME = set2gene %>% dplyr::select(id, name)

3.1.2 GSEA on PCclusters

numPathways = 10
z = recount2PCAmodel

set.seed(1234)
topLoadings_recount2PCAmodel = makeGeneList(corLoadingPCA, z) %>% gseaLoading(., TERM2GENE, TERM2NAME, pvalueCutoff = 0.5)
topPathways_recount2PCAmodel = topPathways(topLoadings_recount2PCAmodel, numPathways) 

topPathways_recount2PCAmodel[,1:2]

##                                            Cl118_01 (15/15)
## Up_1                                            Influenza A
## Up_2                                            Hepatitis C
## Up_3                                                Measles
## Up_4                    NOD-like receptor signaling pathway
## Up_5                           Epstein-Barr virus infection
## Up_6                       Herpes simplex virus 1 infection
## Up_7                          Cytosolic DNA-sensing pathway
## Up_8                           Systemic lupus erythematosus
## Up_9                                  Pyrimidine metabolism
## Up_10                                            Proteasome
## Down_1  Parathyroid hormone synthesis, secretion and action
## Down_2                     Regulation of actin cytoskeleton
## Down_3                                             Ribosome
## Down_4                    T cell receptor signaling pathway
## Down_5               Adrenergic signaling in cardiomyocytes
## Down_6                   Vascular smooth muscle contraction
## Down_7                               GnRH signaling pathway
## Down_8                                  Platelet activation
## Down_9                         Choline metabolism in cancer
## Down_10                              Rap1 signaling pathway
##                                Cl118_03 (9/9)
## Up_1                                 Lysosome
## Up_2                            Legionellosis
## Up_3                                Pertussis
## Up_4                     Rheumatoid arthritis
## Up_5                                Phagosome
## Up_6                          Drug metabolism
## Up_7                     Salmonella infection
## Up_8                Oxidative phosphorylation
## Up_9                               Amoebiasis
## Up_10                            Tuberculosis
## Down_1                               Ribosome
## Down_2  RIG-I-like receptor signaling pathway
## Down_3                            Hepatitis C
## Down_4                                Measles
## Down_5                       ABC transporters
## Down_6                    Cellular senescence
## Down_7                 VEGF signaling pathway
## Down_8                 FoxO signaling pathway
## Down_9      T cell receptor signaling pathway
## Down_10          Epstein-Barr virus infection

3.2 Annotation of LVs from PLIER

multiPLIER.summary = recound_KEGG_model$summary
topPathways_recount2PLIERmodel = list()

for (LV in unique(multiPLIER.summary$`LV index`)) {
  LVname = paste0("LV_", LV)
  pathwayInd = which(multiPLIER.summary$`LV index` == LV)
  topPathways_recount2PLIERmodel[[LVname]] = multiPLIER.summary$pathway[pathwayInd]
}

# the number of pathways for each LV
LVPathwayNum = sapply(topPathways_recount2PLIERmodel, function(x) {length(x)})
head(LVPathwayNum)

## LV_1 LV_2 LV_3 LV_4 LV_5 LV_6 
##   91   35   35   30   28   18

LVPathwayList = sapply(topPathways_recount2PLIERmodel, function(x) {head(x)})
head(LVPathwayList, 3)

## $LV_1
## [1] "ABC transporters"                           
## [2] "AMPK signaling pathway"                     
## [3] "Alanine, aspartate and glutamate metabolism"
## [4] "Alzheimer disease"                          
## [5] "Amino sugar and nucleotide sugar metabolism"
## [6] "Aminoacyl-tRNA biosynthesis"                
## 
## $LV_2
## [1] "Aminoacyl-tRNA biosynthesis"        "Basal transcription factors"       
## [3] "Cell cycle"                         "Citrate cycle (TCA cycle)"         
## [5] "Cysteine and methionine metabolism" "DNA replication"                   
## 
## $LV_3
## [1] "Adrenergic signaling in cardiomyocytes"     
## [2] "Alanine, aspartate and glutamate metabolism"
## [3] "Alcoholism"                                 
## [4] "Aldosterone synthesis and secretion"        
## [5] "Alzheimer disease"                          
## [6] "Amyotrophic lateral sclerosis (ALS)"

4 Similarity between annotations

4.1 Table

Here, I printed out some of the pathways of highly correlated loading pairs from perData PCclusters and multiPLIER LVs.

k = nrow(loadingPair)   # for all pairs
k = 2   # Let's check only the two pairs

for (i in 1:k) {
  PCclusterName = loadingPair$`PCA model`[i]
  LVName = loadingPair$`PLIER model`[i]
  
  # PCcluster
  perData_pathways = topPathways_recount2PCAmodel[[PCclusterName]] %>% 
    as.data.frame(., row.names = c(paste0("Up_", 1:10), paste0("Down_", 1:10)))
  # remove the loading that's corrleated but doesn't have enriched pathway
  if (nrow(perData_pathways) == 0) {
    print(paste("No enrichment result from", PCclusterName)) 
    break
    }
  colnames(perData_pathways) = PCclusterName
  
  # LV that's highly correlated to PCcluster
  LVind = gsub("LV_", "", LVName)
  multiPLIER_pathways = multiPLIER.summary[which(multiPLIER.summary$`LV index` == LVind),]
  
  print(perData_pathways)
  print(multiPLIER_pathways)
}

##                                            Cl118_01 (15/15)
## Up_1                                            Influenza A
## Up_2                                            Hepatitis C
## Up_3                                                Measles
## Up_4                    NOD-like receptor signaling pathway
## Up_5                           Epstein-Barr virus infection
## Up_6                       Herpes simplex virus 1 infection
## Up_7                          Cytosolic DNA-sensing pathway
## Up_8                           Systemic lupus erythematosus
## Up_9                                  Pyrimidine metabolism
## Up_10                                            Proteasome
## Down_1  Parathyroid hormone synthesis, secretion and action
## Down_2                     Regulation of actin cytoskeleton
## Down_3                                             Ribosome
## Down_4                    T cell receptor signaling pathway
## Down_5               Adrenergic signaling in cardiomyocytes
## Down_6                   Vascular smooth muscle contraction
## Down_7                               GnRH signaling pathway
## Down_8                                  Platelet activation
## Down_9                         Choline metabolism in cancer
## Down_10                              Rap1 signaling pathway
##                                    pathway LV index       AUC      p-value
## 748 Cytokine-cytokine receptor interaction       36 0.5733501 1.868229e-02
## 749           Epstein-Barr virus infection       36 0.7545356 6.049556e-09
## 750                            Hepatitis C       36 0.7401197 3.011455e-07
## 751       Herpes simplex virus 1 infection       36 0.5376563 8.738719e-02
## 752                            Influenza A       36 0.7245609 1.524735e-06
## 753    NOD-like receptor signaling pathway       36 0.7192433 6.487636e-07
## 754                              Phagosome       36 0.5232982 3.285635e-01
## 755  RIG-I-like receptor signaling pathway       36 0.9181102 2.485356e-09
##              FDR
## 748 3.667638e-02
## 749 9.719324e-08
## 750 2.950686e-06
## 751 1.320477e-01
## 752 1.238493e-05
## 753 5.931165e-06
## 754 4.004518e-01
## 755 4.525419e-08
##                                Cl118_03 (9/9)
## Up_1                                 Lysosome
## Up_2                            Legionellosis
## Up_3                                Pertussis
## Up_4                     Rheumatoid arthritis
## Up_5                                Phagosome
## Up_6                          Drug metabolism
## Up_7                     Salmonella infection
## Up_8                Oxidative phosphorylation
## Up_9                               Amoebiasis
## Up_10                            Tuberculosis
## Down_1                               Ribosome
## Down_2  RIG-I-like receptor signaling pathway
## Down_3                            Hepatitis C
## Down_4                                Measles
## Down_5                       ABC transporters
## Down_6                    Cellular senescence
## Down_7                 VEGF signaling pathway
## Down_8                 FoxO signaling pathway
## Down_9      T cell receptor signaling pathway
## Down_10          Epstein-Barr virus infection
##                                                  pathway LV index       AUC
## 1022                 Antigen processing and presentation       65 0.7191015
## 1023                         Chemokine signaling pathway       65 0.6842200
## 1024              Cytokine-cytokine receptor interaction       65 0.5138591
## 1025                                         Endocytosis       65 0.7160690
## 1026                    Fc gamma R-mediated phagocytosis       65 0.7371418
## 1027                          Hematopoietic cell lineage       65 0.6681468
## 1028                                       Legionellosis       65 0.5870519
## 1029                                       Leishmaniasis       65 0.8031311
## 1030                Leukocyte transendothelial migration       65 0.5879973
## 1031                                            Lysosome       65 0.8221714
## 1032                 NOD-like receptor signaling pathway       65 0.8027526
## 1033                                         Necroptosis       65 0.8088238
## 1034                          Osteoclast differentiation       65 0.8238136
## 1035 Parathyroid hormone synthesis, secretion and action       65 0.6368513
## 1036                                           Phagosome       65 0.5743568
## 1037                                 Platelet activation       65 0.5974071
## 1038                     Staphylococcus aureus infection       65 0.6913810
## 1039             Transcriptional misregulation in cancer       65 0.5464550
## 1040                                        Tuberculosis       65 0.8250780
##           p-value          FDR
## 1022 2.535200e-03 6.796825e-03
## 1023 3.105191e-06 2.355848e-05
## 1024 3.352764e-01 4.069883e-01
## 1025 1.246358e-09 2.453416e-08
## 1026 2.400230e-05 1.331092e-04
## 1027 3.271577e-03 8.363897e-03
## 1028 1.213906e-01 1.739495e-01
## 1029 2.397827e-05 1.331092e-04
## 1030 4.764568e-02 7.977457e-02
## 1031 9.474290e-11 2.688484e-09
## 1032 2.662381e-13 1.817907e-11
## 1033 9.572196e-13 5.652770e-11
## 1034 3.203878e-11 1.060678e-09
## 1035 5.638720e-03 1.337742e-02
## 1036 6.277074e-02 9.982102e-02
## 1037 2.691374e-02 4.954213e-02
## 1038 7.176856e-03 1.642035e-02
## 1039 1.302222e-01 1.834530e-01
## 1040 6.212076e-14 4.680478e-12

4.2 Heatmap

Next, I compared Z matrices using their annotation.

x = topPathways_recount2PCAmodel
y = topPathways_recount2PLIERmodel

res = data.frame(row.names = colnames(x))

for (i in seq_along(x)) {
  for (j in seq_along(y)) {
    res[i,j] = length(intersect(x[[i]], y[[j]]))
    colnames(res)[j] = names(y)[j]
  }
}

This is the heatmap of all the loading pairs.

This is the subset of the loading pairs with more than 80% shared pathways based on loadings with a fewer number of pathways assigned.

# new empty data frame with row and column as same as `res`
ind = data.frame(matrix(NA, ncol = ncol(res), nrow = nrow(res)),
                 row.names = rownames(res))
colnames(ind) = colnames(res)

# logical of each pair with more than 80% of shared pathways
for (i in seq_along(LVPathwayNum)) {
  if (LVPathwayNum[i] >= 10) {
    k = round(numPathways * 0.8)   # more than 80% of the PCcluster pathways are overlap
    ind[,i] = res[,i] > k
  } else if (LVPathwayNum[i] < 10) {   # in case, fewer pathways are assigned to LV (than PCcluster)
    k = round(LVPathwayNum[i] * 0.8)   # more than 80% of the LV pathways are overlap
    ind[,i] = res[,i] > k
  }
}

subset = res[which(rowSums(ind) != 0),
             which(colSums(ind) != 0)]

heatmap(as.matrix(subset))

Here is the actual number of shared pathways between loadings.

corrplot::corrplot(as.matrix(subset), 
                   method = "color", 
                   addCoef.col = "white",
                   tl.srt = 45, tl.cex = 0.8, tl.col = "black",
                   col=colorRampPalette(c("blue","light yellow","red"))(200),
                   # col=hcl.colors(12, "YlOrRd", rev = TRUE),
                   is.corr = FALSE,
                   mar = c(0,0,1,0))

Compare PCA-based models with multiPLIER model

2019-12-04