Contents

1 Build different loading matrices

1.1 Load recount2 dataset

This is the list of 146 recount2 datasets with >= 50 samples in it. Top 20 PCs from each dataset is saved in this object. Pre-processing steps are described in the vignette, 02_PCA.Rmd.

recount2_PCs = readRDS("~/data2/Genomic_Super_Signature/PCA/data/recount2_PCs_logTransformed.rds")

1.2 PCAmodel from colon datasets

We used 6 colon releated studies in recount2 to build ‘colon-specific signature’ and compared how well this ‘colon-specific signature’ is preserved in the model built from a larger, heterozygous training datasets.

recount_colon.csv contains recount2 studies with the keyword ‘colon’ in their abstract.

recount_colon = read.table("~/data2/Genomic_Super_Signature/data/recount_colon.csv", sep = ",", header = TRUE)
ind = which(recount_colon$number.of.samples >= 50)
recount_colon_study = recount_colon$accession[ind]
colonSamples = recount2_PCs[recount_colon_study]
recount_colon = read.table("~/data2/Genomic_Super_Signature/data/recount_colorectal_cancer.csv", sep = ",", header = TRUE)
ind = which(recount_colon$number.of.samples >= 20)
recount_colon_study = recount_colon$accession[ind]
colonSamples = recount2_PCs[recount_colon_study]
loading_colon = data.frame(matrix(NA, nrow = nrow(colonSamples[[1]]), ncol = 0))
for (i in seq_along(colonSamples)) {
        loading_colon = cbind(loading_colon, colonSamples[[i]])
    }

k = PLIER::num.pc(loading_colon)   # k = 12
avgloading_colon = buildAvgLoading(t(loading_colon), k)

1.3 PCAmodel from noColon datasets

nonColonSamples = recount2_PCs[!names(recount2_PCs) %in% recount_colon_study]
loading_noColon = data.frame(matrix(NA, nrow = nrow(nonColonSamples[[1]]), ncol = 0))
for (i in seq_along(nonColonSamples)) {
        loading_noColon = cbind(loading_noColon, nonColonSamples[[i]])
    }

k = PLIER::num.pc(as.matrix(loading_noColon))   # k = 135

### This takes long.
avgloading_noColon = buildAvgLoading(t(as.matrix(loading_noColon)), k, iter.max = 20)

1.4 PCAmodel from both datasets

loading_all = data.frame(matrix(NA, nrow = nrow(recount2_PCs[[1]]), ncol = 0))
for (i in seq_along(recount2_PCs)) {
        loading_all = cbind(loading_all, recount2_PCs[[i]])
}

k = PLIER::num.pc(as.matrix(loading_all)) 

### This takes long.
avgloading_all = buildAvgLoading(t(as.matrix(loading_all)), k, iter.max = 20)

2 Compare different loading matrices

Here, we will compare three different Z matrices built from
1) colon datasets only (avgloading_colon)
2) without colon datasets (avgloading_noColon)
3) both (avgloading_all)

2.1 Validation

2.1.1 Load validation datastes

Validation process is calculating Pearson correlation coefficient between avgLoading and top 8 PCs from the validation datasets. We define ‘successul’ validates when any of the top 8 PCs have >= 0.5 correlation coefficent with any of the avgLoading vectors.

We used 5 TCGA RNAseq datasts for validation.

library(curatedTCGAData)
mae = curatedTCGAData(diseaseCode = c("LUAD", "BRCA", "OV", "COAD", "READ"),
                      assays = "RNASeq2GeneNorm",
                      dry.run = FALSE)

dataset = list()
for (i in seq_along(mae)) {
  setName = gsub("_RNASeq2GeneNorm-20160128", "", names(mae)[i])
  count = assay(mae[[i]]) %>% as.matrix %>% rmNaInf
  dataset[[setName]] = log2(count + 1)
}

2.1.2 Validation of avgloading_colon

val = validate(dataset, avgloading_colon$avgLoading)
cutoff = 0.4
k = sapply(colnames(val), function(x) {sum(val[,x] > cutoff) != 0})
validated_ind_colon = names(k[which(k=="TRUE")])
heatmapTable(val[,validated_ind_colon], 
             column_title = paste0("PCAmodel-colon (cutoff ", cutoff, ")"), 
             colors = col)

2.1.3 Validation of avgloading_noColon

2.1.4 Validation of avgloading_all

3 Compare to avgLoading_CRC

3.1 CRC_PCAmodel

# Load pre-calculated top 20 PCs from 8 CRC dataset
individualPCA = read.csv("~/data2/GenomicSuperSignature/inst/extdata/PCs_8crc_training.csv", row.names = 1)
k = numpc(individualPCA)
## Computing svd
# Clustering with the suggested cluster number, k (from elbow method)
CRC_PCAmodel = buildAvgLoading(t(individualPCA), k, n = 20)

3.2 avgloading_colon vs. CRC_PCAmodel

cm = intersect(rownames(avgloading_colon$avgLoading), rownames(CRC_PCAmodel$avgLoading))
correlations = cor(avgloading_colon$avgLoading[cm,], CRC_PCAmodel$avgLoading[cm,], method = c("pearson"))

max(correlations)
## [1] 0.3905879
corrplot::corrplot(correlations, order = "original", 
                   # method = "number",
                   tl.col = "black", tl.cex = 0.7,
                   mar = c(0,0,1,0))

3.3 recount_colonOnly_PCs vs. CRC_PCAmodel

x = allZ_colon
y = CRC_PCAmodel$avgLoading
cm = intersect(rownames(x), rownames(y))
correlations = cor(x[cm,], y[cm,], method = c("pearson"))

max(correlations)
## [1] 0.3865185

3.4 avgloading_all vs. CRC_PCAmodel

cm = intersect(rownames(avgloading_all$avgLoading), rownames(CRC_PCAmodel$avgLoading))
correlations = cor(avgloading_all$avgLoading[cm,], CRC_PCAmodel$avgLoading[cm,], method = c("pearson"))

max(correlations)
## [1] 0.5536184

3.5 recount_all_PCs vs. CRC_PCAmodel

x = allZ
y = CRC_PCAmodel$avgLoading
cm = intersect(rownames(x), rownames(y))
correlations = cor(x[cm,], y[cm,], method = c("pearson"))

max(correlations)
## [1] 0.5136675

4 Similarity of loadings

4.1 avgloading_colon vs. avgloading_noColon

4.2 avgloading_colon vs. avgloading_all

4.3 avgloading_noColon vs. avgloading_all

5 Loadings significantly associated with pathways

Each loading matrix is annotated with (check the EDA #05)