This is the list of 146 recount2 datasets with >= 50 samples in it. Top 20 PCs
from each dataset is saved in this object. Pre-processing steps are described in
the vignette, 02_PCA.Rmd.
recount2_PCs = readRDS("~/data2/Genomic_Super_Signature/PCA/data/recount2_PCs_logTransformed.rds")
We used 6 colon releated studies in recount2 to build ‘colon-specific signature’ and compared how well this ‘colon-specific signature’ is preserved in the model built from a larger, heterozygous training datasets.
recount_colon.csv contains recount2 studies with the keyword ‘colon’ in their abstract.
recount_colon = read.table("~/data2/Genomic_Super_Signature/data/recount_colon.csv", sep = ",", header = TRUE)
ind = which(recount_colon$number.of.samples >= 50)
recount_colon_study = recount_colon$accession[ind]
colonSamples = recount2_PCs[recount_colon_study]
recount_colon = read.table("~/data2/Genomic_Super_Signature/data/recount_colorectal_cancer.csv", sep = ",", header = TRUE)
ind = which(recount_colon$number.of.samples >= 20)
recount_colon_study = recount_colon$accession[ind]
colonSamples = recount2_PCs[recount_colon_study]
loading_colon = data.frame(matrix(NA, nrow = nrow(colonSamples[[1]]), ncol = 0))
for (i in seq_along(colonSamples)) {
loading_colon = cbind(loading_colon, colonSamples[[i]])
}
k = PLIER::num.pc(loading_colon) # k = 12
avgloading_colon = buildAvgLoading(t(loading_colon), k)
nonColonSamples = recount2_PCs[!names(recount2_PCs) %in% recount_colon_study]
loading_noColon = data.frame(matrix(NA, nrow = nrow(nonColonSamples[[1]]), ncol = 0))
for (i in seq_along(nonColonSamples)) {
loading_noColon = cbind(loading_noColon, nonColonSamples[[i]])
}
k = PLIER::num.pc(as.matrix(loading_noColon)) # k = 135
### This takes long.
avgloading_noColon = buildAvgLoading(t(as.matrix(loading_noColon)), k, iter.max = 20)
loading_all = data.frame(matrix(NA, nrow = nrow(recount2_PCs[[1]]), ncol = 0))
for (i in seq_along(recount2_PCs)) {
loading_all = cbind(loading_all, recount2_PCs[[i]])
}
k = PLIER::num.pc(as.matrix(loading_all))
### This takes long.
avgloading_all = buildAvgLoading(t(as.matrix(loading_all)), k, iter.max = 20)
Here, we will compare three different Z matrices built from
1) colon datasets only (avgloading_colon)
2) without colon datasets (avgloading_noColon)
3) both (avgloading_all)
Validation process is calculating Pearson correlation coefficient between avgLoading and top 8 PCs from the validation datasets. We define ‘successul’ validates when any of the top 8 PCs have >= 0.5 correlation coefficent with any of the avgLoading vectors.
We used 5 TCGA RNAseq datasts for validation.
library(curatedTCGAData)
mae = curatedTCGAData(diseaseCode = c("LUAD", "BRCA", "OV", "COAD", "READ"),
assays = "RNASeq2GeneNorm",
dry.run = FALSE)
dataset = list()
for (i in seq_along(mae)) {
setName = gsub("_RNASeq2GeneNorm-20160128", "", names(mae)[i])
count = assay(mae[[i]]) %>% as.matrix %>% rmNaInf
dataset[[setName]] = log2(count + 1)
}
avgloading_colonval = validate(dataset, avgloading_colon$avgLoading)
cutoff = 0.4
k = sapply(colnames(val), function(x) {sum(val[,x] > cutoff) != 0})
validated_ind_colon = names(k[which(k=="TRUE")])
heatmapTable(val[,validated_ind_colon],
column_title = paste0("PCAmodel-colon (cutoff ", cutoff, ")"),
colors = col)
avgloading_noColonavgloading_all# Load pre-calculated top 20 PCs from 8 CRC dataset
individualPCA = read.csv("~/data2/GenomicSuperSignature/inst/extdata/PCs_8crc_training.csv", row.names = 1)
k = numpc(individualPCA)
## Computing svd
# Clustering with the suggested cluster number, k (from elbow method)
CRC_PCAmodel = buildAvgLoading(t(individualPCA), k, n = 20)
cm = intersect(rownames(avgloading_colon$avgLoading), rownames(CRC_PCAmodel$avgLoading))
correlations = cor(avgloading_colon$avgLoading[cm,], CRC_PCAmodel$avgLoading[cm,], method = c("pearson"))
max(correlations)
## [1] 0.3905879
corrplot::corrplot(correlations, order = "original",
# method = "number",
tl.col = "black", tl.cex = 0.7,
mar = c(0,0,1,0))
x = allZ_colon
y = CRC_PCAmodel$avgLoading
cm = intersect(rownames(x), rownames(y))
correlations = cor(x[cm,], y[cm,], method = c("pearson"))
max(correlations)
## [1] 0.3865185
cm = intersect(rownames(avgloading_all$avgLoading), rownames(CRC_PCAmodel$avgLoading))
correlations = cor(avgloading_all$avgLoading[cm,], CRC_PCAmodel$avgLoading[cm,], method = c("pearson"))
max(correlations)
## [1] 0.5536184
x = allZ
y = CRC_PCAmodel$avgLoading
cm = intersect(rownames(x), rownames(y))
correlations = cor(x[cm,], y[cm,], method = c("pearson"))
max(correlations)
## [1] 0.5136675
Each loading matrix is annotated with (check the EDA #05)