suppressPackageStartupMessages({
library(SummarizedExperiment)
library(tidyverse)
library(PLIER)
library(dplyr)
})
crc_dir = "/data/crc-subtyping-paper"
plier.dir = "/data2/Genomic_Super_Signature/PLIER"
# Load functions from CRC paper and muliPLIER paper
source(file.path(crc_dir, "src/misc.R"))
source("/data/multi-plier/util/plier_util.R")
# Load PLIER training datasets
data(bloodCellMarkersIRISDMAP)
data(canonicalPathways)
data(svmMarkers)
allPaths = combinePaths(bloodCellMarkersIRISDMAP, svmMarkers, canonicalPathways)
# Load CRC model datasets
load(file.path(crc_dir, "data/eSets/trainingSetNames.RData"))
for(set in trainingSetNames) {
load(paste0(crc_dir, "/data/eSets/", set, ".RData"))
}
# Load OV model datasets
load("/data2/GenomicSuperSignature/data/trainingSets_10ovc.rda")
for(set in trainingSets_10ovc) {
load(paste0("/data/curatedOvarianData/data/", set, ".rda"))
}
# Combine all training datasets (8 CRC + 10 OV)
trainingSetNames = c(trainingSetNames, trainingSets_10ovc)
# Prepare for PLIER
plier.data.list = list()
for (set in trainingSetNames) {
# subset dataset with the common genes
eSet = get(set)
exprs = exprs(eSet) %>% rmNaInf
exprs = apply(exprs, 1, function(x) x - mean(x)) %>% t # PLIER::rowNorm()
genesCommon = intersect(rownames(exprs), rownames(allPaths)) # PLIER::commonRows()
k = PLIER::num.pc(exprs)
plier.data.list[[set]] = list("exprs.cg" = exprs[genesCommon,],
"all.paths.cm" = allPaths[genesCommon,],
"k" = k)
}
saveRDS(plier.data.list, file = file.path(plier.dir, "data/CrcOv_data_prep_PLIER.rds"))
plier.dir = "/data2/Genomic_Super_Signature/PLIER"
plier.data.list = readRDS(file.path(plier.dir, "data/CrcOv_data_prep_PLIER.rds"))
library(magrittr)
library(SummarizedExperiment)
library(PLIER)
library(GenomicSuperSignature) # install.packages("/data2/GenomicSuperSignature", repos=NULL, type="source")
library(dplyr)
library(grid)
library(circlize)
library(ComplexHeatmap)
# Common genes
cm = lapply(plier.data.list, function(x) rownames(x$exprs.cg)) %>%
Reduce(intersect, .)
# Combine all count matrices with common genes
datN_list = lapply(plier.data.list, function(x) x$exprs.cg[cm,])
datN = Reduce(cbind, datN_list)
# Calculate the number of desired PCs (from elbow method)
num.pc = PLIER::num.pc(datN)
## Computing svd
# Perform PCA and get the average loading
allPCA = calPC(datN)
avg.loading.megaData = allPCA$rotation[,1:num.pc]
# Load pre-calculated top 20 PCs from each dataset
individualPCA = read.csv("/data2/GenomicSuperSignature/inst/extdata/PCs_8crc10ovc_training.csv", row.names = 1)
k = PLIER::num.pc(individualPCA)
## Computing svd
# Clustering with the suggested cluster number, k (from elbow method)
avg.loading.perData = avgLoading(t(individualPCA), k)
Use some modified-version of multiPLIER fucntions
source('/data2/Genomic_Super_Signature/PLIER/multiPLIER_functions/GetNewDataB2.R')
source('/data2/Genomic_Super_Signature/PLIER/multiPLIER_functions/GetOrderedRowNorm2.R')
load("/data2/GenomicSuperSignature/data/TCGA_validationDatasets.rda")
exprs.mat = assay(TCGA_validationDatasets[["OV"]])
# Assign score using avg.loading.perData
score = calScore(exprs.mat, avg.loading.perData)
sampleScoreHeatmap(score[[1]], "TCGA_OV_rseq", "perDataLoading")
# Project to avg.loading.megaData
new.data.b = GetNewDataB2(exprs.mat, avg.loading.megaData)
sampleScoreHeatmap(t(new.data.b[1:k,]), "TCGA_OV_rseq", "megaDataLoading", cluster_columns = FALSE)
Process as same as TCGA_COAD_rseq dataset example above.
Process as same as TCGA_COAD_rseq dataset example above.
megaDataLoading = avg.loading.megaData[,1:k]
megaDataLoading.cor = cor(megaDataLoading, method = c("pearson"))
corrplot::corrplot(megaDataLoading.cor, type = "upper", order = "original", diag = FALSE,
tl.pos = "td", tl.col = "black", tl.cex = 0.5)
perDataLoading = avg.loading.perData
perDataLoading.cor = cor(perDataLoading, method = c("pearson"))
corrplot::corrplot(perDataLoading.cor, type = "upper", order = "original", diag = FALSE,
tl.pos = "td", tl.col = "black", tl.cex = 0.5)
cm_loading = intersect(rownames(megaDataLoading), rownames(perDataLoading))
x = megaDataLoading[cm_loading,]
y = perDataLoading[cm_loading,]
methods.cor = cor(x, y, method = c("pearson"))
corrplot::corrplot(methods.cor, order = "original",
tl.col = "black", tl.cex = 0.5)
load("/data2/GenomicSuperSignature/data/TCGA_validationDatasets.rda")
dataset = TCGA_validationDatasets
The maximum pearson correlation coefficiency between top 8 PCs of the dataset and avg.loading is validation at the dataset-level.
val_megaData = validate(dataset, megaDataLoading)
val_perData = validate(dataset, perDataLoading)
heatmapTable(val_megaData, column_title = "TCGA_rseq validation with megaDataLoading")
heatmapTable(val_perData, column_title = "TCGA_rseq validation with perDataLoading")
val_megaData = validate(dataset, megaDataLoading, level = "all")
val_perData = validate(dataset, perDataLoading, level = "all")
heatmapTable(t(val_megaData[[1]]), column_title = "TCGA_COAD validation with megaDataLoading")
heatmapTable(t(val_perData[[1]]), column_title = "TCGA_COAD validation with perDataLoading")
Silhouette width of perDataLoading clusters
library(cluster)
set.seed(123)
df = t(individualPCA)
res = kmeans(df, centers = k)
sw = silhouette(res$cluster, daisy(df))
Plotting
boxplot(sw[,"sil_width"] ~ sw[,"cluster"],
xlab = "Cluster Number",
ylab = "Silhouette Width",
main = "CRC+OV Datasets Cluster Evaluation")
abline(h = 0.15, col = "red")
sw_df = data.frame(cluster = sw[,"cluster"],
sil_width = sw[,"sil_width"])
sw_mean = sw_df %>%
group_by(cluster) %>%
summarise(mean = mean(sil_width), sd = sd(sil_width))
which(sw_mean$mean > 0.15)
## [1] 4 8 18 24 25 30 32 43