Contents

1 Pre-processing for PLIER model

suppressPackageStartupMessages({
  library(SummarizedExperiment)
  library(tidyverse)
  library(PLIER)
  library(dplyr)
})

crc_dir = "/data/crc-subtyping-paper"
plier.dir = "/data2/Genomic_Super_Signature/PLIER"

# Load functions from CRC paper and muliPLIER paper
source(file.path(crc_dir, "src/misc.R"))
source("/data/multi-plier/util/plier_util.R")

# Load PLIER training datasets
data(bloodCellMarkersIRISDMAP)
data(canonicalPathways)
data(svmMarkers) 
allPaths = combinePaths(bloodCellMarkersIRISDMAP, svmMarkers, canonicalPathways) 

# Load CRC model datasets
load(file.path(crc_dir, "data/eSets/trainingSetNames.RData"))
for(set in trainingSetNames) {
  load(paste0(crc_dir, "/data/eSets/", set, ".RData"))
}

# Load OV model datasets
load("/data2/GenomicSuperSignature/data/trainingSets_10ovc.rda")
for(set in trainingSets_10ovc) {
  load(paste0("/data/curatedOvarianData/data/", set, ".rda"))
}

# Combine all training datasets (8 CRC + 10 OV)
trainingSetNames = c(trainingSetNames, trainingSets_10ovc)

# Prepare for PLIER
plier.data.list = list()
for (set in trainingSetNames) {
  # subset dataset with the common genes
  eSet = get(set)
  exprs = exprs(eSet) %>% rmNaInf
  exprs = apply(exprs, 1, function(x) x - mean(x)) %>% t   # PLIER::rowNorm()
  genesCommon = intersect(rownames(exprs), rownames(allPaths))   # PLIER::commonRows()
  k = PLIER::num.pc(exprs)

  plier.data.list[[set]] = list("exprs.cg" = exprs[genesCommon,],
                         "all.paths.cm" = allPaths[genesCommon,],
                         "k" = k)
}

saveRDS(plier.data.list, file = file.path(plier.dir, "data/CrcOv_data_prep_PLIER.rds"))

2 Load pre-processed training datasets

plier.dir = "/data2/Genomic_Super_Signature/PLIER"
plier.data.list = readRDS(file.path(plier.dir, "data/CrcOv_data_prep_PLIER.rds"))
library(magrittr)
library(SummarizedExperiment)
library(PLIER)
library(GenomicSuperSignature)   # install.packages("/data2/GenomicSuperSignature", repos=NULL, type="source")
library(dplyr)

library(grid)
library(circlize)
library(ComplexHeatmap)

2.1 Build avg.loading.megaData

# Common genes
cm = lapply(plier.data.list, function(x) rownames(x$exprs.cg)) %>%
    Reduce(intersect, .)

# Combine all count matrices with common genes
datN_list = lapply(plier.data.list, function(x) x$exprs.cg[cm,])
datN = Reduce(cbind, datN_list)

# Calculate the number of desired PCs (from elbow method) 
num.pc = PLIER::num.pc(datN)
## Computing svd
# Perform PCA and get the average loading
allPCA = calPC(datN)
avg.loading.megaData = allPCA$rotation[,1:num.pc]

2.2 Build avg.loading.perData

# Load pre-calculated top 20 PCs from each dataset
individualPCA = read.csv("/data2/GenomicSuperSignature/inst/extdata/PCs_8crc10ovc_training.csv", row.names = 1)
k = PLIER::num.pc(individualPCA)
## Computing svd
# Clustering with the suggested cluster number, k (from elbow method)
avg.loading.perData = avgLoading(t(individualPCA), k)

3 Assign score (= matrix B = samples x LVs)

Use some modified-version of multiPLIER fucntions

source('/data2/Genomic_Super_Signature/PLIER/multiPLIER_functions/GetNewDataB2.R')
source('/data2/Genomic_Super_Signature/PLIER/multiPLIER_functions/GetOrderedRowNorm2.R')

3.1 TCGA_OV_rseq dataset

load("/data2/GenomicSuperSignature/data/TCGA_validationDatasets.rda")
exprs.mat = assay(TCGA_validationDatasets[["OV"]])

# Assign score using avg.loading.perData
score = calScore(exprs.mat, avg.loading.perData)
sampleScoreHeatmap(score[[1]], "TCGA_OV_rseq", "perDataLoading")

# Project to avg.loading.megaData
new.data.b = GetNewDataB2(exprs.mat, avg.loading.megaData)
sampleScoreHeatmap(t(new.data.b[1:k,]), "TCGA_OV_rseq", "megaDataLoading", cluster_columns = FALSE)

3.2 TCGA_COAD_rseq dataset

Process as same as TCGA_COAD_rseq dataset example above.

3.3 TCGA_OV_rseq and TCGA_COAD_rseq datasets

Process as same as TCGA_COAD_rseq dataset example above.

4 Comparing two average loadings

4.1 avg.loading.megaData

megaDataLoading = avg.loading.megaData[,1:k]
megaDataLoading.cor = cor(megaDataLoading, method = c("pearson"))
corrplot::corrplot(megaDataLoading.cor, type = "upper", order = "original", diag = FALSE,
                   tl.pos = "td", tl.col = "black", tl.cex = 0.5)

4.2 avg.loading.perData

perDataLoading = avg.loading.perData
perDataLoading.cor = cor(perDataLoading, method = c("pearson"))
corrplot::corrplot(perDataLoading.cor, type = "upper", order = "original", diag = FALSE,
                   tl.pos = "td", tl.col = "black", tl.cex = 0.5)

4.3 avg.loading.megaData vs. avg.loading.perData

cm_loading = intersect(rownames(megaDataLoading), rownames(perDataLoading)) 
x = megaDataLoading[cm_loading,]
y = perDataLoading[cm_loading,]

methods.cor = cor(x, y, method = c("pearson"))
corrplot::corrplot(methods.cor, order = "original",
                   tl.col = "black", tl.cex = 0.5)

5 Load TCGA RNAseq data for validation

load("/data2/GenomicSuperSignature/data/TCGA_validationDatasets.rda")
dataset = TCGA_validationDatasets

5.1 Validation at the dataset-level

The maximum pearson correlation coefficiency between top 8 PCs of the dataset and avg.loading is validation at the dataset-level.

val_megaData = validate(dataset, megaDataLoading)
val_perData = validate(dataset, perDataLoading)
heatmapTable(val_megaData, column_title = "TCGA_rseq validation with megaDataLoading")

heatmapTable(val_perData, column_title = "TCGA_rseq validation with perDataLoading")

5.2 Validation from all PCs

val_megaData = validate(dataset, megaDataLoading, level = "all")
val_perData = validate(dataset, perDataLoading, level = "all")
heatmapTable(t(val_megaData[[1]]), column_title = "TCGA_COAD validation with megaDataLoading")

heatmapTable(t(val_perData[[1]]), column_title = "TCGA_COAD validation with perDataLoading")

5.3 Cluster evaluation

Silhouette width of perDataLoading clusters

library(cluster)

set.seed(123)
df = t(individualPCA)
res = kmeans(df, centers = k)
sw = silhouette(res$cluster, daisy(df))

Plotting

boxplot(sw[,"sil_width"] ~ sw[,"cluster"], 
        xlab = "Cluster Number",
        ylab = "Silhouette Width",
        main = "CRC+OV Datasets Cluster Evaluation")
abline(h = 0.15, col = "red")

sw_df = data.frame(cluster = sw[,"cluster"],
                   sil_width = sw[,"sil_width"])

sw_mean = sw_df %>%
  group_by(cluster) %>%
  summarise(mean = mean(sil_width), sd = sd(sil_width))

which(sw_mean$mean > 0.15)
## [1]  4  8 18 24 25 30 32 43