Ovarian Cancer Meta-Analysis Validation

Which genes correlate with survival and differentiate normal vs stage I/II?

setwd("/Users/lszabo/Documents/RProjects/ButteLab/biomarkers")
source("db_functions.R")
source("validation_sets/validate_functions.R")

## Loading required package: DBI

## Loading required package: RMySQL

Load the gene expression and phenotype data

# gene expression matrix (n*p) for ovarian cancer samples and normals from
# OvCa data set
load("validation_sets/TCGA/gene_expm.save")

# 4 digit patient ids of the expression data (for mapping to pheno data)
load("validation_sets/TCGA/gene_exp_ids.save")

# get the phenotype data
pheno <- read.table("/Users/lszabo/Documents/TCGAdata/ovarian/clinical_3de43933-0806-4ce8-a7fc-917e43b50c65/Clinical/BCR/clinical_patient_public_OV.txt", 
    header = T, sep = "\t")
load("validation_sets/TCGA/pheno_patients.save")  #4 digit patient ids of those we have phenodata for

Do any of the biomarkers in Bruce's list correlate with survival in TCGA validation set?


# Get our survival data
pheno_death <- as.character(pheno$days_to_death[which(pheno$days_to_death != 
    "null")])
# 297 patient ids we have death data for
pheno_patients_death <- pheno_patients[which(pheno$days_to_death != "null")]
# 294 we have death and exp data for
pheno_patients_death_and_exp <- intersect(pheno_patients_death, gene_exp_ids)

# array with TRUE in all indices we have pheno data for
matches_death <- gene_exp_ids %in% pheno_patients_death_and_exp
# expression matrix data for all of those we have death data for
death_expm <- gene_expm[matches_death, ]
# now need to populate death array by looking up matching days to death
# based on the 4 digit id
death_exp_ids <- rownames(death_expm)
death_exp_ids <- strsplit(x = death_exp_ids, split = "-")
# now this is what to compare to pheno_patients to populate pheno array
death_exp_ids <- unlist(lapply(death_exp_ids, function(x) x[1]))
# for each 4 digit id of a row in the exp matrix
death_pheno_arr = NULL
for (i in 1:length(death_exp_ids)) {
    # find index of matching id in pheno_patients and populate pheno array
    # with days to death for this patient
    j <- which(pheno_patients_death == death_exp_ids[i])
    death_pheno_arr[i] <- pheno_death[j]
}

death_pheno_arr <- as.numeric(death_pheno_arr)

And now look at correlations

# the 5 genes DE between normal and stage I/II also in Bruce's list
biomarkers <- c("NUP210", "PRKDC", "SCRIB", "TCOF1", "RAD54L")

survival_corr <- cor(death_expm[, biomarkers], death_pheno_arr, method = "spearman")
survival_corr

##             [,1]
## NUP210 -0.044544
## PRKDC   0.005623
## SCRIB   0.037705
## TCOF1  -0.090366
## RAD54L  0.004999

What if we combine the genes from Bruce's list that have positive correlation with survival?

biomarkers_pos_corr <- c("PRKDC", "SCRIB", "RAD54L")
biomarkers_pos_corr_geomMeans <- apply(death_expm[, biomarkers_pos_corr], 1, 
    geomMean)
survival_corr_geomMeans <- cor(biomarkers_pos_corr_geomMeans, death_pheno_arr, 
    method = "spearman")
survival_corr_geomMeans

## [1] 0.02632

That didn't help.

Any genes in the 80 DE between normal and early stage are better correlated with survival?


load(file = "validation_sets/TCGA/output/norm_low_goi.save")  #80 gene symbols of interest
survival_corr_norm_low_goi <- cor(death_expm[, norm_low_goi], death_pheno_arr, 
    method = "spearman")
# get gene symbols for the genes most correlated with survival
survival_corr_norm_low_goi_sorted <- sort(survival_corr_norm_low_goi, decreasing = TRUE, 
    index.return = TRUE)
head(survival_corr_norm_low_goi_sorted$x)

## [1] 0.11066 0.09337 0.09282 0.08011 0.07947 0.07901

top5_survival_corr_norm_low_goi <- norm_low_goi[survival_corr_norm_low_goi_sorted$ix[1:5]]

# output rank data to file
corr_ranks <- 81 - rank(survival_corr_norm_low_goi)
corr_ranks_df <- data.frame(gene = rownames(survival_corr_norm_low_goi), rank = corr_ranks)
write.table(corr_ranks_df, file = "validation_sets/TCGA/output/survival_corr_norm_low_goi.txt", 
    sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)

Do the 5 genes most correlated with survival separate normal from early stage?

load("validation_sets/TCGA/stage_expm_norm_low.save")  #expression for all norm & stage I/II
load("validation_sets/TCGA/use_stage_norm_low.save")  #0/1 array for norm/stage I or II

par(mfrow = c(2, 3), cex = 0.9)
for (i in 1:length(top5_survival_corr_norm_low_goi)) {
    plotGeoms(top5_survival_corr_norm_low_goi[i], stage_expm_norm_low, use_stage_norm_low, 
        boxnames = c("Normal", "Stage I/II"), plotname = top5_survival_corr_norm_low_goi[i])
}

## Loading required package: beeswarm

plotGeoms(top5_survival_corr_norm_low_goi, stage_expm_norm_low, use_stage_norm_low, 
    boxnames = c("Normal", "Stage I/II"), plotname = "Top 5 correlated with survival")

plot of chunk unnamed-chunk-7

Is this any better than just taking the top 5 DE genes between normal and stage I/II?

par(mfrow = c(1, 2), cex = 0.9)
plotGeoms(head(norm_low_goi, 5), stage_expm_norm_low, use_stage_norm_low, boxnames = c("Normal", 
    "Stage I/II"), plotname = "Top 5 DE genes")
plotGeoms(top5_survival_corr_norm_low_goi, stage_expm_norm_low, use_stage_norm_low, 
    boxnames = c("Normal", "Stage I/II"), plotname = "Top 5 correlated with survival")

plot of chunk unnamed-chunk-8

It's not any better, but just as good so we should use genes correlating with survival if possible.