Ovarian Cancer Biomarker Validation

AUC calculations

I am going to first calculate AUC for the geometric means of OVA1 genes and CA125/HE4, then compare to my top 13 genes, top5, top5 between normal/low stage, top5 between normal/low with most correlated with survival.

Results Summary

AUCs In TCGA cohort - MUC16 & HE4: .610; Ova1: .874; 4 genes from Ova1: .859; top 5 genes: .954
AUCs In 4122 cohort - 4 genes from Ova1: .671; 3 of top 5 correlated with survival: .922; 5 of top 10 between norm/low: .883

setwd("/Users/lszabo/Documents/RProjects/ButteLab/biomarkers")
require(pROC)

## Loading required package: pROC

## Loading required package: plyr

## Type 'citation("pROC")' for a citation.

## Attaching package: 'pROC'

## The following object(s) are masked from 'package:stats':
## 
## cov, smooth, var

source("db_functions.R")
source("validation_sets/validate_functions.R")

## Loading required package: DBI

## Loading required package: RMySQL


load("validation_sets/TCGA/gene_expm.save")
load("validation_sets/TCGA/classes.save")
load("tops.save")
load("standard_of_care.save")
load("ova1.save")

Standard of Care CA-125 + HE4:

standard_means <- apply(gene_expm[, standard_of_care], 1, geomMean)
standard_roc <- roc(classes, standard_means, ci = T)
plot.roc(standard_roc, print.auc = T, main = "CA-125 and HE4 (TCGA cohort)")

plot of chunk unnamed-chunk-2

## 
## Call:
## roc.default(response = classes, predictor = standard_means, ci = T)
## 
## Data: standard_means in 8 controls (classes 0) < 586 cases (classes 1).
## Area under the curve: 0.61
## 95% CI: 0.411-0.81 (DeLong)

Standard of Care: Ova1 panel of 5 genes:

ova1_means <- apply(gene_expm[, ova1], 1, geomMean)
ova1_roc <- roc(classes, ova1_means, ci = T)
plot.roc(ova1_roc, print.auc = T, main = "Ova1 panel (TCGA cohort)")

plot of chunk unnamed-chunk-3

## 
## Call:
## roc.default(response = classes, predictor = ova1_means, ci = T)
## 
## Data: ova1_means in 8 controls (classes 0) < 586 cases (classes 1).
## Area under the curve: 0.874
## 95% CI: 0.818-0.93 (DeLong)

Top 5 of my predicted genes


top5_means <- apply(gene_expm[, tops[1:5]], 1, geomMean)
top5_roc <- roc(classes, top5_means, ci = T)
plot.roc(top5_roc, print.auc = T, main = "My top 5 genes (TCGA cohort)")

plot of chunk unnamed-chunk-4

## 
## Call:
## roc.default(response = classes, predictor = top5_means, ci = T)
## 
## Data: top5_means in 8 controls (classes 0) < 586 cases (classes 1).
## Area under the curve: 0.954
## 95% CI: 0.93-0.979 (DeLong)

Top 13 of my predicted genes

top13_means <- apply(gene_expm[, tops[1:13]], 1, geomMean)
top13_roc <- roc(classes, top13_means, ci = T)
plot.roc(top13_roc, print.auc = T, main = "My top 13 genes (TCGA cohort)")

plot of chunk unnamed-chunk-5

## 
## Call:
## roc.default(response = classes, predictor = top13_means, ci = T)
## 
## Data: top13_means in 8 controls (classes 0) < 586 cases (classes 1).
## Area under the curve: 0.985
## 95% CI: 0.975-0.996 (DeLong)

Top 5 between normal/low stage:

This ROC is created using the 8 normal samples and xx stage III/IV samples in TCGA. This is not ideal since I am still using the same 8 controls I used to generate the list of 80 significant genes, but ok for a first pass and I will validate in another independent cohort.

# 80 gene symbols of interest between normal/low
load(file = "validation_sets/TCGA/output/norm_low_goi.save")
load(file = "validation_sets/TCGA/stage_expm.save")  #expression data only for those we have stage data for
load(file = "validation_sets/TCGA/use_stage_grouped.save")  # 0, 1, or 2 for normal, early, late

norm_high_stage_expm <- stage_expm[which(use_stage_grouped != 1), ]  #533 late stage, 8 normal
norm_high_classes <- use_stage_grouped[which(use_stage_grouped != 1)]  #0s and 2s
norm_high_classes[norm_high_classes == 2] <- 1  #change to 0s and 1s for plotting

# now finally ready to plot
top5_norm_low_means <- apply(norm_high_stage_expm[, norm_low_goi[1:5]], 1, geomMean)
top5_norm_low_roc <- roc(norm_high_classes, top5_norm_low_means, ci = T)
plot.roc(top5_norm_low_roc, print.auc = T, main = "Top 5 between normal/low stage")

plot of chunk unnamed-chunk-6

## 
## Call:
## roc.default(response = norm_high_classes, predictor = top5_norm_low_means,     ci = T)
## 
## Data: top5_norm_low_means in 8 controls (norm_high_classes 0) < 533 cases (norm_high_classes 1).
## Area under the curve: 1
## 95% CI: 0.998-1 (DeLong)

Top 5 between normal/low also correlated with survival:

load("validation_sets/TCGA/survival_corr_norm_low_goi_sorted.save")
top5_survival_genes <- norm_low_goi[survival_corr_norm_low_goi_sorted$ix[1:5]]  #gene symbols

top5_survival_means <- apply(norm_high_stage_expm[, top5_survival_genes], 1, 
    geomMean)
top5_survival_roc <- roc(norm_high_classes, top5_survival_means, ci = T)
plot.roc(top5_survival_roc, print.auc = T, main = "Top 5 correlated with survival also up between normal/low stage")

plot of chunk unnamed-chunk-7

## 
## Call:
## roc.default(response = norm_high_classes, predictor = top5_survival_means,     ci = T)
## 
## Data: top5_survival_means in 8 controls (norm_high_classes 0) < 533 cases (norm_high_classes 1).
## Area under the curve: 0.998
## 95% CI: 0.995-1 (DeLong)

Now I need to actually get an AUC for my top 5 genes in a completely independent dataset since my top 5 were picked based on what separated the normals and low stage in the TCGA dataset. Will use the 4122 dataset which was previously prepped for the meta-analysis but not used.

load("Ovarian_Cancer_Datasets.RData")
temp <- mtab4122GEM$keys[which(mtab4122GEM$keys %in% top5_survival_genes)]  #get the keys that are in my list

# only 3 matched, need to expand symbols to see if I can get the other 2
con = connect.db(uname, pword, host = hostname, dbname = "annot_gene")
top5_survival_synonyms <- getAllSynonyms.db(con, top5_survival_genes)[, "synonym"]
dbDisconnect(con)

## [1] TRUE


# didn't help, only got TMPRSS3 with is synonmy for TMPRSS4 which I
# already have
mtab4122GEM$keys[which(mtab4122GEM$keys %in% top5_survival_synonyms)]  #get the keys that are in my list

## 201598_s_at   204086_at   218960_at 220177_s_at 
##    "INPPL1"     "PRAME"   "TMPRSS4"   "TMPRSS3"


# so let's see what we get with these top 3
temp_exprs <- mtab4122GEM$expr[which(mtab4122GEM$keys %in% top5_survival_genes), 
    ]
temp_survival_means <- apply(temp_exprs, 2, geomMean)
temp_survival_roc <- roc(mtab4122GEM$class, temp_survival_means, ci = T)
plot.roc(temp_survival_roc, print.auc = T, main = "Top 3 correlated with survival found in GEO-4122 cohort")

plot of chunk unnamed-chunk-8

## 
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp_survival_means,     ci = T)
## 
## Data: temp_survival_means in 32 controls (mtab4122GEM$class 0) < 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.922
## 95% CI: 0.859-0.985 (DeLong)


# and how does ova1 do in this data set (MUC16 not included since not
# measured)?
temp_ova1 <- mtab4122GEM$keys[which(mtab4122GEM$keys %in% ova1)]  #get the keys that are in ova1
temp_ova1_exprs <- mtab4122GEM$expr[which(mtab4122GEM$keys %in% ova1), ]
temp_ova1_means <- apply(temp_ova1_exprs, 2, geomMean)
temp_ova1_roc <- roc(mtab4122GEM$class, temp_ova1_means, ci = T)
plot.roc(temp_ova1_roc, print.auc = T, main = "4 Ova1 genes found in 4122")

plot of chunk unnamed-chunk-8

## 
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp_ova1_means,     ci = T)
## 
## Data: temp_ova1_means in 32 controls (mtab4122GEM$class 0) > 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.671
## 95% CI: 0.537-0.805 (DeLong)


# how would removing MUC16 change the results in the TCGA dataset?
ova1_nomuc16_means <- apply(gene_expm[, ova1[-1]], 1, geomMean)
ova1_nomuc16_roc <- roc(classes, ova1_nomuc16_means, ci = T)
plot.roc(ova1_nomuc16_roc, print.auc = T, main = "Ova1 panel without MUC16 (TCGA cohort)")

plot of chunk unnamed-chunk-8

## 
## Call:
## roc.default(response = classes, predictor = ova1_nomuc16_means,     ci = T)
## 
## Data: ova1_nomuc16_means in 8 controls (classes 0) < 586 cases (classes 1).
## Area under the curve: 0.859
## 95% CI: 0.793-0.926 (DeLong)


# let's see if I can get a few more of my top survival genes actually in
# the 4122 dataset
top10_survival_genes <- norm_low_goi[survival_corr_norm_low_goi_sorted$ix[1:10]]  #gene symbols
mtab4122GEM$keys[which(mtab4122GEM$keys %in% top10_survival_genes)]  #get the keys that are in my list

## 201598_s_at   204086_at   217912_at   218960_at 
##    "INPPL1"     "PRAME"     "DUS1L"   "TMPRSS4"


# only 4 of the top 10, let's try those
temp4_exprs <- mtab4122GEM$expr[which(mtab4122GEM$keys %in% top10_survival_genes), 
    ]
temp4_survival_means <- apply(temp4_exprs, 2, geomMean)
temp4_survival_roc <- roc(mtab4122GEM$class, temp4_survival_means, ci = T)
plot.roc(temp4_survival_roc, print.auc = T, main = "Top 4 correlated with survival found in GEO-4122 cohort")

plot of chunk unnamed-chunk-8

## 
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp4_survival_means,     ci = T)
## 
## Data: temp4_survival_means in 32 controls (mtab4122GEM$class 0) < 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.922
## 95% CI: 0.858-0.986 (DeLong)


# what about the top genes between norm and low?
mtab4122GEM$keys[which(mtab4122GEM$keys %in% norm_low_goi[1:10])]  #5 of top 10 are in 4122

##  38158_at 201664_at 203276_at 204086_at 205339_at 
##   "ESPL1"    "SMC4"   "LMNB1"   "PRAME"    "STIL"

temp5_in4122_exprs <- mtab4122GEM$expr[which(mtab4122GEM$keys %in% norm_low_goi[1:10]), 
    ]
temp5_in4122_norm_low_means <- apply(temp5_in4122_exprs, 2, geomMean)
temp5_in4122_norm_low_roc <- roc(mtab4122GEM$class, temp5_in4122_norm_low_means, 
    ci = T)
plot.roc(temp5_in4122_norm_low_roc, print.auc = T, main = "5 in top 10 between normal/low stage found in GEO-4122 cohort")

plot of chunk unnamed-chunk-8

## 
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp5_in4122_norm_low_means,     ci = T)
## 
## Data: temp5_in4122_norm_low_means in 32 controls (mtab4122GEM$class 0) < 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.883
## 95% CI: 0.786-0.979 (DeLong)

Figure for poster

plot.roc(temp_survival_roc)

plot of chunk unnamed-chunk-9

## 
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp_survival_means,     ci = T)
## 
## Data: temp_survival_means in 32 controls (mtab4122GEM$class 0) < 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.922
## 95% CI: 0.859-0.985 (DeLong)

plot.roc(temp_ova1_roc)

plot of chunk unnamed-chunk-9

## 
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp_ova1_means,     ci = T)
## 
## Data: temp_ova1_means in 32 controls (mtab4122GEM$class 0) > 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.671
## 95% CI: 0.537-0.805 (DeLong)

Try genes up in early stage and most negatively correlated with survival. These perform worse than the genes most positively correlated with survival.

load("validation_sets/TCGA/survival_corr_norm_low_goi_sorted.save")
tail(survival_corr_norm_low_goi_sorted$x)

## [1] -0.1042 -0.1132 -0.1177 -0.1411 -0.1753 -0.2301

bot5_survival_corr_norm_low_goi <- norm_low_goi[survival_corr_norm_low_goi_sorted$ix[76:80]]
bot5_survival_corr_norm_low_goi

## [1] "POLQ"     "CCNE1"    "C1orf38"  "MX2"      "C1orf135"


temp <- mtab4122GEM$keys[which(mtab4122GEM$keys %in% bot5_survival_corr_norm_low_goi)]  #get the keys that are in my list
# need to check if more are found by synonym
con = connect.db(uname, pword, host = hostname, dbname = "annot_gene")
bot5_survival_synonyms <- getAllSynonyms.db(con, bot5_survival_corr_norm_low_goi)[, 
    "synonym"]
dbDisconnect(con)

## [1] TRUE


temp_exprs <- mtab4122GEM$expr[which(mtab4122GEM$keys %in% bot5_survival_synonyms), 
    ]
temp_survival_means <- apply(temp_exprs, 2, geomMean)
temp_survival_roc <- roc(mtab4122GEM$class, temp_survival_means, ci = T)
plot.roc(temp_survival_roc, print.auc = T, main = "Top 4 anti-correlated with survival found in GEO-4122 cohort")

plot of chunk unnamed-chunk-10

## 
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp_survival_means,     ci = T)
## 
## Data: temp_survival_means in 32 controls (mtab4122GEM$class 0) < 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.799
## 95% CI: 0.691-0.907 (DeLong)