I am going to first calculate AUC for the geometric means of OVA1 genes and CA125/HE4, then compare to my top 13 genes, top5, top5 between normal/low stage, top5 between normal/low with most correlated with survival.
AUCs In TCGA cohort - MUC16 & HE4: .610; Ova1: .874; 4 genes from Ova1: .859; top 5 genes: .954
AUCs In 4122 cohort - 4 genes from Ova1: .671; 3 of top 5 correlated with survival: .922; 5 of top 10 between norm/low: .883
setwd("/Users/lszabo/Documents/RProjects/ButteLab/biomarkers")
require(pROC)
## Loading required package: pROC
## Loading required package: plyr
## Type 'citation("pROC")' for a citation.
## Attaching package: 'pROC'
## The following object(s) are masked from 'package:stats':
##
## cov, smooth, var
source("db_functions.R")
source("validation_sets/validate_functions.R")
## Loading required package: DBI
## Loading required package: RMySQL
load("validation_sets/TCGA/gene_expm.save")
load("validation_sets/TCGA/classes.save")
load("tops.save")
load("standard_of_care.save")
load("ova1.save")
Standard of Care CA-125 + HE4:
standard_means <- apply(gene_expm[, standard_of_care], 1, geomMean)
standard_roc <- roc(classes, standard_means, ci = T)
plot.roc(standard_roc, print.auc = T, main = "CA-125 and HE4 (TCGA cohort)")
##
## Call:
## roc.default(response = classes, predictor = standard_means, ci = T)
##
## Data: standard_means in 8 controls (classes 0) < 586 cases (classes 1).
## Area under the curve: 0.61
## 95% CI: 0.411-0.81 (DeLong)
Standard of Care: Ova1 panel of 5 genes:
ova1_means <- apply(gene_expm[, ova1], 1, geomMean)
ova1_roc <- roc(classes, ova1_means, ci = T)
plot.roc(ova1_roc, print.auc = T, main = "Ova1 panel (TCGA cohort)")
##
## Call:
## roc.default(response = classes, predictor = ova1_means, ci = T)
##
## Data: ova1_means in 8 controls (classes 0) < 586 cases (classes 1).
## Area under the curve: 0.874
## 95% CI: 0.818-0.93 (DeLong)
Top 5 of my predicted genes
top5_means <- apply(gene_expm[, tops[1:5]], 1, geomMean)
top5_roc <- roc(classes, top5_means, ci = T)
plot.roc(top5_roc, print.auc = T, main = "My top 5 genes (TCGA cohort)")
##
## Call:
## roc.default(response = classes, predictor = top5_means, ci = T)
##
## Data: top5_means in 8 controls (classes 0) < 586 cases (classes 1).
## Area under the curve: 0.954
## 95% CI: 0.93-0.979 (DeLong)
Top 13 of my predicted genes
top13_means <- apply(gene_expm[, tops[1:13]], 1, geomMean)
top13_roc <- roc(classes, top13_means, ci = T)
plot.roc(top13_roc, print.auc = T, main = "My top 13 genes (TCGA cohort)")
##
## Call:
## roc.default(response = classes, predictor = top13_means, ci = T)
##
## Data: top13_means in 8 controls (classes 0) < 586 cases (classes 1).
## Area under the curve: 0.985
## 95% CI: 0.975-0.996 (DeLong)
Top 5 between normal/low stage:
This ROC is created using the 8 normal samples and xx stage III/IV samples in TCGA. This is not ideal since I am still using the same 8 controls I used to generate the list of 80 significant genes, but ok for a first pass and I will validate in another independent cohort.
# 80 gene symbols of interest between normal/low
load(file = "validation_sets/TCGA/output/norm_low_goi.save")
load(file = "validation_sets/TCGA/stage_expm.save") #expression data only for those we have stage data for
load(file = "validation_sets/TCGA/use_stage_grouped.save") # 0, 1, or 2 for normal, early, late
norm_high_stage_expm <- stage_expm[which(use_stage_grouped != 1), ] #533 late stage, 8 normal
norm_high_classes <- use_stage_grouped[which(use_stage_grouped != 1)] #0s and 2s
norm_high_classes[norm_high_classes == 2] <- 1 #change to 0s and 1s for plotting
# now finally ready to plot
top5_norm_low_means <- apply(norm_high_stage_expm[, norm_low_goi[1:5]], 1, geomMean)
top5_norm_low_roc <- roc(norm_high_classes, top5_norm_low_means, ci = T)
plot.roc(top5_norm_low_roc, print.auc = T, main = "Top 5 between normal/low stage")
##
## Call:
## roc.default(response = norm_high_classes, predictor = top5_norm_low_means, ci = T)
##
## Data: top5_norm_low_means in 8 controls (norm_high_classes 0) < 533 cases (norm_high_classes 1).
## Area under the curve: 1
## 95% CI: 0.998-1 (DeLong)
Top 5 between normal/low also correlated with survival:
This ROC is created using the 8 normal samples and xx stage III/IV samples in TCGA. This is not ideal since I am still using the same 8 controls I used to generate the list of 80 significant genes, but ok for a first pass and I will validate in another independent cohort.
load("validation_sets/TCGA/survival_corr_norm_low_goi_sorted.save")
top5_survival_genes <- norm_low_goi[survival_corr_norm_low_goi_sorted$ix[1:5]] #gene symbols
top5_survival_means <- apply(norm_high_stage_expm[, top5_survival_genes], 1,
geomMean)
top5_survival_roc <- roc(norm_high_classes, top5_survival_means, ci = T)
plot.roc(top5_survival_roc, print.auc = T, main = "Top 5 correlated with survival also up between normal/low stage")
##
## Call:
## roc.default(response = norm_high_classes, predictor = top5_survival_means, ci = T)
##
## Data: top5_survival_means in 8 controls (norm_high_classes 0) < 533 cases (norm_high_classes 1).
## Area under the curve: 0.998
## 95% CI: 0.995-1 (DeLong)
Now I need to actually get an AUC for my top 5 genes in a completely independent dataset since my top 5 were picked based on what separated the normals and low stage in the TCGA dataset. Will use the 4122 dataset which was previously prepped for the meta-analysis but not used.
load("Ovarian_Cancer_Datasets.RData")
temp <- mtab4122GEM$keys[which(mtab4122GEM$keys %in% top5_survival_genes)] #get the keys that are in my list
# only 3 matched, need to expand symbols to see if I can get the other 2
con = connect.db(uname, pword, host = hostname, dbname = "annot_gene")
top5_survival_synonyms <- getAllSynonyms.db(con, top5_survival_genes)[, "synonym"]
dbDisconnect(con)
## [1] TRUE
# didn't help, only got TMPRSS3 with is synonmy for TMPRSS4 which I
# already have
mtab4122GEM$keys[which(mtab4122GEM$keys %in% top5_survival_synonyms)] #get the keys that are in my list
## 201598_s_at 204086_at 218960_at 220177_s_at
## "INPPL1" "PRAME" "TMPRSS4" "TMPRSS3"
# so let's see what we get with these top 3
temp_exprs <- mtab4122GEM$expr[which(mtab4122GEM$keys %in% top5_survival_genes),
]
temp_survival_means <- apply(temp_exprs, 2, geomMean)
temp_survival_roc <- roc(mtab4122GEM$class, temp_survival_means, ci = T)
plot.roc(temp_survival_roc, print.auc = T, main = "Top 3 correlated with survival found in GEO-4122 cohort")
##
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp_survival_means, ci = T)
##
## Data: temp_survival_means in 32 controls (mtab4122GEM$class 0) < 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.922
## 95% CI: 0.859-0.985 (DeLong)
# and how does ova1 do in this data set (MUC16 not included since not
# measured)?
temp_ova1 <- mtab4122GEM$keys[which(mtab4122GEM$keys %in% ova1)] #get the keys that are in ova1
temp_ova1_exprs <- mtab4122GEM$expr[which(mtab4122GEM$keys %in% ova1), ]
temp_ova1_means <- apply(temp_ova1_exprs, 2, geomMean)
temp_ova1_roc <- roc(mtab4122GEM$class, temp_ova1_means, ci = T)
plot.roc(temp_ova1_roc, print.auc = T, main = "4 Ova1 genes found in 4122")
##
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp_ova1_means, ci = T)
##
## Data: temp_ova1_means in 32 controls (mtab4122GEM$class 0) > 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.671
## 95% CI: 0.537-0.805 (DeLong)
# how would removing MUC16 change the results in the TCGA dataset?
ova1_nomuc16_means <- apply(gene_expm[, ova1[-1]], 1, geomMean)
ova1_nomuc16_roc <- roc(classes, ova1_nomuc16_means, ci = T)
plot.roc(ova1_nomuc16_roc, print.auc = T, main = "Ova1 panel without MUC16 (TCGA cohort)")
##
## Call:
## roc.default(response = classes, predictor = ova1_nomuc16_means, ci = T)
##
## Data: ova1_nomuc16_means in 8 controls (classes 0) < 586 cases (classes 1).
## Area under the curve: 0.859
## 95% CI: 0.793-0.926 (DeLong)
# let's see if I can get a few more of my top survival genes actually in
# the 4122 dataset
top10_survival_genes <- norm_low_goi[survival_corr_norm_low_goi_sorted$ix[1:10]] #gene symbols
mtab4122GEM$keys[which(mtab4122GEM$keys %in% top10_survival_genes)] #get the keys that are in my list
## 201598_s_at 204086_at 217912_at 218960_at
## "INPPL1" "PRAME" "DUS1L" "TMPRSS4"
# only 4 of the top 10, let's try those
temp4_exprs <- mtab4122GEM$expr[which(mtab4122GEM$keys %in% top10_survival_genes),
]
temp4_survival_means <- apply(temp4_exprs, 2, geomMean)
temp4_survival_roc <- roc(mtab4122GEM$class, temp4_survival_means, ci = T)
plot.roc(temp4_survival_roc, print.auc = T, main = "Top 4 correlated with survival found in GEO-4122 cohort")
##
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp4_survival_means, ci = T)
##
## Data: temp4_survival_means in 32 controls (mtab4122GEM$class 0) < 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.922
## 95% CI: 0.858-0.986 (DeLong)
# what about the top genes between norm and low?
mtab4122GEM$keys[which(mtab4122GEM$keys %in% norm_low_goi[1:10])] #5 of top 10 are in 4122
## 38158_at 201664_at 203276_at 204086_at 205339_at
## "ESPL1" "SMC4" "LMNB1" "PRAME" "STIL"
temp5_in4122_exprs <- mtab4122GEM$expr[which(mtab4122GEM$keys %in% norm_low_goi[1:10]),
]
temp5_in4122_norm_low_means <- apply(temp5_in4122_exprs, 2, geomMean)
temp5_in4122_norm_low_roc <- roc(mtab4122GEM$class, temp5_in4122_norm_low_means,
ci = T)
plot.roc(temp5_in4122_norm_low_roc, print.auc = T, main = "5 in top 10 between normal/low stage found in GEO-4122 cohort")
##
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp5_in4122_norm_low_means, ci = T)
##
## Data: temp5_in4122_norm_low_means in 32 controls (mtab4122GEM$class 0) < 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.883
## 95% CI: 0.786-0.979 (DeLong)
Figure for poster
plot.roc(temp_survival_roc)
##
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp_survival_means, ci = T)
##
## Data: temp_survival_means in 32 controls (mtab4122GEM$class 0) < 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.922
## 95% CI: 0.859-0.985 (DeLong)
plot.roc(temp_ova1_roc)
##
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp_ova1_means, ci = T)
##
## Data: temp_ova1_means in 32 controls (mtab4122GEM$class 0) > 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.671
## 95% CI: 0.537-0.805 (DeLong)
Try genes up in early stage and most negatively correlated with survival. These perform worse than the genes most positively correlated with survival.
load("validation_sets/TCGA/survival_corr_norm_low_goi_sorted.save")
tail(survival_corr_norm_low_goi_sorted$x)
## [1] -0.1042 -0.1132 -0.1177 -0.1411 -0.1753 -0.2301
bot5_survival_corr_norm_low_goi <- norm_low_goi[survival_corr_norm_low_goi_sorted$ix[76:80]]
bot5_survival_corr_norm_low_goi
## [1] "POLQ" "CCNE1" "C1orf38" "MX2" "C1orf135"
temp <- mtab4122GEM$keys[which(mtab4122GEM$keys %in% bot5_survival_corr_norm_low_goi)] #get the keys that are in my list
# need to check if more are found by synonym
con = connect.db(uname, pword, host = hostname, dbname = "annot_gene")
bot5_survival_synonyms <- getAllSynonyms.db(con, bot5_survival_corr_norm_low_goi)[,
"synonym"]
dbDisconnect(con)
## [1] TRUE
temp_exprs <- mtab4122GEM$expr[which(mtab4122GEM$keys %in% bot5_survival_synonyms),
]
temp_survival_means <- apply(temp_exprs, 2, geomMean)
temp_survival_roc <- roc(mtab4122GEM$class, temp_survival_means, ci = T)
plot.roc(temp_survival_roc, print.auc = T, main = "Top 4 anti-correlated with survival found in GEO-4122 cohort")
##
## Call:
## roc.default(response = mtab4122GEM$class, predictor = temp_survival_means, ci = T)
##
## Data: temp_survival_means in 32 controls (mtab4122GEM$class 0) < 32 cases (mtab4122GEM$class 1).
## Area under the curve: 0.799
## 95% CI: 0.691-0.907 (DeLong)