library(curatedOvarianData)
library(logging)
library(affy)
library(survival)
library(metafor)
Following the vignette of curatedOvarianData, this code chunk creates a list of ExpressionSets, containing all patients who are annotated for censored overall survival (days_to_death and vital_status). The arguments in patientselection.config also result in keeping only genes present in all studies, and in scaling these to unit variance before sample filtering.
source(system.file("extdata", "patientselection.config",package="curatedOvarianData"))
keep.common.only <- TRUE
source(system.file("extdata", "createEsetList.R", package = "curatedOvarianData"))
## INFO [2015-03-21 19:37:22] Inside script createEsetList.R - inputArgs =
## INFO [2015-03-21 19:38:17] including E.MTAB.386_eset
## INFO [2015-03-21 19:38:18] excluding GSE12418_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:18] excluding GSE12470_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:20] including GSE13876_eset
## INFO [2015-03-21 19:38:20] including GSE14764_eset
## INFO [2015-03-21 19:38:21] including GSE17260_eset
## INFO [2015-03-21 19:38:21] including GSE18520_eset
## INFO [2015-03-21 19:38:22] excluding GSE19829.GPL570_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:22] including GSE19829.GPL8300_eset
## INFO [2015-03-21 19:38:23] excluding GSE20565_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:24] excluding GSE2109_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:25] including GSE26193_eset
## INFO [2015-03-21 19:38:26] including GSE26712_eset
## INFO [2015-03-21 19:38:26] excluding GSE30009_eset (min.number.of.genes)
## INFO [2015-03-21 19:38:27] including GSE30161_eset
## INFO [2015-03-21 19:38:28] including GSE32062.GPL6480_eset
## INFO [2015-03-21 19:38:28] excluding GSE32063_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:29] excluding GSE44104_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:30] including GSE49997_eset
## INFO [2015-03-21 19:38:30] including GSE51088_eset
## INFO [2015-03-21 19:38:31] excluding GSE6008_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:31] excluding GSE6822_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:31] excluding GSE8842_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:33] including GSE9891_eset
## INFO [2015-03-21 19:38:33] excluding PMID15897565_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:34] including PMID17290060_eset
## INFO [2015-03-21 19:38:34] excluding PMID19318476_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:36] including TCGA_eset
## INFO [2015-03-21 19:38:36] excluding TCGA.mirna.8x15kv2_eset (min.number.of.genes)
## INFO [2015-03-21 19:38:38] including TCGA.RNASeqV2_eset
## INFO [2015-03-21 19:38:38] Ids with missing data: GSE51088_eset
esets <- esets[!grepl("TCGA.RNASeqV2_eset", names(esets))]
This function is copied from the curatedOvarianData vignette, with an added argument plot=TRUE to allow use of the function without creating a forest plot. Here we use this to perform meta-analysis on all genes in a loop.
forestplot <- function(esets, y="y", probeset, formula=y~probeset,
mlab="Overall", rma.method="FE", at=NULL,xlab="Hazard Ratio", plot=TRUE, ...) {
require(metafor)
esets <- esets[sapply(esets, function(x) probeset %in% featureNames(x))]
coefs <- sapply(1:length(esets), function(i) {
tmp <- as(phenoData(esets[[i]]), "data.frame")
tmp$y <- esets[[i]][[y]]
tmp$probeset <- exprs(esets[[i]])[probeset,]
summary(coxph(formula,data=tmp))$coefficients[1,c(1,3)]
})
res.rma <- metafor::rma(yi = coefs[1,], sei = coefs[2,],
method=rma.method)
if (is.null(at)) at <- log(c(0.25,1,4,20))
if(plot){
forest.rma(res.rma, xlab=xlab, slab=gsub("_eset$","",names(esets)),
atransf=exp, at=at, mlab=mlab,...)
}
return(res.rma)
}
First using a fixed-effects model:
fe.metafor <- lapply(featureNames(esets[[1]]), function(probeset){
forestplot(esets, y="y", probeset=probeset, plot=FALSE)
})
## Warning in fitter(X, Y, strats, offset, init, control, weights = weights,
## : Loglik converged before variable 1 ; beta may be infinite.
names(fe.metafor) <- featureNames(esets[[1]])
Note the use of try() here because the random-effects fit does not converge for some genes.
reml.metafor <- lapply(featureNames(esets[[1]]), function(probeset){
try(forestplot(esets, y="y", probeset=probeset, rma.method="REML", control=list(maxiter=1000, stepadj=0.5), plot=FALSE))
})
## Warning in fitter(X, Y, strats, offset, init, control, weights = weights,
## : Loglik converged before variable 1 ; beta may be infinite.
names(reml.metafor) <- featureNames(esets[[1]])
We now remove from the analysis any genes for which the random-effects fit did not converge.
idx.noconv <- sapply(reml.metafor, function(x) is(x, "try-error"))
reml.metafor <- reml.metafor[!idx.noconv]
fe.metafor <- fe.metafor[!idx.noconv]
Extract the Q-test p-value, and the Cox model p-values synthesized by fixed-effects and random-effects models.
QEp <- sapply(fe.metafor, function(one.metafor) one.metafor$QEp)
#QE <- sapply(fe.metafor, function(one.metafor) one.metafor$QE)
coxp.fe <- sapply(fe.metafor, function(one.metafor) one.metafor$pval)
coxp.re <- sapply(reml.metafor, function(one.metafor) one.metafor$pval)
hist(QEp, main="p-value from Cochrane's Q test", xlab="P-value")
First note that the top-ranked gene is the same whether using fixed-effects or random-effects meta-analysis:
(idx.fe <- which.min(coxp.fe))
## NUAK1
## 1713
(idx.re <- which.min(coxp.re))
## NUAK1
## 1713
The synthesized result is identical by random or fixed-effects meta-analysis:
probesets <- names(fe.metafor)
forest.rma(reml.metafor[[idx.re]], slab=sub("_eset", "", names(esets)),
main=paste("Top-ranked gene for survival association:", probesets[idx.re],
"\n Q-test p-value =", signif(QEp[idx.re], 2)))
addpoly(fe.metafor[[idx.re]], annotate=FALSE, col="grey")
idx.het <- which.min(QEp)
forest.rma(reml.metafor[[idx.het]], slab=sub("_eset", "", names(esets)),
main=paste("Gene with maximum heterogeneity:", probesets[idx.het],
"\n Q-test p-value =", signif(QEp[idx.het], 2)))
addpoly(fe.metafor[[idx.het]], annotate=FALSE, col="grey")
First we create a dataframe containing fractions of patients who are suboptimally debulked, serous subtype, high-grade, late-stage, and over 70 years of age.
covars <- sapply(esets, function(eset){
output <- c(sum(eset$debulking == "suboptimal", na.rm=TRUE) / sum(!is.na(eset$debulking)),
sum(eset$histological_type == "ser", na.rm=TRUE) / sum(!is.na(eset$histological_type)),
sum(eset$summarygrade == "high", na.rm=TRUE) / sum(!is.na(eset$summarygrade)),
sum(eset$summarystage == "late", na.rm=TRUE) / sum(!is.na(eset$summarystage)),
sum(eset$age > 70, na.rm=TRUE)/sum(!is.na(eset$age)))
names(output) <- c("suboptimal", "serous", "highgrade", "latestage", "old")
return(output)
})
covars <- t(covars)
covars[is.nan(covars)] <- NA
covars <- data.frame(covars)
Add the Cox regression coefficients (log HR) and sample size:
covars$estimates <- fe.metafor[[idx.het]]$yi
covars$n <- sapply(esets, ncol)
P-values from simple linear regression between the proportion of each covariate and the Cox coefficient:
covars.assocs <- sapply(1:5, function(i){
dat <- covars[, c(i, 6, 7)]
dat <- dat[complete.cases(dat), ]
anova(lm(dat[, 2] ~ dat[, 1], weights=dat$n))[["Pr(>F)"]][1]
})
Create a dataframe with just the covariate whose proportion is most associated with Cox coefficient. Do a linear regression with points weighted by the sample size of the study.
dat <- covars[, c(which.min(covars.assocs), 6:7)]
dat <- dat[complete.cases(dat), ]
my.lm <- lm(dat[, 2] ~ dat[, 1], weights=dat$n)
Finally, plot the association between the proportion of sub-optimally debulked patients in each dataset against the Cox regression coefficient for that dataset. Points are sized with cex=sqrt(n) so that the area of the circles is proportional to sample size.
plot(dat[, 1:2], cex=sqrt(dat$n / sum(dat$n)) * 5,
xlab=paste0("% ", colnames(covars)[which.min(covars.assocs)], "ly debulked"),
ylab=paste(probesets[idx.het], "Cox coefficient"))
abline(my.lm)
legend("topleft", legend=paste("P =", signif(covars.assocs[which.min(covars.assocs)], 1)), bty="n")
library(devtools)
session_info()
## Session info --------------------------------------------------------------
## setting value
## version R version 3.1.2 (2014-10-31)
## system x86_64, darwin13.4.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz America/New_York
## Packages ------------------------------------------------------------------
## package * version date source
## affy 1.44.0 2014-10-14 Bioconductor
## affyio * 1.34.0 2014-10-14 Bioconductor
## Biobase 2.26.0 2014-10-14 Bioconductor
## BiocGenerics 0.12.1 2014-11-14 Bioconductor
## BiocInstaller * 1.16.2 2015-03-15 Bioconductor
## curatedOvarianData 1.3.4 2014-10-22 Bioconductor
## devtools 1.7.0 2015-01-17 CRAN (R 3.1.2)
## digest * 0.6.8 2014-12-31 CRAN (R 3.1.2)
## evaluate * 0.5.5 2014-04-29 CRAN (R 3.1.0)
## formatR * 1.0 2014-08-25 CRAN (R 3.1.1)
## htmltools * 0.2.6 2014-09-08 CRAN (R 3.1.1)
## knitr * 1.9 2015-01-20 CRAN (R 3.1.2)
## lattice * 0.20-29 2014-04-04 CRAN (R 3.1.2)
## logging 0.7-103 2013-04-12 CRAN (R 3.1.2)
## Matrix * 1.1-5 2015-01-18 CRAN (R 3.1.2)
## metafor 1.9-5 2014-11-24 CRAN (R 3.1.2)
## preprocessCore * 1.28.0 2014-10-14 Bioconductor
## rmarkdown * 0.5.1 2015-01-26 CRAN (R 3.1.2)
## rstudioapi * 0.2 2014-12-31 CRAN (R 3.1.2)
## stringr * 0.6.2 2012-12-06 CRAN (R 3.1.0)
## survival 2.37-7 2014-01-22 CRAN (R 3.1.2)
## yaml * 2.1.13 2014-06-12 CRAN (R 3.1.0)
## zlibbioc * 1.12.0 2014-10-14 Bioconductor