1 Needed libraries

library(curatedOvarianData)
library(logging)
library(affy)
library(survival)
library(metafor)

2 Sample selection

Following the vignette of curatedOvarianData, this code chunk creates a list of ExpressionSets, containing all patients who are annotated for censored overall survival (days_to_death and vital_status). The arguments in patientselection.config also result in keeping only genes present in all studies, and in scaling these to unit variance before sample filtering.

source(system.file("extdata", "patientselection.config",package="curatedOvarianData"))
keep.common.only <- TRUE
source(system.file("extdata", "createEsetList.R", package = "curatedOvarianData"))
## INFO [2015-03-21 19:37:22] Inside script createEsetList.R - inputArgs =
## INFO [2015-03-21 19:38:17] including E.MTAB.386_eset
## INFO [2015-03-21 19:38:18] excluding GSE12418_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:18] excluding GSE12470_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:20] including GSE13876_eset
## INFO [2015-03-21 19:38:20] including GSE14764_eset
## INFO [2015-03-21 19:38:21] including GSE17260_eset
## INFO [2015-03-21 19:38:21] including GSE18520_eset
## INFO [2015-03-21 19:38:22] excluding GSE19829.GPL570_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:22] including GSE19829.GPL8300_eset
## INFO [2015-03-21 19:38:23] excluding GSE20565_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:24] excluding GSE2109_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:25] including GSE26193_eset
## INFO [2015-03-21 19:38:26] including GSE26712_eset
## INFO [2015-03-21 19:38:26] excluding GSE30009_eset (min.number.of.genes)
## INFO [2015-03-21 19:38:27] including GSE30161_eset
## INFO [2015-03-21 19:38:28] including GSE32062.GPL6480_eset
## INFO [2015-03-21 19:38:28] excluding GSE32063_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:29] excluding GSE44104_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:30] including GSE49997_eset
## INFO [2015-03-21 19:38:30] including GSE51088_eset
## INFO [2015-03-21 19:38:31] excluding GSE6008_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:31] excluding GSE6822_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:31] excluding GSE8842_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:33] including GSE9891_eset
## INFO [2015-03-21 19:38:33] excluding PMID15897565_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:34] including PMID17290060_eset
## INFO [2015-03-21 19:38:34] excluding PMID19318476_eset (min.number.of.events or min.sample.size)
## INFO [2015-03-21 19:38:36] including TCGA_eset
## INFO [2015-03-21 19:38:36] excluding TCGA.mirna.8x15kv2_eset (min.number.of.genes)
## INFO [2015-03-21 19:38:38] including TCGA.RNASeqV2_eset
## INFO [2015-03-21 19:38:38] Ids with missing data: GSE51088_eset
esets <- esets[!grepl("TCGA.RNASeqV2_eset", names(esets))]

3 A useful meta-analysis function

This function is copied from the curatedOvarianData vignette, with an added argument plot=TRUE to allow use of the function without creating a forest plot. Here we use this to perform meta-analysis on all genes in a loop.

forestplot <- function(esets, y="y", probeset, formula=y~probeset,
mlab="Overall", rma.method="FE", at=NULL,xlab="Hazard Ratio", plot=TRUE, ...) {
    require(metafor)
    esets <- esets[sapply(esets, function(x) probeset %in% featureNames(x))]
    coefs <- sapply(1:length(esets), function(i) {
        tmp   <- as(phenoData(esets[[i]]), "data.frame")
        tmp$y <- esets[[i]][[y]]
        tmp$probeset <- exprs(esets[[i]])[probeset,]
        summary(coxph(formula,data=tmp))$coefficients[1,c(1,3)]
    })  

    res.rma <- metafor::rma(yi = coefs[1,], sei = coefs[2,], 
        method=rma.method)

    if (is.null(at)) at <- log(c(0.25,1,4,20))
    if(plot){
      forest.rma(res.rma, xlab=xlab, slab=gsub("_eset$","",names(esets)),
      atransf=exp, at=at, mlab=mlab,...)
    }
    return(res.rma)
}

4 Perform meta-analysis on all genes

First using a fixed-effects model:

fe.metafor <- lapply(featureNames(esets[[1]]), function(probeset){
  forestplot(esets, y="y", probeset=probeset, plot=FALSE)
})
## Warning in fitter(X, Y, strats, offset, init, control, weights = weights,
## : Loglik converged before variable 1 ; beta may be infinite.
names(fe.metafor) <- featureNames(esets[[1]])

5 Then using a random-effects model

Note the use of try() here because the random-effects fit does not converge for some genes.

reml.metafor <- lapply(featureNames(esets[[1]]), function(probeset){
  try(forestplot(esets, y="y", probeset=probeset, rma.method="REML", control=list(maxiter=1000, stepadj=0.5), plot=FALSE))
})
## Warning in fitter(X, Y, strats, offset, init, control, weights = weights,
## : Loglik converged before variable 1 ; beta may be infinite.
names(reml.metafor) <- featureNames(esets[[1]])

We now remove from the analysis any genes for which the random-effects fit did not converge.

idx.noconv <- sapply(reml.metafor, function(x) is(x, "try-error"))
reml.metafor <- reml.metafor[!idx.noconv]
fe.metafor <- fe.metafor[!idx.noconv]

Extract the Q-test p-value, and the Cox model p-values synthesized by fixed-effects and random-effects models.

QEp <- sapply(fe.metafor, function(one.metafor) one.metafor$QEp)
#QE <- sapply(fe.metafor, function(one.metafor) one.metafor$QE)
coxp.fe <- sapply(fe.metafor, function(one.metafor) one.metafor$pval)
coxp.re <- sapply(reml.metafor, function(one.metafor) one.metafor$pval)

6 All p-values from Cochrane’s Q test:

hist(QEp, main="p-value from Cochrane's Q test", xlab="P-value")

7 Forest plot for top survival-associated gene

First note that the top-ranked gene is the same whether using fixed-effects or random-effects meta-analysis:

(idx.fe <- which.min(coxp.fe))
## NUAK1 
##  1713
(idx.re <- which.min(coxp.re))
## NUAK1 
##  1713

The synthesized result is identical by random or fixed-effects meta-analysis:

probesets <- names(fe.metafor)
forest.rma(reml.metafor[[idx.re]], slab=sub("_eset", "", names(esets)), 
                      main=paste("Top-ranked gene for survival association:", probesets[idx.re],
                      "\n Q-test p-value =", signif(QEp[idx.re], 2)))
addpoly(fe.metafor[[idx.re]], annotate=FALSE, col="grey")

8 Analysis of gene with greatest evidence of heterogeneity

8.1 Forest plot

idx.het <- which.min(QEp)
forest.rma(reml.metafor[[idx.het]], slab=sub("_eset", "", names(esets)), 
                      main=paste("Gene with maximum heterogeneity:", probesets[idx.het],
                      "\n Q-test p-value =", signif(QEp[idx.het], 2)))
addpoly(fe.metafor[[idx.het]], annotate=FALSE, col="grey")

8.2 Association between Cox coefficient and covariates

First we create a dataframe containing fractions of patients who are suboptimally debulked, serous subtype, high-grade, late-stage, and over 70 years of age.

covars <- sapply(esets, function(eset){
  output <- c(sum(eset$debulking == "suboptimal", na.rm=TRUE) / sum(!is.na(eset$debulking)), 
  sum(eset$histological_type == "ser", na.rm=TRUE) / sum(!is.na(eset$histological_type)), 
  sum(eset$summarygrade == "high", na.rm=TRUE) / sum(!is.na(eset$summarygrade)),
  sum(eset$summarystage == "late", na.rm=TRUE) / sum(!is.na(eset$summarystage)),
  sum(eset$age > 70, na.rm=TRUE)/sum(!is.na(eset$age)))
  names(output) <- c("suboptimal", "serous", "highgrade", "latestage", "old")
  return(output)
})
covars <- t(covars)
covars[is.nan(covars)] <- NA
covars <- data.frame(covars)

Add the Cox regression coefficients (log HR) and sample size:

covars$estimates <- fe.metafor[[idx.het]]$yi
covars$n <- sapply(esets, ncol)

P-values from simple linear regression between the proportion of each covariate and the Cox coefficient:

covars.assocs <- sapply(1:5, function(i){
  dat <- covars[, c(i, 6, 7)]
  dat <- dat[complete.cases(dat), ]
  anova(lm(dat[, 2] ~ dat[, 1], weights=dat$n))[["Pr(>F)"]][1]
  })

Create a dataframe with just the covariate whose proportion is most associated with Cox coefficient. Do a linear regression with points weighted by the sample size of the study.

dat <- covars[, c(which.min(covars.assocs), 6:7)]
dat <- dat[complete.cases(dat), ]
my.lm <- lm(dat[, 2] ~ dat[, 1], weights=dat$n)

Finally, plot the association between the proportion of sub-optimally debulked patients in each dataset against the Cox regression coefficient for that dataset. Points are sized with cex=sqrt(n) so that the area of the circles is proportional to sample size.

plot(dat[, 1:2], cex=sqrt(dat$n / sum(dat$n)) * 5,
     xlab=paste0("% ", colnames(covars)[which.min(covars.assocs)], "ly debulked"), 
     ylab=paste(probesets[idx.het], "Cox coefficient"))
abline(my.lm)
legend("topleft", legend=paste("P =", signif(covars.assocs[which.min(covars.assocs)], 1)), bty="n")

9 Session Info

library(devtools)
session_info()
## Session info --------------------------------------------------------------
##  setting  value                       
##  version  R version 3.1.2 (2014-10-31)
##  system   x86_64, darwin13.4.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       America/New_York
## Packages ------------------------------------------------------------------
##  package            * version date       source        
##  affy                 1.44.0  2014-10-14 Bioconductor  
##  affyio             * 1.34.0  2014-10-14 Bioconductor  
##  Biobase              2.26.0  2014-10-14 Bioconductor  
##  BiocGenerics         0.12.1  2014-11-14 Bioconductor  
##  BiocInstaller      * 1.16.2  2015-03-15 Bioconductor  
##  curatedOvarianData   1.3.4   2014-10-22 Bioconductor  
##  devtools             1.7.0   2015-01-17 CRAN (R 3.1.2)
##  digest             * 0.6.8   2014-12-31 CRAN (R 3.1.2)
##  evaluate           * 0.5.5   2014-04-29 CRAN (R 3.1.0)
##  formatR            * 1.0     2014-08-25 CRAN (R 3.1.1)
##  htmltools          * 0.2.6   2014-09-08 CRAN (R 3.1.1)
##  knitr              * 1.9     2015-01-20 CRAN (R 3.1.2)
##  lattice            * 0.20-29 2014-04-04 CRAN (R 3.1.2)
##  logging              0.7-103 2013-04-12 CRAN (R 3.1.2)
##  Matrix             * 1.1-5   2015-01-18 CRAN (R 3.1.2)
##  metafor              1.9-5   2014-11-24 CRAN (R 3.1.2)
##  preprocessCore     * 1.28.0  2014-10-14 Bioconductor  
##  rmarkdown          * 0.5.1   2015-01-26 CRAN (R 3.1.2)
##  rstudioapi         * 0.2     2014-12-31 CRAN (R 3.1.2)
##  stringr            * 0.6.2   2012-12-06 CRAN (R 3.1.0)
##  survival             2.37-7  2014-01-22 CRAN (R 3.1.2)
##  yaml               * 2.1.13  2014-06-12 CRAN (R 3.1.0)
##  zlibbioc           * 1.12.0  2014-10-14 Bioconductor