Applies meta-analysis to find LTB gene signature

Andrew Bo Liu

2018-05-15

My goal is to find a blood-based gene expression signature that distinguishes people with latent TB from those who are healthy or have active TB. This analysis uses TB\_human\_datasets\_04\_2018 from data/TB\_human\_datasets\_04\_2018.RData.

Following the typical MetaIntegrator meta-analysis workflow, I first choose 6 discovery datasets, which are identified by their GSE numbers, and created lVsAHDiscovery in which to store these:

library(magrittr)
library(purrr)
#> 
#> Attaching package: 'purrr'
#> The following object is masked from 'package:magrittr':
#> 
#>     set_names
library(glue)
devtools::load_all()
#> Loading tb
load(here::here("data", "TB_human_datasets_04_2018.RData"))
discoveryGSEIDs = c("GSE19491", "GSE28623", "GSE54992", "GSE37250", "GSE39939.noCultureNeg", "GSE39940")
case_classes = c("Latent TB")
control_classes = c("Active TB", "Healthy")
discoveryGSEs = lapply(TB_human_datasets_04_2018[discoveryGSEIDs], function(gse) {
    filteredGSE = .filterGSE(gse, classesOfInterest=c(case_classes, control_classes))
    filteredGSEWithClass = .addClassVec(filteredGSE, caseClasses=case_classes)
    filteredGSEWithClass
})
# Only keep GSEs which have >= 2 cases and >= 2 controls, for effect size (Hedges' g) to be well-defined.
discoveryGSEs = discoveryGSEs %>%
  keep(function(gse) sum(gse$class) > 1 && sum(gse$class) < length(gse$class)-1)
lVsAHDiscovery = MetaIntegrator::runMetaAnalysis(list(originalData=discoveryGSEs))
#> Setting options('download.file.method.GEOquery'='auto')
#> Setting options('GEOquery.inmemory.gpl'=FALSE)
#> Warning: replacing previous import 'data.table::first' by 'dplyr::first'
#> when loading 'MetaIntegrator'
#> Warning: replacing previous import 'data.table::between' by
#> 'dplyr::between' when loading 'MetaIntegrator'
#> Warning: replacing previous import 'data.table::last' by 'dplyr::last' when
#> loading 'MetaIntegrator'
#> Warning: replacing previous import 'Biobase::combine' by 'dplyr::combine'
#> when loading 'MetaIntegrator'
#> Warning: replacing previous import 'dplyr::failwith' by 'plyr::failwith'
#> when loading 'MetaIntegrator'
#> Warning: replacing previous import 'dplyr::id' by 'plyr::id' when loading
#> 'MetaIntegrator'
#> Warning: replacing previous import 'dplyr::summarize' by 'plyr::summarize'
#> when loading 'MetaIntegrator'
#> Warning: replacing previous import 'dplyr::count' by 'plyr::count' when
#> loading 'MetaIntegrator'
#> Warning: replacing previous import 'dplyr::desc' by 'plyr::desc' when
#> loading 'MetaIntegrator'
#> Warning: replacing previous import 'dplyr::mutate' by 'plyr::mutate' when
#> loading 'MetaIntegrator'
#> Warning: replacing previous import 'dplyr::arrange' by 'plyr::arrange' when
#> loading 'MetaIntegrator'
#> Warning: replacing previous import 'dplyr::rename' by 'plyr::rename' when
#> loading 'MetaIntegrator'
#> Warning: replacing previous import 'dplyr::summarise' by 'plyr::summarise'
#> when loading 'MetaIntegrator'
#> Warning: replacing previous import 'plyr::rename' by 'reshape::rename' when
#> loading 'MetaIntegrator'
#> Warning: replacing previous import 'data.table::melt' by 'reshape::melt'
#> when loading 'MetaIntegrator'
#> Warning: replacing previous import 'plyr::round_any' by
#> 'reshape::round_any' when loading 'MetaIntegrator'
#> Warning: replacing previous import 'reshape::recast' by 'reshape2::recast'
#> when loading 'MetaIntegrator'
#> Warning: replacing previous import 'reshape::melt' by 'reshape2::melt' when
#> loading 'MetaIntegrator'
#> Warning: replacing previous import 'reshape::colsplit' by
#> 'reshape2::colsplit' when loading 'MetaIntegrator'
#> Warning: replacing previous import 'data.table::dcast' by 'reshape2::dcast'
#> when loading 'MetaIntegrator'
#> Found common probes in 4 
#> Found common probes in 5 
#> Found common probes in 6 
#> Computing effect sizes...
#> Computing summary effect sizes...
#> Computing Fisher's output...

validationGSEIDs = c("GSE62525", "GSEScribaDay0to7", "GSE84076", "GSE101705", "GSE73408", "GSE69581")
validationGSEs = lapply(TB_human_datasets_04_2018[validationGSEIDs], function(gse) {
    filteredGSE = .filterGSE(gse, classesOfInterest=c(case_classes, control_classes))
    filteredGSEWithClass = .addClassVec(filteredGSE, caseClasses=case_classes)
    filteredGSEWithClass
})
validationGSEs = validationGSEs %>%
  keep(function(gse) sum(gse$class) > 1 && sum(gse$class) < length(gse$class)-1)
lVsAHValidation = MetaIntegrator::runMetaAnalysis(list(originalData=validationGSEs))
#> Found common probes in 3 
#> Found common probes in 4 
#> Computing effect sizes...
#> Computing summary effect sizes...
#> Computing Fisher's output...

effectSizeThresholds = c(0.6, 0.8, 1)
geneFilters = effectSizeThresholds %>% map(~MetaIntegrator::filterGenes(lVsAHDiscovery, numberStudiesThresh = 4, FDRThresh = 0.1, effectSizeThresh = .)) %>% map(~.$filterResults[[1]]) %>% map(~.removeLOCMIRFromFilter(.)) %>% map(~MetaIntegrator::forwardSearch(lVsAHDiscovery, .))
#> Converting probes:genes for gene list for all discovery GEMs......   done. 
#> next best= 793.9697 
#> Adding  SMARCD3 793.9697 
#> next best= 34.92217 
#> Adding  WDR73 828.8919 
#> next best= 16.21166 
#> Adding  GTF2H4 845.1035 
#> next best= 7.28888 
#> Adding  FCGBP 852.3924 
#> next best= 7.037914 
#> Adding  CTSD 859.4303 
#> next best= 3.889181 
#> Adding  TOP3B 863.3195 
#> next best= 3.621661 
#> Adding  POLD2 866.9412 
#> next best= 1.375401 
#> Adding  DNAJC13 868.3166 
#> next best= 1.740001 
#> Adding  VAMP5 870.0566 
#> next best= 2.5914 
#> Adding  METTL9 872.648 
#> next best= 1.619676 
#> Adding  MED6 874.2677 
#> next best= 1.352877 
#> Adding  ITGAM 875.6205 
#> next best= 2.188995 
#> Adding  FCGR1C 877.8095 
#> next best= 1.880087 
#> Adding  ATP6V0A1 879.6896 
#> next best= 0.8765973 
#> Adding  IFT20 880.5662 
#> next best= 1.685723 
#> Adding  KCTD14 882.2519 
#> next best= 2.989615 
#> Adding  FLJ38717 885.2415 
#> next best= 1.71511 
#> Adding  TCF4 886.9567 
#> next best= 1.484682 
#> Adding  LONP2 888.4413 
#> next best= 0.9167106 
#> Adding  TPST1 889.3581 
#> next best= 2.692505 
#> Adding  ELANE 892.0506 
#> next best= 0.7166615 
#> Adding  NCRNA00120 892.7672 
#> next best= 0.8629502 
#> Adding  TOP1P2 893.6302 
#> next best= 0.7634589 
#> Adding  PARM1 894.3936 
#> next best= 0.6739637 
#> Adding  CBX7 895.0676 
#> next best= 0.674939 
#> Adding  ABHD6 895.7425 
#> next best= 0.3149729 
#> Adding  C1orf93 896.0575 
#> next best= 0.5273649 
#> Adding  VWA5A 896.5849 
#> next best= 0.5315503 
#> Adding  GAGE2B 897.1164 
#> next best= 0.1393486 
#> Adding  GPN1 897.2558 
#> next best= 0.4369288 
#> Adding  RNASEH1 897.6927 
#> next best= -0.2937387 
#> 
#> Final discovery AUCs from forward search genes:
#> GSE19491 0.941 
#> GSE28623 0.879 
#> GSE54992 0.933 
#> GSE37250 0.945 
#> GSE39939.noCultureNeg 0.984 
#> GSE39940 0.968 
#> mean AUC: 0.9416667 
#> Converting probes:genes for gene list for all discovery GEMs......   done. 
#> next best= 793.9697 
#> Adding  SMARCD3 793.9697 
#> next best= 31.19408 
#> Adding  GTF2H4 825.1638 
#> next best= 21.67877 
#> Adding  CNNM3 846.8426 
#> next best= 6.036743 
#> Adding  CBX7 852.8793 
#> next best= 5.527485 
#> Adding  HP 858.4068 
#> next best= 7.508825 
#> Adding  FCGR1C 865.9156 
#> next best= 2.537776 
#> Adding  CTSD 868.4534 
#> next best= 2.608416 
#> Adding  TMEM167A 871.0618 
#> next best= 0.7638912 
#> Adding  C1orf203 871.8257 
#> next best= 0.6751241 
#> Adding  FNDC3B 872.5008 
#> next best= 0.1570987 
#> Adding  ZNF689 872.6579 
#> next best= 0.988671 
#> Adding  CD63 873.6466 
#> next best= -0.04378817 
#> 
#> Final discovery AUCs from forward search genes:
#> GSE19491 0.923 
#> GSE28623 0.826 
#> GSE54992 0.844 
#> GSE37250 0.927 
#> GSE39939.noCultureNeg 0.988 
#> GSE39940 0.942 
#> mean AUC: 0.9083333 
#> Converting probes:genes for gene list for all discovery GEMs......   done. 
#> next best= 781.0584 
#> Adding  HP 781.0584 
#> next best= 35.53863 
#> Adding  FCGR1C 816.597 
#> next best= 12.47897 
#> Adding  AGTRAP 829.076 
#> next best= 8.734183 
#> Adding  C16orf67 837.8101 
#> next best= 4.464736 
#> Adding  MGC3020 842.2749 
#> next best= 3.805493 
#> Adding  SIGLEC16 846.0804 
#> next best= 2.32786 
#> Adding  RNF220 848.4082 
#> next best= 0.928796 
#> Adding  DEFA1B 849.337 
#> next best= 2.466072 
#> Adding  ITGAM 851.8031 
#> next best= 0.4808456 
#> Adding  KIF22 852.2839 
#> next best= 1.043822 
#> Adding  FAM62B 853.3278 
#> next best= -0.365591 
#> 
#> Final discovery AUCs from forward search genes:
#> GSE19491 0.856 
#> GSE28623 0.812 
#> GSE54992 0.778 
#> GSE37250 0.931 
#> GSE39939.noCultureNeg 0.978 
#> GSE39940 0.931 
#> mean AUC: 0.881
names(geneFilters) = geneFilters %>% map_chr(~.describeGeneSig(., case_classes = case_classes, control_classes = control_classes, datasets = discoveryGSEIDs))
library(gridExtra)
plots = names(geneFilters) %>% map(function(gfName) {
  title = stringr::str_wrap(gfName, width = 80)
  prc = MetaIntegrator::multiplePRCplot(lVsAHValidation, geneFilters[[gfName]], title = title, size = 12)
  roc = MetaIntegrator::multipleROCPlot(lVsAHValidation, geneFilters[[gfName]], title = title, size = 12)
  return(list(prc=prc, roc=roc))
}) %>% unlist(recursive = FALSE)
do.call(grid.arrange, c(plots, ncol=2))