version#: 2024-09-21 09:50:17.478285

ds

### Dependencies and settings
#attaching libraries
sapply(c("BiocManager", "DT", "kableExtra", "tidyr"), library, character.only = TRUE)

# Build an index of the objects exposed by an attached package together with
# the "Description" section of their help pages.
#
# Args:
#   pck: package name, as a single character string. The package must already
#        be attached (e.g. with library()): objects are listed from the
#        "package:<pck>" entry of the search path.
#
# Returns:
#   A data.frame with one row per object returned by ls("package:<pck>") and
#   two character columns:
#     func        - the object name;
#     description - the plain-text help page with everything up to the
#                   "...on:" heading (i.e. "Description:") and everything from
#                   the "Usag..." heading onwards removed, or NA when no help
#                   page could be rendered for that object.
#   A package exposing no objects yields a zero-row data.frame.
GetPackagesFunctions <- function(pck) {
  pck <- as.character(pck)
  stopifnot(length(pck) == 1L, !is.na(pck))

  searchEntry <- paste0("package:", pck)
  if (!searchEntry %in% search()) {
    stop("package '", pck, "' is not attached; call library() on it first",
         call. = FALSE)
  }

  # The "\b" escapes are backspace characters: the pattern targets headings
  # written with a "_<backspace>" overstrike before each letter (presumably the
  # underlined section titles emitted by Rd2txt()).
  headingPattern <- ".*._\bo_\bn:     |_\bU_\bs_\ba_\bg.*."

  # Render the help page of one object and keep its description only.
  # Any failure (no help page, unreadable help file) yields NA so that a
  # single undocumented object does not abort the whole index.
  describeFunction <- function(f) {
    tryCatch(
      {
        # `package` is passed as a call rather than a bare symbol: help()
        # would otherwise use the symbol's own name as the package name.
        helpPath <- as.character(utils::help(f, package = as.character(pck)))
        if (length(helpPath) == 0L) {
          NA_character_
        } else {
          helpText <- utils::capture.output(
            tools::Rd2txt(utils:::.getHelpFile(helpPath[[1L]]))
          )
          gsub(headingPattern, "", paste0(helpText, collapse = ""))
        }
      },
      error = function(e) NA_character_
    )
  }

  funcNames <- ls(searchEntry)
  data.frame(
    func = funcNames,
    description = vapply(funcNames, describeFunction, character(1),
                         USE.NAMES = FALSE),
    stringsAsFactors = FALSE
  )
}

Scope: evaluate R Bioconductor options for querying and retrieving data from the NCI’s Genomic Data Commons (GDC) resource.

  • The NCI’s Genomic Data Commons (GDC) provides programmatic access to its molecular and clinical data, including a REST interface.
  • The Bioconductor project is an R package repository dedicated to the field of biological data.
  • While it is possible to interact with the GDC with both base R and CRAN packages (e.g. httr2, curl, …), this study aims at evaluating Bioconductor packages developed to provide access to the GDC data and metadata.
  • The GDC data model has been designed to manage clinical and molecular data while preserving the integrity of the relationship between patients, samples, diagnostics, laboratory and genomics data.
  • In particular, it implements the hierarchy and one-to-many relationship between Participant -> Samples -> Portions -> Analytes -> Aliquots.
  • In this study, we will be focusing on lung cancer. The idea is to get a sense of the various capabilities offered by Bioconductor packages.

First things first: the list of Bioconductor packages dealing with the GDC.

# Names of the Bioconductor packages potentially set to enable access to the
# GDC resource (name matching GDC, GenomicDataCommons or TCGA)
BiocGDCpackages <- BiocManager::available("GDC|GenomicDataCommons|TCGA")

# DESCRIPTION fields reported for each relevant package
descriptionFields <- c("Package", "Title", "Version", "Description")

# packageDescription() can only read locally installed packages (for the
# others it returns NA with a warning rather than raising an error), so the
# lookup is restricted to the installed ones up front.
isInstalled <- vapply(
  BiocGDCpackages,
  \(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)

# one single-row data.frame per installed package
descriptionRows <- lapply(BiocGDCpackages[isInstalled], \(pkg) {
  packageData <- utils::packageDescription(pkg, fields = descriptionFields)
  as.data.frame(lapply(unclass(packageData)[descriptionFields], as.character))
})

BiocGDCpackages.df <- if (length(descriptionRows) > 0) {
  do.call(rbind, descriptionRows)
} else {
  # no matching package installed: keep an empty table with the same columns
  as.data.frame(
    setNames(rep(list(character()), length(descriptionFields)), descriptionFields)
  )
}

# selecting out entries with missing title
BiocGDCpackages.df <- BiocGDCpackages.df[!is.na(BiocGDCpackages.df$Title), ]

DT::datatable(
  BiocGDCpackages.df,
  caption = "R Bioconductor packages with the TCGA or GDC denomination",
  rownames = FALSE,
  options = list(pageLength = 10, autoWidth = TRUE)
)

As of 2024-09-21, the Bioconductor package repository features 28 packages whose name points to the GDC or TCGA.

TCGAretriever

TCGAretriever functions

library(TCGAretriever)

# index of the objects exposed by TCGAretriever with their help description
TCGAretriever_functions.df <- GetPackagesFunctions("TCGAretriever")

DT::datatable(
  TCGAretriever_functions.df,
  caption = "Bioconductor TCGAretriever functions",
  rownames = FALSE,
  options = list(pageLength = 10, autoWidth = TRUE)
)

Selecting lung cancer data sets

The TCGAretriever package pulls data from the cBioPortal, which includes TCGA data sets.

# cancer types returned by get_cancer_types()
CancerTypes <- get_cancer_types()

# keep the cancer types whose parent type matches "lung"
# NOTE(review): only the `parent` column is matched, so types nested deeper in
# the hierarchy (if any) are presumably not picked up - confirm this is intended.
LungCancerTypes <- CancerTypes[grepl("lung", CancerTypes$parent), ]

# all studies, then those attached to one of the selected lung cancer types
CancerStudies <- get_cancer_studies()
LungCancerStudies <- CancerStudies[
  CancerStudies$cancerTypeId %in% LungCancerTypes$cancerTypeId,
]

DT::datatable(
  LungCancerStudies[, c("name", "description", "allSampleCount", "studyId", "cancerTypeId")],
  caption = "TCGAretriever packages lung cancer studies index",
  rownames = FALSE,
  options = list(pageLength = 10, autoWidth = TRUE)
)

# clinical data of the "nsclc_tcga_broad_2016" study
nsclc_tcga_broad_clinical.df <- get_clinical_data("nsclc_tcga_broad_2016")

RTCGA

RTCGA functions

library(RTCGA)

# index of the objects exposed by RTCGA with their help description
RTCGA_functions.df <- GetPackagesFunctions("RTCGA")

DT::datatable(
  RTCGA_functions.df,
  caption = "Bioconductor RTCGA functions",
  rownames = FALSE,
  options = list(pageLength = 10, autoWidth = TRUE)
)

RTCGA handy functions

# table of the result returned by RTCGA's infoTCGA()
DT::datatable(
  infoTCGA(),
  caption = "RTCGA infoTCGA result set",
  rownames = FALSE,
  options = list(pageLength = 10, autoWidth = TRUE)
)

GenomicDataCommons

GenomicDataCommons functions

library(GenomicDataCommons)

# index of the objects exposed by GenomicDataCommons with their help description
GenomicDataCommons_functions.df <- GetPackagesFunctions("GenomicDataCommons")

DT::datatable(
  GenomicDataCommons_functions.df,
  caption = "Bioconductor GenomicDataCommons functions",
  rownames = FALSE,
  options = list(pageLength = 10, autoWidth = TRUE)
)

R Session Info
Setting Value
version R version 4.4.1 (2024-06-14)
os Ubuntu 24.04.1 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz America/New_York
date 2024-09-21
pandoc 3.1.11 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/x86_64/ (via rmarkdown)