library(TCGAbiolinks)
## Registered S3 method overwritten by 'R.oo':
##   method        from       
##   throw.default R.methodsS3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DT)

1 Get summary

This returns the same information as the GDC API endpoint https://api.gdc.cancer.gov/projects/TCGA-OV?expand=summary,summary.data_categories&pretty=true

# Project-level summary for TCGA-OV: per-data-category case/file counts plus
# overall case count, file count and file size.
# NOTE(review): `:::` reaches into the TCGAbiolinks namespace, so
# getProjectSummary() is presumably an internal (non-exported) helper whose
# signature may change between package versions — confirm before relying on it.
TCGAbiolinks:::getProjectSummary("TCGA-OV",legacy = FALSE)
## $data_categories
##   case_count file_count               data_category
## 1        492       2135     Transcriptome Profiling
## 2        597       2401       Copy Number Variation
## 3        443       4880 Simple Nucleotide Variation
## 4        602        623             DNA Methylation
## 5        608        597                    Clinical
## 6        575       1929            Sequencing Reads
## 7        608       2601                 Biospecimen
## 
## $case_count
## [1] 608
## 
## $file_count
## [1] 15166
## 
## $file_size
## [1] 3.174084e+13

2 Get XML data

# Query and download the TCGA-OV clinical XML files, then parse them
# requesting the "patient" clinical information.
query <- GDCquery(project = "TCGA-OV",
                  data.category = "Clinical",
                  file.type = "xml")
GDCdownload(query)
clinical <- GDCprepare_clinic(query, clinical.info = "patient")
# Count the XML files present under the download directory. `TRUE` is spelled
# out because `T` is an ordinary variable that can be reassigned.
length(dir(path = "GDCdata/TCGA-OV/harmonized/Clinical/Clinical_Supplement/",
           pattern = "xml", recursive = TRUE))
## [1] 590
# Check whether any patient barcode appears more than once in the parsed data.
table(duplicated(clinical$bcr_patient_barcode))
## 
## FALSE  TRUE 
##   587     3

3 Get indexed clinical data

# Indexed clinical data for the project, as returned by GDCquery_clinic().
clinical.indexed <- GDCquery_clinic(type = "clinical", project = "TCGA-OV")
# Total number of rows ...
nrow(clinical.indexed)
## [1] 587
# ... versus the number of distinct case identifiers
sum(!duplicated(clinical.indexed$submitter_id))
## [1] 587

4 Get data from BCR Biotab

# Query the clinical supplement in BCR Biotab format, download it and load it.
query <- GDCquery(
    project = "TCGA-OV",
    data.category = "Clinical",
    data.type = "Clinical Supplement",
    data.format = "BCR Biotab"
)
GDCdownload(query)
clinical.BCRtab.all <- GDCprepare(query)
# The prepared object holds several named tables; list them.
names(clinical.BCRtab.all)
## [1] "clinical_radiation_ov"          "clinical_patient_ov"           
## [3] "clinical_drug_ov"               "clinical_follow_up_v1.0_ov"    
## [5] "clinical_follow_up_v1.0_nte_ov" "clinical_omf_v4.0_ov"          
## [7] "clinical_nte_ov"
head(clinical.BCRtab.all[["clinical_patient_ov"]][["bcr_patient_barcode"]])
## [1] "bcr_patient_barcode" "CDE_ID:2003301"      "TCGA-04-1331"       
## [4] "TCGA-04-1332"        "TCGA-04-1335"        "TCGA-04-1336"
# The first two entries are extra header rows (column name and CDE id, see
# the head() output), so skip them when counting patient barcodes.
length(
    tail(
        clinical.BCRtab.all[["clinical_patient_ov"]][["bcr_patient_barcode"]],
        n = -2
    )
)
## [1] 587

5 Check file

# Read one downloaded Biotab patient file directly and report its dimensions.
file <- "GDCdata/TCGA-OV/harmonized/Clinical/Clinical_Supplement//30c149ac-9ac2-4f51-88a3-68bb4afb50a9/nationwidechildrens.org_clinical_patient_ov.txt"
readr::read_tsv(file) %>% dim()
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
## [1] 589  57

6 Check biospecimen counts

# Indexed biospecimen records for the project.
biospecimen.indexed <- GDCquery_clinic(type = "biospecimen",
                                       project = "TCGA-OV")
nrow(biospecimen.indexed)
## [1] 1329
# Keep the distinct 12-character barcode prefixes (e.g. "TCGA-13-1487") so
# they can be compared with the clinical submitter ids.
samples <- unique(
    substring(biospecimen.indexed$submitter_id, first = 1, last = 12)
)
head(samples)
## [1] "TCGA-13-1487" "TCGA-04-1357" "TCGA-09-0367" "TCGA-59-2372"
## [5] "TCGA-61-2000" "TCGA-24-1614"
length(samples)
## [1] 608
# Samples without clinical data
# (`samples` is already unique, so the set difference keeps its order)
base::setdiff(samples, clinical.indexed$submitter_id)
##  [1] "TCGA-01-0639" "TCGA-72-4233" "TCGA-13-0922" "TCGA-36-2539"
##  [5] "TCGA-72-4234" "TCGA-01-0642" "TCGA-72-4240" "TCGA-72-4238"
##  [9] "TCGA-72-4235" "TCGA-01-0628" "TCGA-72-4236" "TCGA-01-0633"
## [13] "TCGA-01-0630" "TCGA-72-4232" "TCGA-01-0636" "TCGA-72-4237"
## [17] "TCGA-13-0914" "TCGA-72-4241" "TCGA-72-4231" "TCGA-01-0631"
## [21] "TCGA-01-0637"

7 Checking features

# Clinical variables of interest, in the order they are reported.
features <- c(
    # demographics
    "age_at_initial_pathologic_diagnosis", "gender", "race",
    # staging and pathology
    "ajcc_pathologic_tumor_stage", "clinical_stage",
    "histological_type", "histological_grade",
    # diagnosis year and menopause status
    "initial_pathologic_dx_year", "menopause_status",
    # day offsets
    "birth_days_to", "last_contact_days_to", "death_days_to",
    "new_tumor_event_dx_days_to"
)

# Run the BCR Biotab clinical-supplement query for TCGA-OV. The `##` lines
# after each call are the console output recorded when the call was run.
query <- GDCquery(project = "TCGA-OV",
                  data.category = "Clinical",
                  data.type = "Clinical Supplement",
                  data.format = "BCR Biotab")
## --------------------------------------
## o GDCquery: Searching in GDC database
## --------------------------------------
## Genome of reference: hg38
## --------------------------------------------
## oo Accessing GDC. This might take a while...
## --------------------------------------------
## ooo Project: TCGA-OV
## --------------------
## oo Filtering results
## --------------------
## ooo By data.format
## ooo By data.type
## ----------------
## oo Checking data
## ----------------
## ooo Check if there are duplicated cases
## Warning: There are more than one file for the same case. Please verify query results. You can use the command View(getResults(query)) in rstudio
## ooo Check if there results for the query
## -------------------
## o Preparing output
## -------------------
# Per the log below, all 7 files were already on disk, so nothing is fetched.
GDCdownload(query)
## Downloading data for project TCGA-OV
## Of the 7 files for download 7 already exist.
## All samples have been already downloaded
# Load the downloaded files into a single object holding the named tables.
clinical.BCRtab.all <- GDCprepare(query)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |=================================================================| 100%
names(clinical.BCRtab.all)
## [1] "clinical_radiation_ov"          "clinical_patient_ov"           
## [3] "clinical_drug_ov"               "clinical_follow_up_v1.0_ov"    
## [5] "clinical_follow_up_v1.0_nte_ov" "clinical_omf_v4.0_ov"          
## [7] "clinical_nte_ov"
# For every table, report which of the requested features it has as columns.
# `found` records which features turned up in at least one table; it is
# aligned element-by-element with `features`.
found <- rep(FALSE, length(features))
for (n in names(clinical.BCRtab.all)) {
    idx <- features %in% colnames(clinical.BCRtab.all[[n]])
    if (any(idx)) {
        found <- found | idx
        message("----------------------------------------")
        message("The following features can be found in table: ", n)
        message(paste0("- ", paste(features[idx], collapse = "\n- ")))
    }
}
## ----------------------------------------
## The following features can be found in table: clinical_patient_ov
## - age_at_initial_pathologic_diagnosis
## - gender
## - race
## - clinical_stage
## - histological_type
## - initial_pathologic_dx_year
## - birth_days_to
## - last_contact_days_to
## - death_days_to
## ----------------------------------------
## The following features can be found in table: clinical_follow_up_v1.0_ov
## - last_contact_days_to
## - death_days_to
## ----------------------------------------
## The following features can be found in table: clinical_follow_up_v1.0_nte_ov
## - new_tumor_event_dx_days_to
## ----------------------------------------
## The following features can be found in table: clinical_omf_v4.0_ov
## - ajcc_pathologic_tumor_stage
## - clinical_stage
## ----------------------------------------
## The following features can be found in table: clinical_nte_ov
## - new_tumor_event_dx_days_to
# Features that are not a column of any table
features[!found]
## [1] "histological_grade" "menopause_status"