## Registered S3 method overwritten by 'R.oo':
## method from
## throw.default R.methodsS3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
1 Get summary
Same as https://api.gdc.cancer.gov/projects/TCGA-OV?expand=summary,summary.data_categories&pretty=true
## $data_categories
## case_count file_count data_category
## 1 492 2135 Transcriptome Profiling
## 2 597 2401 Copy Number Variation
## 3 443 4880 Simple Nucleotide Variation
## 4 602 623 DNA Methylation
## 5 608 597 Clinical
## 6 575 1929 Sequencing Reads
## 7 608 2601 Biospecimen
##
## $case_count
## [1] 608
##
## $file_count
## [1] 15166
##
## $file_size
## [1] 3.174084e+13
2 Get XML data
# Build a query for the TCGA-OV clinical files of type "xml", download the
# matching files, and prepare the "patient" clinical information from them.
query <- GDCquery(project = "TCGA-OV",
                  data.category = "Clinical",
                  file.type = "xml")
GDCdownload(query)
clinical <- GDCprepare_clinic(query, clinical.info = "patient")
# Count the files under the download directory whose name matches "xml".
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
length(dir(path = "GDCdata/TCGA-OV/harmonized/Clinical/Clinical_Supplement/",
           pattern = "xml",
           recursive = TRUE))
## [1] 590
##
## FALSE TRUE
## 587 3
3 Get indexed clinical data
## [1] 587
## [1] 587
4 Get data from BCR Biotab
# Query the TCGA-OV "Clinical Supplement" files in "BCR Biotab" format,
# download them, and prepare the result into clinical.BCRtab.all.
query <- GDCquery(
  project = "TCGA-OV",
  data.category = "Clinical",
  data.type = "Clinical Supplement",
  data.format = "BCR Biotab"
)
GDCdownload(query)
clinical.BCRtab.all <- GDCprepare(query)
## [1] "clinical_radiation_ov" "clinical_patient_ov"
## [3] "clinical_drug_ov" "clinical_follow_up_v1.0_ov"
## [5] "clinical_follow_up_v1.0_nte_ov" "clinical_omf_v4.0_ov"
## [7] "clinical_nte_ov"
## [1] "bcr_patient_barcode" "CDE_ID:2003301" "TCGA-04-1331"
## [4] "TCGA-04-1332" "TCGA-04-1335" "TCGA-04-1336"
## [1] 587
5 Check file
# Path to one downloaded BCR Biotab patient table (tab-separated text).
file <- "GDCdata/TCGA-OV/harmonized/Clinical/Clinical_Supplement//30c149ac-9ac2-4f51-88a3-68bb4afb50a9/nationwidechildrens.org_clinical_patient_ov.txt"
# Read the file directly with readr and report its rows x columns.
dim(readr::read_tsv(file))
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
## [1] 589 57
6 Check biospecimen counts
# Fetch the "biospecimen" table for TCGA-OV via GDCquery_clinic and count
# its rows.
biospecimen.indexed <- GDCquery_clinic(project = "TCGA-OV", type = "biospecimen")
nrow(biospecimen.indexed)
## [1] 1329
## [1] "TCGA-13-1487" "TCGA-04-1357" "TCGA-09-0367" "TCGA-59-2372"
## [5] "TCGA-61-2000" "TCGA-24-1614"
## [1] 608
## [1] "TCGA-01-0639" "TCGA-72-4233" "TCGA-13-0922" "TCGA-36-2539"
## [5] "TCGA-72-4234" "TCGA-01-0642" "TCGA-72-4240" "TCGA-72-4238"
## [9] "TCGA-72-4235" "TCGA-01-0628" "TCGA-72-4236" "TCGA-01-0633"
## [13] "TCGA-01-0630" "TCGA-72-4232" "TCGA-01-0636" "TCGA-72-4237"
## [17] "TCGA-13-0914" "TCGA-72-4241" "TCGA-72-4231" "TCGA-01-0631"
## [21] "TCGA-01-0637"
7 Checking features
# Column names to look for in the BCR Biotab clinical tables.
features <- c(
  "age_at_initial_pathologic_diagnosis",
  "gender",
  "race",
  # Stage, histology and related fields
  "ajcc_pathologic_tumor_stage",
  "clinical_stage",
  "histological_type",
  "histological_grade",
  "initial_pathologic_dx_year",
  "menopause_status",
  # Fields whose names end in "_days_to"
  "birth_days_to",
  "last_contact_days_to",
  "death_days_to",
  "new_tumor_event_dx_days_to"
)
# Query the TCGA-OV "Clinical Supplement" files in "BCR Biotab" format.
query <- GDCquery(
  project = "TCGA-OV",
  data.category = "Clinical",
  data.type = "Clinical Supplement",
  data.format = "BCR Biotab"
)
## --------------------------------------
## o GDCquery: Searching in GDC database
## --------------------------------------
## Genome of reference: hg38
## --------------------------------------------
## oo Accessing GDC. This might take a while...
## --------------------------------------------
## ooo Project: TCGA-OV
## --------------------
## oo Filtering results
## --------------------
## ooo By data.format
## ooo By data.type
## ----------------
## oo Checking data
## ----------------
## ooo Check if there are duplicated cases
## Warning: There are more than one file for the same case. Please verify query results. You can use the command View(getResults(query)) in rstudio
## ooo Check if there results for the query
## -------------------
## o Preparing output
## -------------------
## Downloading data for project TCGA-OV
## Of the 7 files for download 7 already exist.
## All samples have been already downloaded
##
|
| | 0%
|
|========= | 14%
|
|=================== | 29%
|
|============================ | 43%
|
|===================================== | 57%
|
|============================================== | 71%
|
|======================================================== | 86%
|
|=================================================================| 100%
## [1] "clinical_radiation_ov" "clinical_patient_ov"
## [3] "clinical_drug_ov" "clinical_follow_up_v1.0_ov"
## [5] "clinical_follow_up_v1.0_nte_ov" "clinical_omf_v4.0_ov"
## [7] "clinical_nte_ov"
# Track, per requested feature, whether any table has a column of that name.
found <- logical(length(features))
for (n in names(clinical.BCRtab.all)) {
  # TRUE for each feature that is a column name of table `n`.
  idx <- features %in% colnames(clinical.BCRtab.all[[n]])
  if (!any(idx)) {
    # Nothing to report for this table.
    next
  }
  found <- found | idx
  message("----------------------------------------")
  message("The following featues can be found in table: ", n)
  # One "- <feature>" line per match; message() concatenates its arguments.
  message("- ", paste(features[idx], collapse = "\n- "))
}
## ----------------------------------------
## The following featues can be found in table: clinical_patient_ov
## - age_at_initial_pathologic_diagnosis
## - gender
## - race
## - clinical_stage
## - histological_type
## - initial_pathologic_dx_year
## - birth_days_to
## - last_contact_days_to
## - death_days_to
## ----------------------------------------
## The following featues can be found in table: clinical_follow_up_v1.0_ov
## - last_contact_days_to
## - death_days_to
## ----------------------------------------
## The following featues can be found in table: clinical_follow_up_v1.0_nte_ov
## - new_tumor_event_dx_days_to
## ----------------------------------------
## The following featues can be found in table: clinical_omf_v4.0_ov
## - ajcc_pathologic_tumor_stage
## - clinical_stage
## ----------------------------------------
## The following featues can be found in table: clinical_nte_ov
## - new_tumor_event_dx_days_to
## [1] "histological_grade" "menopause_status"