library(TCGAbiolinks)

1 Look at what the query returns

# Query GDC for every open-access simple somatic mutation file of the
# TCGA-CESC project (legacy = TRUE; the run log reports hg19 as reference).
# NOTE(review): this depends on the `legacy` argument of GDCquery() and on the
# GDC legacy archive -- confirm both are still supported by the installed
# TCGAbiolinks version.
query <- GDCquery(
    project = "TCGA-CESC",
    data.category = "Simple nucleotide variation",
    data.type = "Simple somatic mutation",
    access = "open",
    legacy = TRUE
)

## --------------------------------------

## o GDCquery: Searching in GDC database

## --------------------------------------

## Genome of reference: hg19

## --------------------------------------------

## oo Accessing GDC. This might take a while...

## --------------------------------------------

## ooo Project: TCGA-CESC

## --------------------

## oo Filtering results

## --------------------

## ooo By access

## ooo By data.type

## ----------------

## oo Checking data

## ----------------

## ooo Check if there are duplicated cases

## Warning: There are more than one file for the same case. Please verify query results. You can use the command View(getResults(query)) in rstudio

## ooo Check if there results for the query

## -------------------

## o Preparing output

## -------------------

# Show the file names stored in the first results table of the query.
query$results[[1]]$file_name

## [1] "genome.wustl.edu_CESC.IlluminaGA_DNASeq_curated.Level_2.1.0.0.somatic.maf"   
## [2] "genome.wustl.edu_CESC.IlluminaHiSeq_DNASeq_automated.1.3.0.somatic.maf"      
## [3] "ucsc.edu_CESC.IlluminaGA_DNASeq_automated.Level_2.1.1.0.somatic.maf"         
## [4] "bcgsc.ca_CESC.IlluminaHiSeq_DNASeq.1.somatic.maf"                            
## [5] "PR_TCGA_CESC_PAIR_Capture_All_Pairs.aggregated.capture.tcga.uuid.somatic.maf"
## [6] "gsc_CESC_pairs.aggregated.capture.tcga.uuid.automated.somatic.maf"

# Render the query's first results table as an interactive DT widget.
# The `cases` column is removed before display.
results <- query$results[[1]]
results[["cases"]] <- NULL

DT::datatable(
    results,
    filter = "top",
    style = "bootstrap",
    extensions = "Buttons",
    options = list(
        scrollX = TRUE,
        dom = "Bfrtip",
        buttons = I("colvis"),
        keys = TRUE,
        pageLength = 10
    ),
    rownames = FALSE,
    caption = "Samples metadata"
)

2 Search each file

# For every file name returned by the initial query, run a dedicated query
# restricted to that file (via `file.type`), download it and load it with
# GDCprepare(). The result is a list with one element per file, labelled
# afterwards with the file names.
data_list <- plyr::alply(
    query$results[[1]]$file_name,
    .margins = 1,
    .fun = function(maf_file) {
        # Echo the file currently being processed.
        print(maf_file)
        file_query <- GDCquery(
            project = "TCGA-CESC",
            data.category = "Simple nucleotide variation",
            data.type = "Simple somatic mutation",
            file.type = maf_file,
            access = "open",
            legacy = TRUE
        )
        GDCdownload(file_query)
        GDCprepare(file_query)
    }
)
names(data_list) <- query$results[[1]]$file_name

3 Data

# Show the labels assigned to the list elements (one per file name).
names(data_list)

## [1] "genome.wustl.edu_CESC.IlluminaGA_DNASeq_curated.Level_2.1.0.0.somatic.maf"   
## [2] "genome.wustl.edu_CESC.IlluminaHiSeq_DNASeq_automated.1.3.0.somatic.maf"      
## [3] "ucsc.edu_CESC.IlluminaGA_DNASeq_automated.Level_2.1.1.0.somatic.maf"         
## [4] "bcgsc.ca_CESC.IlluminaHiSeq_DNASeq.1.somatic.maf"                            
## [5] "PR_TCGA_CESC_PAIR_Capture_All_Pairs.aggregated.capture.tcga.uuid.somatic.maf"
## [6] "gsc_CESC_pairs.aggregated.capture.tcga.uuid.automated.somatic.maf"

# Print every loaded table in turn.
data_list

## $genome.wustl.edu_CESC.IlluminaGA_DNASeq_curated.Level_2.1.0.0.somatic.maf
## # A tibble: 46,547 x 66
##    Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position
##    <chr>                <int> <chr>       <dbl> <chr>               <int>
##  1 A1BG                     1 genom…         37 19               58864353
##  2 A1BG-AS1            503538 genom…         37 19               58864179
##  3 A1CF                 29974 genom…         37 10               52587925
##  4 A1CF                 29974 genom…         37 10               52601652
##  5 A2M                      2 genom…         37 12                9262517
##  6 A2M                      2 genom…         37 12                9265031
##  7 A2M                      2 genom…         37 12                9265042
##  8 A2ML1               144568 genom…         37 12                8982339
##  9 A2ML1               144568 genom…         37 12                8997932
## 10 A2ML1               144568 genom…         37 12                9013793
## # … with 46,537 more rows, and 60 more variables: End_Position <int>,
## #   Strand <chr>, Variant_Classification <chr>, Variant_Type <chr>,
## #   Reference_Allele <chr>, Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>,
## #   dbSNP_RS <lgl>, dbSNP_Val_Status <lgl>, Tumor_Sample_Barcode <chr>,
## #   Matched_Norm_Sample_Barcode <chr>, Match_Norm_Seq_Allele1 <chr>,
## #   Match_Norm_Seq_Allele2 <chr>, Tumor_Validation_Allele1 <lgl>,
## #   Tumor_Validation_Allele2 <lgl>, Match_Norm_Validation_Allele1 <lgl>,
## #   Match_Norm_Validation_Allele2 <lgl>, Verification_Status <chr>,
## #   Validation_Status <chr>, Mutation_Status <chr>, Sequencing_Phase <chr>,
## #   Sequence_Source <chr>, Validation_Method <chr>, Score <lgl>,
## #   BAM_File <lgl>, Sequencer <chr>, Tumor_Sample_UUID <chr>,
## #   Matched_Norm_Sample_UUID <chr>, chromosome_name <chr>, start <dbl>,
## #   stop <dbl>, reference <chr>, variant <chr>, type <chr>, gene_name <chr>,
## #   transcript_name <chr>, transcript_species <chr>, transcript_source <chr>,
## #   transcript_version <chr>, strand <dbl>, transcript_status <chr>,
## #   trv_type <chr>, c_position <chr>, amino_acid_change <chr>, ucsc_cons <chr>,
## #   domain <chr>, all_domains <chr>, deletion_substructures <chr>,
## #   transcript_error <chr>, default_gene_name <chr>, gene_name_source <chr>,
## #   gene_name_source_1 <chr>, tumor_ref_reads <dbl>, tumor_var_reads <dbl>,
## #   tumor_vaf <dbl>, normal_ref_reads <dbl>, normal_var_reads <dbl>,
## #   normal_vaf <dbl>, dbSNP_rsID <chr>, GMAF <chr>
## 
## $genome.wustl.edu_CESC.IlluminaHiSeq_DNASeq_automated.1.3.0.somatic.maf
## # A tibble: 38,693 x 73
##    Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position
##    <chr>                <int> <chr>       <dbl> <chr>               <int>
##  1 A1BG                     0 genom…         37 19               58864353
##  2 A1BG-AS1                 0 genom…         37 19               58864179
##  3 A1CF                     0 genom…         37 10               52587925
##  4 A1CF                     0 genom…         37 10               52601652
##  5 A2M                      0 genom…         37 12                9262517
##  6 A2M                      0 genom…         37 12                9265031
##  7 A2M                      0 genom…         37 12                9265042
##  8 A2ML1                    0 genom…         37 12                8997932
##  9 A4GNT                    0 genom…         37 3               137843671
## 10 AACSP1                   0 genom…         37 5               178203190
## # … with 38,683 more rows, and 67 more variables: End_Position <int>,
## #   Strand <chr>, Variant_Classification <chr>, Variant_Type <chr>,
## #   Reference_Allele <chr>, Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>,
## #   dbSNP_RS <lgl>, dbSNP_Val_Status <lgl>, Tumor_Sample_Barcode <chr>,
## #   Matched_Norm_Sample_Barcode <chr>, Match_Norm_Seq_Allele1 <chr>,
## #   Match_Norm_Seq_Allele2 <chr>, Tumor_Validation_Allele1 <chr>,
## #   Tumor_Validation_Allele2 <chr>, Match_Norm_Validation_Allele1 <chr>,
## #   Match_Norm_Validation_Allele2 <chr>, Verification_Status <chr>,
## #   Validation_Status <chr>, Mutation_Status <chr>, Sequencing_Phase <chr>,
## #   Sequence_Source <chr>, Validation_Method <chr>, Score <lgl>,
## #   BAM_File <lgl>, Sequencer <chr>, Tumor_Sample_UUID <chr>,
## #   Matched_Norm_Sample_UUID <chr>, chromosome_name <chr>, start <dbl>,
## #   stop <dbl>, reference <chr>, variant <chr>, type <chr>, gene_name <chr>,
## #   transcript_name <chr>, transcript_species <chr>, transcript_source <chr>,
## #   transcript_version <chr>, strand <dbl>, transcript_status <chr>,
## #   trv_type <chr>, c_position <chr>, amino_acid_change <chr>, ucsc_cons <chr>,
## #   domain <chr>, all_domains <chr>, deletion_substructures <chr>,
## #   transcript_error <chr>, default_gene_name <chr>, gene_name_source <chr>,
## #   gene_name_ensembl <chr>, tumor_ref_reads <dbl>, tumor_var_reads <dbl>,
## #   tumor_vaf <dbl>, normal_ref_reads <dbl>, normal_var_reads <dbl>,
## #   normal_vaf <dbl>, tumor_ref_reads_val <dbl>, tumor_var_reads_val <dbl>,
## #   tumor_vaf_val <dbl>, normal_ref_reads_val <dbl>,
## #   normal_var_reads_val <dbl>, normal_vaf_val <dbl>, misc_ref_reads_val <dbl>,
## #   misc_var_reads_val <dbl>, misc_vaf_val <dbl>
## 
## $ucsc.edu_CESC.IlluminaGA_DNASeq_automated.Level_2.1.1.0.somatic.maf
## # A tibble: 39,864 x 50
##    Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position
##    <chr>                <int> <chr>  <chr>      <chr>               <int>
##  1 AHDC1                27245 ucsc.… GRCh37     1                27875174
##  2 IPO13                 9670 ucsc.… GRCh37     1                44433064
##  3 ATP5F1                 515 ucsc.… GRCh37     1               112002183
##  4 DENND2C             163259 ucsc.… GRCh37     1               115142027
##  5 S100A1                6271 ucsc.… GRCh37     1               153604234
##  6 ARHGEF11              9826 ucsc.… GRCh37     1               156907208
##  7 ADCY10               55811 ucsc.… GRCh37     1               167791359
##  8 ATP2B4                 493 ucsc.… GRCh37     1               203693066
##  9 USH2A                 7399 ucsc.… GRCh37     1               215848679
## 10 DQX1                165545 ucsc.… GRCh37     2                74751187
## # … with 39,854 more rows, and 44 more variables: End_Position <int>,
## #   Strand <chr>, Variant_Classification <chr>, Variant_Type <chr>,
## #   Reference_Allele <chr>, Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>,
## #   dbSNP_RS <chr>, dbSNP_Val_Status <lgl>, Tumor_Sample_Barcode <chr>,
## #   Matched_Norm_Sample_Barcode <chr>, Match_Norm_Seq_Allele1 <chr>,
## #   Match_Norm_Seq_Allele2 <chr>, Tumor_Validation_Allele1 <lgl>,
## #   Tumor_Validation_Allele2 <lgl>, Match_Norm_Validation_Allele1 <lgl>,
## #   Match_Norm_Validation_Allele2 <lgl>, Verification_Status <chr>,
## #   Validation_Status <chr>, Mutation_Status <chr>, Sequencing_Phase <lgl>,
## #   Sequence_Source <chr>, Validation_Method <chr>, Score <lgl>,
## #   BAM_File <lgl>, Sequencer <chr>, Tumor_Sample_UUID <chr>,
## #   Matched_Norm_Sample_UUID <chr>, RNA_Tumor_Sample_Barcode <chr>,
## #   RNA_Tumor_Sample_UUID <chr>, RNA_Tumor_Seq_Allele1 <chr>,
## #   RNA_Tumor_Seq_Allele2 <chr>, RNA_Normal_Sample_Barcode <lgl>,
## #   RNA_Normal_Sample_UUID <lgl>, RNA_Normal_Seq_Allele1 <lgl>,
## #   RNA_Normal_Seq_Allele2 <lgl>, Match_Norm_Ref_Count <dbl>,
## #   Match_Norm_Alt_Count <dbl>, Tumor_Ref_Count <dbl>, Tumor_Alt_Count <dbl>,
## #   RNA_Norm_Ref_Count <lgl>, RNA_Norm_Alt_Count <lgl>,
## #   RNA_Tumor_Ref_Count <dbl>, RNA_Tumor_Alt_Count <dbl>
## 
## $bcgsc.ca_CESC.IlluminaHiSeq_DNASeq.1.somatic.maf
## # A tibble: 68,621 x 34
##    Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position
##    <chr>                <int> <chr>  <chr>      <chr>               <int>
##  1 COL3A1                1281 bcgsc… GRCh37-li… 2               189873889
##  2 RAB35                11021 bcgsc… GRCh37-li… 12              120534993
##  3 SMC1A                 8243 bcgsc… GRCh37-li… X                53439197
##  4 CDK11B                 984 bcgsc… GRCh37-li… 1                 1572104
##  5 CAMTA1               23261 bcgsc… GRCh37-li… 1                 7724942
##  6 PER3                  8863 bcgsc… GRCh37-li… 1                 7863155
##  7 CSMD2               114784 bcgsc… GRCh37-li… 1                34180259
##  8 BSND                  7809 bcgsc… GRCh37-li… 1                55464880
##  9 TUFT1                 7286 bcgsc… GRCh37-li… 1               151534633
## 10 IFI16                 3428 bcgsc… GRCh37-li… 1               159023487
## # … with 68,611 more rows, and 28 more variables: End_Position <int>,
## #   Strand <chr>, Variant_Classification <chr>, Variant_Type <chr>,
## #   Reference_Allele <chr>, Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>,
## #   dbSNP_RS <chr>, dbSNP_Val_Status <chr>, Tumor_Sample_Barcode <chr>,
## #   Matched_Norm_Sample_Barcode <chr>, Match_Norm_Seq_Allele1 <chr>,
## #   Match_Norm_Seq_Allele2 <chr>, Tumor_Validation_Allele1 <chr>,
## #   Tumor_Validation_Allele2 <chr>, Match_Norm_Validation_Allele1 <chr>,
## #   Match_Norm_Validation_Allele2 <chr>, Verification_Status <chr>,
## #   Validation_Status <chr>, Mutation_Status <chr>, Sequencing_Phase <chr>,
## #   Sequence_Source <chr>, Validation_Method <chr>, Score <lgl>,
## #   BAM_File <lgl>, Sequencer <chr>, Tumor_Sample_UUID <chr>,
## #   Matched_Norm_Sample_UUID <chr>
## 
## $PR_TCGA_CESC_PAIR_Capture_All_Pairs.aggregated.capture.tcga.uuid.somatic.maf
## # A tibble: 10,020 x 90
##    Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position
##    <chr>                <int> <chr>       <dbl> <chr>               <dbl>
##  1 CDK11B                 984 broad…         37 1                 1572104
##  2 CAMTA1               23261 broad…         37 1                 7724942
##  3 PER3                  8863 broad…         37 1                 7863155
##  4 CSMD2               114784 broad…         37 1                34180259
##  5 BSND                  7809 broad…         37 1                55464880
##  6 TUFT1                 7286 broad…         37 1               151534633
##  7 IFI16                 3428 broad…         37 1               159023487
##  8 ADORA1                 134 broad…         37 1               203134455
##  9 OPTC                 26254 broad…         37 1               203472794
## 10 FLVCR1               28982 broad…         37 1               213032277
## # … with 10,010 more rows, and 84 more variables: End_position <dbl>,
## #   Strand <chr>, Variant_Classification <chr>, Variant_Type <chr>,
## #   Reference_Allele <chr>, Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>,
## #   dbSNP_RS <chr>, dbSNP_Val_Status <chr>, Tumor_Sample_Barcode <chr>,
## #   Matched_Norm_Sample_Barcode <chr>, Match_Norm_Seq_Allele1 <lgl>,
## #   Match_Norm_Seq_Allele2 <lgl>, Tumor_Validation_Allele1 <lgl>,
## #   Tumor_Validation_Allele2 <lgl>, Match_Norm_Validation_Allele1 <lgl>,
## #   Match_Norm_Validation_Allele2 <lgl>, Verification_Status <lgl>,
## #   Validation_Status <lgl>, Mutation_Status <chr>, Sequencing_Phase <chr>,
## #   Sequence_Source <chr>, Validation_Method <lgl>, Score <lgl>,
## #   BAM_file <lgl>, Sequencer <chr>, Tumor_Sample_UUID <chr>,
## #   Matched_Norm_Sample_UUID <chr>, Genome_Change <chr>,
## #   Annotation_Transcript <chr>, Transcript_Strand <chr>,
## #   Transcript_Exon <dbl>, Transcript_Position <chr>, cDNA_Change <chr>,
## #   Codon_Change <chr>, Protein_Change <chr>, Other_Transcripts <chr>,
## #   Refseq_mRNA_Id <chr>, Refseq_prot_Id <chr>, SwissProt_acc_Id <chr>,
## #   SwissProt_entry_Id <chr>, Description <chr>, UniProt_AApos <chr>,
## #   UniProt_Region <chr>, UniProt_Site <chr>, UniProt_Natural_Variations <chr>,
## #   UniProt_Experimental_Info <chr>, GO_Biological_Process <chr>,
## #   GO_Cellular_Component <chr>, GO_Molecular_Function <chr>,
## #   COSMIC_overlapping_mutations <chr>, COSMIC_fusion_genes <chr>,
## #   COSMIC_tissue_types_affected <chr>, COSMIC_total_alterations_in_gene <dbl>,
## #   Tumorscape_Amplification_Peaks <chr>, Tumorscape_Deletion_Peaks <chr>,
## #   TCGAscape_Amplification_Peaks <chr>, TCGAscape_Deletion_Peaks <chr>,
## #   DrugBank <chr>, ref_context <chr>, gc_content <dbl>,
## #   CCLE_ONCOMAP_overlapping_mutations <chr>,
## #   CCLE_ONCOMAP_total_mutations_in_gene <dbl>, CGC_Mutation_Type <chr>,
## #   CGC_Translocation_Partner <chr>, CGC_Tumor_Types_Somatic <chr>,
## #   CGC_Tumor_Types_Germline <chr>, CGC_Other_Diseases <chr>,
## #   DNARepairGenes_Role <chr>, FamilialCancerDatabase_Syndromes <chr>,
## #   MUTSIG_Published_Results <chr>, OREGANNO_ID <chr>, OREGANNO_Values <chr>,
## #   t_alt_count <int>, t_ref_count <int>, validation_status <lgl>,
## #   validation_method <lgl>, validation_tumor_sample <lgl>,
## #   validation_alt_allele <lgl>, pox <dbl>, qox <dbl>, pox_cutoff <dbl>,
## #   isArtifactMode <dbl>, oxoGCut <dbl>
## 
## $gsc_CESC_pairs.aggregated.capture.tcga.uuid.automated.somatic.maf
## # A tibble: 97,933 x 90
##    Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position
##    <chr>                <int> <chr>       <dbl> <chr>               <dbl>
##  1 SLC8A1                6546 broad…         37 2                40405622
##  2 RP11-156P1…              0 broad…         37 17               45125593
##  3 PDE3A                 5139 broad…         37 12               20522788
##  4 HLA-V                    0 broad…         37 6                29760111
##  5 HHLA1                10086 broad…         37 8               133089975
##  6 PANX2                56666 broad…         37 22               50615969
##  7 MIR146B                  0 broad…         37 10              104196339
##  8 ZNF775              285971 broad…         37 7               150095001
##  9 LCE2C               353140 broad…         37 1               152648817
## 10 RPLP0                 6175 broad…         37 12              120635255
## # … with 97,923 more rows, and 84 more variables: End_position <dbl>,
## #   Strand <chr>, Variant_Classification <chr>, Variant_Type <chr>,
## #   Reference_Allele <chr>, Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>,
## #   dbSNP_RS <chr>, dbSNP_Val_Status <chr>, Tumor_Sample_Barcode <chr>,
## #   Matched_Norm_Sample_Barcode <chr>, Match_Norm_Seq_Allele1 <lgl>,
## #   Match_Norm_Seq_Allele2 <lgl>, Tumor_Validation_Allele1 <lgl>,
## #   Tumor_Validation_Allele2 <lgl>, Match_Norm_Validation_Allele1 <lgl>,
## #   Match_Norm_Validation_Allele2 <lgl>, Verification_Status <lgl>,
## #   Validation_Status <chr>, Mutation_Status <chr>, Sequencing_Phase <chr>,
## #   Sequence_Source <chr>, Validation_Method <chr>, Score <lgl>,
## #   BAM_file <lgl>, Sequencer <chr>, Tumor_Sample_UUID <chr>,
## #   Matched_Norm_Sample_UUID <chr>, Genome_Change <chr>,
## #   Annotation_Transcript <chr>, Transcript_Strand <chr>,
## #   Transcript_Exon <dbl>, Transcript_Position <dbl>, cDNA_Change <chr>,
## #   Codon_Change <chr>, Protein_Change <chr>, Other_Transcripts <chr>,
## #   Refseq_mRNA_Id <chr>, Refseq_prot_Id <chr>, SwissProt_acc_Id <chr>,
## #   SwissProt_entry_Id <chr>, Description <chr>, UniProt_AApos <dbl>,
## #   UniProt_Region <chr>, UniProt_Site <lgl>, UniProt_Natural_Variations <chr>,
## #   UniProt_Experimental_Info <chr>, GO_Biological_Process <chr>,
## #   GO_Cellular_Component <chr>, GO_Molecular_Function <chr>,
## #   COSMIC_overlapping_mutations <chr>, COSMIC_fusion_genes <chr>,
## #   COSMIC_tissue_types_affected <chr>, COSMIC_total_alterations_in_gene <dbl>,
## #   Tumorscape_Amplification_Peaks <chr>, Tumorscape_Deletion_Peaks <chr>,
## #   TCGAscape_Amplification_Peaks <chr>, TCGAscape_Deletion_Peaks <chr>,
## #   DrugBank <chr>, ref_context <chr>, gc_content <dbl>,
## #   CCLE_ONCOMAP_overlapping_mutations <chr>,
## #   CCLE_ONCOMAP_total_mutations_in_gene <dbl>, CGC_Mutation_Type <chr>,
## #   CGC_Translocation_Partner <chr>, CGC_Tumor_Types_Somatic <chr>,
## #   CGC_Tumor_Types_Germline <chr>, CGC_Other_Diseases <chr>,
## #   DNARepairGenes_Role <chr>, FamilialCancerDatabase_Syndromes <chr>,
## #   MUTSIG_Published_Results <chr>, OREGANNO_ID <chr>, OREGANNO_Values <chr>,
## #   t_alt_count <int>, t_ref_count <int>, validation_alt_allele <lgl>,
## #   validation_method <lgl>, validation_status <lgl>,
## #   validation_tumor_sample <lgl>, pox <dbl>, qox <dbl>, pox_cutoff <dbl>,
## #   isArtifactMode <dbl>, oxoGCut <dbl>
## 
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
##   X1
## 1  1
## 2  2
## 3  3
## 4  4
## 5  5
## 6  6

Simple nucleotide variation - hg19

Tiago Silva

2019-12-24

1 Look at what the query returns

2 Search each file

3 Data