R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

cBioPortal manual download: TCGA-BRCA survival analysis

Goal:

1. Read cBioPortal clinical patient data

2. Read cBioPortal mutation data

3. Define NPM1 mutation status

4. Build a survival dataset

5. Run Kaplan-Meier and Cox regression analyses

STEP 0: Load packages

library(readr)       # for reading tab-delimited files
library(dplyr)       # for data wrangling

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(survival)    # for Surv(), survfit(), coxph()
library(survminer)   # for ggsurvplot()

## Loading required package: ggplot2

## Loading required package: ggpubr

## 
## Attaching package: 'survminer'

## The following object is masked from 'package:survival':
## 
##     myeloma

STEP 1: Set working directory

setwd("~/PhD/Assignment_1/aml_target_gdc")
dir()

##  [1] "Assignment1.html"                                     
##  [2] "Assignment1.Rmd"                                      
##  [3] "case_lists"                                           
##  [4] "data_clinical_patient.txt"                            
##  [5] "data_clinical_sample.txt"                             
##  [6] "data_cna.txt"                                         
##  [7] "data_gene_panel_matrix.txt"                           
##  [8] "data_mrna_seq_fpkm_zscores_ref_all_samples.txt"       
##  [9] "data_mrna_seq_fpkm.txt"                               
## [10] "data_mrna_seq_read_counts_zscores_ref_all_samples.txt"
## [11] "data_mrna_seq_read_counts.txt"                        
## [12] "data_mrna_seq_tpm_zscores_ref_all_samples.txt"        
## [13] "data_mrna_seq_tpm.txt"                                
## [14] "data_mutations.txt"                                   
## [15] "data_timeline_diagnosis.txt"                          
## [16] "GCA_CompareNPM1.png"                                  
## [17] "meta_clinical_patient.txt"                            
## [18] "meta_clinical_sample.txt"                             
## [19] "meta_cna.txt"                                         
## [20] "meta_gene_panel_matrix.txt"                           
## [21] "meta_mrna_seq_fpkm_zscores_ref_all_samples.txt"       
## [22] "meta_mrna_seq_fpkm.txt"                               
## [23] "meta_mrna_seq_read_counts_zscores_ref_all_samples.txt"
## [24] "meta_mrna_seq_read_counts.txt"                        
## [25] "meta_mrna_seq_tpm_zscores_ref_all_samples.txt"        
## [26] "meta_mrna_seq_tpm.txt"                                
## [27] "meta_mutations.txt"                                   
## [28] "meta_study.txt"                                       
## [29] "meta_timeline_diagnosis.txt"                          
## [30] "README.md"                                            
## [31] "validation_reports"

STEP 2: Read patient-level clinical data

# cBioPortal clinical files usually begin with comment lines starting with "#"
# comment = "#" tells read_tsv() to ignore those lines
clinical <- read_tsv(
  "~/PhD/Assignment_1/aml_target_gdc/data_clinical_patient.txt",
  comment = "#",
  show_col_types = FALSE
)

# Inspect the column names
colnames(clinical)

##  [1] "PATIENT_ID"           "DAYS_TO_BIRTH"        "DAYS_TO_DEATH"       
##  [4] "ETHNICITY"            "PROJECT_ID"           "PRIMARY_DIAGNOSIS"   
##  [7] "PRIMARY_SITE_PATIENT" "RACE"                 "SEX"                 
## [10] "VITAL_STATUS"         "OS_STATUS"            "OS_MONTHS"           
## [13] "AGE"

STEP 3: Build a clean survival dataset

For cBioPortal TCGA studies, the main patient-level columns usually include:

PATIENT_ID

OS_STATUS

OS_MONTHS

AGE

OS_STATUS is often coded like:

“0:LIVING” or “1:DECEASED”

OS_MONTHS is overall survival time in months

clinical2 <- clinical %>%
  transmute(
    patient_id = PATIENT_ID,                              # patient identifier
    os_status = OS_STATUS,                               # survival status
    os_months = as.numeric(OS_MONTHS),                   # overall survival time in months
    age = as.numeric(AGE),                               # age, usually in years
    sex = SEX,                                           # sex
    race = RACE,                                         # race if available
    ethnicity = ETHNICITY                                # ethnicity if available
  ) %>%
  distinct() %>%                                         # remove fully duplicated rows
  mutate(
    # Create event indicator:
    # 1 = death, 0 = censored/alive
    event = ifelse(grepl("^1:", os_status), 1, 0),

    # Survival time variable
    time = os_months
  ) %>%
  filter(!is.na(time)) %>%                               # remove missing survival time
  filter(time >= 0)                                      # remove any negative times if present

STEP 4: Check the clinical survival dataset

dim(clinical2)

## [1] 742   9

length(unique(clinical2$patient_id))

## [1] 742

table(clinical2$event)

## 
##   1 
## 742

summary(clinical2$time)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   0.03285   7.03022  12.43430  16.24715  21.48489 106.53745

STEP 5: Read mutation data

mut <- read_tsv(
  "~/PhD/Assignment_1/aml_target_gdc/data_mutations.txt",
  comment = "#",
  show_col_types = FALSE
)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

# Inspect mutation columns
colnames(mut)

##   [1] "Hugo_Symbol"                       "Entrez_Gene_Id"                   
##   [3] "Center"                            "NCBI_Build"                       
##   [5] "Chromosome"                        "Start_Position"                   
##   [7] "End_Position"                      "Strand"                           
##   [9] "Consequence"                       "Variant_Classification"           
##  [11] "Variant_Type"                      "Reference_Allele"                 
##  [13] "Tumor_Seq_Allele1"                 "Tumor_Seq_Allele2"                
##  [15] "dbSNP_RS"                          "dbSNP_Val_Status"                 
##  [17] "Tumor_Sample_Barcode"              "Matched_Norm_Sample_Barcode"      
##  [19] "Match_Norm_Seq_Allele1"            "Match_Norm_Seq_Allele2"           
##  [21] "Tumor_Validation_Allele1"          "Tumor_Validation_Allele2"         
##  [23] "Match_Norm_Validation_Allele1"     "Match_Norm_Validation_Allele2"    
##  [25] "Verification_Status"               "Validation_Status"                
##  [27] "Mutation_Status"                   "Sequencing_Phase"                 
##  [29] "Sequence_Source"                   "Validation_Method"                
##  [31] "Score"                             "BAM_File"                         
##  [33] "Sequencer"                         "t_ref_count"                      
##  [35] "t_alt_count"                       "n_ref_count"                      
##  [37] "n_alt_count"                       "HGVSc"                            
##  [39] "HGVSp"                             "HGVSp_Short"                      
##  [41] "Transcript_ID"                     "RefSeq"                           
##  [43] "Protein_position"                  "Codons"                           
##  [45] "Exon_Number"                       "1000G_AF"                         
##  [47] "1000G_AFR_AF"                      "1000G_AMR_AF"                     
##  [49] "1000G_EAS_AF"                      "1000G_EUR_AF"                     
##  [51] "1000G_SAS_AF"                      "APPRIS"                           
##  [53] "Allele"                            "Amino_acids"                      
##  [55] "BIOTYPE"                           "CANONICAL"                        
##  [57] "CCDS"                              "CDS_position"                     
##  [59] "CLIN_SIG"                          "CONTEXT"                          
##  [61] "COSMIC"                            "DISTANCE"                         
##  [63] "DOMAINS"                           "ENSP"                             
##  [65] "ESP_AA_AF"                         "ESP_EA_AF"                        
##  [67] "EXON"                              "Existing_variation"               
##  [69] "FLAGS"                             "Feature"                          
##  [71] "Feature_type"                      "GDC_FILTER"                       
##  [73] "GENE_PHENO"                        "Gene"                             
##  [75] "HGNC_ID"                           "HGVS_OFFSET"                      
##  [77] "HIGH_INF_POS"                      "IMPACT"                           
##  [79] "INTRON"                            "MANE"                             
##  [81] "MAX_AF"                            "MAX_AF_POPS"                      
##  [83] "MOTIF_NAME"                        "MOTIF_POS"                        
##  [85] "MOTIF_SCORE_CHANGE"                "One_Consequence"                  
##  [87] "PHENO"                             "PICK"                             
##  [89] "PUBMED"                            "PolyPhen"                         
##  [91] "RNA_Support"                       "RNA_alt_count"                    
##  [93] "RNA_depth"                         "RNA_ref_count"                    
##  [95] "SIFT"                              "SOMATIC"                          
##  [97] "SWISSPROT"                         "SYMBOL"                           
##  [99] "SYMBOL_SOURCE"                     "TRANSCRIPTION_FACTORS"            
## [101] "TRANSCRIPT_STRAND"                 "TREMBL"                           
## [103] "TSL"                               "UNIPARC"                          
## [105] "UNIPROT_ISOFORM"                   "VARIANT_CLASS"                    
## [107] "all_effects"                       "cDNA_position"                    
## [109] "callers"                           "case_id"                          
## [111] "genomic_location_explanation"      "gnomAD_AF"                        
## [113] "gnomAD_AFR_AF"                     "gnomAD_AMR_AF"                    
## [115] "gnomAD_ASJ_AF"                     "gnomAD_EAS_AF"                    
## [117] "gnomAD_FIN_AF"                     "gnomAD_NFE_AF"                    
## [119] "gnomAD_OTH_AF"                     "gnomAD_SAS_AF"                    
## [121] "gnomAD_non_cancer_AF"              "gnomAD_non_cancer_AFR_AF"         
## [123] "gnomAD_non_cancer_AMI_AF"          "gnomAD_non_cancer_AMR_AF"         
## [125] "gnomAD_non_cancer_ASJ_AF"          "gnomAD_non_cancer_EAS_AF"         
## [127] "gnomAD_non_cancer_FIN_AF"          "gnomAD_non_cancer_MAX_AF_POPS_adj"
## [129] "gnomAD_non_cancer_MAX_AF_adj"      "gnomAD_non_cancer_MID_AF"         
## [131] "gnomAD_non_cancer_NFE_AF"          "gnomAD_non_cancer_OTH_AF"         
## [133] "gnomAD_non_cancer_SAS_AF"          "hotspot"                          
## [135] "miRNA"                             "n_depth"                          
## [137] "normal_bam_uuid"                   "t_depth"                          
## [139] "tumor_bam_uuid"                    "Annotation_Status"

STEP 6: Extract NPM1-mutant patients

cBioPortal mutation files usually have:

Hugo_Symbol

Tumor_Sample_Barcode

Patient IDs for TCGA can be recovered from the first 12 characters

of Tumor_Sample_Barcode

NPM1 <- mut %>%
  filter(Hugo_Symbol == "NPM1") %>%                    # keep NPM1 variants only
  mutate(
    patient_id = substr(Tumor_Sample_Barcode, 1, 16) # sample barcode -> patient barcode
  ) %>%
  distinct(patient_id) %>%                               # one row per patient
  mutate(NPM1_mut = 1)                                 # mutation indicator

# Check number of mutated patients
nrow(NPM1)

## [1] 59

head(NPM1)

## # A tibble: 6 × 2
##   patient_id       NPM1_mut
##   <chr>               <dbl>
## 1 TARGET-20-PAEFGT        1
## 2 TARGET-20-PASLSE        1
## 3 TARGET-20-PASPGA        1
## 4 TARGET-20-PASCBW        1
## 5 TARGET-20-PASANW        1
## 6 TARGET-20-PASHWN        1

STEP 7: Join mutation status onto clinical data

dat <- clinical2 %>%
  left_join(NPM1, by = "patient_id") %>%
  mutate(
  # Patients not in the NPM1 mutation table are treated as wild-type
    NPM1_mut = ifelse(is.na(NPM1_mut), 0, 1)
  )

STEP 8: Check final dataset

dim(dat)

## [1] 742  10

table(dat$NPM1_mut)

## 
##   0   1 
## 729  13

table(dat$event)

## 
##   1 
## 742

summary(dat$time)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   0.03285   7.03022  12.43430  16.24715  21.48489 106.53745

colSums(is.na(dat[, c("time", "event", "NPM1_mut", "age", "sex")]))

##     time    event NPM1_mut      age      sex 
##        0        0        0        3        0

STEP 9: Kaplan-Meier survival analysis

fit <- survfit(
  Surv(time, event) ~ NPM1_mut,
  data = dat
)

ggsurvplot(
  fit,
  data = dat,
  pval = TRUE,
  risk.table = TRUE,
  legend.labs = c("NPM1 wild-type", "NPM1 mutant"),
  xlab = "Overall survival time (months)",
  ylab = "Overall survival probability",
  title = "cBioPortal TARGET AML: survival by NPM1 mutation status"
)

## Ignoring unknown labels:
## • colour : "Strata"

# Fit Cox model adjusting for age and sex
cox_fit <- coxph(
  Surv(time, event) ~ NPM1_mut + age + sex,
  data = dat
)

# Display model summary
summary(cox_fit)

## Call:
## coxph(formula = Surv(time, event) ~ NPM1_mut + age + sex, data = dat)
## 
##   n= 739, number of events= 739 
##    (3 observations deleted due to missingness)
## 
##              coef exp(coef) se(coef)      z Pr(>|z|)
## NPM1_mut -0.27188   0.76194  0.28058 -0.969    0.333
## age      -0.06862   0.93368  0.05697 -1.204    0.228
## sexMale  -0.01314   0.98694  0.07405 -0.177    0.859
## 
##          exp(coef) exp(-coef) lower .95 upper .95
## NPM1_mut    0.7619      1.312    0.4396     1.321
## age         0.9337      1.071    0.8350     1.044
## sexMale     0.9869      1.013    0.8536     1.141
## 
## Concordance= 0.514  (se = 0.011 )
## Likelihood ratio test= 2.68  on 3 df,   p=0.4
## Wald test            = 2.44  on 3 df,   p=0.5
## Score (logrank) test = 2.46  on 3 df,   p=0.5

# Always check the proportional hazards assumption:
cox.zph(cox_fit)

##           chisq df     p
## NPM1_mut 0.0027  1 0.959
## age      0.3133  1 0.576
## sex      3.0044  1 0.083
## GLOBAL   3.4416  3 0.328

STEP 10: Cox proportional hazards model

# First check whether sex is usable
table(dat$sex, dat$event)

##         
##            1
##   Female 351
##   Male   391

# A simple model adjusted for age is usually more stable.

cox_fit <- coxph(
  Surv(time, event) ~ NPM1_mut + age,
  data = dat
)

summary(cox_fit)

## Call:
## coxph(formula = Surv(time, event) ~ NPM1_mut + age, data = dat)
## 
##   n= 739, number of events= 739 
##    (3 observations deleted due to missingness)
## 
##              coef exp(coef) se(coef)      z Pr(>|z|)
## NPM1_mut -0.27252   0.76146  0.28056 -0.971    0.331
## age      -0.06831   0.93397  0.05693 -1.200    0.230
## 
##          exp(coef) exp(-coef) lower .95 upper .95
## NPM1_mut    0.7615      1.313    0.4394     1.320
## age         0.9340      1.071    0.8354     1.044
## 
## Concordance= 0.504  (se = 0.005 )
## Likelihood ratio test= 2.65  on 2 df,   p=0.3
## Wald test            = 2.41  on 2 df,   p=0.3
## Score (logrank) test = 2.43  on 2 df,   p=0.3

STEP 11: Check proportional hazards assumption

cox.zph(cox_fit)

##            chisq df    p
## NPM1_mut 0.00296  1 0.96
## age      0.31736  1 0.57
## GLOBAL   0.32067  2 0.85

STEP 12: Optional forest plot

ggforest(cox_fit, data = dat)

Assignment CBioportal

Caroline

2026-03-19