This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
library(readr) # for reading tab-delimited files
library(dplyr) # for data wrangling
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(survival) # for Surv(), survfit(), coxph()
library(survminer) # for ggsurvplot()
## Loading required package: ggplot2
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
setwd("~/PhD/Assignment_1/aml_target_gdc")
dir()
## [1] "Assignment1.html"
## [2] "Assignment1.Rmd"
## [3] "case_lists"
## [4] "data_clinical_patient.txt"
## [5] "data_clinical_sample.txt"
## [6] "data_cna.txt"
## [7] "data_gene_panel_matrix.txt"
## [8] "data_mrna_seq_fpkm_zscores_ref_all_samples.txt"
## [9] "data_mrna_seq_fpkm.txt"
## [10] "data_mrna_seq_read_counts_zscores_ref_all_samples.txt"
## [11] "data_mrna_seq_read_counts.txt"
## [12] "data_mrna_seq_tpm_zscores_ref_all_samples.txt"
## [13] "data_mrna_seq_tpm.txt"
## [14] "data_mutations.txt"
## [15] "data_timeline_diagnosis.txt"
## [16] "GCA_CompareNPM1.png"
## [17] "meta_clinical_patient.txt"
## [18] "meta_clinical_sample.txt"
## [19] "meta_cna.txt"
## [20] "meta_gene_panel_matrix.txt"
## [21] "meta_mrna_seq_fpkm_zscores_ref_all_samples.txt"
## [22] "meta_mrna_seq_fpkm.txt"
## [23] "meta_mrna_seq_read_counts_zscores_ref_all_samples.txt"
## [24] "meta_mrna_seq_read_counts.txt"
## [25] "meta_mrna_seq_tpm_zscores_ref_all_samples.txt"
## [26] "meta_mrna_seq_tpm.txt"
## [27] "meta_mutations.txt"
## [28] "meta_study.txt"
## [29] "meta_timeline_diagnosis.txt"
## [30] "README.md"
## [31] "validation_reports"
# cBioPortal clinical files usually begin with comment lines starting with "#"
# comment = "#" tells read_tsv() to ignore those lines
clinical <- read_tsv(
"~/PhD/Assignment_1/aml_target_gdc/data_clinical_patient.txt",
comment = "#",
show_col_types = FALSE
)
# Inspect the column names
colnames(clinical)
## [1] "PATIENT_ID" "DAYS_TO_BIRTH" "DAYS_TO_DEATH"
## [4] "ETHNICITY" "PROJECT_ID" "PRIMARY_DIAGNOSIS"
## [7] "PRIMARY_SITE_PATIENT" "RACE" "SEX"
## [10] "VITAL_STATUS" "OS_STATUS" "OS_MONTHS"
## [13] "AGE"
clinical2 <- clinical %>%
transmute(
patient_id = PATIENT_ID, # patient identifier
os_status = OS_STATUS, # survival status
os_months = as.numeric(OS_MONTHS), # overall survival time in months
age = as.numeric(AGE), # age, usually in years
sex = SEX, # sex
race = RACE, # race if available
ethnicity = ETHNICITY # ethnicity if available
) %>%
distinct() %>% # remove fully duplicated rows
mutate(
# Create event indicator:
# 1 = death, 0 = censored/alive
event = ifelse(grepl("^1:", os_status), 1, 0),
# Survival time variable
time = os_months
) %>%
filter(!is.na(time)) %>% # remove missing survival time
filter(time >= 0) # remove any negative times if present
dim(clinical2)
## [1] 742 9
length(unique(clinical2$patient_id))
## [1] 742
table(clinical2$event)
##
## 1
## 742
summary(clinical2$time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03285 7.03022 12.43430 16.24715 21.48489 106.53745
mut <- read_tsv(
"~/PhD/Assignment_1/aml_target_gdc/data_mutations.txt",
comment = "#",
show_col_types = FALSE
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
# Inspect mutation columns
colnames(mut)
## [1] "Hugo_Symbol" "Entrez_Gene_Id"
## [3] "Center" "NCBI_Build"
## [5] "Chromosome" "Start_Position"
## [7] "End_Position" "Strand"
## [9] "Consequence" "Variant_Classification"
## [11] "Variant_Type" "Reference_Allele"
## [13] "Tumor_Seq_Allele1" "Tumor_Seq_Allele2"
## [15] "dbSNP_RS" "dbSNP_Val_Status"
## [17] "Tumor_Sample_Barcode" "Matched_Norm_Sample_Barcode"
## [19] "Match_Norm_Seq_Allele1" "Match_Norm_Seq_Allele2"
## [21] "Tumor_Validation_Allele1" "Tumor_Validation_Allele2"
## [23] "Match_Norm_Validation_Allele1" "Match_Norm_Validation_Allele2"
## [25] "Verification_Status" "Validation_Status"
## [27] "Mutation_Status" "Sequencing_Phase"
## [29] "Sequence_Source" "Validation_Method"
## [31] "Score" "BAM_File"
## [33] "Sequencer" "t_ref_count"
## [35] "t_alt_count" "n_ref_count"
## [37] "n_alt_count" "HGVSc"
## [39] "HGVSp" "HGVSp_Short"
## [41] "Transcript_ID" "RefSeq"
## [43] "Protein_position" "Codons"
## [45] "Exon_Number" "1000G_AF"
## [47] "1000G_AFR_AF" "1000G_AMR_AF"
## [49] "1000G_EAS_AF" "1000G_EUR_AF"
## [51] "1000G_SAS_AF" "APPRIS"
## [53] "Allele" "Amino_acids"
## [55] "BIOTYPE" "CANONICAL"
## [57] "CCDS" "CDS_position"
## [59] "CLIN_SIG" "CONTEXT"
## [61] "COSMIC" "DISTANCE"
## [63] "DOMAINS" "ENSP"
## [65] "ESP_AA_AF" "ESP_EA_AF"
## [67] "EXON" "Existing_variation"
## [69] "FLAGS" "Feature"
## [71] "Feature_type" "GDC_FILTER"
## [73] "GENE_PHENO" "Gene"
## [75] "HGNC_ID" "HGVS_OFFSET"
## [77] "HIGH_INF_POS" "IMPACT"
## [79] "INTRON" "MANE"
## [81] "MAX_AF" "MAX_AF_POPS"
## [83] "MOTIF_NAME" "MOTIF_POS"
## [85] "MOTIF_SCORE_CHANGE" "One_Consequence"
## [87] "PHENO" "PICK"
## [89] "PUBMED" "PolyPhen"
## [91] "RNA_Support" "RNA_alt_count"
## [93] "RNA_depth" "RNA_ref_count"
## [95] "SIFT" "SOMATIC"
## [97] "SWISSPROT" "SYMBOL"
## [99] "SYMBOL_SOURCE" "TRANSCRIPTION_FACTORS"
## [101] "TRANSCRIPT_STRAND" "TREMBL"
## [103] "TSL" "UNIPARC"
## [105] "UNIPROT_ISOFORM" "VARIANT_CLASS"
## [107] "all_effects" "cDNA_position"
## [109] "callers" "case_id"
## [111] "genomic_location_explanation" "gnomAD_AF"
## [113] "gnomAD_AFR_AF" "gnomAD_AMR_AF"
## [115] "gnomAD_ASJ_AF" "gnomAD_EAS_AF"
## [117] "gnomAD_FIN_AF" "gnomAD_NFE_AF"
## [119] "gnomAD_OTH_AF" "gnomAD_SAS_AF"
## [121] "gnomAD_non_cancer_AF" "gnomAD_non_cancer_AFR_AF"
## [123] "gnomAD_non_cancer_AMI_AF" "gnomAD_non_cancer_AMR_AF"
## [125] "gnomAD_non_cancer_ASJ_AF" "gnomAD_non_cancer_EAS_AF"
## [127] "gnomAD_non_cancer_FIN_AF" "gnomAD_non_cancer_MAX_AF_POPS_adj"
## [129] "gnomAD_non_cancer_MAX_AF_adj" "gnomAD_non_cancer_MID_AF"
## [131] "gnomAD_non_cancer_NFE_AF" "gnomAD_non_cancer_OTH_AF"
## [133] "gnomAD_non_cancer_SAS_AF" "hotspot"
## [135] "miRNA" "n_depth"
## [137] "normal_bam_uuid" "t_depth"
## [139] "tumor_bam_uuid" "Annotation_Status"
NPM1 <- mut %>%
filter(Hugo_Symbol == "NPM1") %>% # keep NPM1 variants only
mutate(
patient_id = substr(Tumor_Sample_Barcode, 1, 16) # sample barcode -> patient barcode
) %>%
distinct(patient_id) %>% # one row per patient
mutate(NPM1_mut = 1) # mutation indicator
# Check number of mutated patients
nrow(NPM1)
## [1] 59
head(NPM1)
## # A tibble: 6 × 2
## patient_id NPM1_mut
## <chr> <dbl>
## 1 TARGET-20-PAEFGT 1
## 2 TARGET-20-PASLSE 1
## 3 TARGET-20-PASPGA 1
## 4 TARGET-20-PASCBW 1
## 5 TARGET-20-PASANW 1
## 6 TARGET-20-PASHWN 1
dat <- clinical2 %>%
left_join(NPM1, by = "patient_id") %>%
mutate(
# Patients not in the NPM1 mutation table are treated as wild-type
NPM1_mut = ifelse(is.na(NPM1_mut), 0, 1)
)
dim(dat)
## [1] 742 10
table(dat$NPM1_mut)
##
## 0 1
## 729 13
table(dat$event)
##
## 1
## 742
summary(dat$time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03285 7.03022 12.43430 16.24715 21.48489 106.53745
colSums(is.na(dat[, c("time", "event", "NPM1_mut", "age", "sex")]))
## time event NPM1_mut age sex
## 0 0 0 3 0
fit <- survfit(
Surv(time, event) ~ NPM1_mut,
data = dat
)
ggsurvplot(
fit,
data = dat,
pval = TRUE,
risk.table = TRUE,
legend.labs = c("NPM1 wild-type", "NPM1 mutant"),
xlab = "Overall survival time (months)",
ylab = "Overall survival probability",
title = "cBioPortal TARGET AML: survival by NPM1 mutation status"
)
## Ignoring unknown labels:
## • colour : "Strata"
# Fit Cox model adjusting for age and sex
cox_fit <- coxph(
Surv(time, event) ~ NPM1_mut + age + sex,
data = dat
)
# Display model summary
summary(cox_fit)
## Call:
## coxph(formula = Surv(time, event) ~ NPM1_mut + age + sex, data = dat)
##
## n= 739, number of events= 739
## (3 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## NPM1_mut -0.27188 0.76194 0.28058 -0.969 0.333
## age -0.06862 0.93368 0.05697 -1.204 0.228
## sexMale -0.01314 0.98694 0.07405 -0.177 0.859
##
## exp(coef) exp(-coef) lower .95 upper .95
## NPM1_mut 0.7619 1.312 0.4396 1.321
## age 0.9337 1.071 0.8350 1.044
## sexMale 0.9869 1.013 0.8536 1.141
##
## Concordance= 0.514 (se = 0.011 )
## Likelihood ratio test= 2.68 on 3 df, p=0.4
## Wald test = 2.44 on 3 df, p=0.5
## Score (logrank) test = 2.46 on 3 df, p=0.5
# Always check the proportional hazards assumption:
cox.zph(cox_fit)
## chisq df p
## NPM1_mut 0.0027 1 0.959
## age 0.3133 1 0.576
## sex 3.0044 1 0.083
## GLOBAL 3.4416 3 0.328
# First check whether sex is usable
table(dat$sex, dat$event)
##
## 1
## Female 351
## Male 391
# A simple model adjusted for age is usually more stable.
cox_fit <- coxph(
Surv(time, event) ~ NPM1_mut + age,
data = dat
)
summary(cox_fit)
## Call:
## coxph(formula = Surv(time, event) ~ NPM1_mut + age, data = dat)
##
## n= 739, number of events= 739
## (3 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## NPM1_mut -0.27252 0.76146 0.28056 -0.971 0.331
## age -0.06831 0.93397 0.05693 -1.200 0.230
##
## exp(coef) exp(-coef) lower .95 upper .95
## NPM1_mut 0.7615 1.313 0.4394 1.320
## age 0.9340 1.071 0.8354 1.044
##
## Concordance= 0.504 (se = 0.005 )
## Likelihood ratio test= 2.65 on 2 df, p=0.3
## Wald test = 2.41 on 2 df, p=0.3
## Score (logrank) test = 2.43 on 2 df, p=0.3
cox.zph(cox_fit)
## chisq df p
## NPM1_mut 0.00296 1 0.96
## age 0.31736 1 0.57
## GLOBAL 0.32067 2 0.85
ggforest(cox_fit, data = dat)