In this dataset, I'll be using data from cBioPortal to analyze cases of acute myeloid leukemia.
setwd("~/git/CUNY.MDS/DATA607/")
library(data.table)
library(snakecase)
library(tidyverse)
library(kableExtra)
library(survival)
library(survminer)
library(ggpubr)
library(broom)
For our purposes, we'll be using survival data, alongside mutation data. We can subset the dataset for variables of interest:
dataset3 = fread("aml_ohsu_2018_clinical_data.tsv")
names(dataset3) = snakecase::to_mixed_case(names(dataset3))
dataset3 = dataset3[, .(Patient_Id, Sample_Id, Overall_Survival_Months, Overall_Survival_Status, Tp_53_Mutation, Dnmt_3_A_Mutation, Eln_2017_Risk_Classification, Npm_1_Consensus_Call, Cumulative_Treatment_Regimens)]
For the purposes of this assignment, we'll be using risk stratification for converting the dataset to wide form and populating the values with the respective mutations:
Thei first step is consolidating risk categories:
dataset3[, .N, by = "Eln_2017_Risk_Classification"] %>% kable %>% kable_styling( full_width = F)
| Eln_2017_Risk_Classification | N |
|---|---|
| Favorable or Intermediate | 27 |
| Adverse | 242 |
| Intermediate or Adverse | 11 |
| Intermediate | 235 |
| Favorable | 155 |
| Unknown | 2 |
This is a bit messy, so we'l consolidate:
dataset3[grepl("favorable", Eln_2017_Risk_Classification, ignore.case = T), Risk.Classification := "Good"]
dataset3[grepl("Adverse", Eln_2017_Risk_Classification, ignore.case = T), Risk.Classification := "Poor"]
dataset3[grepl("Intermediate", Eln_2017_Risk_Classification, ignore.case = T), Risk.Classification := "Intermediate"]
dataset3[is.na(Risk.Classification), Risk.Classification := "Unknown"]
dataset3[, Risk.Classification := factor(Risk.Classification, levels = c("Poor", "Intermediate", "Good"))]
dataset3[, .N, by = "Risk.Classification"] %>% kable %>% kable_styling( full_width = F, position = "left")
| Risk.Classification | N |
|---|---|
| Intermediate | 273 |
| Poor | 242 |
| Good | 155 |
| NA | 2 |
Most patients here have one sample but some have two. For the sake of the export, we'll parse the treatment regimens into wide form.
treatment = dataset3[,.(Patient_Id, Cumulative_Treatment_Regimens)]
lst.split = strsplit(treatment$Cumulative_Treatment_Regimens, split = "\\|")
lst.split.new = list()
for (i in 1:length(lst.split)) {
vec = lst.split[[i]]
vec = unlist(vec)
if (length(vec) < 13) {
Y = 13 - length(vec)
na.vec = rep(NA, Y)
vec = c(vec, na.vec)
}
lst.split.new[[i]] = vec
}
mat.treatments = do.call(rbind, lst.split.new)
colnames(mat.treatments) = paste0("treatment_", 1:ncol(mat.treatments))
rownames(mat.treatments) = treatment$Patient_Id
dt.treatment = as.data.table(mat.treatments, keep.rownames = "Patient_Id")
We can merge this data.table back and export:
dt.wide = merge(dataset3[, .(Patient_Id, Sample_Id, Overall_Survival_Months, Overall_Survival_Status, Tp_53_Mutation, Dnmt_3_A_Mutation, Risk.Classification, Npm_1_Consensus_Call)], dt.treatment, by = "Patient_Id")
head(dt.wide) %>% kable %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F, position = "left")
| Patient_Id | Sample_Id | Overall_Survival_Months | Overall_Survival_Status | Tp_53_Mutation | Dnmt_3_A_Mutation | Risk.Classification | Npm_1_Consensus_Call | treatment_1 | treatment_2 | treatment_3 | treatment_4 | treatment_5 | treatment_6 | treatment_7 | treatment_8 | treatment_9 | treatment_10 | treatment_11 | treatment_12 | treatment_13 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| aml_ohsu_2018_1011 | aml_ohsu_2018_13-00098 | NA | NA | NA | NA | Good | Positive | 7+3 (Cytarabine, Idarubicin) | HiDAC | MEC (Cytarabine, Etoposide, Mitoxantrone) | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| aml_ohsu_2018_1027 | aml_ohsu_2018_13-00123 | 0.30 | 1:DECEASED | NA | NA | Good | Positive | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| aml_ohsu_2018_1045 | aml_ohsu_2018_13-00145 | 6.38 | 1:DECEASED | NA | NA | Intermediate | Negative | 7+3 (Cytarabine, Idarubicin) | Azacitidine | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| aml_ohsu_2018_1046 | aml_ohsu_2018_13-00146 | 5.20 | 1:DECEASED | Negative | Negative | Good | Negative | COG-AAML1031 (IT/IV Cytarabine, Daunorubicin, Etoposide, Mitoxantrone +/- Bortezomib or Sorafenib) | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| aml_ohsu_2018_1049 | aml_ohsu_2018_13-00149 | 8.78 | 1:DECEASED | Negative | Negative | Intermediate | Negative | Decitabine | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| aml_ohsu_2018_1051 | aml_ohsu_2018_13-00150 | 13.29 | 1:DECEASED | NA | NA | Poor | Negative | 7+3 (Cytarabine, Idarubicin) | Sunitinib | HAM (Cytarabine, Mitoxantrone) | FLAG-IDA (Cytarabine, Filgrastim, Fludarabine, Idarubicin) | Decitabine | Busulfan, Cyclophosphamide | NA | NA | NA | NA | NA | NA | NA |
write.csv(dt.wide, file = "dataset3.csv")
The first step is making the mutation data boolean expressions. We'll also do the same with survival:
dataset3[is.na(Tp_53_Mutation), TP53 := FALSE]
dataset3[!is.na(Tp_53_Mutation), TP53 := TRUE]
dataset3[is.na(Npm_1_Consensus_Call), NPM1 := FALSE]
dataset3[!is.na(Npm_1_Consensus_Call), NPM1 := TRUE]
dataset3[is.na(Dnmt_3_A_Mutation), DNMT3A := FALSE]
dataset3[!is.na(Dnmt_3_A_Mutation), DNMT3A := TRUE]
dataset3[grepl("deceased", Overall_Survival_Status, ignore.case = T), SURVIVAL := 1]
dataset3[grepl("alive", Overall_Survival_Status, ignore.case = T), SURVIVAL := 0]
From here, we'll fit Cox-Proportional Hazards models for :
tbl = rbind(tidy(coxph(Surv(Overall_Survival_Months, SURVIVAL) ~ TP53, data=dataset3)),
tidy(coxph(Surv(Overall_Survival_Months, SURVIVAL) ~ DNMT3A, data=dataset3)),
tidy(coxph(Surv(Overall_Survival_Months, SURVIVAL) ~ NPM1, data=dataset3)))
tbl %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F, position= "left")
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| TP53TRUE | 0.3536207 | 0.1156758 | 3.056999 | 0.0022356 |
| DNMT3ATRUE | 0.2340358 | 0.1131688 | 2.068024 | 0.0386377 |
| NPM1TRUE | 0.9390498 | 0.7106134 | 1.321464 | 0.1863468 |
Here's the model with the interactive terms:
tidy(coxph(Surv(Overall_Survival_Months, SURVIVAL) ~ TP53*DNMT3A, data=dataset3)) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F, position = "left")
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| TP53TRUE | 0.7173523 | 0.2333776 | 3.0737836 | 0.0021136 |
| DNMT3ATRUE | 0.1973039 | 0.2027175 | 0.9732946 | 0.3304069 |
| TP53TRUE:DNMT3ATRUE | -0.6016312 | 0.3186106 | -1.8882964 | 0.0589862 |
We'll visualize these results as well:
plot0 = ggsurvplot(
fit = survfit(Surv(Overall_Survival_Months, SURVIVAL) ~ factor(Risk.Classification), data = dataset3),
xlab = "Days",
ylab = "Overall survival probability (TP53)", ggtheme = theme_minimal())
plot1 = ggsurvplot(
fit = survfit(Surv(Overall_Survival_Months, SURVIVAL) ~ TP53, data = dataset3),
xlab = "Days",
ylab = "Overall survival probability (TP53)", ggtheme = theme_minimal())
plot2 = ggsurvplot(
fit = survfit(Surv(Overall_Survival_Months, SURVIVAL) ~ DNMT3A, data = dataset3),
xlab = "Days",
ylab = "Overall survival probability (DNMT3A)", ggtheme = theme_minimal())
plot3 = ggsurvplot(
fit = survfit(Surv(Overall_Survival_Months, SURVIVAL) ~ NPM1, data = dataset3),
xlab = "Days",
ylab = "Overall survival probability (NPM1)", ggtheme = theme_minimal())
plot0
plot1
plot2
plot3
Based on our findings, we can conclude the following: