Acute Myeloid Leukemia

In this dataset, I'll be using data from cBioPortal to analyze cases of acute myeloid leukemia.

setwd("~/git/CUNY.MDS/DATA607/")
library(data.table)
library(snakecase)
library(tidyverse)
library(kableExtra)
library(survival)
library(survminer)
library(ggpubr)
library(broom)

Feature Selection

For our purposes, we'll be using survival data, alongside mutation data. We can subset the dataset for variables of interest:

dataset3 = fread("aml_ohsu_2018_clinical_data.tsv")
names(dataset3) = snakecase::to_mixed_case(names(dataset3))

dataset3 = dataset3[, .(Patient_Id, Sample_Id, Overall_Survival_Months, Overall_Survival_Status, Tp_53_Mutation, Dnmt_3_A_Mutation, Eln_2017_Risk_Classification, Npm_1_Consensus_Call, Cumulative_Treatment_Regimens)]

CSV Conversion:

For the purposes of this assignment, we'll be using risk stratification for converting the dataset to wide form and populating the values with the respective mutations:

Thei first step is consolidating risk categories:

dataset3[, .N, by = "Eln_2017_Risk_Classification"] %>% kable %>% kable_styling( full_width = F) 
Eln_2017_Risk_Classification N
Favorable or Intermediate 27
Adverse 242
Intermediate or Adverse 11
Intermediate 235
Favorable 155
Unknown 2

This is a bit messy, so we'l consolidate:

dataset3[grepl("favorable", Eln_2017_Risk_Classification, ignore.case = T), Risk.Classification := "Good"]
dataset3[grepl("Adverse", Eln_2017_Risk_Classification, ignore.case = T), Risk.Classification := "Poor"]
dataset3[grepl("Intermediate", Eln_2017_Risk_Classification, ignore.case = T), Risk.Classification := "Intermediate"]
dataset3[is.na(Risk.Classification), Risk.Classification := "Unknown"]
dataset3[, Risk.Classification := factor(Risk.Classification, levels = c("Poor", "Intermediate", "Good"))]
dataset3[, .N, by = "Risk.Classification"] %>% kable %>% kable_styling( full_width = F, position = "left") 
Risk.Classification N
Intermediate 273
Poor 242
Good 155
NA 2

Most patients here have one sample but some have two. For the sake of the export, we'll parse the treatment regimens into wide form.

treatment = dataset3[,.(Patient_Id, Cumulative_Treatment_Regimens)]
lst.split = strsplit(treatment$Cumulative_Treatment_Regimens, split = "\\|")

lst.split.new = list()
for (i in 1:length(lst.split)) {
  vec = lst.split[[i]]
  vec = unlist(vec)
  if (length(vec) < 13) {
    Y = 13 - length(vec)
    na.vec = rep(NA, Y)
    vec = c(vec, na.vec)
  }
  lst.split.new[[i]] = vec
}


mat.treatments = do.call(rbind, lst.split.new)
colnames(mat.treatments) = paste0("treatment_", 1:ncol(mat.treatments))
rownames(mat.treatments) = treatment$Patient_Id

dt.treatment = as.data.table(mat.treatments, keep.rownames = "Patient_Id")

We can merge this data.table back and export:

dt.wide = merge(dataset3[, .(Patient_Id, Sample_Id, Overall_Survival_Months, Overall_Survival_Status, Tp_53_Mutation, Dnmt_3_A_Mutation, Risk.Classification, Npm_1_Consensus_Call)], dt.treatment, by = "Patient_Id")

head(dt.wide) %>% kable %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F, position = "left") 
Patient_Id Sample_Id Overall_Survival_Months Overall_Survival_Status Tp_53_Mutation Dnmt_3_A_Mutation Risk.Classification Npm_1_Consensus_Call treatment_1 treatment_2 treatment_3 treatment_4 treatment_5 treatment_6 treatment_7 treatment_8 treatment_9 treatment_10 treatment_11 treatment_12 treatment_13
aml_ohsu_2018_1011 aml_ohsu_2018_13-00098 NA NA NA NA Good Positive 7+3 (Cytarabine, Idarubicin) HiDAC MEC (Cytarabine, Etoposide, Mitoxantrone) NA NA NA NA NA NA NA NA NA NA
aml_ohsu_2018_1027 aml_ohsu_2018_13-00123 0.30 1:DECEASED NA NA Good Positive NA NA NA NA NA NA NA NA NA NA NA NA NA
aml_ohsu_2018_1045 aml_ohsu_2018_13-00145 6.38 1:DECEASED NA NA Intermediate Negative 7+3 (Cytarabine, Idarubicin) Azacitidine NA NA NA NA NA NA NA NA NA NA NA
aml_ohsu_2018_1046 aml_ohsu_2018_13-00146 5.20 1:DECEASED Negative Negative Good Negative COG-AAML1031 (IT/IV Cytarabine, Daunorubicin, Etoposide, Mitoxantrone +/- Bortezomib or Sorafenib) NA NA NA NA NA NA NA NA NA NA NA NA
aml_ohsu_2018_1049 aml_ohsu_2018_13-00149 8.78 1:DECEASED Negative Negative Intermediate Negative Decitabine NA NA NA NA NA NA NA NA NA NA NA NA
aml_ohsu_2018_1051 aml_ohsu_2018_13-00150 13.29 1:DECEASED NA NA Poor Negative 7+3 (Cytarabine, Idarubicin) Sunitinib HAM (Cytarabine, Mitoxantrone) FLAG-IDA (Cytarabine, Filgrastim, Fludarabine, Idarubicin) Decitabine Busulfan, Cyclophosphamide NA NA NA NA NA NA NA
write.csv(dt.wide, file = "dataset3.csv")

Data Analysis:

The first step is making the mutation data boolean expressions. We'll also do the same with survival:

dataset3[is.na(Tp_53_Mutation), TP53 := FALSE]
dataset3[!is.na(Tp_53_Mutation), TP53 := TRUE]

dataset3[is.na(Npm_1_Consensus_Call), NPM1 := FALSE]
dataset3[!is.na(Npm_1_Consensus_Call), NPM1 := TRUE]

dataset3[is.na(Dnmt_3_A_Mutation), DNMT3A := FALSE]
dataset3[!is.na(Dnmt_3_A_Mutation), DNMT3A := TRUE]

dataset3[grepl("deceased", Overall_Survival_Status, ignore.case = T), SURVIVAL := 1]
dataset3[grepl("alive", Overall_Survival_Status, ignore.case = T), SURVIVAL := 0]

Survival Analysis

From here, we'll fit Cox-Proportional Hazards models for :

  1. TP53 mutations
  2. DNMT3A mutations
  3. Interactions between TP53/DNMT3A
  4. NPM1 mutations
tbl = rbind(tidy(coxph(Surv(Overall_Survival_Months, SURVIVAL) ~ TP53, data=dataset3)), 
      tidy(coxph(Surv(Overall_Survival_Months, SURVIVAL) ~ DNMT3A, data=dataset3)),
      tidy(coxph(Surv(Overall_Survival_Months, SURVIVAL) ~ NPM1, data=dataset3)))

tbl %>% kable() %>%  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F, position= "left") 
term estimate std.error statistic p.value
TP53TRUE 0.3536207 0.1156758 3.056999 0.0022356
DNMT3ATRUE 0.2340358 0.1131688 2.068024 0.0386377
NPM1TRUE 0.9390498 0.7106134 1.321464 0.1863468

Here's the model with the interactive terms:

tidy(coxph(Surv(Overall_Survival_Months, SURVIVAL) ~ TP53*DNMT3A, data=dataset3)) %>% kable() %>%  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F, position = "left") 
term estimate std.error statistic p.value
TP53TRUE 0.7173523 0.2333776 3.0737836 0.0021136
DNMT3ATRUE 0.1973039 0.2027175 0.9732946 0.3304069
TP53TRUE:DNMT3ATRUE -0.6016312 0.3186106 -1.8882964 0.0589862

Survival Plots

We'll visualize these results as well:

plot0 = ggsurvplot(
    fit = survfit(Surv(Overall_Survival_Months, SURVIVAL) ~ factor(Risk.Classification), data = dataset3), 
    xlab = "Days", 
    ylab = "Overall survival probability (TP53)", ggtheme = theme_minimal()) 

plot1 = ggsurvplot(
    fit = survfit(Surv(Overall_Survival_Months, SURVIVAL) ~ TP53, data = dataset3), 
    xlab = "Days", 
    ylab = "Overall survival probability (TP53)", ggtheme = theme_minimal()) 

plot2 = ggsurvplot(
    fit = survfit(Surv(Overall_Survival_Months, SURVIVAL) ~ DNMT3A, data = dataset3), 
    xlab = "Days", 
    ylab = "Overall survival probability (DNMT3A)", ggtheme = theme_minimal()) 


plot3 = ggsurvplot(
    fit = survfit(Surv(Overall_Survival_Months, SURVIVAL) ~ NPM1, data = dataset3), 
    xlab = "Days", 
    ylab = "Overall survival probability (NPM1)", ggtheme = theme_minimal()) 


plot0

plot1

plot2

plot3

Conclusions

Based on our findings, we can conclude the following: