read.csv("cml.csv")->cml
st_options( dfSummary.na.col = T, dfSummary.graph.col = F, dfSummary.valid.col = F)
summarytools::dfSummary(cml)
NoVariableStats / ValuesFreqs (% of Valid)Missing
1id\
[integer]
Mean (sd) : 700 (129.5)\
min < med < max:\
498 < 685.5 < 971\
IQR (CV) : 217.5 (0.2)
258 distinct values0\
(0.0%)
2age\
[numeric]
Mean (sd) : 35.6 (11.3)\
min < med < max:\
6.5 < 35.6 < 61.8\
IQR (CV) : 15.4 (0.3)
256 distinct values0\
(0.0%)
3agecat\
[integer]
Min : 0\
Mean : 0.4\
Max : 1
0 : 167 (64.7%)\
1 : 91 (35.3%)
0\
(0.0%)
4dod\
[numeric]
Mean (sd) : 1.5 (1.6)\
min < med < max:\
0.1 < 1 < 12.2\
IQR (CV) : 1.4 (1.1)
226 distinct values0\
(0.0%)
5dodcat\
[integer]
Min : 0\
Mean : 0.5\
Max : 1
0 : 134 (51.9%)\
1 : 124 (48.1%)
0\
(0.0%)
6pgender\
[integer]
Min : 1\
Mean : 1.4\
Max : 2
1 : 162 (62.8%)\
2 : 96 (37.2%)
0\
(0.0%)
7dgender\
[integer]
Min : 1\
Mean : 1.3\
Max : 2
1 : 168 (65.1%)\
2 : 90 (34.9%)
0\
(0.0%)
8gmatch\
[integer]
Mean (sd) : 2.2 (1.2)\
min < med < max:\
1 < 2 < 4\
IQR (CV) : 2 (0.6)
1 : 106 (41.1%)\
2 : 56 (21.7%)\
3 : 34 (13.2%)\
4 : 62 (24.0%)
0\
(0.0%)
9gmism\
[integer]
Min : 0\
Mean : 0.2\
Max : 1
0 : 202 (78.3%)\
1 : 56 (21.7%)
0\
(0.0%)
10disease\
[integer]
Mean (sd) : 1.4 (0.9)\
min < med < max:\
1 < 1 < 4\
IQR (CV) : 1 (0.6)
1 : 191 (74.0%)\
2 : 42 (16.3%)\
3 : 6 ( 2.3%)\
4 : 19 ( 7.4%)
0\
(0.0%)
11donor\
[integer]
Min : 0\
Mean : 0.5\
Max : 1
0 : 128 (49.6%)\
1 : 130 (50.4%)
0\
(0.0%)
12mini\
[integer]
Min : 1\
Mean : 1.9\
Max : 2
1 : 21 ( 8.1%)\
2 : 237 (91.9%)
0\
(0.0%)
13stage\
[integer]
Mean (sd) : 0.4 (0.7)\
min < med < max:\
0 < 0 < 2\
IQR (CV) : 1 (1.8)
0 : 191 (74.0%)\
1 : 42 (16.3%)\
2 : 25 ( 9.7%)
0\
(0.0%)
14agvhd\
[integer]
Min : 0\
Mean : 0.5\
Max : 1
0 : 117 (49.8%)\
1 : 118 (50.2%)
23\
(8.9%)
15tagvhd\
[numeric]
Mean (sd) : 0.9 (0.6)\
min < med < max:\
0.1 < 0.8 < 3\
IQR (CV) : 0.5 (0.7)
46 distinct values145\
(56.2%)
16relnrm\
[integer]
Mean (sd) : 1 (0.7)\
min < med < max:\
0 < 1 < 2\
IQR (CV) : 1 (0.7)
0 : 69 (26.7%)\
1 : 125 (48.4%)\
2 : 64 (24.8%)
0\
(0.0%)
17efs\
[numeric]
Mean (sd) : 26 (34)\
min < med < max:\
0.4 < 8.3 < 134\
IQR (CV) : 32 (1.3)
219 distinct values0\
(0.0%)
18event\
[integer]
Min : 0\
Mean : 0.7\
Max : 1
0 : 69 (26.7%)\
1 : 189 (73.3%)
0\
(0.0%)
19os\
[numeric]
Mean (sd) : 49.4 (40.3)\
min < med < max:\
0.4 < 42.7 < 134\
IQR (CV) : 76.4 (0.8)
242 distinct values0\
(0.0%)
20dead\
[integer]
Min : 0\
Mean : 0.4\
Max : 1
0 : 152 (58.9%)\
1 : 106 (41.1%)
0\
(0.0%)

calculate Overall survival

os<-survfit(Surv(os, dead)~1, data=cml)

Overall survival at 1 and 5 years

# print(os)
summary(os, c(12, 60))
## Call: survfit(formula = Surv(os, dead) ~ 1, data = cml)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12    182      74    0.713  0.0282        0.660        0.770
##    60    105      27    0.602  0.0309        0.544        0.666

plot overall survival

OS<-ggsurvplot(os, data = cml, legend="none", conf.int = F, palette="red", xlab="Months", break.time.by=12)
OS

# annotate ggsurvplot using annotate function of ggplot

OS$plot + annotate("text", label="test", x=12, y=0.25)

calculate and plot event free survival curve

efs<- survfit(Surv(efs, event)~1, data=cml)

EFS<- ggsurvplot(efs,  data = cml, legend="none", conf.int = F, palette="blue", xlab="Months", break.time.by=12)
EFS

# plot both OS and EFS in one plot

plots<- list(OS=os,EFS=efs)

ggsurvplot(plots, data=cml, combine = TRUE,
           palette=c("red", "blue"),
           break.time.by=12, xlab="Months",
           legend=c(0.2,0.15),
           legend.title="",
           legend.labs=c("Overall Survival", "Event Free Survival"),
           surv.scale="percent", xlim=c(0,60))

OS according to disease stage

fit<- survfit(Surv(os, dead)~stage, data=cml)
diff<- survdiff(Surv(os, dead)~stage, data=cml)

summary(fit, c(12, 60))
## Call: survfit(formula = Surv(os, dead) ~ stage, data = cml)
## 
##                 stage=0 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12    147      43    0.775  0.0302        0.718        0.836
##    60     91      18    0.674  0.0345        0.609        0.745
## 
##                 stage=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12     26      16    0.619  0.0749        0.488        0.785
##    60     12       3    0.548  0.0768        0.416        0.721
## 
##                 stage=2 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12      9      15    0.400  0.0980       0.2475        0.646
##    60      2       6    0.133  0.0708       0.0471        0.378
diff
## Call:
## survdiff(formula = Surv(os, dead) ~ stage, data = cml)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## stage=0 191       63    82.79      4.73     21.82
## stage=1  42       22    16.18      2.10      2.49
## stage=2  25       21     7.03     27.77     30.12
## 
##  Chisq= 35  on 2 degrees of freedom, p= 2e-08
cat("P=",broom::glance(diff)$p.value)
## P= 2.455587e-08

plot OS according to stage

ggsurvplot(fit, cml, pval=T, pval.coord = c(100, 0.9), legend=c(0.5,0.9))->OS_stage
OS_stage

# using source code These functions were created by me (Iyad Sultan)

source("/cloud/project/myScripts.R")
cml %>% mutate(var=stage) %>% EFS("Stage")

## Stage
##                Characteristic       Time 1            Time 5      
##              ─────────────────────────────────────────────────────
##                Overall          97% (94%, 99%)    65% (60%, 71%)  
##                var                                                
##                0                97% (95%, 100%)   71% (65%, 77%)  
##                1                95% (89%, 100%)   57% (44%, 74%)  
##                2                92% (82%, 100%)   36% (21%, 61%)  
## 
## Column names: label, stat_1, stat_2
## P= 0.0009073654
cml %>% mutate(var=stage) %>% OS("Stage")

## Stage
##                Characteristic       Time 1            Time 5      
##              ─────────────────────────────────────────────────────
##                Overall          97% (94%, 99%)    82% (78%, 87%)  
##                var                                                
##                0                97% (95%, 100%)   85% (80%, 91%)  
##                1                95% (89%, 100%)   76% (64%, 90%)  
##                2                92% (82%, 100%)   68% (52%, 89%)  
## 
## Column names: label, stat_1, stat_2
## P= 2.455587e-08

plot cumulative risk for death

ggsurvplot(os, cml, fun="event")->a
a

# Cox proportional hazards regression according to gender mismatch, stage, acute GVHD, and duration after diagnosis

coxph(Surv(os, dead)~gmism, cml)
## Call:
## coxph(formula = Surv(os, dead) ~ gmism, data = cml)
## 
##         coef exp(coef) se(coef)     z     p
## gmism 0.1644    1.1787   0.2272 0.724 0.469
## 
## Likelihood ratio test=0.51  on 1 df, p=0.4755
## n= 258, number of events= 106
coxph(Surv(os, dead)~stage, cml)
## Call:
## coxph(formula = Surv(os, dead) ~ stage, data = cml)
## 
##         coef exp(coef) se(coef)     z        p
## stage 0.6832    1.9801   0.1253 5.451 5.01e-08
## 
## Likelihood ratio test=25.32  on 1 df, p=4.852e-07
## n= 258, number of events= 106
coxph(Surv(os, dead)~agvhd, cml)
## Call:
## coxph(formula = Surv(os, dead) ~ agvhd, data = cml)
## 
##         coef exp(coef) se(coef)     z        p
## agvhd 0.7721    2.1643   0.2281 3.385 0.000712
## 
## Likelihood ratio test=12.04  on 1 df, p=0.0005199
## n= 235, number of events= 84 
##    (23 observations deleted due to missingness)
coxph(Surv(os, dead)~dodcat, cml)
## Call:
## coxph(formula = Surv(os, dead) ~ dodcat, data = cml)
## 
##          coef exp(coef) se(coef)     z       p
## dodcat 0.6745    1.9630   0.1997 3.378 0.00073
## 
## Likelihood ratio test=11.79  on 1 df, p=0.0005958
## n= 258, number of events= 106

Mutivariable Cox

fit<- coxph(Surv(os, dead)~stage+agvhd+dodcat, cml)
fit
## Call:
## coxph(formula = Surv(os, dead) ~ stage + agvhd + dodcat, data = cml)
## 
##          coef exp(coef) se(coef)     z        p
## stage  0.7322    2.0796   0.1432 5.112 3.18e-07
## agvhd  0.5059    1.6584   0.2386 2.120  0.03401
## dodcat 0.6504    1.9163   0.2337 2.783  0.00539
## 
## Likelihood ratio test=40.8  on 3 df, p=7.213e-09
## n= 235, number of events= 84 
##    (23 observations deleted due to missingness)
fit %>% gtsummary::tbl_regression(exp=T) %>% as_hux_table() # this is for better table output

Characteristic

HR

95% CI

p-value

stage2.081.57, 2.75<0.001
agvhd1.661.04, 2.650.034
dodcat1.921.21, 3.030.005
HR = Hazard Ratio, CI = Confidence Interval

make a forest plot

ggforest(fit)

# Cox using finalfit

library(finalfit)
library(gt)
cml[, c("gmism","stage","agvhd","dodcat")]<- lapply(cml[, c("gmism","stage","agvhd","dodcat")], as.factor)
explanatory = c("gmism","stage","agvhd","dodcat")
dependent = "Surv(os, dead)"

cml %>% finalfit(dependent, explanatory) %>% gt()
Dependent: Surv(os, dead) all HR (univariable) HR (multivariable)
gmism 0 202 (78.3) - -
1 56 (21.7) 1.18 (0.76-1.84, p=0.469) 1.14 (0.70-1.87, p=0.592)
stage 0 191 (74.0) - -
1 42 (16.3) 1.80 (1.10-2.93, p=0.018) 1.54 (0.86-2.75, p=0.149)
2 25 (9.7) 4.05 (2.45-6.69, p<0.001) 4.76 (2.71-8.37, p<0.001)
agvhd 0 117 (49.8) - -
1 118 (50.2) 2.16 (1.38-3.38, p=0.001) 1.57 (0.98-2.54, p=0.062)
dodcat 0 134 (51.9) - -
1 124 (48.1) 1.96 (1.33-2.90, p=0.001) 2.02 (1.26-3.21, p=0.003)

summaryfactorlist

explanatory = c("age", "gmism","stage","agvhd","dodcat")
cml$dead<-as.factor(cml$dead)
dependent = "dead"

cml %>% summary_factorlist(dependent, explanatory, p=T) %>% gt()
label levels 0 1 p
age Mean (SD) 34.7 (10.5) 36.9 (12.2) 0.127
gmism 0 122 (80.3) 80 (75.5) 0.444
1 30 (19.7) 26 (24.5)
stage 0 128 (84.2) 63 (59.4) <0.001
1 20 (13.2) 22 (20.8)
2 4 (2.6) 21 (19.8)
agvhd 0 87 (57.6) 30 (35.7) 0.002
1 64 (42.4) 54 (64.3)
dodcat 0 93 (61.2) 41 (38.7) 0.001
1 59 (38.8) 65 (61.3)

competing risk analysis

What is the problem? compare these 2 plots. Both showing risk of disease mortality after bone marrow transplantation. This is also called NRM (non-transplant related mortality). This represents supportive care / aggressive conditioning / donor selection. Patients who die with no relapse are unlikely to have relapse (rel). Considering NRM as censoring will give different results.

This table shows the relation between event and mortality. Notice that 64 patients died with no relapse. This is a compeeting risk

cml$relnrm<- factor(cml$relnrm, levels=c(0,1,2), labels=c("censor", "relapse","death"))
table(cml$relnrm)
## 
##  censor relapse   death 
##      69     125      64

old way of calculating cumulative incidence for relapse:

cml$rel<-ifelse(cml$relnrm=="relapse",1,0)
fit=survfit(Surv(efs, rel)~1, cml)
old<- ggsurvplot(fit, cml, fun="event",ylim=c(0,1), surv.scale="percent", conf.int = F, xlim=c(0,60), break.time.by=12, legend=c(0.2,0.8))
old

# correct way: competing risk NRM is not the same as censoring

library(cmprsk)
# relnrm already converted to a factor with 0 labeled "censor"
cr<- cmprsk::cuminc(ftime=cml$efs,
               fstatus=cml$relnrm,
               cencode="censor")

cr
## Estimates and Variances:
## $est
##                  20        40        60        80       100       120
## 1 relapse 0.4203911 0.4841768 0.4895653 0.4895653 0.5001082 0.5001082
## 1 death   0.2330383 0.2410055 0.2468626 0.2468626 0.2468626 0.2468626
## 
## $var
##                     20           40           60           80          100
## 1 relapse 0.0009559261 0.0010008240 0.0010087683 0.0010087683 0.0010782725
## 1 death   0.0006982329 0.0007155143 0.0007390985 0.0007390985 0.0007390985
##                    120
## 1 relapse 0.0010782725
## 1 death   0.0007390985
new<- ggcompetingrisks(cr, ylim=c(0,1), surv.scale="percent")+
  theme(strip.background = element_blank(), strip.text = element_blank(), plot.title=element_blank())+ 
  scale_x_continuous(limits=c(0,60),breaks=seq(12,60,12))+
  scale_color_discrete("")+
  theme(legend.position = c(0.2,0.8))+
  guides(color=guide_legend(reverse = T))
new

# old and new way side by side

ggpubr::ggarrange(old$plot, new, labels=c("Old: 1-KM", "New: Competing risk"))

# Cox regression for competing risk You cannot use regular coxph function as NRM is not the same as censoring Check this website: https://www.emilyzabor.com/tutorials/survival_analysis_in_r_tutorial.html#Estimating_median_survival_time

library(tidycmprsk)
crr(Surv(efs, relnrm) ~ gmism + stage + agvhd + dodcat, data=cml)
## 23 cases omitted due to missing values
## 
## Variable   Coef     SE      HR     95% CI       p-value    
## gmism1     -0.216   0.252   0.81   0.49, 1.32   0.39       
## stage1     -0.343   0.331   0.71   0.37, 1.36   0.30       
## stage2     0.439    0.329   1.55   0.81, 2.96   0.18       
## agvhd1     -0.178   0.191   0.84   0.58, 1.22   0.35       
## dodcat1    0.105    0.195   1.11   0.76, 1.63   0.59