read.csv("cml.csv")->cml

st_options( dfSummary.na.col = T, dfSummary.graph.col = F, dfSummary.valid.col = F)
summarytools::dfSummary(cml)

No	Variable	Stats / Values	Freqs (% of Valid)	Missing
1	id\ [integer]	Mean (sd) : 700 (129.5)\ min < med < max:\ 498 < 685.5 < 971\ IQR (CV) : 217.5 (0.2)	258 distinct values	0\ (0.0%)
2	age\ [numeric]	Mean (sd) : 35.6 (11.3)\ min < med < max:\ 6.5 < 35.6 < 61.8\ IQR (CV) : 15.4 (0.3)	256 distinct values	0\ (0.0%)
3	agecat\ [integer]	Min : 0\ Mean : 0.4\ Max : 1	0 : 167 (64.7%)\ 1 : 91 (35.3%)	0\ (0.0%)
4	dod\ [numeric]	Mean (sd) : 1.5 (1.6)\ min < med < max:\ 0.1 < 1 < 12.2\ IQR (CV) : 1.4 (1.1)	226 distinct values	0\ (0.0%)
5	dodcat\ [integer]	Min : 0\ Mean : 0.5\ Max : 1	0 : 134 (51.9%)\ 1 : 124 (48.1%)	0\ (0.0%)
6	pgender\ [integer]	Min : 1\ Mean : 1.4\ Max : 2	1 : 162 (62.8%)\ 2 : 96 (37.2%)	0\ (0.0%)
7	dgender\ [integer]	Min : 1\ Mean : 1.3\ Max : 2	1 : 168 (65.1%)\ 2 : 90 (34.9%)	0\ (0.0%)
8	gmatch\ [integer]	Mean (sd) : 2.2 (1.2)\ min < med < max:\ 1 < 2 < 4\ IQR (CV) : 2 (0.6)	1 : 106 (41.1%)\ 2 : 56 (21.7%)\ 3 : 34 (13.2%)\ 4 : 62 (24.0%)	0\ (0.0%)
9	gmism\ [integer]	Min : 0\ Mean : 0.2\ Max : 1	0 : 202 (78.3%)\ 1 : 56 (21.7%)	0\ (0.0%)
10	disease\ [integer]	Mean (sd) : 1.4 (0.9)\ min < med < max:\ 1 < 1 < 4\ IQR (CV) : 1 (0.6)	1 : 191 (74.0%)\ 2 : 42 (16.3%)\ 3 : 6 ( 2.3%)\ 4 : 19 ( 7.4%)	0\ (0.0%)
11	donor\ [integer]	Min : 0\ Mean : 0.5\ Max : 1	0 : 128 (49.6%)\ 1 : 130 (50.4%)	0\ (0.0%)
12	mini\ [integer]	Min : 1\ Mean : 1.9\ Max : 2	1 : 21 ( 8.1%)\ 2 : 237 (91.9%)	0\ (0.0%)
13	stage\ [integer]	Mean (sd) : 0.4 (0.7)\ min < med < max:\ 0 < 0 < 2\ IQR (CV) : 1 (1.8)	0 : 191 (74.0%)\ 1 : 42 (16.3%)\ 2 : 25 ( 9.7%)	0\ (0.0%)
14	agvhd\ [integer]	Min : 0\ Mean : 0.5\ Max : 1	0 : 117 (49.8%)\ 1 : 118 (50.2%)	23\ (8.9%)
15	tagvhd\ [numeric]	Mean (sd) : 0.9 (0.6)\ min < med < max:\ 0.1 < 0.8 < 3\ IQR (CV) : 0.5 (0.7)	46 distinct values	145\ (56.2%)
16	relnrm\ [integer]	Mean (sd) : 1 (0.7)\ min < med < max:\ 0 < 1 < 2\ IQR (CV) : 1 (0.7)	0 : 69 (26.7%)\ 1 : 125 (48.4%)\ 2 : 64 (24.8%)	0\ (0.0%)
17	efs\ [numeric]	Mean (sd) : 26 (34)\ min < med < max:\ 0.4 < 8.3 < 134\ IQR (CV) : 32 (1.3)	219 distinct values	0\ (0.0%)
18	event\ [integer]	Min : 0\ Mean : 0.7\ Max : 1	0 : 69 (26.7%)\ 1 : 189 (73.3%)	0\ (0.0%)
19	os\ [numeric]	Mean (sd) : 49.4 (40.3)\ min < med < max:\ 0.4 < 42.7 < 134\ IQR (CV) : 76.4 (0.8)	242 distinct values	0\ (0.0%)
20	dead\ [integer]	Min : 0\ Mean : 0.4\ Max : 1	0 : 152 (58.9%)\ 1 : 106 (41.1%)	0\ (0.0%)

calculate Overall survival

os<-survfit(Surv(os, dead)~1, data=cml)

Overall survival at 1 and 5 years

# print(os)
summary(os, c(12, 60))

## Call: survfit(formula = Surv(os, dead) ~ 1, data = cml)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12    182      74    0.713  0.0282        0.660        0.770
##    60    105      27    0.602  0.0309        0.544        0.666

plot overall survival

OS<-ggsurvplot(os, data = cml, legend="none", conf.int = F, palette="red", xlab="Months", break.time.by=12)
OS

# annotate ggsurvplot using annotate function of ggplot

OS$plot + annotate("text", label="test", x=12, y=0.25)

calculate and plot event free survival curve

efs<- survfit(Surv(efs, event)~1, data=cml)

EFS<- ggsurvplot(efs,  data = cml, legend="none", conf.int = F, palette="blue", xlab="Months", break.time.by=12)
EFS

# plot both OS and EFS in one plot

plots<- list(OS=os,EFS=efs)

ggsurvplot(plots, data=cml, combine = TRUE,
           palette=c("red", "blue"),
           break.time.by=12, xlab="Months",
           legend=c(0.2,0.15),
           legend.title="",
           legend.labs=c("Overall Survival", "Event Free Survival"),
           surv.scale="percent", xlim=c(0,60))

OS according to disease stage

fit<- survfit(Surv(os, dead)~stage, data=cml)
diff<- survdiff(Surv(os, dead)~stage, data=cml)

summary(fit, c(12, 60))

## Call: survfit(formula = Surv(os, dead) ~ stage, data = cml)
## 
##                 stage=0 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12    147      43    0.775  0.0302        0.718        0.836
##    60     91      18    0.674  0.0345        0.609        0.745
## 
##                 stage=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12     26      16    0.619  0.0749        0.488        0.785
##    60     12       3    0.548  0.0768        0.416        0.721
## 
##                 stage=2 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12      9      15    0.400  0.0980       0.2475        0.646
##    60      2       6    0.133  0.0708       0.0471        0.378

diff

## Call:
## survdiff(formula = Surv(os, dead) ~ stage, data = cml)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## stage=0 191       63    82.79      4.73     21.82
## stage=1  42       22    16.18      2.10      2.49
## stage=2  25       21     7.03     27.77     30.12
## 
##  Chisq= 35  on 2 degrees of freedom, p= 2e-08

cat("P=",broom::glance(diff)$p.value)

## P= 2.455587e-08

plot OS according to stage

ggsurvplot(fit, cml, pval=T, pval.coord = c(100, 0.9), legend=c(0.5,0.9))->OS_stage
OS_stage

# using source code These functions were created by me (Iyad Sultan)

source("/cloud/project/myScripts.R")
cml %>% mutate(var=stage) %>% EFS("Stage")

## Stage
##                Characteristic       Time 1            Time 5      
##              ─────────────────────────────────────────────────────
##                Overall          97% (94%, 99%)    65% (60%, 71%)  
##                var                                                
##                0                97% (95%, 100%)   71% (65%, 77%)  
##                1                95% (89%, 100%)   57% (44%, 74%)  
##                2                92% (82%, 100%)   36% (21%, 61%)  
## 
## Column names: label, stat_1, stat_2
## P= 0.0009073654

cml %>% mutate(var=stage) %>% OS("Stage")

## Stage
##                Characteristic       Time 1            Time 5      
##              ─────────────────────────────────────────────────────
##                Overall          97% (94%, 99%)    82% (78%, 87%)  
##                var                                                
##                0                97% (95%, 100%)   85% (80%, 91%)  
##                1                95% (89%, 100%)   76% (64%, 90%)  
##                2                92% (82%, 100%)   68% (52%, 89%)  
## 
## Column names: label, stat_1, stat_2
## P= 2.455587e-08

plot cumulative risk for death

ggsurvplot(os, cml, fun="event")->a
a

# Cox proportional hazards regression according to gender mismatch, stage, acute GVHD, and duration after diagnosis

coxph(Surv(os, dead)~gmism, cml)

## Call:
## coxph(formula = Surv(os, dead) ~ gmism, data = cml)
## 
##         coef exp(coef) se(coef)     z     p
## gmism 0.1644    1.1787   0.2272 0.724 0.469
## 
## Likelihood ratio test=0.51  on 1 df, p=0.4755
## n= 258, number of events= 106

coxph(Surv(os, dead)~stage, cml)

## Call:
## coxph(formula = Surv(os, dead) ~ stage, data = cml)
## 
##         coef exp(coef) se(coef)     z        p
## stage 0.6832    1.9801   0.1253 5.451 5.01e-08
## 
## Likelihood ratio test=25.32  on 1 df, p=4.852e-07
## n= 258, number of events= 106

coxph(Surv(os, dead)~agvhd, cml)

## Call:
## coxph(formula = Surv(os, dead) ~ agvhd, data = cml)
## 
##         coef exp(coef) se(coef)     z        p
## agvhd 0.7721    2.1643   0.2281 3.385 0.000712
## 
## Likelihood ratio test=12.04  on 1 df, p=0.0005199
## n= 235, number of events= 84 
##    (23 observations deleted due to missingness)

coxph(Surv(os, dead)~dodcat, cml)

## Call:
## coxph(formula = Surv(os, dead) ~ dodcat, data = cml)
## 
##          coef exp(coef) se(coef)     z       p
## dodcat 0.6745    1.9630   0.1997 3.378 0.00073
## 
## Likelihood ratio test=11.79  on 1 df, p=0.0005958
## n= 258, number of events= 106

Mutivariable Cox

fit<- coxph(Surv(os, dead)~stage+agvhd+dodcat, cml)
fit

## Call:
## coxph(formula = Surv(os, dead) ~ stage + agvhd + dodcat, data = cml)
## 
##          coef exp(coef) se(coef)     z        p
## stage  0.7322    2.0796   0.1432 5.112 3.18e-07
## agvhd  0.5059    1.6584   0.2386 2.120  0.03401
## dodcat 0.6504    1.9163   0.2337 2.783  0.00539
## 
## Likelihood ratio test=40.8  on 3 df, p=7.213e-09
## n= 235, number of events= 84 
##    (23 observations deleted due to missingness)

fit %>% gtsummary::tbl_regression(exp=T) %>% as_hux_table() # this is for better table output

Characteristic	HR	95% CI	p-value
stage	2.08	1.57, 2.75	<0.001
agvhd	1.66	1.04, 2.65	0.034
dodcat	1.92	1.21, 3.03	0.005
HR = Hazard Ratio, CI = Confidence Interval

make a forest plot

ggforest(fit)

# Cox using finalfit

library(finalfit)
library(gt)
cml[, c("gmism","stage","agvhd","dodcat")]<- lapply(cml[, c("gmism","stage","agvhd","dodcat")], as.factor)
explanatory = c("gmism","stage","agvhd","dodcat")
dependent = "Surv(os, dead)"

cml %>% finalfit(dependent, explanatory) %>% gt()

Dependent: Surv(os, dead)		all	HR (univariable)	HR (multivariable)
gmism	0	202 (78.3)	-	-
	1	56 (21.7)	1.18 (0.76-1.84, p=0.469)	1.14 (0.70-1.87, p=0.592)
stage	0	191 (74.0)	-	-
	1	42 (16.3)	1.80 (1.10-2.93, p=0.018)	1.54 (0.86-2.75, p=0.149)
	2	25 (9.7)	4.05 (2.45-6.69, p<0.001)	4.76 (2.71-8.37, p<0.001)
agvhd	0	117 (49.8)	-	-
	1	118 (50.2)	2.16 (1.38-3.38, p=0.001)	1.57 (0.98-2.54, p=0.062)
dodcat	0	134 (51.9)	-	-
	1	124 (48.1)	1.96 (1.33-2.90, p=0.001)	2.02 (1.26-3.21, p=0.003)

summaryfactorlist

explanatory = c("age", "gmism","stage","agvhd","dodcat")
cml$dead<-as.factor(cml$dead)
dependent = "dead"

cml %>% summary_factorlist(dependent, explanatory, p=T) %>% gt()

label	levels	0	1	p
age	Mean (SD)	34.7 (10.5)	36.9 (12.2)	0.127
gmism	0	122 (80.3)	80 (75.5)	0.444
	1	30 (19.7)	26 (24.5)
stage	0	128 (84.2)	63 (59.4)	<0.001
	1	20 (13.2)	22 (20.8)
	2	4 (2.6)	21 (19.8)
agvhd	0	87 (57.6)	30 (35.7)	0.002
	1	64 (42.4)	54 (64.3)
dodcat	0	93 (61.2)	41 (38.7)	0.001
	1	59 (38.8)	65 (61.3)

competing risk analysis

What is the problem? compare these 2 plots. Both showing risk of disease mortality after bone marrow transplantation. This is also called NRM (non-transplant related mortality). This represents supportive care / aggressive conditioning / donor selection. Patients who die with no relapse are unlikely to have relapse (rel). Considering NRM as censoring will give different results.

This table shows the relation between event and mortality. Notice that 64 patients died with no relapse. This is a compeeting risk

cml$relnrm<- factor(cml$relnrm, levels=c(0,1,2), labels=c("censor", "relapse","death"))
table(cml$relnrm)

## 
##  censor relapse   death 
##      69     125      64

old way of calculating cumulative incidence for relapse:

cml$rel<-ifelse(cml$relnrm=="relapse",1,0)
fit=survfit(Surv(efs, rel)~1, cml)
old<- ggsurvplot(fit, cml, fun="event",ylim=c(0,1), surv.scale="percent", conf.int = F, xlim=c(0,60), break.time.by=12, legend=c(0.2,0.8))
old

# correct way: competing risk NRM is not the same as censoring

library(cmprsk)
# relnrm already converted to a factor with 0 labeled "censor"
cr<- cmprsk::cuminc(ftime=cml$efs,
               fstatus=cml$relnrm,
               cencode="censor")

cr

## Estimates and Variances:
## $est
##                  20        40        60        80       100       120
## 1 relapse 0.4203911 0.4841768 0.4895653 0.4895653 0.5001082 0.5001082
## 1 death   0.2330383 0.2410055 0.2468626 0.2468626 0.2468626 0.2468626
## 
## $var
##                     20           40           60           80          100
## 1 relapse 0.0009559261 0.0010008240 0.0010087683 0.0010087683 0.0010782725
## 1 death   0.0006982329 0.0007155143 0.0007390985 0.0007390985 0.0007390985
##                    120
## 1 relapse 0.0010782725
## 1 death   0.0007390985

new<- ggcompetingrisks(cr, ylim=c(0,1), surv.scale="percent")+
  theme(strip.background = element_blank(), strip.text = element_blank(), plot.title=element_blank())+ 
  scale_x_continuous(limits=c(0,60),breaks=seq(12,60,12))+
  scale_color_discrete("")+
  theme(legend.position = c(0.2,0.8))+
  guides(color=guide_legend(reverse = T))
new

# old and new way side by side

ggpubr::ggarrange(old$plot, new, labels=c("Old: 1-KM", "New: Competing risk"))

# Cox regression for competing risk You cannot use regular coxph function as NRM is not the same as censoring Check this website: https://www.emilyzabor.com/tutorials/survival_analysis_in_r_tutorial.html#Estimating_median_survival_time

library(tidycmprsk)
crr(Surv(efs, relnrm) ~ gmism + stage + agvhd + dodcat, data=cml)

## 23 cases omitted due to missing values

## 
## Variable   Coef     SE      HR     95% CI       p-value    
## gmism1     -0.216   0.252   0.81   0.49, 1.32   0.39       
## stage1     -0.343   0.331   0.71   0.37, 1.36   0.30       
## stage2     0.439    0.329   1.55   0.81, 2.96   0.18       
## agvhd1     -0.178   0.191   0.84   0.58, 1.22   0.35       
## dodcat1    0.105    0.195   1.11   0.76, 1.63   0.59

survival by Iyad Sultan

2022-10-21