read.csv("cml.csv")->cml
st_options( dfSummary.na.col = T, dfSummary.graph.col = F, dfSummary.valid.col = F)
summarytools::dfSummary(cml)
| No | Variable | Stats / Values | Freqs (% of Valid) | Missing |
|---|---|---|---|---|
| 1 | id\ [integer] | Mean (sd) : 700 (129.5)\ min < med < max:\ 498 < 685.5 < 971\ IQR (CV) : 217.5 (0.2) | 258 distinct values | 0\ (0.0%) |
| 2 | age\ [numeric] | Mean (sd) : 35.6 (11.3)\ min < med < max:\ 6.5 < 35.6 < 61.8\ IQR (CV) : 15.4 (0.3) | 256 distinct values | 0\ (0.0%) |
| 3 | agecat\ [integer] | Min : 0\ Mean : 0.4\ Max : 1 | 0 : 167 (64.7%)\ 1 : 91 (35.3%) | 0\ (0.0%) |
| 4 | dod\ [numeric] | Mean (sd) : 1.5 (1.6)\ min < med < max:\ 0.1 < 1 < 12.2\ IQR (CV) : 1.4 (1.1) | 226 distinct values | 0\ (0.0%) |
| 5 | dodcat\ [integer] | Min : 0\ Mean : 0.5\ Max : 1 | 0 : 134 (51.9%)\ 1 : 124 (48.1%) | 0\ (0.0%) |
| 6 | pgender\ [integer] | Min : 1\ Mean : 1.4\ Max : 2 | 1 : 162 (62.8%)\ 2 : 96 (37.2%) | 0\ (0.0%) |
| 7 | dgender\ [integer] | Min : 1\ Mean : 1.3\ Max : 2 | 1 : 168 (65.1%)\ 2 : 90 (34.9%) | 0\ (0.0%) |
| 8 | gmatch\ [integer] | Mean (sd) : 2.2 (1.2)\ min < med < max:\ 1 < 2 < 4\ IQR (CV) : 2 (0.6) | 1 : 106 (41.1%)\ 2 : 56 (21.7%)\ 3 : 34 (13.2%)\ 4 : 62 (24.0%) | 0\ (0.0%) |
| 9 | gmism\ [integer] | Min : 0\ Mean : 0.2\ Max : 1 | 0 : 202 (78.3%)\ 1 : 56 (21.7%) | 0\ (0.0%) |
| 10 | disease\ [integer] | Mean (sd) : 1.4 (0.9)\ min < med < max:\ 1 < 1 < 4\ IQR (CV) : 1 (0.6) | 1 : 191 (74.0%)\ 2 : 42 (16.3%)\ 3 : 6 ( 2.3%)\ 4 : 19 ( 7.4%) | 0\ (0.0%) |
| 11 | donor\ [integer] | Min : 0\ Mean : 0.5\ Max : 1 | 0 : 128 (49.6%)\ 1 : 130 (50.4%) | 0\ (0.0%) |
| 12 | mini\ [integer] | Min : 1\ Mean : 1.9\ Max : 2 | 1 : 21 ( 8.1%)\ 2 : 237 (91.9%) | 0\ (0.0%) |
| 13 | stage\ [integer] | Mean (sd) : 0.4 (0.7)\ min < med < max:\ 0 < 0 < 2\ IQR (CV) : 1 (1.8) | 0 : 191 (74.0%)\ 1 : 42 (16.3%)\ 2 : 25 ( 9.7%) | 0\ (0.0%) |
| 14 | agvhd\ [integer] | Min : 0\ Mean : 0.5\ Max : 1 | 0 : 117 (49.8%)\ 1 : 118 (50.2%) | 23\ (8.9%) |
| 15 | tagvhd\ [numeric] | Mean (sd) : 0.9 (0.6)\ min < med < max:\ 0.1 < 0.8 < 3\ IQR (CV) : 0.5 (0.7) | 46 distinct values | 145\ (56.2%) |
| 16 | relnrm\ [integer] | Mean (sd) : 1 (0.7)\ min < med < max:\ 0 < 1 < 2\ IQR (CV) : 1 (0.7) | 0 : 69 (26.7%)\ 1 : 125 (48.4%)\ 2 : 64 (24.8%) | 0\ (0.0%) |
| 17 | efs\ [numeric] | Mean (sd) : 26 (34)\ min < med < max:\ 0.4 < 8.3 < 134\ IQR (CV) : 32 (1.3) | 219 distinct values | 0\ (0.0%) |
| 18 | event\ [integer] | Min : 0\ Mean : 0.7\ Max : 1 | 0 : 69 (26.7%)\ 1 : 189 (73.3%) | 0\ (0.0%) |
| 19 | os\ [numeric] | Mean (sd) : 49.4 (40.3)\ min < med < max:\ 0.4 < 42.7 < 134\ IQR (CV) : 76.4 (0.8) | 242 distinct values | 0\ (0.0%) |
| 20 | dead\ [integer] | Min : 0\ Mean : 0.4\ Max : 1 | 0 : 152 (58.9%)\ 1 : 106 (41.1%) | 0\ (0.0%) |
os<-survfit(Surv(os, dead)~1, data=cml)
# print(os)
summary(os, c(12, 60))
## Call: survfit(formula = Surv(os, dead) ~ 1, data = cml)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 12 182 74 0.713 0.0282 0.660 0.770
## 60 105 27 0.602 0.0309 0.544 0.666
OS<-ggsurvplot(os, data = cml, legend="none", conf.int = F, palette="red", xlab="Months", break.time.by=12)
OS
# annotate ggsurvplot using annotate function of ggplot
OS$plot + annotate("text", label="test", x=12, y=0.25)
efs<- survfit(Surv(efs, event)~1, data=cml)
EFS<- ggsurvplot(efs, data = cml, legend="none", conf.int = F, palette="blue", xlab="Months", break.time.by=12)
EFS
# plot both OS and EFS in one plot
plots<- list(OS=os,EFS=efs)
ggsurvplot(plots, data=cml, combine = TRUE,
palette=c("red", "blue"),
break.time.by=12, xlab="Months",
legend=c(0.2,0.15),
legend.title="",
legend.labs=c("Overall Survival", "Event Free Survival"),
surv.scale="percent", xlim=c(0,60))
fit<- survfit(Surv(os, dead)~stage, data=cml)
diff<- survdiff(Surv(os, dead)~stage, data=cml)
summary(fit, c(12, 60))
## Call: survfit(formula = Surv(os, dead) ~ stage, data = cml)
##
## stage=0
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 12 147 43 0.775 0.0302 0.718 0.836
## 60 91 18 0.674 0.0345 0.609 0.745
##
## stage=1
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 12 26 16 0.619 0.0749 0.488 0.785
## 60 12 3 0.548 0.0768 0.416 0.721
##
## stage=2
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 12 9 15 0.400 0.0980 0.2475 0.646
## 60 2 6 0.133 0.0708 0.0471 0.378
diff
## Call:
## survdiff(formula = Surv(os, dead) ~ stage, data = cml)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## stage=0 191 63 82.79 4.73 21.82
## stage=1 42 22 16.18 2.10 2.49
## stage=2 25 21 7.03 27.77 30.12
##
## Chisq= 35 on 2 degrees of freedom, p= 2e-08
cat("P=",broom::glance(diff)$p.value)
## P= 2.455587e-08
ggsurvplot(fit, cml, pval=T, pval.coord = c(100, 0.9), legend=c(0.5,0.9))->OS_stage
OS_stage
# using source code These functions were created by me (Iyad Sultan)
source("/cloud/project/myScripts.R")
cml %>% mutate(var=stage) %>% EFS("Stage")
## Stage
## Characteristic Time 1 Time 5
## ─────────────────────────────────────────────────────
## Overall 97% (94%, 99%) 65% (60%, 71%)
## var
## 0 97% (95%, 100%) 71% (65%, 77%)
## 1 95% (89%, 100%) 57% (44%, 74%)
## 2 92% (82%, 100%) 36% (21%, 61%)
##
## Column names: label, stat_1, stat_2
## P= 0.0009073654
cml %>% mutate(var=stage) %>% OS("Stage")
## Stage
## Characteristic Time 1 Time 5
## ─────────────────────────────────────────────────────
## Overall 97% (94%, 99%) 82% (78%, 87%)
## var
## 0 97% (95%, 100%) 85% (80%, 91%)
## 1 95% (89%, 100%) 76% (64%, 90%)
## 2 92% (82%, 100%) 68% (52%, 89%)
##
## Column names: label, stat_1, stat_2
## P= 2.455587e-08
ggsurvplot(os, cml, fun="event")->a
a
# Cox proportional hazards regression according to gender mismatch,
stage, acute GVHD, and duration after diagnosis
coxph(Surv(os, dead)~gmism, cml)
## Call:
## coxph(formula = Surv(os, dead) ~ gmism, data = cml)
##
## coef exp(coef) se(coef) z p
## gmism 0.1644 1.1787 0.2272 0.724 0.469
##
## Likelihood ratio test=0.51 on 1 df, p=0.4755
## n= 258, number of events= 106
coxph(Surv(os, dead)~stage, cml)
## Call:
## coxph(formula = Surv(os, dead) ~ stage, data = cml)
##
## coef exp(coef) se(coef) z p
## stage 0.6832 1.9801 0.1253 5.451 5.01e-08
##
## Likelihood ratio test=25.32 on 1 df, p=4.852e-07
## n= 258, number of events= 106
coxph(Surv(os, dead)~agvhd, cml)
## Call:
## coxph(formula = Surv(os, dead) ~ agvhd, data = cml)
##
## coef exp(coef) se(coef) z p
## agvhd 0.7721 2.1643 0.2281 3.385 0.000712
##
## Likelihood ratio test=12.04 on 1 df, p=0.0005199
## n= 235, number of events= 84
## (23 observations deleted due to missingness)
coxph(Surv(os, dead)~dodcat, cml)
## Call:
## coxph(formula = Surv(os, dead) ~ dodcat, data = cml)
##
## coef exp(coef) se(coef) z p
## dodcat 0.6745 1.9630 0.1997 3.378 0.00073
##
## Likelihood ratio test=11.79 on 1 df, p=0.0005958
## n= 258, number of events= 106
fit<- coxph(Surv(os, dead)~stage+agvhd+dodcat, cml)
fit
## Call:
## coxph(formula = Surv(os, dead) ~ stage + agvhd + dodcat, data = cml)
##
## coef exp(coef) se(coef) z p
## stage 0.7322 2.0796 0.1432 5.112 3.18e-07
## agvhd 0.5059 1.6584 0.2386 2.120 0.03401
## dodcat 0.6504 1.9163 0.2337 2.783 0.00539
##
## Likelihood ratio test=40.8 on 3 df, p=7.213e-09
## n= 235, number of events= 84
## (23 observations deleted due to missingness)
fit %>% gtsummary::tbl_regression(exp=T) %>% as_hux_table() # this is for better table output
Characteristic | HR | 95% CI | p-value |
|---|---|---|---|
| stage | 2.08 | 1.57, 2.75 | <0.001 |
| agvhd | 1.66 | 1.04, 2.65 | 0.034 |
| dodcat | 1.92 | 1.21, 3.03 | 0.005 |
| HR = Hazard Ratio, CI = Confidence Interval | |||
ggforest(fit)
# Cox using finalfit
library(finalfit)
library(gt)
cml[, c("gmism","stage","agvhd","dodcat")]<- lapply(cml[, c("gmism","stage","agvhd","dodcat")], as.factor)
explanatory = c("gmism","stage","agvhd","dodcat")
dependent = "Surv(os, dead)"
cml %>% finalfit(dependent, explanatory) %>% gt()
| Dependent: Surv(os, dead) | all | HR (univariable) | HR (multivariable) | |
|---|---|---|---|---|
| gmism | 0 | 202 (78.3) | - | - |
| 1 | 56 (21.7) | 1.18 (0.76-1.84, p=0.469) | 1.14 (0.70-1.87, p=0.592) | |
| stage | 0 | 191 (74.0) | - | - |
| 1 | 42 (16.3) | 1.80 (1.10-2.93, p=0.018) | 1.54 (0.86-2.75, p=0.149) | |
| 2 | 25 (9.7) | 4.05 (2.45-6.69, p<0.001) | 4.76 (2.71-8.37, p<0.001) | |
| agvhd | 0 | 117 (49.8) | - | - |
| 1 | 118 (50.2) | 2.16 (1.38-3.38, p=0.001) | 1.57 (0.98-2.54, p=0.062) | |
| dodcat | 0 | 134 (51.9) | - | - |
| 1 | 124 (48.1) | 1.96 (1.33-2.90, p=0.001) | 2.02 (1.26-3.21, p=0.003) |
explanatory = c("age", "gmism","stage","agvhd","dodcat")
cml$dead<-as.factor(cml$dead)
dependent = "dead"
cml %>% summary_factorlist(dependent, explanatory, p=T) %>% gt()
| label | levels | 0 | 1 | p |
|---|---|---|---|---|
| age | Mean (SD) | 34.7 (10.5) | 36.9 (12.2) | 0.127 |
| gmism | 0 | 122 (80.3) | 80 (75.5) | 0.444 |
| 1 | 30 (19.7) | 26 (24.5) | ||
| stage | 0 | 128 (84.2) | 63 (59.4) | <0.001 |
| 1 | 20 (13.2) | 22 (20.8) | ||
| 2 | 4 (2.6) | 21 (19.8) | ||
| agvhd | 0 | 87 (57.6) | 30 (35.7) | 0.002 |
| 1 | 64 (42.4) | 54 (64.3) | ||
| dodcat | 0 | 93 (61.2) | 41 (38.7) | 0.001 |
| 1 | 59 (38.8) | 65 (61.3) |
What is the problem? compare these 2 plots. Both showing risk of disease mortality after bone marrow transplantation. This is also called NRM (non-transplant related mortality). This represents supportive care / aggressive conditioning / donor selection. Patients who die with no relapse are unlikely to have relapse (rel). Considering NRM as censoring will give different results.
This table shows the relation between event and mortality. Notice that 64 patients died with no relapse. This is a compeeting risk
cml$relnrm<- factor(cml$relnrm, levels=c(0,1,2), labels=c("censor", "relapse","death"))
table(cml$relnrm)
##
## censor relapse death
## 69 125 64
cml$rel<-ifelse(cml$relnrm=="relapse",1,0)
fit=survfit(Surv(efs, rel)~1, cml)
old<- ggsurvplot(fit, cml, fun="event",ylim=c(0,1), surv.scale="percent", conf.int = F, xlim=c(0,60), break.time.by=12, legend=c(0.2,0.8))
old
# correct way: competing risk NRM is not the same as censoring
library(cmprsk)
# relnrm already converted to a factor with 0 labeled "censor"
cr<- cmprsk::cuminc(ftime=cml$efs,
fstatus=cml$relnrm,
cencode="censor")
cr
## Estimates and Variances:
## $est
## 20 40 60 80 100 120
## 1 relapse 0.4203911 0.4841768 0.4895653 0.4895653 0.5001082 0.5001082
## 1 death 0.2330383 0.2410055 0.2468626 0.2468626 0.2468626 0.2468626
##
## $var
## 20 40 60 80 100
## 1 relapse 0.0009559261 0.0010008240 0.0010087683 0.0010087683 0.0010782725
## 1 death 0.0006982329 0.0007155143 0.0007390985 0.0007390985 0.0007390985
## 120
## 1 relapse 0.0010782725
## 1 death 0.0007390985
new<- ggcompetingrisks(cr, ylim=c(0,1), surv.scale="percent")+
theme(strip.background = element_blank(), strip.text = element_blank(), plot.title=element_blank())+
scale_x_continuous(limits=c(0,60),breaks=seq(12,60,12))+
scale_color_discrete("")+
theme(legend.position = c(0.2,0.8))+
guides(color=guide_legend(reverse = T))
new
# old and new way side by side
ggpubr::ggarrange(old$plot, new, labels=c("Old: 1-KM", "New: Competing risk"))
# Cox regression for competing risk You cannot use regular coxph
function as NRM is not the same as censoring Check this website: https://www.emilyzabor.com/tutorials/survival_analysis_in_r_tutorial.html#Estimating_median_survival_time
library(tidycmprsk)
crr(Surv(efs, relnrm) ~ gmism + stage + agvhd + dodcat, data=cml)
## 23 cases omitted due to missing values
##
## Variable Coef SE HR 95% CI p-value
## gmism1 -0.216 0.252 0.81 0.49, 1.32 0.39
## stage1 -0.343 0.331 0.71 0.37, 1.36 0.30
## stage2 0.439 0.329 1.55 0.81, 2.96 0.18
## agvhd1 -0.178 0.191 0.84 0.58, 1.22 0.35
## dodcat1 0.105 0.195 1.11 0.76, 1.63 0.59