Survival Analysis

Step 1: Set working directory

setwd("D:/DrPH/SEM1/Multivariable Analysis/Survival Analysis/Survival Analysis/Training with Prof Kim")

Step 2: Load library

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(haven)
library(survival)

Step 3: Prepare data

Load data

mydata <-  read_dta("stroke_outcome.dta")

Explore data

view(mydata)
str(mydata)

## tibble [262 × 14] (S3: tbl_df/tbl/data.frame)
##  $ id     : chr [1:262] "B040512" "A076101" "A170536" "B459090" ...
##   ..- attr(*, "label")= chr "registration number"
##   ..- attr(*, "format.stata")= chr "%8s"
##  $ sex    : dbl+lbl [1:262] 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, ...
##    ..@ label       : chr "sex"
##    ..@ format.stata: chr "%6.0f"
##    ..@ labels      : Named num [1:2] 1 2
##    .. ..- attr(*, "names")= chr [1:2] "male" "female"
##  $ race   : dbl+lbl [1:262] 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
##    ..@ label       : chr "race"
##    ..@ format.stata: chr "%12.0f"
##    ..@ labels      : Named num [1:6] 1 2 3 4 8 9
##    .. ..- attr(*, "names")= chr [1:6] "malay" "chinese" "indians" "others" ...
##  $ doa    : Date[1:262], format: "2012-05-23" "2011-10-05" ...
##  $ dod    : Date[1:262], format: "2012-05-29" "2011-10-10" ...
##  $ gcs    : num [1:262] 14 11 12 13 15 8 15 15 11 11 ...
##   ..- attr(*, "label")= chr "earliest Glasgow Coma Scale"
##   ..- attr(*, "format.stata")= chr "%2.0f"
##  $ sbp    : num [1:262] 108 126 167 172 160 173 120 143 169 173 ...
##   ..- attr(*, "label")= chr "earliest systolic BP (mmHg)"
##   ..- attr(*, "format.stata")= chr "%3.0f"
##  $ dbp    : num [1:262] 99 70 71 100 88 95 82 76 92 123 ...
##   ..- attr(*, "label")= chr "earliest diastolic BP (mmHg)"
##   ..- attr(*, "format.stata")= chr "%3.0f"
##  $ age    : num [1:262] 80 74 73 54 70 66 25 56 55 29 ...
##   ..- attr(*, "label")= chr "age in years"
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ event  : dbl+lbl [1:262] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
##    ..@ label       : chr "status @discharge 1=dead, 0=alive"
##    ..@ format.stata: chr "%9.0g"
##    ..@ labels      : Named num [1:2] 0 1
##    .. ..- attr(*, "names")= chr [1:2] "censored" "dead"
##  $ dm2    : dbl+lbl [1:262] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
##    ..@ label       : chr "diabetes?"
##    ..@ format.stata: chr "%9.0g"
##    ..@ labels      : Named num [1:2] 0 1
##    .. ..- attr(*, "names")= chr [1:2] "no" "yes"
##  $ hpt2   : dbl+lbl [1:262] 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, ...
##    ..@ label       : chr "hypertension?"
##    ..@ format.stata: chr "%9.0g"
##    ..@ labels      : Named num [1:2] 0 1
##    .. ..- attr(*, "names")= chr [1:2] "no" "yes"
##  $ race2  : dbl+lbl [1:262] 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
##    ..@ label       : chr "malay vs non-malay"
##    ..@ format.stata: chr "%9.0g"
##    ..@ labels      : Named num [1:2] 0 1
##    .. ..- attr(*, "names")= chr [1:2] "no" "yes"
##  $ event_s: num [1:262] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  - attr(*, "label")= chr "Data file created by EpiData based on DEADALIVEHUSM2.REC"

change labelled data to factors

mydata <- mydata %>% 
  mutate(across(where(is.labelled), as_factor))
summary(mydata)

##       id                sex                race          doa            
##  Length:262         male  :115   malay       :245   Min.   :2011-01-01  
##  Class :character   female:147   chinese     :  0   1st Qu.:2011-05-08  
##  Mode  :character                indians     :  0   Median :2011-10-16  
##                                  others      : 17   Mean   :2011-10-07  
##                                  not recorded:  0   3rd Qu.:2012-03-04  
##                                  missing     :  0   Max.   :2012-06-23  
##       dod                  gcs             sbp             dbp        
##  Min.   :2011-01-05   Min.   : 3.00   Min.   : 97.0   Min.   : 42.00  
##  1st Qu.:2011-05-16   1st Qu.:10.00   1st Qu.:143.0   1st Qu.: 75.00  
##  Median :2011-10-21   Median :14.00   Median :160.0   Median : 90.00  
##  Mean   :2011-10-14   Mean   :12.02   Mean   :163.7   Mean   : 90.82  
##  3rd Qu.:2012-03-14   3rd Qu.:15.00   3rd Qu.:186.8   3rd Qu.:105.00  
##  Max.   :2012-06-27   Max.   :15.00   Max.   :290.0   Max.   :160.00  
##       age             event      dm2       hpt2     race2        event_s      
##  Min.   :20.00   censored:199   no :161   no : 91   no : 17   Min.   :0.0000  
##  1st Qu.:52.00   dead    : 63   yes:101   yes:171   yes:245   1st Qu.:0.0000  
##  Median :61.00                                                Median :0.0000  
##  Mean   :60.75                                                Mean   :0.2405  
##  3rd Qu.:69.00                                                3rd Qu.:0.0000  
##  Max.   :96.00                                                Max.   :1.0000

Summary using gtsummary

library(dplyr)
library(gtsummary)

mydata %>%
  select(-c(id, doa, dod)) %>%
  tbl_summary(
    by = event,
    statistic = list(
      all_categorical() ~ "{n} ({p}%)",
      all_continuous() ~ "{mean} ({sd})"
    ),
    digits = all_continuous() ~ 2
  )

Characteristic	censored N = 199¹	dead N = 63¹
sex
male	94 (47%)	21 (33%)
female	105 (53%)	42 (67%)
race
malay	187 (94%)	58 (92%)
chinese	0 (0%)	0 (0%)
indians	0 (0%)	0 (0%)
others	12 (6.0%)	5 (7.9%)
not recorded	0 (0%)	0 (0%)
missing	0 (0%)	0 (0%)
earliest Glasgow Coma Scale	13.39 (2.70)	7.68 (3.64)
earliest systolic BP (mmHg)	160.83 (31.08)	172.59 (36.34)
earliest diastolic BP (mmHg)	88.15 (19.11)	99.27 (25.75)
age in years	60.60 (13.91)	61.21 (14.71)
diabetes?	62 (31%)	39 (62%)
hypertension?	130 (65%)	41 (65%)
malay vs non-malay	187 (94%)	58 (92%)
event_s	0 (0%)	63 (100%)
¹ n (%); Mean (SD)

Dealing with date

Load library

library(lubridate)

Dealing with date

mydata <- mydata %>% mutate(dur = mydata$doa %--% mydata$dod) %>%
  mutate(dur = as.duration(dur))

# convert to days
mydata <- mydata %>% mutate(dur_days = dur /ddays(1))

Step 4: Kaplan Meier estimates

KM1 <- survfit(Surv(time = dur_days, event = event == 'dead') ~ 1,
               type = 'kaplan-meier', data = mydata)
summary(KM1)

## Call: survfit(formula = Surv(time = dur_days, event = event == "dead") ~ 
##     1, data = mydata, type = "kaplan-meier")
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0    262       4    0.985 0.00758       0.9700        1.000
##     1    256      13    0.935 0.01531       0.9052        0.965
##     2    224       4    0.918 0.01716       0.8850        0.952
##     3    193       7    0.885 0.02064       0.8452        0.926
##     4    147       4    0.861 0.02333       0.8161        0.908
##     5    105       5    0.820 0.02852       0.7656        0.878
##     6     79       4    0.778 0.03379       0.7147        0.847
##     7     68       4    0.732 0.03879       0.6602        0.813
##     9     51       3    0.689 0.04376       0.6087        0.781
##    10     44       2    0.658 0.04705       0.5719        0.757
##    12     38       4    0.589 0.05334       0.4929        0.703
##    14     29       2    0.548 0.05687       0.4473        0.672
##    18     23       1    0.524 0.05918       0.4202        0.654
##    22     18       1    0.495 0.06265       0.3864        0.635
##    25     12       2    0.413 0.07459       0.2895        0.588
##    28      7       1    0.354 0.08406       0.2220        0.564
##    29      6       1    0.295 0.08833       0.1638        0.530
##    41      3       1    0.196 0.09951       0.0728        0.530

Step 5: Plot Kaplan-Meier curves

library(survminer)

## Warning: package 'survminer' was built under R version 4.5.2

## Loading required package: ggpubr

## 
## Attaching package: 'survminer'

## The following object is masked from 'package:survival':
## 
##     myeloma

General survival probability

ggsurvplot(KM1, data = mydata, risk.table = TRUE)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Ignoring unknown labels:
## • colour : "Strata"

By sex

KM1.g <- survfit(Surv(time = dur_days, event= event == 'dead') ~ sex,
                 type = "kaplan-meier", data = mydata)
summary(KM1.g)

## Call: survfit(formula = Surv(time = dur_days, event = event == "dead") ~ 
##     sex, data = mydata, type = "kaplan-meier")
## 
##                 sex=male 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    113       6    0.947  0.0211        0.906        0.989
##     3     84       5    0.891  0.0315        0.831        0.954
##     5     38       1    0.867  0.0384        0.795        0.946
##     6     32       2    0.813  0.0517        0.718        0.921
##     7     26       1    0.782  0.0584        0.675        0.905
##     9     18       2    0.695  0.0778        0.558        0.865
##    10     15       1    0.648  0.0853        0.501        0.839
##    12     11       1    0.590  0.0957        0.429        0.810
##    14      8       1    0.516  0.1085        0.342        0.779
##    22      4       1    0.387  0.1382        0.192        0.779
## 
##                 sex=female 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0    147       4    0.973  0.0134       0.9468        0.999
##     1    143       7    0.925  0.0217       0.8836        0.969
##     2    126       4    0.896  0.0255       0.8472        0.947
##     3    109       2    0.879  0.0276       0.8270        0.935
##     4     88       4    0.839  0.0328       0.7776        0.906
##     5     67       4    0.789  0.0392       0.7160        0.870
##     6     47       2    0.756  0.0442       0.6739        0.847
##     7     42       3    0.702  0.0508       0.6088        0.809
##     9     33       1    0.680  0.0536       0.5832        0.794
##    10     29       1    0.657  0.0566       0.5549        0.778
##    12     27       3    0.584  0.0641       0.4709        0.724
##    14     21       1    0.556  0.0668       0.4395        0.704
##    18     17       1    0.523  0.0705       0.4021        0.681
##    25     10       2    0.419  0.0870       0.2788        0.629
##    28      5       1    0.335  0.1022       0.1842        0.609
##    29      4       1    0.251  0.1055       0.1103        0.572
##    41      2       1    0.126  0.1033       0.0251        0.630

# Plot KM based on group (sex)
ggsurvplot(KM1.g, data = mydata, risk.table = TRUE,
           linetype = c(3,6), pval = TRUE)

## Ignoring unknown labels:
## • colour : "Strata"

By diabetes

KM1.dm <- survfit(Surv(time = dur_days, event == 'dead') ~ dm2, 
                     type = "kaplan-meier", data = mydata)
summary(KM1.dm)

## Call: survfit(formula = Surv(time = dur_days, event == "dead") ~ dm2, 
##     data = mydata, type = "kaplan-meier")
## 
##                 dm2=no 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0    161       1    0.994 0.00619       0.9817        1.000
##     1    158       5    0.962 0.01508       0.9332        0.992
##     2    137       2    0.948 0.01784       0.9140        0.984
##     3    117       1    0.940 0.01944       0.9028        0.979
##     4     90       2    0.919 0.02397       0.8735        0.967
##     5     62       3    0.875 0.03388       0.8109        0.944
##     6     47       2    0.838 0.04142       0.7602        0.923
##     7     40       2    0.796 0.04880       0.7056        0.897
##     9     30       1    0.769 0.05390       0.6705        0.882
##    12     24       1    0.737 0.06044       0.6277        0.866
##    18     18       1    0.696 0.06959       0.5723        0.847
##    25      8       1    0.609 0.10165       0.4392        0.845
##    29      5       1    0.487 0.13597       0.2820        0.842
##    41      2       1    0.244 0.18522       0.0549        1.000
## 
##                 dm2=yes 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0    101       3   0.9703  0.0169       0.9377        1.000
##     1     98       8   0.8911  0.0310       0.8324        0.954
##     2     87       2   0.8706  0.0335       0.8074        0.939
##     3     76       6   0.8019  0.0410       0.7255        0.886
##     4     57       2   0.7737  0.0441       0.6920        0.865
##     5     43       2   0.7377  0.0488       0.6480        0.840
##     6     32       2   0.6916  0.0556       0.5908        0.810
##     7     28       2   0.6422  0.0616       0.5321        0.775
##     9     21       2   0.5811  0.0693       0.4600        0.734
##    10     17       2   0.5127  0.0762       0.3832        0.686
##    12     14       3   0.4028  0.0821       0.2702        0.601
##    14      8       2   0.3021  0.0872       0.1717        0.532
##    22      5       1   0.2417  0.0882       0.1182        0.494
##    25      4       1   0.1813  0.0844       0.0728        0.451
##    28      2       1   0.0906  0.0767       0.0172        0.476

ggsurvplot(KM1.dm, data = mydata, risk.table = TRUE,
           linetype = c(1,2), pval = TRUE)

## Ignoring unknown labels:
## • colour : "Strata"

Estimate the survival function

quantile(KM1, probs = c(0.25, 0.50, 0.75))

## $quantile
## 25 50 75 
##  7 22 41 
## 
## $lower
## 25 50 75 
##  6 12 28 
## 
## $upper
## 25 50 75 
## 12 NA NA

quantile(KM1.dm, probs = c(0.25, 0.50, 0.75))

## $quantile
##         25 50 75
## dm2=no  12 29 41
## dm2=yes  5 12 22
## 
## $lower
##         25 50 75
## dm2=no   7 25 41
## dm2=yes  3  9 14
## 
## $upper
##         25 50 75
## dm2=no  NA NA NA
## dm2=yes  9 22 NA

By hypertension

KM1.hpt <- survfit(Surv(time = dur_days, event == 'dead') ~ hpt2, 
                     type = "kaplan-meier", data = mydata)
summary(KM1.hpt)

## Call: survfit(formula = Surv(time = dur_days, event == "dead") ~ hpt2, 
##     data = mydata, type = "kaplan-meier")
## 
##                 hpt2=no 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0     91       1    0.989  0.0109        0.968        1.000
##     1     89       4    0.945  0.0241        0.898        0.993
##     2     74       2    0.919  0.0294        0.863        0.979
##     3     60       3    0.873  0.0381        0.802        0.951
##     4     43       1    0.853  0.0423        0.774        0.940
##     5     29       3    0.765  0.0613        0.653        0.895
##     6     24       2    0.701  0.0709        0.575        0.854
##     7     22       2    0.637  0.0774        0.502        0.808
##     9     14       1    0.592  0.0842        0.448        0.782
##    10     11       1    0.538  0.0921        0.384        0.752
##    12      9       1    0.478  0.0994        0.318        0.719
##    41      1       1    0.000     NaN           NA           NA
## 
##                 hpt2=yes 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0    171       3    0.982  0.0100       0.9630        1.000
##     1    167       9    0.930  0.0196       0.8918        0.969
##     2    150       2    0.917  0.0212       0.8764        0.960
##     3    133       4    0.890  0.0247       0.8425        0.939
##     4    104       3    0.864  0.0281       0.8106        0.921
##     5     76       2    0.841  0.0316       0.7815        0.905
##     6     55       2    0.811  0.0371       0.7410        0.887
##     7     46       2    0.775  0.0431       0.6954        0.864
##     9     37       2    0.733  0.0499       0.6418        0.838
##    10     33       1    0.711  0.0531       0.6144        0.823
##    12     29       3    0.638  0.0623       0.5264        0.772
##    14     21       2    0.577  0.0696       0.4554        0.731
##    18     16       1    0.541  0.0740       0.4136        0.707
##    22     13       1    0.499  0.0792       0.3659        0.681
##    25      8       2    0.374  0.0968       0.2256        0.621
##    28      4       1    0.281  0.1088       0.1314        0.600
##    29      3       1    0.187  0.1054       0.0621        0.564

ggsurvplot(KM1.hpt, data = mydata, risk.table = TRUE, 
           linetype = c(1,2), pval = TRUE)

## Ignoring unknown labels:
## • colour : "Strata"

Step 5: Cox proportional hazard (PH) regression

Model with no covariate

cox.null <- coxph(Surv(time = dur_days, event = event == 'dead') ~ 1,
                 data = mydata)
summary(cox.null)

## Call:  coxph(formula = Surv(time = dur_days, event = event == "dead") ~ 
##     1, data = mydata)
## 
## Null model
##   log likelihood= -285.7719 
##   n= 262

Model with covariate gcs

cox.gcs <- coxph(Surv(time = dur_days, event = event == 'dead') ~ gcs,
                 data = mydata)
summary(cox.gcs)

## Call:
## coxph(formula = Surv(time = dur_days, event = event == "dead") ~ 
##     gcs, data = mydata)
## 
##   n= 262, number of events= 63 
## 
##         coef exp(coef) se(coef)      z Pr(>|z|)    
## gcs -0.22984   0.79466  0.03209 -7.162 7.98e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## gcs    0.7947      1.258    0.7462    0.8463
## 
## Concordance= 0.806  (se = 0.03 )
## Likelihood ratio test= 54.7  on 1 df,   p=1e-13
## Wald test            = 51.29  on 1 df,   p=8e-13
## Score (logrank) test = 62.6  on 1 df,   p=3e-15

Step 6: Cox PH regression with interaction term

Main effect model

cox.gcs.age.noia <- coxph(Surv(time = dur_days, event = event == 'dead') ~ gcs +
                       age + dm2 + hpt2, data = mydata)
cox.gcs.age.noia

## Call:
## coxph(formula = Surv(time = dur_days, event = event == "dead") ~ 
##     gcs + age + dm2 + hpt2, data = mydata)
## 
##             coef exp(coef) se(coef)      z        p
## gcs     -0.21804   0.80409  0.03192 -6.831 8.42e-12
## age      0.02197   1.02221  0.01059  2.074 0.038098
## dm2yes   1.07524   2.93070  0.28115  3.824 0.000131
## hpt2yes -0.62919   0.53302  0.28958 -2.173 0.029799
## 
## Likelihood ratio test=74.35  on 4 df, p=2.736e-15
## n= 262, number of events= 63

Model with interaction

cox.gcs.age.ia <- coxph(Surv(time = dur_days, event = event == 'dead') ~ gcs +
                       age + dm2 + hpt2 + gcs:age, data = mydata)
cox.gcs.age.ia

## Call:
## coxph(formula = Surv(time = dur_days, event = event == "dead") ~ 
##     gcs + age + dm2 + hpt2 + gcs:age, data = mydata)
## 
##              coef exp(coef)  se(coef)      z        p
## gcs     -0.440143  0.643944  0.165466 -2.660  0.00781
## age     -0.007756  0.992274  0.023901 -0.325  0.74554
## dm2yes   1.111505  3.038928  0.285363  3.895 9.82e-05
## hpt2yes -0.651665  0.521177  0.293077 -2.224  0.02618
## gcs:age  0.003608  1.003614  0.002614  1.380  0.16756
## 
## Likelihood ratio test=76.2  on 5 df, p=5.225e-15
## n= 262, number of events= 63

Step 7: Model comparison

anova(cox.gcs.age.ia, cox.gcs.age.noia, test = 'Chisq')

## Analysis of Deviance Table
##  Cox model: response is  Surv(time = dur_days, event = event == "dead")
##  Model 1: ~ gcs + age + dm2 + hpt2 + gcs:age
##  Model 2: ~ gcs + age + dm2 + hpt2
##    loglik  Chisq Df Pr(>|Chi|)
## 1 -247.67                     
## 2 -248.60 1.8511  1     0.1737

Step 8 : Estimate the survival probability

mydata %>% group_by(event) %>% 
  summarize(min.dur = min(dur_days), max.dur = max(dur_days))

## # A tibble: 2 × 3
##   event    min.dur max.dur
##   <fct>      <dbl>   <dbl>
## 1 censored       0      61
## 2 dead           0      41

summary(KM1, times = c(5, 10, 20, 40, 60))

## Call: survfit(formula = Surv(time = dur_days, event = event == "dead") ~ 
##     1, data = mydata, type = "kaplan-meier")
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     5    105      37    0.820  0.0285       0.7656        0.878
##    10     44      13    0.658  0.0470       0.5719        0.757
##    20     20       7    0.524  0.0592       0.4202        0.654
##    40      3       5    0.295  0.0883       0.1638        0.530
##    60      1       1    0.196  0.0995       0.0728        0.530

summary(KM1.dm, times = c(5, 10, 20, 40, 60))

## Call: survfit(formula = Surv(time = dur_days, event == "dead") ~ dm2, 
##     data = mydata, type = "kaplan-meier")
## 
##                 dm2=no 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     5     62      14    0.875  0.0339       0.8109        0.944
##    10     27       5    0.769  0.0539       0.6705        0.882
##    20     15       2    0.696  0.0696       0.5723        0.847
##    40      2       2    0.487  0.1360       0.2820        0.842
##    60      1       1    0.244  0.1852       0.0549        1.000
## 
##                 dm2=yes 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     5     43      23   0.7377  0.0488       0.6480        0.840
##    10     17       8   0.5127  0.0762       0.3832        0.686
##    20      5       5   0.3021  0.0872       0.1717        0.532
##    40      1       3   0.0906  0.0767       0.0172        0.476

#Step 9: Comparing the survival estimates between levels of a group (categorical) variable

Compare the difference in the observed and expected survivors.

logrank.sex <- survdiff(Surv(time = dur_days, event == 'dead') ~ sex, 
                        data = mydata, rho = 0)
peto.sex <- survdiff(Surv(time = dur_days, event == 'dead') ~ sex, 
                     data = mydata, rho = 1)
logrank.dm <- survdiff(Surv(time = dur_days, event == 'dead') ~ dm2, 
                        data = mydata, rho = 0)
peto.dm <- survdiff(Surv(time = dur_days, event == 'dead') ~ dm2, 
                     data = mydata, rho = 1)

(SEX) Comparing result

Log rank

logrank.sex

## Call:
## survdiff(formula = Surv(time = dur_days, event == "dead") ~ sex, 
##     data = mydata, rho = 0)
## 
##              N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=male   115       21     24.1     0.393     0.673
## sex=female 147       42     38.9     0.243     0.673
## 
##  Chisq= 0.7  on 1 degrees of freedom, p= 0.4

Peto

peto.sex

## Call:
## survdiff(formula = Surv(time = dur_days, event == "dead") ~ sex, 
##     data = mydata, rho = 1)
## 
##              N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=male   115     17.7     20.4     0.363     0.728
## sex=female 147     34.0     31.3     0.237     0.728
## 
##  Chisq= 0.7  on 1 degrees of freedom, p= 0.4

(DM) Comparing result

log rank

logrank.dm

## Call:
## survdiff(formula = Surv(time = dur_days, event == "dead") ~ dm2, 
##     data = mydata, rho = 0)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## dm2=no  161       24     39.3      5.97      16.6
## dm2=yes 101       39     23.7      9.92      16.6
## 
##  Chisq= 16.6  on 1 degrees of freedom, p= 4e-05

peto

peto.dm

## Call:
## survdiff(formula = Surv(time = dur_days, event == "dead") ~ dm2, 
##     data = mydata, rho = 1)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## dm2=no  161     19.3     31.9      4.97      15.7
## dm2=yes 101     32.4     19.8      8.03      15.7
## 
##  Chisq= 15.7  on 1 degrees of freedom, p= 7e-05

Step 10: Comparing the survival estimates for numerical variables

summary(mydata$sbp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    97.0   143.0   160.0   163.7   186.8   290.0

c.bmi will be categorized to:

min to 140 (≤140)
141 to 160 (141-160)
161 and above (>160)

mydata <- mydata %>% mutate(sbp.c = cut(sbp, c(0, 140, 160, 300), 
                                          labels = c('min-140', '141-160', 'above 160')))
mydata %>% count(sbp.c)

## # A tibble: 3 × 2
##   sbp.c         n
##   <fct>     <int>
## 1 min-140      60
## 2 141-160      73
## 3 above 160   129

Estimate the survival estimates for categories of SBP

KM1.sbp <- survfit(Surv(time = dur_days, event == 'dead') ~ sbp.c, 
                     type = "kaplan-meier", data = mydata)
summary(KM1.sbp)

## Call: survfit(formula = Surv(time = dur_days, event == "dead") ~ sbp.c, 
##     data = mydata, type = "kaplan-meier")
## 
##                 sbp.c=min-140 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0     60       1    0.983  0.0165       0.9515        1.000
##     1     59       2    0.950  0.0281       0.8964        1.000
##     2     51       2    0.913  0.0374       0.8423        0.989
##     3     45       2    0.872  0.0454       0.7876        0.966
##     5     20       2    0.785  0.0714       0.6568        0.938
##     6     15       1    0.733  0.0836       0.5858        0.916
##     7     13       1    0.676  0.0943       0.5146        0.889
##    22      4       1    0.507  0.1626       0.2706        0.951
##    25      2       1    0.254  0.1969       0.0554        1.000
## 
##                 sbp.c=141-160 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     72       1    0.986  0.0138        0.959        1.000
##     4     45       1    0.964  0.0255        0.915        1.000
##     5     35       1    0.937  0.0368        0.867        1.000
##     7     26       2    0.865  0.0596        0.755        0.990
##     9     18       1    0.817  0.0731        0.685        0.973
##    12     13       1    0.754  0.0905        0.596        0.954
##    25      6       1    0.628  0.1373        0.409        0.964
##    29      4       1    0.471  0.1706        0.232        0.958
## 
##                 sbp.c=above 160 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0    129       3    0.977  0.0133       0.9511        1.000
##     1    125      10    0.899  0.0267       0.8478        0.952
##     2    108       2    0.882  0.0286       0.8276        0.940
##     3     92       5    0.834  0.0342       0.7697        0.904
##     4     73       3    0.800  0.0381       0.7285        0.878
##     5     50       2    0.768  0.0427       0.6884        0.856
##     6     36       3    0.704  0.0528       0.6076        0.815
##     7     29       1    0.680  0.0563       0.5777        0.799
##     9     24       2    0.623  0.0643       0.5089        0.762
##    10     20       2    0.561  0.0714       0.4368        0.719
##    12     17       3    0.462  0.0784       0.3310        0.644
##    14     13       2    0.391  0.0808       0.2604        0.586
##    18      9       1    0.347  0.0827       0.2178        0.554
##    28      2       1    0.174  0.1295       0.0402        0.749
##    41      1       1    0.000     NaN           NA           NA

Perform log-rank test to SBP

logrank.sbp <- survdiff(Surv(time = dur_days, event = event == 'dead') ~ sbp.c,
                        data = mydata, rho = 0)
logrank.sbp

## Call:
## survdiff(formula = Surv(time = dur_days, event = event == "dead") ~ 
##     sbp.c, data = mydata, rho = 0)
## 
##                   N Observed Expected (O-E)^2/E (O-E)^2/V
## sbp.c=min-140    60       13     13.4    0.0146    0.0194
## sbp.c=141-160    73        9     20.9    6.7710   10.7619
## sbp.c=above 160 129       41     28.7    5.3107   10.2538
## 
##  Chisq= 12.8  on 2 degrees of freedom, p= 0.002

Plot KM for SBP

ggsurvplot(KM1.sbp, data = mydata, risk.table = TRUE, 
           linetype = c(1, 2, 3), pval = TRUE)

## Ignoring unknown labels:
## • colour : "Strata"

Testing the assumption of proportional hazard

prop.h <- cox.zph(cox.gcs.age.noia, 
                  transform = 'km', 
                  global = TRUE)
prop.h

##         chisq df    p
## gcs    0.8709  1 0.35
## age    0.2357  1 0.63
## dm2    0.0132  1 0.91
## hpt2   0.7597  1 0.38
## GLOBAL 1.7688  4 0.78

Estimate the survival probability

Step: Model-cheking

Linearity in hazard assumption

ggcoxfunctional(Surv(time = dur_days, event = event == 'dead') ~  gcs + age +
                  sbp, data = mydata)

## Warning: arguments formula is deprecated; will be removed in the next version;
## please use fit instead.

Proportional hazard assumption

km method

prop.h <- cox.zph(cox.gcs.age.noia, transform = 'km', global = TRUE)
prop.h

##         chisq df    p
## gcs    0.8709  1 0.35
## age    0.2357  1 0.63
## dm2    0.0132  1 0.91
## hpt2   0.7597  1 0.38
## GLOBAL 1.7688  4 0.78

par(mfrow = c(2, 2))
plot(prop.h)

The rank method

prop.h.r <- cox.zph(cox.gcs.age.noia, transform = 'rank')
prop.h.r

##          chisq df     p
## gcs    3.45639  1 0.063
## age    0.00887  1 0.925
## dm2    0.00977  1 0.921
## hpt2   2.62690  1 0.105
## GLOBAL 6.08773  4 0.193

par(mfrow = c(2, 2))
plot(prop.h.r)

(km) visualization of proportional hazards diagnostics

ggcoxzph(prop.h)

(rank) visualization of proportional hazards diagnostics

ggcoxzph(prop.h.r)

Step: Model prediction

summary(cox.gcs.age.noia)

## Call:
## coxph(formula = Surv(time = dur_days, event = event == "dead") ~ 
##     gcs + age + dm2 + hpt2, data = mydata)
## 
##   n= 262, number of events= 63 
## 
##             coef exp(coef) se(coef)      z Pr(>|z|)    
## gcs     -0.21804   0.80409  0.03192 -6.831 8.42e-12 ***
## age      0.02197   1.02221  0.01059  2.074 0.038098 *  
## dm2yes   1.07524   2.93070  0.28115  3.824 0.000131 ***
## hpt2yes -0.62919   0.53302  0.28958 -2.173 0.029799 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## gcs        0.8041     1.2436    0.7553    0.8560
## age        1.0222     0.9783    1.0012    1.0437
## dm2yes     2.9307     0.3412    1.6891    5.0850
## hpt2yes    0.5330     1.8761    0.3022    0.9402
## 
## Concordance= 0.841  (se = 0.026 )
## Likelihood ratio test= 74.35  on 4 df,   p=3e-15
## Wald test            = 72.92  on 4 df,   p=5e-15
## Score (logrank) test = 87.5  on 4 df,   p=<2e-16

Can be written tidier as:

with log hazard
with hazard ratio

library(broom)

cox.gcs.age.noia.h <- tidy(cox.gcs.age.noia, conf.int = TRUE)
cox.gcs.age.noia.hr <- tidy(cox.gcs.age.noia, exponentiate = TRUE, conf.int = TRUE)

bind_cols(cox.gcs.age.noia.h, cox.gcs.age.noia.hr)

## New names:
## • `term` -> `term...1`
## • `estimate` -> `estimate...2`
## • `std.error` -> `std.error...3`
## • `statistic` -> `statistic...4`
## • `p.value` -> `p.value...5`
## • `conf.low` -> `conf.low...6`
## • `conf.high` -> `conf.high...7`
## • `term` -> `term...8`
## • `estimate` -> `estimate...9`
## • `std.error` -> `std.error...10`
## • `statistic` -> `statistic...11`
## • `p.value` -> `p.value...12`
## • `conf.low` -> `conf.low...13`
## • `conf.high` -> `conf.high...14`

## # A tibble: 4 × 14
##   term...1 estimate...2 std.error...3 statistic...4 p.value...5 conf.low...6
##   <chr>           <dbl>         <dbl>         <dbl>       <dbl>        <dbl>
## 1 gcs           -0.218         0.0319         -6.83    8.42e-12     -0.281  
## 2 age            0.0220        0.0106          2.07    3.81e- 2      0.00121
## 3 dm2yes         1.08          0.281           3.82    1.31e- 4      0.524  
## 4 hpt2yes       -0.629         0.290          -2.17    2.98e- 2     -1.20   
## # ℹ 8 more variables: conf.high...7 <dbl>, term...8 <chr>, estimate...9 <dbl>,
## #   std.error...10 <dbl>, statistic...11 <dbl>, p.value...12 <dbl>,
## #   conf.low...13 <dbl>, conf.high...14 <dbl>

Create new data

new_data <- expand.grid(gcs = c(5, 10, 12),
                        age = c(40, 50, 70),
                        dm2 = c('no', 'yes'),
                        hpt2 = c('no', 'yes'))
new_data

##    gcs age dm2 hpt2
## 1    5  40  no   no
## 2   10  40  no   no
## 3   12  40  no   no
## 4    5  50  no   no
## 5   10  50  no   no
## 6   12  50  no   no
## 7    5  70  no   no
## 8   10  70  no   no
## 9   12  70  no   no
## 10   5  40 yes   no
## 11  10  40 yes   no
## 12  12  40 yes   no
## 13   5  50 yes   no
## 14  10  50 yes   no
## 15  12  50 yes   no
## 16   5  70 yes   no
## 17  10  70 yes   no
## 18  12  70 yes   no
## 19   5  40  no  yes
## 20  10  40  no  yes
## 21  12  40  no  yes
## 22   5  50  no  yes
## 23  10  50  no  yes
## 24  12  50  no  yes
## 25   5  70  no  yes
## 26  10  70  no  yes
## 27  12  70  no  yes
## 28   5  40 yes  yes
## 29  10  40 yes  yes
## 30  12  40 yes  yes
## 31   5  50 yes  yes
## 32  10  50 yes  yes
## 33  12  50 yes  yes
## 34   5  70 yes  yes
## 35  10  70 yes  yes
## 36  12  70 yes  yes

Linear predictor

mean(mydata$gcs)

## [1] 12.01908

mean(mydata$age)

## [1] 60.74809

# mean for dm2
101/262

## [1] 0.3854962

# mean for hpt2 
summary(mydata$hpt2)

##  no yes 
##  91 171

171/262

## [1] 0.6526718

# gcs * dm
mydata <- mydata %>% mutate(dm.num = as.integer(dm2), gcs.dm = gcs*dm.num)
mean(mydata$gcs.dm)

## [1] 16.5229

use main effect models cox.mv

# and I choose type = 'lp'
predict(cox.gcs.age.noia, newdata = new_data, type = 'lp')

##            1            2            3            4            5            6 
##  1.074603348 -0.015601603 -0.451683583  1.294307733  0.204102782 -0.231979198 
##            7            8            9           10           11           12 
##  1.733716503  0.643511552  0.207429572  2.149845119  1.059640168  0.623558188 
##           13           14           15           16           17           18 
##  2.369549504  1.279344554  0.843262573  2.808958275  1.718753324  1.282671343 
##           19           20           21           22           23           24 
##  0.445414638 -0.644790313 -1.080872293  0.665119023 -0.425085928 -0.861167908 
##           25           26           27           28           29           30 
##  1.104527793  0.014322843 -0.421759138  1.520656409  0.430451458 -0.005630522 
##           31           32           33           34           35           36 
##  1.740360794  0.650155844  0.214073863  2.179769565  1.089564614  0.653482633

#1st obs = 1.0707
augment(cox.gcs.age.noia, newdata = new_data)

## # A tibble: 36 × 6
##      gcs   age dm2   hpt2  .fitted .se.fit
##    <dbl> <dbl> <fct> <fct>   <dbl>   <dbl>
##  1     5    40 no    no     1.07    0.298 
##  2    10    40 no    no    -0.0156  0.223 
##  3    12    40 no    no    -0.452   0.220 
##  4     5    50 no    no     1.29    0.241 
##  5    10    50 no    no     0.204   0.125 
##  6    12    50 no    no    -0.232   0.114 
##  7     5    70 no    no     1.73    0.253 
##  8    10    70 no    no     0.644   0.123 
##  9    12    70 no    no     0.207   0.0981
## 10     5    40 yes   no     2.15    0.378 
## # ℹ 26 more rows

model with interaction, you may either use the predict() or the augment() function

# and I choose type = 'lp'
predict(cox.gcs.age.ia, newdata = new_data, type = 'lp')

##           1           2           3           4           5           6 
##  1.32963375 -0.14954482 -0.74121624  1.43245597  0.13366205 -0.38585551 
##           7           8           9          10          11          12 
##  1.63810040  0.70007580  0.32486596  2.44113848  0.96195991  0.37028849 
##          13          14          15          16          17          18 
##  2.54396070  1.24516679  0.72564922  2.74960513  1.81158053  1.43637069 
##          19          20          21          22          23          24 
##  0.67796897 -0.80120960 -1.39288103  0.78079119 -0.51800273 -1.03752029 
##          25          26          27          28          29          30 
##  0.98643562  0.04841102 -0.32679882  1.78947370  0.31029513 -0.28137629 
##          31          32          33          34          35          36 
##  1.89229592  0.59350201  0.07398444  2.09794035  1.15991575  0.78470591

augment(cox.gcs.age.ia, newdata = new_data)

## # A tibble: 36 × 6
##      gcs   age dm2   hpt2  .fitted .se.fit
##    <dbl> <dbl> <fct> <fct>   <dbl>   <dbl>
##  1     5    40 no    no      1.33    0.336
##  2    10    40 no    no     -0.150   0.232
##  3    12    40 no    no     -0.741   0.303
##  4     5    50 no    no      1.43    0.254
##  5    10    50 no    no      0.134   0.126
##  6    12    50 no    no     -0.386   0.159
##  7     5    70 no    no      1.64    0.277
##  8    10    70 no    no      0.700   0.137
##  9    12    70 no    no      0.325   0.130
## 10     5    40 yes   no      2.44    0.423
## # ℹ 26 more rows

predict(cox.gcs.age.noia, newdata = new_data, type = 'risk')

##          1          2          3          4          5          6          7 
##  2.9288309  0.9845195  0.6365556  3.6484694  1.2264242  0.7929626  5.6616564 
##          8          9         10         11         12         13         14 
##  1.9031522  1.2305111  8.5835289  2.8853326  1.8655542 10.6925742  3.5942831 
##         15         16         17         18         19         20         21 
##  2.3239366 16.5926243  5.5775707  3.6062604  1.5611374  0.5247726  0.3392994 
##         22         23         24         25         26         27         28 
##  1.9447220  0.6537136  0.4226682  3.0177991  1.0144259  0.6558920  4.5752274 
##         29         30         31         32         33         34         35 
##  1.5379517  0.9943853  5.6993994  1.9158394  1.2387141  8.8442680  2.9729794 
##         36 
##  1.9222236

For model with interaction

predict(cox.gcs.age.ia, newdata = new_data, type = 'risk')

##          1          2          3          4          5          6          7 
##  3.7796588  0.8610998  0.4765340  4.1889745  1.1430065  0.6798688  5.1453861 
##          8          9         10         11         12         13         14 
##  2.0139054  1.3838451 11.4861100  2.6168202  1.4481523 12.7299909  3.4735141 
##         15         16         17         18         19         20         21 
##  2.0660720 15.6364564  6.1201128  4.2054054  1.9698728  0.4487858  0.2483587 
##         22         23         24         25         26         27         28 
##  2.1831989  0.5957092  0.3543322  2.6816590  1.0496020  0.7212288  5.9863010 
##         29         30         31         32         33         34         35 
##  1.3638276  0.7547443  6.6345837  1.8103171  1.0767901  8.1493678  3.1896645 
##         36 
##  2.1917623

new_data2 <- expand.grid(event = 'dead', dur_days = c(5, 20, 50))
new_data2

##   event dur_days
## 1  dead        5
## 2  dead       20
## 3  dead       50

new_data3 <- data.frame(new_data, new_data2)
head(new_data3)

##   gcs age dm2 hpt2 event dur_days
## 1   5  40  no   no  dead        5
## 2  10  40  no   no  dead       20
## 3  12  40  no   no  dead       50
## 4   5  50  no   no  dead        5
## 5  10  50  no   no  dead       20
## 6  12  50  no   no  dead       50

pred.exp <- predict(cox.gcs.age.noia, newdata = new_data3, type = 'expected')
pred.exp

##  [1] 0.3255810 0.3224291 0.5623862 0.4055789 0.4016526 0.7005693 0.6293731
##  [8] 0.6232803 1.0871360 0.9541806 0.9449433 1.6481861 1.1886308 1.1771239
## [15] 2.0531593 1.8445047 1.8266485 3.1860710 0.1735425 0.1718625 0.2997654
## [22] 0.2161833 0.2140905 0.3734203 0.3354710 0.3322234 0.5794697 0.5086012
## [29] 0.5036775 0.8785228 0.6335688 0.6274354 1.0943833 0.9831654 0.9736476
## [36] 1.6982525

cbind(new_data3, pred.exp)

##    gcs age dm2 hpt2 event dur_days  pred.exp
## 1    5  40  no   no  dead        5 0.3255810
## 2   10  40  no   no  dead       20 0.3224291
## 3   12  40  no   no  dead       50 0.5623862
## 4    5  50  no   no  dead        5 0.4055789
## 5   10  50  no   no  dead       20 0.4016526
## 6   12  50  no   no  dead       50 0.7005693
## 7    5  70  no   no  dead        5 0.6293731
## 8   10  70  no   no  dead       20 0.6232803
## 9   12  70  no   no  dead       50 1.0871360
## 10   5  40 yes   no  dead        5 0.9541806
## 11  10  40 yes   no  dead       20 0.9449433
## 12  12  40 yes   no  dead       50 1.6481861
## 13   5  50 yes   no  dead        5 1.1886308
## 14  10  50 yes   no  dead       20 1.1771239
## 15  12  50 yes   no  dead       50 2.0531593
## 16   5  70 yes   no  dead        5 1.8445047
## 17  10  70 yes   no  dead       20 1.8266485
## 18  12  70 yes   no  dead       50 3.1860710
## 19   5  40  no  yes  dead        5 0.1735425
## 20  10  40  no  yes  dead       20 0.1718625
## 21  12  40  no  yes  dead       50 0.2997654
## 22   5  50  no  yes  dead        5 0.2161833
## 23  10  50  no  yes  dead       20 0.2140905
## 24  12  50  no  yes  dead       50 0.3734203
## 25   5  70  no  yes  dead        5 0.3354710
## 26  10  70  no  yes  dead       20 0.3322234
## 27  12  70  no  yes  dead       50 0.5794697
## 28   5  40 yes  yes  dead        5 0.5086012
## 29  10  40 yes  yes  dead       20 0.5036775
## 30  12  40 yes  yes  dead       50 0.8785228
## 31   5  50 yes  yes  dead        5 0.6335688
## 32  10  50 yes  yes  dead       20 0.6274354
## 33  12  50 yes  yes  dead       50 1.0943833
## 34   5  70 yes  yes  dead        5 0.9831654
## 35  10  70 yes  yes  dead       20 0.9736476
## 36  12  70 yes  yes  dead       50 1.6982525