Cox regression in R

References

Load data

## Load survival package
library(survival)
## List datasets in survival package
data(package = "survival")

## Load lung data
data(lung)

## Show first 6 rows
head(lung)
  inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
1    3  306      2  74   1       1       90       100     1175      NA
2    3  455      2  68   1       0       90        90     1225      15
3    3 1010      1  56   1       0       90        90       NA      15
4    5  210      2  57   1       1       90        60     1150      11
5    1  883      2  60   1       0      100        90       NA       0
6   12 1022      1  74   1       1       50        80      513       0

Coding (from help file)

NCCTG Lung Cancer Data

Description:
     Survival in patients with advanced lung cancer from the North
     Central Cancer Treatment Group.  Performance scores rate how well
     the patient can perform usual daily activities.

inst:       Institution code
time:       Survival time in days
status:     censoring status 1=censored, 2=dead
age:        Age in years
sex:        Male=1 Female=2
ph.ecog:    ECOG performance score (0=good 5=dead)
ph.karno:   Karnofsky performance score (bad=0-good=100) rated by physician
pat.karno:  Karnofsky performance score as rated by patient
meal.cal:   Calories consumed at meals
wt.loss:    Weight loss in last six months

Create a survival object

## Add survival object. status == 2 is death
lung$SurvObj <- with(lung, Surv(time, status == 2))

## Check data
head(lung)
  inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss SurvObj
1    3  306      2  74   1       1       90       100     1175      NA    306 
2    3  455      2  68   1       0       90        90     1225      15    455 
3    3 1010      1  56   1       0       90        90       NA      15   1010+
4    5  210      2  57   1       1       90        60     1150      11    210 
5    1  883      2  60   1       0      100        90       NA       0    883 
6   12 1022      1  74   1       1       50        80      513       0   1022+

Kaplan-Meier estimator without grouping

## Kaplan-Meier estimator. The "log-log" confidence interval is preferred.
km.as.one <- survfit(SurvObj ~ 1, data = lung, conf.type = "log-log")
km.by.sex <- survfit(SurvObj ~ sex, data = lung, conf.type = "log-log")

## Show object
km.as.one
Call: survfit(formula = SurvObj ~ 1, data = lung, conf.type = "log-log")

records   n.max n.start  events  median 0.95LCL 0.95UCL 
    228     228     228     165     310     284     361 
km.by.sex
Call: survfit(formula = SurvObj ~ sex, data = lung, conf.type = "log-log")

      records n.max n.start events median 0.95LCL 0.95UCL
sex=1     138   138     138    112    270     210     306
sex=2      90    90      90     53    426     345     524

## See survival estimates at given time (lots of outputs)
## summary(km.as.one)
## summary(km.by.sex)

## Plotting without any specification
plot(km.as.one)

plot of chunk unnamed-chunk-4

plot(km.by.sex)

plot of chunk unnamed-chunk-4


## Without
plot(km.as.one, conf = F, mark.time = F)

plot of chunk unnamed-chunk-4

plot(km.by.sex, conf = F, mark.time = F)

plot of chunk unnamed-chunk-4

Cox regression using coxph

## Fit Cox regression: age, sex, Karnofsky performance score, wt loss
res.cox1 <- coxph(SurvObj ~ age + sex + ph.karno + wt.loss, data =  lung)
res.cox1
Call:
coxph(formula = SurvObj ~ age + sex + ph.karno + wt.loss, data = lung)

             coef exp(coef) se(coef)      z      p
age       0.01514     1.015  0.00984  1.539 0.1200
sex      -0.51396     0.598  0.17441 -2.947 0.0032
ph.karno -0.01287     0.987  0.00618 -2.081 0.0370
wt.loss  -0.00225     0.998  0.00636 -0.353 0.7200

Likelihood ratio test=18.8  on 4 df, p=0.000844  n= 214, number of events= 152 
   (14 observations deleted due to missingness)

## Check for violation of proportional hazard (constant HR over time)
(res.zph1 <- cox.zph(res.cox1))
              rho   chisq       p
age      -0.00837  0.0117 0.91381
sex       0.13137  2.5579 0.10975
ph.karno  0.23963  8.2624 0.00405
wt.loss   0.05930  0.5563 0.45575
GLOBAL         NA 12.0669 0.01686

## Displays a graph of the scaled Schoenfeld residuals, along with a smooth curve.
plot(res.zph1)

plot of chunk unnamed-chunk-5 plot of chunk unnamed-chunk-5 plot of chunk unnamed-chunk-5 plot of chunk unnamed-chunk-5


## residuals(res.cox1, type = "scaledsch")
## c("martingale", "deviance", "score", "schoenfeld", "dfbeta", "dfbetas", "scaledsch","partial")

## Check Karnofsky performance score (only 6 discrete values)
table(lung$ph.karno)

 50  60  70  80  90 100 
  6  19  32  67  74  29 

## Solution 1: Stratify
res.cox1.strata <- coxph(SurvObj ~ age + sex + strata(ph.karno) + wt.loss, data =  lung)
cox.zph(res.cox1.strata)
           rho chisq     p
age     0.0320 0.168 0.682
sex     0.1264 2.261 0.133
wt.loss 0.0469 0.346 0.557
GLOBAL      NA 2.619 0.454
summary(res.cox1.strata)
Call:
coxph(formula = SurvObj ~ age + sex + strata(ph.karno) + wt.loss, 
    data = lung)

  n= 214, number of events= 152 
   (14 observations deleted due to missingness)

            coef exp(coef) se(coef)     z Pr(>|z|)    
age      0.01825   1.01842  0.01002  1.82  0.06854 .  
sex     -0.60333   0.54699  0.18037 -3.34  0.00082 ***
wt.loss -0.00536   0.99465  0.00669 -0.80  0.42288    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

        exp(coef) exp(-coef) lower .95 upper .95
age         1.018      0.982     0.999     1.039
sex         0.547      1.828     0.384     0.779
wt.loss     0.995      1.005     0.982     1.008

Concordance= 0.615  (se = 0.057 )
Rsquare= 0.069   (max possible= 0.983 )
Likelihood ratio test= 15.4  on 3 df,   p=0.00152
Wald test            = 14.6  on 3 df,   p=0.00221
Score (logrank) test = 14.9  on 3 df,   p=0.00187

## Solution 2: Time-varying effect
## Event status variable necessary
lung$event <- (lung$status == 2)
## Erase the survival object (Leaving it in data frames breaks the conversion)
lung2$SurvObj <- NULL
Error: object 'lung2' not found

## Counting process format creation
lung.split <- survSplit(data    = lung,
                        cut     = c(200,400), # vector of timepoints to cut at
                        end     = "time",  # character string with name of event time variable
                        event   = "event", # character string with name of censoring indicator
                        start   = "start", # character string with name of start time variable (created)
                        id      = "id",    # character string with name of new id variable to create
                        zero    = 0,       # If start doesn't already exist, used as start
                        episode = NULL     # character string with name of new episode variable (optional)
                        )
## Make id numeric
lung.split$id <- as.numeric(lung.split$id)

## Reorder by id, then start time
library(doBy)
lung.split <- orderBy( ~ id + start, lung.split)

## Recreate SurbObj
lung.split$SurvObj <- with(lung.split, Surv(time = (start), time2 = time, event = event))

## Check
head(lung.split, 10)
     inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss    SurvObj event start id
1       3  200      2  74   1       1       90       100     1175      NA (  0,200+]     0     0  1
685     3  200      2  74   1       1       90       100     1175      NA (  0,200+]     0     0  1
229     3  306      2  74   1       1       90       100     1175      NA  (200,306]     1   200  1
913     3  306      2  74   1       1       90       100     1175      NA  (200,306]     1   200  1
2       3  200      2  68   1       0       90        90     1225      15 (  0,200+]     0     0  2
686     3  200      2  68   1       0       90        90     1225      15 (  0,200+]     0     0  2
230     3  400      2  68   1       0       90        90     1225      15 (200,400+]     0   200  2
914     3  400      2  68   1       0       90        90     1225      15 (200,400+]     0   200  2
458     3  455      2  68   1       0       90        90     1225      15  (400,455]     1   400  2
1142    3  455      2  68   1       0       90        90     1225      15  (400,455]     1   400  2

## Time-varying effect of baseline variable by including interaction with interval
res.cox1.strata <- coxph(SurvObj ~ age + sex + ph.karno + ph.karno:factor(start) + wt.loss + cluster(id),
                         data =  lung.split)
summary(res.cox1.strata)
Call:
coxph(formula = SurvObj ~ age + sex + ph.karno + ph.karno:factor(start) + 
    wt.loss + cluster(id), data = lung.split)

  n= 822, number of events= 304 
   (36 observations deleted due to missingness)

                              coef exp(coef) se(coef) robust se     z Pr(>|z|)    
age                        0.01582   1.01595  0.00700   0.00994  1.59  0.11137    
sex                       -0.53285   0.58693  0.12325   0.16664 -3.20  0.00139 ** 
ph.karno                  -0.03335   0.96720  0.00688   0.01003 -3.32  0.00089 ***
wt.loss                   -0.00239   0.99761  0.00451   0.00670 -0.36  0.72163    
ph.karno:factor(start)200  0.02226   1.02251  0.01008   0.01203  1.85  0.06412 .  
ph.karno:factor(start)400  0.04180   1.04268  0.01032   0.01317  3.17  0.00150 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                          exp(coef) exp(-coef) lower .95 upper .95
age                           1.016      0.984     0.996     1.036
sex                           0.587      1.704     0.423     0.814
ph.karno                      0.967      1.034     0.948     0.986
wt.loss                       0.998      1.002     0.985     1.011
ph.karno:factor(start)200     1.023      0.978     0.999     1.047
ph.karno:factor(start)400     1.043      0.959     1.016     1.070

Concordance= 0.645  (se = 0.019 )
Rsquare= 0.065   (max possible= 0.978 )
Likelihood ratio test= 54.8  on 6 df,   p=5.07e-10
Wald test            = 26.6  on 6 df,   p=0.000176
Score (logrank) test = 56.5  on 6 df,   p=2.33e-10,   Robust = 22.9  p=0.000827

  (Note: the likelihood ratio and score tests assume independence of
     observations within a cluster, the Wald and robust score tests do not).

rms::survplot() function

## Load rms package
library(rms)

## without specification
survplot(km.by.sex)

plot of chunk unnamed-chunk-6


## Minimum decent
survplot(km.by.sex,
         conf = "none")

plot of chunk unnamed-chunk-6


## Full-fledged grammer
survplot(fit  = km.by.sex,
         conf = c("none","bands","bars")[1],
         xlab = "", ylab = "Survival",
         ## xlim(0,100),
         label.curves = TRUE,                     # label curves directly
         ## label.curves = list(keys = "lines"),  # legend instead of direct label
         levels.only  = FALSE,                    # show only levels, no label
         abbrev.label = FALSE,                    # if label used, abbreviate
         ## fun = function(x) {1 - x},            # Cumulative probability plot         
         loglog   = FALSE,                        # log(-log Survival) plot
         logt     = FALSE,                        # log time
         time.inc = 100,                          # time increment
         dots     = TRUE,                         # dot grid
         n.risk   = TRUE,                         # show number at risk
         ## srt.n.risk = 0, sep.n.risk = 0.056, adj.n.risk = 1,
         ## y.n.risk = 0, cex.n.risk = 0.6
         )

plot of chunk unnamed-chunk-6


## Without dots
survplot(fit  = km.by.sex,
         conf = c("none","bands","bars")[1],
         xlab = "", ylab = "Survival",
         ## xlim(0,100),
         label.curves = TRUE,                     # label curves directly
         ## label.curves = list(keys = "lines"),  # legend instead of direct label
         levels.only  = FALSE,                    # show only levels, no label
         abbrev.label = FALSE,                    # if label used, abbreviate
         ## fun = function(x) {1 - x},            # Cumulative probability plot         
         loglog   = FALSE,                        # log(-log Survival) plot
         logt     = FALSE,                        # log time
         time.inc = 100,                          # time increment
         dots     = FALSE,                         # dot grid
         n.risk   = TRUE,                         # show number at risk
         ## srt.n.risk = 0, sep.n.risk = 0.056, adj.n.risk = 1,
         ## y.n.risk = 0, cex.n.risk = 0.6
         )

plot of chunk unnamed-chunk-6


## Different time incremente
survplot(fit  = km.by.sex,
         conf = c("none","bands","bars")[1],
         xlab = "", ylab = "Survival",
         ## xlim(0,100),
         label.curves = TRUE,                     # label curves directly
         ## label.curves = list(keys = "lines"),  # legend instead of direct label
         levels.only  = FALSE,                    # show only levels, no label
         abbrev.label = FALSE,                    # if label used, abbreviate
         ## fun = function(x) {1 - x},            # Cumulative probability plot         
         loglog   = FALSE,                        # log(-log Survival) plot
         logt     = FALSE,                        # log time
         time.inc = 300,                           # time increment
         dots     = FALSE,                        # dot grid
         n.risk   = TRUE,                         # show number at risk
         ## srt.n.risk = 0, sep.n.risk = 0.056, adj.n.risk = 1,
         ## y.n.risk = 0, cex.n.risk = 0.6
         )

plot of chunk unnamed-chunk-6



## Plot cumulative probability F(t) = 1 - S(t)
survplot(fit  = km.by.sex,
         conf = c("none","bands","bars")[1],
         xlab = "", ylab = "Cumulative Incidence",
         ## xlim(0,100),
         label.curves = TRUE,                     # label curves directly
         ## label.curves = list(keys = "lines"),  # legend instead of direct label
         levels.only  = FALSE,                    # show only levels, no label
         abbrev.label = FALSE,                    # if label used, abbreviate
         fun = function(x) {1 - x},             # Cumulative probability plot         
         loglog   = FALSE,                        # log(-log Survival) plot
         logt     = FALSE,                        # log time
         time.inc = 100,                          # time increment
         dots     = FALSE,                        # dot grid
         n.risk   = TRUE,                         # show number at risk
         ## srt.n.risk = 0, sep.n.risk = 0.056, adj.n.risk = 1,
         ## y.n.risk = 0, cex.n.risk = 0.6
         )

plot of chunk unnamed-chunk-6


## Change position of number at risk
survplot(fit  = km.by.sex,
         conf = c("none","bands","bars")[1],
         xlab = "", ylab = "Survival",
         ## xlim(0,100),
         label.curves = TRUE,                     # label curves directly
         ## label.curves = list(keys = "lines"),  # legend instead of direct label
         levels.only  = FALSE,                    # show only levels, no label
         abbrev.label = FALSE,                    # if label used, abbreviate
         ## fun = function(x) {1 - x},            # Cumulative probability plot         
         loglog   = FALSE,                        # log(-log Survival) plot
         logt     = FALSE,                        # log time
         time.inc = 100,                          # time increment
         dots     = FALSE,                        # dot grid
         n.risk   = TRUE,                         # show number at risk
         ## srt.n.risk = 0, sep.n.risk = 0.056, adj.n.risk = 1,
         y.n.risk = -0.2, cex.n.risk = 0.6
         )

plot of chunk unnamed-chunk-6