Relevant package installation

library(survival)
library(Epi)
install.packages("popEpi", repos = "http://cran.us.r-project.org" )

## package 'popEpi' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\acliv1\AppData\Local\Temp\RtmpmYX87d\downloaded_packages

library(popEpi)

## 
## Attaching package: 'popEpi'

## The following object is masked from 'package:survival':
## 
##     Surv

# popEpi::splitMulti returns a data.frame rather than a data.table
options("popEpi.datatable" = FALSE)
clear()

noquote(version$version.string)

## [1] R version 4.2.2 (2022-10-31 ucrt)

noquote(installed.packages()[c("Epi","popEpi"),c("Version","Built")])

##        Version Built
## Epi    2.47    4.2.2
## popEpi 0.4.10  4.2.2

Data and simple survival

Load the lung data from the survival package, and convert sex to a factor

Always do that with categorical variables. Also we rescale time from days to months:

data(lung)

## Warning in data(lung): data set 'lung' not found

lung$sex <- factor(lung$sex,
                   levels = 1:2,
                   labels = c("M", "W"))
lung$time <- round(lung$time / (365.25/12), 3)
head(lung)

##   inst   time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1    3 10.053      2  74   M       1       90       100     1175      NA
## 2    3 14.949      2  68   M       0       90        90     1225      15
## 3    3 33.183      1  56   M       0       90        90       NA      15
## 4    5  6.899      2  57   M       1       90        60     1150      11
## 5    1 29.010      2  60   M       0      100        90       NA       0
## 6   12 33.577      1  74   M       1       50        80      513       0

Use survfit to construct the Kaplan-Meier estimator of overall survival:

km <- survfit(Surv(time, status == 2) ~ 1, data = lung)
km

## Call: survfit(formula = Surv(time, status == 2) ~ 1, data = lung)
## 
##        n events median 0.95LCL 0.95UCL
## [1,] 228    165   10.2    9.36    11.9

summary(km) #note very long output

## Call: survfit(formula = Surv(time, status == 2) ~ 1, data = lung)
## 
##    time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   0.164    228       1   0.9956 0.00438       0.9871        1.000
##   0.361    227       3   0.9825 0.00869       0.9656        1.000
##   0.394    224       1   0.9781 0.00970       0.9592        0.997
##   0.427    223       2   0.9693 0.01142       0.9472        0.992
##   0.493    221       1   0.9649 0.01219       0.9413        0.989
##   0.854    220       1   0.9605 0.01290       0.9356        0.986
##   0.986    219       1   0.9561 0.01356       0.9299        0.983
##   1.018    218       1   0.9518 0.01419       0.9243        0.980
##   1.741    217       2   0.9430 0.01536       0.9134        0.974
##   1.774    215       1   0.9386 0.01590       0.9079        0.970
##   1.938    214       1   0.9342 0.01642       0.9026        0.967
##   1.971    213       2   0.9254 0.01740       0.8920        0.960
##   2.004    211       1   0.9211 0.01786       0.8867        0.957
##   2.037    210       1   0.9167 0.01830       0.8815        0.953
##   2.136    209       2   0.9079 0.01915       0.8711        0.946
##   2.333    207       1   0.9035 0.01955       0.8660        0.943
##   2.595    206       1   0.8991 0.01995       0.8609        0.939
##   2.661    205       2   0.8904 0.02069       0.8507        0.932
##   2.891    203       2   0.8816 0.02140       0.8406        0.925
##   3.023    201       1   0.8772 0.02174       0.8356        0.921
##   3.055    199       1   0.8728 0.02207       0.8306        0.917
##   3.121    198       2   0.8640 0.02271       0.8206        0.910
##   3.450    196       1   0.8596 0.02302       0.8156        0.906
##   3.515    194       2   0.8507 0.02362       0.8056        0.898
##   3.614    192       1   0.8463 0.02391       0.8007        0.894
##   3.811    191       1   0.8418 0.02419       0.7957        0.891
##   3.877    190       1   0.8374 0.02446       0.7908        0.887
##   4.008    189       1   0.8330 0.02473       0.7859        0.883
##   4.304    188       1   0.8285 0.02500       0.7810        0.879
##   4.337    187       2   0.8197 0.02550       0.7712        0.871
##   4.435    185       1   0.8153 0.02575       0.7663        0.867
##   4.665    184       1   0.8108 0.02598       0.7615        0.863
##   4.731    183       1   0.8064 0.02622       0.7566        0.859
##   4.764    182       2   0.7975 0.02667       0.7469        0.852
##   4.830    180       1   0.7931 0.02688       0.7421        0.848
##   5.027    179       1   0.7887 0.02710       0.7373        0.844
##   5.125    178       2   0.7798 0.02751       0.7277        0.836
##   5.355    176       3   0.7665 0.02809       0.7134        0.824
##   5.454    173       2   0.7577 0.02845       0.7039        0.816
##   5.487    171       1   0.7532 0.02863       0.6991        0.811
##   5.585    170       1   0.7488 0.02880       0.6944        0.807
##   5.749    167       1   0.7443 0.02898       0.6896        0.803
##   5.782    165       1   0.7398 0.02915       0.6848        0.799
##   5.815    164       1   0.7353 0.02932       0.6800        0.795
##   5.881    162       2   0.7262 0.02965       0.6704        0.787
##   5.914    160       1   0.7217 0.02981       0.6655        0.783
##   5.947    159       2   0.7126 0.03012       0.6559        0.774
##   5.979    157       1   0.7081 0.03027       0.6511        0.770
##   6.012    156       1   0.7035 0.03041       0.6464        0.766
##   6.111    154       1   0.6989 0.03056       0.6416        0.761
##   6.209    152       1   0.6943 0.03070       0.6367        0.757
##   6.374    149       1   0.6897 0.03085       0.6318        0.753
##   6.472    147       1   0.6850 0.03099       0.6269        0.749
##   6.538    145       1   0.6803 0.03113       0.6219        0.744
##   6.604    144       2   0.6708 0.03141       0.6120        0.735
##   6.637    142       1   0.6661 0.03154       0.6071        0.731
##   6.801    139       1   0.6613 0.03168       0.6020        0.726
##   6.834    138       1   0.6565 0.03181       0.5970        0.722
##   6.899    137       1   0.6517 0.03194       0.5920        0.717
##   6.965    135       1   0.6469 0.03206       0.5870        0.713
##   7.162    134       1   0.6421 0.03218       0.5820        0.708
##   7.294    132       1   0.6372 0.03231       0.5769        0.704
##   7.326    130       1   0.6323 0.03243       0.5718        0.699
##   7.425    126       1   0.6273 0.03256       0.5666        0.694
##   7.524    125       1   0.6223 0.03268       0.5614        0.690
##   7.556    124       1   0.6172 0.03280       0.5562        0.685
##   7.852    121       2   0.6070 0.03304       0.5456        0.675
##   8.049    117       1   0.6019 0.03316       0.5402        0.670
##   8.082    116       1   0.5967 0.03328       0.5349        0.666
##   8.772    112       1   0.5913 0.03341       0.5294        0.661
##   8.805    111       1   0.5860 0.03353       0.5239        0.656
##   8.838    110       1   0.5807 0.03364       0.5184        0.651
##   8.871    108       1   0.5753 0.03376       0.5128        0.645
##   9.298    104       1   0.5698 0.03388       0.5071        0.640
##   9.331    103       1   0.5642 0.03400       0.5014        0.635
##   9.363    101       2   0.5531 0.03424       0.4899        0.624
##   9.396     99       1   0.5475 0.03434       0.4841        0.619
##   9.462     98       1   0.5419 0.03444       0.4784        0.614
##   9.561     97       1   0.5363 0.03454       0.4727        0.608
##   9.626     94       1   0.5306 0.03464       0.4669        0.603
##   9.889     91       1   0.5248 0.03475       0.4609        0.597
##   9.955     89       1   0.5189 0.03485       0.4549        0.592
##  10.021     87       1   0.5129 0.03496       0.4488        0.586
##  10.053     86       1   0.5070 0.03506       0.4427        0.581
##  10.185     85       2   0.4950 0.03523       0.4306        0.569
##  10.513     82       1   0.4890 0.03532       0.4244        0.563
##  10.809     81       1   0.4830 0.03539       0.4183        0.558
##  11.072     79       1   0.4768 0.03547       0.4121        0.552
##  11.170     78       1   0.4707 0.03554       0.4060        0.546
##  11.335     77       1   0.4646 0.03560       0.3998        0.540
##  11.433     76       1   0.4585 0.03565       0.3937        0.534
##  11.499     75       1   0.4524 0.03569       0.3876        0.528
##  11.532     74       1   0.4463 0.03573       0.3815        0.522
##  11.598     73       2   0.4340 0.03578       0.3693        0.510
##  11.860     70       1   0.4278 0.03581       0.3631        0.504
##  11.926     69       2   0.4154 0.03583       0.3508        0.492
##  11.959     67       1   0.4092 0.03582       0.3447        0.486
##  12.189     65       2   0.3966 0.03581       0.3323        0.473
##  12.715     60       1   0.3900 0.03582       0.3258        0.467
##  12.813     59       1   0.3834 0.03582       0.3193        0.460
##  12.945     58       1   0.3768 0.03580       0.3128        0.454
##  13.996     55       1   0.3700 0.03580       0.3060        0.447
##  14.062     54       1   0.3631 0.03579       0.2993        0.440
##  14.094     53       1   0.3563 0.03576       0.2926        0.434
##  14.226     52       1   0.3494 0.03573       0.2860        0.427
##  14.522     51       1   0.3426 0.03568       0.2793        0.420
##  14.587     50       1   0.3357 0.03561       0.2727        0.413
##  14.784     48       1   0.3287 0.03555       0.2659        0.406
##  14.949     47       1   0.3217 0.03548       0.2592        0.399
##  15.014     46       1   0.3147 0.03539       0.2525        0.392
##  15.113     44       1   0.3076 0.03530       0.2456        0.385
##  15.540     43       1   0.3004 0.03520       0.2388        0.378
##  15.671     42       1   0.2933 0.03508       0.2320        0.371
##  17.051     39       1   0.2857 0.03498       0.2248        0.363
##  17.084     38       1   0.2782 0.03485       0.2177        0.356
##  17.216     37       2   0.2632 0.03455       0.2035        0.340
##  17.511     34       1   0.2554 0.03439       0.1962        0.333
##  18.070     32       1   0.2475 0.03423       0.1887        0.325
##  18.333     30       1   0.2392 0.03407       0.1810        0.316
##  18.628     28       1   0.2307 0.03391       0.1729        0.308
##  18.858     27       1   0.2221 0.03371       0.1650        0.299
##  19.154     26       1   0.2136 0.03348       0.1571        0.290
##  20.140     24       1   0.2047 0.03325       0.1489        0.281
##  20.501     23       1   0.1958 0.03297       0.1407        0.272
##  21.060     22       1   0.1869 0.03265       0.1327        0.263
##  21.125     21       1   0.1780 0.03229       0.1247        0.254
##  21.487     20       1   0.1691 0.03188       0.1169        0.245
##  21.520     19       1   0.1602 0.03142       0.1091        0.235
##  22.571     18       1   0.1513 0.03090       0.1014        0.226
##  22.637     17       1   0.1424 0.03034       0.0938        0.216
##  23.162     16       1   0.1335 0.02972       0.0863        0.207
##  23.228     15       1   0.1246 0.02904       0.0789        0.197
##  23.918     14       1   0.1157 0.02830       0.0716        0.187
##  24.016     13       1   0.1068 0.02749       0.0645        0.177
##  24.148     12       1   0.0979 0.02660       0.0575        0.167
##  25.133     10       1   0.0881 0.02568       0.0498        0.156
##  25.988      9       1   0.0783 0.02462       0.0423        0.145
##  26.743      7       1   0.0671 0.02351       0.0338        0.133
##  29.010      4       1   0.0503 0.02285       0.0207        0.123

plot(km)

Explore if survival patterns between men and women are different:

kms <- survfit(Surv(time, status == 2) ~ sex, data = lung)
kms

## Call: survfit(formula = Surv(time, status == 2) ~ sex, data = lung)
## 
##         n events median 0.95LCL 0.95UCL
## sex=M 138    112   8.87    6.96    10.2
## sex=W  90     53  14.00   11.43    18.1

We can plot the two resulting survival curves with their confidence intervals

 plot(kms, col = c("blue", "red"), lwd = 1, conf.int = TRUE)
lines(kms, col = c("blue", "red"), lwd = 3)

We see that men have worse survival than women, but they are also older in terms of age of diagnosis:

with(lung, tapply(age, sex, mean))

##        M        W 
## 63.34058 61.07778

Formally there is a significant difference in survival between men and women

?survdiff

## starting httpd help server ... done

survdiff(Surv(time, status == 2) ~ sex, data = lung)

## Call:
## survdiff(formula = Surv(time, status == 2) ~ sex, data = lung)
## 
##         N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=M 138      112     91.6      4.55      10.3
## sex=W  90       53     73.4      5.68      10.3
## 
##  Chisq= 10.3  on 1 degrees of freedom, p= 0.001

What is the null hypothesis tested here?

Assumptions We are actually testing whether the rate-ratio is 1, assuming proportional hazards (log-rank test)

Rates and rate-ratios: Simple Cox model

Now explore how much sex and age (at diagnosis) influence the mortality—note that we are now addressing the mortality rate and not the survival in a Cox-model:

Note ci.exp function: ci.exp returns only the exponentiated parameter estimates with confidence intervals. It is merely a wrapper for ci.lin, fishing out the last 3 columns from ci.lin(…,Exp=TRUE). If you just want the estimates and confidence limits, but not exponentiated, use ci.exp(…,Exp=FALSE).

c0 <- coxph(Surv(time, status == 2) ~ sex, data= lung)
c1 <-coxph(Surv(time, status == 2) ~ sex + I(age/10), data = lung)
summary(c1)

## Call:
## coxph(formula = Surv(time, status == 2) ~ sex + I(age/10), data = lung)
## 
##   n= 228, number of events= 165 
## 
##               coef exp(coef) se(coef)      z Pr(>|z|)   
## sexW      -0.51322   0.59857  0.16746 -3.065  0.00218 **
## I(age/10)  0.17045   1.18584  0.09223  1.848  0.06459 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## sexW         0.5986     1.6707    0.4311    0.8311
## I(age/10)    1.1858     0.8433    0.9897    1.4208
## 
## Concordance= 0.603  (se = 0.025 )
## Likelihood ratio test= 14.12  on 2 df,   p=9e-04
## Wald test            = 13.47  on 2 df,   p=0.001
## Score (logrank) test = 13.72  on 2 df,   p=0.001

ci.exp(c0)

##      exp(Est.)      2.5%     97.5%
## sexW 0.5880028 0.4237178 0.8159848

ci.exp(c1)

##           exp(Est.)      2.5%     97.5%
## sexW       0.598566 0.4310936 0.8310985
## I(age/10)  1.185842 0.9897335 1.4208086

We see that there is not much confounding by age; the W/M mortality rate-ration (RR) (hazard ratio, HR, is another word for this) is slightly below 0.6 whether age is included or not. The age effect is formally non-significant, the estimate corresponds to a 19% higher mortality rate per 10 years of age at diagnosis (mortality RR or hazard ratio of 1.1858).

Checking is assumption for ph holds

We can check if the assumption of proportional hazards holds, cox.zph provides a test, and the plot method shows the Schoenfeld residuals and a smooth of them; interpretable as an estimate of the interaction effect; that is how the W/M (log) rate-ratio depends on time:

?cox.zph

cox.zph(c0)

##        chisq df     p
## sex     2.86  1 0.091
## GLOBAL  2.86  1 0.091

(z1 <- cox.zph(c1))

##           chisq df    p
## sex       2.608  1 0.11
## I(age/10) 0.209  1 0.65
## GLOBAL    2.771  2 0.25

par(mfrow = c(1,2)); plot(z1)

If the proportional hazards model holds, then the resulting lines in he plots should be approximately horizontal. But we shall return to a more explicit way of addressing this.

We see that there is no systematic pattern for age, but an increase by sex. The cox.zph really gives a test for an interaction between each covariate and the time scale.

the Cox model does not provide an estimate of the effect of time on mortality, and the by that token it is impossible to estimate any interactions with time either.

Prediction data frames for visual cox ph

Above we showed the Kaplan-Meier estimator for each of the two sexes. We can also show the estimated survival curves for the two sexes as derived from the Cox-model. This requires a prediction data frame—a data frame with the same variables as in the Cox-model and values of these representing the persons for whom we want predictions:

prs <- survfit(c0, newdata = data.frame(sex = c("M", "W")))
plot(prs, col = c("blue", "red"))

How is the shape of the two cuves relative to each other and to the original KM curve before?

Over-plot the Cox ph and KM curves

plot(prs, col = c("blue", "red"), lwd = 1, lty =1, conf.int = F)
lines(prs, col = c("blue", "red"), lty = 1, lwd = 3)
lines(kms, col = c("blue", "red"), lty = "11", lwd = 2, lend = "butt")

So far we have only seen the rates through the glasses of survival curves—it is a little odd to try and judge hazards (rates) on a transformed sace, instaed of looking at the hazards directly.

Simple Poisson model

But we do not know how the mortality per se looks as a function of time (since diagnosis). That function is not available from the Cox-model or from the survfit object. To that end we must provide a model for the effect of time on mortality; the simplest is of course to assume that it is constant or a simple linear function of time.

Assume mortality constant over time

For a start we assume that the mortality is constant over time. If it is so, then the likelihood for the model is equivalent to a Poisson likelihood, which can be fitted using the poisreg family from the Epi package—note in poarticular the the response is a two-column matrix (created with cbind) of events count and person-time (here time, because every one starts at 0):

?poisreg

with(lung, cbind(status == 2, time)[1:10,])

##           time
##  [1,] 1 10.053
##  [2,] 1 14.949
##  [3,] 0 33.183
##  [4,] 1  6.899
##  [5,] 1 29.010
##  [6,] 0 33.577
##  [7,] 1 10.185
##  [8,] 1 11.860
##  [9,] 1  7.162
## [10,] 1  5.454

p1 <- glm(cbind(status ==2, time) ~ sex + I(age/10), family = poisreg, data = lung)
ci.exp(p1) # estimates from Poisson regeression

##              exp(Est.)       2.5%     97.5%
## (Intercept) 0.03255173 0.01029235 0.1029518
## sexW        0.61820344 0.44555514 0.8577513
## I(age/10)   1.16904426 0.97796555 1.3974567

ci.exp(c1) # estimates from Cox

##           exp(Est.)      2.5%     97.5%
## sexW       0.598566 0.4310936 0.8310985
## I(age/10)  1.185842 0.9897335 1.4208086

We see that the estimates of sex and age effects are quite close between the Poisson and the Cox models, but also that the Poisson model has an intercept term, the estimate of the (assumed) constant underlying mortality. Since we entered the risk time part of the response (second argument in the cbind) in units of months (remember we rescaled in the beginning?), the (Intercept) (taken from the ci.exp) is a rate per 1 person-month.

What age and sex does the (Intercept) refer to?

Note the syntax for poisreg differs from poisson, as it allows the use of an off-set value.

px <- glm(status == 2 ~ sex + age, offset = log(time), family = poisson, data = lung)

ci.exp(px)

##              exp(Est.)       2.5%     97.5%
## (Intercept) 0.03255173 0.01029254 0.1029499
## sexW        0.61820344 0.44555975 0.8577424
## age         1.01574126 0.99777475 1.0340313

The formulation with the offset is the reason that papers use the description ”…we fitted a Poisson model with log person years as offset”. The drawback of the poisson approach is that you need the (risk) time (person-years) as a variable in the prediction frame. This is not the case for poisreg, where you get the predicted rates per unit in which you entered the person years when specifying the model.

Representation of follow-up: Lexis object

If we want to see how mortality varies by age we must split the follow-up of each person in small intervals of say, 30 days. This is most easily done using a Lexis object. That is basically just taking the lung dataset and adding a few features that defines times and states. The point is that it makes life a lot easier when things get more complex than just simple survival. It can be seen as an expansion of the stset facility in Stata.

?Lexis

L1 <- Lexis(exit = list(tfl = time), exit.status = factor(status, levels = 1:2, labels = c("Alive", "Dead")), data = lung)

## NOTE: entry.status has been set to "Alive" for all.
## NOTE: entry is assumed to be 0 on the tfl timescale.

head(L1)

##  lex.id tfl lex.dur lex.Cst lex.Xst inst   time status age sex ph.ecog ph.karno
##       1   0   10.05   Alive    Dead    3 10.053      2  74   M       1       90
##       2   0   14.95   Alive    Dead    3 14.949      2  68   M       0       90
##       3   0   33.18   Alive   Alive    3 33.183      1  56   M       0       90
##       4   0    6.90   Alive    Dead    5  6.899      2  57   M       1       90
##       5   0   29.01   Alive    Dead    1 29.010      2  60   M       0      100
##       6   0   33.58   Alive   Alive   12 33.577      1  74   M       1       50
##  pat.karno meal.cal wt.loss
##        100     1175      NA
##         90     1225      15
##         90       NA      15
##         60     1150      11
##         90       NA       0
##         80      513       0

There are five new variables added to the dataset:

lex.id: a numerical id of each record in the dataset (normally this will be a person id). Can be set explicitly with the id= argument of Lexis.

tfl: time from lung cancer at the time of entry, therefore it is 0 for all persons; the entry time is 0 from the entry time. The name of this variable was taken from the exit= argument to Lexis.

lex.dur: the length of time a person is in state lex.Cst, here measured in months,because time is.

lex.Cst: Current state, the state in which the lex.dur time is spent.

lex.Xst: eXit state, the state to which the person moves after the lex.dur time in lex.Cst.

This seems a bit of an overkill for keeping track of time and death for the lung cancer patients, but the point here is that this generalizes to multistate data too.

Here is a summary:

summary(L1)

##        
## Transitions:
##      To
## From    Alive Dead  Records:  Events: Risk time:  Persons:
##   Alive    63  165       228      165    2286.42       228

What is the average follow-up time for persons?

For a graphical representation:

?boxes

boxes(L1, boxpos = T)

Explain the numbers in the resulting graph. Redo the graph with risk time counted in years

boxes(L1, boxpos = TRUE, scale.Y = 12, digits.R = 3, show.Y = T)

We can make the Cox-analysis using the Lexis-specific variables:

?Surv

c1 <- coxph(Surv(tfl, tfl + lex.dur, lex.Xst == "Dead") ~ sex + age, data=L1)

#An even simpler method using Lexis features:

?coxph.Lexis

cL <- coxph.Lexis(L1, tfl ~ sex + age)

## survival::coxph analysis of Lexis object L1:
## Rates for the transition:
## Alive->Dead
## Baseline timescale: tfl

ci.exp(cL)

##      exp(Est.)      2.5%     97.5%
## sexW  0.598566 0.4310936 0.8310985
## age   1.017191 0.9989686 1.0357467

ci.exp(c1)

##      exp(Est.)      2.5%     97.5%
## sexW  0.598566 0.4310936 0.8310985
## age   1.017191 0.9989686 1.0357467

We can also make a Poisson analysis:

pc <- glm(cbind(lex.Xst == "Dead", lex.dur) ~ sex + age, family = poisreg, data = L1)

#Once again, made simpler with the Lexis features

pL <- glm.Lexis(L1, ~ sex + age)

## stats::glm Poisson analysis of Lexis object L1 with log link:
## Rates for the transition:
## Alive->Dead

ci.exp(pL)

##              exp(Est.)       2.5%     97.5%
## (Intercept) 0.03255173 0.01029235 0.1029518
## sexW        0.61820344 0.44555514 0.8577513
## age         1.01574126 0.99777440 1.0340317

ci.exp(pc)

##              exp(Est.)       2.5%     97.5%
## (Intercept) 0.03255173 0.01029235 0.1029518
## sexW        0.61820344 0.44555514 0.8577513
## age         1.01574126 0.99777440 1.0340317

Remember that the Poisson-model fitted is a pretty brutal approximation to the Cox-model—it assumes that the baseline hazard is constant, whereas the Cox-model allows the baseline hazard to vary arbitrarily by time.

#Estimating the hazard function: splitting time

If we want a more detailed version of the baseline hazard we split follow-up time in small intervals, assume that the hazard is constant in each small interval, and assume the the size of the hazard varies smoothly with time, tfl:

We can subdivide the follow up into small intervals using survival:::survSplit, Epi:::splitLexis, or popEpi:::splitMulti. The splitmulti is by far the easiest to use and the fastest. Recall that time has been rescaled to months, so we split in 1 month intervals (0 to 36 months)

SL <- splitMulti(L1, tfl = 0:36)


summary(L1)

##        
## Transitions:
##      To
## From    Alive Dead  Records:  Events: Risk time:  Persons:
##   Alive    63  165       228      165    2286.42       228

summary(SL)

##        
## Transitions:
##      To
## From    Alive Dead  Records:  Events: Risk time:  Persons:
##   Alive  2234  165      2399      165    2286.42       228

We can see how the follow up for person, 10 say, is in the original and the split dataset:

wh <- names(L1)[1:10] # names of variables in some order

subset(L1, lex.id == 10)[,wh]

##  lex.id tfl lex.dur lex.Cst lex.Xst inst  time status age sex
##      10   0    5.45   Alive    Dead    7 5.454      2  61   M

subset(SL, lex.id == 10)[,wh]

##  lex.id tfl lex.dur lex.Cst lex.Xst inst  time status age sex
##      10   0    1.00   Alive   Alive    7 5.454      2  61   M
##      10   1    1.00   Alive   Alive    7 5.454      2  61   M
##      10   2    1.00   Alive   Alive    7 5.454      2  61   M
##      10   3    1.00   Alive   Alive    7 5.454      2  61   M
##      10   4    1.00   Alive   Alive    7 5.454      2  61   M
##      10   5    0.45   Alive    Dead    7 5.454      2  61   M

In Sl each record now represents a small interval of follow-up for a person, so each person has many records. The main thing to note here is tfl, which represents the time since lung cancer diagnosis at the beginning of each interval, and lex.dur representing the risk time (“person-years”, in months though)—the length of the interval. The reason we need to split follow-up in small pieces is that the modeling implicitly assumes that the mortality rate is constant within each interval. If intervals are too long, this assumption is a bad approximation.

We can now include a smooth effect of tfl in the Poisson-model allowing the baseline hazard to vary by time since lung Ca. That is done by natural splines:

ps <- glm(cbind(lex.Xst == "Dead", lex.dur) ~ Ns(tfl, knots = seq(0.,36,12)) + sex + age, family = poisreg, data = SL)

ci.exp(ps)

##                                   exp(Est.)        2.5%       97.5%
## (Intercept)                      0.01898426 0.005700994  0.06321739
## Ns(tfl, knots = seq(0, 36, 12))1 2.40380283 0.809425097  7.13873100
## Ns(tfl, knots = seq(0, 36, 12))2 4.15015448 0.436289141 39.47790716
## Ns(tfl, knots = seq(0, 36, 12))3 0.83994013 0.043932122 16.05885147
## sexW                             0.59871596 0.431231819  0.83124850
## age                              1.01658684 0.998376760  1.03512907

#Or a simpler method with Lexis

ps <- glm.Lexis(SL, ~Ns(tfl, knots  =seq(0,36,12)) +age +sex)

## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead

ci.exp(ps)

##                                   exp(Est.)        2.5%       97.5%
## (Intercept)                      0.01898426 0.005700994  0.06321739
## Ns(tfl, knots = seq(0, 36, 12))1 2.40380283 0.809425097  7.13873100
## Ns(tfl, knots = seq(0, 36, 12))2 4.15015448 0.436289141 39.47790716
## Ns(tfl, knots = seq(0, 36, 12))3 0.83994013 0.043932122 16.05885147
## age                              1.01658684 0.998376760  1.03512907
## sexW                             0.59871596 0.431231819  0.83124850

Compate these to the regresssion estimates from the Cox0model and from the model with constant baseline:

ests <- cbind(ci.exp(c1), ci.exp(ps, subset = c("sex", "age")), ci.exp(pc, subset = c("sex", "age")))

colnames(ests)[1:3*3-2] <- c("Cox", "Pois-spline", "Pois-const")
round(ests,3)

##        Cox  2.5% 97.5% Pois-spline  2.5% 97.5% Pois-const  2.5% 97.5%
## sexW 0.599 0.431 0.831       0.599 0.431 0.831      0.618 0.446 0.858
## age  1.017 0.999 1.036       1.017 0.998 1.035      1.016 0.998 1.034

We see that the smooth parametric Poisson model and the Cox model produce virtually the same estimates of age and sex-effects, whereas the Poisson model with constant hazard produce slightly different ones.

The proportional hazards assumption is the same for the Cox model and both the Poisson models: The M/W hazard ratio is the same at any time after diagnosis. What differs between the models is the assumed shape of the hazard (not a hazard ratio).

The Cox model allows the baseline rate to change arbitrarily at every event time, not using the quantitative nature of time; the spilne Poisson model has a baseline that varies smoothly by time and the constant Poisson model has a baseline that is constant over time. The latter is clearly not tenable, whereas the smooth Poisson model and the Cox model give the same regression estimates.

So we now have a parametric model for the baseline hazard which means that we can show the estimated baseline hazard for, say, a 60 year old woman, by supplying a suitable prediction frame, i.e. a data frame where each row represents a set of covariate values (including the time) where we want the predicted mortality—her we use times from 0 to 30 months in steps of 0.2 months:

prf <- data.frame(tfl = seq(0,30,0.2), sex  ="W", age = 60)

The choice of prediction points are totally unrelated to the intervals in which we split the follow-up for analysis. We can over-plot with the predicted rates from the model where mortality rates are constant, the only change is the model (pc instead of ps):

matshade(prf$tfl, ci.pred(ps, prf), lwd = 3, plot = T, log = "y")
matshade(prf$tfl, ci.pred(pc, prf), lwd = 3, lty = "21", lend = "butt")

What we see from the plot is that mortality rates are increasing during the first 1.5 years after lung cancer and then leveling off.

Put some sensible axis labels on the plot, and rescale the rates to rates per 1 person-year.

We can transform the hazard function, λ(t), to a survival function, S(t) using the relationship S(t) = exp(− R t 0 λ(u) du). This integration exercise is implemented in the ci.surv function, which takes a model and a prediction data frame as arguments; the prediction data frame must correspond to a sequence of equidistant time points (makes life easier for the coder), so we can use prf for this purpose:

matshade(prf$tfl, ci.surv(ps, prf, intl = 0.2),
         plot = TRUE, ylim = 0:1, lwd = 3)

This is the survival function for a 60 year old woman. We can expand this by overlaying the survival function from the model with constant hazard (also known as ”exponential(y distributed) survival”) and the KM-estimator

matshade(prf$tfl, ci.surv(ps, prf, intl = 0.2),
         plot = TRUE, ylim = 0:1, lwd = 3)
lines(prf$tfl, ci.surv(pc, prf, intl = 0.2)[,1], lwd = 2, col = gray(0.5))
lines(survfit(c1, newdata = data.frame(sex = "W", age = 60)),
      lwd = 2, lty = 1)

We see that the survival function from the constant hazard model is quite a bit off, but also a good correspondence between the Cox-model based survival and the survival from the parametric hazard function. We can bring the plots together in one graph:

par(mfrow = c(1,2))
# hazard scale
matshade(prf$tfl, ci.pred(ps, prf),
         plot = TRUE, log = "y", lwd = 3, xlim = c(0,30))
matshade(prf$tfl, ci.pred(pc, prf), lty = 3, lwd = 3)
#
# survival
matshade(prf$tfl, ci.surv(ps, prf, intl = 0.2),
         plot = TRUE, ylim = 0:1, lwd = 3, xlim = c(0,30))
matshade(prf$tfl, ci.surv(pc, prf, intl = 0.2),
         lty = 3, alpha = 0, lwd = 3)
lines(survfit(c1, newdata = data.frame(sex = "W", age = 60)),
      col = "forestgreen", lwd = 3)

We have compared the predicted survival curve from a Poisson model with age and sex and time since lung cancer as covariates to that from a Cox-model with age and sex as covariates and time since lung cancer as underlying time scale.

We now go back to the Kaplan-Meier estimator and compare that to the corresponding Poisson-model, which is one with time (tfl) as the only covariate:

par(mfrow=c(1,2))
pk <- glm(cbind(lex.Xst == "Dead",
                lex.dur) ~ Ns(tfl, knots = seq(0, 36, 12)),
          family = poisreg,
            data = SL)
#or
pk <- glm.Lexis(SL, ~ Ns(tfl, knots = seq(0, 36, 12)))

## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead

# hazard
matshade(prf$tfl, ci.pred(pk, prf),
         plot = TRUE, log = "y", lwd = 3, ylim = c(0.01,1))

# survival from smooth model
matshade(prf$tfl, ci.surv(pk, prf, intl = 0.2) ,
         plot = TRUE, lwd = 3, ylim = 0:1)

# K-M estimator
lines(km, lwd = 2)

We can explore how the tightness of the knots in the smooth model influence the underlying hazard and the resulting survival function. This is easiest done by setting up a function that does the analysis with different distance between of knots

zz <-
function(dk)
{
kn <<- seq(0, 36, dk) # must be in global environment...
print(kn)
pk <<- glm.Lexis(SL, ~ Ns(tfl, knots = kn))
matshade(prf$tfl, ci.pred(pk, prf),
         plot = TRUE, log = "y", lwd = 2, ylim = c(0.01,1), xlim = c(0,30))
rug(kn, lwd=3)

plot(km, lwd = 2, col = "limegreen", xlim = c(0,30))
matshade(prf$tfl, ci.surv(pk, prf, intl = 0.2) ,
         lwd = 2, ylim = 0:1, xlim = c(0,30))
}

par(mfrow = c(1,2))
zz(18)

## [1]  0 18 36
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead

## Warning in rug(kn, lwd = 3): some values will be clipped

zz(12)

## [1]  0 12 24 36
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead

## Warning in rug(kn, lwd = 3): some values will be clipped

zz(6)

## [1]  0  6 12 18 24 30 36
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead

## Warning in rug(kn, lwd = 3): some values will be clipped

zz(4)

##  [1]  0  4  8 12 16 20 24 28 32 36
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead

## Warning: glm.fit: fitted rates numerically 0 occurred

## Warning: some values will be clipped

zz(3)

##  [1]  0  3  6  9 12 15 18 21 24 27 30 33 36
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead

## Warning: glm.fit: fitted rates numerically 0 occurred

## Warning: some values will be clipped

You will see that the more knots you include, the closer the parametric estimate gets to the Kaplan-Meier estimator. But also that the estimated underlying hazard becomes increasingly silly. The ultimate silliness is of course achieved when we arrive at the Kaplan-Meier estimator.

Fortunately the baseline hazard underlying the Kaplan-Meier and the Breslow estimator is rarely shown.

#Conclusion

The Cox-model and the Poisson-model gives the same results for the regression parameters, provided the baseline is properly modeled

The Cox-model is a partial model, it only gives the HRs, not the rates. The baseline rates are missing.

The Poisson-model is a full model for the rates. Any set of predicted rates (and derivatives thereof) can be derived from the Poisson model.

Bendix Stats Program - Chapter 1

Adam Livori

16/02/2023