library(survival)
library(Epi)
install.packages("popEpi", repos = "http://cran.us.r-project.org" )
## package 'popEpi' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\acliv1\AppData\Local\Temp\RtmpmYX87d\downloaded_packages
library(popEpi)
##
## Attaching package: 'popEpi'
## The following object is masked from 'package:survival':
##
## Surv
# popEpi::splitMulti returns a data.frame rather than a data.table
options("popEpi.datatable" = FALSE)
clear()
noquote(version$version.string)
## [1] R version 4.2.2 (2022-10-31 ucrt)
noquote(installed.packages()[c("Epi","popEpi"),c("Version","Built")])
## Version Built
## Epi 2.47 4.2.2
## popEpi 0.4.10 4.2.2
Always do that with categorical variables. Also we rescale time from days to months:
data(lung)
## Warning in data(lung): data set 'lung' not found
lung$sex <- factor(lung$sex,
levels = 1:2,
labels = c("M", "W"))
lung$time <- round(lung$time / (365.25/12), 3)
head(lung)
## inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1 3 10.053 2 74 M 1 90 100 1175 NA
## 2 3 14.949 2 68 M 0 90 90 1225 15
## 3 3 33.183 1 56 M 0 90 90 NA 15
## 4 5 6.899 2 57 M 1 90 60 1150 11
## 5 1 29.010 2 60 M 0 100 90 NA 0
## 6 12 33.577 1 74 M 1 50 80 513 0
km <- survfit(Surv(time, status == 2) ~ 1, data = lung)
km
## Call: survfit(formula = Surv(time, status == 2) ~ 1, data = lung)
##
## n events median 0.95LCL 0.95UCL
## [1,] 228 165 10.2 9.36 11.9
summary(km) #note very long output
## Call: survfit(formula = Surv(time, status == 2) ~ 1, data = lung)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0.164 228 1 0.9956 0.00438 0.9871 1.000
## 0.361 227 3 0.9825 0.00869 0.9656 1.000
## 0.394 224 1 0.9781 0.00970 0.9592 0.997
## 0.427 223 2 0.9693 0.01142 0.9472 0.992
## 0.493 221 1 0.9649 0.01219 0.9413 0.989
## 0.854 220 1 0.9605 0.01290 0.9356 0.986
## 0.986 219 1 0.9561 0.01356 0.9299 0.983
## 1.018 218 1 0.9518 0.01419 0.9243 0.980
## 1.741 217 2 0.9430 0.01536 0.9134 0.974
## 1.774 215 1 0.9386 0.01590 0.9079 0.970
## 1.938 214 1 0.9342 0.01642 0.9026 0.967
## 1.971 213 2 0.9254 0.01740 0.8920 0.960
## 2.004 211 1 0.9211 0.01786 0.8867 0.957
## 2.037 210 1 0.9167 0.01830 0.8815 0.953
## 2.136 209 2 0.9079 0.01915 0.8711 0.946
## 2.333 207 1 0.9035 0.01955 0.8660 0.943
## 2.595 206 1 0.8991 0.01995 0.8609 0.939
## 2.661 205 2 0.8904 0.02069 0.8507 0.932
## 2.891 203 2 0.8816 0.02140 0.8406 0.925
## 3.023 201 1 0.8772 0.02174 0.8356 0.921
## 3.055 199 1 0.8728 0.02207 0.8306 0.917
## 3.121 198 2 0.8640 0.02271 0.8206 0.910
## 3.450 196 1 0.8596 0.02302 0.8156 0.906
## 3.515 194 2 0.8507 0.02362 0.8056 0.898
## 3.614 192 1 0.8463 0.02391 0.8007 0.894
## 3.811 191 1 0.8418 0.02419 0.7957 0.891
## 3.877 190 1 0.8374 0.02446 0.7908 0.887
## 4.008 189 1 0.8330 0.02473 0.7859 0.883
## 4.304 188 1 0.8285 0.02500 0.7810 0.879
## 4.337 187 2 0.8197 0.02550 0.7712 0.871
## 4.435 185 1 0.8153 0.02575 0.7663 0.867
## 4.665 184 1 0.8108 0.02598 0.7615 0.863
## 4.731 183 1 0.8064 0.02622 0.7566 0.859
## 4.764 182 2 0.7975 0.02667 0.7469 0.852
## 4.830 180 1 0.7931 0.02688 0.7421 0.848
## 5.027 179 1 0.7887 0.02710 0.7373 0.844
## 5.125 178 2 0.7798 0.02751 0.7277 0.836
## 5.355 176 3 0.7665 0.02809 0.7134 0.824
## 5.454 173 2 0.7577 0.02845 0.7039 0.816
## 5.487 171 1 0.7532 0.02863 0.6991 0.811
## 5.585 170 1 0.7488 0.02880 0.6944 0.807
## 5.749 167 1 0.7443 0.02898 0.6896 0.803
## 5.782 165 1 0.7398 0.02915 0.6848 0.799
## 5.815 164 1 0.7353 0.02932 0.6800 0.795
## 5.881 162 2 0.7262 0.02965 0.6704 0.787
## 5.914 160 1 0.7217 0.02981 0.6655 0.783
## 5.947 159 2 0.7126 0.03012 0.6559 0.774
## 5.979 157 1 0.7081 0.03027 0.6511 0.770
## 6.012 156 1 0.7035 0.03041 0.6464 0.766
## 6.111 154 1 0.6989 0.03056 0.6416 0.761
## 6.209 152 1 0.6943 0.03070 0.6367 0.757
## 6.374 149 1 0.6897 0.03085 0.6318 0.753
## 6.472 147 1 0.6850 0.03099 0.6269 0.749
## 6.538 145 1 0.6803 0.03113 0.6219 0.744
## 6.604 144 2 0.6708 0.03141 0.6120 0.735
## 6.637 142 1 0.6661 0.03154 0.6071 0.731
## 6.801 139 1 0.6613 0.03168 0.6020 0.726
## 6.834 138 1 0.6565 0.03181 0.5970 0.722
## 6.899 137 1 0.6517 0.03194 0.5920 0.717
## 6.965 135 1 0.6469 0.03206 0.5870 0.713
## 7.162 134 1 0.6421 0.03218 0.5820 0.708
## 7.294 132 1 0.6372 0.03231 0.5769 0.704
## 7.326 130 1 0.6323 0.03243 0.5718 0.699
## 7.425 126 1 0.6273 0.03256 0.5666 0.694
## 7.524 125 1 0.6223 0.03268 0.5614 0.690
## 7.556 124 1 0.6172 0.03280 0.5562 0.685
## 7.852 121 2 0.6070 0.03304 0.5456 0.675
## 8.049 117 1 0.6019 0.03316 0.5402 0.670
## 8.082 116 1 0.5967 0.03328 0.5349 0.666
## 8.772 112 1 0.5913 0.03341 0.5294 0.661
## 8.805 111 1 0.5860 0.03353 0.5239 0.656
## 8.838 110 1 0.5807 0.03364 0.5184 0.651
## 8.871 108 1 0.5753 0.03376 0.5128 0.645
## 9.298 104 1 0.5698 0.03388 0.5071 0.640
## 9.331 103 1 0.5642 0.03400 0.5014 0.635
## 9.363 101 2 0.5531 0.03424 0.4899 0.624
## 9.396 99 1 0.5475 0.03434 0.4841 0.619
## 9.462 98 1 0.5419 0.03444 0.4784 0.614
## 9.561 97 1 0.5363 0.03454 0.4727 0.608
## 9.626 94 1 0.5306 0.03464 0.4669 0.603
## 9.889 91 1 0.5248 0.03475 0.4609 0.597
## 9.955 89 1 0.5189 0.03485 0.4549 0.592
## 10.021 87 1 0.5129 0.03496 0.4488 0.586
## 10.053 86 1 0.5070 0.03506 0.4427 0.581
## 10.185 85 2 0.4950 0.03523 0.4306 0.569
## 10.513 82 1 0.4890 0.03532 0.4244 0.563
## 10.809 81 1 0.4830 0.03539 0.4183 0.558
## 11.072 79 1 0.4768 0.03547 0.4121 0.552
## 11.170 78 1 0.4707 0.03554 0.4060 0.546
## 11.335 77 1 0.4646 0.03560 0.3998 0.540
## 11.433 76 1 0.4585 0.03565 0.3937 0.534
## 11.499 75 1 0.4524 0.03569 0.3876 0.528
## 11.532 74 1 0.4463 0.03573 0.3815 0.522
## 11.598 73 2 0.4340 0.03578 0.3693 0.510
## 11.860 70 1 0.4278 0.03581 0.3631 0.504
## 11.926 69 2 0.4154 0.03583 0.3508 0.492
## 11.959 67 1 0.4092 0.03582 0.3447 0.486
## 12.189 65 2 0.3966 0.03581 0.3323 0.473
## 12.715 60 1 0.3900 0.03582 0.3258 0.467
## 12.813 59 1 0.3834 0.03582 0.3193 0.460
## 12.945 58 1 0.3768 0.03580 0.3128 0.454
## 13.996 55 1 0.3700 0.03580 0.3060 0.447
## 14.062 54 1 0.3631 0.03579 0.2993 0.440
## 14.094 53 1 0.3563 0.03576 0.2926 0.434
## 14.226 52 1 0.3494 0.03573 0.2860 0.427
## 14.522 51 1 0.3426 0.03568 0.2793 0.420
## 14.587 50 1 0.3357 0.03561 0.2727 0.413
## 14.784 48 1 0.3287 0.03555 0.2659 0.406
## 14.949 47 1 0.3217 0.03548 0.2592 0.399
## 15.014 46 1 0.3147 0.03539 0.2525 0.392
## 15.113 44 1 0.3076 0.03530 0.2456 0.385
## 15.540 43 1 0.3004 0.03520 0.2388 0.378
## 15.671 42 1 0.2933 0.03508 0.2320 0.371
## 17.051 39 1 0.2857 0.03498 0.2248 0.363
## 17.084 38 1 0.2782 0.03485 0.2177 0.356
## 17.216 37 2 0.2632 0.03455 0.2035 0.340
## 17.511 34 1 0.2554 0.03439 0.1962 0.333
## 18.070 32 1 0.2475 0.03423 0.1887 0.325
## 18.333 30 1 0.2392 0.03407 0.1810 0.316
## 18.628 28 1 0.2307 0.03391 0.1729 0.308
## 18.858 27 1 0.2221 0.03371 0.1650 0.299
## 19.154 26 1 0.2136 0.03348 0.1571 0.290
## 20.140 24 1 0.2047 0.03325 0.1489 0.281
## 20.501 23 1 0.1958 0.03297 0.1407 0.272
## 21.060 22 1 0.1869 0.03265 0.1327 0.263
## 21.125 21 1 0.1780 0.03229 0.1247 0.254
## 21.487 20 1 0.1691 0.03188 0.1169 0.245
## 21.520 19 1 0.1602 0.03142 0.1091 0.235
## 22.571 18 1 0.1513 0.03090 0.1014 0.226
## 22.637 17 1 0.1424 0.03034 0.0938 0.216
## 23.162 16 1 0.1335 0.02972 0.0863 0.207
## 23.228 15 1 0.1246 0.02904 0.0789 0.197
## 23.918 14 1 0.1157 0.02830 0.0716 0.187
## 24.016 13 1 0.1068 0.02749 0.0645 0.177
## 24.148 12 1 0.0979 0.02660 0.0575 0.167
## 25.133 10 1 0.0881 0.02568 0.0498 0.156
## 25.988 9 1 0.0783 0.02462 0.0423 0.145
## 26.743 7 1 0.0671 0.02351 0.0338 0.133
## 29.010 4 1 0.0503 0.02285 0.0207 0.123
plot(km)
kms <- survfit(Surv(time, status == 2) ~ sex, data = lung)
kms
## Call: survfit(formula = Surv(time, status == 2) ~ sex, data = lung)
##
## n events median 0.95LCL 0.95UCL
## sex=M 138 112 8.87 6.96 10.2
## sex=W 90 53 14.00 11.43 18.1
We can plot the two resulting survival curves with their confidence intervals
plot(kms, col = c("blue", "red"), lwd = 1, conf.int = TRUE)
lines(kms, col = c("blue", "red"), lwd = 3)
We see that men have worse survival than women, but they are also older in terms of age of diagnosis:
with(lung, tapply(age, sex, mean))
## M W
## 63.34058 61.07778
Formally there is a significant difference in survival between men and women
?survdiff
## starting httpd help server ... done
survdiff(Surv(time, status == 2) ~ sex, data = lung)
## Call:
## survdiff(formula = Surv(time, status == 2) ~ sex, data = lung)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=M 138 112 91.6 4.55 10.3
## sex=W 90 53 73.4 5.68 10.3
##
## Chisq= 10.3 on 1 degrees of freedom, p= 0.001
What is the null hypothesis tested here?
Assumptions We are actually testing whether the rate-ratio is 1, assuming proportional hazards (log-rank test)
Note ci.exp function: ci.exp returns only the exponentiated parameter estimates with confidence intervals. It is merely a wrapper for ci.lin, fishing out the last 3 columns from ci.lin(…,Exp=TRUE). If you just want the estimates and confidence limits, but not exponentiated, use ci.exp(…,Exp=FALSE).
c0 <- coxph(Surv(time, status == 2) ~ sex, data= lung)
c1 <-coxph(Surv(time, status == 2) ~ sex + I(age/10), data = lung)
summary(c1)
## Call:
## coxph(formula = Surv(time, status == 2) ~ sex + I(age/10), data = lung)
##
## n= 228, number of events= 165
##
## coef exp(coef) se(coef) z Pr(>|z|)
## sexW -0.51322 0.59857 0.16746 -3.065 0.00218 **
## I(age/10) 0.17045 1.18584 0.09223 1.848 0.06459 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## sexW 0.5986 1.6707 0.4311 0.8311
## I(age/10) 1.1858 0.8433 0.9897 1.4208
##
## Concordance= 0.603 (se = 0.025 )
## Likelihood ratio test= 14.12 on 2 df, p=9e-04
## Wald test = 13.47 on 2 df, p=0.001
## Score (logrank) test = 13.72 on 2 df, p=0.001
ci.exp(c0)
## exp(Est.) 2.5% 97.5%
## sexW 0.5880028 0.4237178 0.8159848
ci.exp(c1)
## exp(Est.) 2.5% 97.5%
## sexW 0.598566 0.4310936 0.8310985
## I(age/10) 1.185842 0.9897335 1.4208086
We see that there is not much confounding by age; the W/M mortality rate-ration (RR) (hazard ratio, HR, is another word for this) is slightly below 0.6 whether age is included or not. The age effect is formally non-significant, the estimate corresponds to a 19% higher mortality rate per 10 years of age at diagnosis (mortality RR or hazard ratio of 1.1858).
We can check if the assumption of proportional hazards holds, cox.zph provides a test, and the plot method shows the Schoenfeld residuals and a smooth of them; interpretable as an estimate of the interaction effect; that is how the W/M (log) rate-ratio depends on time:
?cox.zph
cox.zph(c0)
## chisq df p
## sex 2.86 1 0.091
## GLOBAL 2.86 1 0.091
(z1 <- cox.zph(c1))
## chisq df p
## sex 2.608 1 0.11
## I(age/10) 0.209 1 0.65
## GLOBAL 2.771 2 0.25
par(mfrow = c(1,2)); plot(z1)
If the proportional hazards model holds, then the resulting lines in he plots should be approximately horizontal. But we shall return to a more explicit way of addressing this.
We see that there is no systematic pattern for age, but an increase by sex. The cox.zph really gives a test for an interaction between each covariate and the time scale.
the Cox model does not provide an estimate of the effect of time on mortality, and the by that token it is impossible to estimate any interactions with time either.
Above we showed the Kaplan-Meier estimator for each of the two sexes. We can also show the estimated survival curves for the two sexes as derived from the Cox-model. This requires a prediction data frame—a data frame with the same variables as in the Cox-model and values of these representing the persons for whom we want predictions:
prs <- survfit(c0, newdata = data.frame(sex = c("M", "W")))
plot(prs, col = c("blue", "red"))
How is the shape of the two cuves relative to each other and to the original KM curve before?
plot(prs, col = c("blue", "red"), lwd = 1, lty =1, conf.int = F)
lines(prs, col = c("blue", "red"), lty = 1, lwd = 3)
lines(kms, col = c("blue", "red"), lty = "11", lwd = 2, lend = "butt")
So far we have only seen the rates through the glasses of survival curves—it is a little odd to try and judge hazards (rates) on a transformed sace, instaed of looking at the hazards directly.
But we do not know how the mortality per se looks as a function of time (since diagnosis). That function is not available from the Cox-model or from the survfit object. To that end we must provide a model for the effect of time on mortality; the simplest is of course to assume that it is constant or a simple linear function of time.
For a start we assume that the mortality is constant over time. If it is so, then the likelihood for the model is equivalent to a Poisson likelihood, which can be fitted using the poisreg family from the Epi package—note in poarticular the the response is a two-column matrix (created with cbind) of events count and person-time (here time, because every one starts at 0):
?poisreg
with(lung, cbind(status == 2, time)[1:10,])
## time
## [1,] 1 10.053
## [2,] 1 14.949
## [3,] 0 33.183
## [4,] 1 6.899
## [5,] 1 29.010
## [6,] 0 33.577
## [7,] 1 10.185
## [8,] 1 11.860
## [9,] 1 7.162
## [10,] 1 5.454
p1 <- glm(cbind(status ==2, time) ~ sex + I(age/10), family = poisreg, data = lung)
ci.exp(p1) # estimates from Poisson regeression
## exp(Est.) 2.5% 97.5%
## (Intercept) 0.03255173 0.01029235 0.1029518
## sexW 0.61820344 0.44555514 0.8577513
## I(age/10) 1.16904426 0.97796555 1.3974567
ci.exp(c1) # estimates from Cox
## exp(Est.) 2.5% 97.5%
## sexW 0.598566 0.4310936 0.8310985
## I(age/10) 1.185842 0.9897335 1.4208086
We see that the estimates of sex and age effects are quite close between the Poisson and the Cox models, but also that the Poisson model has an intercept term, the estimate of the (assumed) constant underlying mortality. Since we entered the risk time part of the response (second argument in the cbind) in units of months (remember we rescaled in the beginning?), the (Intercept) (taken from the ci.exp) is a rate per 1 person-month.
What age and sex does the (Intercept) refer to?
Note the syntax for poisreg differs from poisson, as it allows the use of an off-set value.
px <- glm(status == 2 ~ sex + age, offset = log(time), family = poisson, data = lung)
ci.exp(px)
## exp(Est.) 2.5% 97.5%
## (Intercept) 0.03255173 0.01029254 0.1029499
## sexW 0.61820344 0.44555975 0.8577424
## age 1.01574126 0.99777475 1.0340313
The formulation with the offset is the reason that papers use the description ”…we fitted a Poisson model with log person years as offset”. The drawback of the poisson approach is that you need the (risk) time (person-years) as a variable in the prediction frame. This is not the case for poisreg, where you get the predicted rates per unit in which you entered the person years when specifying the model.
If we want to see how mortality varies by age we must split the follow-up of each person in small intervals of say, 30 days. This is most easily done using a Lexis object. That is basically just taking the lung dataset and adding a few features that defines times and states. The point is that it makes life a lot easier when things get more complex than just simple survival. It can be seen as an expansion of the stset facility in Stata.
?Lexis
L1 <- Lexis(exit = list(tfl = time), exit.status = factor(status, levels = 1:2, labels = c("Alive", "Dead")), data = lung)
## NOTE: entry.status has been set to "Alive" for all.
## NOTE: entry is assumed to be 0 on the tfl timescale.
head(L1)
## lex.id tfl lex.dur lex.Cst lex.Xst inst time status age sex ph.ecog ph.karno
## 1 0 10.05 Alive Dead 3 10.053 2 74 M 1 90
## 2 0 14.95 Alive Dead 3 14.949 2 68 M 0 90
## 3 0 33.18 Alive Alive 3 33.183 1 56 M 0 90
## 4 0 6.90 Alive Dead 5 6.899 2 57 M 1 90
## 5 0 29.01 Alive Dead 1 29.010 2 60 M 0 100
## 6 0 33.58 Alive Alive 12 33.577 1 74 M 1 50
## pat.karno meal.cal wt.loss
## 100 1175 NA
## 90 1225 15
## 90 NA 15
## 60 1150 11
## 90 NA 0
## 80 513 0
There are five new variables added to the dataset:
lex.id: a numerical id of each record in the dataset (normally this will be a person id). Can be set explicitly with the id= argument of Lexis.
tfl: time from lung cancer at the time of entry, therefore it is 0 for all persons; the entry time is 0 from the entry time. The name of this variable was taken from the exit= argument to Lexis.
lex.dur: the length of time a person is in state lex.Cst, here measured in months,because time is.
lex.Cst: Current state, the state in which the lex.dur time is spent.
lex.Xst: eXit state, the state to which the person moves after the lex.dur time in lex.Cst.
This seems a bit of an overkill for keeping track of time and death for the lung cancer patients, but the point here is that this generalizes to multistate data too.
Here is a summary:
summary(L1)
##
## Transitions:
## To
## From Alive Dead Records: Events: Risk time: Persons:
## Alive 63 165 228 165 2286.42 228
What is the average follow-up time for persons?
For a graphical representation:
?boxes
boxes(L1, boxpos = T)
Explain the numbers in the resulting graph. Redo the graph with risk time counted in years
boxes(L1, boxpos = TRUE, scale.Y = 12, digits.R = 3, show.Y = T)
We can make the Cox-analysis using the Lexis-specific variables:
?Surv
c1 <- coxph(Surv(tfl, tfl + lex.dur, lex.Xst == "Dead") ~ sex + age, data=L1)
#An even simpler method using Lexis features:
?coxph.Lexis
cL <- coxph.Lexis(L1, tfl ~ sex + age)
## survival::coxph analysis of Lexis object L1:
## Rates for the transition:
## Alive->Dead
## Baseline timescale: tfl
ci.exp(cL)
## exp(Est.) 2.5% 97.5%
## sexW 0.598566 0.4310936 0.8310985
## age 1.017191 0.9989686 1.0357467
ci.exp(c1)
## exp(Est.) 2.5% 97.5%
## sexW 0.598566 0.4310936 0.8310985
## age 1.017191 0.9989686 1.0357467
We can also make a Poisson analysis:
pc <- glm(cbind(lex.Xst == "Dead", lex.dur) ~ sex + age, family = poisreg, data = L1)
#Once again, made simpler with the Lexis features
pL <- glm.Lexis(L1, ~ sex + age)
## stats::glm Poisson analysis of Lexis object L1 with log link:
## Rates for the transition:
## Alive->Dead
ci.exp(pL)
## exp(Est.) 2.5% 97.5%
## (Intercept) 0.03255173 0.01029235 0.1029518
## sexW 0.61820344 0.44555514 0.8577513
## age 1.01574126 0.99777440 1.0340317
ci.exp(pc)
## exp(Est.) 2.5% 97.5%
## (Intercept) 0.03255173 0.01029235 0.1029518
## sexW 0.61820344 0.44555514 0.8577513
## age 1.01574126 0.99777440 1.0340317
Remember that the Poisson-model fitted is a pretty brutal approximation to the Cox-model—it assumes that the baseline hazard is constant, whereas the Cox-model allows the baseline hazard to vary arbitrarily by time.
#Estimating the hazard function: splitting time
If we want a more detailed version of the baseline hazard we split follow-up time in small intervals, assume that the hazard is constant in each small interval, and assume the the size of the hazard varies smoothly with time, tfl:
We can subdivide the follow up into small intervals using survival:::survSplit, Epi:::splitLexis, or popEpi:::splitMulti. The splitmulti is by far the easiest to use and the fastest. Recall that time has been rescaled to months, so we split in 1 month intervals (0 to 36 months)
SL <- splitMulti(L1, tfl = 0:36)
summary(L1)
##
## Transitions:
## To
## From Alive Dead Records: Events: Risk time: Persons:
## Alive 63 165 228 165 2286.42 228
summary(SL)
##
## Transitions:
## To
## From Alive Dead Records: Events: Risk time: Persons:
## Alive 2234 165 2399 165 2286.42 228
We can see how the follow up for person, 10 say, is in the original and the split dataset:
wh <- names(L1)[1:10] # names of variables in some order
subset(L1, lex.id == 10)[,wh]
## lex.id tfl lex.dur lex.Cst lex.Xst inst time status age sex
## 10 0 5.45 Alive Dead 7 5.454 2 61 M
subset(SL, lex.id == 10)[,wh]
## lex.id tfl lex.dur lex.Cst lex.Xst inst time status age sex
## 10 0 1.00 Alive Alive 7 5.454 2 61 M
## 10 1 1.00 Alive Alive 7 5.454 2 61 M
## 10 2 1.00 Alive Alive 7 5.454 2 61 M
## 10 3 1.00 Alive Alive 7 5.454 2 61 M
## 10 4 1.00 Alive Alive 7 5.454 2 61 M
## 10 5 0.45 Alive Dead 7 5.454 2 61 M
In Sl each record now represents a small interval of follow-up for a person, so each person has many records. The main thing to note here is tfl, which represents the time since lung cancer diagnosis at the beginning of each interval, and lex.dur representing the risk time (“person-years”, in months though)—the length of the interval. The reason we need to split follow-up in small pieces is that the modeling implicitly assumes that the mortality rate is constant within each interval. If intervals are too long, this assumption is a bad approximation.
We can now include a smooth effect of tfl in the Poisson-model allowing the baseline hazard to vary by time since lung Ca. That is done by natural splines:
ps <- glm(cbind(lex.Xst == "Dead", lex.dur) ~ Ns(tfl, knots = seq(0.,36,12)) + sex + age, family = poisreg, data = SL)
ci.exp(ps)
## exp(Est.) 2.5% 97.5%
## (Intercept) 0.01898426 0.005700994 0.06321739
## Ns(tfl, knots = seq(0, 36, 12))1 2.40380283 0.809425097 7.13873100
## Ns(tfl, knots = seq(0, 36, 12))2 4.15015448 0.436289141 39.47790716
## Ns(tfl, knots = seq(0, 36, 12))3 0.83994013 0.043932122 16.05885147
## sexW 0.59871596 0.431231819 0.83124850
## age 1.01658684 0.998376760 1.03512907
#Or a simpler method with Lexis
ps <- glm.Lexis(SL, ~Ns(tfl, knots =seq(0,36,12)) +age +sex)
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead
ci.exp(ps)
## exp(Est.) 2.5% 97.5%
## (Intercept) 0.01898426 0.005700994 0.06321739
## Ns(tfl, knots = seq(0, 36, 12))1 2.40380283 0.809425097 7.13873100
## Ns(tfl, knots = seq(0, 36, 12))2 4.15015448 0.436289141 39.47790716
## Ns(tfl, knots = seq(0, 36, 12))3 0.83994013 0.043932122 16.05885147
## age 1.01658684 0.998376760 1.03512907
## sexW 0.59871596 0.431231819 0.83124850
Compate these to the regresssion estimates from the Cox0model and from the model with constant baseline:
ests <- cbind(ci.exp(c1), ci.exp(ps, subset = c("sex", "age")), ci.exp(pc, subset = c("sex", "age")))
colnames(ests)[1:3*3-2] <- c("Cox", "Pois-spline", "Pois-const")
round(ests,3)
## Cox 2.5% 97.5% Pois-spline 2.5% 97.5% Pois-const 2.5% 97.5%
## sexW 0.599 0.431 0.831 0.599 0.431 0.831 0.618 0.446 0.858
## age 1.017 0.999 1.036 1.017 0.998 1.035 1.016 0.998 1.034
We see that the smooth parametric Poisson model and the Cox model produce virtually the same estimates of age and sex-effects, whereas the Poisson model with constant hazard produce slightly different ones.
The proportional hazards assumption is the same for the Cox model and both the Poisson models: The M/W hazard ratio is the same at any time after diagnosis. What differs between the models is the assumed shape of the hazard (not a hazard ratio).
The Cox model allows the baseline rate to change arbitrarily at every event time, not using the quantitative nature of time; the spilne Poisson model has a baseline that varies smoothly by time and the constant Poisson model has a baseline that is constant over time. The latter is clearly not tenable, whereas the smooth Poisson model and the Cox model give the same regression estimates.
So we now have a parametric model for the baseline hazard which means that we can show the estimated baseline hazard for, say, a 60 year old woman, by supplying a suitable prediction frame, i.e. a data frame where each row represents a set of covariate values (including the time) where we want the predicted mortality—her we use times from 0 to 30 months in steps of 0.2 months:
prf <- data.frame(tfl = seq(0,30,0.2), sex ="W", age = 60)
The choice of prediction points are totally unrelated to the intervals in which we split the follow-up for analysis. We can over-plot with the predicted rates from the model where mortality rates are constant, the only change is the model (pc instead of ps):
matshade(prf$tfl, ci.pred(ps, prf), lwd = 3, plot = T, log = "y")
matshade(prf$tfl, ci.pred(pc, prf), lwd = 3, lty = "21", lend = "butt")
What we see from the plot is that mortality rates are increasing during the first 1.5 years after lung cancer and then leveling off.
Put some sensible axis labels on the plot, and rescale the rates to rates per 1 person-year.
We can transform the hazard function, λ(t), to a survival function, S(t) using the relationship S(t) = exp(− R t 0 λ(u) du). This integration exercise is implemented in the ci.surv function, which takes a model and a prediction data frame as arguments; the prediction data frame must correspond to a sequence of equidistant time points (makes life easier for the coder), so we can use prf for this purpose:
matshade(prf$tfl, ci.surv(ps, prf, intl = 0.2),
plot = TRUE, ylim = 0:1, lwd = 3)
This is the survival function for a 60 year old woman. We can expand this by overlaying the survival function from the model with constant hazard (also known as ”exponential(y distributed) survival”) and the KM-estimator
matshade(prf$tfl, ci.surv(ps, prf, intl = 0.2),
plot = TRUE, ylim = 0:1, lwd = 3)
lines(prf$tfl, ci.surv(pc, prf, intl = 0.2)[,1], lwd = 2, col = gray(0.5))
lines(survfit(c1, newdata = data.frame(sex = "W", age = 60)),
lwd = 2, lty = 1)
We see that the survival function from the constant hazard model is quite a bit off, but also a good correspondence between the Cox-model based survival and the survival from the parametric hazard function. We can bring the plots together in one graph:
par(mfrow = c(1,2))
# hazard scale
matshade(prf$tfl, ci.pred(ps, prf),
plot = TRUE, log = "y", lwd = 3, xlim = c(0,30))
matshade(prf$tfl, ci.pred(pc, prf), lty = 3, lwd = 3)
#
# survival
matshade(prf$tfl, ci.surv(ps, prf, intl = 0.2),
plot = TRUE, ylim = 0:1, lwd = 3, xlim = c(0,30))
matshade(prf$tfl, ci.surv(pc, prf, intl = 0.2),
lty = 3, alpha = 0, lwd = 3)
lines(survfit(c1, newdata = data.frame(sex = "W", age = 60)),
col = "forestgreen", lwd = 3)
We have compared the predicted survival curve from a Poisson model with age and sex and time since lung cancer as covariates to that from a Cox-model with age and sex as covariates and time since lung cancer as underlying time scale.
We now go back to the Kaplan-Meier estimator and compare that to the corresponding Poisson-model, which is one with time (tfl) as the only covariate:
par(mfrow=c(1,2))
pk <- glm(cbind(lex.Xst == "Dead",
lex.dur) ~ Ns(tfl, knots = seq(0, 36, 12)),
family = poisreg,
data = SL)
#or
pk <- glm.Lexis(SL, ~ Ns(tfl, knots = seq(0, 36, 12)))
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead
# hazard
matshade(prf$tfl, ci.pred(pk, prf),
plot = TRUE, log = "y", lwd = 3, ylim = c(0.01,1))
# survival from smooth model
matshade(prf$tfl, ci.surv(pk, prf, intl = 0.2) ,
plot = TRUE, lwd = 3, ylim = 0:1)
# K-M estimator
lines(km, lwd = 2)
We can explore how the tightness of the knots in the smooth model influence the underlying hazard and the resulting survival function. This is easiest done by setting up a function that does the analysis with different distance between of knots
zz <-
function(dk)
{
kn <<- seq(0, 36, dk) # must be in global environment...
print(kn)
pk <<- glm.Lexis(SL, ~ Ns(tfl, knots = kn))
matshade(prf$tfl, ci.pred(pk, prf),
plot = TRUE, log = "y", lwd = 2, ylim = c(0.01,1), xlim = c(0,30))
rug(kn, lwd=3)
plot(km, lwd = 2, col = "limegreen", xlim = c(0,30))
matshade(prf$tfl, ci.surv(pk, prf, intl = 0.2) ,
lwd = 2, ylim = 0:1, xlim = c(0,30))
}
par(mfrow = c(1,2))
zz(18)
## [1] 0 18 36
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead
## Warning in rug(kn, lwd = 3): some values will be clipped
zz(12)
## [1] 0 12 24 36
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead
## Warning in rug(kn, lwd = 3): some values will be clipped
zz(6)
## [1] 0 6 12 18 24 30 36
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead
## Warning in rug(kn, lwd = 3): some values will be clipped
zz(4)
## [1] 0 4 8 12 16 20 24 28 32 36
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning: some values will be clipped
zz(3)
## [1] 0 3 6 9 12 15 18 21 24 27 30 33 36
## stats::glm Poisson analysis of Lexis object SL with log link:
## Rates for the transition:
## Alive->Dead
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning: some values will be clipped
You will see that the more knots you include, the closer the parametric estimate gets to the Kaplan-Meier estimator. But also that the estimated underlying hazard becomes increasingly silly. The ultimate silliness is of course achieved when we arrive at the Kaplan-Meier estimator.
Fortunately the baseline hazard underlying the Kaplan-Meier and the Breslow estimator is rarely shown.
#Conclusion
The Cox-model and the Poisson-model gives the same results for the regression parameters, provided the baseline is properly modeled
The Cox-model is a partial model, it only gives the HRs, not the rates. The baseline rates are missing.
The Poisson-model is a full model for the rates. Any set of predicted rates (and derivatives thereof) can be derived from the Poisson model.