Life-Table Analysis

1. Verify that the calculations of total time of exposure and person-days experienced appears to be correct by reviewing the contents of the dataset.

# setting the environment
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
binData = read_csv("binlymph.csv")
## Rows: 20 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): stage, bin, D, P_Days, I_Rate, L, mid_days, P, Survival, Stage3, S...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(binData)
## # A tibble: 20 × 11
##    stage   bin     D P_Days  I_Rate     L mid_days     P Survival Stage3 Stage4
##    <dbl> <dbl> <dbl>  <dbl>   <dbl> <dbl>    <dbl> <dbl>    <dbl>  <dbl>  <dbl>
##  1     0     0     1    132 0.00758     7      3.5 0.947    0.947  1     NA    
##  2     0     7     0    144 0           8     11   1        0.947  0.947 NA    
##  3     0    15     1    259 0.00386    15     22.5 0.942    0.892  0.947 NA    
##  4     0    30     3    429 0.00699    30     45   0.790    0.705  0.892 NA    
##  5     0    60     0    390 0          30     75   1        0.705  0.705 NA    
##  6     0    90     1    364 0.00275    30    105   0.918    0.647  0.705 NA    
##  7     0   120     1    336 0.00298    30    135   0.911    0.589  0.647 NA    
##  8     0   150     0    319 0          30    165   1        0.589  0.589 NA    
##  9     0   180     2    703 0.00284    90    225   0.744    0.438  0.589 NA    
## 10     0   270     0    227 0          NA    315   1        0.438  0.438 NA    
## 11     1     0     1    109 0.00917     7      3.5 0.936    0.936 NA      1    
## 12     1     7     3    109 0.0275      8     11   0.780    0.730 NA      0.936
## 13     1    15     0    180 0          15     22.5 1        0.730 NA      0.730
## 14     1    30     4    297 0.0135     30     45   0.596    0.435 NA      0.730
## 15     1    60     3    205 0.0146     30     75   0.561    0.244 NA      0.435
## 16     1    90     1    123 0.00813    30    105   0.756    0.184 NA      0.244
## 17     1   120     0    120 0          30    135   1        0.184 NA      0.184
## 18     1   150     1    115 0.00870    30    165   0.739    0.136 NA      0.184
## 19     1   180     0    247 0          90    225   1        0.136 NA      0.136
## 20     1   270     0     96 0          NA    315   1        0.136 NA      0.136
summary(binData)
##      stage          bin              D            P_Days     
##  Min.   :0.0   Min.   :  0.0   Min.   :0.00   Min.   : 96.0  
##  1st Qu.:0.0   1st Qu.: 15.0   1st Qu.:0.00   1st Qu.:122.2  
##  Median :0.5   Median : 75.0   Median :1.00   Median :216.0  
##  Mean   :0.5   Mean   : 92.2   Mean   :1.10   Mean   :245.2  
##  3rd Qu.:1.0   3rd Qu.:150.0   3rd Qu.:1.25   3rd Qu.:323.2  
##  Max.   :1.0   Max.   :270.0   Max.   :4.00   Max.   :703.0  
##                                                              
##      I_Rate               L         mid_days           P         
##  Min.   :0.000000   Min.   : 7   Min.   :  3.5   Min.   :0.5610  
##  1st Qu.:0.000000   1st Qu.:15   1st Qu.: 22.5   1st Qu.:0.7739  
##  Median :0.002911   Median :30   Median : 90.0   Median :0.9389  
##  Mean   :0.005431   Mean   :30   Mean   :110.2   Mean   :0.8810  
##  3rd Qu.:0.008271   3rd Qu.:30   3rd Qu.:165.0   3rd Qu.:1.0000  
##  Max.   :0.027523   Max.   :90   Max.   :315.0   Max.   :1.0000  
##                     NA's   :2                                    
##     Survival          Stage3           Stage4      
##  Min.   :0.1363   Min.   :0.4383   Min.   :0.1363  
##  1st Qu.:0.2291   1st Qu.:0.6035   1st Qu.:0.1845  
##  Median :0.5891   Median :0.7050   Median :0.3394  
##  Mean   :0.5375   Mean   :0.7459   Mean   :0.4716  
##  3rd Qu.:0.7297   3rd Qu.:0.9333   3rd Qu.:0.7297  
##  Max.   :0.9470   Max.   :1.0000   Max.   :1.0000  
##                   NA's   :10       NA's   :10
# calculation of total time of exposure (=total person time, person-days)
total_pt <- sum(binData$P_Days)
total_pt
## [1] 4904
#plot of S(t) –vs.- mid_days for each group
qplot(x=mid_days, y=Survival,
      col=factor(stage, labels=c("Stage 3", "Stage 4")),
      data=binData) + labs(col="Cancer Stage")
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

2. Rename variables for simplicity

# Rename variables for simplicity : 
  # D is the number of deaths, 
  # P_Days is the person-days accumulated in the bin
  # mid_days is the midpoint of time bin.

binData = binData %>%
  mutate(t=mid_days) %>%
  mutate(N = P_Days)

3. Fit several log-linear Poisson regression models to the grouped survival data - Generate time terms, centered and spline, interaction terms

# Generate time terms, centered and spline:
binData = binData %>%
  mutate(t60 = t-60) %>%
  mutate(t60sp = ifelse(t > 60, t-60, 0))

# Model A: stage
modelA = glm(D ~ stage, offset=log(N), family=poisson(link="log"), data=binData)
summary(modelA)
## 
## Call:
## glm(formula = D ~ stage, family = poisson(link = "log"), data = binData, 
##     offset = log(N))
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.9054     0.3333 -17.716   <2e-16 ***
## stage         1.0919     0.4336   2.518   0.0118 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 30.505  on 19  degrees of freedom
## Residual deviance: 24.053  on 18  degrees of freedom
## AIC: 56.908
## 
## Number of Fisher Scoring iterations: 5
modelA$coefficients; confint.default(modelA) ## coefficients
## (Intercept)       stage 
##   -5.905362    1.091927
##                  2.5 %    97.5 %
## (Intercept) -6.5586828 -5.252041
## stage        0.2420319  1.941823
exp(modelA$coefficients); exp(confint.default(modelA)) ## IRR
## (Intercept)       stage 
## 0.002724796 2.980012492
##                   2.5 %     97.5 %
## (Intercept) 0.001417752 0.00523682
## stage       1.273834818 6.97144899
# Model B: stage + t-60
modelB = glm(D ~ stage + t60, offset=log(N),
             family=poisson(link="log"), data=binData)
summary(modelB)
## 
## Call:
## glm(formula = D ~ stage + t60, family = poisson(link = "log"), 
##     data = binData, offset = log(N))
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.603146   0.340600 -16.451   <2e-16 ***
## stage        0.942699   0.437674   2.154   0.0312 *  
## t60         -0.006977   0.003166  -2.204   0.0275 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 30.505  on 19  degrees of freedom
## Residual deviance: 18.019  on 17  degrees of freedom
## AIC: 52.874
## 
## Number of Fisher Scoring iterations: 5
modelB$coefficients; confint.default(modelB) ## coefficients
##  (Intercept)        stage          t60 
## -5.603146449  0.942699007 -0.006976953
##                   2.5 %        97.5 %
## (Intercept) -6.27070994 -4.9355829625
## stage        0.08487278  1.8005252297
## t60         -0.01318209 -0.0007718107
exp(modelB$coefficients); exp(confint.default(modelB)) ## IRR
## (Intercept)       stage         t60 
## 0.003686247 2.566900159 0.993047330
##                   2.5 %     97.5 %
## (Intercept) 0.001890886 0.00718627
## stage       1.088578574 6.05282575
## t60         0.986904409 0.99922849
# Model C: stage + t-60 + (t-60)^+
modelC = glm(D ~ stage + t60 + t60sp, offset=log(N),
             family=poisson(link="log"), data=binData)
summary(modelC)
## 
## Call:
## glm(formula = D ~ stage + t60 + t60sp, family = poisson(link = "log"), 
##     data = binData, offset = log(N))
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.4322926  0.4621789 -11.754   <2e-16 ***
## stage        0.9514927  0.4378395   2.173   0.0298 *  
## t60         -0.0006996  0.0123224  -0.057   0.9547    
## t60sp       -0.0081450  0.0154147  -0.528   0.5972    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 30.505  on 19  degrees of freedom
## Residual deviance: 17.736  on 16  degrees of freedom
## AIC: 54.591
## 
## Number of Fisher Scoring iterations: 5
modelC$coefficients; confint.default(modelC) ## coefficients
##   (Intercept)         stage           t60         t60sp 
## -5.4322926309  0.9514926947 -0.0006995638 -0.0081450320
##                   2.5 %      97.5 %
## (Intercept) -6.33814667 -4.52643860
## stage        0.09334297  1.80964242
## t60         -0.02485107  0.02345194
## t60sp       -0.03835733  0.02206726
exp(modelC$coefficients); exp(confint.default(modelC)) ## IRR
## (Intercept)       stage         t60       t60sp 
## 0.004373058 2.589572216 0.999300681 0.991888049
##                   2.5 %     97.5 %
## (Intercept) 0.001767575 0.01081914
## stage       1.097838197 6.10826284
## t60         0.975455177 1.02372910
## t60sp       0.962368999 1.02231255
# Model D: stage + t-60 + (t-60)^+ + stage*(t-60) + stage*(t-60)^+
modelD = glm(D ~ stage + t60 + t60sp + stage:t60 + stage:t60sp, offset=log(N), 
             family=poisson(link="log"), data=binData)
summary(modelD)
## 
## Call:
## glm(formula = D ~ stage + t60 + t60sp + stage:t60 + stage:t60sp, 
##     family = poisson(link = "log"), data = binData, offset = log(N))
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.792139   0.632165  -9.162   <2e-16 ***
## stage        1.614121   0.804147   2.007   0.0447 *  
## t60         -0.009640   0.019928  -0.484   0.6286    
## t60sp        0.005976   0.023829   0.251   0.8020    
## stage:t60    0.016566   0.025548   0.648   0.5167    
## stage:t60sp -0.029494   0.032583  -0.905   0.3654    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 30.505  on 19  degrees of freedom
## Residual deviance: 16.233  on 14  degrees of freedom
## AIC: 57.088
## 
## Number of Fisher Scoring iterations: 5
modelD$coefficients; confint.default(modelD) ## coefficients
##  (Intercept)        stage          t60        t60sp    stage:t60  stage:t60sp 
## -5.792139416  1.614121315 -0.009639959  0.005975502  0.016565567 -0.029494184
##                   2.5 %      97.5 %
## (Intercept) -7.03115989 -4.55311894
## stage        0.03802288  3.19021975
## t60         -0.04869908  0.02941916
## t60sp       -0.04072942  0.05268042
## stage:t60   -0.03350662  0.06663775
## stage:t60sp -0.09335523  0.03436687
exp(modelD$coefficients); exp(confint.default(modelD)) ## IRR
## (Intercept)       stage         t60       t60sp   stage:t60 stage:t60sp 
## 0.003051447 5.023471935 0.990406356 1.005993391 1.016703537 0.970936525
##                    2.5 %     97.5 %
## (Intercept) 0.0008839059  0.0105343
## stage       1.0387549969 24.2937655
## t60         0.9524677004  1.0298562
## t60sp       0.9600888770  1.0540927
## stage:t60   0.9670485101  1.0689082
## stage:t60sp 0.9108698702  1.0349642

4. Identify the “best” prediction by using AIC

# using AIC, identify the “best” prediction
AIC(modelA, modelB, modelC, modelD)
##        df      AIC
## modelA  2 56.90787
## modelB  3 52.87416
## modelC  4 54.59051
## modelD  6 57.08824

Kaplan-Meier (K-M) Analysis

5. Calculate Kaplan-Meier (K-M) estimates of the survival curve with 95% CI separately for each group

# import dataset
lymphData = read_csv("lymphoma.csv")
## Rows: 35 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): id, stage, days, died
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(lymphData)
## # A tibble: 6 × 4
##      id stage  days  died
##   <dbl> <dbl> <dbl> <dbl>
## 1     1     0     6     1
## 2     2     0    19     1
## 3     3     0    32     1
## 4     4     0    42     1
## 5     5     0    42     1
## 6     6     0    43     0
library(survival)

lymphData$SurvObj = with(lymphData, Surv(days, died == 1))

km.stage = survfit(SurvObj ~ stage, data = lymphData,
                   type="kaplan-meier", conf.type="log-log")
summary(km.stage)
## Call: survfit(formula = SurvObj ~ stage, data = lymphData, type = "kaplan-meier", 
##     conf.type = "log-log")
## 
##                 stage=0 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     6     19       1    0.947  0.0512        0.681        0.992
##    19     18       1    0.895  0.0704        0.641        0.973
##    32     17       1    0.842  0.0837        0.587        0.946
##    42     16       2    0.737  0.1010        0.479        0.881
##    94     13       1    0.680  0.1080        0.421        0.842
##   126     12       1    0.623  0.1129        0.367        0.800
##   207     10       1    0.561  0.1176        0.308        0.753
##   253      7       1    0.481  0.1251        0.230        0.694
## 
##                 stage=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     4     16       1    0.938  0.0605       0.6323        0.991
##    10     15       1    0.875  0.0827       0.5860        0.967
##    11     14       1    0.812  0.0976       0.5246        0.935
##    13     13       1    0.750  0.1083       0.4634        0.898
##    31     12       1    0.688  0.1159       0.4046        0.856
##    40     11       1    0.625  0.1210       0.3486        0.811
##    50     10       1    0.562  0.1240       0.2954        0.762
##    56      9       1    0.500  0.1250       0.2452        0.710
##    68      8       1    0.438  0.1240       0.1981        0.656
##    82      7       1    0.375  0.1210       0.1542        0.598
##    85      6       1    0.312  0.1159       0.1139        0.536
##    93      5       1    0.250  0.1083       0.0775        0.472
##   175      4       1    0.188  0.0976       0.0460        0.402

6. Plot the K-M curves against time

# Plot the K-M curves against time
plot(km.stage, col=c("blue","red"),
     main="Kaplan-Meier survival estimates by cancer stage",
     ylab="S(t)", xlab="time" )
legend("bottomleft", c("Stage 3", "Stage 4"),
       col=c("blue", "red"), lty=1)

# Carry out a log-rank
survdiff(SurvObj ~ stage, data=lymphData)
## Call:
## survdiff(formula = SurvObj ~ stage, data = lymphData)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## stage=0 19        9    14.01      1.79      5.09
## stage=1 16       13     7.99      3.15      5.09
## 
##  Chisq= 5.1  on 1 degrees of freedom, p= 0.02

7. Fit a Cox proportional hazards model

# Fit a Cox proportional hazards model 
model1 = coxph(SurvObj ~ stage, data = lymphData, ties="breslow")
summary(model1)
## Call:
## coxph(formula = SurvObj ~ stage, data = lymphData, ties = "breslow")
## 
##   n= 35, number of events= 22 
## 
##         coef exp(coef) se(coef)     z Pr(>|z|)  
## stage 0.9576    2.6054   0.4402 2.175   0.0296 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##       exp(coef) exp(-coef) lower .95 upper .95
## stage     2.605     0.3838     1.099     6.175
## 
## Concordance= 0.623  (se = 0.053 )
## Likelihood ratio test= 4.83  on 1 df,   p=0.03
## Wald test            = 4.73  on 1 df,   p=0.03
## Score (logrank) test = 5.07  on 1 df,   p=0.02