Load some data

library(readr)
df <- read.csv("C:/Users/Francs W/Downloads/rotterdam.csv")
head(df)
##   rownames pid year age meno  size grade nodes pgr  er hormon chemo rtime recur
## 1     1393   1 1992  74    1  <=20     3     0  35 291      0     0  1799     0
## 2     1416   2 1984  79    1 20-50     3     0  36 611      0     0  2828     0
## 3     2962   3 1983  44    0  <=20     2     0 138   0      0     0  6012     0
## 4     1455   4 1985  70    1 20-50     3     0   0  12      0     0  2624     0
## 5      977   5 1983  75    1  <=20     3     0 260 409      0     0  4915     0
## 6      617   6 1983  52    0  <=20     3     0 139 303      0     0  5888     0
##   dtime death
## 1  1799     0
## 2  2828     0
## 3  6012     0
## 4  2624     0
## 5  4915     0
## 6  5888     0
colnames(df)
##  [1] "rownames" "pid"      "year"     "age"      "meno"     "size"    
##  [7] "grade"    "nodes"    "pgr"      "er"       "hormon"   "chemo"   
## [13] "rtime"    "recur"    "dtime"    "death"

load relevant libraries

library(survival)
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(ggsurvfit)
# 1. Censoring data((1 = TRUE and 0 = FALSE)
unique(df$death)
## [1] 0 1
df <- df%>% mutate(status=death)
head(df)
##   rownames pid year age meno  size grade nodes pgr  er hormon chemo rtime recur
## 1     1393   1 1992  74    1  <=20     3     0  35 291      0     0  1799     0
## 2     1416   2 1984  79    1 20-50     3     0  36 611      0     0  2828     0
## 3     2962   3 1983  44    0  <=20     2     0 138   0      0     0  6012     0
## 4     1455   4 1985  70    1 20-50     3     0   0  12      0     0  2624     0
## 5      977   5 1983  75    1  <=20     3     0 260 409      0     0  4915     0
## 6      617   6 1983  52    0  <=20     3     0 139 303      0     0  5888     0
##   dtime death status
## 1  1799     0      0
## 2  2828     0      0
## 3  6012     0      0
## 4  2624     0      0
## 5  4915     0      0
## 6  5888     0      0
table(df$status)
## 
##    0    1 
## 1710 1272
# 2. Time-to-event 
df$dtime %>% head()
## [1] 1799 2828 6012 2624 4915 5888
df <- df %>% mutate(dtime_yrs = dtime/365.25)
head(df)
##   rownames pid year age meno  size grade nodes pgr  er hormon chemo rtime recur
## 1     1393   1 1992  74    1  <=20     3     0  35 291      0     0  1799     0
## 2     1416   2 1984  79    1 20-50     3     0  36 611      0     0  2828     0
## 3     2962   3 1983  44    0  <=20     2     0 138   0      0     0  6012     0
## 4     1455   4 1985  70    1 20-50     3     0   0  12      0     0  2624     0
## 5      977   5 1983  75    1  <=20     3     0 260 409      0     0  4915     0
## 6      617   6 1983  52    0  <=20     3     0 139 303      0     0  5888     0
##   dtime death status dtime_yrs
## 1  1799     0      0  4.925394
## 2  2828     0      0  7.742642
## 3  6012     0      0 16.459959
## 4  2624     0      0  7.184120
## 5  4915     0      0 13.456537
## 6  5888     0      0 16.120465

KAPLAN-MEIER

#we create a survival object
surv_obj <- Surv(time = df$dtime_yrs, event = df$status)
head(surv_obj)
## [1]  4.925394+  7.742642+ 16.459959+  7.184120+ 13.456537+ 16.120465+
#Create survival curve
s1 <- survfit(surv_obj ~ 1, data = df)
#Plot
ggsurvfit(s1, linewidth = 1) +
  labs(title = "Kaplan-Meier Graph", x = 'Years', y = 'Overall survival') +
  add_confidence_interval() +
  add_risktable() 

The survival probability steadily declines over time, meaning more individuals experience the event (death). Around 50% of the population has experienced the event, this suggests a median survival time of approximately 10 years. The confidence interval becomes wider over time indicating increased uncertainty as fewer patients remain at risk.

NELSON-AALEN ESTIMATOR

#survfit object for Nelson-Aalen
na_fit <- survfit(surv_obj ~ 1, data = df, type = "fleming-harrington")
# Nelson-Aalen cumulative hazard:
ggsurvfit(na_fit, linewidth = 1, type = "cumhaz") +
  add_confidence_interval() +
  add_risktable() +
  labs(title = "Nelson-Aalen Cumulative Graph", x = 'Years', y = 'cumulative hazard')

As the cumulative hazard increases, the risk of experiencing the event (death) also increases. The wider interval after 15 years suggests greater uncertainty due to fewer patients remaining at risk.

Log rank test is used to whether there is a difference in survival times between two groups.we will therefore construct a Kaplan meier curve showing the difference between survival probability with chemo and no chemo

table(df$chemo)
## 
##    0    1 
## 2402  580
s2 <- survfit(surv_obj ~ chemo, data = df)
ggsurvplot(s2, data = df,
           size = 1,
           palette = c('pink', 'maroon'),
           censor.shape = '|', censor.size = 4,
           conf.int = TRUE,
           pval = TRUE,
           risk.table = TRUE,
           risk.table.col = 'strata',
           legend.labs = list('0' = 'No chemo', '1' = 'Chemo'),
           risk.table.height = 0.25,
           ggtheme = theme_bw())

The curves are slightly divergent. Since p > 0.05, the difference in survival between the two groups is not statistically significant. This suggests that chemotherapy does not provide a significant survival benefit in this dataset.

Log rank test

Now we can compute the test of the difference between the survival curves using survdiff

#Comparing survival dtimes between groups
logrankres_chemo <- survdiff(Surv(dtime_yrs, status) ~ chemo, data = df)
logrankres_chemo
## Call:
## survdiff(formula = Surv(dtime_yrs, status) ~ chemo, data = df)
## 
##            N Observed Expected (O-E)^2/E (O-E)^2/V
## chemo=0 2402     1014     1024    0.0963     0.495
## chemo=1  580      258      248    0.3977     0.495
## 
##  Chisq= 0.5  on 1 degrees of freedom, p= 0.5

The p-value of 0.5 suggests that chemotherapy does not have a significant impact on survival in this data set. The survival curves for the two groups (as seen in the Kaplan-Meier plot) are not significantly different.

# What about hormonal treatment?
logrankres_hor <- survdiff(Surv(dtime_yrs, status) ~ hormon, data = df)
logrankres_hor
## Call:
## survdiff(formula = Surv(dtime_yrs, status) ~ hormon, data = df)
## 
##             N Observed Expected (O-E)^2/E (O-E)^2/V
## hormon=0 2643     1113     1162      2.04      23.7
## hormon=1  339      159      110     21.43      23.7
## 
##  Chisq= 23.7  on 1 degrees of freedom, p= 1e-06

Patients receiving hormonal treatment had significantly better survival compared to those who did not. The observed deaths in the hormonal treatment group are much lower than expected, suggesting a strong survival benefit. This result contrasts with chemotherapy, which showed no significant effect on survival.

Cox regression

#Fit the model
cox_res <- coxph(Surv(dtime_yrs, status) ~ hormon + chemo + size + er + pgr + nodes + meno + grade + age, data = df)
cox_res
## Call:
## coxph(formula = Surv(dtime_yrs, status) ~ hormon + chemo + size + 
##     er + pgr + nodes + meno + grade + age, data = df)
## 
##                 coef  exp(coef)   se(coef)      z        p
## hormon    -6.553e-02  9.366e-01  8.840e-02 -0.741 0.458535
## chemo      5.032e-02  1.052e+00  8.198e-02  0.614 0.539342
## size>50    8.222e-01  2.276e+00  9.142e-02  8.993  < 2e-16
## size20-50  4.425e-01  1.557e+00  6.536e-02  6.771 1.28e-11
## er        -5.512e-05  9.999e-01  1.107e-04 -0.498 0.618466
## pgr       -3.676e-04  9.996e-01  1.226e-04 -2.998 0.002720
## nodes      7.295e-02  1.076e+00  4.879e-03 14.953  < 2e-16
## meno       7.046e-02  1.073e+00  1.006e-01  0.701 0.483583
## grade      3.156e-01  1.371e+00  7.082e-02  4.456 8.33e-06
## age        1.406e-02  1.014e+00  3.830e-03  3.671 0.000242
## 
## Likelihood ratio test=524.4  on 10 df, p=< 2.2e-16
## n= 2982, number of events= 1272
# Before interpreting the results, verify whether the proportional hazards assumptions are respected
# Cox model assumes that the HR between any two groups remains constant over time
test <- survival::cox.zph(cox_res)
test
##         chisq df       p
## hormon  0.684  1 0.40818
## chemo   2.100  1 0.14727
## size    5.206  2 0.07405
## er     59.748  1 1.1e-14
## pgr    41.637  1 1.1e-10
## nodes   4.073  1 0.04358
## meno    4.284  1 0.03846
## grade   3.163  1 0.07533
## age    13.954  1 0.00019
## GLOBAL 93.751 10 9.6e-16

Significant Violations (p < 0.05): ER (estrogen receptor), PGR (progesterone receptor), lymph nodes, menopause, and age violate the proportional hazards assumption. Global p-value = 9.6e-16, indicating overall model issues.

Moderate Concerns: Tumor size and grade are borderline (p ~ 0.07).

No Violation (p > 0.05): Hormonal treatment and chemotherapy satisfy the assumption