title: “SURVIVAL ANALYSIS” author: “Teddy Wambua 21/05600” date: “2025-03-01” output: html_document

library(survival)
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(ggplot2)
# Load lung dataset
data(lung)  
## Warning in data(lung): data set 'lung' not found
head(lung)
##   inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1    3  306      2  74   1       1       90       100     1175      NA
## 2    3  455      2  68   1       0       90        90     1225      15
## 3    3 1010      1  56   1       0       90        90       NA      15
## 4    5  210      2  57   1       1       90        60     1150      11
## 5    1  883      2  60   1       0      100        90       NA       0
## 6   12 1022      1  74   1       1       50        80      513       0
lung <- na.omit(lung)  # Remove missing values

# Convert survival time and event status
lung$SurvObj <- with(lung, Surv(time, status == 2))

# Check structure
str(lung)
## 'data.frame':    167 obs. of  11 variables:
##  $ inst     : num  3 5 12 7 11 1 7 6 12 22 ...
##  $ time     : num  455 210 1022 310 361 ...
##  $ status   : num  2 2 1 2 2 2 2 2 2 2 ...
##  $ age      : num  68 57 74 68 71 53 61 57 57 70 ...
##  $ sex      : num  1 1 1 2 2 1 1 1 1 1 ...
##  $ ph.ecog  : num  0 1 1 2 2 1 2 1 1 1 ...
##  $ ph.karno : num  90 90 50 70 60 70 70 80 80 90 ...
##  $ pat.karno: num  90 60 80 60 80 80 70 80 70 100 ...
##  $ meal.cal : num  1225 1150 513 384 538 ...
##  $ wt.loss  : num  15 11 0 10 1 16 34 27 60 -5 ...
##  $ SurvObj  : 'Surv' num [1:167, 1:2]  455   210  1022+  310   361   218   166   170   567   613  ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:2] "time" "status"
##   ..- attr(*, "type")= chr "right"
##  - attr(*, "na.action")= 'omit' Named int [1:61] 1 3 5 12 13 14 16 20 23 25 ...
##   ..- attr(*, "names")= chr [1:61] "1" "3" "5" "12" ...
# Fit Kaplan-Meier survival curve
km_fit <- survfit(Surv(time, status == 2) ~ 1, data = lung)

# Plot Kaplan-Meier curve
ggsurvplot(km_fit, data = lung, conf.int = TRUE,
           xlab = "Time in days", ylab = "Survival Probability",
           ggtheme = theme_minimal(),
           risk.table = TRUE) +
  labs(title = "Kaplan-Meier Survival Curve")

  1. Interpretation: Patients have a declining survival probability over time; steep drops indicate early deaths.
# Fit Cox model 
cox_model <- coxph(Surv(time, status == 2) ~ 1, data = lung)

# Compute cumulative hazard
na_fit <- basehaz(cox_model, centered = FALSE)

# Plot Nelson-Aalen cumulative hazard function
ggplot(na_fit, aes(x = time, y = hazard)) +
  geom_line(color = "blue") +
  labs(title = "Nelson-Aalen Cumulative Hazard Function",
       x = "Time in days",
       y = "Cumulative Hazard") +
  theme_minimal()

  1. Interpretation: The cumulative hazard increases over time, meaning the risk of death accumulates.
# Compare survival between male and female patients
surv_diff <- survdiff(Surv(time, status == 2) ~ sex, data = lung)
print(surv_diff)
## Call:
## survdiff(formula = Surv(time, status == 2) ~ sex, data = lung)
## 
##         N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=1 103       82     68.7      2.57      6.05
## sex=2  64       38     51.3      3.44      6.05
## 
##  Chisq= 6  on 1 degrees of freedom, p= 0.01
  1. Interpretation: p-value < 0.05, there is a significant survival difference between males and females.
# Fit Cox model with age, sex, and ECOG score
cox_model <- coxph(Surv(time, status == 2) ~ age + sex + ph.ecog, data = lung)

# Display Cox model summary
summary(cox_model)
## Call:
## coxph(formula = Surv(time, status == 2) ~ age + sex + ph.ecog, 
##     data = lung)
## 
##   n= 167, number of events= 120 
## 
##              coef exp(coef)  se(coef)      z Pr(>|z|)    
## age      0.008034  1.008067  0.011086  0.725 0.468623    
## sex     -0.502180  0.605210  0.197336 -2.545 0.010934 *  
## ph.ecog  0.455257  1.576579  0.136857  3.327 0.000879 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## age        1.0081     0.9920    0.9864     1.030
## sex        0.6052     1.6523    0.4111     0.891
## ph.ecog    1.5766     0.6343    1.2057     2.062
## 
## Concordance= 0.637  (se = 0.031 )
## Likelihood ratio test= 20.01  on 3 df,   p=2e-04
## Wald test            = 19.69  on 3 df,   p=2e-04
## Score (logrank) test = 20.02  on 3 df,   p=2e-04
  1. Interpretation: Older patients, male gender, and poor ECOG score significantly increases the risk of death.