#loading the required packages

library(survival)
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma

#loading the dataset

data<-leukemia

#creating a survival object

mysurv_object <- Surv(time = leukemia$time, event =leukemia$status)

View first few rows

head(leukemia)
##   time status          x
## 1    9      1 Maintained
## 2   13      1 Maintained
## 3   13      0 Maintained
## 4   18      1 Maintained
## 5   23      1 Maintained
## 6   28      0 Maintained

#summary of the data

summary(mysurv_object)
##       time            status      
##  Min.   :  5.00   Min.   :0.0000  
##  1st Qu.: 12.50   1st Qu.:1.0000  
##  Median : 23.00   Median :1.0000  
##  Mean   : 29.48   Mean   :0.7826  
##  3rd Qu.: 33.50   3rd Qu.:1.0000  
##  Max.   :161.00   Max.   :1.0000

#intepretation It provide insights into the survival times and event status of patients. The minimum survival time is 5 days, while the maximum is 161 days, indicating a wide range of survival durations. The median survival time is 23 days, meaning half of the patients survived longer and half shorter. The mean survival time is approximately 29.48 days, slightly higher than the median, suggesting a right-skewed distribution with some patients surviving much longer. The status variable (event occurrence) shows that 78.26% of patients (mean = 0.7826) experienced the event (death), while 21.74% were censored. This indicates a high mortality rate in the dataset.

#structure of the survival object

str(mysurv_object)
##  'Surv' num [1:23, 1:2]   9   13   13+  18   23   28+  31   34   45+  48  ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:2] "time" "status"
##  - attr(*, "type")= chr "right"

#interpreting output The first column (time) indicates survival durations, and the second column (status) denotes whether the event (death) occurred. The “+” symbol next to some values (e.g., 13+, 28+, 45+) indicates right-censored observations, meaning these patients were still alive or lost to follow-up at the recorded time. The “right” censoring type confirms that the dataset accounts for patients whose survival status was unknown beyond a certain point.

#plotting the kaplan-meir estimator

km_fit <- survfit(mysurv_object ~ 1)

Plot the Kaplan-Meier survival curves

plot(km_fit, main="Kaplan-Meier Survival Curve", xlab="Time", ylab="Survival Probability", col="purple", lwd=2)
grid()

#interpretation The Kaplan-Meier curve shows the estimated survival probability over time for leukemia patients. The y-axis represents the survival probability, while the x-axis represents time in days. The curve indicates the proportion of patients surviving at different time points, with drops in the curve corresponding to events (deaths).

#The nelson-aalen estimates

na_fit <- survfit(mysurv_object~ 1, type="fleming-harrington")

#plotting the nelson aalen curve

H_t <- -log(na_fit$surv)
plot(na_fit$time, H_t, type="s", col="green", lwd=2, main="Nelson-Aalen Cumulative Hazard Function",
     xlab="Time", ylab="Cumulative Hazard")
grid()

#interpretation The Nelson-Aalen estimator provides a cumulative hazard function. The y-axis represents the cumulative hazard, which increases over time. This indicates the accumulated risk of the event (death) occurring as time progresses.

Log-rank test

log_rank_test <- survdiff(Surv(time, status) ~ x, data = leukemia)
print(log_rank_test)
## Call:
## survdiff(formula = Surv(time, status) ~ x, data = leukemia)
## 
##                  N Observed Expected (O-E)^2/E (O-E)^2/V
## x=Maintained    11        7    10.69      1.27       3.4
## x=Nonmaintained 12       11     7.31      1.86       3.4
## 
##  Chisq= 3.4  on 1 degrees of freedom, p= 0.07

#interpretation Null Hypothesis (H0): The null hypothesis states that there is no difference in survival between the “Maintained” and “Nonmaintained” treatment groups. The p-value of 0.07 suggests that there is not enough evidence to reject the null hypothesis at the conventional significance level of 0.05. This means that the difference in survival between the “Maintained” and “Nonmaintained” groups is not statistically significant.

Fitting a Cox proportional hazard model

cox_fit <- coxph(Surv(time, status) ~ x, data = leukemia)

Summary of the Cox model

summary(cox_fit)
## Call:
## coxph(formula = Surv(time, status) ~ x, data = leukemia)
## 
##   n= 23, number of events= 18 
## 
##                  coef exp(coef) se(coef)     z Pr(>|z|)  
## xNonmaintained 0.9155    2.4981   0.5119 1.788   0.0737 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                exp(coef) exp(-coef) lower .95 upper .95
## xNonmaintained     2.498     0.4003    0.9159     6.813
## 
## Concordance= 0.619  (se = 0.063 )
## Likelihood ratio test= 3.38  on 1 df,   p=0.07
## Wald test            = 3.2  on 1 df,   p=0.07
## Score (logrank) test = 3.42  on 1 df,   p=0.06

The Cox Proportional Hazards model results indicate that patients in the non-maintained treatment group have a 2.5 times higher risk of death compared to those in the maintained group (HR = 2.498, 95% CI: 0.9159 - 6.813). However, the p-value (0.0737) suggests that this effect is not statistically significant at the 5% level, though it is marginally significant at the 10% level, indicating weak evidence that treatment influences survival. Model diagnostics, including the Likelihood Ratio Test (p = 0.07) and Wald Test (p = 0.07), further support this borderline significance. The model’s concordance index (0.619) suggests moderate predictive ability, but given the small sample size (n = 23), additional data may be needed to confirm these findings with stronger statistical confidence.