#loading necessary packages

library(survival)
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma

# Attach the survival-analysis packages (one call per line so the script parses).
library(survival)
library(survminer)

# Load the kidney dataset (available once `survival` is attached).
# NOTE(review): `data` is an alias that the visible code never uses again;
# the analysis refers to `kidney` directly. Kept in case later code needs it.
data <- kidney

# Create a survival object from the follow-up time and the event indicator.
# The printed rows show `status` taking the values 0 and 1; presumably
# 1 = event observed and 0 = censored (the usual Surv() convention).
surv_object <- Surv(time = kidney$time, event = kidney$status)

# View the first few rows of the data
head(kidney)
##   id time status age sex disease frail
## 1  1    8      1  28   1   Other   2.3
## 2  1   16      1  28   1   Other   2.3
## 3  2   23      1  48   2      GN   1.9
## 4  2   13      0  48   2      GN   1.9
## 5  3   22      1  32   1   Other   1.2
## 6  3   28      1  32   1   Other   1.2

# Summary statistics
# NOTE: the printed result summarises the `time` and `status` columns
# separately, so the reported median of `time` pools event and censored times.

summary(surv_object)
##       time           status      
##  Min.   :  2.0   Min.   :0.0000  
##  1st Qu.: 16.0   1st Qu.:1.0000  
##  Median : 39.5   Median :1.0000  
##  Mean   :101.6   Mean   :0.7632  
##  3rd Qu.:149.8   3rd Qu.:1.0000  
##  Max.   :562.0   Max.   :1.0000

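# Interpretation: Recurrence is common in this population: 76.32% of the observations (the mean of `status`) are observed recurrences and the remainder are censored. The median recorded time is 39.5 days, which suggests that many recurrences happen relatively quickly; however, this figure pools event and censored times, so it should not be read as "50% of patients recur within 39.5 days" — the median recurrence-free time should be taken from the Kaplan-Meier fit instead. There is considerable variation: recorded times range from 2 to 562 days and the mean (101.6) is well above the median, so some patients remain recurrence-free for much longer.
# Structure of the data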

# In the printed values a `+` suffix (e.g. `13+`) marks a censored time.
str(surv_object)
##  'Surv' num [1:76, 1:2]   8   16   23   13+  22   28  447  318   30   12  ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:2] "time" "status"
##  - attr(*, "type")= chr "right"

# Interpretation: The `type = "right"` attribute shows that the object is set up for right-censored data. Values printed with a `+` (e.g. `13+`) are censored: the patient had not experienced the event by the last available follow-up, so the survival time is only known to exceed the recorded value.

# The Kaplan-Meier estimator
# Fit a single overall curve (`~ 1` means no grouping variable).
km_fit <- survfit(surv_object ~ 1)

# Plot the Kaplan-Meier curve
plot(
  km_fit,
  main = "Kaplan-Meier Survival Curve",
  xlab = "Time",
  ylab = "Survival Probability",
  col = "blue",
  lwd = 2
)
grid()

# Explanation: The plot illustrates the probability of patients remaining recurrence-free over time. The solid blue line represents the estimated survival function, while the dashed lines indicate the 95% confidence interval. The curve starts at 1.0 (100% recurrence-free) and declines as follow-up lengthens, showing that the proportion of patients who have not yet had a kidney infection recurrence falls over time (a survival curve can only stay level or drop, so the decline by itself does not mean the instantaneous risk is increasing). The steep drops mark periods where multiple patients experienced recurrence, while the flatter sections indicate times with fewer or no events. The long tail suggests that some patients remained recurrence-free for an extended period.

# The Nelson-Aalen estimator
# type = "fleming-harrington" requests the survival estimate that is built
# from the Nelson-Aalen cumulative hazard.
na_fit <- survfit(surv_object ~ 1, type = "fleming-harrington")

# Plot the cumulative hazard curve (Nelson-Aalen)
# The cumulative hazard is recovered from the fitted survival as -log(S(t)).
H_t <- -log(na_fit[["surv"]])
plot(
  na_fit[["time"]], H_t,
  type = "s", col = "red", lwd = 2,
  main = "Nelson-Aalen Cumulative Hazard Function",
  xlab = "Time", ylab = "Cumulative Hazard"
)
grid()

# Explanation: The plot shows how the risk of kidney infection recurrence accumulates over time. The y-axis (Cumulative Hazard) represents the total hazard accumulated up to a given time, while the x-axis (Time) indicates the study duration. The red step curve rises throughout because a cumulative hazard can never decrease; it is the slope of the curve, not its height, that reflects the recurrence rate at a given time. The steeper sections indicate periods where multiple recurrences occurred, whereas flatter segments show times with fewer events. The final sharp increase may reflect a higher risk of recurrence at later times, but it is based on the few patients still at risk, so each late event produces a large jump and this part of the curve should be interpreted with caution. This function helps understand how recurrence risk evolves over time in the study population.
# Create a categorical variable for groups (e.g., sex)

# Recode sex as a labelled factor. In survival::kidney, sex is coded
# 1 = male, 2 = female, and factor() assigns labels in sorted level order,
# so the labels must be supplied as "Male", "Female".
kidney$sex <- factor(kidney$sex, labels = c("Male", "Female"))

# Perform the log-rank test

log_rank_test <- survdiff(surv_object ~ sex, data = kidney)
print(log_rank_test)
## Call:
## survdiff(formula = surv_object ~ sex, data = kidney)
## 
##             N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=Male   20       18     10.2      5.99      8.31
## sex=Female 56       40     47.8      1.28      8.31
## 
##  Chisq= 8.3  on 1 degrees of freedom, p= 0.004

# Interpretation: The log-rank test for the kidney dataset compares recurrence-free survival between male and female patients. Males (N = 20 observations) had 18 observed recurrences, well above the 10.2 expected if there were no difference between the sexes, while females (N = 56 observations) had 40 recurrences, below the expected 47.8. The chi-square statistic (8.3 on 1 degree of freedom, p = 0.004) indicates a statistically significant difference in survival, suggesting that males experience a higher risk of kidney infection recurrence than females. This finding highlights sex as a key factor influencing recurrence rates in the study population.

# Plot the Kaplan-Meier curves for each group

km_fit_sex <- survfit(surv_object ~ sex, data = kidney)
ggsurvplot(
  km_fit_sex,
  data = kidney,
  title = "Kaplan-Meier Survival Curves by Sex for Kidney Patients",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  risk.table = TRUE
)

# Interpretation: Males experience a steeper decline in survival probability early on, indicating a higher recurrence rate of kidney infections compared to females. Females have a more gradual decline, suggesting better survival outcomes over time. By around 200 days, most males have experienced the event (recurrence), whereas a substantial proportion of females remain event-free for a longer period. This supports the earlier log-rank test finding that males have a significantly higher risk of recurrence compared to females.

# Fit a Cox proportional hazards model
# NOTE(review): head(kidney) shows each id contributing two rows, so the
# observations may not be independent; consider a cluster(id) or frailty
# term — confirm against the study design.

cox_fit <- coxph(Surv(time, status) ~ age + sex, data = kidney)

# Summary of the Cox model

summary(cox_fit)
## Call:
## coxph(formula = Surv(time, status) ~ age + sex, data = kidney)
## 
##   n= 76, number of events= 58 
## 
##                coef exp(coef)  se(coef)      z Pr(>|z|)   
## age        0.002032  1.002034  0.009246  0.220  0.82607   
## sexFemale -0.829314  0.436349  0.298955 -2.774  0.00554 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## age          1.0020      0.998    0.9840     1.020
## sexFemale    0.4363      2.292    0.2429     0.784
## 
## Concordance= 0.662  (se = 0.045 )
## Likelihood ratio test= 7.12  on 2 df,   p=0.03
## Wald test            = 8.02  on 2 df,   p=0.02
## Score (logrank) test = 8.45  on 2 df,   p=0.01

# Interpreting the output: The Cox proportional hazards model examines the effect of age and sex on time to recurrence. The age coefficient (HR = 1.002, p = 0.826) suggests that age has no significant impact on the hazard. However, sex is statistically significant (HR = 0.436, 95% CI 0.243 to 0.784, p = 0.005), indicating that females have a 56.4% lower hazard of recurrence compared to males (equivalently, males have about 2.3 times the hazard of females). The concordance (C = 0.662) shows a moderate predictive ability of the model. The likelihood ratio, Wald, and score (log-rank) tests confirm that the model as a whole is statistically significant (p < 0.05), suggesting that sex plays a crucial role in kidney infection recurrence, with females having better survival outcomes.