# Load necessary libraries
library(survival)
## Warning: package 'survival' was built under R version 4.4.2
library(survminer)
## Warning: package 'survminer' was built under R version 4.4.2
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load built-in lung cancer dataset
# The dataset contains survival time, event status, and covariates
data(lung)
## Warning in data(lung): data set 'lung' not found
lung <- na.omit(lung)  # Remove missing values
# Define survival object
surv_obj <- Surv(time = lung$time, event = lung$status)
# 1. Kaplan-Meier Estimator
km_fit <- survfit(surv_obj ~ 1,data = lung )  # Overall survival curve
ggplot_km <- ggsurvplot(km_fit, 
                         data = lung,  # Make sure this matches the dataset used in km_fit
                         conf.int = TRUE, 
                         pval = TRUE, 
                         risk.table = TRUE,
                         title = "Kaplan-Meier Survival Curve")
## Warning in .pvalue(fit, data = data, method = method, pval = pval, pval.coord = pval.coord, : There are no survival curves to be compared. 
##  This is a null model.
print(ggplot_km)

# 2. Nelson-Aalen Estimator
nelson_aalen_fit <- survfit(surv_obj ~ 1, type = "fh")  # Fleming-Harrington estimator (Nelson-Aalen approximation)

plot(nelson_aalen_fit, fun = "cumhaz", xlab = "Time", ylab = "Cumulative Hazard", main = "Nelson-Aalen Estimator")

# 3. Log-Rank Test
km_fit_sex <- survfit(surv_obj ~ lung$sex)
log_rank_test <- survdiff(surv_obj ~ lung$sex)
print(log_rank_test)
## Call:
## survdiff(formula = surv_obj ~ lung$sex)
## 
##              N Observed Expected (O-E)^2/E (O-E)^2/V
## lung$sex=1 103       82     68.7      2.57      6.05
## lung$sex=2  64       38     51.3      3.44      6.05
## 
##  Chisq= 6  on 1 degrees of freedom, p= 0.01
ggplot_lr <- ggsurvplot(km_fit_sex, 
                        data = lung,
                         conf.int = TRUE, 
                         pval = TRUE, 
                         risk.table = TRUE,
                         title = "Kaplan-Meier Survival by Sex")
print(ggplot_lr)

# 4. Cox Proportional Hazards Model
cox_model <- coxph(surv_obj ~ age + sex + ph.ecog, data = lung)
summary(cox_model)
## Call:
## coxph(formula = surv_obj ~ age + sex + ph.ecog, data = lung)
## 
##   n= 167, number of events= 120 
## 
##              coef exp(coef)  se(coef)      z Pr(>|z|)    
## age      0.008034  1.008067  0.011086  0.725 0.468623    
## sex     -0.502180  0.605210  0.197336 -2.545 0.010934 *  
## ph.ecog  0.455257  1.576579  0.136857  3.327 0.000879 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## age        1.0081     0.9920    0.9864     1.030
## sex        0.6052     1.6523    0.4111     0.891
## ph.ecog    1.5766     0.6343    1.2057     2.062
## 
## Concordance= 0.637  (se = 0.031 )
## Likelihood ratio test= 20.01  on 3 df,   p=2e-04
## Wald test            = 19.69  on 3 df,   p=2e-04
## Score (logrank) test = 20.02  on 3 df,   p=2e-04
# Interpretation:
# - The coefficients represent log-hazard ratios
# - The p-values indicate the significance of each covariate
# - A hazard ratio >1 means an increased risk, <1 means a decreased risk
# Plot Cox model adjusted survival curves
ggcoxzph <- ggcoxzph(cox.zph(cox_model))
print(ggcoxzph)