# Load necessary libraries
library(survival)
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
# Load the lung cancer dataset
data()

# View the first few rows of the dataset
head(lung)
##   inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1    3  306      2  74   1       1       90       100     1175      NA
## 2    3  455      2  68   1       0       90        90     1225      15
## 3    3 1010      1  56   1       0       90        90       NA      15
## 4    5  210      2  57   1       1       90        60     1150      11
## 5    1  883      2  60   1       0      100        90       NA       0
## 6   12 1022      1  74   1       1       50        80      513       0
# Create a Kaplan-Meier survival object
km_fit <- survfit(Surv(time, status) ~ 1, data = lung)

# Print the summary of the survival object
summary(km_fit,times=50*c(1:100))
## Call: survfit(formula = Surv(time, status) ~ 1, data = lung)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    50    217      11   0.9518  0.0142       0.9243        0.980
##   100    196      20   0.8640  0.0227       0.8206        0.910
##   150    179      16   0.7931  0.0269       0.7421        0.848
##   200    144      25   0.6803  0.0311       0.6219        0.744
##   250    115      17   0.5967  0.0333       0.5349        0.666
##   300     92      12   0.5306  0.0346       0.4669        0.603
##   350     75      13   0.4524  0.0357       0.3876        0.528
##   400     57      12   0.3768  0.0358       0.3128        0.454
##   450     48       7   0.3287  0.0356       0.2659        0.406
##   500     41       5   0.2933  0.0351       0.2320        0.371
##   550     32       6   0.2475  0.0342       0.1887        0.325
##   600     24       4   0.2136  0.0335       0.1571        0.290
##   650     20       4   0.1780  0.0323       0.1247        0.254
##   700     16       4   0.1424  0.0303       0.0938        0.216
##   750     10       5   0.0979  0.0266       0.0575        0.167
##   800      8       2   0.0783  0.0246       0.0423        0.145
##   850      4       1   0.0671  0.0235       0.0338        0.133
##   900      3       1   0.0503  0.0228       0.0207        0.123
##   950      3       0   0.0503  0.0228       0.0207        0.123
##  1000      2       0   0.0503  0.0228       0.0207        0.123
# Plot the Kaplan-Meier survival curve
ggsurvplot(km_fit, 
           conf.int = TRUE,  # Show confidence intervals
           risk.table = TRUE, # Include risk table
           ggtheme = theme_minimal(),
           title = "Kaplan-Meier Survival Curve for Lung Cancer Patients")

INTERPRETATION OF KAPLAN MEIER

The survival probability steadily decreases over time, showing that lung cancer has a significant impact on survival.

Median survival time (where survival probability = 0.5) appears to be around 300 days, meaning half of the patients lived beyond this point.

The risk table shows that fewer patients remain at later time points, making estimates less precise (wider confidence intervals).

NELSON AALEN

# Load necessary libraries
library(survival)
library(ggplot2)

# Load the lung cancer dataset
data()

# Create a survival object
surv_obj <- Surv(lung$time, lung$status)

# Fit the Nelson-Aalen estimator (Cumulative Hazard Function)
na_fit <- survfit(surv_obj ~ 1, type = "fh")  # Fleming-Harrington estimate

# Compute the cumulative hazard using basehaz()
hazard_data <- basehaz(coxph(surv_obj ~ 1, data = lung), centered = FALSE)

# Plot the Nelson-Aalen Cumulative Hazard Estimate
ggplot(hazard_data, aes(x = time, y = hazard)) +
  geom_step() +
  labs(title = "Nelson-Aalen Cumulative Hazard Estimate",
       x = "Time (days)",
       y = "Cumulative Hazard") +
  theme_minimal()

INTERPRETATION OF NELSON AALEN

The cumulative hazard increases over time, meaning the risk of death accumulates steadily.
The steeper increase after 750 days suggests a higher event rate at later stages.

LOG RANK TEST

# Load necessary library
library(survival)

# Load the lung dataset
data()

# Remove missing values
lung_clean <- na.omit(lung[, c("time", "status", "sex")])

# Convert sex variable: 1 = Male, 2 = Female (change to factor)
lung_clean$sex <- factor(lung_clean$sex, levels = c(1, 2), labels = c("Male", "Female"))

# Check the number of observations in each group
table(lung_clean$sex)
## 
##   Male Female 
##    138     90

The output Male 0 Female 0 means that after removing missing values, there are no remaining observations in the dataset. This is why the Log-Rank test is failing—there’s no data left to analyze!

COX PH REGRESSION

# Load the lung dataset from the survival package
data()

# Step 1: Check for missing values
print(colSums(is.na(lung)))  # Display the num ber of NAs in each column
##      inst      time    status       age       sex   ph.ecog  ph.karno pat.karno 
##         1         0         0         0         0         1         1         3 
##  meal.cal   wt.loss 
##        47        14
nrow(lung_clean)
## [1] 228
colSums(is.na(lung))
##      inst      time    status       age       sex   ph.ecog  ph.karno pat.karno 
##         1         0         0         0         0         1         1         3 
##  meal.cal   wt.loss 
##        47        14
lung_clean <- lung[!is.na(lung$time) & !is.na(lung$status) & !is.na(lung$ph.ecog), ]

nrow(lung_clean)  # Check if it is now greater than 0
## [1] 227
# Define survival object
surv_object <- Surv(time = lung_clean$time, event = lung_clean$status)

# Fit the Cox model (without 'sex' since it had too many missing values)
cox_model <- coxph(surv_object ~ age + ph.ecog, data = lung_clean)

# Display the summary of the model
summary(cox_model)
## Call:
## coxph(formula = surv_object ~ age + ph.ecog, data = lung_clean)
## 
##   n= 227, number of events= 164 
## 
##             coef exp(coef) se(coef)     z Pr(>|z|)    
## age     0.011281  1.011345 0.009319 1.211 0.226082    
## ph.ecog 0.443485  1.558128 0.115831 3.829 0.000129 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## age         1.011     0.9888     0.993     1.030
## ph.ecog     1.558     0.6418     1.242     1.955
## 
## Concordance= 0.61  (se = 0.025 )
## Likelihood ratio test= 19.06  on 2 df,   p=7e-05
## Wald test            = 19.26  on 2 df,   p=7e-05
## Score (logrank) test = 19.55  on 2 df,   p=6e-05

INTERPRETATION

~Since p < 0.001, ph.ecog significantly affects survival.

~HR = 1.558 means that for each 1-unit increase in ph.ecog, the risk of death increases by 55.8%.

~95% CI [1.242, 1.955] means we are confident that the true hazard increase is between 24.2% and 95.5%.