# Load necessary libraries
library(survival)
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
# Load the lung cancer dataset
data()
# View the first few rows of the dataset
head(lung)
## inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1 3 306 2 74 1 1 90 100 1175 NA
## 2 3 455 2 68 1 0 90 90 1225 15
## 3 3 1010 1 56 1 0 90 90 NA 15
## 4 5 210 2 57 1 1 90 60 1150 11
## 5 1 883 2 60 1 0 100 90 NA 0
## 6 12 1022 1 74 1 1 50 80 513 0
# Create a Kaplan-Meier survival object
km_fit <- survfit(Surv(time, status) ~ 1, data = lung)
# Print the summary of the survival object
summary(km_fit,times=50*c(1:100))
## Call: survfit(formula = Surv(time, status) ~ 1, data = lung)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 50 217 11 0.9518 0.0142 0.9243 0.980
## 100 196 20 0.8640 0.0227 0.8206 0.910
## 150 179 16 0.7931 0.0269 0.7421 0.848
## 200 144 25 0.6803 0.0311 0.6219 0.744
## 250 115 17 0.5967 0.0333 0.5349 0.666
## 300 92 12 0.5306 0.0346 0.4669 0.603
## 350 75 13 0.4524 0.0357 0.3876 0.528
## 400 57 12 0.3768 0.0358 0.3128 0.454
## 450 48 7 0.3287 0.0356 0.2659 0.406
## 500 41 5 0.2933 0.0351 0.2320 0.371
## 550 32 6 0.2475 0.0342 0.1887 0.325
## 600 24 4 0.2136 0.0335 0.1571 0.290
## 650 20 4 0.1780 0.0323 0.1247 0.254
## 700 16 4 0.1424 0.0303 0.0938 0.216
## 750 10 5 0.0979 0.0266 0.0575 0.167
## 800 8 2 0.0783 0.0246 0.0423 0.145
## 850 4 1 0.0671 0.0235 0.0338 0.133
## 900 3 1 0.0503 0.0228 0.0207 0.123
## 950 3 0 0.0503 0.0228 0.0207 0.123
## 1000 2 0 0.0503 0.0228 0.0207 0.123
# Plot the Kaplan-Meier survival curve
ggsurvplot(km_fit,
conf.int = TRUE, # Show confidence intervals
risk.table = TRUE, # Include risk table
ggtheme = theme_minimal(),
title = "Kaplan-Meier Survival Curve for Lung Cancer Patients")
INTERPRETATION OF KAPLAN MEIER
The survival probability steadily decreases over time, showing that lung cancer has a significant impact on survival.
Median survival time (where survival probability = 0.5) appears to be around 300 days, meaning half of the patients lived beyond this point.
The risk table shows that fewer patients remain at later time points, making estimates less precise (wider confidence intervals).
NELSON AALEN
# Load necessary libraries
library(survival)
library(ggplot2)
# Load the lung cancer dataset
data()
# Create a survival object
surv_obj <- Surv(lung$time, lung$status)
# Fit the Nelson-Aalen estimator (Cumulative Hazard Function)
na_fit <- survfit(surv_obj ~ 1, type = "fh") # Fleming-Harrington estimate
# Compute the cumulative hazard using basehaz()
hazard_data <- basehaz(coxph(surv_obj ~ 1, data = lung), centered = FALSE)
# Plot the Nelson-Aalen Cumulative Hazard Estimate
ggplot(hazard_data, aes(x = time, y = hazard)) +
geom_step() +
labs(title = "Nelson-Aalen Cumulative Hazard Estimate",
x = "Time (days)",
y = "Cumulative Hazard") +
theme_minimal()
INTERPRETATION OF NELSON AALEN
The cumulative hazard increases over time, meaning
the risk of death accumulates steadily.
The steeper increase after 750 days suggests a
higher event rate at later stages.
LOG RANK TEST
# Load necessary library
library(survival)
# Load the lung dataset
data()
# Remove missing values
lung_clean <- na.omit(lung[, c("time", "status", "sex")])
# Convert sex variable: 1 = Male, 2 = Female (change to factor)
lung_clean$sex <- factor(lung_clean$sex, levels = c(1, 2), labels = c("Male", "Female"))
# Check the number of observations in each group
table(lung_clean$sex)
##
## Male Female
## 138 90
The output Male 0 Female 0 means that after removing missing values, there are no remaining observations in the dataset. This is why the Log-Rank test is failing—there’s no data left to analyze!
COX PH REGRESSION
# Load the lung dataset from the survival package
data()
# Step 1: Check for missing values
print(colSums(is.na(lung))) # Display the num ber of NAs in each column
## inst time status age sex ph.ecog ph.karno pat.karno
## 1 0 0 0 0 1 1 3
## meal.cal wt.loss
## 47 14
nrow(lung_clean)
## [1] 228
colSums(is.na(lung))
## inst time status age sex ph.ecog ph.karno pat.karno
## 1 0 0 0 0 1 1 3
## meal.cal wt.loss
## 47 14
lung_clean <- lung[!is.na(lung$time) & !is.na(lung$status) & !is.na(lung$ph.ecog), ]
nrow(lung_clean) # Check if it is now greater than 0
## [1] 227
# Define survival object
surv_object <- Surv(time = lung_clean$time, event = lung_clean$status)
# Fit the Cox model (without 'sex' since it had too many missing values)
cox_model <- coxph(surv_object ~ age + ph.ecog, data = lung_clean)
# Display the summary of the model
summary(cox_model)
## Call:
## coxph(formula = surv_object ~ age + ph.ecog, data = lung_clean)
##
## n= 227, number of events= 164
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.011281 1.011345 0.009319 1.211 0.226082
## ph.ecog 0.443485 1.558128 0.115831 3.829 0.000129 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.011 0.9888 0.993 1.030
## ph.ecog 1.558 0.6418 1.242 1.955
##
## Concordance= 0.61 (se = 0.025 )
## Likelihood ratio test= 19.06 on 2 df, p=7e-05
## Wald test = 19.26 on 2 df, p=7e-05
## Score (logrank) test = 19.55 on 2 df, p=6e-05
INTERPRETATION
~Since p < 0.001, ph.ecog significantly affects survival.
~HR = 1.558 means that for each 1-unit increase in ph.ecog, the risk of death increases by 55.8%.
~95% CI [1.242, 1.955] means we are confident that the true hazard increase is between 24.2% and 95.5%.