library(survival)
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rmarkdown)
data(lung)
# Select variables relevant to the survival analysis
lung_df <- lung |>
select(time, status, age, sex, ph.ecog, wt.loss) |>
as.data.frame()
view(lung_df)
#Checked variables names and descriptions using the data dictionary
?lung
## starting httpd help server ... done
#Variable names were standardized to improve readability and interpretation during analysis.
lung_df <-lung_df |>
rename(
survival_time_in_days = time,
event = status,
performance_score = ph.ecog,
weight_change = wt.loss) |>
as.data.frame()
# Clean data set and prepare variables for survival analysis
colMeans(is.na(lung_df)) *100
## survival_time_in_days event age
## 0.0000000 0.0000000 0.0000000
## sex performance_score weight_change
## 0.0000000 0.4385965 6.1403509
lung_df<- lung_df |>
drop_na(weight_change, performance_score
)
#Convert variable sex to factor with levels to improve interpretability.
lung_df_clean <- lung_df|>
mutate(
sex = factor(sex,
levels = c("1","2"),
labels = c("Male","Female"))
) |>
as.data.frame()
#Recode event variable to 0, 1 to because the Kaplan Meier and Cox regressions require this level of coding
lung_df_clean <- lung_df_clean |>
mutate(event = ifelse(event == 2, 1, 0))
table(lung_df_clean$event)
##
## 0 1
## 62 151
#Boxplots were generated to explore the distribution, central tendency, and potential outliers for key variables such as age, weight change, and survival time.
ggplot(lung_df,aes(x = "", y = age)) +
geom_boxplot(fill = "skyblue") +
geom_jitter(width = 0.25, alpha = 0.6) +
coord_flip()

ggplot(lung_df, aes(x = "", y = weight_change)) +
geom_boxplot(fill = "orange") +
geom_jitter(width = 0.25, alpha = 0.6) +
coord_flip()

#Histogram shows a right-skewed distribution, where most observation values occur at lower values.
ggplot(lung_df, aes(x = weight_change)) +
geom_histogram(bins = 20)

#This histogram shows high peak at lower values (days of survival) with a right-skew
ggplot(lung_df, aes(x = survival_time_in_days)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
labs(
title = "Distribution of Survival Time",
x = "Survival Time (Days)",
y = "Count"
)

#Exploratory analysis of summary statistics
summary(lung_df_clean)
## survival_time_in_days event age sex
## Min. : 5.0 Min. :0.0000 Min. :39.0 Male :127
## 1st Qu.: 176.0 1st Qu.:0.0000 1st Qu.:56.0 Female: 86
## Median : 269.0 Median :1.0000 Median :63.0
## Mean : 315.2 Mean :0.7089 Mean :62.5
## 3rd Qu.: 428.0 3rd Qu.:1.0000 3rd Qu.:70.0
## Max. :1022.0 Max. :1.0000 Max. :82.0
## performance_score weight_change
## Min. :0.0000 Min. :-24.000
## 1st Qu.:0.0000 1st Qu.: 0.000
## Median :1.0000 Median : 7.000
## Mean :0.9343 Mean : 9.728
## 3rd Qu.:1.0000 3rd Qu.: 15.000
## Max. :3.0000 Max. : 68.000
#Kaplan_Meier Model for survival analysis
fit <- survfit(Surv(survival_time_in_days, event) ~ 1, data = lung_df_clean)
ggsurvplot(fit,
data = lung_df_clean,
xlab = "Time (Days)",
ylab = "Survival Probability",
title = "Kaplan-Meier Survival Curve"
)

#Kaplan_Meier Model by Sex
fit_sex <- survfit(Surv(survival_time_in_days, event) ~ sex, data = lung_df_clean)
ggsurvplot(
fit_sex,
data = lung_df_clean,
legend.title = "Sex",
xlab = "Time (Days)",
ylab = "Survival Probability",
title = "Kaplan-Meier Survival Curves by Sex"
)

##Kaplan_Meier Model by performance__score
fit_sex <- survfit(Surv(survival_time_in_days, event) ~ performance_score, data = lung_df_clean)
ggsurvplot(
fit_sex,
data = lung_df_clean,
legend.title = "ECOG Performance Score",
xlab = "Time (Days)",
ylab = "Survival Probability",
title = "Kaplan-Meier Survival Curves by Performance Score",
legend = "right")

#Fit Cox Proportional Hazards Model to evaluate predictors of mortality
cox_model <- coxph(
Surv(survival_time_in_days, event) ~ age + sex + performance_score + weight_change,
data = lung_df_clean
)
cox_model
## Call:
## coxph(formula = Surv(survival_time_in_days, event) ~ age + sex +
## performance_score + weight_change, data = lung_df_clean)
##
## coef exp(coef) se(coef) z p
## age 0.013369 1.013459 0.009628 1.389 0.164951
## sexFemale -0.590775 0.553898 0.175339 -3.369 0.000754
## performance_score 0.515111 1.673824 0.125988 4.089 4.34e-05
## weight_change -0.009006 0.991034 0.006658 -1.353 0.176135
##
## Likelihood ratio test=31.02 on 4 df, p=3.032e-06
## n= 213, number of events= 151
# Test proportional hazards assumption using Schoenfeld residuals
cox_test <- cox.zph(cox_model)
cox_test
## chisq df p
## age 0.4353 1 0.51
## sex 2.6731 1 0.10
## performance_score 1.6355 1 0.20
## weight_change 0.0457 1 0.83
## GLOBAL 4.7516 4 0.31
#GLOBAL value is > .05, indicating this Cox model is appropriate for analysis
#Create summary table for easier interpretation
cox_results <- data.frame(
Variable = rownames(summary(cox_model)$coefficients),
Hazard_Ratio = exp(summary(cox_model)$coefficients[, "coef"]),
CI_Lower = exp(summary(cox_model)$conf.int[, "lower .95"]),
CI_Upper = exp(summary(cox_model)$conf.int[, "upper .95"]),
p_value = summary(cox_model)$coefficients[, "Pr(>|z|)"]
)
cox_results <- cox_results |>
dplyr::mutate(
Hazard_Ratio = round(Hazard_Ratio, 2),
CI_Lower = round(CI_Lower, 2),
CI_Upper = round(CI_Upper, 2),
p_value = round(p_value, 3)
)
cox_results
## Variable Hazard_Ratio CI_Lower CI_Upper p_value
## age age 1.01 2.70 2.81 0.165
## sexFemale sexFemale 0.55 1.48 2.18 0.001
## performance_score performance_score 1.67 3.70 8.52 0.000
## weight_change weight_change 0.99 2.66 2.73 0.176
#Hazard ratio forest plot to quickly see which variables increase risk, decrease risk, are statistically meaningful
ggforest(cox_model, data = lung_df_clean)
