library(survival)
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rmarkdown)
data(lung)

# Select variables relevant to the survival analysis
lung_df <- lung |> 
  select(time, status, age, sex, ph.ecog, wt.loss) |> 
  as.data.frame()

view(lung_df)
#Checked variables names and descriptions using the data dictionary
?lung
## starting httpd help server ... done
#Variable names were standardized to improve readability and interpretation during analysis.
lung_df <-lung_df |> 
  rename( 
    survival_time_in_days = time,
    event = status,
    performance_score = ph.ecog,
    weight_change = wt.loss) |> 
  as.data.frame()
# Clean data set and prepare variables for survival analysis

colMeans(is.na(lung_df)) *100
## survival_time_in_days                 event                   age 
##             0.0000000             0.0000000             0.0000000 
##                   sex     performance_score         weight_change 
##             0.0000000             0.4385965             6.1403509
lung_df<- lung_df |> 
  drop_na(weight_change, performance_score
  )
#Convert variable sex to factor with levels to improve interpretability.  
lung_df_clean <- lung_df|> 
  mutate(
    sex = factor(sex, 
                 levels = c("1","2"),
                 labels = c("Male","Female"))
  ) |> 
  as.data.frame()

#Recode event variable to 0, 1 to because the Kaplan Meier and Cox regressions require this level of coding
lung_df_clean <- lung_df_clean |> 
  mutate(event = ifelse(event == 2, 1, 0))

table(lung_df_clean$event)
## 
##   0   1 
##  62 151
#Boxplots were generated to explore the distribution, central tendency, and potential outliers for key variables such as age, weight change, and survival time.

ggplot(lung_df,aes(x = "", y = age)) +
  geom_boxplot(fill = "skyblue") +
  geom_jitter(width = 0.25, alpha = 0.6) +
   coord_flip()

ggplot(lung_df, aes(x = "", y = weight_change)) +
  geom_boxplot(fill = "orange") +
 geom_jitter(width = 0.25, alpha = 0.6) +
   coord_flip()

#Histogram shows a right-skewed distribution, where most observation values occur at lower values. 
ggplot(lung_df, aes(x = weight_change)) +
  geom_histogram(bins = 20)

#This histogram shows high peak at lower values (days of survival) with a right-skew

ggplot(lung_df, aes(x = survival_time_in_days)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(
    title = "Distribution of Survival Time",
    x = "Survival Time (Days)",
    y = "Count"
  )

#Exploratory analysis of summary statistics 
summary(lung_df_clean)
##  survival_time_in_days     event             age           sex     
##  Min.   :   5.0        Min.   :0.0000   Min.   :39.0   Male  :127  
##  1st Qu.: 176.0        1st Qu.:0.0000   1st Qu.:56.0   Female: 86  
##  Median : 269.0        Median :1.0000   Median :63.0               
##  Mean   : 315.2        Mean   :0.7089   Mean   :62.5               
##  3rd Qu.: 428.0        3rd Qu.:1.0000   3rd Qu.:70.0               
##  Max.   :1022.0        Max.   :1.0000   Max.   :82.0               
##  performance_score weight_change    
##  Min.   :0.0000    Min.   :-24.000  
##  1st Qu.:0.0000    1st Qu.:  0.000  
##  Median :1.0000    Median :  7.000  
##  Mean   :0.9343    Mean   :  9.728  
##  3rd Qu.:1.0000    3rd Qu.: 15.000  
##  Max.   :3.0000    Max.   : 68.000
#Kaplan_Meier Model for survival analysis
fit <- survfit(Surv(survival_time_in_days, event) ~ 1, data = lung_df_clean)

ggsurvplot(fit,
  data = lung_df_clean,
  xlab = "Time (Days)",
  ylab = "Survival Probability",
  title = "Kaplan-Meier Survival Curve"
)

#Kaplan_Meier Model by Sex
fit_sex <- survfit(Surv(survival_time_in_days, event) ~ sex, data = lung_df_clean)

ggsurvplot(
  fit_sex,
  data = lung_df_clean,
  legend.title = "Sex",
  xlab = "Time (Days)",
  ylab = "Survival Probability",
  title = "Kaplan-Meier Survival Curves by Sex"
)

##Kaplan_Meier Model by performance__score

fit_sex <- survfit(Surv(survival_time_in_days, event) ~ performance_score, data = lung_df_clean)

ggsurvplot(
  fit_sex,
  data = lung_df_clean,
  legend.title = "ECOG Performance Score",
  xlab = "Time (Days)",
  ylab = "Survival Probability",
  title = "Kaplan-Meier Survival Curves by Performance Score",
  legend = "right") 

#Fit Cox Proportional Hazards Model to evaluate predictors of mortality
cox_model <- coxph(
  Surv(survival_time_in_days, event) ~ age + sex + performance_score + weight_change,
  data = lung_df_clean
)

cox_model
## Call:
## coxph(formula = Surv(survival_time_in_days, event) ~ age + sex + 
##     performance_score + weight_change, data = lung_df_clean)
## 
##                        coef exp(coef)  se(coef)      z        p
## age                0.013369  1.013459  0.009628  1.389 0.164951
## sexFemale         -0.590775  0.553898  0.175339 -3.369 0.000754
## performance_score  0.515111  1.673824  0.125988  4.089 4.34e-05
## weight_change     -0.009006  0.991034  0.006658 -1.353 0.176135
## 
## Likelihood ratio test=31.02  on 4 df, p=3.032e-06
## n= 213, number of events= 151
# Test proportional hazards assumption using Schoenfeld residuals 
cox_test <- cox.zph(cox_model)
cox_test
##                    chisq df    p
## age               0.4353  1 0.51
## sex               2.6731  1 0.10
## performance_score 1.6355  1 0.20
## weight_change     0.0457  1 0.83
## GLOBAL            4.7516  4 0.31
#GLOBAL value is > .05, indicating this Cox model is appropriate for analysis
#Create summary table for easier interpretation
cox_results <- data.frame(
  Variable = rownames(summary(cox_model)$coefficients),
  Hazard_Ratio = exp(summary(cox_model)$coefficients[, "coef"]),
  CI_Lower = exp(summary(cox_model)$conf.int[, "lower .95"]),
  CI_Upper = exp(summary(cox_model)$conf.int[, "upper .95"]),
  p_value = summary(cox_model)$coefficients[, "Pr(>|z|)"]
)

cox_results <- cox_results |>
  dplyr::mutate(
    Hazard_Ratio = round(Hazard_Ratio, 2),
    CI_Lower = round(CI_Lower, 2),
    CI_Upper = round(CI_Upper, 2),
    p_value = round(p_value, 3)
  )

cox_results
##                            Variable Hazard_Ratio CI_Lower CI_Upper p_value
## age                             age         1.01     2.70     2.81   0.165
## sexFemale                 sexFemale         0.55     1.48     2.18   0.001
## performance_score performance_score         1.67     3.70     8.52   0.000
## weight_change         weight_change         0.99     2.66     2.73   0.176
#Hazard ratio forest plot to quickly see which variables increase risk, decrease risk, are statistically meaningful
ggforest(cox_model, data = lung_df_clean)