#load library

library(survival)
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(summarytools)
## 
## Attaching package: 'summarytools'
## 
## The following object is masked from 'package:tibble':
## 
##     view
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(patchwork)
library(gtsummary)
library(broom)

#load data

data(veteran)
## Warning in data(veteran): data set 'veteran' not found
df <- veteran %>% 
  select(time, status, trt, age, celltype, karno, diagtime, prior) %>% 
  as.data.frame()

head(df)

#data description

str(veteran)
## 'data.frame':    137 obs. of  8 variables:
##  $ trt     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ celltype: Factor w/ 4 levels "squamous","smallcell",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time    : num  72 411 228 126 118 10 82 110 314 100 ...
##  $ status  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ karno   : num  60 70 60 60 70 20 40 80 50 70 ...
##  $ diagtime: num  7 5 3 9 11 5 10 29 18 6 ...
##  $ age     : num  69 64 38 63 65 49 69 68 43 70 ...
##  $ prior   : num  0 10 0 10 10 0 10 0 0 0 ...
summary(veteran)
##       trt             celltype       time           status      
##  Min.   :1.000   squamous :35   Min.   :  1.0   Min.   :0.0000  
##  1st Qu.:1.000   smallcell:48   1st Qu.: 25.0   1st Qu.:1.0000  
##  Median :1.000   adeno    :27   Median : 80.0   Median :1.0000  
##  Mean   :1.496   large    :27   Mean   :121.6   Mean   :0.9343  
##  3rd Qu.:2.000                  3rd Qu.:144.0   3rd Qu.:1.0000  
##  Max.   :2.000                  Max.   :999.0   Max.   :1.0000  
##      karno          diagtime           age            prior      
##  Min.   :10.00   Min.   : 1.000   Min.   :34.00   Min.   : 0.00  
##  1st Qu.:40.00   1st Qu.: 3.000   1st Qu.:51.00   1st Qu.: 0.00  
##  Median :60.00   Median : 5.000   Median :62.00   Median : 0.00  
##  Mean   :58.57   Mean   : 8.774   Mean   :58.31   Mean   : 2.92  
##  3rd Qu.:75.00   3rd Qu.:11.000   3rd Qu.:66.00   3rd Qu.:10.00  
##  Max.   :99.00   Max.   :87.000   Max.   :81.00   Max.   :10.00

#Kaplan by treatment group

km_trt <- survfit(Surv(time, status) ~ trt, data = df)
ggsurvplot(km_trt, data = df, pval = TRUE, conf.int = TRUE, risk.table = TRUE,
           ggtheme = theme_minimal(), legend.labs = c("Standard", "Test"),
           title = "Kaplan-Meier Survival Curves by Treatment Group",
           xlab = "Time (days)", ylab = "Survival Probability")

#Kaplan by cell type

km_cell <- survfit(Surv(time, status) ~ celltype, data = df)
ggsurvplot(km_cell, data = df, pval = TRUE, conf.int = TRUE, risk.table = TRUE,
           ggtheme = theme_minimal(), legend.title = "Cancer Type",
           title = "Kaplan-Meier Survival Curves by Cancer Cell Type",
           xlab = "Time (days)", ylab = "Survival Probability")

#Nelson Aalen by treatment group

na_trt <- survfit(Surv(time, status) ~ trt, data = df, type = "fleming-harrington")
ggsurvplot(na_trt, data = df, fun = "cumhaz", conf.int = TRUE, risk.table = TRUE,
           ggtheme = theme_minimal(), legend.labs = c("Standard", "Test"),
           title = "Nelson-Aalen Cumulative Hazard by Treatment Group",
           xlab = "Time (days)", ylab = "Cumulative Hazard")

#Nelson Aalen by celltype

na_cell <- survfit(Surv(time, status) ~ celltype, data = df, type = "fleming-harrington")
ggsurvplot(na_cell, data = df, fun = "cumhaz", conf.int = TRUE, risk.table = TRUE,
           ggtheme = theme_minimal(), legend.title = "Cancer Type",
           title = "Nelson-Aalen Cumulative Hazard by Cancer Cell Type",
           xlab = "Time (days)", ylab = "Cumulative Hazard")

#Log rank test

logrank_test <- survdiff(Surv(time, status) ~ trt, data = df)
print(logrank_test)
## Call:
## survdiff(formula = Surv(time, status) ~ trt, data = df)
## 
##        N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=1 69       64     64.5   0.00388   0.00823
## trt=2 68       64     63.5   0.00394   0.00823
## 
##  Chisq= 0  on 1 degrees of freedom, p= 0.9

A p value of 0.9 shows no statistically significant difference in survival between the groups. #Cox proportional hazard model

cox_model <- coxph(Surv(time, status) ~ age + trt + celltype + karno, data = df)
summary(cox_model)
## Call:
## coxph(formula = Surv(time, status) ~ age + trt + celltype + karno, 
##     data = df)
## 
##   n= 137, number of events= 128 
## 
##                        coef exp(coef)  se(coef)      z Pr(>|z|)    
## age               -0.008903  0.991136  0.009224 -0.965   0.3345    
## trt                0.303048  1.353980  0.205656  1.474   0.1406    
## celltypesmallcell  0.856340  2.354528  0.271322  3.156   0.0016 ** 
## celltypeadeno      1.178807  3.250494  0.296440  3.977 6.99e-05 ***
## celltypelarge      0.402332  1.495308  0.282544  1.424   0.1545    
## karno             -0.032685  0.967843  0.005409 -6.043 1.51e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## age                  0.9911     1.0089    0.9734    1.0092
## trt                  1.3540     0.7386    0.9048    2.0261
## celltypesmallcell    2.3545     0.4247    1.3834    4.0073
## celltypeadeno        3.2505     0.3076    1.8181    5.8114
## celltypelarge        1.4953     0.6688    0.8595    2.6016
## karno                0.9678     1.0332    0.9576    0.9782
## 
## Concordance= 0.738  (se = 0.021 )
## Likelihood ratio test= 61.98  on 6 df,   p=2e-11
## Wald test            = 62.35  on 6 df,   p=1e-11
## Score (logrank) test = 66.62  on 6 df,   p=2e-12

From the results only small cell type adeno celltype and large cell type statistically affect survival since they have a p-value less than 0.05.