library(readr)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(gt)
library(ggsignif)
library(readxl)
library(survminer)
library(survival)
library(table1)
library(kableExtra)
library(nricens)
#Loading data and generating relevant 2-year variables
EZH2 <- read_excel("/Users/vni/Downloads/Research/Clinical Research/Dr. Landsburg/Vincent new2.xlsx") %>%
mutate(
pfs_2yr = pmin(survtimep, 24),
pfs_status_2yr = ifelse(survtimep <= 24, survstatusp, 0),
os_2yr = pmin(survtimeo, 24),
os_status_2yr = as.numeric(ifelse(survtimeo <= 24, survstatuso, 0))) %>%
mutate(ipi3 == as.numeric(ipi3)) %>%
mutate(newipi012 = case_when(
DEL == 1 | ipi3 == 1 ~ 2,
TP53...14 == 1 & (DEL != 1 & ipi3 != 1) ~ 1,
DEL == 0 & ipi3 == 0 & TP53...14 == 0 ~ 0
))
#List
features <- c("ipi3", "COO", "DEL", "FISH-MYC", "TP53...14")
features_names <- c("International Prognosis Index", "Cell of Origin (Germinal Center)", "Double Expressor Lymphoma", "FISH-MYC Rearrangment", "TP53 Mutation")
#Cox Regression Table Clean Up
clean_up <- function(data){
data %>%
rownames_to_column(var = "Characteristic") %>%
mutate(across(c(`Hazard Ratio`, `95% Confidence Interval (Low)`, `95% Confidence Interval (High)`), ~ round(.x, 2)),
`P Value` = formatC(`P Value`, format = "e", digits = 2),
Characteristic = features_names)
}
# Basic Function to perform Cox regression
perform_cox <- function(time, status, feature) {
cox_model <- coxph(Surv(time, status) ~ get(feature), data = EZH2)
summary_cox <- summary(cox_model)
hr <- summary_cox$coef[1, 2]
ci_low <- summary_cox$conf.int[1, 3]
ci_high <- summary_cox$conf.int[1, 4]
p_value <- summary_cox$coef[1, 5]
return(c(`Hazard Ratio` = hr, `95% Confidence Interval (Low)` = ci_low, `95% Confidence Interval (High)` = ci_high, `P Value` = p_value))
}
#Survival Objects
pfs_object <- Surv(EZH2$pfs_2yr, EZH2$pfs_status_2yr)
os_object <- Surv(EZH2$os_2yr, EZH2$os_status_2yr)
#Fill specific functions
perform_cox_os2 <- function(feature) {
perform_cox(time = EZH2$os_2yr, status = EZH2$os_status_2yr, feature = feature)
}
perform_cox_pfs2 <- function(feature) {
perform_cox(time = EZH2$pfs_2yr, status = EZH2$pfs_status_2yr, feature = feature)
}
# Perform Cox regression
clean_up(as.data.frame(t(sapply(features, perform_cox_os2)))) %>%
kbl(caption = "Univariate Cox Regression (OS - 2 Years)") %>%
kable_classic("striped", full_width = F)
Univariate Cox Regression (OS - 2 Years)
|
Characteristic
|
Hazard Ratio
|
95% Confidence Interval (Low)
|
95% Confidence Interval (High)
|
P Value
|
|
International Prognosis Index
|
2.26
|
1.29
|
3.95
|
4.40e-03
|
|
Cell of Origin (Germinal Center)
|
0.71
|
0.40
|
1.26
|
2.45e-01
|
|
Double Expressor Lymphoma
|
2.28
|
1.29
|
4.01
|
4.41e-03
|
|
FISH-MYC Rearrangment
|
0.91
|
0.13
|
6.58
|
9.24e-01
|
|
TP53 Mutation
|
2.10
|
1.16
|
3.82
|
1.50e-02
|
clean_up(as.data.frame(t(sapply(features, perform_cox_pfs2)))) %>%
kbl(caption = "Univariate Cox Regression (PFS - 2 Years)") %>%
kable_classic("striped", full_width = F)
Univariate Cox Regression (PFS - 2 Years)
|
Characteristic
|
Hazard Ratio
|
95% Confidence Interval (Low)
|
95% Confidence Interval (High)
|
P Value
|
|
International Prognosis Index
|
1.86
|
1.27
|
2.71
|
1.36e-03
|
|
Cell of Origin (Germinal Center)
|
0.72
|
0.49
|
1.05
|
9.12e-02
|
|
Double Expressor Lymphoma
|
2.21
|
1.51
|
3.24
|
4.96e-05
|
|
FISH-MYC Rearrangment
|
1.70
|
0.63
|
4.61
|
2.99e-01
|
|
TP53 Mutation
|
1.66
|
1.08
|
2.53
|
2.00e-02
|
#Multivariate Cox Regression
pfs_multi <- coxph(pfs_object ~ ipi3 + DEL + `TP53...14`, data = EZH2)
summary(pfs_multi)
## Call:
## coxph(formula = pfs_object ~ ipi3 + DEL + TP53...14, data = EZH2)
##
## n= 412, number of events= 109
##
## coef exp(coef) se(coef) z Pr(>|z|)
## ipi31 0.4864 1.6264 0.1961 2.480 0.013132 *
## DEL 0.7176 2.0494 0.1977 3.630 0.000283 ***
## TP53...14 0.4414 1.5549 0.2179 2.026 0.042757 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## ipi31 1.626 0.6148 1.107 2.389
## DEL 2.049 0.4879 1.391 3.019
## TP53...14 1.555 0.6431 1.015 2.383
##
## Concordance= 0.636 (se = 0.025 )
## Likelihood ratio test= 26.04 on 3 df, p=9e-06
## Wald test = 27.92 on 3 df, p=4e-06
## Score (logrank) test = 29.18 on 3 df, p=2e-06
os_multi <- coxph(os_object ~ ipi3 + DEL + `TP53...14`, data = EZH2)
summary(os_multi)
## Call:
## coxph(formula = os_object ~ ipi3 + DEL + TP53...14, data = EZH2)
##
## n= 412, number of events= 49
##
## coef exp(coef) se(coef) z Pr(>|z|)
## ipi31 0.6612 1.9371 0.2909 2.273 0.0230 *
## DEL 0.6920 1.9977 0.2928 2.363 0.0181 *
## TP53...14 0.6474 1.9106 0.3062 2.114 0.0345 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## ipi31 1.937 0.5162 1.095 3.426
## DEL 1.998 0.5006 1.125 3.546
## TP53...14 1.911 0.5234 1.048 3.482
##
## Concordance= 0.675 (se = 0.035 )
## Likelihood ratio test= 17.53 on 3 df, p=6e-04
## Wald test = 18.79 on 3 df, p=3e-04
## Score (logrank) test = 19.92 on 3 df, p=2e-04
#Survival Fits (newipi012)
OS_fit <- survfit(Surv(survtimeo, as.numeric(survstatuso)) ~ newipi012, data = EZH2)
PFS_fit <- survfit(Surv(survtimep, as.numeric(survstatusp)) ~ newipi012, data = EZH2)
ggsurvplot(PFS_fit,
data = EZH2,
legend = "bottom",
legend.title = "IPI",
legend.labs = c("0", "1", "2"),
pval = TRUE,
ggtheme = theme_bw()) +
xlab("Time (Month)") +
ylab("PFS Probability")

ggsurvplot(OS_fit,
data = EZH2,
legend = "bottom",
legend.title = "IPI",
legend.labs = c("0", "1", "2"),
pval = TRUE,
ggtheme = theme_bw()) +
xlab("Time (Month)") +
ylab("OS Probability")

#Breakdown
prop.table(table(EZH2$newipi012))
##
## 0 1 2
## 0.43689320 0.09466019 0.46844660
prop.table(table(EZH2$ripi))
##
## 0 1 2
## 0.1650485 0.5145631 0.3203883
EZH2 %>%
group_by(newipi012) %>%
summarise(percent_PFS_2yr_1 = 100 - mean(pfs_status_2yr == 1, na.rm = TRUE) * 100,
percent_OS_2yr_1 = 100 - mean(os_status_2yr == 1, na.rm = TRUE) * 100) %>%
mutate(across(where(is.numeric), ~round(., 2))) %>%
kbl(caption = "By newipi012") %>%
kable_classic("striped", full_width = F)
By newipi012
|
newipi012
|
percent_PFS_2yr_1
|
percent_OS_2yr_1
|
|
0
|
85.00
|
96.11
|
|
1
|
71.79
|
87.18
|
|
2
|
63.21
|
80.83
|
EZH2 %>%
group_by(ripi) %>%
summarise(percent_PFS_2yr_1 = 100 - mean(pfs_status_2yr == 1, na.rm = TRUE) * 100,
percent_OS_2yr_1 = 100 - mean(os_status_2yr == 1, na.rm = TRUE) * 100) %>%
mutate(across(where(is.numeric), ~round(., 2))) %>%
kbl(caption = "By ripi") %>%
kable_classic("striped", full_width = F)
By ripi
|
ripi
|
percent_PFS_2yr_1
|
percent_OS_2yr_1
|
|
0
|
91.18
|
98.53
|
|
1
|
75.00
|
89.62
|
|
2
|
62.12
|
80.30
|
#General C-Index and AIC#
# Fit Cox proportional hazards models
ripi_pfs <- coxph(pfs_object ~ ripi, data = EZH2)
newipi012_pfs <- coxph(pfs_object ~ newipi012, data = EZH2)
ripi_os <- coxph(os_object ~ ripi, data = EZH2)
newipi012_os <- coxph(os_object ~ newipi012, data = EZH2)
#Creating Cox Model Objects
list_cox <- list(ripi_pfs, newipi012_pfs, ripi_os, newipi012_os)
model_names <- c("PFS by RIPI", "PFS by New IPI 012", "OS by RIPI", "PFS by New IPI 012")
#Generating summary of AIC and C-Index
c_indices <- numeric(length(list_cox))
aic_values <- numeric(length(list_cox))
for (i in seq_along(list_cox)) {
model <- list_cox[[i]]
c_index <- summary(model)$concordance[1]
c_indices[i] <- c_index
aic_values[i] <- AIC(model)
}
c_index_AIC_summary <- data.frame(model_names,
c_indices,
aic_values) %>%
mutate(across(c(c_indices,
aic_values),
~ round(.x, 2))) %>%
rename(`Predictor Model` = model_names,
`Concordance Index` = c_indices,
`Akaike Information Criterion` = aic_values)
c_index_AIC_summary %>%
kbl(caption = "Concordance Index and Akaike Information Criterion by Predictor Model") %>%
kable_classic("striped", full_width = F)
Concordance Index and Akaike Information Criterion by Predictor Model
|
Predictor Model
|
Concordance Index
|
Akaike Information Criterion
|
|
PFS by RIPI
|
0.61
|
1256.38
|
|
PFS by New IPI 012
|
0.62
|
1252.77
|
|
OS by RIPI
|
0.65
|
562.92
|
|
PFS by New IPI 012
|
0.67
|
556.29
|
#Net Reclassification Index
#Defining cut points for "0", "1", and "2"
cut_points <- c(-Inf, 0.5, 1.5, Inf)
#Calculating NRI
nri_result_os <- nricens(time = EZH2$os_2yr,
event = EZH2$os_status_2yr,
p.std = EZH2$ripi,
p.new = EZH2$newipi012,
t0 = 24,
cut = cut_points,
updown = "category")

nri_result_pfs <- nricens(time = EZH2$pfs_2yr,
event = EZH2$pfs_status_2yr,
p.std = EZH2$ripi,
p.new = EZH2$newipi012,
t0 = 24,
cut = cut_points,
updown = "category")

nri_os <- nri_result_os[["nri"]] %>%
mutate(across(where(is.numeric), ~round(., 2)))
nri_pfs <- nri_result_pfs[["nri"]] %>%
mutate(across(where(is.numeric), ~round(., 2)))
nri_os %>%
kbl(caption = "Net Reclassification Index (OS)") %>%
kable_classic("striped", full_width = F)
Net Reclassification Index (OS)
|
|
Estimate
|
Lower
|
Upper
|
|
NRI
|
0.30
|
0.10
|
0.50
|
|
NRI+
|
0.13
|
-0.05
|
0.30
|
|
NRI-
|
0.17
|
0.10
|
0.24
|
|
Pr(Up|Case)
|
0.26
|
0.14
|
0.40
|
|
Pr(Down|Case)
|
0.14
|
0.05
|
0.24
|
|
Pr(Down|Ctrl)
|
0.35
|
0.30
|
0.40
|
|
Pr(Up|Ctrl)
|
0.17
|
0.13
|
0.21
|
nri_pfs %>%
kbl(caption = "Net Reclassification Index (PFS)") %>%
kable_classic("striped", full_width = F)
Net Reclassification Index (PFS)
|
|
Estimate
|
Lower
|
Upper
|
|
NRI
|
0.18
|
0.03
|
0.33
|
|
NRI+
|
-0.01
|
-0.13
|
0.10
|
|
NRI-
|
0.18
|
0.11
|
0.26
|
|
Pr(Up|Case)
|
0.20
|
0.13
|
0.28
|
|
Pr(Down|Case)
|
0.21
|
0.14
|
0.29
|
|
Pr(Down|Ctrl)
|
0.36
|
0.31
|
0.42
|
|
Pr(Up|Ctrl)
|
0.18
|
0.13
|
0.22
|
#Stratified Log Rank
EZH2$newipi012f <- factor(EZH2$newipi012, levels = c(0, 1, 2), labels = c("Low Risk", "Medium Risk", "High Risk"))
pfs_strat <- survdiff(Surv(survtimep, as.numeric(survstatusp)) ~ newipi012 +strata(ripi), data = EZH2)
os_strat <- survdiff(Surv(survtimeo, as.numeric(survstatuso)) ~ newipi012 +strata(ripi), data = EZH2)
print(pfs_strat)
## Call:
## survdiff(formula = Surv(survtimep, as.numeric(survstatusp)) ~
## newipi012 + strata(ripi), data = EZH2)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## newipi012=0 180 39 51.83 3.176 8.744
## newipi012=1 39 12 9.25 0.819 0.922
## newipi012=2 193 79 68.92 1.474 6.407
##
## Chisq= 8.9 on 2 degrees of freedom, p= 0.01
print(os_strat)
## Call:
## survdiff(formula = Surv(survtimeo, as.numeric(survstatuso)) ~
## newipi012 + strata(ripi), data = EZH2)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## newipi012=0 180 11 21.75 5.313 13.788
## newipi012=1 39 6 4.41 0.577 0.653
## newipi012=2 193 46 36.84 2.275 11.629
##
## Chisq= 14.7 on 2 degrees of freedom, p= 7e-04
# Split the data by strata
strata_levels <- unique(EZH2$ripi)
p_values <- data.frame(Stratum = character(), `P Value (OS)` = numeric(), `P Value (PFS)` = numeric(), stringsAsFactors = FALSE)
for (stratum in strata_levels) {
subset_data <- subset(EZH2, ripi == stratum)
surv_object_pfs <- Surv(subset_data$pfs_2yr, subset_data$pfs_status_2yr)
surv_object_os <- Surv(subset_data$os_2yr, subset_data$os_status_2yr)
logrank_test_pfs <- survdiff(surv_object_pfs ~ newipi012, data = subset_data)
logrank_test_os <- survdiff(surv_object_os ~ newipi012, data = subset_data)
p_value_pfs <- 1 - pchisq(logrank_test_pfs$chisq, length(logrank_test_pfs$n) - 1)
p_value_os <- 1 - pchisq(logrank_test_os$chisq, length(logrank_test_os$n) - 1)
p_values <- rbind(p_values, data.frame(Stratum = stratum, `P Value (OS)` = p_value_os, `P Value (PFS)` = p_value_pfs))
}
p_values %>%
rename(`ripi Stratum` = Stratum,
`P Value (OS)` = P.Value..OS.,
`P Value (PFS)` = P.Value..PFS.) %>%
mutate(across(where(is.numeric), ~round(., 3))) %>%
kbl(caption = "") %>%
kable_classic("striped", full_width = F)
|
ripi Stratum
|
P Value (OS)
|
P Value (PFS)
|
|
1
|
0.004
|
0.008
|
|
0
|
0.897
|
0.678
|
|
2
|
0.354
|
0.271
|