library(readr)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(ggsignif)
library(readxl)
library(survminer)
library(survival)
library(table1)
library(kableExtra)
EZH2 <- read_excel("/Users/vni/Downloads/Research/Clinical Research/Dr. Landsburg/Vincent EZH2.xlsx") %>%
  rename(FISH = 'FISH-MYC')
#KM Plots (PFS and OS)
#Generating Mutation Characterization
EZH2_1 <- EZH2 %>%
  mutate(Mutation = case_when(
    !is.na(`EZH2 var`) & is.na(`TP53 var`) ~ "EZH2 Only",
    is.na(`EZH2 var`) & !is.na(`TP53 var`) ~ "TP53 Only",
    !is.na(`EZH2 var`) & !is.na(`TP53 var`) ~ "EZH2/TP53 Co-Mutation",
    TRUE ~ "Neither"),
    Mutation = factor(Mutation, levels = c("EZH2 Only", "TP53 Only", "EZH2/TP53 Co-Mutation", "Neither"))
    )

#Reorder Mutation
new_order <- c("EZH2 Only", "TP53 Only", "EZH2/TP53 Co-Mutation", "Neither")
EZH2_1$Mutation <- factor(EZH2_1$Mutation, levels = new_order)

#Survival Fits 
PFS_fit <- survfit(Surv(PFS, as.numeric(death)) ~ Mutation, data = EZH2_1)
OS_fit <- survfit(Surv(OS, as.numeric(death)) ~ Mutation, data = EZH2_1)

# Log-rank test for pairwise comparisons
PFS_p_values <- pairwise_survdiff(Surv(PFS, as.numeric(death)) ~ Mutation, data = EZH2_1, p.adjust.method = "holm")
OS_p_values <- pairwise_survdiff(Surv(OS, as.numeric(death)) ~ Mutation, data = EZH2_1, p.adjust.method = "holm")

# Print the p-values for pairwise comparisons
print(PFS_p_values)
## 
##  Pairwise comparisons using Log-Rank test 
## 
## data:  EZH2_1 and Mutation 
## 
##                       EZH2 Only TP53 Only EZH2/TP53 Co-Mutation
## TP53 Only             0.34      -         -                    
## EZH2/TP53 Co-Mutation 1.00      1.00      -                    
## Neither               1.00      7.6e-06   1.00                 
## 
## P value adjustment method: holm
print(OS_p_values)
## 
##  Pairwise comparisons using Log-Rank test 
## 
## data:  EZH2_1 and Mutation 
## 
##                       EZH2 Only TP53 Only EZH2/TP53 Co-Mutation
## TP53 Only             0.33      -         -                    
## EZH2/TP53 Co-Mutation 1.00      1.00      -                    
## Neither               1.00      1.3e-05   1.00                 
## 
## P value adjustment method: holm
#Plot
ggsurvplot(PFS_fit, 
           data = EZH2_1,
           legend = "bottom",
           legend.title = "Mutation",
           legend.labs = c("EZH2 Only", "TP53 Only", "EZH2/TP53 Co-Mutation", "Neither"),
           pval = TRUE,
           ggtheme = theme_bw()) +
  xlab("Time (Month)") +
  ylab("PFS Probability")

ggsurvplot(OS_fit, data = EZH2_1,
           legend = "bottom",
           legend.title = "Mutation",
           legend.labs = c("EZH2 Only", "TP53 Only", "EZH2/TP53 Co-Mutation", "Neither"),
           pval = TRUE,
           ggtheme = theme_bw()) +
  xlab("Time (Month)") +
  ylab("OS Probability")

#KM Plots (MYC-FISH)
#Generating MYC and EZH2 interaction Characterization
EZH2_MYC <- EZH2 %>%
  mutate(EZH2_MYC = case_when(
    !is.na(`EZH2 var`) & FISH == 1 ~ "EZH2 + MYC-FISH",
    !is.na(`EZH2 var`) & FISH == 0 ~ "EZH2 Only",
    is.na(`EZH2 var`) & FISH == 1 ~ "MYC-FISH Only",
    TRUE ~ "Neither"),
    EZH2_MYC = factor(EZH2_MYC, levels = c("EZH2 + MYC-FISH", "EZH2 Only", "MYC-FISH Only", "Neither"))
    )

#Reorder Mutation
new_order_MYC <- c("EZH2 + MYC-FISH", "EZH2 Only", "MYC-FISH Only", "Neither")
EZH2_MYC$EZH2_MYC <- factor(EZH2_MYC$EZH2_MYC, levels = new_order_MYC)

#Survival Fits 
MYC_PFS_fit <- survfit(Surv(PFS, as.numeric(death)) ~ EZH2_MYC, data = EZH2_MYC)
MYC_OS_fit <- survfit(Surv(OS,as.numeric(death)) ~ EZH2_MYC, data = EZH2_MYC)

# Log-rank test for pairwise comparisons
MYC_PFS_p_values <- pairwise_survdiff(Surv(PFS, as.numeric(death)) ~ EZH2_MYC, data = EZH2_MYC, p.adjust.method = "holm")
MYC_OS_p_values <- pairwise_survdiff(Surv(OS, as.numeric(death)) ~ EZH2_MYC, data = EZH2_MYC, p.adjust.method = "holm")

# Print the p-values for pairwise comparisons
print(MYC_PFS_p_values)
## 
##  Pairwise comparisons using Log-Rank test 
## 
## data:  EZH2_MYC and EZH2_MYC 
## 
##               EZH2 + MYC-FISH EZH2 Only MYC-FISH Only
## EZH2 Only     0.913           -         -            
## MYC-FISH Only 0.913           0.004     -            
## Neither       0.913           0.913     7.9e-11      
## 
## P value adjustment method: holm
print(MYC_OS_p_values)
## 
##  Pairwise comparisons using Log-Rank test 
## 
## data:  EZH2_MYC and EZH2_MYC 
## 
##               EZH2 + MYC-FISH EZH2 Only MYC-FISH Only
## EZH2 Only     0.8453          -         -            
## MYC-FISH Only 0.9109          0.0042    -            
## Neither       0.9109          0.9109    2.3e-10      
## 
## P value adjustment method: holm
#Plot
ggsurvplot(MYC_PFS_fit, 
           data = EZH2_MYC,
           legend = "bottom",
           legend.title = "Mutation",
           legend.labs = c("EZH2 + MYC-FISH", "EZH2 Only", "MYC-FISH Only", "Neither"),
           pval = TRUE,
           ggtheme = theme_bw()) +
  xlab("Time (Month)") +
  ylab("PFS Probability")

ggsurvplot(MYC_OS_fit, 
           data = EZH2_MYC,
           legend = "bottom",
           legend.title = "Mutation",
           legend.labs = c("EZH2 + MYC-FISH", "EZH2 Only", "MYC-FISH Only", "Neither"),
           pval = TRUE,
           ggtheme = theme_bw()) +
  xlab("Time (Month)") +
  ylab("OS Probability")

var_list <- list("DEL", "ipi3", "COO", "FISH", "DHL", "TP53")
x_square_results <- matrix()
p_values <- matrix()

for(i in var_list) {
  #Create contingency tables  
  con_name <- paste0(i, "_contigency")
  contingency <- table(EZH2$EZH2, EZH2[[i]])
  assign(con_name, contingency)

  #Chi-Square Test
  chisq_name <- paste0(i, "_chisq")
  chi_square_result <- chisq.test(contingency)
  assign(chisq_name, chi_square_result)
  
  # Print the chi-square test result
  cat("Chi-square test for", con_name, ":\n")
  print(chi_square_result)
  cat("\n")
  
  x_square_results[i] <- chi_square_result[["statistic"]][["X-squared"]]
  p_values[i] <- chi_square_result[["p.value"]]
  }
## Chi-square test for DEL_contigency :
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency
## X-squared = 0.16205, df = 1, p-value = 0.6873
## 
## 
## Chi-square test for ipi3_contigency :
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency
## X-squared = 5.2863, df = 1, p-value = 0.02149
## 
## 
## Chi-square test for COO_contigency :
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency
## X-squared = 22.103, df = 1, p-value = 2.584e-06
## 
## 
## Chi-square test for FISH_contigency :
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency
## X-squared = 5.566, df = 1, p-value = 0.01831
## 
## 
## Chi-square test for DHL_contigency :
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency
## X-squared = 28.119, df = 1, p-value = 1.141e-07
## 
## 
## Chi-square test for TP53_contigency :
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency
## X-squared = 0.075904, df = 1, p-value = 0.7829
#Cleaning up the results
x_square_full <- cbind(as.data.frame(x_square_results), as.data.frame(p_values)) %>%
  rename(`χ2` = x_square_results,
         `P Value` = p_values) %>%
  filter(!is.na(`χ2`)) %>%
  mutate(`χ2` = round(as.numeric(`χ2`), 2),
         `P Value` = format(as.numeric(`P Value`), scientific = TRUE, digits = 2))

#Rename rownames
new_rownames <- c("Double Expressor Lymphoma", "International Prognotic Index Score", "Cell of Origin (Germinal Center)", "FISH-MYC Rearrangement", "Double Hit Lymphoma", "TP53 Mutation")

rownames(x_square_full) <- new_rownames

#Table
x_square_full %>%
  kbl(caption = "Association of Incidence") %>%
  kable_classic("striped", full_width = F)
Association of Incidence
χ2 P Value
Double Expressor Lymphoma 0.16 6.9e-01
International Prognotic Index Score 5.29 2.1e-02
Cell of Origin (Germinal Center) 22.10 2.6e-06
FISH-MYC Rearrangement 5.57 1.8e-02
Double Hit Lymphoma 28.12 1.1e-07
TP53 Mutation 0.08 7.8e-01
#Convert EZH2_mut to a factor for modeling
EZH2$EZH2 <- factor(EZH2$EZH2, levels = c(0, 1), labels = c("No Mutation", "Mutation"))

#Stratified Log Rank Formula
strat_log <- function(outcome, var, data) {
    formula_str <- paste("Surv(", outcome, ", as.numeric(death)) ~ EZH2 + strata(", var, ")", sep = "")
    formula <- as.formula(formula_str)
    survdiff(formula, data = data)
}

OS_strat_log_results <- matrix()
OS_strat_p_value <- matrix()
PFS_strat_log_results <- matrix()
PFS_strat_p_value <- matrix()

#Loop Iterations for Stratified Log Rank
for(i in var_list) {
  var <- i
  OS_strat <- strat_log(outcome = "OS", var, data = EZH2_1)
  PFS_strat <- strat_log(outcome = "PFS", var, data = EZH2_1)
  print(OS_strat)
  print(PFS_strat)
  
  OS_strat_log_results[i] <- OS_strat[["chisq"]]
  OS_strat_p_value[i] <- OS_strat[["pvalue"]]
  PFS_strat_log_results[i] <- PFS_strat[["chisq"]]
  PFS_strat_p_value[i] <- PFS_strat[["pvalue"]]
}
## Call:
## survdiff(formula = formula, data = data)
## 
## n=874, 49 observations deleted due to missingness.
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 833      142   141.43   0.00231    0.0522
## EZH2=1  41        6     6.57   0.04976    0.0522
## 
##  Chisq= 0.1  on 1 degrees of freedom, p= 0.8 
## Call:
## survdiff(formula = formula, data = data)
## 
## n=874, 49 observations deleted due to missingness.
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 833      142   141.43   0.00231    0.0521
## EZH2=1  41        6     6.57   0.04968    0.0521
## 
##  Chisq= 0.1  on 1 degrees of freedom, p= 0.8 
## Call:
## survdiff(formula = formula, data = data)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 876      155    151.6    0.0763       1.2
## EZH2=1  47        7     10.4    1.1118       1.2
## 
##  Chisq= 1.2  on 1 degrees of freedom, p= 0.3 
## Call:
## survdiff(formula = formula, data = data)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 876      155    151.9    0.0648      1.05
## EZH2=1  47        7     10.1    0.9705      1.05
## 
##  Chisq= 1  on 1 degrees of freedom, p= 0.3 
## Call:
## survdiff(formula = formula, data = data)
## 
## n=488, 435 observations deleted due to missingness.
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 452       83    80.97     0.051      0.78
## EZH2=1  36        4     6.03     0.685      0.78
## 
##  Chisq= 0.8  on 1 degrees of freedom, p= 0.4 
## Call:
## survdiff(formula = formula, data = data)
## 
## n=488, 435 observations deleted due to missingness.
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 452       83    81.13    0.0431     0.677
## EZH2=1  36        4     5.87    0.5952     0.677
## 
##  Chisq= 0.7  on 1 degrees of freedom, p= 0.4 
## Call:
## survdiff(formula = formula, data = data)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 876      155   152.31    0.0477     0.815
## EZH2=1  47        7     9.69    0.7489     0.815
## 
##  Chisq= 0.8  on 1 degrees of freedom, p= 0.4 
## Call:
## survdiff(formula = formula, data = data)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 876      155   152.47    0.0421     0.728
## EZH2=1  47        7     9.53    0.6737     0.728
## 
##  Chisq= 0.7  on 1 degrees of freedom, p= 0.4 
## Call:
## survdiff(formula = formula, data = data)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 876      155    151.6    0.0752      1.29
## EZH2=1  47        7     10.4    1.0994      1.29
## 
##  Chisq= 1.3  on 1 degrees of freedom, p= 0.3 
## Call:
## survdiff(formula = formula, data = data)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 876      155    151.6    0.0783      1.32
## EZH2=1  47        7     10.4    1.1358      1.32
## 
##  Chisq= 1.3  on 1 degrees of freedom, p= 0.3 
## Call:
## survdiff(formula = formula, data = data)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 876      155      154   0.00647     0.131
## EZH2=1  47        7        8   0.12461     0.131
## 
##  Chisq= 0.1  on 1 degrees of freedom, p= 0.7 
## Call:
## survdiff(formula = formula, data = data)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## EZH2=0 876      155   154.14   0.00475    0.0981
## EZH2=1  47        7     7.86   0.09313    0.0981
## 
##  Chisq= 0.1  on 1 degrees of freedom, p= 0.8
#Cleaning up Stratified Log Rank Results
OS_strat_log_results <- cbind(as.data.frame(OS_strat_log_results), as.data.frame(OS_strat_p_value)) %>%
  filter(!is.na(OS_strat_log_results)) %>%
  mutate(`χ2` = round(as.numeric(OS_strat_log_results), 2),
         `P Value` = format(as.numeric(OS_strat_p_value), scientific = TRUE, digits = 2))

OS_strat_log_results <- OS_strat_log_results[, c(3, 4)]

PFS_strat_log_results <- cbind(as.data.frame(PFS_strat_log_results), as.data.frame(PFS_strat_p_value)) %>%
  filter(!is.na(PFS_strat_log_results)) %>%
  mutate(`χ2` = round(as.numeric(PFS_strat_log_results), 2),
         `P Value` = format(as.numeric(PFS_strat_p_value), scientific = TRUE, digits = 2))

PFS_strat_log_results <- PFS_strat_log_results[, c(3, 4)]

#Rename rownames
rownames(OS_strat_log_results) <- new_rownames
rownames(PFS_strat_log_results) <- new_rownames


#Table
OS_strat_log_results %>%
  kbl(caption = "Stratified Log Rank (Overall Survival)") %>%
  kable_classic("striped", full_width = F)
Stratified Log Rank (Overall Survival)
χ2 P Value
Double Expressor Lymphoma 0.05 8.2e-01
International Prognotic Index Score 1.20 2.7e-01
Cell of Origin (Germinal Center) 0.78 3.8e-01
FISH-MYC Rearrangement 0.82 3.7e-01
Double Hit Lymphoma 1.29 2.6e-01
TP53 Mutation 0.13 7.2e-01
PFS_strat_log_results %>%
  kbl(caption = "Stratified Log Rank (Progression-Free Survival)") %>%
  kable_classic("striped", full_width = F)
Stratified Log Rank (Progression-Free Survival)
χ2 P Value
Double Expressor Lymphoma 0.05 8.2e-01
International Prognotic Index Score 1.05 3.1e-01
Cell of Origin (Germinal Center) 0.68 4.1e-01
FISH-MYC Rearrangement 0.73 3.9e-01
Double Hit Lymphoma 1.32 2.5e-01
TP53 Mutation 0.10 7.5e-01