library(cbioportalR)
set_cbioportal_db("public")

library(tidyverse)
library(dplyr)
library(ggplot2)
library(readxl)
library(corrplot)
#This is the Yule's coefficient I used for subsequent calculations
calcYulesYBetweenMatrices1 <- function(dm.x, dm.y) {
    if(!all(dm.x %in% c(0,1)) | !all(dm.y %in% c(0,1))) {
    stop("Error: calcYulesYBetweenMatrices() requires binary matrices as input. Please ensure all values are 0/1 or FALSE/TRUE.")
    }
  
  tt <- t( dm.x) %*%  dm.y # Count  TRUE and  TRUE
  tf <- t( dm.x) %*% !dm.y # Count  TRUE and FALSE
  ft <- t(!dm.x) %*%  dm.y # Count FALSE and  TRUE
  ff <- t(!dm.x) %*% !dm.y # Count FALSE and FALSE
  
  Y <- (tt * ff - tf * ft) / (tt * ff + tf * ft)
  return(Y)}
#Stratification by Gleason Scores
p1000 <- available_samples("prad_p1000") %>%
  select(sampleId, patientId, studyId)

p1000_clinical <- get_clinical_by_study(study_id = "prad_p1000",
                                        clinical_attribute = "REVIEWED_GLEASON_CATEGORY",
                                        base_url = 'www.cbioportal.org/api') %>%
  mutate(gleason_category = case_when(
    value == "3+3" ~ 1,
    value == "3+4" ~ 2,
    value == "4+3" ~ 3,
    value == "3+5" ~ 4,
    value == "5+3" ~ 4,
    value == "4+4" ~ 4,
    value == "4+5" ~ 5,
    value == "5+4" ~ 5,
    value == "5+5" ~ 5
    ))

gleason_list <- list()

# Loop through Gleason categories 1 to 5
for (i in 1:5) {
  # Filter p1000_clinical based on Gleason category
  gleason_data <- p1000_clinical %>%
    filter(gleason_category == i) %>%
    select(sampleId, patientId, studyId)

  # Get genetics data for the selected samples
  gleason_data <- get_genetics_by_sample(
    sample_id = c(gleason_data$sampleId),
    study_id = "prad_p1000"
  )

  # Add the result to the list
  gleason_list[[paste0("gleason", i)]] <- gleason_data
}

gleason1 <- gleason_list[["gleason1"]]
gleason2 <- gleason_list[["gleason2"]]
gleason3 <- gleason_list[["gleason3"]]
gleason4 <- gleason_list[["gleason4"]]
gleason5 <- gleason_list[["gleason5"]]

#Loop of the Gleason Dataframes
yule_gleason1 <- data.frame()
yule_gleason2 <- data.frame()
yule_gleason3 <- data.frame()
yule_gleason4 <- data.frame()
yule_gleason5 <- data.frame()

for(i in 1:5) {
  gleason_var <- paste0("gleason", i)

#Get mutation data
 cases <- assign(gleason_var, p1000_clinical %>%
    filter(gleason_category == i) %>%
    select(sampleId, patientId, studyId))
 
 genetics <- get_genetics_by_sample(sample_id = c(cases$sampleId),
                       study_id = "prad_p1000")
 
 prca_public <- as.data.frame(genetics[["mutation"]]) %>%
    select("hugoGeneSymbol", "sampleId") %>%
    rename(Gene_name = "hugoGeneSymbol",
           ID = "sampleId")
 
#Top Genes
  top_prca <- prca_public %>%
    group_by(Gene_name) %>%
    summarise(n = n()) %>%
    arrange(desc(n)) %>%
    head(15)

if ("CDH1" %in% prca_public$Gene_name) {
  # If "CDH1" is present, perform the specified operation
  top_prca <- append(top_prca$Gene_name, "CDH1") %>%
    as.data.frame() %>%
    rename(Gene_name = ".")
  
} else {
  # If "CDH1" is not present, keep it as it is
}

#Clean data
  prca_public_matrix <- prca_public %>%
    mutate(seen = as.numeric(1)) %>%
    pivot_wider(names_from = Gene_name, values_from = seen) %>%
    mutate_all(as.character) %>%
    mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
    select(-ID) %>%
    as.matrix()

#Accounting for multiple mutations of the same gene in the same sample
  prca_public_matrix[is.na(prca_public_matrix)] <- 1

#Storing matrixes
  yule_df <- as.data.frame(calcYulesYBetweenMatrices1(prca_public_matrix, prca_public_matrix))

#Filtering for the top 15 genes + CDH1
  yule.prca_public.filtered <- yule_df[c(top_prca$Gene_name), ] %>%
    select(top_prca$Gene_name)

#Round to Yule's Coefficients to 2 digits
  yule.prca_public.filtered <- as.matrix.data.frame(round(yule.prca_public.filtered, 
                                                          digits = 2))
  
  order_indices <- order(rownames(yule.prca_public.filtered))
  yule_df <- yule.prca_public.filtered[order_indices, order_indices]

if (i == 1) yule_gleason1 <- as.data.frame(yule_df)
  else if (i == 2) yule_gleason2 <- as.data.frame(yule_df)
  else if (i == 3) yule_gleason3 <- as.data.frame(yule_df)
  else if (i == 4) yule_gleason4 <- as.data.frame(yule_df)
  else if (i == 5) yule_gleason5 <- as.data.frame(yule_df)
  
#Generating p-values for the correlations
p_prca_public <- cor.mtest(yule_df)

#Visualize only the statistically significant comutations
corrplot.mixed(yule_df, 
               lower = "ellipse", 
               upper = "number", 
               number.cex = .6,
               tl.pos = 'lt',
               p.mat = p_prca_public$p,
               sig.level = c(0.05),
               insig = 'blank')
}

#Stratification by Sample Type: Primary vs. Metastatic
p1000 <- available_samples("prad_p1000") %>%
  select(sampleId, patientId, studyId)

p1000_clinical <- get_clinical_by_study(study_id = "prad_p1000",
                                        clinical_attribute = "SAMPLE_TYPE",
                                        base_url = 'www.cbioportal.org/api')

sample_type_list <- list()
sample_types <- c("Primary", "Metastasis")

# Loop through Sample Types
for (i in sample_types) {
  # Filter p1000_clinical based on Sampe Type category
  sample_type_data <- p1000_clinical %>%
    filter(value == i) %>%
    select(sampleId, patientId, studyId)

  # Get genetics data for the selected samples
  sample_type_data <- get_genetics_by_sample(
    sample_id = c(sample_type_data$sampleId),
    study_id = "prad_p1000"
  )

  # Add the result to the list
  sample_type_list[[paste0(i)]] <- sample_type_data
}


Primary <- sample_type_list[["Primary"]]
Metastasis <- sample_type_list[["Metastasis"]]

#Loop
for(i in sample_types) {
  var <- paste0(i)

#Get mutation data
 cases <- assign(var, p1000_clinical %>%
    filter(value == i) %>%
    select(sampleId, patientId, studyId))
 
 genetics <- get_genetics_by_sample(sample_id = c(cases$sampleId),
                       study_id = "prad_p1000")
 
 prca_public <- as.data.frame(genetics[["mutation"]]) %>%
    select("hugoGeneSymbol", "sampleId") %>%
    rename(Gene_name = "hugoGeneSymbol",
           ID = "sampleId")
 
#Top Genes
  top_prca <- prca_public %>%
    group_by(Gene_name) %>%
    summarise(n = n()) %>%
    arrange(desc(n)) %>%
    head(15)

if ("CDH1" %in% prca_public$Gene_name) {
  # If "CDH1" is present, perform the specified operation
  top_prca <- append(top_prca$Gene_name, "CDH1") %>%
    as.data.frame() %>%
    rename(Gene_name = ".")
  
} else {
  # If "CDH1" is not present, keep it as it is
}

#Clean data
  prca_public_matrix <- prca_public %>%
    mutate(seen = as.numeric(1)) %>%
    pivot_wider(names_from = Gene_name, values_from = seen) %>%
    mutate_all(as.character) %>%
    mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
    select(-ID) %>%
    as.matrix()

#Accounting for multiple mutations of the same gene in the same sample
  prca_public_matrix[is.na(prca_public_matrix)] <- 1

#Storing matrixes
  yule_df <- as.data.frame(calcYulesYBetweenMatrices1(prca_public_matrix, prca_public_matrix))

#Filtering for the top 15 genes + CDH1
  yule.prca_public.filtered <- yule_df[c(top_prca$Gene_name), ] %>%
    select(top_prca$Gene_name)

#Round to Yule's Coefficients to 2 digits
  yule.prca_public.filtered <- as.matrix.data.frame(round(yule.prca_public.filtered, 
                                                          digits = 2))
  
  order_indices <- order(rownames(yule.prca_public.filtered))
  yule_df <- yule.prca_public.filtered[order_indices, order_indices]

#Generating p-values for the correlations
p_prca_public <- cor.mtest(yule_df)

#Visualize only the statistically significant comutations
corrplot.mixed(yule_df, 
               lower = "ellipse", 
               upper = "number", 
               number.cex = .6,
               tl.pos = 'lt',
               p.mat = p_prca_public$p,
               sig.level = c(0.05),
               insig = 'blank')
}

#Stratification by TMB
p1000 <- available_samples("prad_p1000") %>%
  select(sampleId, patientId, studyId)

p1000_clinical <- get_clinical_by_study(study_id = "prad_p1000",
                                        clinical_attribute = "MUTATION_BURDEN",
                                        base_url = 'www.cbioportal.org/api') %>%
  mutate(tmd_type = ifelse(value >= 10, "high", "low"))

tmd_type_list <- list()
tmd_types <- c("high", "low")

# Loop through TMB Types
for (i in tmd_types) {
  # Filter p1000_clinical based on TMB Type category
  tmd_type_data <- p1000_clinical %>%
    filter(tmd_type == i) %>%
    select(sampleId, patientId, studyId)

  # Get genetics data for the selected samples
  tmd_type_data <- get_genetics_by_sample(
    sample_id = c(tmd_type_data$sampleId),
    study_id = "prad_p1000"
  )

  # Add the result to the list
  tmd_type_list[[paste0(i)]] <- tmd_type_data
}


high <- tmd_type_list[["high"]]
low <- tmd_type_list[["low"]]

#Loop
for(i in tmd_types) {
  var <- paste0(i)

#Get mutation data
 cases <- assign(var, p1000_clinical %>%
    filter(tmd_type == i) %>%
    select(sampleId, patientId, studyId))
 
 genetics <- get_genetics_by_sample(sample_id = c(cases$sampleId),
                       study_id = "prad_p1000")
 
 prca_public <- as.data.frame(genetics[["mutation"]]) %>%
    select("hugoGeneSymbol", "sampleId") %>%
    rename(Gene_name = "hugoGeneSymbol",
           ID = "sampleId")
 
#Top Genes
  top_prca <- prca_public %>%
    group_by(Gene_name) %>%
    summarise(n = n()) %>%
    arrange(desc(n)) %>%
    head(15)

if ("CDH1" %in% prca_public$Gene_name) {
  # If "CDH1" is present, perform the specified operation
  top_prca <- append(top_prca$Gene_name, "CDH1") %>%
    as.data.frame() %>%
    rename(Gene_name = ".")
  
} else {
  # If "CDH1" is not present, keep it as it is
}

#Clean data
  prca_public_matrix <- prca_public %>%
    mutate(seen = as.numeric(1)) %>%
    pivot_wider(names_from = Gene_name, values_from = seen) %>%
    mutate_all(as.character) %>%
    mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
    select(-ID) %>%
    as.matrix()

#Accounting for multiple mutations of the same gene in the same sample
  prca_public_matrix[is.na(prca_public_matrix)] <- 1

#Storing matrixes
  yule_df <- as.data.frame(calcYulesYBetweenMatrices1(prca_public_matrix, prca_public_matrix))

#Filtering for the top 15 genes + CDH1
  yule.prca_public.filtered <- yule_df[c(top_prca$Gene_name), ] %>%
    select(top_prca$Gene_name)

#Round to Yule's Coefficients to 2 digits
  yule.prca_public.filtered <- as.matrix.data.frame(round(yule.prca_public.filtered, 
                                                          digits = 2))
  
  order_indices <- order(rownames(yule.prca_public.filtered))
  yule_df <- yule.prca_public.filtered[order_indices, order_indices]

#Generating p-values for the correlations
p_prca_public <- cor.mtest(yule_df)

#Visualize only the statistically significant comutations
corrplot.mixed(yule_df, 
               lower = "ellipse", 
               upper = "number", 
               number.cex = .6,
               tl.pos = 'lt',
               p.mat = p_prca_public$p,
               sig.level = c(0.05),
               insig = 'blank')
}

#Stratification by Age
p1000 <- available_samples("prad_p1000") %>%
  select(sampleId, patientId, studyId)

#Pulling Data
p1000_clinical <- get_clinical_by_study(study_id = "prad_p1000",
                                        clinical_attribute = "AGE",
                                        base_url = 'www.cbioportal.org/api')%>%
  mutate(age_type = ifelse(value >= 50, "high", "low")) %>%
  select(-sampleId)

#Fixing the lack of sampleId from the data pull
p1000_sampleId <- p1000 %>%
  select(sampleId, patientId)

p1000_clinical <- left_join(p1000_clinical, p1000_sampleId, by = "patientId")

age_type_list <- list()
age_types <- c("high", "low")

  age_type_data <- p1000_clinical %>%
    filter(age_type == "high") %>%
    select(sampleId, patientId, studyId)
  
# Loop through Age Types
for (i in age_types) {
  # Filter p1000_clinical based on Age Type category
  age_type_data <- p1000_clinical %>%
    filter(age_type == i) %>%
    select(sampleId, patientId, studyId)

  # Get genetics data for the selected samples
  age_type_data <- get_genetics_by_sample(
    sample_id = c(age_type_data$sampleId),
    study_id = "prad_p1000"
  )

  # Add the result to the list
  age_type_list[[paste0(i)]] <- age_type_data
}


high <- age_type_list[["high"]]
low <- age_type_list[["low"]]

#Loop
for(i in age_types) {
  var <- paste0(i)

#Get mutation data
 cases <- assign(var, p1000_clinical %>%
    filter(age_type == i) %>%
    select(sampleId, patientId, studyId))
 
 genetics <- get_genetics_by_sample(sample_id = c(cases$sampleId),
                       study_id = "prad_p1000")
 
 prca_public <- as.data.frame(genetics[["mutation"]]) %>%
    select("hugoGeneSymbol", "sampleId") %>%
    rename(Gene_name = "hugoGeneSymbol",
           ID = "sampleId")
 
#Top Genes
  top_prca <- prca_public %>%
    group_by(Gene_name) %>%
    summarise(n = n()) %>%
    arrange(desc(n)) %>%
    head(15)

if ("CDH1" %in% prca_public$Gene_name) {
  # If "CDH1" is present, perform the specified operation
  top_prca <- append(top_prca$Gene_name, "CDH1") %>%
    as.data.frame() %>%
    rename(Gene_name = ".")
  
} else {
  # If "CDH1" is not present, keep it as it is
}

#Clean data
  prca_public_matrix <- prca_public %>%
    mutate(seen = as.numeric(1)) %>%
    pivot_wider(names_from = Gene_name, values_from = seen) %>%
    mutate_all(as.character) %>%
    mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
    select(-ID) %>%
    as.matrix()

#Accounting for multiple mutations of the same gene in the same sample
  prca_public_matrix[is.na(prca_public_matrix)] <- 1

#Storing matrixes
  yule_df <- as.data.frame(calcYulesYBetweenMatrices1(prca_public_matrix, prca_public_matrix))

#Filtering for the top 15 genes + CDH1
  yule.prca_public.filtered <- yule_df[c(top_prca$Gene_name), ] %>%
    select(top_prca$Gene_name)

#Round to Yule's Coefficients to 2 digits
  yule.prca_public.filtered <- as.matrix.data.frame(round(yule.prca_public.filtered, 
                                                          digits = 2))
  
  order_indices <- order(rownames(yule.prca_public.filtered))
  yule_df <- yule.prca_public.filtered[order_indices, order_indices]

#Generating p-values for the correlations
p_prca_public <- cor.mtest(yule_df)

#Visualize only the statistically significant comutations
corrplot.mixed(yule_df, 
               lower = "ellipse", 
               upper = "number", 
               number.cex = .6,
               tl.pos = 'lt',
               p.mat = p_prca_public$p,
               sig.level = c(0.05),
               insig = 'blank')
}

#Stratification by Fusion
p1000 <- available_samples("prad_p1000") %>%
  select(sampleId, patientId, studyId)

#Pulling Data
p1000_fusion <- get_fusions_by_study(study_id = "prad_p1000",
                                        base_url = 'www.cbioportal.org/api') %>%
  mutate(fusion_type = case_when(
    eventInfo == "TMPRSS2-ERG fusion" ~ "ERG",
    eventInfo == "TMPRSS2-ETV1 fusion" ~ "ETV1",
    eventInfo == "TMPRSS2-ETV4 fusion" ~ "ETV4",
    eventInfo == "TMPRSS2-FLI1 fusion" ~ "FLI4"))

fusion_data_list <- list()
fusion_types <- c("ERG", "ETV1", "ETV4", "FLI4")

# Loop through Fusion Types
for (i in fusion_types) {
  # Filter p1000_fusion based on Fusion Type
  fusion_data <- p1000_fusion %>%
    filter(fusion_type == i) %>%
    select(sampleId, patientId, studyId)

  # Get genetics data for the selected samples
  fusion_data <- get_genetics_by_sample(
    sample_id = c(fusion_data$sampleId),
    study_id = "prad_p1000"
  )

  # Add the result to the list
  fusion_data_list[[paste0(i)]] <- fusion_data
}

#New list of fusion types with "No_Fusion"
fusion_types <- c("ERG", "ETV1", "ETV4", "FLI4", "No_Fusion")

#Loop
for(i in fusion_types) {
  var <- paste0(i)

#Get mutation data for the fusion events
if (i %in% c("ERG", "ETV1", "ETV4", "FLI4")) {
 cases <- assign(var, p1000_fusion %>%
    filter(fusion_type == i) %>%
    select(sampleId, patientId, studyId))
 
 genetics <- get_genetics_by_sample(sample_id = c(cases$sampleId),
                       study_id = "prad_p1000")
 
 prca_public <- as.data.frame(genetics[["mutation"]]) %>%
    select("hugoGeneSymbol", "sampleId") %>%
    rename(Gene_name = "hugoGeneSymbol",
           ID = "sampleId")
 }
else {
  #Pulling the samples with no fusion events
  all_prad_p1000 <- get_genetics_by_study(
    study_id = "prad_p1000")
  
  prca_public <- all_prad_p1000[["mutation"]] %>%
    filter(!sampleId %in% c(p1000_fusion$sampleId)) %>%
    select("hugoGeneSymbol", "sampleId") %>%
    rename(Gene_name = "hugoGeneSymbol",
           ID = "sampleId")
}

#Top Genes
  top_prca <- prca_public %>%
    group_by(Gene_name) %>%
    summarise(n = n()) %>%
    arrange(desc(n)) %>%
    head(15)

if ("CDH1" %in% prca_public$Gene_name) {
  # If "CDH1" is present, perform the specified operation
  top_prca <- append(top_prca$Gene_name, "CDH1") %>%
    as.data.frame() %>%
    rename(Gene_name = ".")
  
} else {
  # If "CDH1" is not present, keep it as it is
}

#Clean data
  prca_public_matrix <- prca_public %>%
    mutate(seen = as.numeric(1)) %>%
    pivot_wider(names_from = Gene_name, values_from = seen) %>%
    mutate_all(as.character) %>%
    mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
    select(-ID) %>%
    as.matrix()

#Accounting for multiple mutations of the same gene in the same sample
  prca_public_matrix[is.na(prca_public_matrix)] <- 1

#Storing matrixes
  yule_df <- as.data.frame(calcYulesYBetweenMatrices1(prca_public_matrix, prca_public_matrix))

#Filtering for the top 15 genes + CDH1
  yule.prca_public.filtered <- yule_df[c(top_prca$Gene_name), ] %>%
    select(top_prca$Gene_name)

#Round to Yule's Coefficients to 2 digits
  yule.prca_public.filtered <- as.matrix.data.frame(round(yule.prca_public.filtered, 
                                                          digits = 2))
  
  order_indices <- order(rownames(yule.prca_public.filtered))
  yule_df <- yule.prca_public.filtered[order_indices, order_indices]

#Generating p-values for the correlations
p_prca_public <- cor.mtest(yule_df)

#Visualize only the statistically significant comutations
corrplot.mixed(yule_df, 
               lower = "ellipse", 
               upper = "number", 
               number.cex = .6,
               tl.pos = 'lt',
               p.mat = p_prca_public$p,
               sig.level = c(0.05),
               insig = 'blank')
}