#Loading libraries and dataset
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(readxl)
library(corrplot)
## corrplot 0.92 loaded
library(cbioportalR)
## Warning: package 'cbioportalR' was built under R version 4.3.1
library(knitr) 
  
#Use suppressWarnings() to prevent crashes from the error message
suppressWarnings({ 
  prad_tcga_pan_can_atlas_2018 <- read_excel("/Users/vni/Downloads/Research/PLUMS/PrCa Public Code/By Dataset/Data/prad_tcga_pan_can_atlas_2018.xlsx")
}) 
###################FUNCTIONS###################
# custom labels function using sample size, mutation type and gene name
# LABELand Calculate mutation sum for each gene in the raw_data
sum_muts_per_gene <- function(data) {

  sums <- c(colSums(data))
  
  list_sums <- as.list(sums)
  
  new_col_names <- paste0(colnames(data), " (N = ", list_sums,")")
  
  colnames(data) <- new_col_names
  
  return(data)
}
# Function called by compute_yule_from_raw_data to make the contigency table
# DO NOT CALL THIS 
table_for_yule <- function(data) {
  
  count_1_1 <- 0
  count_1_0 <- 0
  count_0_1 <- 0
  count_0_0 <- 0
  
  for (i in 1:nrow(data)) {
    
    if (all(data[i,] == 1)) {
      
      count_1_1 <- count_1_1 + 1
      
    } else if (data[i,1] == 1 & data[i,2] == 0) {
      
      count_1_0 <- count_1_0 + 1
      
    } else if (data[i,1] == 0 & data[i,2] == 1) {
      
      count_0_1 <- count_0_1 + 1
      
    } else if (all(data[i,] == 0)) {
      
      count_0_0 <- count_0_0 + 1
      
    } else {
      
      print(data[i,])
      stop("Value is not a 0 or 1")
      
    }
    
  }
  contingency_table <- matrix(c(count_1_1, count_0_1, count_1_0, count_0_0), nrow = 2, ncol = 2)
  return(contingency_table)
}
# Function called by compute_yule_from_raw_data to calculate yule
# DO NOT CALL THIS 
# c_table = contingency table
yuleJC <- function(c_table) {
  numerator <- (c_table[1] * c_table[4]) - (c_table[2] * c_table[3])
  denominator <- (c_table[1] * c_table[4]) + (c_table[2] * c_table[3])
  if (denominator == 0) {
    return(NA)
  } else {
    result <- numerator / denominator
    return(result)
  }
}

# Function to compute Yule and Fisher using a matrix with multiple columns, each column is a gene
# compares each column with another column, makes the contingency table and computes Yule and Fisher
#Compute multiple Yules by comparing the columns
compute_yule_from_raw_data <- function(raw_data) {
  
  #create empty matrices for Yule results and fisher results with the row and col names
  results_matrix <- matrix(0, nrow = ncol(raw_data), ncol = ncol(raw_data))
  colnames(results_matrix) <- rownames(results_matrix) <- colnames(raw_data)
  #make the matrix diagonal = 1
  diag(results_matrix) <- 1
  results_fisher <- results_matrix
  diag(results_fisher) <- 0
  
  for (i in 1:(ncol(raw_data)-1)) {
    
    for (j in (i+1):ncol(raw_data)) {
      
      subset_data <- raw_data[, c(i,j)]
      
      colnames(subset_data) <- colnames(raw_data[, c(i,j)])
      
      labels <- colnames(subset_data)
      
      table_for_yule(subset_data)
      
      result <- table_for_yule(subset_data)
      
      x <- labels[1]
      y <- labels[2]
      
      results_matrix[x,y] <- yuleJC(result)
      
      #mirror copy the results above the diagonal to below the diagonal
      results_matrix[lower.tri(results_matrix)] <- t(results_matrix)[lower.tri(results_matrix)]
      
      rounded_results_matrix <- round(results_matrix, digits = 2)
      
      # use of R's fisher statistical analysis
      fisher_result <- fisher.test(result)
      p_value <- fisher_result$p.value
      results_fisher[x,y] <- p_value
      
      #mirror copy the results above the diagonal to below the diagonal
      results_fisher[lower.tri(results_fisher)] <- t(results_fisher)[lower.tri(results_fisher)]
      
      rounded_results_fisher <- round(results_fisher, digits = 4)
      threshold <- 0.05
      significant_results <- ifelse(results_fisher < threshold, 1, 0)
      
    }
  }
  final_results_list <- list(as.data.frame(rounded_results_matrix),
                             as.matrix(results_fisher),
                             as.matrix(significant_results))
  return(final_results_list)
}
#
# Function to run BH correction (FDR) from the Fisher matrix
# pass the second element in the list for the unrounded Fisher results to this fxn
adjusted_p_values <- function(results_fisher) {

  vectorized_p_values <- c(apply(results_fisher, 1, c))
  
  adjusted_p_values <- p.adjust(vectorized_p_values, method = "BH")
  significant_results <- adjusted_p_values < 0.05
  
  adjusted_p_values_matrix <- results_fisher
  adjusted_p_values_matrix[] <- adjusted_p_values
  
  significant_results_matrix <- results_fisher
  significant_results_matrix[] <- significant_results
  
  final_stats_lists <- list(as.data.frame(adjusted_p_values_matrix), 
                            as.data.frame(significant_results_matrix))
  return(final_stats_lists)
  
}

corrplot_standard <- function(data){
  corrplot.mixed(data, 
               lower = "ellipse", 
               upper = "number", 
               number.cex = .5,
               tl.pos = 'lt',
               tl.offset = 0.5)
}

corrplot_fisher <- function(data){
  corrplot.mixed(data,
                 lower = "ellipse", 
                 upper = "number", 
                 number.cex = .5,
                 tl.pos = 'lt',
                 p.mat = as.matrix(final_stats_list$p),
                 sig.level = c(0.05),
                 insig = 'blank',
                 tl.offset = 0.5)
  }

genetics_clean <- function(data){
  data %>%
    filter(Gene_name %in% genes_of_interest) %>%
    mutate(seen = as.numeric(1)) %>%
    pivot_wider(names_from = Gene_name, values_from = seen) %>%
    mutate_all(as.character) %>%
    mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
    select(-ID) %>%
    as.matrix()
}

top_mut <- function(data){genetics %>%
  group_by(Gene_name) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head(15)
}
################END OF FUNCTIONS################
#Dataset Names
mutation <- prad_tcga_pan_can_atlas_2018

data_name <- "prad_tcga_pan_can_atlas_2018"
###Entire Dataset###
genetics <- mutation %>%
  select("hugoGeneSymbol", "sampleId") %>%
    rename(Gene_name = "hugoGeneSymbol",
           ID = "sampleId") 

#Identifying top mutations
top_mutations <- top_mut(genetics)

#Combining Top 15 genes with other genes of interest; using unique() to prevent duplicates
genes_of_interest <- unique(c(top_mutations$Gene_name, "AR", "PTEN", "PIK3CA", "ATM", "BRCA1", "BRCA2", "SPOP", "FOXA1", "CDK12", "RB1", "TMPRSS2-ERG fusion", "TMPRSS2-FLI1 fusion", "TMPRSS2-ETV1 fusion", "TMPRSS2-ETV2 fusion"))

#Cleaning data and generating matrix; Filtering for genes of interest at this step
genetics_matrix <- genetics_clean(genetics)
## Warning: Values from `seen` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
##   {data} %>%
##   dplyr::group_by(ID, Gene_name) %>%
##   dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
##   dplyr::filter(n > 1L)
## Warning: There were 23 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 22 remaining warnings.
genetics_matrix[is.na(genetics_matrix)] <- 1
genetics_matrix <- sum_muts_per_gene(genetics_matrix)

################COMPUTATIONAL COMMMANDS#################
final_results_list <- compute_yule_from_raw_data(genetics_matrix)
results_fisher <- final_results_list[[2]]
final_stats_list <- adjusted_p_values(results_fisher)
#############END OF COMPUTATIONAL COMMANDS##############

#####################VISUALIZATION######################

#visualize yule
yule_visual <- as.matrix(final_results_list[[1]])
par(ps = 7)
corrplot_standard(yule_visual)

#visualize Fisher + FDR stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_stats_list)[1] <- "p"
corrplot_fisher(yule_visual)

#visualize Fisher ONLY stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_results_list)[2] <- "p"
corrplot_fisher(yule_visual)
#############END OF VISUALIZATION
###AGE###
#Pulling clinical data
clinical <- get_clinical_by_study(study_id = data_name,
                                        clinical_attribute = "AGE",
                                        base_url = 'www.cbioportal.org/api')%>%
  mutate(parameter_type = ifelse(value >= 50, "high", "low"))

age_type_list <- list()
age_types <- c("high", "low")

#Start of Loop
for(i in age_types) {
parameter_data <- clinical %>%
  filter(parameter_type == i) %>%
  select(patientId, studyId)

genetics <- mutation %>%
  filter(mutation$patientId %in% c(parameter_data$patientId)) %>%
  select("hugoGeneSymbol", "sampleId") %>%
    rename(Gene_name = "hugoGeneSymbol",
           ID = "sampleId")

#Identifying top mutations
top_mutations <- top_mut(genetics)

#Combining Top 15 genes with other genes of interest; using unique() to prevent duplicates
genes_of_interest <- unique(c(top_mutations$Gene_name, "AR", "PTEN", "PIK3CA", "ATM", "BRCA1", "BRCA2", "SPOP", "FOXA1", "CDK12", "RB1", "TMPRSS2-ERG fusion", "TMPRSS2-FLI1 fusion", "TMPRSS2-ETV1 fusion", "TMPRSS2-ETV2 fusion"))

#Cleaning data and generating matrix; Filtering for genes of interest at this step
genetics_matrix <- genetics_clean(genetics)
genetics_matrix[is.na(genetics_matrix)] <- 1
genetics_matrix <- sum_muts_per_gene(genetics_matrix)

################COMPUTATIONAL COMMMANDS#################
final_results_list <- compute_yule_from_raw_data(genetics_matrix)
results_fisher <- final_results_list[[2]]
final_stats_list <- adjusted_p_values(results_fisher)
#############END OF COMPUTATIONAL COMMANDS##############

#####################VISUALIZATION######################

#visualize yule
yule_visual <- as.matrix(final_results_list[[1]])
par(ps = 7)
corrplot_standard(yule_visual)

#visualize Fisher + FDR stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_stats_list)[1] <- "p"
corrplot_fisher(yule_visual)

#visualize Fisher ONLY stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_results_list)[2] <- "p"
corrplot_fisher(yule_visual)
#############END OF VISUALIZATION
}
## Warning: Values from `seen` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
##   {data} %>%
##   dplyr::group_by(ID, Gene_name) %>%
##   dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
##   dplyr::filter(n > 1L)
## Warning: There were 23 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 22 remaining warnings.

## Warning: Values from `seen` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
##   {data} %>%
##   dplyr::group_by(ID, Gene_name) %>%
##   dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
##   dplyr::filter(n > 1L)
## Warning: There were 18 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 17 remaining warnings.

###"LIKELY ONCOGENIC" and "ONCOGENIC"###
genetics <- mutation %>%
  filter(mutation$oncogenic_status %in% c("Likely Oncogenic", "Oncogenic")) %>%
  select("hugoGeneSymbol", "sampleId") %>%
    rename(Gene_name = "hugoGeneSymbol",
           ID = "sampleId") 

#Identifying top mutations
top_mutations <- top_mut(genetics)

#Combining Top 15 genes with other genes of interest; using unique() to prevent duplicates
genes_of_interest <- unique(c(top_mutations$Gene_name, "AR", "PTEN", "PIK3CA", "ATM", "BRCA1", "BRCA2", "SPOP", "FOXA1", "CDK12", "RB1", "TMPRSS2-ERG fusion", "TMPRSS2-FLI1 fusion", "TMPRSS2-ETV1 fusion", "TMPRSS2-ETV2 fusion"))

#Cleaning data and generating matrix; Filtering for genes of interest at this step
genetics_matrix <- genetics_clean(genetics)
## Warning: Values from `seen` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
##   {data} %>%
##   dplyr::group_by(ID, Gene_name) %>%
##   dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
##   dplyr::filter(n > 1L)
## Warning: There were 18 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 17 remaining warnings.
genetics_matrix[is.na(genetics_matrix)] <- 1
genetics_matrix <- sum_muts_per_gene(genetics_matrix)

################COMPUTATIONAL COMMMANDS#################
final_results_list <- compute_yule_from_raw_data(genetics_matrix)
results_fisher <- final_results_list[[2]]
final_stats_list <- adjusted_p_values(results_fisher)
#############END OF COMPUTATIONAL COMMANDS##############

#####################VISUALIZATION######################

#visualize yule
yule_visual <- as.matrix(final_results_list[[1]])
par(ps = 7)
corrplot_standard(yule_visual)

#visualize Fisher + FDR stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_stats_list)[1] <- "p"
corrplot_fisher(yule_visual)

#visualize Fisher ONLY stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_results_list)[2] <- "p"
corrplot_fisher(yule_visual)
#############END OF VISUALIZATION

```