#Loading libraries and dataset
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(readxl)
library(corrplot)
## corrplot 0.92 loaded
library(cbioportalR)
## Warning: package 'cbioportalR' was built under R version 4.3.1
library(knitr)
#Use suppressWarnings() to prevent crashes from the error message
suppressWarnings({
prad_tcga_pan_can_atlas_2018 <- read_excel("/Users/vni/Downloads/Research/PLUMS/PrCa Public Code/By Dataset/Data/prad_tcga_pan_can_atlas_2018.xlsx")
})
###################FUNCTIONS###################
# custom labels function using sample size, mutation type and gene name
# LABELand Calculate mutation sum for each gene in the raw_data
sum_muts_per_gene <- function(data) {
sums <- c(colSums(data))
list_sums <- as.list(sums)
new_col_names <- paste0(colnames(data), " (N = ", list_sums,")")
colnames(data) <- new_col_names
return(data)
}
# Function called by compute_yule_from_raw_data to make the contigency table
# DO NOT CALL THIS
table_for_yule <- function(data) {
count_1_1 <- 0
count_1_0 <- 0
count_0_1 <- 0
count_0_0 <- 0
for (i in 1:nrow(data)) {
if (all(data[i,] == 1)) {
count_1_1 <- count_1_1 + 1
} else if (data[i,1] == 1 & data[i,2] == 0) {
count_1_0 <- count_1_0 + 1
} else if (data[i,1] == 0 & data[i,2] == 1) {
count_0_1 <- count_0_1 + 1
} else if (all(data[i,] == 0)) {
count_0_0 <- count_0_0 + 1
} else {
print(data[i,])
stop("Value is not a 0 or 1")
}
}
contingency_table <- matrix(c(count_1_1, count_0_1, count_1_0, count_0_0), nrow = 2, ncol = 2)
return(contingency_table)
}
# Function called by compute_yule_from_raw_data to calculate yule
# DO NOT CALL THIS
# c_table = contingency table
yuleJC <- function(c_table) {
numerator <- (c_table[1] * c_table[4]) - (c_table[2] * c_table[3])
denominator <- (c_table[1] * c_table[4]) + (c_table[2] * c_table[3])
if (denominator == 0) {
return(NA)
} else {
result <- numerator / denominator
return(result)
}
}
# Function to compute Yule and Fisher using a matrix with multiple columns, each column is a gene
# compares each column with another column, makes the contingency table and computes Yule and Fisher
#Compute multiple Yules by comparing the columns
compute_yule_from_raw_data <- function(raw_data) {
#create empty matrices for Yule results and fisher results with the row and col names
results_matrix <- matrix(0, nrow = ncol(raw_data), ncol = ncol(raw_data))
colnames(results_matrix) <- rownames(results_matrix) <- colnames(raw_data)
#make the matrix diagonal = 1
diag(results_matrix) <- 1
results_fisher <- results_matrix
diag(results_fisher) <- 0
for (i in 1:(ncol(raw_data)-1)) {
for (j in (i+1):ncol(raw_data)) {
subset_data <- raw_data[, c(i,j)]
colnames(subset_data) <- colnames(raw_data[, c(i,j)])
labels <- colnames(subset_data)
table_for_yule(subset_data)
result <- table_for_yule(subset_data)
x <- labels[1]
y <- labels[2]
results_matrix[x,y] <- yuleJC(result)
#mirror copy the results above the diagonal to below the diagonal
results_matrix[lower.tri(results_matrix)] <- t(results_matrix)[lower.tri(results_matrix)]
rounded_results_matrix <- round(results_matrix, digits = 2)
# use of R's fisher statistical analysis
fisher_result <- fisher.test(result)
p_value <- fisher_result$p.value
results_fisher[x,y] <- p_value
#mirror copy the results above the diagonal to below the diagonal
results_fisher[lower.tri(results_fisher)] <- t(results_fisher)[lower.tri(results_fisher)]
rounded_results_fisher <- round(results_fisher, digits = 4)
threshold <- 0.05
significant_results <- ifelse(results_fisher < threshold, 1, 0)
}
}
final_results_list <- list(as.data.frame(rounded_results_matrix),
as.matrix(results_fisher),
as.matrix(significant_results))
return(final_results_list)
}
#
# Function to run BH correction (FDR) from the Fisher matrix
# pass the second element in the list for the unrounded Fisher results to this fxn
adjusted_p_values <- function(results_fisher) {
vectorized_p_values <- c(apply(results_fisher, 1, c))
adjusted_p_values <- p.adjust(vectorized_p_values, method = "BH")
significant_results <- adjusted_p_values < 0.05
adjusted_p_values_matrix <- results_fisher
adjusted_p_values_matrix[] <- adjusted_p_values
significant_results_matrix <- results_fisher
significant_results_matrix[] <- significant_results
final_stats_lists <- list(as.data.frame(adjusted_p_values_matrix),
as.data.frame(significant_results_matrix))
return(final_stats_lists)
}
corrplot_standard <- function(data){
corrplot.mixed(data,
lower = "ellipse",
upper = "number",
number.cex = .5,
tl.pos = 'lt',
tl.offset = 0.5)
}
corrplot_fisher <- function(data){
corrplot.mixed(data,
lower = "ellipse",
upper = "number",
number.cex = .5,
tl.pos = 'lt',
p.mat = as.matrix(final_stats_list$p),
sig.level = c(0.05),
insig = 'blank',
tl.offset = 0.5)
}
genetics_clean <- function(data){
data %>%
filter(Gene_name %in% genes_of_interest) %>%
mutate(seen = as.numeric(1)) %>%
pivot_wider(names_from = Gene_name, values_from = seen) %>%
mutate_all(as.character) %>%
mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
select(-ID) %>%
as.matrix()
}
top_mut <- function(data){genetics %>%
group_by(Gene_name) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
head(15)
}
################END OF FUNCTIONS################
#Dataset Names
mutation <- prad_tcga_pan_can_atlas_2018
data_name <- "prad_tcga_pan_can_atlas_2018"
###Entire Dataset###
genetics <- mutation %>%
select("hugoGeneSymbol", "sampleId") %>%
rename(Gene_name = "hugoGeneSymbol",
ID = "sampleId")
#Identifying top mutations
top_mutations <- top_mut(genetics)
#Combining Top 15 genes with other genes of interest; using unique() to prevent duplicates
genes_of_interest <- unique(c(top_mutations$Gene_name, "AR", "PTEN", "PIK3CA", "ATM", "BRCA1", "BRCA2", "SPOP", "FOXA1", "CDK12", "RB1", "TMPRSS2-ERG fusion", "TMPRSS2-FLI1 fusion", "TMPRSS2-ETV1 fusion", "TMPRSS2-ETV2 fusion"))
#Cleaning data and generating matrix; Filtering for genes of interest at this step
genetics_matrix <- genetics_clean(genetics)
## Warning: Values from `seen` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
## {data} %>%
## dplyr::group_by(ID, Gene_name) %>%
## dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
## dplyr::filter(n > 1L)
## Warning: There were 23 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 22 remaining warnings.
genetics_matrix[is.na(genetics_matrix)] <- 1
genetics_matrix <- sum_muts_per_gene(genetics_matrix)
################COMPUTATIONAL COMMMANDS#################
final_results_list <- compute_yule_from_raw_data(genetics_matrix)
results_fisher <- final_results_list[[2]]
final_stats_list <- adjusted_p_values(results_fisher)
#############END OF COMPUTATIONAL COMMANDS##############
#####################VISUALIZATION######################
#visualize yule
yule_visual <- as.matrix(final_results_list[[1]])
par(ps = 7)
corrplot_standard(yule_visual)
#visualize Fisher + FDR stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_stats_list)[1] <- "p"
corrplot_fisher(yule_visual)
#visualize Fisher ONLY stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_results_list)[2] <- "p"
corrplot_fisher(yule_visual)
#############END OF VISUALIZATION
###AGE###
#Pulling clinical data
clinical <- get_clinical_by_study(study_id = data_name,
clinical_attribute = "AGE",
base_url = 'www.cbioportal.org/api')%>%
mutate(parameter_type = ifelse(value >= 50, "high", "low"))
age_type_list <- list()
age_types <- c("high", "low")
#Start of Loop
for(i in age_types) {
parameter_data <- clinical %>%
filter(parameter_type == i) %>%
select(patientId, studyId)
genetics <- mutation %>%
filter(mutation$patientId %in% c(parameter_data$patientId)) %>%
select("hugoGeneSymbol", "sampleId") %>%
rename(Gene_name = "hugoGeneSymbol",
ID = "sampleId")
#Identifying top mutations
top_mutations <- top_mut(genetics)
#Combining Top 15 genes with other genes of interest; using unique() to prevent duplicates
genes_of_interest <- unique(c(top_mutations$Gene_name, "AR", "PTEN", "PIK3CA", "ATM", "BRCA1", "BRCA2", "SPOP", "FOXA1", "CDK12", "RB1", "TMPRSS2-ERG fusion", "TMPRSS2-FLI1 fusion", "TMPRSS2-ETV1 fusion", "TMPRSS2-ETV2 fusion"))
#Cleaning data and generating matrix; Filtering for genes of interest at this step
genetics_matrix <- genetics_clean(genetics)
genetics_matrix[is.na(genetics_matrix)] <- 1
genetics_matrix <- sum_muts_per_gene(genetics_matrix)
################COMPUTATIONAL COMMMANDS#################
final_results_list <- compute_yule_from_raw_data(genetics_matrix)
results_fisher <- final_results_list[[2]]
final_stats_list <- adjusted_p_values(results_fisher)
#############END OF COMPUTATIONAL COMMANDS##############
#####################VISUALIZATION######################
#visualize yule
yule_visual <- as.matrix(final_results_list[[1]])
par(ps = 7)
corrplot_standard(yule_visual)
#visualize Fisher + FDR stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_stats_list)[1] <- "p"
corrplot_fisher(yule_visual)
#visualize Fisher ONLY stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_results_list)[2] <- "p"
corrplot_fisher(yule_visual)
#############END OF VISUALIZATION
}
## Warning: Values from `seen` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
## {data} %>%
## dplyr::group_by(ID, Gene_name) %>%
## dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
## dplyr::filter(n > 1L)
## Warning: There were 23 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 22 remaining warnings.
## Warning: Values from `seen` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
## {data} %>%
## dplyr::group_by(ID, Gene_name) %>%
## dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
## dplyr::filter(n > 1L)
## Warning: There were 18 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 17 remaining warnings.
###"LIKELY ONCOGENIC" and "ONCOGENIC"###
genetics <- mutation %>%
filter(mutation$oncogenic_status %in% c("Likely Oncogenic", "Oncogenic")) %>%
select("hugoGeneSymbol", "sampleId") %>%
rename(Gene_name = "hugoGeneSymbol",
ID = "sampleId")
#Identifying top mutations
top_mutations <- top_mut(genetics)
#Combining Top 15 genes with other genes of interest; using unique() to prevent duplicates
genes_of_interest <- unique(c(top_mutations$Gene_name, "AR", "PTEN", "PIK3CA", "ATM", "BRCA1", "BRCA2", "SPOP", "FOXA1", "CDK12", "RB1", "TMPRSS2-ERG fusion", "TMPRSS2-FLI1 fusion", "TMPRSS2-ETV1 fusion", "TMPRSS2-ETV2 fusion"))
#Cleaning data and generating matrix; Filtering for genes of interest at this step
genetics_matrix <- genetics_clean(genetics)
## Warning: Values from `seen` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
## {data} %>%
## dplyr::group_by(ID, Gene_name) %>%
## dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
## dplyr::filter(n > 1L)
## Warning: There were 18 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 17 remaining warnings.
genetics_matrix[is.na(genetics_matrix)] <- 1
genetics_matrix <- sum_muts_per_gene(genetics_matrix)
################COMPUTATIONAL COMMMANDS#################
final_results_list <- compute_yule_from_raw_data(genetics_matrix)
results_fisher <- final_results_list[[2]]
final_stats_list <- adjusted_p_values(results_fisher)
#############END OF COMPUTATIONAL COMMANDS##############
#####################VISUALIZATION######################
#visualize yule
yule_visual <- as.matrix(final_results_list[[1]])
par(ps = 7)
corrplot_standard(yule_visual)
#visualize Fisher + FDR stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_stats_list)[1] <- "p"
corrplot_fisher(yule_visual)
#visualize Fisher ONLY stats on top of yule
# you have to rename the element in the list containing all the p values to 'p'
# you have to make sure that element is a matrix (use 'as.matrix()')
names(final_results_list)[2] <- "p"
corrplot_fisher(yule_visual)
#############END OF VISUALIZATION
```