# Build an API client object from the cBioPortal API specification URL.
# NOTE(review): `client` is not referenced again in the visible part of this
# script -- confirm whether it is still needed.
library(rapiclient)
client <- get_api(url = "https://www.cbioportal.org/api/v2/api-docs")
# cbioportalR is pointed at the "public" cBioPortal database for the queries
# made further down.
library(cbioportalR)
set_cbioportal_db("public")
# Data wrangling and plotting packages. dplyr and ggplot2 are also attached by
# tidyverse, so loading them again is harmless.
library(tidyverse)
library(dplyr)
library(ggplot2)
# NOTE(review): no readxl function is called in the visible part of this script.
library(readxl)
# corrplot is used for the p-value matrix and the correlation plots.
library(corrplot)
#' Pairwise Yule association coefficients between columns of binary matrices
#'
#' This is the Yule coefficient used for the subsequent calculations. Every
#' column of `dm.x` is paired with every column of `dm.y`; the four cells of
#' the 2x2 contingency table are counted across rows and combined as
#' `(tt * ff - tf * ft) / (tt * ff + tf * ft)`.
#'
#' NOTE(review): this expression is the form usually called Yule's Q; Yule's Y
#' (coefficient of colligation) takes square roots of the same two products.
#' The formula is left unchanged so existing results stay the same -- confirm
#' which coefficient was intended.
#'
#' @param dm.x Binary matrix (0/1 or FALSE/TRUE); rows are observations.
#' @param dm.y Binary matrix with the same number of rows as `dm.x`.
#' @return Numeric matrix with one row per column of `dm.x` and one column per
#'   column of `dm.y`. An entry is `NaN` when its denominator is zero.
calcYulesYBetweenMatrices1 <- function(dm.x, dm.y) {
  # `||` is the scalar, short-circuiting operator that belongs in an `if`.
  if (!all(dm.x %in% c(0, 1)) || !all(dm.y %in% c(0, 1))) {
    stop("Error: calcYulesYBetweenMatrices() requires binary matrices as input. Please ensure all values are 0/1 or FALSE/TRUE.")
  }

  not.x <- !dm.x
  not.y <- !dm.y

  # crossprod(a, b) is t(a) %*% b without materialising the transpose.
  tt <- crossprod(dm.x, dm.y)   # Count TRUE and TRUE
  tf <- crossprod(dm.x, not.y)  # Count TRUE and FALSE
  ft <- crossprod(not.x, dm.y)  # Count FALSE and TRUE
  ff <- crossprod(not.x, not.y) # Count FALSE and FALSE

  (tt * ff - tf * ft) / (tt * ff + tf * ft)
}
# Listing of every study returned by the configured cBioPortal database
all_studies <- available_studies()

# Sample listings for the three prostate studies, reduced to the identifier
# columns needed further down
cpcg <- select(
  available_samples("prad_cpcg_2017"),
  sampleId, patientId, studyId
)
p1000 <- select(
  available_samples("prad_p1000"),
  sampleId, patientId, studyId
)
tcga <- select(
  available_samples("prad_tcga_pan_can_atlas_2018"),
  sampleId, patientId, studyId
)
# Selecting only the primary samples in p1000

# Records of prad_p1000 whose DATA_SOURCE attribute is "TCGA"
p1000_tcga <- get_clinical_by_study(
  study_id = "prad_p1000",
  clinical_attribute = "DATA_SOURCE",
  base_url = "www.cbioportal.org/api"
) %>%
  filter(value == "TCGA")

# Records of prad_p1000 whose SAMPLE_TYPE is "Primary", excluding every
# patient that appears in p1000_tcga
p1000_primary <- get_clinical_by_study(
  study_id = "prad_p1000",
  clinical_attribute = "SAMPLE_TYPE",
  base_url = "www.cbioportal.org/api"
) %>%
  filter(value == "Primary") %>%
  # `%in%` tests set membership. `!=` against a vector would compare row i
  # with element i (recycled), which does not exclude the TCGA patients.
  filter(!patientId %in% p1000_tcga$patientId)
# Combining the studies into one table of (sampleId, studyId) pairs
# NOTE(review): this uses the full `p1000` sample list; `p1000_primary` is not
# applied here -- confirm whether non-primary / TCGA-sourced p1000 samples are
# meant to be included.
df_pairs <- select(bind_rows(cpcg, p1000, tcga), sampleId, studyId)

# Gathering the genomic data for every sample/study pair
prca_public <- get_genetics_by_sample(sample_study_pairs = df_pairs)

# Keep only the mutation table, as a data frame with one row per mutation
# record: the gene symbol and the sample it was observed in
prca_public <- prca_public[["mutation"]] %>%
  as.data.frame() %>%
  select(Gene_name = "hugoGeneSymbol", ID = "sampleId")
# Extracting the top genes from the public data: the 15 genes with the most
# mutation records
top_prca <- prca_public %>%
  count(Gene_name, sort = TRUE) %>%
  head(15)

# Add CDH1 to the list. union() keeps the existing order and does not add a
# second CDH1 entry if it is already among the top genes, so the gene list
# stays free of duplicates for the row/column subsetting done later.
top_prca <- data.frame(Gene_name = union(top_prca$Gene_name, "CDH1"))
# Clean data: build a binary matrix with one row per sample and one column per
# gene (1 = at least one mutation record for that gene in that sample,
# 0 = none)
prca_public_matrix <- prca_public %>%
  # Accounting for multiple mutations of the same gene in the same sample:
  # collapse them to a single record so every cell holds exactly one value
  distinct(ID, Gene_name) %>%
  mutate(seen = 1) %>%
  # Sample/gene pairs without a record are filled with 0 explicitly
  pivot_wider(names_from = Gene_name, values_from = seen, values_fill = 0) %>%
  select(-ID) %>%
  as.matrix()
# Yule coefficient for every pair of genes, kept as a data frame
yule.prca_public <- as.data.frame(
  calcYulesYBetweenMatrices1(dm.x = prca_public_matrix, dm.y = prca_public_matrix)
)

# Restrict rows and columns to the top 15 genes + CDH1, convert to a matrix
# and round the coefficients to 2 digits
yule.prca_public.filtered <- yule.prca_public[
  top_prca$Gene_name, top_prca$Gene_name,
  drop = FALSE
] %>%
  as.matrix() %>%
  round(digits = 2)

# Put the genes in alphabetical order on both axes
order_indices <- order(rownames(yule.prca_public.filtered))
yule.prca_public.filtered <-
  yule.prca_public.filtered[order_indices, order_indices]
# All of PUBLIC datasets

# Generating p-values for the gene-gene associations of the entire public
# dataset. cor.mtest() expects observations in rows and features in columns,
# so it is given the per-sample binary mutation matrix restricted to the
# plotted genes (in plot order), not the gene-by-gene coefficient matrix.
# On 0/1 columns this tests the Pearson (phi) correlation between two genes'
# mutation status.
# NOTE(review): a per-pair Fisher's exact test is an alternative if an exact
# test on the 2x2 tables is preferred.
p_prca_public <- cor.mtest(
  prca_public_matrix[, colnames(yule.prca_public.filtered), drop = FALSE]
)

# Visualize all relationships (statistically significant and
# non-statistically significant)
corrplot.mixed(
  yule.prca_public.filtered,
  lower = "ellipse",
  upper = "number",
  number.cex = .6,
  tl.pos = "lt"
)

# Visualize only the statistically significant comutations: cells whose
# p-value is above 0.05 are left blank
corrplot.mixed(
  yule.prca_public.filtered,
  lower = "ellipse",
  upper = "number",
  number.cex = .6,
  tl.pos = "lt",
  p.mat = p_prca_public$p,
  sig.level = 0.05,
  insig = "blank"
)
