library(cbioportalR)
set_cbioportal_db("public")
library(tidyverse)
library(dplyr)
library(ggplot2)
library(readxl)
library(corrplot)
#This is the Yule's coefficient I used for subsequent calculations
calcYulesYBetweenMatrices1 <- function(dm.x, dm.y) {
if(!all(dm.x %in% c(0,1)) | !all(dm.y %in% c(0,1))) {
stop("Error: calcYulesYBetweenMatrices() requires binary matrices as input. Please ensure all values are 0/1 or FALSE/TRUE.")
}
tt <- t( dm.x) %*% dm.y # Count TRUE and TRUE
tf <- t( dm.x) %*% !dm.y # Count TRUE and FALSE
ft <- t(!dm.x) %*% dm.y # Count FALSE and TRUE
ff <- t(!dm.x) %*% !dm.y # Count FALSE and FALSE
Y <- (tt * ff - tf * ft) / (tt * ff + tf * ft)
return(Y)}
#Stratification by Gleason Scores
p1000 <- available_samples("prad_p1000") %>%
select(sampleId, patientId, studyId)
p1000_clinical <- get_clinical_by_study(study_id = "prad_p1000",
clinical_attribute = "REVIEWED_GLEASON_CATEGORY",
base_url = 'www.cbioportal.org/api') %>%
mutate(gleason_category = case_when(
value == "3+3" ~ 1,
value == "3+4" ~ 2,
value == "4+3" ~ 3,
value == "3+5" ~ 4,
value == "5+3" ~ 4,
value == "4+4" ~ 4,
value == "4+5" ~ 5,
value == "5+4" ~ 5,
value == "5+5" ~ 5
))
gleason_list <- list()
# Loop through Gleason categories 1 to 5
for (i in 1:5) {
# Filter p1000_clinical based on Gleason category
gleason_data <- p1000_clinical %>%
filter(gleason_category == i) %>%
select(sampleId, patientId, studyId)
# Get genetics data for the selected samples
gleason_data <- get_genetics_by_sample(
sample_id = c(gleason_data$sampleId),
study_id = "prad_p1000"
)
# Add the result to the list
gleason_list[[paste0("gleason", i)]] <- gleason_data
}
gleason1 <- gleason_list[["gleason1"]]
gleason2 <- gleason_list[["gleason2"]]
gleason3 <- gleason_list[["gleason3"]]
gleason4 <- gleason_list[["gleason4"]]
gleason5 <- gleason_list[["gleason5"]]
#Loop of the Gleason Dataframes
yule_gleason1 <- data.frame()
yule_gleason2 <- data.frame()
yule_gleason3 <- data.frame()
yule_gleason4 <- data.frame()
yule_gleason5 <- data.frame()
for(i in 1:5) {
gleason_var <- paste0("gleason", i)
#Get mutation data
cases <- assign(gleason_var, p1000_clinical %>%
filter(gleason_category == i) %>%
select(sampleId, patientId, studyId))
genetics <- get_genetics_by_sample(sample_id = c(cases$sampleId),
study_id = "prad_p1000")
prca_public <- as.data.frame(genetics[["mutation"]]) %>%
select("hugoGeneSymbol", "sampleId") %>%
rename(Gene_name = "hugoGeneSymbol",
ID = "sampleId")
#Top Genes
top_prca <- prca_public %>%
group_by(Gene_name) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
head(15)
if ("CDH1" %in% prca_public$Gene_name) {
# If "CDH1" is present, perform the specified operation
top_prca <- append(top_prca$Gene_name, "CDH1") %>%
as.data.frame() %>%
rename(Gene_name = ".")
} else {
# If "CDH1" is not present, keep it as it is
}
#Clean data
prca_public_matrix <- prca_public %>%
mutate(seen = as.numeric(1)) %>%
pivot_wider(names_from = Gene_name, values_from = seen) %>%
mutate_all(as.character) %>%
mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
select(-ID) %>%
as.matrix()
#Accounting for multiple mutations of the same gene in the same sample
prca_public_matrix[is.na(prca_public_matrix)] <- 1
#Storing matrixes
yule_df <- as.data.frame(calcYulesYBetweenMatrices1(prca_public_matrix, prca_public_matrix))
#Filtering for the top 15 genes + CDH1
yule.prca_public.filtered <- yule_df[c(top_prca$Gene_name), ] %>%
select(top_prca$Gene_name)
#Round to Yule's Coefficients to 2 digits
yule.prca_public.filtered <- as.matrix.data.frame(round(yule.prca_public.filtered,
digits = 2))
order_indices <- order(rownames(yule.prca_public.filtered))
yule_df <- yule.prca_public.filtered[order_indices, order_indices]
if (i == 1) yule_gleason1 <- as.data.frame(yule_df)
else if (i == 2) yule_gleason2 <- as.data.frame(yule_df)
else if (i == 3) yule_gleason3 <- as.data.frame(yule_df)
else if (i == 4) yule_gleason4 <- as.data.frame(yule_df)
else if (i == 5) yule_gleason5 <- as.data.frame(yule_df)
#Generating p-values for the correlations
p_prca_public <- cor.mtest(yule_df)
#Visualize only the statistically significant comutations
corrplot.mixed(yule_df,
lower = "ellipse",
upper = "number",
number.cex = .6,
tl.pos = 'lt',
p.mat = p_prca_public$p,
sig.level = c(0.05),
insig = 'blank')
}





#Stratification by Sample Type: Primary vs. Metastatic
p1000 <- available_samples("prad_p1000") %>%
select(sampleId, patientId, studyId)
p1000_clinical <- get_clinical_by_study(study_id = "prad_p1000",
clinical_attribute = "SAMPLE_TYPE",
base_url = 'www.cbioportal.org/api')
sample_type_list <- list()
sample_types <- c("Primary", "Metastasis")
# Loop through Sample Types
for (i in sample_types) {
# Filter p1000_clinical based on Sampe Type category
sample_type_data <- p1000_clinical %>%
filter(value == i) %>%
select(sampleId, patientId, studyId)
# Get genetics data for the selected samples
sample_type_data <- get_genetics_by_sample(
sample_id = c(sample_type_data$sampleId),
study_id = "prad_p1000"
)
# Add the result to the list
sample_type_list[[paste0(i)]] <- sample_type_data
}
Primary <- sample_type_list[["Primary"]]
Metastasis <- sample_type_list[["Metastasis"]]
#Loop
for(i in sample_types) {
var <- paste0(i)
#Get mutation data
cases <- assign(var, p1000_clinical %>%
filter(value == i) %>%
select(sampleId, patientId, studyId))
genetics <- get_genetics_by_sample(sample_id = c(cases$sampleId),
study_id = "prad_p1000")
prca_public <- as.data.frame(genetics[["mutation"]]) %>%
select("hugoGeneSymbol", "sampleId") %>%
rename(Gene_name = "hugoGeneSymbol",
ID = "sampleId")
#Top Genes
top_prca <- prca_public %>%
group_by(Gene_name) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
head(15)
if ("CDH1" %in% prca_public$Gene_name) {
# If "CDH1" is present, perform the specified operation
top_prca <- append(top_prca$Gene_name, "CDH1") %>%
as.data.frame() %>%
rename(Gene_name = ".")
} else {
# If "CDH1" is not present, keep it as it is
}
#Clean data
prca_public_matrix <- prca_public %>%
mutate(seen = as.numeric(1)) %>%
pivot_wider(names_from = Gene_name, values_from = seen) %>%
mutate_all(as.character) %>%
mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
select(-ID) %>%
as.matrix()
#Accounting for multiple mutations of the same gene in the same sample
prca_public_matrix[is.na(prca_public_matrix)] <- 1
#Storing matrixes
yule_df <- as.data.frame(calcYulesYBetweenMatrices1(prca_public_matrix, prca_public_matrix))
#Filtering for the top 15 genes + CDH1
yule.prca_public.filtered <- yule_df[c(top_prca$Gene_name), ] %>%
select(top_prca$Gene_name)
#Round to Yule's Coefficients to 2 digits
yule.prca_public.filtered <- as.matrix.data.frame(round(yule.prca_public.filtered,
digits = 2))
order_indices <- order(rownames(yule.prca_public.filtered))
yule_df <- yule.prca_public.filtered[order_indices, order_indices]
#Generating p-values for the correlations
p_prca_public <- cor.mtest(yule_df)
#Visualize only the statistically significant comutations
corrplot.mixed(yule_df,
lower = "ellipse",
upper = "number",
number.cex = .6,
tl.pos = 'lt',
p.mat = p_prca_public$p,
sig.level = c(0.05),
insig = 'blank')
}


#Stratification by TMB
p1000 <- available_samples("prad_p1000") %>%
select(sampleId, patientId, studyId)
p1000_clinical <- get_clinical_by_study(study_id = "prad_p1000",
clinical_attribute = "MUTATION_BURDEN",
base_url = 'www.cbioportal.org/api') %>%
mutate(tmd_type = ifelse(value >= 10, "high", "low"))
tmd_type_list <- list()
tmd_types <- c("high", "low")
# Loop through TMB Types
for (i in tmd_types) {
# Filter p1000_clinical based on TMB Type category
tmd_type_data <- p1000_clinical %>%
filter(tmd_type == i) %>%
select(sampleId, patientId, studyId)
# Get genetics data for the selected samples
tmd_type_data <- get_genetics_by_sample(
sample_id = c(tmd_type_data$sampleId),
study_id = "prad_p1000"
)
# Add the result to the list
tmd_type_list[[paste0(i)]] <- tmd_type_data
}
high <- tmd_type_list[["high"]]
low <- tmd_type_list[["low"]]
#Loop
for(i in tmd_types) {
var <- paste0(i)
#Get mutation data
cases <- assign(var, p1000_clinical %>%
filter(tmd_type == i) %>%
select(sampleId, patientId, studyId))
genetics <- get_genetics_by_sample(sample_id = c(cases$sampleId),
study_id = "prad_p1000")
prca_public <- as.data.frame(genetics[["mutation"]]) %>%
select("hugoGeneSymbol", "sampleId") %>%
rename(Gene_name = "hugoGeneSymbol",
ID = "sampleId")
#Top Genes
top_prca <- prca_public %>%
group_by(Gene_name) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
head(15)
if ("CDH1" %in% prca_public$Gene_name) {
# If "CDH1" is present, perform the specified operation
top_prca <- append(top_prca$Gene_name, "CDH1") %>%
as.data.frame() %>%
rename(Gene_name = ".")
} else {
# If "CDH1" is not present, keep it as it is
}
#Clean data
prca_public_matrix <- prca_public %>%
mutate(seen = as.numeric(1)) %>%
pivot_wider(names_from = Gene_name, values_from = seen) %>%
mutate_all(as.character) %>%
mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
select(-ID) %>%
as.matrix()
#Accounting for multiple mutations of the same gene in the same sample
prca_public_matrix[is.na(prca_public_matrix)] <- 1
#Storing matrixes
yule_df <- as.data.frame(calcYulesYBetweenMatrices1(prca_public_matrix, prca_public_matrix))
#Filtering for the top 15 genes + CDH1
yule.prca_public.filtered <- yule_df[c(top_prca$Gene_name), ] %>%
select(top_prca$Gene_name)
#Round to Yule's Coefficients to 2 digits
yule.prca_public.filtered <- as.matrix.data.frame(round(yule.prca_public.filtered,
digits = 2))
order_indices <- order(rownames(yule.prca_public.filtered))
yule_df <- yule.prca_public.filtered[order_indices, order_indices]
#Generating p-values for the correlations
p_prca_public <- cor.mtest(yule_df)
#Visualize only the statistically significant comutations
corrplot.mixed(yule_df,
lower = "ellipse",
upper = "number",
number.cex = .6,
tl.pos = 'lt',
p.mat = p_prca_public$p,
sig.level = c(0.05),
insig = 'blank')
}


#Stratification by Age
p1000 <- available_samples("prad_p1000") %>%
select(sampleId, patientId, studyId)
#Pulling Data
p1000_clinical <- get_clinical_by_study(study_id = "prad_p1000",
clinical_attribute = "AGE",
base_url = 'www.cbioportal.org/api')%>%
mutate(age_type = ifelse(value >= 50, "high", "low")) %>%
select(-sampleId)
#Fixing the lack of sampleId from the data pull
p1000_sampleId <- p1000 %>%
select(sampleId, patientId)
p1000_clinical <- left_join(p1000_clinical, p1000_sampleId, by = "patientId")
age_type_list <- list()
age_types <- c("high", "low")
age_type_data <- p1000_clinical %>%
filter(age_type == "high") %>%
select(sampleId, patientId, studyId)
# Loop through Age Types
for (i in age_types) {
# Filter p1000_clinical based on Age Type category
age_type_data <- p1000_clinical %>%
filter(age_type == i) %>%
select(sampleId, patientId, studyId)
# Get genetics data for the selected samples
age_type_data <- get_genetics_by_sample(
sample_id = c(age_type_data$sampleId),
study_id = "prad_p1000"
)
# Add the result to the list
age_type_list[[paste0(i)]] <- age_type_data
}
high <- age_type_list[["high"]]
low <- age_type_list[["low"]]
#Loop
for(i in age_types) {
var <- paste0(i)
#Get mutation data
cases <- assign(var, p1000_clinical %>%
filter(age_type == i) %>%
select(sampleId, patientId, studyId))
genetics <- get_genetics_by_sample(sample_id = c(cases$sampleId),
study_id = "prad_p1000")
prca_public <- as.data.frame(genetics[["mutation"]]) %>%
select("hugoGeneSymbol", "sampleId") %>%
rename(Gene_name = "hugoGeneSymbol",
ID = "sampleId")
#Top Genes
top_prca <- prca_public %>%
group_by(Gene_name) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
head(15)
if ("CDH1" %in% prca_public$Gene_name) {
# If "CDH1" is present, perform the specified operation
top_prca <- append(top_prca$Gene_name, "CDH1") %>%
as.data.frame() %>%
rename(Gene_name = ".")
} else {
# If "CDH1" is not present, keep it as it is
}
#Clean data
prca_public_matrix <- prca_public %>%
mutate(seen = as.numeric(1)) %>%
pivot_wider(names_from = Gene_name, values_from = seen) %>%
mutate_all(as.character) %>%
mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
select(-ID) %>%
as.matrix()
#Accounting for multiple mutations of the same gene in the same sample
prca_public_matrix[is.na(prca_public_matrix)] <- 1
#Storing matrixes
yule_df <- as.data.frame(calcYulesYBetweenMatrices1(prca_public_matrix, prca_public_matrix))
#Filtering for the top 15 genes + CDH1
yule.prca_public.filtered <- yule_df[c(top_prca$Gene_name), ] %>%
select(top_prca$Gene_name)
#Round to Yule's Coefficients to 2 digits
yule.prca_public.filtered <- as.matrix.data.frame(round(yule.prca_public.filtered,
digits = 2))
order_indices <- order(rownames(yule.prca_public.filtered))
yule_df <- yule.prca_public.filtered[order_indices, order_indices]
#Generating p-values for the correlations
p_prca_public <- cor.mtest(yule_df)
#Visualize only the statistically significant comutations
corrplot.mixed(yule_df,
lower = "ellipse",
upper = "number",
number.cex = .6,
tl.pos = 'lt',
p.mat = p_prca_public$p,
sig.level = c(0.05),
insig = 'blank')
}


#Stratification by Fusion
p1000 <- available_samples("prad_p1000") %>%
select(sampleId, patientId, studyId)
#Pulling Data
p1000_fusion <- get_fusions_by_study(study_id = "prad_p1000",
base_url = 'www.cbioportal.org/api') %>%
mutate(fusion_type = case_when(
eventInfo == "TMPRSS2-ERG fusion" ~ "ERG",
eventInfo == "TMPRSS2-ETV1 fusion" ~ "ETV1",
eventInfo == "TMPRSS2-ETV4 fusion" ~ "ETV4",
eventInfo == "TMPRSS2-FLI1 fusion" ~ "FLI4"))
fusion_data_list <- list()
fusion_types <- c("ERG", "ETV1", "ETV4", "FLI4")
# Loop through Fusion Types
for (i in fusion_types) {
# Filter p1000_fusion based on Fusion Type
fusion_data <- p1000_fusion %>%
filter(fusion_type == i) %>%
select(sampleId, patientId, studyId)
# Get genetics data for the selected samples
fusion_data <- get_genetics_by_sample(
sample_id = c(fusion_data$sampleId),
study_id = "prad_p1000"
)
# Add the result to the list
fusion_data_list[[paste0(i)]] <- fusion_data
}
#New list of fusion types with "No_Fusion"
fusion_types <- c("ERG", "ETV1", "ETV4", "FLI4", "No_Fusion")
#Loop
for(i in fusion_types) {
var <- paste0(i)
#Get mutation data for the fusion events
if (i %in% c("ERG", "ETV1", "ETV4", "FLI4")) {
cases <- assign(var, p1000_fusion %>%
filter(fusion_type == i) %>%
select(sampleId, patientId, studyId))
genetics <- get_genetics_by_sample(sample_id = c(cases$sampleId),
study_id = "prad_p1000")
prca_public <- as.data.frame(genetics[["mutation"]]) %>%
select("hugoGeneSymbol", "sampleId") %>%
rename(Gene_name = "hugoGeneSymbol",
ID = "sampleId")
}
else {
#Pulling the samples with no fusion events
all_prad_p1000 <- get_genetics_by_study(
study_id = "prad_p1000")
prca_public <- all_prad_p1000[["mutation"]] %>%
filter(!sampleId %in% c(p1000_fusion$sampleId)) %>%
select("hugoGeneSymbol", "sampleId") %>%
rename(Gene_name = "hugoGeneSymbol",
ID = "sampleId")
}
#Top Genes
top_prca <- prca_public %>%
group_by(Gene_name) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
head(15)
if ("CDH1" %in% prca_public$Gene_name) {
# If "CDH1" is present, perform the specified operation
top_prca <- append(top_prca$Gene_name, "CDH1") %>%
as.data.frame() %>%
rename(Gene_name = ".")
} else {
# If "CDH1" is not present, keep it as it is
}
#Clean data
prca_public_matrix <- prca_public %>%
mutate(seen = as.numeric(1)) %>%
pivot_wider(names_from = Gene_name, values_from = seen) %>%
mutate_all(as.character) %>%
mutate(across(everything(), ~ifelse(. == "NULL", 0, as.numeric(.)))) %>%
select(-ID) %>%
as.matrix()
#Accounting for multiple mutations of the same gene in the same sample
prca_public_matrix[is.na(prca_public_matrix)] <- 1
#Storing matrixes
yule_df <- as.data.frame(calcYulesYBetweenMatrices1(prca_public_matrix, prca_public_matrix))
#Filtering for the top 15 genes + CDH1
yule.prca_public.filtered <- yule_df[c(top_prca$Gene_name), ] %>%
select(top_prca$Gene_name)
#Round to Yule's Coefficients to 2 digits
yule.prca_public.filtered <- as.matrix.data.frame(round(yule.prca_public.filtered,
digits = 2))
order_indices <- order(rownames(yule.prca_public.filtered))
yule_df <- yule.prca_public.filtered[order_indices, order_indices]
#Generating p-values for the correlations
p_prca_public <- cor.mtest(yule_df)
#Visualize only the statistically significant comutations
corrplot.mixed(yule_df,
lower = "ellipse",
upper = "number",
number.cex = .6,
tl.pos = 'lt',
p.mat = p_prca_public$p,
sig.level = c(0.05),
insig = 'blank')
}




