This file will contain all four datasets: Breast Cancer MIB (BCmib), Breast Cancer RNASeq (BCrna), Cardiotox MIB (CTmib), and Cardiotox RNASeq (CTrna). The goal is to compare trametinib at 120 hrs across all assays.
Load libraries
Cardiotoxicity RNAseq data (CTrna)
# Cardiotox RNA-seq table for trametinib: read it and keep only rows with
# p < .05 and a non-zero Mean.1 (filter() also drops rows where a test is NA).
CTrna <- read.csv("tramet.csv") %>%
  filter(pval < 0.05, Mean.1 != 0)
# Keep column 1 plus columns 12 and 13; everything else is not used downstream.
CTrna <- CTrna[, c(1, 12, 13)]
# Upper-case the gene names so they match the breast cancer data.
CTrna$Gene <- toupper(CTrna$Gene)
Cardio-toxicity MIB data (CTmib)
# Cardiotox MIB table. Per the original note, every value is significant for
# at least one replicate, so no significance filter is applied here.
# Only columns 2, 3 and 39 are kept and renamed in that order.
CTmib <- setNames(
  read_excel("MIB.xlsx")[, c(2, 3, 39)],
  c("Gene0", "Gene", "log2_CT")
)
Breast Cancer RNAseq data
BCrna <- read.csv("BreastCancer_DESeq_Results.csv")
# Round columns 2-7 to 8 decimal places (not 8 significant figures).
# NOTE(review): adjusted p-values below 5e-9 become exactly 0 here, which gives
# -log10(0) = Inf in any later volcano plot -- confirm this is intended.
BCrna[, 2:7] <- round(BCrna[, 2:7], 8)
# Keep genes with padj < .05 and only columns 1, 3 and 7.
# which() discards rows whose padj is NA; indexing with the bare logical vector
# would instead return an all-NA row for every NA padj.
BCrna <- BCrna[which(BCrna$padj < 0.05), c(1, 3, 7)]
Breast Cancer MIB data (BCmib)
# Breast cancer MIB data, read from the kinase LFQ sheet of the workbook.
BCmib <- read_excel(
  "Copy of MCF10A_TimeCourse_Trametinib_proteinGroups_AW.xlsx",
  sheet = "LFQ normalized from Kinases tab"
)
# Keep the identifier column plus columns 11-13, which are averaged below.
# NOTE(review): presumably the three 120 hr replicates -- confirm against the sheet.
BCmib <- BCmib[, c(1, 11:13)]
# Drop rows whose three values sum to zero with NAs ignored; this also removes
# rows where all three are NA. The earlier NA filter ran against every column
# of the sheet rather than these three and removed nothing this does not.
BCmib <- BCmib[rowSums(BCmib[-1], na.rm = TRUE) != 0, ]
# Take the mean of the three columns (ignoring NAs), then log2-transform it.
# NOTE(review): this is log2 of the mean value; it is a log2 fold change only
# if the sheet already holds ratios to control -- confirm.
BCmib$log2_BC <- log2(rowMeans(BCmib[, 2:4], na.rm = TRUE))
# Trim the table down to the identifier and the log2 value, selected by name.
BCmib <- BCmib[, c(names(BCmib)[1], "log2_BC")]
colnames(BCmib) <- c("Gene", "log2_BC")
NEXT! Find the overlapping genes in the RNAseq data. dfr is the data frame for the RNA data.
# Inner-join the two RNA-seq tables on gene name; only shared genes remain.
dfr <- merge(BCrna, CTrna, by = "Gene", suffixes = c("_BCrna", "_CTrna"))
# Columns 2-3 come from BCrna and columns 4-5 from CTrna.
# NOTE(review): BCrna was filtered on padj while CTrna was filtered on pval, so
# BC_pval and CT_pval may not be the same kind of p-value -- confirm.
colnames(dfr) <- c("Gene", "BC_log2", "BC_pval", "CT_log2", "CT_pval")
# Coerce the value columns to numeric one column at a time. apply() would first
# turn the data frame into a matrix; with any text column present, the numeric
# columns are passed through format() and lose precision. Columns that are
# already numeric are left untouched; text that is not a number becomes NA
# with a coercion warning.
dfr[2:5] <- lapply(dfr[2:5], function(col) {
  if (is.numeric(col)) col else as.numeric(as.character(col))
})
## Warning in apply(dfr[, 2:5], 2, as.numeric): NAs introduced by coercion
Any relationships?
# Breast cancer vs cardiotox log2 values for every shared gene.
# gghisto comes from outside this chunk and is added before theme(), so the
# panel background and legend settings below are applied after it.
ggplot(data = dfr, mapping = aes(x = BC_log2, y = CT_log2)) +
  geom_point(
    shape = 21,
    size = 4,
    alpha = 0.2,
    colour = "violetred4",
    fill = "violetred1"
  ) +
  labs(
    title = "Scatter Plot of Breast Cancer and CardioTox RNASeq Log2 fold change",
    x = "BC_log2",
    y = "CT_log2"
  ) +
  gghisto +
  theme(
    legend.position = "none",
    panel.background = element_rect(fill = "lightsteelblue1")
  )
## Warning: Removed 3 rows containing missing values (`geom_point()`).
# Breast cancer vs cardiotox p-value columns for every shared gene.
# gghisto comes from outside this chunk and is added before theme(), so the
# panel background and legend settings below are applied after it.
ggplot(data = dfr, mapping = aes(x = BC_pval, y = CT_pval)) +
  geom_point(
    shape = 21,
    size = 6,
    alpha = 0.2,
    colour = "violetred4",
    fill = "violetred1"
  ) +
  labs(
    title = "Scatter Plot of Breast Cancer and CardioTox RNASeq pvals",
    x = "BC_pval",
    y = "CT_pval"
  ) +
  gghisto +
  theme(
    legend.position = "none",
    panel.background = element_rect(fill = "lightsteelblue1")
  )
Volcano plots
# Volcano plot for the breast cancer columns: log2 value against -log10(p).
p1 <- ggplot(data = dfr, mapping = aes(x = BC_log2, y = -log10(BC_pval))) +
  geom_point(
    shape = 21,
    size = 2,
    alpha = 0.3,
    colour = "violetred4",
    fill = "violetred1"
  ) +
  labs(
    title = "Volcano Plot \nfor Breast Cancer RNAseq",
    x = "BC_log2",
    y = "-log10(BC_pval)"
  ) +
  gghisto

# Same plot for the cardiotox columns. The x axis is limited to [-5, 10], so
# points outside that range are dropped from this panel only.
p2 <- ggplot(data = dfr, mapping = aes(x = CT_log2, y = -log10(CT_pval))) +
  geom_point(
    shape = 21,
    size = 2,
    alpha = 0.3,
    colour = "violetred4",
    fill = "violetred1"
  ) +
  labs(
    title = "Volcano Plot \nfor CardioTox RNAseq",
    x = "CT_log2",
    y = "-log10(CT_pval)"
  ) +
  gghisto +
  lims(x = c(-5, 10))

# Draw the two panels side by side.
grid.arrange(p1, p2, ncol = 2)
## Warning: Removed 3 rows containing missing values (`geom_point()`).
NOW bring it down to top 200 genes from each set
# Top 200 genes by CARDIOTOX log2 value. Ranking is on the signed value, so
# these are the most positive values, not the largest absolute changes.
# na.last = NA removes NA values from the ranking, and head() returns fewer
# than 200 indices when fewer rows exist; indexing with [1:200] would pad the
# result with all-NA rows in that case.
top_200ct <- dfr[head(order(dfr$CT_log2, decreasing = TRUE, na.last = NA), 200), c(1, 4)]
# Top 200 genes by BREAST CANCER log2 value, selected the same way.
top_200bc <- dfr[head(order(dfr$BC_log2, decreasing = TRUE, na.last = NA), 200), c(1, 2)]
# How many overlap?
common_top200_gene <- intersect(top_200bc$Gene, top_200ct$Gene)
# Subset dfr to the genes present in both top-200 lists.
dftop <- dfr[dfr$Gene %in% common_top200_gene, ]
# These are all the genes that occurred in the top 200 for both breast cancer
# and cardiotox. 46 total in the original run.
Replot the smaller group. Not much of interest.
# Scatter of the genes shared by both top-200 lists, labelled by gene name.
# gghisto comes from outside this chunk and is added before theme().
ggplot(data = dftop, mapping = aes(x = BC_log2, y = CT_log2)) +
  geom_point(
    shape = 21,
    size = 4,
    alpha = 0.5,
    colour = "violetred4",
    fill = "violetred1"
  ) +
  labs(
    title = "Scatter Plot of Breast Cancer and CardioTox RNASeq Log2 fold change",
    x = "BC_log2",
    y = "CT_log2"
  ) +
  gghisto +
  theme(
    legend.position = "none",
    panel.background = element_rect(fill = "lightsteelblue1")
  ) +
  # Repelled labels; a label is left out once it overlaps more than 20 others.
  geom_label_repel(
    mapping = aes(label = Gene),
    size = 4,
    max.overlaps = 20,
    segment.color = "grey50",
    point.padding = 0.1,
    box.padding = 0.25
  )
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Venn Diagram showing differences
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
##
## Attaching package: 'VennDiagram'
## The following object is masked from 'package:ggpubr':
##
## rotate
# Build the two-set Venn diagram of the top-200 gene lists.
# filename = NULL makes venn.diagram() return the drawing instead of saving it.
v <- venn.diagram(
  x = list(
    "Breast \nCancer" = top_200bc$Gene,
    "Cardio\ntoxicity" = top_200ct$Gene
  ),
  filename = NULL,
  fill = c("deeppink", "lightblue"),
  alpha = rep(0.5, 2),
  cex = 2,
  cat.cex = 2
)
# Start a fresh grid page and draw the diagram on it.
grid.newpage()
grid.draw(v)
Trying to make a Venn-diagram-style list of the genes
# One row per distinct gene appearing in either top-200 list.
gene_status <- data.frame(Gene = union(top_200bc$Gene, top_200ct$Gene))
# Membership flags: is the gene in the cardiotox list, the breast cancer list,
# or both?
gene_status[["cardiotox"]] <- is.element(gene_status[["Gene"]], top_200ct$Gene)
gene_status[["breastcancer"]] <- is.element(gene_status[["Gene"]], top_200bc$Gene)
gene_status[["both"]] <- gene_status[["cardiotox"]] & gene_status[["breastcancer"]]
# Show the table.
gene_status
attempt to plot it
library(ggdist)
# Spread the points with jitter so genes in the same TRUE/FALSE cell do not sit
# on top of each other. The same seeded position object is given to both the
# point layer and the label layer so each label is placed at its own point;
# jittering only the points leaves every label at the unjittered cell centre.
jitter_pos <- position_jitter(width = 0.2, height = 0.3, seed = 1)
ggplot(gene_status, aes(x = factor(breastcancer), y = factor(cardiotox))) +
  # size is given once; passing it twice triggers a "Duplicated aesthetics" warning.
  geom_point(
    position = jitter_pos,
    size = 4,
    fill = "violetred1",
    color = "violetred4",
    alpha = 0.5,
    shape = 21
  ) +
  labs(x = "in_top_200bc", y = "in_top_200ct", title = "Counts of Genes in Top 200 BC and CT") +
  scale_x_discrete(labels = c("FALSE", "TRUE")) +
  scale_y_discrete(labels = c("FALSE", "TRUE")) +
  gghisto +
  theme(panel.background = element_rect(fill = "lightsteelblue1"), legend.position = "none") +
  geom_label_repel(
    aes(label = Gene),
    position = jitter_pos,
    box.padding = 0.25,
    point.padding = 0.1,
    segment.color = 'white',
    max.overlaps = 100,
    size = 2
  )
## Warning: Duplicated aesthetics after name standardisation: size
## Warning: ggrepel: 308 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
On to the MIB data! Find the overlapping significant genes.
# Inner-join the two MIB tables on gene name and keep the gene plus the two
# log2 columns, selected by name (the same columns as positions 1, 2 and 4).
dfm <- merge(BCmib, CTmib, by = "Gene")[, c("Gene", "log2_BC", "log2_CT")]
# Make sure both value columns are numeric before plotting. The original line
# here re-coerced dfr (the RNA table) and left dfm untouched. Columns that are
# already numeric are left as they are.
dfm[2:3] <- lapply(dfm[2:3], function(col) {
  if (is.numeric(col)) col else as.numeric(as.character(col))
})
# Scatter of the shared MIB genes, labelled by gene name.
# gghisto comes from outside this chunk and is added before theme().
ggplot(dfm, aes(x = log2_BC, y = log2_CT)) +
  geom_point(fill = "violetred1", color = "violetred4", alpha = 0.6, size = 6, shape = 21) +
  labs(x = "BC_log2", y = "CT_log2", title = "Scatter Plot of Breast Cancer and CardioTox MIB Log2 fold change") +
  gghisto +
  theme(panel.background = element_rect(fill = "lightsteelblue1"), legend.position = "none") +
  geom_label_repel(
    aes(label = Gene),
    box.padding = 0.25,
    point.padding = 0.1,
    segment.color = 'grey50',
    max.overlaps = 20,
    size = 3
  )