#############################################################
################# Abby's code. DO NOT EDIT! #################
################# Please cite appropriately.#################
#############################################################
################# This script was developed using the following resources #################
# Chimenti, M. (2015, August 17). Create a volcano plot on EBSeq output. Retrieved July 19, 2017, from
# http://www.michaelchimenti.com/2015/08/create-a-volcano-plot-on-ebseq-output/
# Gillespie, C. (2011, June 21). Volcano plots of microarray data. Retrieved July 19, 2017, from
# http://bioinformatics.knowledgeblog.org/2011/06/21/volcano-plots-of-microarray-data/
# Leng, N., Kendziorski C. (2015). EBSeq: An R Package for Differential Expression Analysis using
# RNA-Seq Data. R package version 1.14.10.
# Li, B., A Short Tutorial for RSEM. (2015), GitHub repository,
# https://github.com/bli25ucb/RSEM_tutorial
# Phipson, B., Trigos, A., Ritchie, M., Doyle, M., Dashnow, H., & Law, C. (n.d.). RNA-seq Analysis in
# R. Retrieved July 19, 2017, from http://combine-australia.github.io/RNAseq-R/06-rnaseq-day1.html
############################################################################################
library(EBSeq)
## Loading required package: blockmodeling
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## Loading required package: testthat
library(gridExtra)
library(RColorBrewer)
require(ggplot2)
## Loading required package: ggplot2
######################## Declare variables ########################
# Read-in data
#A_genedata = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeqTestEBSeq_results_genes.txt", header=T, stringsAsFactors=F)
#A_normal = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeqTestEBSeq_results_genes.txt.normalized_data_matrix", header=T, stringsAsFactors=F)
#A_RSEM_ind = read.table("/mnt/bd2/Aligned_Reads_memory/1108724_S17_L005_Aligned.genes.results", header=T, stringsAsFactors=F)
#A_geneFDR5 = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeq_results_genes_fdr-0.05.txt", header=T, stringsAsFactors=F)
#A_geneFDR1 = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeq_results_genes_fdr-0.1.txt", header=T, stringsAsFactors=F)
A_genedata = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeq_NEW_Lung_results_genes.txt", header=T, stringsAsFactors=F)
A_normal = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeq_NEW_Lung_results_genes.txt.normalized_data_matrix", header=T, stringsAsFactors=F)
#A_RSEM_ind = read.table("/mnt/bd2/Aligned_Reads_memory/1108724_S17_L005_Aligned.genes.results", header=T, stringsAsFactors=F)
A_geneFDR5 = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/NEW_Lung_EBSeq_results_genes_fdr-0.05.txt", header=T, stringsAsFactors=F)
A_geneFDR1 = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeq_NEW_Lung_results_genes_fdr-0.1.txt", header=T, stringsAsFactors=F)
# Clean-up headers before declaring other vars
# Edit column headers to sample names only
colnames(A_normal) = gsub("X.mnt.bd2.Aligned_Reads_disk1.", "", colnames(A_normal))
colnames(A_normal) = gsub("_.*", "", colnames(A_normal))
# Subset normalized gene counts w/ FDR=0.01
normFDRintersect = A_normal[rownames(A_normal) %in% rownames(A_geneFDR1),]
# Subset individual normalized gene counts w/ FDR=0.01
normFDRInterect = A_geneFDR1[rownames(A_normal) %in% rownames(A_geneFDR1),]
# Subset clustered gene counts w/ FDR=0.01
A_geneCluster = A_geneFDR1[,5:6]
# Clean-up header for new var before declaring other vars
names(A_geneCluster) = c("R", "DF")
# Define top/bottom FC vars
A_TopPostFCnames = rownames(A_geneFDR1[order(-A_geneFDR1$PostFC),])[1:50]
A_TopRealFCnames = rownames(A_geneFDR1[order(-A_geneFDR1$RealFC),])[1:50]
A_BotPostFCnames = rownames(A_geneFDR1[order(A_geneFDR1$PostFC),])[1:50]
A_BotRealFCnames = rownames(A_geneFDR1[order(A_geneFDR1$RealFC),])[1:50]
# Subset top/bottom FC var values in normalized data
A_normHeatTopPostFC = subset(A_normal, rownames(A_normal) %in% A_TopPostFCnames)
A_normHeatTopRealFC = subset(A_normal, rownames(A_normal) %in% A_TopRealFCnames)
A_normHeatBotPostFC = subset(A_normal, rownames(A_normal) %in% A_BotPostFCnames)
A_normHeatBotRealFC = subset(A_normal, rownames(A_normal) %in% A_BotRealFCnames)
# Find variance for each gene for normalized INDVIDIDUAL SAMPLE gene counts w/ FDR = 0.01;
# then subset top 50 and 500
normGeneVar <- apply(normFDRintersect, 1, var)
findNormGeneVar50 = names(sort(normGeneVar, decreasing=TRUE))[1:50]
findNormGeneVar500 = names(sort(normGeneVar, decreasing=TRUE))[1:500]
normGeneVar50 = A_normal[findNormGeneVar50,]
normGeneVar500 = A_normal[findNormGeneVar500,]
# Find variance for each gene for normalized CLUSTERED gene counts w/ FDR = 0.01;
# then subset top 50 and 500
normGeneVarCLUST <- apply(A_geneCluster, 1, var)
findNormGeneVar50CLUST = names(sort(normGeneVarCLUST, decreasing=TRUE))[1:50]
findNormGeneVar500CLUST = names(sort(normGeneVarCLUST, decreasing=TRUE))[1:500]
normGeneVar50CLUST = A_geneCluster[findNormGeneVar50CLUST,]
normGeneVar500CLUST = A_geneCluster[findNormGeneVar500CLUST,]
# Playing w/ no FDR cutoff
normGeneVarNO <- apply(A_normal, 1, var)
findNormGeneVar50NO = names(sort(normGeneVarNO, decreasing=TRUE))[1:50]
findNormGeneVar500NO = names(sort(normGeneVarNO, decreasing=TRUE))[1:500]
normGeneVar50NO = A_normal[findNormGeneVar50NO,]
normGeneVar500NO = A_normal[findNormGeneVar500NO,]
# Define color palette
mypalette = brewer.pal(9,"YlOrRd")
morecols = colorRampPalette(mypalette)
######################## Variables declared ########################
# Find means across both clusters combined
A_genedata$Mean = rowMeans(A_genedata[,grep("^C",names(A_genedata))])
######################## Histogram of PPDE frequency ########################
hist( A_genedata$PPDE, breaks=10, col="mistyrose3", main = "PPDE Frequency of All Genes",
xlab = "PPDE", ylab = "Frequency")

hist( A_geneFDR1$PPDE, breaks=10, col="mistyrose3", main = "PPDE Frequency of FDR = 0.01 Genes",
xlab = "PPDE", ylab = "Frequency")

######################## Histogrammed PPDE frequency ########################
######################## Heatmap most variable genes by individual sample ########################
heatmap.2(as.matrix(normGeneVar50),col=rev(morecols(50)),trace="none",
main=paste("Top", nrow(normGeneVar50), "most variable genes"), scale="row", margins = c(8,10))

heatmap.2(as.matrix(normGeneVar500),col=rev(morecols(500)),trace="none",
main=paste("Top", nrow(normGeneVar500), "most variable genes"), scale="row", margins = c(8,10))

heatmap.2(as.matrix(normGeneVar50NO),col=rev(morecols(50)),trace="none",
main=paste("Top", nrow(normGeneVar50NO), "most variable genes"), scale="row", margins = c(8,10), font.main = 8)

heatmap.2(as.matrix(normGeneVar500NO),col=rev(morecols(500)),trace="none",
main=paste("Top", nrow(normGeneVar500NO), "most variable genes"), scale="row", margins = c(8,10), font.main=8)

######################## Most variable genes mapped by individual sample ########################
######################## Heatmap based on fold change by individual sample ########################
heatmap.2(as.matrix(A_normHeatTopPostFC),
scale="row", trace="none", Colv=F, margins = c(11,10), main=paste("Top",
nrow(A_normHeatTopPostFC), "Post FC Genes"))
## Warning in heatmap.2(as.matrix(A_normHeatTopPostFC), scale = "row", trace =
## "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
## column dendogram.

heatmap.2(as.matrix(A_normHeatTopRealFC),
scale="row", trace="none", Colv=F, margins = c(11,10), main=paste("Top",
nrow(A_normHeatTopRealFC), "Real FC Genes"))
## Warning in heatmap.2(as.matrix(A_normHeatTopRealFC), scale = "row", trace =
## "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
## column dendogram.

heatmap.2(as.matrix(A_normHeatBotPostFC),
scale="row", trace="none", Colv=F, margins = c(11,10), main=paste("Bottom",
nrow(A_normHeatBotPostFC), "Post FC Genes"))
## Warning in heatmap.2(as.matrix(A_normHeatBotPostFC), scale = "row", trace =
## "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
## column dendogram.

heatmap.2(as.matrix(A_normHeatBotRealFC),
scale="row", trace="none", Colv=F, margins = c(11,10), main=paste("Bottom",
nrow(A_normHeatBotRealFC), "Real FC Genes"))
## Warning in heatmap.2(as.matrix(A_normHeatBotRealFC), scale = "row", trace =
## "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
## column dendogram.

######################## Heatmapped based on fold change by individual sample ########################
######################## Heatmap most variable genes by cluster ########################
heatmap.2(as.matrix(normGeneVar50CLUST),col=rev(morecols(50)),trace="none",
main=paste("Top", nrow(normGeneVar50CLUST), "most variable genes"), scale="row", margins = c(8,10))

heatmap.2(as.matrix(normGeneVar500CLUST),col=rev(morecols(500)),trace="none",
main=paste("Top", nrow(normGeneVar500CLUST), "most variable genes"), scale="row", margins = c(8,10))

######################## Most variable genes mapped by individual sample ########################
######################## Volcano plots w/ PPDE ########################
# Define colors
A_geneFDR1$PPDERealFCColor = "black"
A_geneFDR1$PPDERealFCColor[A_geneFDR1$PPDE > 0.95 & abs(log2(A_geneFDR1$RealFC)) < 15] = "blue"
A_geneFDR1$PPDERealFCColor[A_geneFDR1$PPDE > 0.95 & abs(log2(A_geneFDR1$RealFC)) < 5] = "red"
# Plot real FC
par(mfrow=c(1,2))
with(A_geneFDR1, plot(log2(RealFC), -log10(PPDE), pch=20, main="PPDE vs. Real FC", col=A_geneFDR1$PPDERealFCColor))
with(A_geneFDR1, plot(log2(RealFC), (PPDE), pch=20, main="PPDE vs. Real FC", col=A_geneFDR1$PPDERealFCColor))

# Define colors
A_geneFDR1$PPDEPostFCColor = "black"
A_geneFDR1$PPDEPostFCColor[A_geneFDR1$PPDE > 0.95 & abs(log2(A_geneFDR1$PostFC)) < 15] = "blue"
A_geneFDR1$PPDEPostFCColor[A_geneFDR1$PPDE > 0.95 & abs(log2(A_geneFDR1$PostFC)) < 5] = "red"
# Plot posterior FC
par(mfrow=c(1,2))
with(A_geneFDR1, plot(log2(PostFC), -log10(PPDE), pch=20, main="PPDE vs. Post FC", col=A_geneFDR1$PPDEPostFCColor))
with(A_geneFDR1, plot(log2(PostFC), (PPDE), pch=20, main="PPDE vs. Post FC", col=A_geneFDR1$PPDEPostFCColor))

# Define colors for real FC volcano plot
A_geneFDR1$PPDERealFCthreshold = as.factor(abs(log2(A_geneFDR1$RealFC)) > 5 & A_geneFDR1$PPDE > 0.95)
# Plot real FC
g1= ggplot(data=A_geneFDR1, aes(x=log2(RealFC), y=-log10(PPDE), color=PPDERealFCthreshold)) +
geom_point(alpha=0.4, size=1.75) +
#opts(legend.position = "none") +
#xlim(c(-10, 10)) + ylim(c(0, 15)) +
ggtitle("PPDE vs. Real Fold Change") +
xlab("log2 Real Fold Change") + ylab("-log10(PPDE)")
g2 = ggplot(data=A_geneFDR1, aes(x=log2(RealFC), y=(PPDE), color=PPDERealFCthreshold)) +
geom_point(alpha=0.4, size=1.75) +
#opts(legend.position = "none") +
#xlim(c(-10, 10)) + ylim(c(0, 15)) +
ggtitle("PPDE vs. Real Fold Change") +
xlab("log2 Real Fold Change") + ylab("PPDE")
grid.arrange(g1, g2, ncol=2)

# Define colors for post FC volcano plot
A_geneFDR1$PPDEPostFCthreshold = as.factor(abs(log2(A_geneFDR1$PostFC)) > 5 & A_geneFDR1$PPDE > 0.95)
# Plot post FC
g1= ggplot(data=A_geneFDR1, aes(x=log2(PostFC), y=-log10(PPDE), color=PPDEPostFCthreshold)) +
geom_point(alpha=0.4, size=1.75) +
#opts(legend.position = "none") +
#xlim(c(-10, 10)) + ylim(c(0, 15)) +
ggtitle("PPDE vs. Posterior Fold Change") +
xlab("log2 Posterior Fold Change") + ylab("-log10(PPDE)")
g2 = ggplot(data=A_geneFDR1, aes(x=log2(PostFC), y=(PPDE), color=PPDEPostFCthreshold)) +
geom_point(alpha=0.4, size=1.75) +
#opts(legend.position = "none") +
#xlim(c(-10, 10)) + ylim(c(0, 15)) +
ggtitle("PPDE vs. Posterior Fold Change") +
xlab("log2 Posterior Fold Change") + ylab("PPDE")
grid.arrange(g1, g2, ncol=2)

######################## Plotted PPDE volcanoes ########################
######################## Volcano plots w/ FDR ########################
# Define FDR values
A_geneFDR1$FDR = 1 - A_geneFDR1$PPDE
# Define colors
A_geneFDR1$FDRRealFCColor = "black"
A_geneFDR1$FDRRealFCColor[A_geneFDR1$FDR < 0.05 & abs(log2(A_geneFDR1$RealFC)) < 15] = "blue"
A_geneFDR1$FDRRealFCColor[A_geneFDR1$FDR < 0.05 & abs(log2(A_geneFDR1$RealFC)) < 5] = "red"
# Plot real FC
par(mfrow=c(1,2))
with(A_geneFDR1, plot(log2(RealFC), -log10(FDR), pch=20, main="FDR vs. Real FC", col=A_geneFDR1$FDRRealFCColor))
with(A_geneFDR1, plot(log2(RealFC), (FDR), pch=20, main="FDR vs. Real FC", col=A_geneFDR1$FDRRealFCColor))

# Define colors
A_geneFDR1$FDRPostFCColor = "black"
A_geneFDR1$FDRPostFCColor[A_geneFDR1$FDR < 0.05 & abs(log2(A_geneFDR1$PostFC)) < 15] = "blue"
A_geneFDR1$FDRPostFCColor[A_geneFDR1$FDR < 0.05 & abs(log2(A_geneFDR1$PostFC)) < 5] = "red"
# Plot posterior FC
par(mfrow=c(1,2))
with(A_geneFDR1, plot(log2(PostFC), -log10(FDR), pch=20, main="FDR vs. Post FC", col=A_geneFDR1$FDRPostFCColor))
with(A_geneFDR1, plot(log2(PostFC), (FDR), pch=20, main="FDR vs. Post FC", col=A_geneFDR1$FDRPostFCColor))

# Define colors for real FC volcano plot
A_geneFDR1$FDRRealFCthreshold = as.factor(abs(log2(A_geneFDR1$RealFC)) > 5 & A_geneFDR1$FDR < 0.05)
# Plot real FC
g1= ggplot(data=A_geneFDR1, aes(x=log2(RealFC), y=-log10(FDR), color=FDRRealFCthreshold)) +
geom_point(alpha=0.4, size=1.75) +
#opts(legend.position = "none") +
#xlim(c(-10, 10)) + ylim(c(0, 100)) +
ggtitle("FDR vs. Real Fold Change") +
xlab("log2 Real Fold Change") + ylab("-log10(FDR)")
g1

g2 = ggplot(data=A_geneFDR1, aes(x=log2(RealFC), y=(FDR), color=FDRRealFCthreshold)) +
geom_point(alpha=0.4, size=1.75) +
#opts(legend.position = "none") +
#xlim(c(-10, 10)) + ylim(c(0, 20)) +
ggtitle("FDR vs. Real Fold Change") +
xlab("log2 Real Fold Change") + ylab("FDR")
grid.arrange(g1, g2, ncol=2)

# Define colors for post FC volcano plot
A_geneFDR1$FDRPostFCthreshold = as.factor(abs(log2(A_geneFDR1$PostFC)) > 5 & A_geneFDR1$FDR < 0.05)
# Plot post FC
g1= ggplot(data=A_geneFDR1, aes(x=log2(PostFC), y=-log10(FDR), color=FDRPostFCthreshold)) +
geom_point(alpha=0.4, size=1.75) +
#opts(legend.position = "none") +
#xlim(c(-10, 10)) + ylim(c(0, 15)) +
ggtitle("FDR vs. Posterior Fold Change") +
xlab("log2 Posterior Fold Change") + ylab("-log10(FDR)")
g2 = ggplot(data=A_geneFDR1, aes(x=log2(PostFC), y=(FDR), color=FDRPostFCthreshold)) +
geom_point(alpha=0.4, size=1.75) +
#opts(legend.position = "none") +
#xlim(c(-10, 10)) + ylim(c(0, 15)) +
ggtitle("FDR vs. Posterior Fold Change") +
xlab("log2 Posterior Fold Change") + ylab("FDR")
grid.arrange(g1, g2, ncol=2)

######################## Plotted FDR volcanoes ########################
######################## Write table of DE genes w/ FDR=0.01 ########################
write.table(A_geneFDR1[,1:6], file = "/mnt/bd2/RSEM_EBSeq/Lung_DE_FDR0.01",
quote = FALSE, sep = ",", row.names = TRUE, col.names = TRUE)
######################## Table written ########################
######################## For individual sample ########################
#Find highest expressed genes
#idx = order(A_RSEM_ind[,"TPM"], decreasing=T)
#A_RSEM_ind[idx[1:10], c("gene_id", "expected_count", "TPM")]
#######################################################################