#############################################################
################# Abby's code. DO NOT EDIT! #################
################# Please cite appropriately.#################
#############################################################

################# This script was developed using the following resources #################
# Chimenti, M. (2015, August 17). Create a volcano plot on EBSeq output. Retrieved July 19, 2017, from 
#     http://www.michaelchimenti.com/2015/08/create-a-volcano-plot-on-ebseq-output/ 
# Gillespie, C. (2011, June 21). Volcano plots of microarray data. Retrieved July 19, 2017, from 
#     http://bioinformatics.knowledgeblog.org/2011/06/21/volcano-plots-of-microarray-data/
# Leng, N., Kendziorski C. (2015). EBSeq: An R Package for Differential Expression Analysis using 
#     RNA-Seq Data. R package version 1.14.10.
# Li, B., A Short Tutorial for RSEM. (2015), GitHub repository, 
#     https://github.com/bli25ucb/RSEM_tutorial
# Phipson, B., Trigos, A., Ritchie, M., Doyle, M., Dashnow, H., & Law, C. (n.d.). RNA-seq Analysis in 
#     R. Retrieved July 19, 2017, from http://combine-australia.github.io/RNAseq-R/06-rnaseq-day1.html 
############################################################################################

library(EBSeq)
## Loading required package: blockmodeling
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
## Loading required package: testthat
library(gridExtra)
library(RColorBrewer)
require(ggplot2)
## Loading required package: ggplot2
######################## Declare variables ########################
# Read-in data
#A_genedata = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeqTestEBSeq_results_genes.txt", header=T, stringsAsFactors=F)
#A_normal = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeqTestEBSeq_results_genes.txt.normalized_data_matrix", header=T, stringsAsFactors=F)
#A_RSEM_ind = read.table("/mnt/bd2/Aligned_Reads_memory/1108724_S17_L005_Aligned.genes.results", header=T, stringsAsFactors=F)
#A_geneFDR5 = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeq_results_genes_fdr-0.05.txt", header=T, stringsAsFactors=F)
#A_geneFDR1 = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeq_results_genes_fdr-0.1.txt", header=T, stringsAsFactors=F)


A_genedata = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeq_NEW_Lung_results_genes.txt", header=T, stringsAsFactors=F)
A_normal = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeq_NEW_Lung_results_genes.txt.normalized_data_matrix", header=T, stringsAsFactors=F)
#A_RSEM_ind = read.table("/mnt/bd2/Aligned_Reads_memory/1108724_S17_L005_Aligned.genes.results", header=T, stringsAsFactors=F)
A_geneFDR5 = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/NEW_Lung_EBSeq_results_genes_fdr-0.05.txt", header=T, stringsAsFactors=F)
A_geneFDR1 = read.table("/mnt/bd2/RSEM_EBSeq/EBSeqTest/EBSeq_NEW_Lung_results_genes_fdr-0.1.txt", header=T, stringsAsFactors=F)


# Clean-up headers before declaring other vars
# Edit column headers to sample names only 
colnames(A_normal) = gsub("X.mnt.bd2.Aligned_Reads_disk1.", "", colnames(A_normal))
colnames(A_normal) = gsub("_.*", "", colnames(A_normal))

# Subset normalized gene counts w/ FDR=0.01
normFDRintersect = A_normal[rownames(A_normal) %in% rownames(A_geneFDR1),]

# Subset individual normalized gene counts w/ FDR=0.01
normFDRInterect = A_geneFDR1[rownames(A_normal) %in% rownames(A_geneFDR1),]

# Subset clustered gene counts w/ FDR=0.01
A_geneCluster = A_geneFDR1[,5:6]

# Clean-up header for new var before declaring other vars
names(A_geneCluster) = c("R", "DF")

# Define top/bottom FC vars
A_TopPostFCnames = rownames(A_geneFDR1[order(-A_geneFDR1$PostFC),])[1:50]
A_TopRealFCnames = rownames(A_geneFDR1[order(-A_geneFDR1$RealFC),])[1:50]
A_BotPostFCnames = rownames(A_geneFDR1[order(A_geneFDR1$PostFC),])[1:50]
A_BotRealFCnames = rownames(A_geneFDR1[order(A_geneFDR1$RealFC),])[1:50]

# Subset top/bottom FC var values in normalized data
A_normHeatTopPostFC = subset(A_normal, rownames(A_normal) %in% A_TopPostFCnames)
A_normHeatTopRealFC = subset(A_normal, rownames(A_normal) %in% A_TopRealFCnames)
A_normHeatBotPostFC = subset(A_normal, rownames(A_normal) %in% A_BotPostFCnames)
A_normHeatBotRealFC = subset(A_normal, rownames(A_normal) %in% A_BotRealFCnames)

# Find variance for each gene for normalized INDVIDIDUAL SAMPLE gene counts w/ FDR = 0.01; 
# then subset top 50 and 500
normGeneVar <- apply(normFDRintersect, 1, var)
findNormGeneVar50 = names(sort(normGeneVar, decreasing=TRUE))[1:50]
findNormGeneVar500 = names(sort(normGeneVar, decreasing=TRUE))[1:500]
normGeneVar50 = A_normal[findNormGeneVar50,]
normGeneVar500 = A_normal[findNormGeneVar500,]


# Find variance for each gene for normalized CLUSTERED gene counts w/ FDR = 0.01; 
# then subset top 50 and 500
normGeneVarCLUST <- apply(A_geneCluster, 1, var)
findNormGeneVar50CLUST = names(sort(normGeneVarCLUST, decreasing=TRUE))[1:50]
findNormGeneVar500CLUST = names(sort(normGeneVarCLUST, decreasing=TRUE))[1:500]
normGeneVar50CLUST = A_geneCluster[findNormGeneVar50CLUST,]
normGeneVar500CLUST = A_geneCluster[findNormGeneVar500CLUST,]

# Playing w/ no FDR cutoff
normGeneVarNO <- apply(A_normal, 1, var)
findNormGeneVar50NO = names(sort(normGeneVarNO, decreasing=TRUE))[1:50]
findNormGeneVar500NO = names(sort(normGeneVarNO, decreasing=TRUE))[1:500]
normGeneVar50NO = A_normal[findNormGeneVar50NO,]
normGeneVar500NO = A_normal[findNormGeneVar500NO,]

# Define color palette
mypalette = brewer.pal(9,"YlOrRd")
morecols = colorRampPalette(mypalette)
######################## Variables declared ########################


# Find means across both clusters combined
A_genedata$Mean = rowMeans(A_genedata[,grep("^C",names(A_genedata))])


######################## Histogram of PPDE frequency ########################
hist( A_genedata$PPDE, breaks=10, col="mistyrose3", main = "PPDE Frequency of All Genes",
      xlab = "PPDE", ylab = "Frequency")

hist( A_geneFDR1$PPDE, breaks=10, col="mistyrose3", main = "PPDE Frequency of FDR = 0.01 Genes",
      xlab = "PPDE", ylab = "Frequency")

######################## Histogrammed PPDE frequency ########################



######################## Heatmap most variable genes by individual sample ########################
heatmap.2(as.matrix(normGeneVar50),col=rev(morecols(50)),trace="none",
          main=paste("Top", nrow(normGeneVar50), "most variable genes"), scale="row", margins = c(8,10))

heatmap.2(as.matrix(normGeneVar500),col=rev(morecols(500)),trace="none",
          main=paste("Top", nrow(normGeneVar500), "most variable genes"), scale="row", margins = c(8,10))

heatmap.2(as.matrix(normGeneVar50NO),col=rev(morecols(50)),trace="none",
          main=paste("Top", nrow(normGeneVar50NO), "most variable genes"), scale="row", margins = c(8,10), font.main = 8)

heatmap.2(as.matrix(normGeneVar500NO),col=rev(morecols(500)),trace="none",
          main=paste("Top", nrow(normGeneVar500NO), "most variable genes"), scale="row", margins = c(8,10), font.main=8)

######################## Most variable genes mapped by individual sample ########################



######################## Heatmap based on fold change by individual sample ########################
heatmap.2(as.matrix(A_normHeatTopPostFC),
          scale="row", trace="none", Colv=F, margins = c(11,10), main=paste("Top",
                nrow(A_normHeatTopPostFC), "Post FC Genes"))
## Warning in heatmap.2(as.matrix(A_normHeatTopPostFC), scale = "row", trace =
## "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
## column dendogram.

heatmap.2(as.matrix(A_normHeatTopRealFC),
          scale="row", trace="none", Colv=F, margins = c(11,10), main=paste("Top",
                nrow(A_normHeatTopRealFC), "Real FC Genes"))
## Warning in heatmap.2(as.matrix(A_normHeatTopRealFC), scale = "row", trace =
## "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
## column dendogram.

heatmap.2(as.matrix(A_normHeatBotPostFC),
          scale="row", trace="none", Colv=F, margins = c(11,10), main=paste("Bottom",
                nrow(A_normHeatBotPostFC), "Post FC Genes"))
## Warning in heatmap.2(as.matrix(A_normHeatBotPostFC), scale = "row", trace =
## "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
## column dendogram.

heatmap.2(as.matrix(A_normHeatBotRealFC),
          scale="row", trace="none", Colv=F, margins = c(11,10), main=paste("Bottom",
                nrow(A_normHeatBotRealFC), "Real FC Genes"))
## Warning in heatmap.2(as.matrix(A_normHeatBotRealFC), scale = "row", trace =
## "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
## column dendogram.

######################## Heatmapped based on fold change by individual sample ########################



######################## Heatmap most variable genes by cluster ########################
heatmap.2(as.matrix(normGeneVar50CLUST),col=rev(morecols(50)),trace="none",
          main=paste("Top", nrow(normGeneVar50CLUST), "most variable genes"), scale="row", margins = c(8,10))

heatmap.2(as.matrix(normGeneVar500CLUST),col=rev(morecols(500)),trace="none",
          main=paste("Top", nrow(normGeneVar500CLUST), "most variable genes"), scale="row", margins = c(8,10))

######################## Most variable genes mapped by individual sample ########################



######################## Volcano plots w/ PPDE ########################
# Define colors
A_geneFDR1$PPDERealFCColor = "black"
A_geneFDR1$PPDERealFCColor[A_geneFDR1$PPDE > 0.95 & abs(log2(A_geneFDR1$RealFC)) < 15] = "blue"
A_geneFDR1$PPDERealFCColor[A_geneFDR1$PPDE > 0.95 & abs(log2(A_geneFDR1$RealFC)) < 5] = "red"

# Plot real FC
par(mfrow=c(1,2))
with(A_geneFDR1, plot(log2(RealFC), -log10(PPDE), pch=20, main="PPDE vs. Real FC", col=A_geneFDR1$PPDERealFCColor))
with(A_geneFDR1, plot(log2(RealFC), (PPDE), pch=20, main="PPDE vs. Real FC", col=A_geneFDR1$PPDERealFCColor))

# Define colors
A_geneFDR1$PPDEPostFCColor = "black"
A_geneFDR1$PPDEPostFCColor[A_geneFDR1$PPDE > 0.95 & abs(log2(A_geneFDR1$PostFC)) < 15] = "blue"
A_geneFDR1$PPDEPostFCColor[A_geneFDR1$PPDE > 0.95 & abs(log2(A_geneFDR1$PostFC)) < 5] = "red"

# Plot posterior FC
par(mfrow=c(1,2))
with(A_geneFDR1, plot(log2(PostFC), -log10(PPDE), pch=20, main="PPDE vs. Post FC", col=A_geneFDR1$PPDEPostFCColor))
with(A_geneFDR1, plot(log2(PostFC), (PPDE), pch=20, main="PPDE vs. Post FC", col=A_geneFDR1$PPDEPostFCColor))

# Define colors for real FC volcano plot
A_geneFDR1$PPDERealFCthreshold = as.factor(abs(log2(A_geneFDR1$RealFC)) > 5 & A_geneFDR1$PPDE > 0.95)

# Plot real FC
g1= ggplot(data=A_geneFDR1, aes(x=log2(RealFC), y=-log10(PPDE), color=PPDERealFCthreshold)) +
  geom_point(alpha=0.4, size=1.75) +
  #opts(legend.position = "none") +
  #xlim(c(-10, 10)) + ylim(c(0, 15)) +
  ggtitle("PPDE vs. Real Fold Change") +
  xlab("log2 Real Fold Change") + ylab("-log10(PPDE)")

g2 = ggplot(data=A_geneFDR1, aes(x=log2(RealFC), y=(PPDE), color=PPDERealFCthreshold)) +
  geom_point(alpha=0.4, size=1.75) +
  #opts(legend.position = "none") +
  #xlim(c(-10, 10)) + ylim(c(0, 15)) +
  ggtitle("PPDE vs. Real Fold Change") +
  xlab("log2 Real Fold Change") + ylab("PPDE")

grid.arrange(g1, g2, ncol=2)

# Define colors for post FC volcano plot
A_geneFDR1$PPDEPostFCthreshold = as.factor(abs(log2(A_geneFDR1$PostFC)) > 5 & A_geneFDR1$PPDE > 0.95)

# Plot post FC
g1= ggplot(data=A_geneFDR1, aes(x=log2(PostFC), y=-log10(PPDE), color=PPDEPostFCthreshold)) +
  geom_point(alpha=0.4, size=1.75) +
  #opts(legend.position = "none") +
  #xlim(c(-10, 10)) + ylim(c(0, 15)) +
  ggtitle("PPDE vs. Posterior Fold Change") +
  xlab("log2 Posterior Fold Change") + ylab("-log10(PPDE)")

g2 = ggplot(data=A_geneFDR1, aes(x=log2(PostFC), y=(PPDE), color=PPDEPostFCthreshold)) +
  geom_point(alpha=0.4, size=1.75) +
  #opts(legend.position = "none") +
  #xlim(c(-10, 10)) + ylim(c(0, 15)) +
  ggtitle("PPDE vs. Posterior Fold Change") +
    xlab("log2 Posterior Fold Change") + ylab("PPDE")

grid.arrange(g1, g2, ncol=2)

######################## Plotted PPDE volcanoes ########################



######################## Volcano plots w/ FDR ########################
# Define FDR values
A_geneFDR1$FDR = 1 - A_geneFDR1$PPDE

# Define colors
A_geneFDR1$FDRRealFCColor = "black"
A_geneFDR1$FDRRealFCColor[A_geneFDR1$FDR < 0.05 & abs(log2(A_geneFDR1$RealFC)) < 15] = "blue"
A_geneFDR1$FDRRealFCColor[A_geneFDR1$FDR < 0.05 & abs(log2(A_geneFDR1$RealFC)) < 5] = "red"

# Plot real FC
par(mfrow=c(1,2))
with(A_geneFDR1, plot(log2(RealFC), -log10(FDR), pch=20, main="FDR vs. Real FC", col=A_geneFDR1$FDRRealFCColor))
with(A_geneFDR1, plot(log2(RealFC), (FDR), pch=20, main="FDR vs. Real FC", col=A_geneFDR1$FDRRealFCColor))

# Define colors
A_geneFDR1$FDRPostFCColor = "black"
A_geneFDR1$FDRPostFCColor[A_geneFDR1$FDR < 0.05 & abs(log2(A_geneFDR1$PostFC)) < 15] = "blue"
A_geneFDR1$FDRPostFCColor[A_geneFDR1$FDR < 0.05 & abs(log2(A_geneFDR1$PostFC)) < 5] = "red"

# Plot posterior FC
par(mfrow=c(1,2))
with(A_geneFDR1, plot(log2(PostFC), -log10(FDR), pch=20, main="FDR vs. Post FC", col=A_geneFDR1$FDRPostFCColor))
with(A_geneFDR1, plot(log2(PostFC), (FDR), pch=20, main="FDR vs. Post FC", col=A_geneFDR1$FDRPostFCColor))

# Define colors for real FC volcano plot
A_geneFDR1$FDRRealFCthreshold = as.factor(abs(log2(A_geneFDR1$RealFC)) > 5 & A_geneFDR1$FDR < 0.05)

# Plot real FC
g1= ggplot(data=A_geneFDR1, aes(x=log2(RealFC), y=-log10(FDR), color=FDRRealFCthreshold)) +
  geom_point(alpha=0.4, size=1.75) +
  #opts(legend.position = "none") +
  #xlim(c(-10, 10)) + ylim(c(0, 100)) +
  ggtitle("FDR vs. Real Fold Change") +
  xlab("log2 Real Fold Change") + ylab("-log10(FDR)")

g1

g2 = ggplot(data=A_geneFDR1, aes(x=log2(RealFC), y=(FDR), color=FDRRealFCthreshold)) +
  geom_point(alpha=0.4, size=1.75) +
  #opts(legend.position = "none") +
  #xlim(c(-10, 10)) + ylim(c(0, 20)) +
  ggtitle("FDR vs. Real Fold Change") +
  xlab("log2 Real Fold Change") + ylab("FDR")

grid.arrange(g1, g2, ncol=2)

# Define colors for post FC volcano plot
A_geneFDR1$FDRPostFCthreshold = as.factor(abs(log2(A_geneFDR1$PostFC)) > 5 & A_geneFDR1$FDR < 0.05)

# Plot post FC
g1= ggplot(data=A_geneFDR1, aes(x=log2(PostFC), y=-log10(FDR), color=FDRPostFCthreshold)) +
  geom_point(alpha=0.4, size=1.75) +
  #opts(legend.position = "none") +
  #xlim(c(-10, 10)) + ylim(c(0, 15)) +
  ggtitle("FDR vs. Posterior Fold Change") +
  xlab("log2 Posterior Fold Change") + ylab("-log10(FDR)")

g2 = ggplot(data=A_geneFDR1, aes(x=log2(PostFC), y=(FDR), color=FDRPostFCthreshold)) +
  geom_point(alpha=0.4, size=1.75) +
  #opts(legend.position = "none") +
  #xlim(c(-10, 10)) + ylim(c(0, 15)) +
  ggtitle("FDR vs. Posterior Fold Change") +
  xlab("log2 Posterior Fold Change") + ylab("FDR")

grid.arrange(g1, g2, ncol=2)

######################## Plotted FDR volcanoes ########################



######################## Write table of DE genes w/ FDR=0.01 ########################
write.table(A_geneFDR1[,1:6], file = "/mnt/bd2/RSEM_EBSeq/Lung_DE_FDR0.01", 
            quote = FALSE, sep = ",", row.names = TRUE, col.names = TRUE)
######################## Table written ########################



######################## For individual sample ########################
#Find highest expressed genes
#idx = order(A_RSEM_ind[,"TPM"], decreasing=T)
#A_RSEM_ind[idx[1:10], c("gene_id", "expected_count", "TPM")]
#######################################################################