Turtle_Brain_Data_Troubleshoot

#Reference:
#http://www.gettinggeneticsdone.com/2014/05/r-volcano-plots-to-visualize-rnaseq-microarray.html

#Read-in /mnt/bd2/Turles/BrainGOonly_edit.csv
BrainGOdata <- read.csv("/mnt/bd2/Turles/BrainGOonly_edit.csv", header=TRUE, sep=",")

#Glance at data
head(BrainGOdata)

##        SeqName                         Description PPEE PPDE     PostFC
## 1 c14505_g1_i1                       Myb-related B    0    1 0.76230333
## 2  c1819_g1_i1 importin subunit alpha-3 isoform X1    0    1 0.89173484
## 3 c18256_g1_i2           Transcription factor COE1    0    1 0.13070670
## 4  c1869_g1_i1         14-3-3 zeta-like isoform X1    0    1 0.89780314
## 5 c27686_g1_i1  PREDICTED: ovostatin-like, partial    0    1 0.01490221
## 6 c28548_g1_i1       four and a half LIM domains 1    0    1 0.66376355
##        RealFC   FC......   ABS.FC.       eValue FDR pValue sim.mean
## 1 0.762119513  -1.311814  1.311814  0.00000e+00   0      0    98.00
## 2 0.891711420  -1.121410  1.121410  0.00000e+00   0      0   100.00
## 3 0.112813204  -7.650717  7.650717  1.01809e-28   0      0    83.33
## 4 0.897813424  -1.113830  1.113830 1.67911e-165   0      0    99.00
## 5 0.000313928 -67.104152 67.104152  0.00000e+00   0      0    97.00
## 6 0.663559680  -1.506561  1.506561  0.00000e+00   0      0    98.33
##   log2FoldChange X.GO       GO.IDs                          GO.Names
## 1   0.3915629175    4 F:GO:0003677                      F:GO:0003700
## 2   0.1653133164    8 C:GO:0070062                      P:GO:0006607
## 3   2.9355949561    6 C:GO:0005634                      F:GO:0003677
## 4   0.1555289523    1 F:GO:0019904 F:protein domain specific binding
## 5   6.0683301237    3 C:GO:0005615                      F:GO:0004866
## 6   0.5912586803    1 F:GO:0008270                F:zinc ion binding
##            tre1
## 1  P:GO:0006355
## 2  C:GO:0005829
## 3  F:GO:0046872
## 4              
## 5  P:GO:0010951
## 6

#Try plotting w/ eVal
with(BrainGOdata, plot(log2FoldChange, -log10(eValue), main="Volcano plot"))

#Produces 50 warnings:
#1: In min(x) : no non-missing arguments to min; returning Inf
#2: In max(x) : no non-missing arguments to max; returning -Inf

###Investigate what may cause error, e.g. "NA" and zeros
#See if there are NAs in eVal column
sum(is.na(BrainGOdata$eValue))

## [1] 1110

#See if there are zeros in eVal column
length(grep("0", BrainGOdata$eValue))

## [1] 1599

###Subset the data: remove NAs and zeros in eVal column
#Remove NAs
BrainGOdata <- BrainGOdata[!is.na(BrainGOdata$eValue),]

#Remove zeros
BrainGOdata <- BrainGOdata[BrainGOdata$eValue!="0", ]

#Re-check to make sure NAs and zeros were cleared out
sum(is.na(BrainGOdata$eValue))

## [1] 0

length(grep("0", BrainGOdata$eValue))

## [1] 579

#For some reason, the above command says there are 579 zeros,
#but min() returns 1.72138e-180
min(BrainGOdata$eValue)

## [1] 1.72138e-180

#Try plotting again
with(BrainGOdata, plot(log2FoldChange, -log10(eValue), main="Volcano plot",  xlab = "log2FoldChange", ylab = "-log10(eValue)"))

###Let's try this w/ the p-val instead of e-val
###Note: no cut-offs are implemented here
#Reset data
BrainGOdata <- read.csv("/mnt/bd2/Turles/BrainGOonly_edit.csv", header=TRUE, sep=",")

#Remove NAs
BrainGOdata <- BrainGOdata[!is.na(BrainGOdata$pValue),]

#Remove zeros
BrainGOdata <- BrainGOdata[BrainGOdata$pValue!="0", ]

#Try plotting
with(BrainGOdata, plot(log2FoldChange, -log10(na.omit(as.numeric(pValue))), main="Volcano plot", xlab = "log2FoldChange", ylab = "-log10(eValue)"))

###Try using ggplot

#install.packages("ggplot2") #only run this command once and if you haven't already installed
library(ggplot2)

#First, find FDR cut-off
BrainGOdata$threshold = as.factor(abs(as.numeric(BrainGOdata$log2FoldChange)) > 2 & as.numeric(BrainGOdata$FDR) < 0.05)

#Produces something, but it's not right
g = ggplot(data=BrainGOdata, aes(x=as.numeric(log2FoldChange), y=-log10(as.numeric(pValue)), colour=threshold)) +
  geom_point(alpha=0.4, size=1.75) +
  #opts(legend.position = "none") +
  geom_point() +
  xlab("log2 fold change") + ylab("-log10 p-value")
g

#Here's where I'm leaving off on 7/10/17
#Still trying to deal the the factor problem

#Convert to numeric
mylog2FoldChange <- as.numeric(as.character(BrainGOdata$log2FoldChange))
mypvalue <- -log10(as.numeric(as.character(BrainGOdata$pValue)))
myFDR <- as.numeric(as.character(BrainGOdata$FDR))

#First, find FDR cut-off
BrainGOdata$threshold = as.factor(abs(mylog2FoldChange) > 2 & myFDR < 0.05)

g = ggplot(data=BrainGOdata, aes(x=mylog2FoldChange, y=mypvalue, colour=threshold)) +
  geom_point(alpha=0.4, size=1.75) +
  theme(legend.position = "none") +
  ylim(c(-0.30, -2.4))
  xlim(c(-10, 10))

## <ScaleContinuousPosition>
##  Range:  
##  Limits:  -10 --   10

  xlab("log2 fold change") + ylab("-log10 p-value")

## Error in xlab("log2 fold change") + ylab("-log10 p-value"): non-numeric argument to binary operator

#levels(factor(BrainGOdata$pValue))

Turtle_Brain_Data_Troubleshoot

Abby Moore

July 10, 2017