# Clear the console 
rm(list = ls())

# Load the DESeq2 package
library("DESeq2")

# Load the limma package
library("limma")

# Set working directory for the datafile
setwd("~/Documents/git_repos/Oyster_DEseq2/")
# Load the file
data <- read.csv("data_51_full.txt", sep = "\t")

# Remove the "X" that gets prepended to the colnames
names(data) <- sub("X", "", names(data))

# Add the first column as rownames to the data
row.names(data) <- data[,1]

# Remove the first column that contains gene id's
data <- data[,-1]
# Create a df with sample, condition, population and replicate information
samples <- data.frame(
  sample = names(data),
  cond = sub("(5|33)_(OP|M|LL)_([1-6])","\\1",names(data)),
  pop  = sub("(5|33)_(OP|M|LL)_([1-6])","\\2",names(data)),
  rep  = sub("(5|33)_(OP|M|LL)_([1-6])","\\3",names(data))
)
samples
##     sample cond pop rep
## 1   5_OP_1    5  OP   1
## 2   5_OP_2    5  OP   2
## 3   5_OP_3    5  OP   3
## 4   5_OP_4    5  OP   4
## 5   5_OP_5    5  OP   5
## 6   5_OP_6    5  OP   6
## 7  33_OP_1   33  OP   1
## 8  33_OP_2   33  OP   2
## 9  33_OP_3   33  OP   3
## 10 33_OP_4   33  OP   4
## 11 33_OP_5   33  OP   5
## 12 33_OP_6   33  OP   6
## 13   5_M_1    5   M   1
## 14   5_M_2    5   M   2
## 15   5_M_3    5   M   3
## 16   5_M_4    5   M   4
## 17   5_M_5    5   M   5
## 18   5_M_6    5   M   6
## 19  33_M_1   33   M   1
## 20  33_M_2   33   M   2
## 21  33_M_3   33   M   3
## 22  33_M_4   33   M   4
## 23  33_M_5   33   M   5
## 24  33_M_6   33   M   6
## 25  5_LL_1    5  LL   1
## 26  5_LL_2    5  LL   2
## 27  5_LL_3    5  LL   3
## 28  5_LL_4    5  LL   4
## 29  5_LL_5    5  LL   5
## 30  5_LL_6    5  LL   6
## 31 33_LL_1   33  LL   1
## 32 33_LL_2   33  LL   2
## 33 33_LL_3   33  LL   3
## 34 33_LL_4   33  LL   4
## 35 33_LL_5   33  LL   5
## 36 33_LL_6   33  LL   6
# Count matrix input (no design here)
dds <- DESeqDataSetFromMatrix(countData = data,
                              colData = samples,
                              design = ~ cond + cond:pop) #This is essentially the same design as                                                             # the ~cond + pop + cond:pop, but 
                                                          # makes it easier to perform the contrasts

dds
## class: DESeqDataSet 
## dim: 51574 36 
## metadata(0):
## assays(1): counts
## rownames(51574): oystercontig_1 oystercontig_2 ...
##   oystercontig_51573 oystercontig_51574
## rowRanges metadata column names(0):
## colnames(36): 1 2 ... 35 36
## colData names(4): sample cond pop rep
# Add feature data (gene id's as column)
featureData <- data.frame(gene=rownames(data))
(mcols(dds) <- DataFrame(mcols(dds), featureData))
## DataFrame with 51574 rows and 1 column
##                     gene
##                 <factor>
## 1         oystercontig_1
## 2         oystercontig_2
## 3         oystercontig_3
## 4         oystercontig_4
## 5         oystercontig_5
## ...                  ...
## 51570 oystercontig_51570
## 51571 oystercontig_51571
## 51572 oystercontig_51572
## 51573 oystercontig_51573
## 51574 oystercontig_51574
# Filtering to remove rows with 0 reads
dds <- dds[ rowSums(counts(dds)) > 1, ]
# First generate the dispersions
dds <- DESeq(dds)

# Dropping condition column
dds <- nbinomLRT(dds, full = design(dds), reduced = ~ cond)

# Check the coefficients
resultsNames(dds)
## [1] "Intercept"    "cond_5_vs_33" "cond33.popM"  "cond5.popM"  
## [5] "cond33.popOP" "cond5.popOP"
# nbinormLRT
# For analyses using the likelihood ratio test (using nbinomLRT), the p-values are determined solely # by the difference in deviance between the full and reduced model formula. A log2 fold change is   # included, which  # can be controlled using the name argument, or by default this will be the      # estimated coefficient for the   # last element of resultsNames(object)

# Question 1: There is not a single contrast which gives the answer of DE of e.g. (OP vs LL) or (M  # vs LL), but you can accomplish this by taking the union of the FDR limited sets from the above two # pairwise comparisons. 

# You can compare OP vs LL for treatment 5 with the following
res_OPvsLL.5 <- results(dds, test = "Wald", name="cond5.popOP")
res_OPvsLL.5 <- subset(res_OPvsLL.5, res_OPvsLL.5$padj <= 0.05)
res_OPvsLL.5.genes <- row.names(res_OPvsLL.5)

# You can compare M vs LL for treatment 5 with the following
res_MvsLL.5 <- results(dds, test = "Wald", name = "cond5.popM")
res_MvsLL.5 <- subset(res_MvsLL.5, res_MvsLL.5$padj <= 0.05)
res_MvsLL.5.genes <- row.names(res_MvsLL.5)

# Combining the two above..
comb <- c(res_OPvsLL.5.genes,res_MvsLL.5.genes)

# Comparing comb with the above two
res_OPvsLL.5.genes.2 <- comb %in% res_OPvsLL.5.genes
res_MvsLL.5.genes.2 <- comb %in% res_MvsLL.5.genes  

# Generating venn counts to plot venn diagram
counts.5 <- cbind(res_OPvsLL.5.genes.2, res_MvsLL.5.genes.2)
results.5 <- vennCounts(counts.5)
vennDiagram(results.5, cex = 1,names = c("OPvsLL.5","MvsLL.5"), circle.col = c("red", "blue"))

# You can compare OP vs LL for treatment 33 with the following
res_OPvsLL.33 <- results(dds, test = "Wald", name="cond33.popOP")
res_OPvsLL.33 <- subset(res_OPvsLL.33, res_OPvsLL.33$padj <= 0.05)
res_OPvsLL.33.genes <- row.names(res_OPvsLL.33)

# You can compare M vs LL for treatment 33 with the following
res_MvsLL.33 <- results(dds, test = "Wald", name = "cond33.popM")
res_MvsLL.33 <- subset(res_MvsLL.33, res_MvsLL.33$padj <= 0.05)
res_MvsLL.33.genes <- row.names(res_MvsLL.33)

# Combining the two above..
comb <- c(res_OPvsLL.33.genes,res_MvsLL.33.genes)

# Comparing the comb with the above two
res_OPvsLL.33.genes.2 <- comb %in% res_OPvsLL.33.genes
res_MvsLL.33.genes.2 <- comb %in% res_MvsLL.33.genes  

# Generating venn counts to plot venn diagram
counts.33 <- cbind(res_OPvsLL.33.genes.2, res_MvsLL.33.genes.2)
results.33 <- vennCounts(counts.33)
vennDiagram(results.33, cex = 1,names = c("OPvsLL.33","MvsLL.33"), circle.col = c("red", "blue"))

# Question 2:
# If you want to ask if the OP vs LL effect is different in treatment 33 vs treatment 5, you form the contrast:
dds <- DESeq(dds)
res.1 <- results(dds, contrast=list("cond33.popOP", "cond5.popOP"))
res.1 <- subset(res.1, res.1$padj <= 0.05)
dim(res.1) # Very few genes
## [1] 16  6
# If you want to ask if the M vs LL effect is different in treatment 33 vs treatment 5, you form the contrast:
res.2 <- results(dds, contrast=list("cond33.popM", "cond5.popM"))
res.2 <- subset(res.2, res.2$padj <= 0.05)
dim(res.2)
## [1] 173   6
# Then you extract the genes in the union of venn diagram. Not doing union here since there are very few gens in the the first contrast