DESeq2

# Clear the console 
rm(list = ls())

# Load the DESeq2 package
library("DESeq2")

# Load the limma package
library("limma")

# Set working directory for the datafile
setwd("~/Documents/git_repos/Oyster_DEseq2/")

# Load the file
data <- read.csv("data_51_full.txt", sep = "\t")

# Remove the "X" that gets prepended to the colnames
names(data) <- sub("X", "", names(data))

# Add the first column as rownames to the data
row.names(data) <- data[,1]

# Remove the first column that contains gene id's
data <- data[,-1]

# Create a df with sample, condition, population and replicate information
samples <- data.frame(
  sample = names(data),
  cond = sub("(5|33)_(OP|M|LL)_([1-6])","\\1",names(data)),
  pop  = sub("(5|33)_(OP|M|LL)_([1-6])","\\2",names(data)),
  rep  = sub("(5|33)_(OP|M|LL)_([1-6])","\\3",names(data))
)
samples

##     sample cond pop rep
## 1   5_OP_1    5  OP   1
## 2   5_OP_2    5  OP   2
## 3   5_OP_3    5  OP   3
## 4   5_OP_4    5  OP   4
## 5   5_OP_5    5  OP   5
## 6   5_OP_6    5  OP   6
## 7  33_OP_1   33  OP   1
## 8  33_OP_2   33  OP   2
## 9  33_OP_3   33  OP   3
## 10 33_OP_4   33  OP   4
## 11 33_OP_5   33  OP   5
## 12 33_OP_6   33  OP   6
## 13   5_M_1    5   M   1
## 14   5_M_2    5   M   2
## 15   5_M_3    5   M   3
## 16   5_M_4    5   M   4
## 17   5_M_5    5   M   5
## 18   5_M_6    5   M   6
## 19  33_M_1   33   M   1
## 20  33_M_2   33   M   2
## 21  33_M_3   33   M   3
## 22  33_M_4   33   M   4
## 23  33_M_5   33   M   5
## 24  33_M_6   33   M   6
## 25  5_LL_1    5  LL   1
## 26  5_LL_2    5  LL   2
## 27  5_LL_3    5  LL   3
## 28  5_LL_4    5  LL   4
## 29  5_LL_5    5  LL   5
## 30  5_LL_6    5  LL   6
## 31 33_LL_1   33  LL   1
## 32 33_LL_2   33  LL   2
## 33 33_LL_3   33  LL   3
## 34 33_LL_4   33  LL   4
## 35 33_LL_5   33  LL   5
## 36 33_LL_6   33  LL   6

# Count matrix input (no design here)
dds <- DESeqDataSetFromMatrix(countData = data,
                              colData = samples,
                              design = ~ cond + cond:pop) #This is essentially the same design as                                                             # the ~cond + pop + cond:pop, but 
                                                          # makes it easier to perform the contrasts

dds

## class: DESeqDataSet 
## dim: 51574 36 
## metadata(0):
## assays(1): counts
## rownames(51574): oystercontig_1 oystercontig_2 ...
##   oystercontig_51573 oystercontig_51574
## rowRanges metadata column names(0):
## colnames(36): 1 2 ... 35 36
## colData names(4): sample cond pop rep

# Add feature data (gene id's as column)
featureData <- data.frame(gene=rownames(data))
(mcols(dds) <- DataFrame(mcols(dds), featureData))

## DataFrame with 51574 rows and 1 column
##                     gene
##                 <factor>
## 1         oystercontig_1
## 2         oystercontig_2
## 3         oystercontig_3
## 4         oystercontig_4
## 5         oystercontig_5
## ...                  ...
## 51570 oystercontig_51570
## 51571 oystercontig_51571
## 51572 oystercontig_51572
## 51573 oystercontig_51573
## 51574 oystercontig_51574

# Filtering to remove rows with 0 reads
dds <- dds[ rowSums(counts(dds)) > 1, ]

# First generate the dispersions
dds <- DESeq(dds)

# Dropping condition column
dds <- nbinomLRT(dds, full = design(dds), reduced = ~ cond)

# Check the coefficients
resultsNames(dds)

## [1] "Intercept"    "cond_5_vs_33" "cond33.popM"  "cond5.popM"  
## [5] "cond33.popOP" "cond5.popOP"

# nbinormLRT
# For analyses using the likelihood ratio test (using nbinomLRT), the p-values are determined solely # by the difference in deviance between the full and reduced model formula. A log2 fold change is   # included, which  # can be controlled using the name argument, or by default this will be the      # estimated coefficient for the   # last element of resultsNames(object)

# Question 1: There is not a single contrast which gives the answer of DE of e.g. (OP vs LL) or (M  # vs LL), but you can accomplish this by taking the union of the FDR limited sets from the above two # pairwise comparisons. 

# You can compare OP vs LL for treatment 5 with the following
res_OPvsLL.5 <- results(dds, test = "Wald", name="cond5.popOP")
res_OPvsLL.5 <- subset(res_OPvsLL.5, res_OPvsLL.5$padj <= 0.05)
res_OPvsLL.5.genes <- row.names(res_OPvsLL.5)

# You can compare M vs LL for treatment 5 with the following
res_MvsLL.5 <- results(dds, test = "Wald", name = "cond5.popM")
res_MvsLL.5 <- subset(res_MvsLL.5, res_MvsLL.5$padj <= 0.05)
res_MvsLL.5.genes <- row.names(res_MvsLL.5)

# Combining the two above..
comb <- c(res_OPvsLL.5.genes,res_MvsLL.5.genes)

# Comparing comb with the above two
res_OPvsLL.5.genes.2 <- comb %in% res_OPvsLL.5.genes
res_MvsLL.5.genes.2 <- comb %in% res_MvsLL.5.genes  

# Generating venn counts to plot venn diagram
counts.5 <- cbind(res_OPvsLL.5.genes.2, res_MvsLL.5.genes.2)
results.5 <- vennCounts(counts.5)
vennDiagram(results.5, cex = 1,names = c("OPvsLL.5","MvsLL.5"), circle.col = c("red", "blue"))

# You can compare OP vs LL for treatment 33 with the following
res_OPvsLL.33 <- results(dds, test = "Wald", name="cond33.popOP")
res_OPvsLL.33 <- subset(res_OPvsLL.33, res_OPvsLL.33$padj <= 0.05)
res_OPvsLL.33.genes <- row.names(res_OPvsLL.33)

# You can compare M vs LL for treatment 33 with the following
res_MvsLL.33 <- results(dds, test = "Wald", name = "cond33.popM")
res_MvsLL.33 <- subset(res_MvsLL.33, res_MvsLL.33$padj <= 0.05)
res_MvsLL.33.genes <- row.names(res_MvsLL.33)

# Combining the two above..
comb <- c(res_OPvsLL.33.genes,res_MvsLL.33.genes)

# Comparing the comb with the above two
res_OPvsLL.33.genes.2 <- comb %in% res_OPvsLL.33.genes
res_MvsLL.33.genes.2 <- comb %in% res_MvsLL.33.genes  

# Generating venn counts to plot venn diagram
counts.33 <- cbind(res_OPvsLL.33.genes.2, res_MvsLL.33.genes.2)
results.33 <- vennCounts(counts.33)
vennDiagram(results.33, cex = 1,names = c("OPvsLL.33","MvsLL.33"), circle.col = c("red", "blue"))

# Question 2:
# If you want to ask if the OP vs LL effect is different in treatment 33 vs treatment 5, you form the contrast:
dds <- DESeq(dds)
res.1 <- results(dds, contrast=list("cond33.popOP", "cond5.popOP"))
res.1 <- subset(res.1, res.1$padj <= 0.05)
dim(res.1) # Very few genes

## [1] 16  6

# If you want to ask if the M vs LL effect is different in treatment 33 vs treatment 5, you form the contrast:
res.2 <- results(dds, contrast=list("cond33.popM", "cond5.popM"))
res.2 <- subset(res.2, res.2$padj <= 0.05)
dim(res.2)

## [1] 173   6

# Then you extract the genes in the union of venn diagram. Not doing union here since there are very few gens in the the first contrast

DESeq2

Upendra

February 12, 2016