edgeR of Growth Chamber Samples (GC)

DGE of bam files from Growth Chamber samples GO annotation of DGE genes done with 48513 transcripts (new transcriptome) and 34304 GO terms (new annotation) load required libraries

library(edgeR)

## Loading required package: limma

library("ggplot2")
library(Biostrings)

## Loading required package: BiocGenerics Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport,
## clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply,
## parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
## xtabs
## 
## The following objects are masked from 'package:base':
## 
## anyDuplicated, as.data.frame, cbind, colnames, duplicated, eval, Filter,
## Find, get, intersect, lapply, Map, mapply, match, mget, order, paste,
## pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rep.int,
## rownames, sapply, setdiff, sort, table, tapply, union, unique, unlist
## 
## Loading required package: IRanges

library(GO.db)

## Loading required package: AnnotationDbi Loading required package: Biobase
## Welcome to Bioconductor
## 
## Vignettes contain introductory material; view with 'browseVignettes()'. To
## cite Bioconductor, see 'citation("Biobase")', and for packages
## 'citation("pkgname")'.
## 
## Loading required package: DBI

library(goseq)

## Loading required package: BiasedUrn Loading required package:
## geneLenDataBase

Read counts file

counts_GC_final_merged <- read.delim("DGE_parents_GC.bam.tsv", row.names = NULL)
counts_GC_final_merged <- counts_GC_final_merged[counts_GC_final_merged$gene != 
    "*", ]
row.names(counts_GC_final_merged) <- counts_GC_final_merged$gene
counts_GC_final_merged <- counts_GC_final_merged[, -1]
counts_GC_final_merged[is.na(counts_GC_final_merged)] <- 0
counts_GC_final_merged <- counts_GC_final_merged[, colSums(counts_GC_final_merged) > 
    1e+05]

To check if the samples are clustered well or not

data_GC_final_merged <- DGEList(counts = counts_GC_final_merged, group = c(rep("IMB211", 
    30), rep("R500", 35)))
# data_GC_final_merged$samples
dim(data_GC_final_merged)

## [1] 46958    65

data_GC_final_merged <- data_GC_final_merged[rowSums(cpm(data_GC_final_merged) > 
    1) >= 3, ]
dim(data_GC_final_merged)

## [1] 36537    65

data_GC_final_merged <- calcNormFactors(data_GC_final_merged)
# data_GC_final_merged$samples

MDS plot

MDS.plot_GC_final_merged <- plotMDS(data_GC_final_merged, cex = 0.7)

plot of chunk unnamed-chunk-4

ggplot2

parsed_GC_final_merged <- matrix(unlist(strsplit(names(MDS.plot_GC_final_merged$x), 
    "_")), ncol = 6, byrow = T)
MDS_DE_GC_final_merged <- data.frame(MDS.plot_GC_final_merged$x, MDS.plot_GC_final_merged$y, 
    parsed_GC_final_merged)
names(MDS_DE_GC_final_merged) <- c("x", "y", "par", "treatment", "rep", "tissue")
mds_GC_final_merged <- ggplot(MDS_DE_GC_final_merged, aes(x, y)) + geom_point(aes(color = treatment, 
    shape = rep)) + geom_rug(aes(colour = treatment)) + facet_grid(par ~ tissue) + 
    labs(title = "ggplot of GC new ref samples") + theme(strip.text.x = element_text(size = 8))
mds_GC_final_merged

plot of chunk unnamed-chunk-5

Internode

Extract the columns for Internode tissue

grep("INT", names(counts_GC_final_merged))

##  [1]  1  7 16 21 26 33 40 45 51 56 61

count_GC_INT <- counts_GC_final_merged[c(1, 7, 16, 21, 26, 33, 40, 45, 51, 56, 
    61)]

dataframe

gt <- sub("(IMB211|R500)_[[:print:]]+", "\\1", names(count_GC_INT))
gt <- sub("GCR500", "R500", gt)
gt <- sub("GCIMB211", "IMB211", gt)
samples <- data.frame(file = names(count_GC_INT), gt, trt = sub("[[:print:]]+_(SUN|SHADE)[[:print:]]+", 
    "\\1", names(count_GC_INT)), rep = sub("[[:alnum:]]+_[[:alnum:]]+_([1-3])[[:print:]]+.bam", 
    "\\1", names(count_GC_INT)))
# samples
samples$group <- paste(samples$gt, samples$trt, sep = "_")
group <- samples$group
gt <- samples$gt
trt <- samples$trt
samples

##                                        file     gt   trt rep        group
## 1  GCIMB211_SHADE_1_INTERNODE_s_1.fq.gz.bam IMB211 SHADE   1 IMB211_SHADE
## 2  GCIMB211_SHADE_2_INTERNODE_s_1.fq.gz.bam IMB211 SHADE   2 IMB211_SHADE
## 3    GCIMB211_SUN_1_INTERNODE_s_1.fq.gz.bam IMB211   SUN   1   IMB211_SUN
## 4    GCIMB211_SUN_2_INTERNODE_s_1.fq.gz.bam IMB211   SUN   2   IMB211_SUN
## 5    GCIMB211_SUN_3_INTERNODE_s_1.fq.gz.bam IMB211   SUN   3   IMB211_SUN
## 6    GCR500_SHADE_1_INTERNODE_s_1.fq.gz.bam   R500 SHADE   1   R500_SHADE
## 7    GCR500_SHADE_2_INTERNODE_s_1.fq.gz.bam   R500 SHADE   2   R500_SHADE
## 8    GCR500_SHADE_3_INTERNODE_s_1.fq.gz.bam   R500 SHADE   3   R500_SHADE
## 9      GCR500_SUN_1_INTERNODE_s_1.fq.gz.bam   R500   SUN   1     R500_SUN
## 10     GCR500_SUN_2_INTERNODE_s_1.fq.gz.bam   R500   SUN   2     R500_SUN
## 11     GCR500_SUN_3_INTERNODE_s_1.fq.gz.bam   R500   SUN   3     R500_SUN

Create DGEList object and normalization

data_GC_INT.int <- DGEList(count_GC_INT, group = group)
# data_GC_INT.int$samples
data_GC_INT.int <- data_GC_INT.int[rowSums(cpm(data_GC_INT.int) > 1) >= 3, ]
# dim(data_GC_INT.int)
data_GC_INT.int <- calcNormFactors(data_GC_INT.int)
# data_GC_INT.int$samples

relevel for gt and trt

gt <- relevel(samples$gt, ref = "R500")
trt <- relevel(samples$trt, ref = "SUN")

model matrix

design_GC_INT_int <- model.matrix(~gt * trt)
colnames(design_GC_INT_int)

## [1] "(Intercept)"       "gtIMB211"          "trtSHADE"         
## [4] "gtIMB211:trtSHADE"

rownames(design_GC_INT_int) <- colnames(data_GC_INT.int)

Estimating the dispersion First we estimate the overall dispersion for the dataset, to get an idea of the overall level of biological variability

data_GC_INT.int <- estimateGLMCommonDisp(data_GC_INT.int, design_GC_INT_int, 
    verbose = TRUE)

## Disp = 0.09554 , BCV = 0.3091

Then we estimate gene-wise dispersion estimates, allowing a possible trend with averge count size

data_GC_INT.int <- estimateGLMTrendedDisp(data_GC_INT.int, design_GC_INT_int)

## Loading required package: splines

data_GC_INT.int <- estimateGLMTagwiseDisp(data_GC_INT.int, design_GC_INT_int)

BCV plot

plotBCV(data_GC_INT.int, main = "comparision of common vs Trend dispersion")

plot of chunk unnamed-chunk-13

data_GC_INT.int <- estimateGLMTrendedDisp(data_GC_INT.int, design_GC_INT_int)
data_GC_INT.int <- estimateGLMTagwiseDisp(data_GC_INT.int, design_GC_INT_int)

Find top hits Fit the linear model to the design matrix

fit_GC_INT_int <- glmFit(data_GC_INT.int, design_GC_INT_int)
colnames(fit_GC_INT_int$design)

## [1] "(Intercept)"       "gtIMB211"          "trtSHADE"         
## [4] "gtIMB211:trtSHADE"

Gene expression analysis

coefficient = 3 (“trt”)

GC_INT_trt_lrt <- glmLRT(fit_GC_INT_int, coef = 3)
results_GC_INT_trt_lrt <- topTags(GC_INT_trt_lrt, n = Inf)

The total number of differentially expressed genes by Shade in R500 (because 3 indicates the number of R500 gene DE by shade in R500)

sum(results_GC_INT_trt_lrt$table$FDR < 0.01)

## [1] 2047

save results in a file and read it back

GC_INT_trt_lrt_DGE <- results_GC_INT_trt_lrt$table[results_GC_INT_trt_lrt$table$FDR < 
    0.01, ]
write.csv(GC_INT_trt_lrt_DGE, "edgeR_DE_GC_IMB211_R500_INT_trt.csv")
trt.df <- read.csv("edgeR_DE_GC_IMB211_R500_INT_trt.csv", sep = ",", h = T)
colnames(trt.df) <- c("geneX", "logFC_trt_only", "logCPM_trt_only", "LR_trt_only", 
    "PValue_trt_only", "FDR_trt_only")

coefficient = 3 + 4 (“trt” + “gt*trt”)

GC_INT_trt_gt.int_lrt <- glmLRT(fit_GC_INT_int, coef = c(3, 4))
results_GC_INT_trt_gt.int <- topTags(GC_INT_trt_gt.int_lrt, n = Inf)

The total number of genes with DE in shade in either or both genotypes (because 3 indicates the number of R500 gene DE by shade in R500 and 4 indicates the number of genes that were different in IMB211 compared to R500)

sum(results_GC_INT_trt_gt.int$table$FDR < 0.01)

## [1] 2448

save results in a file

write.csv(results_GC_INT_trt_gt.int, "edgeR_DE_GC_IMB211_R500_INT_trt_gt.int.csv")

coefficient = 4 (“gt*trt”)

GC_INT_gt_int_lrt <- glmLRT(fit_GC_INT_int, coef = 4)
results_GC_INT_gt_int_lrt <- topTags(GC_INT_gt_int_lrt, n = Inf)
GC_INT_gt_trt_lrt_DGE <- results_GC_INT_gt_int_lrt$table[results_GC_INT_gt_int_lrt$table$FDR < 
    0.01, ]

save results in a file and read it back

write.csv(results_GC_INT_gt_int_lrt, "edgeR_DE_GC_IMB211_R500_INT_gt_int.csv")
write.csv(GC_INT_gt_trt_lrt_DGE, "edgeR_DE_GC_IMB211_R500_INT_gt_int.csv")
gt.trt_int.df <- read.csv("edgeR_DE_GC_IMB211_R500_INT_gt_int.csv", sep = ",", 
    h = T)
colnames(gt.trt_int.df) <- c("geneX", "logFC_gt.trt_int_only", "logCPM_gt.trt_int_only", 
    "LR_gt.trt_int_only", "PValue_gt.trt_int_only", "FDR_gt.trt_int_only")

Now checking to see different categories of genes between coefficients 3 and 3+4

data <- merge(trt.df, gt.trt_int.df, by = 1)
data1 <- data[, c(1, 2, 7)]
data1$summ <- rowSums(data1[, c(2, 3)])

The total number of genes that are repressed in both R500 and IMB211 but more stronger repression in R500

summary(data1$summ < 0)[3]

##  TRUE 
## "560"

The total number of genes that are induced in both R500 and IMB211 but more stronger induction in R500

summary(data1$summ < 0)[2]

## FALSE 
## "260"

I couldn't find other 2 categories of genes.Am i doing something wrong here?

data1$sign_trt_only <- sign(data1$logFC_trt_only)
data1$sign_gt.trt_int_only <- sign(data1$logFC_gt.trt_int_only)
summary(data1$sign_trt_only == data1$sign_gt.trt_int_only)

##    Mode   FALSE    NA's 
## logical     820       0