DGE of bam files from Growth Chamber samples GO annotation of DGE genes done with 48513 transcripts (new transcriptome) and 34304 GO terms (new annotation) load required libraries
library(edgeR)
## Loading required package: limma
library("ggplot2")
library(Biostrings)
## Loading required package: BiocGenerics Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport,
## clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply,
## parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, as.data.frame, cbind, colnames, duplicated, eval, Filter,
## Find, get, intersect, lapply, Map, mapply, match, mget, order, paste,
## pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rep.int,
## rownames, sapply, setdiff, sort, table, tapply, union, unique, unlist
##
## Loading required package: IRanges
library(GO.db)
## Loading required package: AnnotationDbi Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with 'browseVignettes()'. To
## cite Bioconductor, see 'citation("Biobase")', and for packages
## 'citation("pkgname")'.
##
## Loading required package: DBI
library(goseq)
## Loading required package: BiasedUrn Loading required package:
## geneLenDataBase
Read counts file
counts_GC_final_merged <- read.delim("DGE_parents_GC.bam.tsv", row.names = NULL)
counts_GC_final_merged <- counts_GC_final_merged[counts_GC_final_merged$gene !=
"*", ]
row.names(counts_GC_final_merged) <- counts_GC_final_merged$gene
counts_GC_final_merged <- counts_GC_final_merged[, -1]
counts_GC_final_merged[is.na(counts_GC_final_merged)] <- 0
counts_GC_final_merged <- counts_GC_final_merged[, colSums(counts_GC_final_merged) >
1e+05]
To check if the samples are clustered well or not
data_GC_final_merged <- DGEList(counts = counts_GC_final_merged, group = c(rep("IMB211",
30), rep("R500", 35)))
# data_GC_final_merged$samples
dim(data_GC_final_merged)
## [1] 46958 65
data_GC_final_merged <- data_GC_final_merged[rowSums(cpm(data_GC_final_merged) >
1) >= 3, ]
dim(data_GC_final_merged)
## [1] 36537 65
data_GC_final_merged <- calcNormFactors(data_GC_final_merged)
# data_GC_final_merged$samples
MDS plot
MDS.plot_GC_final_merged <- plotMDS(data_GC_final_merged, cex = 0.7)
ggplot2
parsed_GC_final_merged <- matrix(unlist(strsplit(names(MDS.plot_GC_final_merged$x),
"_")), ncol = 6, byrow = T)
MDS_DE_GC_final_merged <- data.frame(MDS.plot_GC_final_merged$x, MDS.plot_GC_final_merged$y,
parsed_GC_final_merged)
names(MDS_DE_GC_final_merged) <- c("x", "y", "par", "treatment", "rep", "tissue")
mds_GC_final_merged <- ggplot(MDS_DE_GC_final_merged, aes(x, y)) + geom_point(aes(color = treatment,
shape = rep)) + geom_rug(aes(colour = treatment)) + facet_grid(par ~ tissue) +
labs(title = "ggplot of GC new ref samples") + theme(strip.text.x = element_text(size = 8))
mds_GC_final_merged
Extract the columns for Internode tissue
grep("INT", names(counts_GC_final_merged))
## [1] 1 7 16 21 26 33 40 45 51 56 61
count_GC_INT <- counts_GC_final_merged[c(1, 7, 16, 21, 26, 33, 40, 45, 51, 56,
61)]
dataframe
gt <- sub("(IMB211|R500)_[[:print:]]+", "\\1", names(count_GC_INT))
gt <- sub("GCR500", "R500", gt)
gt <- sub("GCIMB211", "IMB211", gt)
samples <- data.frame(file = names(count_GC_INT), gt, trt = sub("[[:print:]]+_(SUN|SHADE)[[:print:]]+",
"\\1", names(count_GC_INT)), rep = sub("[[:alnum:]]+_[[:alnum:]]+_([1-3])[[:print:]]+.bam",
"\\1", names(count_GC_INT)))
# samples
samples$group <- paste(samples$gt, samples$trt, sep = "_")
group <- samples$group
gt <- samples$gt
trt <- samples$trt
samples
## file gt trt rep group
## 1 GCIMB211_SHADE_1_INTERNODE_s_1.fq.gz.bam IMB211 SHADE 1 IMB211_SHADE
## 2 GCIMB211_SHADE_2_INTERNODE_s_1.fq.gz.bam IMB211 SHADE 2 IMB211_SHADE
## 3 GCIMB211_SUN_1_INTERNODE_s_1.fq.gz.bam IMB211 SUN 1 IMB211_SUN
## 4 GCIMB211_SUN_2_INTERNODE_s_1.fq.gz.bam IMB211 SUN 2 IMB211_SUN
## 5 GCIMB211_SUN_3_INTERNODE_s_1.fq.gz.bam IMB211 SUN 3 IMB211_SUN
## 6 GCR500_SHADE_1_INTERNODE_s_1.fq.gz.bam R500 SHADE 1 R500_SHADE
## 7 GCR500_SHADE_2_INTERNODE_s_1.fq.gz.bam R500 SHADE 2 R500_SHADE
## 8 GCR500_SHADE_3_INTERNODE_s_1.fq.gz.bam R500 SHADE 3 R500_SHADE
## 9 GCR500_SUN_1_INTERNODE_s_1.fq.gz.bam R500 SUN 1 R500_SUN
## 10 GCR500_SUN_2_INTERNODE_s_1.fq.gz.bam R500 SUN 2 R500_SUN
## 11 GCR500_SUN_3_INTERNODE_s_1.fq.gz.bam R500 SUN 3 R500_SUN
Create DGEList object and normalization
data_GC_INT.int <- DGEList(count_GC_INT, group = group)
# data_GC_INT.int$samples
data_GC_INT.int <- data_GC_INT.int[rowSums(cpm(data_GC_INT.int) > 1) >= 3, ]
# dim(data_GC_INT.int)
data_GC_INT.int <- calcNormFactors(data_GC_INT.int)
# data_GC_INT.int$samples
relevel for gt and trt
gt <- relevel(samples$gt, ref = "R500")
trt <- relevel(samples$trt, ref = "SUN")
model matrix
design_GC_INT_int <- model.matrix(~gt * trt)
colnames(design_GC_INT_int)
## [1] "(Intercept)" "gtIMB211" "trtSHADE"
## [4] "gtIMB211:trtSHADE"
rownames(design_GC_INT_int) <- colnames(data_GC_INT.int)
Estimating the dispersion First we estimate the overall dispersion for the dataset, to get an idea of the overall level of biological variability
data_GC_INT.int <- estimateGLMCommonDisp(data_GC_INT.int, design_GC_INT_int,
verbose = TRUE)
## Disp = 0.09554 , BCV = 0.3091
Then we estimate gene-wise dispersion estimates, allowing a possible trend with averge count size
data_GC_INT.int <- estimateGLMTrendedDisp(data_GC_INT.int, design_GC_INT_int)
## Loading required package: splines
data_GC_INT.int <- estimateGLMTagwiseDisp(data_GC_INT.int, design_GC_INT_int)
BCV plot
plotBCV(data_GC_INT.int, main = "comparision of common vs Trend dispersion")
data_GC_INT.int <- estimateGLMTrendedDisp(data_GC_INT.int, design_GC_INT_int)
data_GC_INT.int <- estimateGLMTagwiseDisp(data_GC_INT.int, design_GC_INT_int)
Find top hits Fit the linear model to the design matrix
fit_GC_INT_int <- glmFit(data_GC_INT.int, design_GC_INT_int)
colnames(fit_GC_INT_int$design)
## [1] "(Intercept)" "gtIMB211" "trtSHADE"
## [4] "gtIMB211:trtSHADE"
Gene expression analysis
coefficient = 3 (“trt”)
GC_INT_trt_lrt <- glmLRT(fit_GC_INT_int, coef = 3)
results_GC_INT_trt_lrt <- topTags(GC_INT_trt_lrt, n = Inf)
The total number of differentially expressed genes by Shade in R500 (because 3 indicates the number of R500 gene DE by shade in R500)
sum(results_GC_INT_trt_lrt$table$FDR < 0.01)
## [1] 2047
save results in a file and read it back
GC_INT_trt_lrt_DGE <- results_GC_INT_trt_lrt$table[results_GC_INT_trt_lrt$table$FDR <
0.01, ]
write.csv(GC_INT_trt_lrt_DGE, "edgeR_DE_GC_IMB211_R500_INT_trt.csv")
trt.df <- read.csv("edgeR_DE_GC_IMB211_R500_INT_trt.csv", sep = ",", h = T)
colnames(trt.df) <- c("geneX", "logFC_trt_only", "logCPM_trt_only", "LR_trt_only",
"PValue_trt_only", "FDR_trt_only")
coefficient = 3 + 4 (“trt” + “gt*trt”)
GC_INT_trt_gt.int_lrt <- glmLRT(fit_GC_INT_int, coef = c(3, 4))
results_GC_INT_trt_gt.int <- topTags(GC_INT_trt_gt.int_lrt, n = Inf)
The total number of genes with DE in shade in either or both genotypes (because 3 indicates the number of R500 gene DE by shade in R500 and 4 indicates the number of genes that were different in IMB211 compared to R500)
sum(results_GC_INT_trt_gt.int$table$FDR < 0.01)
## [1] 2448
save results in a file
write.csv(results_GC_INT_trt_gt.int, "edgeR_DE_GC_IMB211_R500_INT_trt_gt.int.csv")
coefficient = 4 (“gt*trt”)
GC_INT_gt_int_lrt <- glmLRT(fit_GC_INT_int, coef = 4)
results_GC_INT_gt_int_lrt <- topTags(GC_INT_gt_int_lrt, n = Inf)
GC_INT_gt_trt_lrt_DGE <- results_GC_INT_gt_int_lrt$table[results_GC_INT_gt_int_lrt$table$FDR <
0.01, ]
save results in a file and read it back
write.csv(results_GC_INT_gt_int_lrt, "edgeR_DE_GC_IMB211_R500_INT_gt_int.csv")
write.csv(GC_INT_gt_trt_lrt_DGE, "edgeR_DE_GC_IMB211_R500_INT_gt_int.csv")
gt.trt_int.df <- read.csv("edgeR_DE_GC_IMB211_R500_INT_gt_int.csv", sep = ",",
h = T)
colnames(gt.trt_int.df) <- c("geneX", "logFC_gt.trt_int_only", "logCPM_gt.trt_int_only",
"LR_gt.trt_int_only", "PValue_gt.trt_int_only", "FDR_gt.trt_int_only")
Now checking to see different categories of genes between coefficients 3 and 3+4
data <- merge(trt.df, gt.trt_int.df, by = 1)
data1 <- data[, c(1, 2, 7)]
data1$summ <- rowSums(data1[, c(2, 3)])
The total number of genes that are repressed in both R500 and IMB211 but more stronger repression in R500
summary(data1$summ < 0)[3]
## TRUE
## "560"
The total number of genes that are induced in both R500 and IMB211 but more stronger induction in R500
summary(data1$summ < 0)[2]
## FALSE
## "260"
I couldn't find other 2 categories of genes.Am i doing something wrong here?
data1$sign_trt_only <- sign(data1$logFC_trt_only)
data1$sign_gt.trt_int_only <- sign(data1$logFC_gt.trt_int_only)
summary(data1$sign_trt_only == data1$sign_gt.trt_int_only)
## Mode FALSE NA's
## logical 820 0