DGE of bam files from Growth Chamber samples GO annotation of DGE genes done with 48513 transcripts (new transcriptome) and 34304 GO terms (new annotation) load required libraries
library(edgeR)
## Loading required package: limma
library("ggplot2")
library(Biostrings)
## Loading required package: BiocGenerics Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport,
## clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply,
## parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, as.data.frame, cbind, colnames, duplicated, eval, Filter,
## Find, get, intersect, lapply, Map, mapply, match, mget, order, paste,
## pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rep.int,
## rownames, sapply, setdiff, sort, table, tapply, union, unique, unlist
##
## Loading required package: IRanges
library(GO.db)
## Loading required package: AnnotationDbi Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with 'browseVignettes()'. To
## cite Bioconductor, see 'citation("Biobase")', and for packages
## 'citation("pkgname")'.
##
## Loading required package: DBI
library(goseq)
## Loading required package: BiasedUrn Loading required package:
## geneLenDataBase
Read counts file
counts_GC_final_merged <- read.delim("DGE_parents_GC.bam.tsv", row.names = NULL)
counts_GC_final_merged <- counts_GC_final_merged[counts_GC_final_merged$gene !=
"*", ]
row.names(counts_GC_final_merged) <- counts_GC_final_merged$gene
counts_GC_final_merged <- counts_GC_final_merged[, -1]
counts_GC_final_merged[is.na(counts_GC_final_merged)] <- 0
counts_GC_final_merged <- counts_GC_final_merged[, colSums(counts_GC_final_merged) >
1e+05]
To check if the samples are clustered well or not
data_GC_final_merged <- DGEList(counts = counts_GC_final_merged, group = c(rep("IMB211",
30), rep("R500", 35)))
# data_GC_final_merged$samples
dim(data_GC_final_merged)
## [1] 46958 65
data_GC_final_merged <- data_GC_final_merged[rowSums(cpm(data_GC_final_merged) >
1) >= 3, ]
dim(data_GC_final_merged)
## [1] 36537 65
data_GC_final_merged <- calcNormFactors(data_GC_final_merged)
# data_GC_final_merged$samples
MDS plot
MDS.plot_GC_final_merged <- plotMDS(data_GC_final_merged, cex = 0.7)
ggplot2
parsed_GC_final_merged <- matrix(unlist(strsplit(names(MDS.plot_GC_final_merged$x),
"_")), ncol = 6, byrow = T)
MDS_DE_GC_final_merged <- data.frame(MDS.plot_GC_final_merged$x, MDS.plot_GC_final_merged$y,
parsed_GC_final_merged)
names(MDS_DE_GC_final_merged) <- c("x", "y", "par", "treatment", "rep", "tissue")
mds_GC_final_merged <- ggplot(MDS_DE_GC_final_merged, aes(x, y)) + geom_point(aes(color = treatment,
shape = rep)) + geom_rug(aes(colour = treatment)) + facet_grid(par ~ tissue) +
labs(title = "ggplot of GC new ref samples") + theme(strip.text.x = element_text(size = 8))
mds_GC_final_merged
Extract the columns for Internode tissue
grep("INT", names(counts_GC_final_merged))
## [1] 1 7 16 21 26 33 40 45 51 56 61
count_GC_INT <- counts_GC_final_merged[c(1, 7, 16, 21, 26, 33, 40, 45, 51, 56,
61)]
dataframe
gt <- sub("(IMB211|R500)_[[:print:]]+", "\\1", names(count_GC_INT))
gt <- sub("GCR500", "R500", gt)
gt <- sub("GCIMB211", "IMB211", gt)
samples <- data.frame(file = names(count_GC_INT), gt, trt = sub("[[:print:]]+_(SUN|SHADE)[[:print:]]+",
"\\1", names(count_GC_INT)), rep = sub("[[:alnum:]]+_[[:alnum:]]+_([1-3])[[:print:]]+.bam",
"\\1", names(count_GC_INT)))
# samples
samples$group <- paste(samples$gt, samples$trt, sep = "_")
group <- samples$group
gt <- samples$gt
trt <- samples$trt
samples
## file gt trt rep group
## 1 GCIMB211_SHADE_1_INTERNODE_s_1.fq.gz.bam IMB211 SHADE 1 IMB211_SHADE
## 2 GCIMB211_SHADE_2_INTERNODE_s_1.fq.gz.bam IMB211 SHADE 2 IMB211_SHADE
## 3 GCIMB211_SUN_1_INTERNODE_s_1.fq.gz.bam IMB211 SUN 1 IMB211_SUN
## 4 GCIMB211_SUN_2_INTERNODE_s_1.fq.gz.bam IMB211 SUN 2 IMB211_SUN
## 5 GCIMB211_SUN_3_INTERNODE_s_1.fq.gz.bam IMB211 SUN 3 IMB211_SUN
## 6 GCR500_SHADE_1_INTERNODE_s_1.fq.gz.bam R500 SHADE 1 R500_SHADE
## 7 GCR500_SHADE_2_INTERNODE_s_1.fq.gz.bam R500 SHADE 2 R500_SHADE
## 8 GCR500_SHADE_3_INTERNODE_s_1.fq.gz.bam R500 SHADE 3 R500_SHADE
## 9 GCR500_SUN_1_INTERNODE_s_1.fq.gz.bam R500 SUN 1 R500_SUN
## 10 GCR500_SUN_2_INTERNODE_s_1.fq.gz.bam R500 SUN 2 R500_SUN
## 11 GCR500_SUN_3_INTERNODE_s_1.fq.gz.bam R500 SUN 3 R500_SUN
Create DGEList object and normalization
data_GC_INT.int <- DGEList(count_GC_INT, group = group)
# data_GC_INT.int$samples
data_GC_INT.int <- data_GC_INT.int[rowSums(cpm(data_GC_INT.int) > 1) >= 3, ]
# dim(data_GC_INT.int)
data_GC_INT.int <- calcNormFactors(data_GC_INT.int)
# data_GC_INT.int$samples
relevel for gt and trt
gt <- relevel(samples$gt, ref = "R500")
trt <- relevel(samples$trt, ref = "SUN")
model matrix
design_GC_INT_int <- model.matrix(~gt * trt)
colnames(design_GC_INT_int)
## [1] "(Intercept)" "gtIMB211" "trtSHADE"
## [4] "gtIMB211:trtSHADE"
rownames(design_GC_INT_int) <- colnames(data_GC_INT.int)
Estimating the dispersion First we estimate the overall dispersion for the dataset, to get an idea of the overall level of biological variability
data_GC_INT.int <- estimateGLMCommonDisp(data_GC_INT.int, design_GC_INT_int,
verbose = TRUE)
## Disp = 0.09554 , BCV = 0.3091
Then we estimate gene-wise dispersion estimates, allowing a possible trend with averge count size
data_GC_INT.int <- estimateGLMTrendedDisp(data_GC_INT.int, design_GC_INT_int)
## Loading required package: splines
data_GC_INT.int <- estimateGLMTagwiseDisp(data_GC_INT.int, design_GC_INT_int)
BCV plot
plotBCV(data_GC_INT.int, main = "comparision of common vs Trend dispersion")
data_GC_INT.int <- estimateGLMTrendedDisp(data_GC_INT.int, design_GC_INT_int)
data_GC_INT.int <- estimateGLMTagwiseDisp(data_GC_INT.int, design_GC_INT_int)
Find top hits Fit the linear model to the design matrix
fit_GC_INT_int <- glmFit(data_GC_INT.int, design_GC_INT_int)
colnames(fit_GC_INT_int$design)
## [1] "(Intercept)" "gtIMB211" "trtSHADE"
## [4] "gtIMB211:trtSHADE"
Are there more genes DE in the gt that is more morphologically responsive i.e testing for DE for gt and gt*trt interaction effect. To test for DE for gt and gt*trt effect drop coefficeint 2 and 4…
GC_INT_gt_int_lrt <- glmLRT(fit_GC_INT_int, coef = c(2, 4))
results_GC_INT_gt_int_lrt <- topTags(GC_INT_gt_int_lrt, n = Inf)
head(results_GC_INT_gt_int_lrt$table)
## logFC.gtIMB211 logFC.gtIMB211.trtSHADE logCPM LR PValue
## Bra31014 13.41 -0.1119 6.756 809.3 1.789e-176
## Bra05343 12.31 -0.1242 5.649 717.5 1.588e-156
## Bra21311 12.44 -3.2281 5.789 714.5 7.102e-156
## Bra28173 12.30 -3.0037 5.908 679.8 2.418e-148
## Bra23852 14.87 -4.5924 8.192 679.7 2.501e-148
## Bra28185 10.87 -0.6847 8.158 658.0 1.309e-143
## FDR
## Bra31014 5.197e-172
## Bra05343 2.306e-152
## Bra21311 6.878e-152
## Bra28173 1.453e-144
## Bra23852 1.453e-144
## Bra28185 6.340e-140
sum(results_GC_INT_gt_int_lrt$table$FDR < 0.01)
## [1] 12777
interesting_samples_gt_int <- rownames(results_GC_INT_gt_int_lrt$table[results_GC_INT_gt_int_lrt$table$FDR <
0.01, ])
save results in a file
write.csv(results_GC_INT_gt_int_lrt, "edgeR_DE_GC_IMB211_R500_INT_gt_int")
Preparation of transcript and Go table file
Bra_trans_len <- read.csv("Brassica_transcriptome_assembly_gene_lengths", sep = "\t",
h = F)
colnames(Bra_trans_len) <- c("Gene", "length")
Bra_trans_lenf <- Bra_trans_len[, 1]
Bra_trans_len2 <- Bra_trans_len[, 2]
genes.in.annot <- Bra_trans_lenf
go <- read.table("Brassica_transcriptome_assembly_split.all.fasta.annot.xml.annot.blast_out.annot.wego",
h = F)
colnames(go) <- c("Gene", "GO")
go.list <- strsplit(as.character(go[, 2]), split = ",", fixed = T)
names(go.list) <- as.character(go[, 1])
GO enrichment function
GO.enrich <- function(exp_file, Bra_trans_len2, genes.in.annot, go.list) {
GeneExpFile <- c(exp_file)
genes <- read.csv(GeneExpFile, sep = ",", quote = "\"", h = T)
DE_genes <- subset(genes, FDR < 0.01)
NDE_genes <- subset(genes, FDR > 0.01)
DE_genes <- as.character(DE_genes[, 1])
NDE_genes <- as.character(NDE_genes[, 1])
gene_name <- c(DE_genes, NDE_genes)
comb_genes <- rep(c(1, 0), c(length(DE_genes), length(NDE_genes)))
names(comb_genes) <- gene_name
bias <- Bra_trans_len2
names(bias) <- genes.in.annot
new_bias <- bias[names(bias) %in% names(comb_genes)]
pwf <- nullp(comb_genes, bias.data = new_bias)
go.analysis <- goseq(pwf, gene2cat = go.list)
print(length(go.analysis$category[p.adjust(go.analysis$over_represented_pvalue,
method = "BH") < 0.01]))
enriched.GO_category <- go.analysis$category[go.analysis$over_represented_pvalue <
0.01]
enriched.GO_p_value <- go.analysis$over_represented_pvalue[go.analysis$over_represented_pvalue <
0.01]
enriched.GO <- cbind(enriched.GO_category, enriched.GO_p_value)
filename <- paste(exp_file, ".enriched.GO_pvalue.csv", sep = "")
write.csv(enriched.GO, file = filename, quote = F, row.names = F)
}
go_term_table <- function(enriched.GO) {
go_table <- read.csv(enriched.GO)
go_table <- go_table[, 1]
go.id <- 0
go.term <- 0
go.ont <- 0
i <- 1
for (go in go_table) {
go.id[i] <- GOID(GOTERM[[go]])
go.term[i] <- Term(GOTERM[[go]])
go.ont[i] <- Ontology(GOTERM[[go]])
i <- i + 1
}
enriched.GO.df <- data.frame(go.id, go.term, go.ont)
filename1 <- paste(enriched.GO, ".enriched_GO_terms.csv", sep = "")
write.table(enriched.GO.df, file = filename1)
}
GO.enrich("edgeR_DE_GC_IMB211_R500_INT_gt_int", Bra_trans_len2, genes.in.annot,
go.list)
## Warning: the matrix is either rank-deficient or indefinite
## Using manually entered categories. Calculating the p-values...
## [1] 89
go_term_table("edgeR_DE_GC_IMB211_R500_INT_gt_int.enriched.GO_pvalue.csv")
## Error: unable to find an inherited method for function 'GOID' for
## signature '"NULL"'