For the paper we want summary statistics about the number of genes and transcripts detected and DE at a range of posterior probability cutoffs in each compartment, as well as about the relationship between DE genes and transcripts.
options(warn = -1)
res <- read.csv("~/experiments/translatome/Arabidopsis/rsem/genes.DE.median_normalised.results.v2.csv",
as.is = T)
library(EBSeq)
library(reshape2)
library(gridExtra)
library(ggplot2)
normfactors <- MedianNorm(Data = res[, 2:7])
prenorm <- melt(res[, 2:7])
postnorm <- round(res[, 2:7]/do.call(rbind, rep(list(normfactors), nrow(res))))
res[, 2:7] <- postnorm
postnorm <- melt(postnorm)
print("The following plot checks whether normalisation has worked correctly. If it has, the two plots should look identical. If it failed, the lines in the first plot will look less tightly overlapping than those in the second plot.")
## [1] "The following plot checks whether normalisation has worked correctly. If it has, the two plots should look identical. If it failed, the lines in the first plot will look less tightly overlapping than those in the second plot."
suppressWarnings({
p1 <- ggplot(prenorm, aes(x = log(value), colour = variable)) + geom_density() +
ggtitle("pre-normalisation")
p2 <- ggplot(postnorm, aes(x = log(value), colour = variable)) + geom_density() +
ggtitle("post-normalisation")
grid.arrange(p1, p2, ncol = 2)
})
# Calculate numbers of genes detected
genes_detected <- length(which(apply(res[, 8:9], 1, function(x) !all(x == 0))))
genes_nonzero <- length(which(apply(res[, 8:9], 1, function(x) any(x >= 1))))
print(paste(genes_detected, "genes detected with >0 counts in at least one sample,",
genes_nonzero, "of which had >= 1 normalised counts in at least one sample."))
## [1] "23442 genes detected with >0 counts in at least one sample, 21955 of which had >= 1 normalised counts in at least one sample."
# count DE
ppdesweep <- function(df) {
ppde_range <- c(0.95, 0.995, 0.9995, 0.99995, 1)
num_de <- sapply(ppde_range, function(x) length(which(df$PPDE >= x)))
up_bs <- sapply(ppde_range, function(x) {
nrow(subset(df, PPDE >= x & log2.FC < 0))
})
up_bs_x2 <- sapply(ppde_range, function(x) {
nrow(subset(df, PPDE >= x & log2.FC <= -1))
})
up_35s <- sapply(ppde_range, function(x) {
nrow(subset(df, PPDE >= x & log2.FC > 0))
})
up_35s_x2 <- sapply(ppde_range, function(x) {
nrow(subset(df, PPDE >= x & log2.FC >= 1))
})
diff <- data.frame(PPDE = format(ppde_range, scientific = FALSE, digits = 6),
num_DE = num_de, up_BS = up_bs, up_BS_x2 = up_bs_x2, up_35S = up_35s,
up_35S_x2 = up_35s_x2)
return(diff)
}
gene_diff <- ppdesweep(res)
print(gene_diff)
## PPDE num_DE up_BS up_BS_x2 up_35S up_35S_x2
## 1 0.95000 7946 4224 1869 3722 1155
## 2 0.99500 6666 3691 1759 2975 1018
## 3 0.99950 5773 3319 1674 2454 901
## 4 0.99995 5102 3032 1600 2070 803
## 5 1.00000 2241 1571 1074 670 354
tres <- read.csv("~/experiments/translatome/Arabidopsis/rsem/isoforms.DE.median_normalised.results.v2.csv",
as.is = T)
normfactors <- MedianNorm(Data = res[, 2:7])
prenorm <- melt(res[, 2:7])
postnorm <- round(res[, 2:7]/do.call(rbind, rep(list(normfactors), nrow(res))))
res[, 2:7] <- postnorm
postnorm <- melt(postnorm)
print("The following plot checks whether normalisation has worked correctly. If it has, the two plots should look identical. If it failed, the lines in the first plot will look less tightly overlapping than those in the second plot.")
## [1] "The following plot checks whether normalisation has worked correctly. If it has, the two plots should look identical. If it failed, the lines in the first plot will look less tightly overlapping than those in the second plot."
suppressWarnings({
p1 <- ggplot(prenorm, aes(x = log(value), colour = variable)) + geom_density() +
ggtitle("pre-normalisation")
p2 <- ggplot(postnorm, aes(x = log(value), colour = variable)) + geom_density() +
ggtitle("post-normalisation")
grid.arrange(p1, p2, ncol = 2)
})
tres$agi <- gsub(tres$id, pattern = "\\.[0-9]+", replacement = "")
trans_detected <- length(which(apply(tres[, 8:9], 1, function(x) any(x > 0))))
trans_nonzero <- length(which(apply(tres[, 8:9], 1, function(x) any(x >= 1))))
print(paste(trans_detected, "transcripts detected with >0 counts in at least one sample,",
trans_nonzero, "of which had >= 1 normalised counts in at least one sample."))
## [1] "30213 transcripts detected with >0 counts in at least one sample, 28639 of which had >= 1 normalised counts in at least one sample."
trans_diff <- ppdesweep(tres)
print(trans_diff)
## PPDE num_DE up_BS up_BS_x2 up_35S up_35S_x2
## 1 0.95000 8572 4575 1710 3997 1983
## 2 0.99500 7080 3960 1560 3120 1638
## 3 0.99950 6059 3529 1462 2530 1389
## 4 0.99995 5323 3200 1382 2123 1192
## 5 1.00000 2309 1644 916 665 473
# gmet = genes with multiple expressed transcripts
library(plyr)
gmet <- ddply(tres, .(agi), summarise, count = length(id[X35S.mean > 0 || GDC.mean >
0]))
# ngmet = number of genes with multuple expressed transcripts
ngmet <- length(which(gmet$count >= 1))
print(paste("There were", ngmet, "genes with multiple expressed (count >=1 in at least one sample) transcripts"))
## [1] "There were 23231 genes with multiple expressed (count >=1 in at least one sample) transcripts"
# bsst = bundle sheath specific transcripts from multiple-transcript genes
# wlst = whole leaf specific transcripts from multiple-transcript genes bwst
# = both way specific (at least one transcript up in each cell type)
bsst = subset(tres, PPDE >= 0.95 & log2.FC < 0)
bsst2x = bsst[bsst$log2.FC < -1, ]
wlst = subset(tres, PPDE >= 0.95 & log2.FC > 0)
wlst2x = wlst[wlst$log2.FC > 1, ]
n_bsst <- length(unique(bsst$agi))
n_bsst2x <- length(unique(bsst2x$agi))
n_wlst <- length(unique(wlst$agi))
n_wlst2x <- length(unique(wlst2x$agi))
bwst <- intersect(unique(bsst$agi), unique(wlst$agi))
n_bwst <- length(bwst)
bwst2x <- intersect(unique(bsst$agi), unique(wlst$agi))
n_bwst2x <- length(bwst2x)
print(data.frame(FC_cutoff = c("none", "2x"), genes_BS_ts = c(n_bsst, n_bsst2x),
genes_35S_ts = c(n_wlst, n_wlst2x), genes_both_ts = c(n_bwst, n_bwst2x)))
## FC_cutoff genes_BS_ts genes_35S_ts genes_both_ts
## 1 none 4340 3775 87
## 2 2x 1625 1856 87
library(ggplot2)
library(reshape2)
names(res)[1] <- "agi"
plotfigure <- function(fig) {
genelist <- read.csv(fig, head = F, as.is = T)
names(genelist) <- c("agi", "name")
genelist$agi <- toupper(genelist$agi)
genelist$name <- factor(genelist$name, ordered = T, levels = genelist$name)
genelist <- merge(genelist, res[, c("agi", "log2.FC")], by = "agi")
genelist <- genelist[with(genelist, order(name)), ]
p <- ggplot(genelist, aes(x = name, y = log2.FC)) + geom_bar(stat = "identity") +
theme_bw() + ggtitle(gsub(fig, pattern = "([^/]*)\\\\.csv", replacement = "\\1")) +
xlab("gene") + ylab("log2 fold change") + theme(axis.text.x = element_text(angle = 90,
hjust = 1))
print(p)
}
files <- list.files(path = "~/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists/",
full.names = T)
print(files)
## [1] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig4b.csv"
## [2] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig4c.csv"
## [3] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig4d.csv"
## [4] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig5a.csv"
## [5] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig5b.csv"
## [6] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig5c.csv"
## [7] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig5d.csv"
## [8] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig6.csv"
## [9] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//sfig2a.csv"
## [10] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//sfig2b.csv"
## [11] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//sfig2c.csv"
for (file in files) {
plotfigure(file)
}