Summary statistics for Arabidopsis Translatome paper

For the paper we want summary statistics about the number of genes and transcripts detected and DE at a range of posterior probability cutoffs in each compartment, as well as about the relationship between DE genes and transcripts.

Genes

load numbers and check normalisation

options(warn = -1)
res <- read.csv("~/experiments/translatome/Arabidopsis/rsem/genes.DE.median_normalised.results.v2.csv", 
    as.is = T)
library(EBSeq)
library(reshape2)
library(gridExtra)
library(ggplot2)
normfactors <- MedianNorm(Data = res[, 2:7])
prenorm <- melt(res[, 2:7])
postnorm <- round(res[, 2:7]/do.call(rbind, rep(list(normfactors), nrow(res))))
res[, 2:7] <- postnorm
postnorm <- melt(postnorm)
print("The following plot checks whether normalisation has worked correctly. If it has, the two plots should look identical. If it failed, the lines in the first plot will look less tightly overlapping than those in the second plot.")

## [1] "The following plot checks whether normalisation has worked correctly. If it has, the two plots should look identical. If it failed, the lines in the first plot will look less tightly overlapping than those in the second plot."

suppressWarnings({
    p1 <- ggplot(prenorm, aes(x = log(value), colour = variable)) + geom_density() + 
        ggtitle("pre-normalisation")
    p2 <- ggplot(postnorm, aes(x = log(value), colour = variable)) + geom_density() + 
        ggtitle("post-normalisation")
    grid.arrange(p1, p2, ncol = 2)
})

plot of chunk unnamed-chunk-1

number detected

# Calculate numbers of genes detected

genes_detected <- length(which(apply(res[, 8:9], 1, function(x) !all(x == 0))))
genes_nonzero <- length(which(apply(res[, 8:9], 1, function(x) any(x >= 1))))
print(paste(genes_detected, "genes detected with >0 counts in at least one sample,", 
    genes_nonzero, "of which had >= 1 normalised counts in at least one sample."))

## [1] "23442 genes detected with >0 counts in at least one sample, 21955 of which had >= 1 normalised counts in at least one sample."

number called as differentially expressed

# count DE
ppdesweep <- function(df) {
    ppde_range <- c(0.95, 0.995, 0.9995, 0.99995, 1)
    num_de <- sapply(ppde_range, function(x) length(which(df$PPDE >= x)))
    up_bs <- sapply(ppde_range, function(x) {
        nrow(subset(df, PPDE >= x & log2.FC < 0))
    })
    up_bs_x2 <- sapply(ppde_range, function(x) {
        nrow(subset(df, PPDE >= x & log2.FC <= -1))
    })
    up_35s <- sapply(ppde_range, function(x) {
        nrow(subset(df, PPDE >= x & log2.FC > 0))
    })
    up_35s_x2 <- sapply(ppde_range, function(x) {
        nrow(subset(df, PPDE >= x & log2.FC >= 1))
    })
    diff <- data.frame(PPDE = format(ppde_range, scientific = FALSE, digits = 6), 
        num_DE = num_de, up_BS = up_bs, up_BS_x2 = up_bs_x2, up_35S = up_35s, 
        up_35S_x2 = up_35s_x2)
    return(diff)
}

gene_diff <- ppdesweep(res)
print(gene_diff)

##      PPDE num_DE up_BS up_BS_x2 up_35S up_35S_x2
## 1 0.95000   7946  4224     1869   3722      1155
## 2 0.99500   6666  3691     1759   2975      1018
## 3 0.99950   5773  3319     1674   2454       901
## 4 0.99995   5102  3032     1600   2070       803
## 5 1.00000   2241  1571     1074    670       354

Transcripts

load and check data

tres <- read.csv("~/experiments/translatome/Arabidopsis/rsem/isoforms.DE.median_normalised.results.v2.csv", 
    as.is = T)
normfactors <- MedianNorm(Data = res[, 2:7])
prenorm <- melt(res[, 2:7])
postnorm <- round(res[, 2:7]/do.call(rbind, rep(list(normfactors), nrow(res))))
res[, 2:7] <- postnorm
postnorm <- melt(postnorm)
print("The following plot checks whether normalisation has worked correctly. If it has, the two plots should look identical. If it failed, the lines in the first plot will look less tightly overlapping than those in the second plot.")

## [1] "The following plot checks whether normalisation has worked correctly. If it has, the two plots should look identical. If it failed, the lines in the first plot will look less tightly overlapping than those in the second plot."

suppressWarnings({
    p1 <- ggplot(prenorm, aes(x = log(value), colour = variable)) + geom_density() + 
        ggtitle("pre-normalisation")
    p2 <- ggplot(postnorm, aes(x = log(value), colour = variable)) + geom_density() + 
        ggtitle("post-normalisation")
    grid.arrange(p1, p2, ncol = 2)
})

plot of chunk unnamed-chunk-4

number detected

tres$agi <- gsub(tres$id, pattern = "\\.[0-9]+", replacement = "")
trans_detected <- length(which(apply(tres[, 8:9], 1, function(x) any(x > 0))))
trans_nonzero <- length(which(apply(tres[, 8:9], 1, function(x) any(x >= 1))))
print(paste(trans_detected, "transcripts detected with >0 counts in at least one sample,", 
    trans_nonzero, "of which had >= 1 normalised counts in at least one sample."))

## [1] "30213 transcripts detected with >0 counts in at least one sample, 28639 of which had >= 1 normalised counts in at least one sample."

number called as differentially expressed

trans_diff <- ppdesweep(tres)
print(trans_diff)

##      PPDE num_DE up_BS up_BS_x2 up_35S up_35S_x2
## 1 0.95000   8572  4575     1710   3997      1983
## 2 0.99500   7080  3960     1560   3120      1638
## 3 0.99950   6059  3529     1462   2530      1389
## 4 0.99995   5323  3200     1382   2123      1192
## 5 1.00000   2309  1644      916    665       473

Multiple transcripts

how many genes produce multiple transcripts?

# gmet = genes with multiple expressed transcripts
library(plyr)
gmet <- ddply(tres, .(agi), summarise, count = length(id[X35S.mean > 0 || GDC.mean > 
    0]))
# ngmet = number of genes with multuple expressed transcripts
ngmet <- length(which(gmet$count >= 1))
print(paste("There were", ngmet, "genes with multiple expressed (count >=1 in at least one sample) transcripts"))

## [1] "There were 23231 genes with multiple expressed (count >=1 in at least one sample) transcripts"

of those, how many were enriched in each compartment at the transcript level

# bsst = bundle sheath specific transcripts from multiple-transcript genes
# wlst = whole leaf specific transcripts from multiple-transcript genes bwst
# = both way specific (at least one transcript up in each cell type)
bsst = subset(tres, PPDE >= 0.95 & log2.FC < 0)
bsst2x = bsst[bsst$log2.FC < -1, ]
wlst = subset(tres, PPDE >= 0.95 & log2.FC > 0)
wlst2x = wlst[wlst$log2.FC > 1, ]

n_bsst <- length(unique(bsst$agi))
n_bsst2x <- length(unique(bsst2x$agi))
n_wlst <- length(unique(wlst$agi))
n_wlst2x <- length(unique(wlst2x$agi))
bwst <- intersect(unique(bsst$agi), unique(wlst$agi))
n_bwst <- length(bwst)
bwst2x <- intersect(unique(bsst$agi), unique(wlst$agi))
n_bwst2x <- length(bwst2x)
print(data.frame(FC_cutoff = c("none", "2x"), genes_BS_ts = c(n_bsst, n_bsst2x), 
    genes_35S_ts = c(n_wlst, n_wlst2x), genes_both_ts = c(n_bwst, n_bwst2x)))

##   FC_cutoff genes_BS_ts genes_35S_ts genes_both_ts
## 1      none        4340         3775            87
## 2        2x        1625         1856            87

Sanity check plots

library(ggplot2)
library(reshape2)
names(res)[1] <- "agi"
plotfigure <- function(fig) {
    genelist <- read.csv(fig, head = F, as.is = T)
    names(genelist) <- c("agi", "name")
    genelist$agi <- toupper(genelist$agi)
    genelist$name <- factor(genelist$name, ordered = T, levels = genelist$name)
    genelist <- merge(genelist, res[, c("agi", "log2.FC")], by = "agi")
    genelist <- genelist[with(genelist, order(name)), ]
    p <- ggplot(genelist, aes(x = name, y = log2.FC)) + geom_bar(stat = "identity") + 
        theme_bw() + ggtitle(gsub(fig, pattern = "([^/]*)\\\\.csv", replacement = "\\1")) + 
        xlab("gene") + ylab("log2 fold change") + theme(axis.text.x = element_text(angle = 90, 
        hjust = 1))
    print(p)
}

files <- list.files(path = "~/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists/", 
    full.names = T)
print(files)

##  [1] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig4b.csv" 
##  [2] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig4c.csv" 
##  [3] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig4d.csv" 
##  [4] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig5a.csv" 
##  [5] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig5b.csv" 
##  [6] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig5c.csv" 
##  [7] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig5d.csv" 
##  [8] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//fig6.csv"  
##  [9] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//sfig2a.csv"
## [10] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//sfig2b.csv"
## [11] "/home/rds45/Dropbox/papers/Translatome/Arabidopsis/rsem/fig_lists//sfig2c.csv"

for (file in files) {
    plotfigure(file)
}

plot of chunk unnamed-chunk-9