Introduction

This document demonstrates a strategy to adjust the p-values for a differential translation efficiency analysis using riborex. The dataset involves a time-course experiment studying the effect of radiation on glioblastoma cells. Briefly, two glioblastoma cell lines, U251 (p53 mt) and U343 (p53 wt), were profiled using RNA-Seq and Ribo-Seq at 3 time points: 0h, 1h and 24h post radiation.

Utility functions

suppressMessages(library(riborex))
suppressMessages(library(fdrtool))
suppressMessages(library(cowplot))
# Gene annotation table with columns gene_id, gene_name and gene_type; rows
# are keyed by gene_id so genes can be looked up by row name.
annotations <- read.table(
  file.path('..', 'annotations', 'hg38_gene_names_stripped.tsv'),
  header = FALSE,
  col.names = c('gene_id', 'gene_name', 'gene_type'),
  stringsAsFactors = FALSE
)
rownames(annotations) <- annotations$gene_id
# Histone genes are collected from two sources: annotated genes whose symbol
# contains 'HIST', and genes whose symbol appears in the 'Approved.Symbol'
# column of histone_genes.tsv.
histone.genes.df <- read.table(
  file.path('..', 'annotations', 'histone_genes.tsv'),
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = '\t'
)
histone.genes.1 <- annotations[grep('HIST', annotations$gene_name), ]$gene_id
histone.genes.2 <- annotations[annotations$gene_name %in%
                                 histone.genes.df$Approved.Symbol, ]$gene_id
# Add RP1-34B20.21 separately since symbol doesn't have HIST in it.
histone.genes <- unique(c(histone.genes.1, histone.genes.2, 'ENSG00000282988'))
# Input and output locations (relative paths).
readcounts.dir <- file.path('..', 'read_counts', 'byCDS')
results.dir <- file.path('..', 'results', 'translation_efficiency',
                         'without_histones_edgeRD')
rna.design.file <- file.path('..', 'design_files', 'rna_seq_design.tsv')
ribo.design.file <- file.path('..', 'design_files', 'ribo_seq_design.tsv')
## Suffix of htseq-count output
counts.suffix <- '.CDS.counts.tsv'
# Design tables; TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
rna.design.info <- read.table(rna.design.file, header = TRUE,
                              stringsAsFactors = FALSE)
ribo.design.info <- read.table(ribo.design.file, header = TRUE,
                               stringsAsFactors = FALSE)
count.reads <- function(design.info, dirname){
  # Build a DESeq2 dataset from htseq-count files and run DESeq() on it.
  #
  # Args:
  #   design.info: table whose SampleFile, SampleName, Time and Cell_line
  #     columns are read here.
  #   dirname: sub-directory of the global `readcounts.dir` holding the files.
  #
  # Returns: the object returned by DESeq().
  time.levels <- c('T0', 'T1', 'T24')
  cell.levels <- c('U251', 'U343')
  sampleTable <- data.frame(
    sampleName = design.info$SampleName,
    # File name = SampleFile followed by the global `counts.suffix`.
    fileName = paste0(design.info$SampleFile, counts.suffix),
    time = factor(design.info$Time, levels = time.levels),
    Cell_line = factor(design.info$Cell_line, levels = cell.levels)
  )
  dds <- DESeqDataSetFromHTSeqCount(
    sampleTable = sampleTable,
    directory = file.path(readcounts.dir, dirname),
    design = ~ time+Cell_line
  )
  # Remove every '.<digits>' run from the gene IDs.
  rownames(dds) <- gsub('\\.[0-9]+', '', rownames(dds))
  # Remove histones
  is.histone <- rownames(dds) %in% histone.genes
  dds <- dds[!is.histone, ]
  # Keep only genes with more than one read summed over all samples.
  has.reads <- rowSums(counts(dds)) > 1
  dds <- dds[has.reads, ]
  DESeq(dds)
}
filter_results <- function(df){
  # Turn a results object into a data.frame sorted by `padj`, add a
  # `gene_name` column looked up by row name in the global `annotations`
  # table, and return only the rows with padj < 0.05.
  ranked <- as.data.frame(df)
  ranked <- ranked[order(ranked$padj), ]
  ranked$gene_name <- annotations[rownames(ranked), 'gene_name']
  # Rows with a missing padj are excluded, exactly as subset() would do.
  is.sig <- !is.na(ranked$padj) & ranked$padj < 0.05
  ranked[is.sig, , drop = FALSE]
}
filter_results.edgeR <- function(df){
  # Same as filter_results(), but for an object that carries its results in
  # a `table` element with an `FDR` column: sort by FDR, add `gene_name` from
  # the global `annotations` table, and return the rows with FDR < 0.05.
  ranked <- as.data.frame(df$table)
  ranked <- ranked[order(ranked$FDR), ]
  ranked$gene_name <- annotations[rownames(ranked), 'gene_name']
  # Rows with a missing FDR are excluded, exactly as subset() would do.
  is.sig <- !is.na(ranked$FDR) & ranked$FDR < 0.05
  ranked[is.sig, , drop = FALSE]
}
doPvalueAdjustment <- function(results){
  # Recompute p-values from the `stat` column with fdrtool() and replace the
  # `padj` column with their Benjamini-Hochberg adjustment.
  #
  # Args:
  #   results: results table with `pvalue` and `stat` columns and, optionally,
  #     an existing `padj` column.
  #
  # Returns: `results` without rows that have an NA `pvalue` (or an NA `padj`,
  #   when that column exists), with `padj` re-added as the last column.
  #
  # Side effects: draws a histogram of the input p-values, the fdrtool()
  #   diagnostic plot, and a histogram of the recomputed p-values.
  hist(results$pvalue, main = 'DESeq2 unadjusted p-values',
       xlab = 'Unadjusted p-values')
  keep <- !is.na(results$pvalue)
  # Only filter on padj when the column is present; indexing with the
  # zero-length mask from a missing column would silently drop every row.
  if ('padj' %in% names(results)) {
    keep <- keep & !is.na(results$padj)
  }
  results <- results[keep, , drop = FALSE]
  # A logical mask is used instead of -which(): when nothing matches,
  # -which() yields integer(0) and selects no columns at all.
  results <- results[, names(results) != 'padj', drop = FALSE]
  resultsFDR <- fdrtool(results$stat,
                        statistic = 'normal',
                        plot = TRUE)
  results[, 'padj'] <- p.adjust(resultsFDR$pval,
                                method = 'BH')
  hist(resultsFDR$pval,
       main = 'DESeq2 corrected p-values | Empirical null',
       xlab = 'Corrected p-values')
  return (results)
}
riborex.for.cellline <- function(rna.read.counts, ribo.read.counts,
                                 cell.line, contrast, engine='DESeq2', minMeanCount = 20,
                                 merge.T1=FALSE){
  # Run riborex() on the samples of a single cell line.
  #
  # Args:
  #   rna.read.counts, ribo.read.counts: count tables with genes as rows and
  #     samples as columns; column names are '_'-separated sample names.
  #   cell.line: pattern given to grepl() to select this cell line's columns.
  #   contrast, engine, minMeanCount: forwarded unchanged to riborex().
  #   merge.T1: if TRUE, the 'T0' and 'T1' time levels are pooled into one
  #     'T0T1' level before calling riborex().
  #
  # Returns: the value returned by riborex().

  # Extract the index-th '_'-separated field of each sample name as a factor.
  name.field <- function(sample.names, index) {
    parts <- strsplit(as.character(sample.names), '_')
    factor(vapply(parts, function(p) p[index], character(1)))
  }

  # drop = FALSE keeps the tables two-dimensional even when only a single
  # sample (or, further down, a single gene) is selected.
  rna.read.counts <- rna.read.counts[, grepl(cell.line,
                                             colnames(rna.read.counts)),
                                     drop = FALSE]
  ribo.read.counts <- ribo.read.counts[, grepl(cell.line,
                                               colnames(ribo.read.counts)),
                                       drop = FALSE]
  # The time label is read from field 3 of the RNA sample names and from
  # field 4 of the Ribo sample names.
  rna.conditions.time <- name.field(colnames(rna.read.counts), 3)
  ribo.conditions.time <- name.field(colnames(ribo.read.counts), 4)

  if (merge.T1){
    early <- c('T0', 'T1')
    levels(rna.conditions.time)[levels(rna.conditions.time) %in% early] <- 'T0T1'
    levels(ribo.conditions.time)[levels(ribo.conditions.time) %in% early] <- 'T0T1'
  }
  rna.conditions <- data.frame('time' = rna.conditions.time)
  ribo.conditions <- data.frame('time' = ribo.conditions.time)
  # Restrict both tables to the genes they share, in the same order.
  common.genes <- intersect(rownames(rna.read.counts),
                            rownames(ribo.read.counts))
  rna.read.counts <- rna.read.counts[common.genes, , drop = FALSE]
  ribo.read.counts <- ribo.read.counts[common.genes, , drop = FALSE]
  # Tag the sample names with their assay.
  colnames(rna.read.counts) <- paste(colnames(rna.read.counts),
                                     'RNA', sep='_')
  colnames(ribo.read.counts) <- paste(colnames(ribo.read.counts),
                                      'Ribo', sep='_')
  res <- riborex(rna.read.counts,
                 ribo.read.counts,
                 rna.conditions,
                 ribo.conditions,
                 contrast = contrast,
                 engine = engine,
                 minMeanCount = minMeanCount)
  return (res)
}

Read data

# Build the RNA-Seq dataset from the 'rna_seq' read-count directory. The
# lines after the call are console output captured in the rendered notebook.
rna.read.counts.all <- count.reads(rna.design.info, 'rna_seq')
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
# Same for the Ribo-Seq dataset, read from the 'ribo_seq' directory.
ribo.read.counts.all <- count.reads(ribo.design.info, 'ribo_seq')
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
# Sample names are '_'-separated. The time label is taken from field 3 of the
# RNA sample names and field 4 of the Ribo sample names; the cell line is
# field 1 in both.
rna.conditions.time.all <- factor(vapply(
  strsplit(colnames(rna.read.counts.all), '_'),
  function(fields) fields[3], character(1)))
ribo.conditions.time.all <- factor(vapply(
  strsplit(colnames(ribo.read.counts.all), '_'),
  function(fields) fields[4], character(1)))
rna.conditions.cell.all <- factor(vapply(
  strsplit(colnames(rna.read.counts.all), '_'),
  function(fields) fields[1], character(1)))
ribo.conditions.cell.all <- factor(vapply(
  strsplit(colnames(ribo.read.counts.all), '_'),
  function(fields) fields[1], character(1)))
# One row per sample: cell type and time point.
rna.conditions.all <- data.frame(cell.type = rna.conditions.cell.all,
                                 time = rna.conditions.time.all)
ribo.conditions.all <- data.frame(cell.type = ribo.conditions.cell.all,
                                  time = ribo.conditions.time.all)
# Restrict both datasets to the genes they share, in the same order.
common.genes <- intersect(rownames(rna.read.counts.all),
                          rownames(ribo.read.counts.all))
rna.read.counts.all <- rna.read.counts.all[common.genes, ]
ribo.read.counts.all <- ribo.read.counts.all[common.genes, ]
# Tag the sample names with their assay.
colnames(rna.read.counts.all) <- paste(colnames(rna.read.counts.all),
                                       'RNA', sep = '_')
colnames(ribo.read.counts.all) <- paste(colnames(ribo.read.counts.all),
                                        'Ribo', sep = '_')
# Contrasts as (factor, numerator level, denominator level).
contrast.T1vsT0 <- c('time', 'T1', 'T0')
contrast.T24vsT0 <- c('time', 'T24', 'T0')
contrast.T24vsT1 <- c('time', 'T24', 'T1')
# Per-gene score for each assay: (1 / log2(e))^2 * (1 / baseMean + dispersion),
# with baseMean and dispersion read from rowData(). The leading factor is
# mathematically log(2)^2.
var.rna <- (1 / log2(exp(1)))^2 *
  (1 / rowData(rna.read.counts.all)$baseMean +
     rowData(rna.read.counts.all)$dispersion)
var.ribo <- (1 / log2(exp(1)))^2 *
  (1 / rowData(ribo.read.counts.all)$baseMean +
     rowData(ribo.read.counts.all)$dispersion)
# Fourth root of each score (square root applied twice).
sqrtsd.rna <- sqrt(sqrt(var.rna))
sqrtsd.ribo <- sqrt(sqrt(var.ribo))
library(tidyverse)
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
collapse(): dplyr, IRanges
combine():  dplyr, Biobase, BiocGenerics
count():    dplyr, matrixStats
desc():     dplyr, IRanges
expand():   tidyr, Matrix, S4Vectors
filter():   dplyr, stats
first():    dplyr, S4Vectors
ggsave():   ggplot2, cowplot
lag():      dplyr, stats
Position(): ggplot2, BiocGenerics, base
reduce():   purrr, GenomicRanges, IRanges
rename():   dplyr, S4Vectors
select():   dplyr, AnnotationDbi
simplify(): purrr, clusterProfiler
slice():    dplyr, IRanges
# Difference in scores: inspect whether it centers around zero.
# ggplot() is used instead of qplot(), which is deprecated in recent ggplot2.
ggplot(mapping = aes(x = sqrtsd.rna - sqrtsd.ribo)) +
  geom_dotplot(binwidth = 0.008)

ggplot(mapping = aes(x = sqrtsd.rna - sqrtsd.ribo)) +
  geom_density()

# Paired t-test of the per-gene scores (RNA vs Ribo).
t.test(sqrtsd.rna, sqrtsd.ribo, paired = TRUE)

    Paired t-test

data:  sqrtsd.rna and sqrtsd.ribo
t = -26.34, df = 15140, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.04533102 -0.03905170
sample estimates:
mean of the differences 
            -0.04219136 
---
title: "Accounting for overdispersion in Riborex"
subtitle: When the theoretical distribution is too theoretical.
output:
  html_notebook: default
  html_document: default
  pdf_document: default
---

# Introduction

This document demonstrates a strategy to adjust the p-values for a differential translation efficiency analysis using riborex. The dataset involves a time-course experiment studying the effect of radiation on glioblastoma cells. Briefly, two glioblastoma cell lines, U251 (p53 mt) and U343 (p53 wt), were profiled using RNA-Seq and Ribo-Seq at 3 time points: 0h, 1h and 24h post radiation.

# Utility functions
```{r}
suppressMessages(library(riborex))
suppressMessages(library(fdrtool))
suppressMessages(library(cowplot))

# Gene annotation table with columns gene_id, gene_name and gene_type; rows
# are keyed by gene_id so genes can be looked up by row name.
annotations <- read.table(
  file.path('..', 'annotations', 'hg38_gene_names_stripped.tsv'),
  header = FALSE,
  col.names = c('gene_id', 'gene_name', 'gene_type'),
  stringsAsFactors = FALSE
)
rownames(annotations) <- annotations$gene_id


# Histone genes are collected from two sources: annotated genes whose symbol
# contains 'HIST', and genes whose symbol appears in the 'Approved.Symbol'
# column of histone_genes.tsv.
histone.genes.df <- read.table(
  file.path('..', 'annotations', 'histone_genes.tsv'),
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = '\t'
)
histone.genes.1 <- annotations[grep('HIST', annotations$gene_name), ]$gene_id
histone.genes.2 <- annotations[annotations$gene_name %in%
                                 histone.genes.df$Approved.Symbol, ]$gene_id
# Add RP1-34B20.21 separately since symbol doesn't have HIST in it.
histone.genes <- unique(c(histone.genes.1, histone.genes.2, 'ENSG00000282988'))


# Input and output locations (relative paths).
readcounts.dir <- file.path('..', 'read_counts', 'byCDS')
results.dir <- file.path('..', 'results', 'translation_efficiency',
                         'without_histones_edgeRD')
rna.design.file <- file.path('..', 'design_files', 'rna_seq_design.tsv')
ribo.design.file <- file.path('..', 'design_files', 'ribo_seq_design.tsv')

## Suffix of htseq-count output
counts.suffix <- '.CDS.counts.tsv'

# Design tables; TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
rna.design.info <- read.table(rna.design.file, header = TRUE,
                              stringsAsFactors = FALSE)
ribo.design.info <- read.table(ribo.design.file, header = TRUE,
                               stringsAsFactors = FALSE)


count.reads <- function(design.info, dirname){
  # Build a DESeq2 dataset from htseq-count files and run DESeq() on it.
  #
  # Args:
  #   design.info: table whose SampleFile, SampleName, Time and Cell_line
  #     columns are read here.
  #   dirname: sub-directory of the global `readcounts.dir` holding the files.
  #
  # Returns: the object returned by DESeq().
  time.levels <- c('T0', 'T1', 'T24')
  cell.levels <- c('U251', 'U343')
  sampleTable <- data.frame(
    sampleName = design.info$SampleName,
    # File name = SampleFile followed by the global `counts.suffix`.
    fileName = paste0(design.info$SampleFile, counts.suffix),
    time = factor(design.info$Time, levels = time.levels),
    Cell_line = factor(design.info$Cell_line, levels = cell.levels)
  )
  dds <- DESeqDataSetFromHTSeqCount(
    sampleTable = sampleTable,
    directory = file.path(readcounts.dir, dirname),
    design = ~ time+Cell_line
  )
  # Remove every '.<digits>' run from the gene IDs.
  rownames(dds) <- gsub('\\.[0-9]+', '', rownames(dds))
  # Remove histones
  is.histone <- rownames(dds) %in% histone.genes
  dds <- dds[!is.histone, ]
  # Keep only genes with more than one read summed over all samples.
  has.reads <- rowSums(counts(dds)) > 1
  dds <- dds[has.reads, ]
  DESeq(dds)
}

filter_results <- function(df){
  # Turn a results object into a data.frame sorted by `padj`, add a
  # `gene_name` column looked up by row name in the global `annotations`
  # table, and return only the rows with padj < 0.05.
  ranked <- as.data.frame(df)
  ranked <- ranked[order(ranked$padj), ]
  ranked$gene_name <- annotations[rownames(ranked), 'gene_name']
  # Rows with a missing padj are excluded, exactly as subset() would do.
  is.sig <- !is.na(ranked$padj) & ranked$padj < 0.05
  ranked[is.sig, , drop = FALSE]
}

filter_results.edgeR <- function(df){
  # Same as filter_results(), but for an object that carries its results in
  # a `table` element with an `FDR` column: sort by FDR, add `gene_name` from
  # the global `annotations` table, and return the rows with FDR < 0.05.
  ranked <- as.data.frame(df$table)
  ranked <- ranked[order(ranked$FDR), ]
  ranked$gene_name <- annotations[rownames(ranked), 'gene_name']
  # Rows with a missing FDR are excluded, exactly as subset() would do.
  is.sig <- !is.na(ranked$FDR) & ranked$FDR < 0.05
  ranked[is.sig, , drop = FALSE]
}

doPvalueAdjustment <- function(results){
  # Recompute p-values from the `stat` column with fdrtool() and replace the
  # `padj` column with their Benjamini-Hochberg adjustment.
  #
  # Args:
  #   results: results table with `pvalue` and `stat` columns and, optionally,
  #     an existing `padj` column.
  #
  # Returns: `results` without rows that have an NA `pvalue` (or an NA `padj`,
  #   when that column exists), with `padj` re-added as the last column.
  #
  # Side effects: draws a histogram of the input p-values, the fdrtool()
  #   diagnostic plot, and a histogram of the recomputed p-values.
  hist(results$pvalue, main = 'DESeq2 unadjusted p-values',
       xlab = 'Unadjusted p-values')
  keep <- !is.na(results$pvalue)
  # Only filter on padj when the column is present; indexing with the
  # zero-length mask from a missing column would silently drop every row.
  if ('padj' %in% names(results)) {
    keep <- keep & !is.na(results$padj)
  }
  results <- results[keep, , drop = FALSE]
  # A logical mask is used instead of -which(): when nothing matches,
  # -which() yields integer(0) and selects no columns at all.
  results <- results[, names(results) != 'padj', drop = FALSE]
  resultsFDR <- fdrtool(results$stat,
                        statistic = 'normal',
                        plot = TRUE)
  results[, 'padj'] <- p.adjust(resultsFDR$pval,
                                method = 'BH')
  hist(resultsFDR$pval,
       main = 'DESeq2 corrected p-values | Empirical null',
       xlab = 'Corrected p-values')
  return (results)
}

riborex.for.cellline <- function(rna.read.counts, ribo.read.counts,
                                 cell.line, contrast, engine='DESeq2', minMeanCount = 20,
                                 merge.T1=FALSE){
  # Run riborex() on the samples of a single cell line.
  #
  # Args:
  #   rna.read.counts, ribo.read.counts: count tables with genes as rows and
  #     samples as columns; column names are '_'-separated sample names.
  #   cell.line: pattern given to grepl() to select this cell line's columns.
  #   contrast, engine, minMeanCount: forwarded unchanged to riborex().
  #   merge.T1: if TRUE, the 'T0' and 'T1' time levels are pooled into one
  #     'T0T1' level before calling riborex().
  #
  # Returns: the value returned by riborex().

  # Extract the index-th '_'-separated field of each sample name as a factor.
  name.field <- function(sample.names, index) {
    parts <- strsplit(as.character(sample.names), '_')
    factor(vapply(parts, function(p) p[index], character(1)))
  }

  # drop = FALSE keeps the tables two-dimensional even when only a single
  # sample (or, further down, a single gene) is selected.
  rna.read.counts <- rna.read.counts[, grepl(cell.line,
                                             colnames(rna.read.counts)),
                                     drop = FALSE]
  ribo.read.counts <- ribo.read.counts[, grepl(cell.line,
                                               colnames(ribo.read.counts)),
                                       drop = FALSE]
  # The time label is read from field 3 of the RNA sample names and from
  # field 4 of the Ribo sample names.
  rna.conditions.time <- name.field(colnames(rna.read.counts), 3)
  ribo.conditions.time <- name.field(colnames(ribo.read.counts), 4)

  if (merge.T1){
    early <- c('T0', 'T1')
    levels(rna.conditions.time)[levels(rna.conditions.time) %in% early] <- 'T0T1'
    levels(ribo.conditions.time)[levels(ribo.conditions.time) %in% early] <- 'T0T1'
  }
  rna.conditions <- data.frame('time' = rna.conditions.time)
  ribo.conditions <- data.frame('time' = ribo.conditions.time)
  # Restrict both tables to the genes they share, in the same order.
  common.genes <- intersect(rownames(rna.read.counts),
                            rownames(ribo.read.counts))
  rna.read.counts <- rna.read.counts[common.genes, , drop = FALSE]
  ribo.read.counts <- ribo.read.counts[common.genes, , drop = FALSE]
  # Tag the sample names with their assay.
  colnames(rna.read.counts) <- paste(colnames(rna.read.counts),
                                     'RNA', sep='_')
  colnames(ribo.read.counts) <- paste(colnames(ribo.read.counts),
                                      'Ribo', sep='_')
  res <- riborex(rna.read.counts,
                 ribo.read.counts,
                 rna.conditions,
                 ribo.conditions,
                 contrast = contrast,
                 engine = engine,
                 minMeanCount = minMeanCount)
  return (res)
}

```

# Read data

```{r}
# Build one dataset per assay from its read-count directory.
rna.read.counts.all <- count.reads(rna.design.info, 'rna_seq')
ribo.read.counts.all <- count.reads(ribo.design.info, 'ribo_seq')

# Sample names are '_'-separated. The time label is taken from field 3 of the
# RNA sample names and field 4 of the Ribo sample names; the cell line is
# field 1 in both.
rna.conditions.time.all <- factor(vapply(
  strsplit(colnames(rna.read.counts.all), '_'),
  function(fields) fields[3], character(1)))
ribo.conditions.time.all <- factor(vapply(
  strsplit(colnames(ribo.read.counts.all), '_'),
  function(fields) fields[4], character(1)))
rna.conditions.cell.all <- factor(vapply(
  strsplit(colnames(rna.read.counts.all), '_'),
  function(fields) fields[1], character(1)))
ribo.conditions.cell.all <- factor(vapply(
  strsplit(colnames(ribo.read.counts.all), '_'),
  function(fields) fields[1], character(1)))

# One row per sample: cell type and time point.
rna.conditions.all <- data.frame(cell.type = rna.conditions.cell.all,
                                 time = rna.conditions.time.all)
ribo.conditions.all <- data.frame(cell.type = ribo.conditions.cell.all,
                                  time = ribo.conditions.time.all)

# Restrict both datasets to the genes they share, in the same order.
common.genes <- intersect(rownames(rna.read.counts.all),
                          rownames(ribo.read.counts.all))

rna.read.counts.all <- rna.read.counts.all[common.genes, ]
ribo.read.counts.all <- ribo.read.counts.all[common.genes, ]

# Tag the sample names with their assay.
colnames(rna.read.counts.all) <- paste(colnames(rna.read.counts.all),
                                       'RNA', sep = '_')
colnames(ribo.read.counts.all) <- paste(colnames(ribo.read.counts.all),
                                        'Ribo', sep = '_')

# Contrasts as (factor, numerator level, denominator level).
contrast.T1vsT0 <- c('time', 'T1', 'T0')
contrast.T24vsT0 <- c('time', 'T24', 'T0')
contrast.T24vsT1 <- c('time', 'T24', 'T1')
```


```{r}
# Per-gene score for each assay: (1 / log2(e))^2 * (1 / baseMean + dispersion),
# with baseMean and dispersion read from rowData(). The leading factor is
# mathematically log(2)^2.
var.rna <- (1 / log2(exp(1)))^2 *
  (1 / rowData(rna.read.counts.all)$baseMean +
     rowData(rna.read.counts.all)$dispersion)
var.ribo <- (1 / log2(exp(1)))^2 *
  (1 / rowData(ribo.read.counts.all)$baseMean +
     rowData(ribo.read.counts.all)$dispersion)

# Fourth root of each score (square root applied twice).
sqrtsd.rna <- sqrt(sqrt(var.rna))
sqrtsd.ribo <- sqrt(sqrt(var.ribo))


library(tidyverse)

# Difference in scores: inspect whether it centers around zero.
# ggplot() is used instead of qplot(), which is deprecated in recent ggplot2.
ggplot(mapping = aes(x = sqrtsd.rna - sqrtsd.ribo)) +
  geom_dotplot(binwidth = 0.008)
ggplot(mapping = aes(x = sqrtsd.rna - sqrtsd.ribo)) +
  geom_density()
```
```{r}
# Paired t-test of the per-gene scores (RNA vs Ribo).
t.test(sqrtsd.rna, sqrtsd.ribo, paired = TRUE)
```
