Introduction
This document demonstrates a strategy to adjust the p-values for a differential translation efficiency analysis using riborex. The dataset comes from a time-course experiment studying the effect of radiation on glioblastoma cells. Briefly, two glioblastoma cell lines, U251 (p53 mt) and U343 (p53 wt), were profiled using RNA-Seq and Ribo-Seq at 3 time points: 0h, 1h and 24h post radiation.
Utility functions
suppressMessages(library(riborex))
suppressMessages(library(fdrtool))
suppressMessages(library(cowplot))
# Gene annotation table (gene id, gene name, gene type), indexed by gene id so
# that result tables can be annotated through their row names.
annotations <- read.table(
  file.path('..', 'annotations', 'hg38_gene_names_stripped.tsv'),
  header = FALSE,
  col.names = c('gene_id', 'gene_name', 'gene_type'),
  stringsAsFactors = FALSE
)
rownames(annotations) <- annotations$gene_id

# Histone gene table; only its `Approved.Symbol` column is used below.
histone.genes.df <- read.table(
  file.path('..', 'annotations', 'histone_genes.tsv'),
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = '\t'
)

# Histone gene ids are collected two ways: gene names containing 'HIST', and
# gene names listed in the histone gene table.
histone.genes.1 <- annotations[grep('HIST', annotations$gene_name), ]$gene_id
histone.genes.2 <- annotations[annotations$gene_name %in%
                                 histone.genes.df$Approved.Symbol, ]$gene_id
# Add RP1-34B20.21 separately since symbol doesn't have HIST in it.
histone.genes <- unique(c(histone.genes.1, histone.genes.2, 'ENSG00000282988'))
# Input and output locations, all relative to the directory above this one.
readcounts.dir <- file.path('..', 'read_counts', 'byCDS')
results.dir <- file.path('..', 'results', 'translation_efficiency',
                         'without_histones_edgeRD')
rna.design.file <- file.path('..', 'design_files', 'rna_seq_design.tsv')
ribo.design.file <- file.path('..', 'design_files', 'ribo_seq_design.tsv')

## Suffix of htseq-count output
counts.suffix <- '.CDS.counts.tsv'

# Sample sheets; count.reads() reads their SampleFile, SampleName, Time and
# Cell_line columns.
rna.design.info <- read.table(rna.design.file, header = TRUE,
                              stringsAsFactors = FALSE)
ribo.design.info <- read.table(ribo.design.file, header = TRUE,
                               stringsAsFactors = FALSE)
# Build a DESeq2 data set from the htseq-count files of one assay and fit it.
#
# Args:
#   design.info: design table with SampleFile, SampleName, Time and Cell_line
#     columns.
#   dirname: sub-directory of `readcounts.dir` that holds the count files.
#
# Returns:
#   The object returned by DESeq() for the design ~ time + Cell_line, after
#   histone genes and genes with a total count of at most 1 were removed.
#
# Reads the globals `counts.suffix`, `readcounts.dir` and `histone.genes`.
count.reads <- function(design.info, dirname) {
  sample.sheet <- data.frame(
    sampleName = design.info$SampleName,
    fileName = paste0(design.info$SampleFile, counts.suffix),
    time = factor(design.info$Time, levels = c('T0', 'T1', 'T24')),
    Cell_line = factor(design.info$Cell_line, levels = c('U251', 'U343'))
  )
  dataset <- DESeqDataSetFromHTSeqCount(
    sampleTable = sample.sheet,
    directory = file.path(readcounts.dir, dirname),
    design = ~ time + Cell_line
  )

  # Drop every '.<digits>' run from the gene ids (e.g. a version suffix).
  rownames(dataset) <- gsub('\\.[0-9]+', '', rownames(dataset))

  # Remove histones
  is.histone <- rownames(dataset) %in% histone.genes
  dataset <- dataset[!is.histone, ]

  # Keep only genes with more than one read in total across samples.
  has.reads <- rowSums(counts(dataset)) > 1
  DESeq(dataset[has.reads, ])
}
# Keep the significant rows (padj < 0.05) of a results table.
#
# Args:
#   df: object convertible with as.data.frame() that has a `padj` column and
#     gene ids as row names.
#
# Returns:
#   A data frame sorted by increasing `padj`, with a `gene_name` column looked
#   up in the global `annotations` table, restricted to rows with a
#   non-missing `padj` below 0.05.
filter_results <- function(df) {
  tbl <- as.data.frame(df)
  tbl <- tbl[order(tbl$padj), ]
  tbl$gene_name <- annotations[rownames(tbl), 'gene_name']
  significant <- !is.na(tbl$padj) & tbl$padj < 0.05
  tbl[significant, , drop = FALSE]
}
# Keep the significant rows (FDR < 0.05) of an edgeR-style result.
#
# Args:
#   df: object whose `table` element converts with as.data.frame() to a table
#     that has an `FDR` column and gene ids as row names.
#
# Returns:
#   A data frame sorted by increasing `FDR`, with a `gene_name` column looked
#   up in the global `annotations` table, restricted to rows with a
#   non-missing `FDR` below 0.05.
filter_results.edgeR <- function(df) {
  tbl <- as.data.frame(df$table)
  tbl <- tbl[order(tbl$FDR), ]
  tbl$gene_name <- annotations[rownames(tbl), 'gene_name']
  significant <- !is.na(tbl$FDR) & tbl$FDR < 0.05
  tbl[significant, , drop = FALSE]
}
# Recompute p-values from the test statistics with fdrtool and re-adjust them
# with Benjamini-Hochberg.
#
# Args:
#   results: results table with `pvalue`, `padj` and `stat` columns.
#
# Returns:
#   `results` restricted to the rows whose original `padj` and `pvalue` are
#   both non-missing, with `padj` replaced by the BH-adjusted fdrtool p-values
#   (the new `padj` is the last column).
#
# Side effects:
#   Draws a histogram of the input p-values, the fdrtool diagnostic plot and a
#   histogram of the fdrtool p-values.
doPvalueAdjustment <- function(results) {
  # Fail early with a clear message instead of an obscure downstream error.
  absent <- setdiff(c('pvalue', 'padj', 'stat'), names(results))
  if (length(absent) > 0) {
    stop('results is missing required column(s): ',
         paste(absent, collapse = ', '), call. = FALSE)
  }

  hist(results$pvalue, main = 'DESeq2 unadjusted p-values',
       xlab = 'Unadjusted p-values')

  # Only rows that carried both an adjusted and a raw p-value are re-estimated.
  results <- results[!is.na(results$padj), ]
  results <- results[!is.na(results$pvalue), ]

  # A logical mask is used rather than `-which(...)`, which would select zero
  # columns if nothing matched.
  results <- results[, names(results) != 'padj', drop = FALSE]

  resultsFDR <- fdrtool(results$stat,
                        statistic = 'normal',
                        plot = TRUE)
  results[, 'padj'] <- p.adjust(resultsFDR$pval,
                                method = 'BH')
  hist(resultsFDR$pval,
       main = 'DESeq2 corrected p-values | Empirical null',
       xlab = 'Corrected p-values')
  results
}
# Run riborex on the samples of a single cell line.
#
# Args:
#   rna.read.counts, ribo.read.counts: count tables with gene ids as row names
#     and underscore-separated sample names as column names. The time point is
#     read from the 3rd field of RNA sample names and from the 4th field of
#     Ribo sample names.
#   cell.line: pattern (passed to grepl) selecting the sample columns to keep.
#   contrast: contrast forwarded to riborex().
#   engine: engine forwarded to riborex().
#   minMeanCount: threshold forwarded to riborex().
#   merge.T1: if TRUE, the time levels 'T0' and 'T1' are merged into a single
#     'T0T1' level.
#
# Returns:
#   The value returned by riborex().
riborex.for.cellline <- function(rna.read.counts, ribo.read.counts,
                                 cell.line, contrast, engine = 'DESeq2',
                                 minMeanCount = 20, merge.T1 = FALSE) {
  # idx-th underscore-separated field of each sample name (NA when absent).
  name.field <- function(sample.names, idx) {
    vapply(strsplit(as.character(sample.names), '_'),
           function(parts) parts[idx], character(1))
  }

  # drop = FALSE keeps the table two-dimensional when one sample matches.
  rna.read.counts <- rna.read.counts[, grepl(cell.line,
                                             colnames(rna.read.counts)),
                                     drop = FALSE]
  ribo.read.counts <- ribo.read.counts[, grepl(cell.line,
                                               colnames(ribo.read.counts)),
                                       drop = FALSE]
  if (ncol(rna.read.counts) == 0 || ncol(ribo.read.counts) == 0) {
    stop('no sample column matches cell.line: ', cell.line, call. = FALSE)
  }

  rna.conditions.time <- as.factor(name.field(colnames(rna.read.counts), 3))
  ribo.conditions.time <- as.factor(name.field(colnames(ribo.read.counts), 4))

  if (merge.T1) {
    # Assigning the same label to both levels collapses them into one.
    early <- c('T0', 'T1')
    levels(rna.conditions.time)[
      levels(rna.conditions.time) %in% early] <- 'T0T1'
    levels(ribo.conditions.time)[
      levels(ribo.conditions.time) %in% early] <- 'T0T1'
  }

  rna.conditions <- data.frame('time' = rna.conditions.time)
  ribo.conditions <- data.frame('time' = ribo.conditions.time)

  # Both tables are restricted to the genes they share, in the same order.
  common.genes <- intersect(rownames(rna.read.counts),
                            rownames(ribo.read.counts))
  rna.read.counts <- rna.read.counts[common.genes, , drop = FALSE]
  ribo.read.counts <- ribo.read.counts[common.genes, , drop = FALSE]

  # Suffix the sample names so RNA and Ribo columns stay distinguishable.
  colnames(rna.read.counts) <- paste(colnames(rna.read.counts),
                                     'RNA', sep = '_')
  colnames(ribo.read.counts) <- paste(colnames(ribo.read.counts),
                                      'Ribo', sep = '_')

  riborex(rna.read.counts,
          ribo.read.counts,
          rna.conditions,
          ribo.conditions,
          contrast = contrast,
          engine = engine,
          minMeanCount = minMeanCount)
}
Read data
# Load the RNA-Seq htseq-count files from the 'rna_seq' sub-directory of
# `readcounts.dir` and fit them; the value is whatever count.reads() returns
# (the fitted DESeq object), not a plain count matrix.
rna.read.counts.all <- count.reads(rna.design.info, 'rna_seq')
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
# Same as for RNA-Seq, using the Ribo-Seq design table and the 'ribo_seq'
# sub-directory of `readcounts.dir`.
ribo.read.counts.all <- count.reads(ribo.design.info, 'ribo_seq')
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
# Per-sample factors parsed from the underscore-separated sample names: the
# cell line is the 1st field; the time point is the 3rd field for RNA samples
# and the 4th field for Ribo samples. vapply() keeps the result a character
# vector whatever the input. This must run before the '_RNA' / '_Ribo'
# suffixes are appended below.
rna.conditions.time.all <- as.factor(
  vapply(strsplit(colnames(rna.read.counts.all), '_'),
         function(x) x[3], character(1)))
ribo.conditions.time.all <- as.factor(
  vapply(strsplit(colnames(ribo.read.counts.all), '_'),
         function(x) x[4], character(1)))
rna.conditions.cell.all <- as.factor(
  vapply(strsplit(colnames(rna.read.counts.all), '_'),
         function(x) x[1], character(1)))
ribo.conditions.cell.all <- as.factor(
  vapply(strsplit(colnames(ribo.read.counts.all), '_'),
         function(x) x[1], character(1)))

rna.conditions.all <- data.frame('cell.type' = rna.conditions.cell.all,
                                 'time' = rna.conditions.time.all)
ribo.conditions.all <- data.frame('cell.type' = ribo.conditions.cell.all,
                                  'time' = ribo.conditions.time.all)

# Restrict both assays to the genes they share, in the same order.
common.genes <- intersect(rownames(rna.read.counts.all),
                          rownames(ribo.read.counts.all))
rna.read.counts.all <- rna.read.counts.all[common.genes, ]
ribo.read.counts.all <- ribo.read.counts.all[common.genes, ]

# Suffix the sample names so RNA and Ribo columns stay distinguishable.
colnames(rna.read.counts.all) <- paste(colnames(rna.read.counts.all),
                                       'RNA', sep = '_')
colnames(ribo.read.counts.all) <- paste(colnames(ribo.read.counts.all),
                                        'Ribo', sep = '_')

# Time-point contrasts on the 'time' factor.
# NOTE(review): presumably (factor, tested level, reference level) -- confirm
# against the riborex contrast convention.
contrast.T1vsT0 <- c('time', 'T1', 'T0')
contrast.T24vsT0 <- c('time', 'T24', 'T0')
contrast.T24vsT1 <- c('time', 'T24', 'T1')
# Per-gene variance score for each assay, built from the baseMean and
# dispersion columns of the fitted objects: (1 / baseMean + dispersion),
# scaled by (1 / log2(e))^2.
# NOTE(review): 1 / log2(e) equals ln(2); a delta-method variance on the log2
# scale would use log2(e)^2 instead -- confirm which scaling is intended. The
# factor is common to both assays.
var.rna <- ( 1/ log2(exp(1)) )^2 * (1 / rowData(rna.read.counts.all)$baseMean + rowData(rna.read.counts.all)$dispersion)
var.ribo <- ( 1/ log2(exp(1)) )^2 * (1 / rowData(ribo.read.counts.all)$baseMean + rowData(ribo.read.counts.all)$dispersion)
# Fourth root of the variance, i.e. the square root of the standard deviation.
sqrtsd.rna <- sqrt(sqrt(var.rna))
sqrtsd.ribo <- sqrt(sqrt(var.ribo))
library(tidyverse)
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
collapse(): dplyr, IRanges
combine(): dplyr, Biobase, BiocGenerics
count(): dplyr, matrixStats
desc(): dplyr, IRanges
expand(): tidyr, Matrix, S4Vectors
filter(): dplyr, stats
first(): dplyr, S4Vectors
ggsave(): ggplot2, cowplot
lag(): dplyr, stats
Position(): ggplot2, BiocGenerics, base
reduce(): purrr, GenomicRanges, IRanges
rename(): dplyr, S4Vectors
select(): dplyr, AnnotationDbi
simplify(): purrr, clusterProfiler
slice(): dplyr, IRanges
# Distribution of the per-gene difference in scores (RNA minus Ribo), to check
# whether it centers around zero.
qplot(sqrtsd.rna - sqrtsd.ribo, geom = "dotplot", binwidth = 0.008)

qplot(sqrtsd.rna - sqrtsd.ribo, geom = "density")

# Paired t-test of the RNA scores against the Ribo scores, gene by gene.
t.test(sqrtsd.rna, sqrtsd.ribo, paired = TRUE)
Paired t-test
data: sqrtsd.rna and sqrtsd.ribo
t = -26.34, df = 15140, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.04533102 -0.03905170
sample estimates:
mean of the differences
-0.04219136
