library(dplyr)
library(tidyr)
library(ggplot2)mean_fragment_length = 300
counts = tibble(Gene = c('A', 'B', 'C', 'D'),
Length = c(700, 700, 800, 800),
Control = c(40, 20, 30, 40),
Treatment = c(20, 20, 30, 40) * 2) %>%
gather(Library, Count, -Gene, -Length) %>%
mutate(EffectiveLength = Length - mean_fragment_length + 1)
countsgene_colors = c(A = 'dodgerblue2', B = 'darkgrey', C = 'darkgrey', D = 'darkgrey')ggplot(counts, aes(x = Gene, y = Count, fill = Gene)) +
geom_bar(stat = 'identity') +
facet_wrap(~ Library) +
scale_fill_manual(values = gene_colors, guide = FALSE) +
labs(y = 'Sequenced fragments') +
nogrid()But: we don’t want
fragments per library.
We want
RNA molecules per cell.
ggplot(counts, aes(x = Library, y = Count, fill = Gene)) +
geom_bar(stat = 'sum') +
scale_fill_manual(values = gene_colors) +
labs(y = 'Sequenced fragments') +
nogrid() +
theme(legend.position = 'none')# FIXME: Direction should be clockwise but `direction = 1` does anticlockwise
# on my computer. But labels are clockwise?!
counts %>%
group_by(Library) %>%
mutate(Fraction = Count / sum(Count)) %>%
mutate(LabelPos = sum(Fraction) - (cumsum(Fraction) - Fraction / 2)) %>%
ggplot(aes(x = '', y = Fraction, fill = Gene)) +
geom_bar(stat = 'identity', width = 1, color = 'white') +
geom_text(aes(y = LabelPos, label = Gene)) +
pie_chart() +
facet_wrap(~ Library) +
scale_fill_manual(values = gene_colors, guide = FALSE) +
labs(x = '', y = 'Fraction of RNA molecules')tlen = 22
flen = 10
gap = 0.3
transcript = tibble(xmin = 1, ymin = 1, xmax = tlen, ymax = 2, type = 'transcript')
fragments = tibble(xmin = seq(1, tlen - flen), ymin = xmin * (1 + gap) + 1,
xmax = xmin + flen, ymax = ymin + 1, type = 'fragment')
annotate_equation = function (x, y, label)
annotate('text', x = x, y = y, label = label, parse = TRUE,
size = 8, family = 'Times New Roman')
ggplot(bind_rows(transcript, fragments),
aes(xmin = xmin, ymin = ymin, xmax = xmax, ymax = ymax, fill = type)) +
geom_rect() +
coord_cartesian(xlim = c(-1.5, tlen), ylim = c(-3, tail(fragments$ymax, 1))) +
annotation_custom(brackets(0.135, 0.28, 0.135, 1, h = 0.1, lwd = 2)) +
annotate_equation(-1.5, 11, 'italic(l)[plain(eff)]') +
annotation_custom(brackets(0.145, 0.21, 0.95, 0.21, h = -0.1, lwd = 2)) +
annotate_equation(11.4, -2.9, 'italic(l)') +
annotation_custom(brackets(0.145, 0.34, 0.526, 0.34, h = 0.1, lwd = 2)) +
annotate_equation(6.15, 7.5, 'italic(bar(n))') +
scale_fill_manual(values = c(transcript = 'darkolivegreen4', fragment = 'darkgrey')) +
theme(panel.grid = element_blank(),
legend.position = 'none',
axis.title = element_blank(),
axis.text = element_blank())\[ \large l_\text{eff} = l - \bar{n} + 1 \]
Different transcript lengths lead to different expected fragment counts
transcripts = IRanges::IRanges(start = c(1, 22), width = c(20, 40))
fstarts = IRanges::resize(transcripts, IRanges::width(transcripts) - flen + 1) %>%
IRanges::coverage() %>%
{seq_along(.)[as.integer(.) == 1]}
set.seed(1234)
fragments = IRanges::IRanges(start = sample(fstarts, 9), width = flen)
fragment_data = fragments %>%
as.data.frame() %>%
mutate(bin = IRanges::disjointBins(fragments), type = 'read')transcripts %>%
as.data.frame() %>%
mutate(bin = 0, type = c('a', 'b')) %>%
bind_rows(fragment_data) %>%
mutate(ymin = bin * (1 + gap), ymax = ymin + 1) %>%
ggplot(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax, fill = type)) +
geom_rect() +
scale_fill_manual(values = c(a = 'darkolivegreen4', b = 'dodgerblue3', read = 'darkgrey')) +
theme(panel.grid = element_blank(),
legend.position = 'none',
axis.title = element_blank(),
axis.text = element_blank())RNA-seq count is influenced by
TPM (transcripts per millions): Given a million RNA fragments, how many fragments of a given transcript do we see?
→ TPM is a relative fraction, not absolute expression
\[ \large \text{TPM}_\color{orchid}i = {\color{dodgerblue}{\frac{x_\color{orchid}i}{{l_\text{eff}}_\color{orchid}i}}} \cdot \frac{1}{\sum_\color{tomato}j \color{dodgerblue}{\frac{x_\color{tomato}j}{{l_\text{eff}}_\color{tomato}j}}} \cdot \color{darkcyan}{10^6} \]
tpm = function (counts, effective_lengths) {
rate = log(counts) - log(effective_lengths)
exp(rate - log(sum(exp(rate))) + log(10 ^ 6))
}tpms = counts %>%
group_by(Library) %>%
mutate(TPM = tpm(Count, EffectiveLength))
tpmstpm_plot = ggplot(tpms, aes(x = Gene, y = TPM, fill = Gene)) +
geom_bar(stat = 'identity') +
facet_wrap(~ Library) +
scale_fill_manual(values = gene_colors, guide = FALSE) +
labs(y = 'TPM') +
nogrid()
tpm_plotFundamental assumption:
Most transcripts are not differentially expressed.
size_factors_ = function (counts, genes, experiment) {
counts %>%
group_by_(genes) %>%
mutate(LogGeomMean = mean(log(Count))) %>%
group_by_(experiment) %>%
mutate(SF = exp(median(log(Count) - LogGeomMean)))
}
size_factors = function (counts, genes, experiment) {
size_factors_(counts, lazyeval::lazy(genes), lazyeval::lazy(experiment))
}
norm_counts = counts %>%
size_factors(Gene, Library) %>%
mutate(Count = Count / SF)
norm_counts# TPM plot again, for comparison
tpm_plot %+% mutate(tpms, TPM = TPM / 10000) + ylab('') + ggtitle('TPM / 10000')ggplot(norm_counts, aes(Gene, Count, fill = Gene)) +
geom_bar(stat = 'identity') +
facet_wrap(~ Library) +
scale_fill_manual(values = gene_colors, guide = FALSE) +
ylab('') +
ggtitle('Count / DESeq size factors') +
nogrid()CPM = RPM = {counts/reads} per million
\[ \text{CPM}_i = x_i \cdot \frac{1}{\sum_j x_j} \cdot 10^6 \]
👎 Does not account for differences in transcript length
RPKM = FPKM = {reads/fragments} per kilobase per million
\[ \text{FPKM}_i = \frac{x_i}{{l_\text{eff}}_i} \cdot \frac{1}{\sum_j x_j} \cdot 10^6 \cdot 10^3 \]
👎 Has different scaling factors for each RNA-seq library