We will split up the coding sequences into codons and look at codon counts. We can then use individual codon counts or various related metrics (tAI, CAI, amino acid charge, etc) to examine the effects on DNA, RNA, and protein levels.
load_genomic_codon_data <- function(codon_filename = paste(getwd(), "/data/codon_usage.txt",
sep = "")) {
# Let's get codon names and AAs from this table:
codon.txt <- read.table(file = paste(getwd(), "/data/codon_usage.txt", sep = ""),
sep = "\t", header = T)[1:64, 1:5]
# remove the empty level, convert U to T
codon.txt$Codon <- factor(gsub("U", "T", codon.txt$Codon))
# name the rows by codon name for easy lookup
rownames(codon.txt) <- codon.txt$Codon
return(codon.txt)
}
codon.txt <- load_genomic_codon_data()
get_codon_freq <- function(seq) {
seq <- as.character(seq[1, "CDS.seq"])
codons <- factor(unlist(strsplit(seq, "(?<=\\G...)", perl = T)), levels = levels(codon.txt$Codon))
codons <- as.data.frame(table(codons))
names(codons) <- c("Codon", "Freq")
return(codons)
}
# Generate per-sequence codon frequencies
codon.freq <- ddply(lib_seqs, .(CDS.seq), get_codon_freq)
# Add Gene Name to codon frequencies
codon.freq <- merge(codon.freq, unique(lib_seqs[, c("CDS.seq", "Gene")]), by = "CDS.seq")
# Add AA name to codon frequencies
codon.freq <- merge(codon.freq, codon.txt[, c("AA", "Codon")])
codon.totals <- cast(codon.freq, Codon ~ ., value = .(Freq), function(x) sum(as.integer(x)))
codon.totals <- cbind(codon.totals, matrix(unlist(strsplit(as.character(codon.totals$Codon),
split = ""), recursive = F), ncol = 3, byrow = T))
names(codon.totals) <- c("Codon", "Freq", "First", "Second", "Third")
# Add gene count - how many genes this codon appears in
codon.totals$Count.Gene <- rowSums(cast(codon.freq[, c(1, 2, 4, 3)], Codon ~
Gene, sum) > 0, na.rm = T)
# merge, make display column
codon.totals <- merge(codon.totals, codon.txt, by = "Codon")
codon.totals$Display <- with(codon.totals, paste(Codon, " (", AA, ") - ", Count.Gene,
sep = ""))
I want a count of the number of each amino acid per gene, and the normalized ratio of each codon by the number of times the codon's amino acid appears in a gene.
get_aa_freq <- function(seq) {
seq <- as.character(seq[1, "CDS.seq"])
codons <- factor(unlist(strsplit(seq, "(?<=\\G...)", perl = T)), levels = levels(codon.txt$Codon))
codons <- as.data.frame(table(codons))
names(codons) <- c("Codon", "Freq")
aa <- ddply(merge(codons, codon.txt[, c("Codon", "AA")]), "AA", summarize,
Freq = sum(Freq))
return(aa)
}
# generate an amino acid per-gene frequency table and total counts table
aa.freq <- ddply(lib_seqs, .(CDS.seq), get_aa_freq)
codon.aa.freq <- merge(codon.freq, aa.freq, by = c("AA", "CDS.seq"), suffixes = c(".Codon",
".AA"))
aa.freq <- merge(aa.freq, unique(lib_seqs[, c("CDS.seq", "Gene")]), by = "CDS.seq")
aa.totals <- cast(aa.freq, AA ~ ., value = .(Freq), function(x) sum(as.integer(x)))
names(aa.totals) <- c("AA", "Freq.AA")
# merge with codon total counts, get percentages
aa.codon.totals <- merge(aa.totals, codon.totals, by = "AA")
aa.codon.totals$Codon.Pct.Freq <- with(aa.codon.totals, Freq/Freq.AA)
How many times does each amino acid appear per gene?
# get aa by gene counts
aa.by_gene <- ddply(aa.freq, c("Gene", "AA"), summarize, Freq = mean(Freq))
aa.by_gene$Freq.Factor <- as.factor(aa.by_gene$Freq)
aa.gene_table <- with(aa.by_gene, table(AA, Freq.Factor))
print(xtable(aa.gene_table), type = "html")
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | |
A | 58 | 52 | 22 | 5 | 0 | 0 |
B | 137 | 0 | 0 | 0 | 0 | 0 |
C | 128 | 9 | 0 | 0 | 0 | 0 |
D | 94 | 38 | 5 | 0 | 0 | 0 |
E | 85 | 38 | 11 | 2 | 1 | 0 |
F | 90 | 39 | 7 | 1 | 0 | 0 |
G | 92 | 32 | 12 | 1 | 0 | 0 |
H | 120 | 16 | 1 | 0 | 0 | 0 |
I | 49 | 53 | 28 | 7 | 0 | 0 |
K | 61 | 54 | 20 | 1 | 1 | 0 |
L | 38 | 47 | 36 | 13 | 2 | 1 |
M | 0 | 123 | 12 | 2 | 0 | 0 |
N | 94 | 34 | 8 | 1 | 0 | 0 |
O | 137 | 0 | 0 | 0 | 0 | 0 |
P | 88 | 41 | 8 | 0 | 0 | 0 |
Q | 79 | 46 | 10 | 2 | 0 | 0 |
R | 73 | 49 | 13 | 2 | 0 | 0 |
S | 62 | 55 | 19 | 1 | 0 | 0 |
T | 58 | 61 | 15 | 3 | 0 | 0 |
U | 137 | 0 | 0 | 0 | 0 | 0 |
V | 72 | 52 | 11 | 0 | 2 | 0 |
W | 125 | 10 | 2 | 0 | 0 | 0 |
Y | 111 | 26 | 0 | 0 | 0 | 0 |
Let's take a look at what the codon distribution per amino acid looks like.
label_aa_count <- function(aa) paste(aa, aa.totals[aa.totals$AA == aa, "Freq.AA"],
sep = ": ")
aa.codon.totals$AA.Display <- unlist(lapply(aa.codon.totals$AA, label_aa_count))
ggplot(subset(aa.codon.totals, Freq.AA > 0), aes(y = Codon.Pct.Freq, x = Codon,
label = Freq)) + geom_bar() + facet_wrap(~AA.Display, scale = "free_x") +
geom_text(colour = "black", vjust = -0.5) + theme_bw() + labs(title = "Codon Frequency Per Amino Acid")
Here is a simple table with the codon frequency in the whole library:
ggplot(codon.totals, aes(x = 1, y = Third, fill = log10(Freq), label = Display)) +
geom_tile() + facet_grid(First ~ Second, labeller = label_both) + geom_text(colour = "white") +
labs(title = "Absolue Codon Frequency across Library")
It looks like there are enough instances of most of these codons to be able to look at correlation.
Now let's merge codon.freq
with the ngs
dataframe so that we can see if there is a correlation between the frequency of any of the codons and the Protein or (unlikely) RNA levels.
We should also see how many unique peptides are represented by each amino acid, so we'll put that number in parenthesis.
codon_plot_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ Freq",
"Protein Level")
Quite a few amino acids seem to have a correlation with Protein level.
For the grids on the left, light-grey text means the pearson fell below the threshold of significance (0.05/61) corrected for the number of codons. (This is how Pilpel's Genome Biology paper does it in Figure 5).
The linear model here takes into account only frequency of codons, not secondary structure or GC changes that might arise from the codon usage.
Let's do it again and sort by codon occurrences in the genome:
codon_plot_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ Freq",
"Protein Level", sort_x = "Freq.Genome")
The rare codons (right hand side of bar graph x axes) seem to increase expression generally, and the common ones decrease it, like we saw before.
CAA, CAG, and AGG all have very strong correlations, CAG is negative, the other two are positive. R, Q, and N seem to be the amino acids with the strongest correlations overall.
Even when we account for dG and GC content, the correlation remains strong.
codon_plot_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ dG + GC + Freq",
"Protein Level", sort_x = "Freq.Genome")
Let's look at a few strong codon examples and see what the protein fold-change looks like for each.
codon.prot <- codon_corr_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ dG + GC + Freq")
pos_corr_codons <- as.character(head(subset(codon.prot, lm.slope > 0)[order(subset(codon.prot,
lm.slope > 0)$lm.p.value), ], 5)$Codon)
codon_indiv_plot(pos_corr_codons)
codon_indiv_plot(pos_corr_codons, facet = "aa")
codon_indiv_plot(pos_corr_codons, facet = "promo_rbs")
The first plot shows change in protein level by codon variant based on the number of times that codon appears in the codon variant. The second plot splits it up based on the total occurrences of the amino acid.
The correlations look similar, but not as impressive, for the negatively correlated codons:
codon.prot <- codon_corr_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ dG + GC + Freq")
neg_corr_codons <- as.character(head(subset(codon.prot, lm.slope < 0)[order(subset(codon.prot,
lm.slope < 0)$lm.p.value), ], 5)$Codon)
codon_indiv_plot(neg_corr_codons, , bp_color = "red")
codon_indiv_plot(neg_corr_codons, facet = "aa", bp_color = "red")
codon_indiv_plot(neg_corr_codons, facet = "promo_rbs", bp_color = "red")
We should see less correlation with RNA, so let's check:
codon_plot_all(aa.codon.totals, "RNA.FoldChange.Codons", "RNA.FoldChange.Codons ~ dG + GC + Freq",
"RNA Level", sort_x = "AA")
This is interesting. It looks similar to the Protein level. However, I'm not sure what to think of it. Codons should not affect RNA production, unless it's somehow tied into either secondary structure, or perhaps growth rate. However, the growth rate term should be removed since we divide by DNA. Speaking of which, what about DNA?
codon_plot_all(aa.codon.totals, "DNA.FoldChange.Codons", "DNA.FoldChange.Codons ~ dG + GC + Freq",
dep_var = "DNA Level", sort_x = "AA")
## Error: object 'DNA.FoldChange.Codons' not found
The DNA correlation is weaker and slightly different qualitatively; the ANOVA percent explained is less than for protein or RNA.
These plots are somewhat different than the data (Fig. 5) in The 2011 Genome Biology paper by Navon and Pilpel:
OK, so this is an interesting effect. Finally, what if we look at the translation efficiency, Protein / RNA?
codon_plot_all(aa.codon.totals, "Trans.FoldChange.Codons", "Trans.FoldChange.Codons ~ dG + GC + Freq",
label.name = "Translation Efficiency", sort_x = "AA")
codon_plot_all(aa.codon.totals, "dG.Diff.Codons", "dG.Diff.Codons ~ Freq", label.name = "Change in Secondary Structure",
sort_x = "AA")
Finally, let's factor in codon rarity and see how the observed data deviates from it, to see if there are any
codon_plot_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ dG + GC + Rel.Codon.Freq.Diff.Codons + Freq",
label.name = "Protein Level", sort_x = "Freq.Genome")
Let's see if we can generate an ANOVA model using this information.
codon.aa.freq[with(codon.aa.freq, Freq.AA == 0), ]$Freq.Codon <- -1
ngs_codons <- merge(subset(ngs, !Insuff.Prot & Bin.Pct.1 < 1/12 & Bin.Pct.12 <
1/2), cast(transform(codon.aa.freq, Freq.Codon.Factor = as.factor(Freq.Codon)),
CDS.seq + Gene ~ Codon, value = "Freq.Codon.Factor"))
codon.aa.freq[with(codon.aa.freq, Freq.AA == 0), ]$Freq.Codon <- NA
ngs_codons <- merge(subset(ngs, !Insuff.Prot & Bin.Pct.1 < 1/12 & Bin.Pct.12 <
1/2), cast(transform(codon.aa.freq, Freq.Codon.Factor = Freq.Codon), CDS.seq +
Gene ~ Codon, value = "Freq.Codon.Factor"))
formula <- as.formula(log10(Prot) ~ Promoter + RBS + Gene + dG + GC + AAA +
AAC + AAG + AAT + ACA + ACC + ACG + ACT + AGA + AGC + AGG + AGT + ATA +
ATC + ATT + CAA + CAC + CAG + CAT + CCA + CCC + CCG + CCT + CGA + CGC +
CGG + CGT + CTA + CTC + CTG + CTT + GAA + GAC + GAG + GAT + GCA + GCC +
GCG + GCT + GGA + GGC + GGG + GGT + GTA + GTC + GTG + GTT + TAC + TAT +
TCA + TCC + TCG + TCT + TGC + TGG + TGT + TTA + TTC + TTG + TTT)
formula <- as.formula(Prot.FoldChange.Codons ~ dG + GC + AAA + AAC + AAG + AAT +
ACA + ACC + ACG + ACT + AGA + AGC + AGG + AGT + ATA + ATC + ATT + CAA +
CAC + CAG + CAT + CCA + CCC + CCG + CCT + CGA + CGC + CGG + CGT + CTA +
CTC + CTG + CTT + GAA + GAC + GAG + GAT + GCA + GCC + GCG + GCT + GGA +
GGC + GGG + GGT + GTA + GTC + GTG + GTT + TAC + TAT + TCA + TCC + TCG +
TCT + TGC + TGG + TGT + TTA + TTC + TTG + TTT)
formula <- as.formula(Prot ~ Promoter + RBS + Gene + dG + AAA + AAC + AAG +
AAT + ACA + ACC + ACG + ACT + AGA + AGC + AGG + AGT + ATA + ATC + ATT +
CAA + CAC + CAG + CAT + CCA + CCC + CCG + CCT + CGA + CGC + CGG + CGT +
CTA + CTC + CTG + CTT + GAA + GAC + GAG + GAT + GCA + GCC + GCG + GCT +
GGA + GGC + GGG + GGT + GTA + GTC + GTG + GTT + TAC + TAT + TCA + TCC +
TCG + TCT + TGC + TGG + TGT + TTA + TTC + TTG + TTT)
# missing ATG, TAA, TAG, TGA, ATA, CAC
formula <- as.formula(Prot.FoldChange.Codons ~ dG + GC + Gene + CAG)
formula <- as.formula(Prot.FoldChange.Codons ~ dG + GC + Gene + CAG + AAT)
# ATA, ATC, ATT all fail in above
# linear model
clm <- lm(formula, data = ngs_codons)
caov <- Anova(clm)
csos <- data.frame(Component = rownames(caov), SumSqPct = caov$"Sum Sq"/sum(caov$"Sum Sq",
na.rm = T) * 100)
csos <- csos[order(csos$SumSqPct), ]
csos
## Component SumSqPct
## 5 AAT 0.8549
## 1 dG 1.7688
## 4 CAG 2.1219
## 2 GC 5.2104
## 3 Gene 7.2496
## 6 Residuals 82.7944
codon.aa.freq2 <- codon.aa.freq
codon.aa.freq2$Freq.Codon.Factor <- NA
codon.aa.freq2[with(codon.aa.freq2, Freq.AA == 0), ]$Freq.Codon.Factor <- -1
codon.aa.freq2[with(codon.aa.freq2, Freq.AA > 0), ]$Freq.Codon.Factor <- 0
codon.aa.freq2[with(codon.aa.freq2, Freq.AA > 0 & Freq.Codon > 0), ]$Freq.Codon.Factor <- 1
codon.aa.freq2$Freq.Codon.Factor <- as.factor(codon.aa.freq2$Freq.Codon.Factor)
ngs_codons2 <- merge(subset(ngs, !Insuff.Prot & Bin.Pct.1 < 1/12 & Bin.Pct.12 <
1/2), cast(codon.aa.freq2, CDS.seq + Gene ~ Codon, value = "Freq.Codon.Factor"))
formula <- as.formula(Prot.FoldChange.Codons ~ (dG + GC + RBS + Gene + dG)^2)
# linear model
clm <- lm(formula, data = ngs_codons2)
caov <- Anova(clm)
csos <- data.frame(Component = rownames(caov), SumSqPct = caov$"Sum Sq"/sum(caov$"Sum Sq",
na.rm = T) * 100)
csos <- csos[order(csos$SumSqPct), ]
csos
## Component SumSqPct
## 5 dG:GC 0.02860
## 8 GC:RBS 0.05635
## 6 dG:RBS 0.06700
## 9 GC:Gene 2.96157
## 7 dG:Gene 3.45020
## 1 dG 4.04290
## 10 RBS:Gene 4.79213
## 3 RBS 5.07138
## 2 GC 10.20335
## 4 Gene 13.19193
## 11 Residuals 56.13458
codon_plot_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ Freq",
"Protein Level", sort_x = "Freq.Genome")