Step 10 - Looking at amino acid and codon usage

We will split up the coding sequences into codons and look at codon counts. We can then use individual codon counts or various related metrics (tAI, CAI, amino acid charge, etc) to examine the effects on DNA, RNA, and protein levels.

Splitting each CDS into codon frequencies.


load_genomic_codon_data <- function(codon_filename = paste(getwd(), "/data/codon_usage.txt", 
    sep = "")) {

    # Let's get codon names and AAs from this table:
    codon.txt <- read.table(file = paste(getwd(), "/data/codon_usage.txt", sep = ""), 
        sep = "\t", header = T)[1:64, 1:5]

    # remove the empty level, convert U to T
    codon.txt$Codon <- factor(gsub("U", "T", codon.txt$Codon))

    # name the rows by codon name for easy lookup
    rownames(codon.txt) <- codon.txt$Codon
    return(codon.txt)
}

codon.txt <- load_genomic_codon_data()

get_codon_freq <- function(seq) {
    seq <- as.character(seq[1, "CDS.seq"])
    codons <- factor(unlist(strsplit(seq, "(?<=\\G...)", perl = T)), levels = levels(codon.txt$Codon))
    codons <- as.data.frame(table(codons))
    names(codons) <- c("Codon", "Freq")
    return(codons)
}

# Generate per-sequence codon frequencies
codon.freq <- ddply(lib_seqs, .(CDS.seq), get_codon_freq)

# Add Gene Name to codon frequencies
codon.freq <- merge(codon.freq, unique(lib_seqs[, c("CDS.seq", "Gene")]), by = "CDS.seq")
# Add AA name to codon frequencies
codon.freq <- merge(codon.freq, codon.txt[, c("AA", "Codon")])

codon.totals <- cast(codon.freq, Codon ~ ., value = .(Freq), function(x) sum(as.integer(x)))
codon.totals <- cbind(codon.totals, matrix(unlist(strsplit(as.character(codon.totals$Codon), 
    split = ""), recursive = F), ncol = 3, byrow = T))
names(codon.totals) <- c("Codon", "Freq", "First", "Second", "Third")

# Add gene count - how many genes this codon appears in
codon.totals$Count.Gene <- rowSums(cast(codon.freq[, c(1, 2, 4, 3)], Codon ~ 
    Gene, sum) > 0, na.rm = T)

# merge, make display column
codon.totals <- merge(codon.totals, codon.txt, by = "Codon")
codon.totals$Display <- with(codon.totals, paste(Codon, " (", AA, ") - ", Count.Gene, 
    sep = ""))

Amino Acid Frequencies per Gene

Normalized Codon Expression per Amino Acid

I want a count of the number of each amino acid per gene, and the normalized ratio of each codon by the number of times the codon's amino acid appears in a gene.


get_aa_freq <- function(seq) {
    seq <- as.character(seq[1, "CDS.seq"])
    codons <- factor(unlist(strsplit(seq, "(?<=\\G...)", perl = T)), levels = levels(codon.txt$Codon))
    codons <- as.data.frame(table(codons))
    names(codons) <- c("Codon", "Freq")
    aa <- ddply(merge(codons, codon.txt[, c("Codon", "AA")]), "AA", summarize, 
        Freq = sum(Freq))
    return(aa)
}

# generate an amino acid per-gene frequency table and total counts table
aa.freq <- ddply(lib_seqs, .(CDS.seq), get_aa_freq)
codon.aa.freq <- merge(codon.freq, aa.freq, by = c("AA", "CDS.seq"), suffixes = c(".Codon", 
    ".AA"))

aa.freq <- merge(aa.freq, unique(lib_seqs[, c("CDS.seq", "Gene")]), by = "CDS.seq")
aa.totals <- cast(aa.freq, AA ~ ., value = .(Freq), function(x) sum(as.integer(x)))
names(aa.totals) <- c("AA", "Freq.AA")

# merge with codon total counts, get percentages
aa.codon.totals <- merge(aa.totals, codon.totals, by = "AA")
aa.codon.totals$Codon.Pct.Freq <- with(aa.codon.totals, Freq/Freq.AA)

How many times does each amino acid appear per gene?

# get aa by gene counts
aa.by_gene <- ddply(aa.freq, c("Gene", "AA"), summarize, Freq = mean(Freq))
aa.by_gene$Freq.Factor <- as.factor(aa.by_gene$Freq)

aa.gene_table <- with(aa.by_gene, table(AA, Freq.Factor))
print(xtable(aa.gene_table), type = "html")
0 1 2 3 4 5
0 0 0 0 0 0
A 58 52 22 5 0 0
B 137 0 0 0 0 0
C 128 9 0 0 0 0
D 94 38 5 0 0 0
E 85 38 11 2 1 0
F 90 39 7 1 0 0
G 92 32 12 1 0 0
H 120 16 1 0 0 0
I 49 53 28 7 0 0
K 61 54 20 1 1 0
L 38 47 36 13 2 1
M 0 123 12 2 0 0
N 94 34 8 1 0 0
O 137 0 0 0 0 0
P 88 41 8 0 0 0
Q 79 46 10 2 0 0
R 73 49 13 2 0 0
S 62 55 19 1 0 0
T 58 61 15 3 0 0
U 137 0 0 0 0 0
V 72 52 11 0 2 0
W 125 10 2 0 0 0
Y 111 26 0 0 0 0

Let's take a look at what the codon distribution per amino acid looks like.


label_aa_count <- function(aa) paste(aa, aa.totals[aa.totals$AA == aa, "Freq.AA"], 
    sep = ": ")

aa.codon.totals$AA.Display <- unlist(lapply(aa.codon.totals$AA, label_aa_count))

ggplot(subset(aa.codon.totals, Freq.AA > 0), aes(y = Codon.Pct.Freq, x = Codon, 
    label = Freq)) + geom_bar() + facet_wrap(~AA.Display, scale = "free_x") + 
    geom_text(colour = "black", vjust = -0.5) + theme_bw() + labs(title = "Codon Frequency Per Amino Acid")

plot of chunk 10_03-codon_freq_per_aa

Codon Frequency Correlations

Here is a simple table with the codon frequency in the whole library:

ggplot(codon.totals, aes(x = 1, y = Third, fill = log10(Freq), label = Display)) + 
    geom_tile() + facet_grid(First ~ Second, labeller = label_both) + geom_text(colour = "white") + 
    labs(title = "Absolue Codon Frequency across Library")

plot of chunk unnamed-chunk-2

It looks like there are enough instances of most of these codons to be able to look at correlation.

Codon Frequency vs. Protein Level

Now let's merge codon.freq with the ngs dataframe so that we can see if there is a correlation between the frequency of any of the codons and the Protein or (unlikely) RNA levels.

We should also see how many unique peptides are represented by each amino acid, so we'll put that number in parenthesis.

codon_plot_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ Freq", 
    "Protein Level")

plot of chunk 10_04a-codon-prot-corr_grid_plot

Quite a few amino acids seem to have a correlation with Protein level.

For the grids on the left, light-grey text means the pearson fell below the threshold of significance (0.05/61) corrected for the number of codons. (This is how Pilpel's Genome Biology paper does it in Figure 5).

The linear model here takes into account only frequency of codons, not secondary structure or GC changes that might arise from the codon usage.

Let's do it again and sort by codon occurrences in the genome:

codon_plot_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ Freq", 
    "Protein Level", sort_x = "Freq.Genome")

plot of chunk 10_04b-codon-prot-corr_grid_plot

The rare codons (right hand side of bar graph x axes) seem to increase expression generally, and the common ones decrease it, like we saw before.

CAA, CAG, and AGG all have very strong correlations, CAG is negative, the other two are positive. R, Q, and N seem to be the amino acids with the strongest correlations overall.

Even when we account for dG and GC content, the correlation remains strong.

codon_plot_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ dG + GC + Freq", 
    "Protein Level", sort_x = "Freq.Genome")

plot of chunk 10_04c-codon-prot-corr_grid_plot

Codon Frequency for a few examples:

Let's look at a few strong codon examples and see what the protein fold-change looks like for each.


codon.prot <- codon_corr_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ dG + GC + Freq")

pos_corr_codons <- as.character(head(subset(codon.prot, lm.slope > 0)[order(subset(codon.prot, 
    lm.slope > 0)$lm.p.value), ], 5)$Codon)

codon_indiv_plot(pos_corr_codons)

plot of chunk 10_04d-codon-prot-corr-pos


codon_indiv_plot(pos_corr_codons, facet = "aa")

plot of chunk 10_04d-codon-prot-corr-pos


codon_indiv_plot(pos_corr_codons, facet = "promo_rbs")

plot of chunk 10_04d-codon-prot-corr-pos

The first plot shows change in protein level by codon variant based on the number of times that codon appears in the codon variant. The second plot splits it up based on the total occurrences of the amino acid.

The correlations look similar, but not as impressive, for the negatively correlated codons:


codon.prot <- codon_corr_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ dG + GC + Freq")

neg_corr_codons <- as.character(head(subset(codon.prot, lm.slope < 0)[order(subset(codon.prot, 
    lm.slope < 0)$lm.p.value), ], 5)$Codon)

codon_indiv_plot(neg_corr_codons, , bp_color = "red")

plot of chunk 10_04e-codon-prot-corr-pos


codon_indiv_plot(neg_corr_codons, facet = "aa", bp_color = "red")

plot of chunk 10_04e-codon-prot-corr-pos


codon_indiv_plot(neg_corr_codons, facet = "promo_rbs", bp_color = "red")

plot of chunk 10_04e-codon-prot-corr-pos

Codon Frequency vs. RNA Level

We should see less correlation with RNA, so let's check:

codon_plot_all(aa.codon.totals, "RNA.FoldChange.Codons", "RNA.FoldChange.Codons ~ dG + GC + Freq", 
    "RNA Level", sort_x = "AA")

plot of chunk 10_04f-codon-rna-corr

This is interesting. It looks similar to the Protein level. However, I'm not sure what to think of it. Codons should not affect RNA production, unless it's somehow tied into either secondary structure, or perhaps growth rate. However, the growth rate term should be removed since we divide by DNA. Speaking of which, what about DNA?

codon_plot_all(aa.codon.totals, "DNA.FoldChange.Codons", "DNA.FoldChange.Codons ~ dG + GC + Freq", 
    dep_var = "DNA Level", sort_x = "AA")
## Error: object 'DNA.FoldChange.Codons' not found

The DNA correlation is weaker and slightly different qualitatively; the ANOVA percent explained is less than for protein or RNA.

These plots are somewhat different than the data (Fig. 5) in The 2011 Genome Biology paper by Navon and Pilpel:

Navon and Pilpel 2011 - Fig 5

OK, so this is an interesting effect. Finally, what if we look at the translation efficiency, Protein / RNA?

Codon Frequency vs. Translation Efficiency

codon_plot_all(aa.codon.totals, "Trans.FoldChange.Codons", "Trans.FoldChange.Codons ~ dG + GC + Freq", 
    label.name = "Translation Efficiency", sort_x = "AA")

plot of chunk 10_05-teff-trans-corr

codon_plot_all(aa.codon.totals, "dG.Diff.Codons", "dG.Diff.Codons ~ Freq", label.name = "Change in Secondary Structure", 
    sort_x = "AA")

plot of chunk 10_06-dG-trans-corr

Finally, let's factor in codon rarity and see how the observed data deviates from it, to see if there are any

codon_plot_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ dG + GC + Rel.Codon.Freq.Diff.Codons + Freq", 
    label.name = "Protein Level", sort_x = "Freq.Genome")

plot of chunk 10_07-freq-trans-corr

ANOVA Modeling

Let's see if we can generate an ANOVA model using this information.

codon.aa.freq[with(codon.aa.freq, Freq.AA == 0), ]$Freq.Codon <- -1
ngs_codons <- merge(subset(ngs, !Insuff.Prot & Bin.Pct.1 < 1/12 & Bin.Pct.12 < 
    1/2), cast(transform(codon.aa.freq, Freq.Codon.Factor = as.factor(Freq.Codon)), 
    CDS.seq + Gene ~ Codon, value = "Freq.Codon.Factor"))

codon.aa.freq[with(codon.aa.freq, Freq.AA == 0), ]$Freq.Codon <- NA
ngs_codons <- merge(subset(ngs, !Insuff.Prot & Bin.Pct.1 < 1/12 & Bin.Pct.12 < 
    1/2), cast(transform(codon.aa.freq, Freq.Codon.Factor = Freq.Codon), CDS.seq + 
    Gene ~ Codon, value = "Freq.Codon.Factor"))

formula <- as.formula(log10(Prot) ~ Promoter + RBS + Gene + dG + GC + AAA + 
    AAC + AAG + AAT + ACA + ACC + ACG + ACT + AGA + AGC + AGG + AGT + ATA + 
    ATC + ATT + CAA + CAC + CAG + CAT + CCA + CCC + CCG + CCT + CGA + CGC + 
    CGG + CGT + CTA + CTC + CTG + CTT + GAA + GAC + GAG + GAT + GCA + GCC + 
    GCG + GCT + GGA + GGC + GGG + GGT + GTA + GTC + GTG + GTT + TAC + TAT + 
    TCA + TCC + TCG + TCT + TGC + TGG + TGT + TTA + TTC + TTG + TTT)

formula <- as.formula(Prot.FoldChange.Codons ~ dG + GC + AAA + AAC + AAG + AAT + 
    ACA + ACC + ACG + ACT + AGA + AGC + AGG + AGT + ATA + ATC + ATT + CAA + 
    CAC + CAG + CAT + CCA + CCC + CCG + CCT + CGA + CGC + CGG + CGT + CTA + 
    CTC + CTG + CTT + GAA + GAC + GAG + GAT + GCA + GCC + GCG + GCT + GGA + 
    GGC + GGG + GGT + GTA + GTC + GTG + GTT + TAC + TAT + TCA + TCC + TCG + 
    TCT + TGC + TGG + TGT + TTA + TTC + TTG + TTT)

formula <- as.formula(Prot ~ Promoter + RBS + Gene + dG + AAA + AAC + AAG + 
    AAT + ACA + ACC + ACG + ACT + AGA + AGC + AGG + AGT + ATA + ATC + ATT + 
    CAA + CAC + CAG + CAT + CCA + CCC + CCG + CCT + CGA + CGC + CGG + CGT + 
    CTA + CTC + CTG + CTT + GAA + GAC + GAG + GAT + GCA + GCC + GCG + GCT + 
    GGA + GGC + GGG + GGT + GTA + GTC + GTG + GTT + TAC + TAT + TCA + TCC + 
    TCG + TCT + TGC + TGG + TGT + TTA + TTC + TTG + TTT)



# missing ATG, TAA, TAG, TGA, ATA, CAC

formula <- as.formula(Prot.FoldChange.Codons ~ dG + GC + Gene + CAG)

formula <- as.formula(Prot.FoldChange.Codons ~ dG + GC + Gene + CAG + AAT)

# ATA, ATC, ATT all fail in above

# linear model
clm <- lm(formula, data = ngs_codons)
caov <- Anova(clm)
csos <- data.frame(Component = rownames(caov), SumSqPct = caov$"Sum Sq"/sum(caov$"Sum Sq", 
    na.rm = T) * 100)
csos <- csos[order(csos$SumSqPct), ]
csos
##   Component SumSqPct
## 5       AAT   0.8549
## 1        dG   1.7688
## 4       CAG   2.1219
## 2        GC   5.2104
## 3      Gene   7.2496
## 6 Residuals  82.7944


codon.aa.freq2 <- codon.aa.freq
codon.aa.freq2$Freq.Codon.Factor <- NA
codon.aa.freq2[with(codon.aa.freq2, Freq.AA == 0), ]$Freq.Codon.Factor <- -1
codon.aa.freq2[with(codon.aa.freq2, Freq.AA > 0), ]$Freq.Codon.Factor <- 0
codon.aa.freq2[with(codon.aa.freq2, Freq.AA > 0 & Freq.Codon > 0), ]$Freq.Codon.Factor <- 1
codon.aa.freq2$Freq.Codon.Factor <- as.factor(codon.aa.freq2$Freq.Codon.Factor)

ngs_codons2 <- merge(subset(ngs, !Insuff.Prot & Bin.Pct.1 < 1/12 & Bin.Pct.12 < 
    1/2), cast(codon.aa.freq2, CDS.seq + Gene ~ Codon, value = "Freq.Codon.Factor"))

formula <- as.formula(Prot.FoldChange.Codons ~ (dG + GC + RBS + Gene + dG)^2)

# linear model
clm <- lm(formula, data = ngs_codons2)
caov <- Anova(clm)
csos <- data.frame(Component = rownames(caov), SumSqPct = caov$"Sum Sq"/sum(caov$"Sum Sq", 
    na.rm = T) * 100)
csos <- csos[order(csos$SumSqPct), ]
csos
##    Component SumSqPct
## 5      dG:GC  0.02860
## 8     GC:RBS  0.05635
## 6     dG:RBS  0.06700
## 9    GC:Gene  2.96157
## 7    dG:Gene  3.45020
## 1         dG  4.04290
## 10  RBS:Gene  4.79213
## 3        RBS  5.07138
## 2         GC 10.20335
## 4       Gene 13.19193
## 11 Residuals 56.13458


codon_plot_all(aa.codon.totals, "Prot.FoldChange.Codons", "Prot.FoldChange.Codons ~ Freq", 
    "Protein Level", sort_x = "Freq.Genome")

plot of chunk unnamed-chunk-3