Step 07 - Looking at Codon and tRNA Adaptation Indices

TAI Calcluation

I'm using codonR to calculate tRNA adaptation for my constructs. I'm getting this first set of commands from codonR/README.

I first had to run codonM on the coding sequence only, in perl. I also ran the same codonM script on the E. coli genome, for comparison.

$ perl codonR/codonM \
    <(perl -pe '/^[ATGCatgc]/ && s/.*([ATGC]{33})/$1/' \
        data/203.norestrict.fa) \
    data/203.codonR.m
$ perl codonR/codonM codonR/ecolik12.ffn codonR/ecolik12.m

source("codonR/tAI.R")
eco.trna <- scan("codonR/ecolik12.trna")
eco.ws <- get.ws(tRNA = eco.trna, sking = 1)
codonR.m <- matrix(scan("data/203.codonR.m"), ncol = 61, byrow = T)
codonR.m <- codonR.m[, -33]
tai <- get.tai(codonR.m, eco.ws)

# merge the tai scores with the gene names, and finally with NGS
names <- system("perl -ne \"/>/ && s/>// && print\" data/203.norestrict.fa", 
    intern = T)
ngs <- merge(ngs, data.frame(Name = names, tAI = as.numeric(tai)), by = "Name")

OK, so now we can look at the correlation of tAI with expression level:

multiplot(
    ggplot(melt(subset(ngs, Promoter=='BBaJ23100'), 
            measure.vars= c('RNA','Count.DNA','Prot','Trans')), 
            aes(x=tAI, color=RBS, y=value)) + 
        geom_point(alpha=0.05) + theme_bw() + stat_smooth(method=lm, se=F) +
        facet_wrap(RBS~variable, scale='free') +
        scale_x_continuous(name="Secondary Structure Free Energy") +
        scale_y_log10("Log10 of Dependent Variable") +
        opts(title="tRNA adaptation correlations (Strong Promoter)"),

    ggplot(melt(subset(ngs, Promoter=='BBaJ23108'), 
            measure.vars= c('RNA','Count.DNA','Prot','Trans')), 
            aes(x=tAI, color=RBS, y=value)) + 
        geom_point(alpha=0.05) + theme_bw() + stat_smooth(method=lm, se=F) +
        facet_wrap(RBS~variable, scale='free') +
        scale_x_continuous(name="Secondary Structure Free Energy") +
        scale_y_log10("Log10 of Dependent Variable") +
        opts(title="tRNA adaptation correlations (Weak Promoter)"),
cols=1)

I checked a few things to see if I did this right, because I expected to see a much larger effect.

Maybe I should compare this by gene as well:


ngs <- ddply(ngs, c('Gene','RBS','Promoter'), transform, 
    RNA.FoldChange.Codons= log2(RNA) - mean(log2(RNA), na.rm=T),
    DNA.FoldChange.Codons= log2(Count.DNA) - mean(log2(Count.DNA), na.rm=T),             
    Prot.FoldChange.Codons= log2(Prot) - mean(log2(Prot), na.rm=T),
    Fitness.Read_Ratio.FoldChange.Codons= log2(Fitness.Read_Ratio) - 
        mean(log2(Fitness.Read_Ratio), na.rm=T),
    Trans.FoldChange.Codons= log2(Trans) - mean(log2(Trans), na.rm=T),
    tAI.Diff.Codons = tAI - mean(tAI, na.rm=T))

multiplot(
    ggplot(melt(subset(ngs, Promoter=='BBaJ23100'), 
            measure.vars= paste(c('RNA','Fitness.Read_Ratio','Prot','Trans'), 
                                '.FoldChange.Codons', sep='')), 
            aes(x=tAI, color=RBS, y=value)) + 
        geom_point(alpha=0.05) + theme_bw() + stat_smooth(method=lm, se=F) +
        facet_wrap(RBS~variable, scale='free') +
        scale_x_continuous(name="Secondary Structure Free Energy") +
        scale_y_log10("Log10 of Dependent Variable") +
        labs(title="tRNA adaptation correlations  (Strong Promoter)"),

    ggplot(melt(subset(ngs, Promoter=='BBaJ23108'), 
            measure.vars= paste(c('RNA','Fitness.Read_Ratio','Prot','Trans'), 
                                '.FoldChange.Codons', sep='')), 
            aes(x=tAI, color=RBS, y=value)) + 
        geom_point(alpha=0.05) + theme_bw() + stat_smooth(method=lm, se=F) +
        facet_wrap(RBS~variable, scale='free') +
        scale_x_continuous(name="Secondary Structure Free Energy") +
        scale_y_log10("Log10 of Dependent Variable") +
        lab(title="tRNA adaptation correlations (Weak Promoter)"),
cols=1)

## Error: could not find function "lab"

I will perform an ANOVA. First let's look at percent explained variation of raw protein level:

tai.lm <- lm(log10(Prot) ~ Promoter + RBS + Gene * tAI, data = ngs)
tai.aov <- Anova(tai.lm)
data.frame(Effect = rownames(tai.aov), Pct.Explained = tai.aov$"Sum Sq"/sum(tai.aov$"Sum Sq") * 
    100)

##      Effect Pct.Explained
## 1  Promoter        30.167
## 2       RBS        19.841
## 3      Gene        13.775
## 4       tAI         1.045
## 5  Gene:tAI         1.084
## 6 Residuals        34.088

Now let's look at percent explained variation of the relative fold change in expression within a promoter, rbs, and gene combination. Since we're looking within promoter and gene, the only components to this ANOVA are tAI and gene identity.

tai.lm <- lm(Prot.FoldChange.Codons ~ tAI * Gene, data = ngs)
tai.aov <- Anova(tai.lm)
data.frame(Effect = rownames(tai.aov), Pct.Explained = tai.aov$"Sum Sq"/sum(tai.aov$"Sum Sq") * 
    100)

##      Effect Pct.Explained
## 1       tAI        3.8638
## 2      Gene        0.8064
## 3  tAI:Gene        3.9533
## 4 Residuals       91.3765

The effect is very very small; tAI seems to account for only about 1% of variance and 3% of the variance within a set of codon variants for one gene. I remember this being much larger! To shake my concerns about the data being calculated incorrectly (either by codonM, or by me), I will plot the data like I did earlier, in a large grid by gene, cds type, promoter, and RBS.

ggplot(ngs, aes(x = reorder(Gene, log10(Prot), mean), y = CDS.type)) + geom_tile(aes(fill = rescale(log10(Prot), 
    c(-1, 1)))) + opts(panel.background = theme_rect(fill = "gray80"), axis.ticks = theme_blank()) + 
    scale_fill_gradient2(low = "darkred", mid = "yellow", high = "darkgreen") + 
    facet_grid(RBS ~ Promoter) + opts(plot.title = theme_text(size = 14, lineheight = 0.8, 
    face = "bold"), legend.position = FALSE, axis.text.x = theme_text(angle = -90))

## Error: Element legend.position must be a string or numeric vector.

The effect is there and definitely quite strong. We can check and see if TAI is correctly measured:


ngs$Gene <- with(ngs, reorder(Gene, log10(Prot), mean))

ggplot(subset(ngs, Promoter == "BBaJ23100" & RBS == "BB0032"), aes(x = Gene, 
    y = CDS.type)) + geom_tile(aes(fill = tAI)) + opts(panel.background = theme_rect(fill = "gray80"), 
    axis.ticks = theme_blank()) + scale_fill_gradient2(low = "blue", mid = "beige", 
    high = "red", midpoint = mean(ngs$tAI), limits = range(ngs$tAI)) + opts(plot.title = theme_text(size = 14, 
    lineheight = 0.8, face = "bold"), axis.text.x = theme_text(angle = -90))

plot of chunk 07-03-plot_prot_grid

It looks correctly measured and it also looks like quite a strong correlation! In addition to the 'max rare' codon variants all having very low tAI, it looks like the first ~¼ of the genes with the lowest expression (X axis is sorted by mean Protein level) all have lower tAIs.

So why are the trend lines so flat? Let's look at just the min and max codons:

multiplot(
    ggplot(melt(subset(ngs, 
            Promoter=='BBaJ23100' & grepl('Rare',ngs$CDS.type)), 
            measure.vars= c('RNA','Count.DNA','Prot','Trans')), 
            aes(x=tAI, group=CDS.type, color=CDS.type, y=value)) + 
        geom_point(alpha=0.15) + theme_bw() +
        facet_wrap(RBS~variable, scale='free') +
        scale_x_continuous(name="Secondary Structure Free Energy") +
        scale_y_log10("Log10 of Dependent Variable") +
        opts(title="tRNA adaptation correlations (Strong Promoter)") +
        geom_boxplot(fill=NA, outlier.shape=NA),

    ggplot(melt(subset(ngs, 
            Promoter=='BBaJ23108' & grepl('Rare',ngs$CDS.type)), 
            measure.vars= c('RNA','Count.DNA','Prot','Trans')), 
            aes(x=tAI, group=CDS.type, color=CDS.type, y=value)) + 
        geom_point(alpha=0.15) + theme_bw() +
        facet_wrap(RBS~variable, scale='free') +
        scale_x_continuous(name="Secondary Structure Free Energy") +
        scale_y_log10("Log10 of Dependent Variable") +
        opts(title="tRNA adaptation correlations (Weak Promoter)") +
        geom_boxplot(fill=NA, outlier.shape=NA),
cols=1)

Alright, there is an effect here, but why is it so strong for RNA?. The max protein measurement might be washing out the effect, perhaps? Or the fitness cost makes the cells divide more slowly, increasing the amount of RNA per cell? Something weird is definitely going on here. I split the tAI into regions and plot it with boxplots:

multiplot(ggplot(melt(subset(ngs, Promoter == "BBaJ23100"), measure.vars = c("RNA", 
    "Count.DNA", "Prot", "Trans")), aes(y = value, x = cut(tAI, breaks = 5), 
    color = RBS)) + geom_boxplot(alpha = 0.15) + theme_bw() + facet_wrap(RBS ~ 
    variable, scale = "free") + opts(title = "tRNA adaptation correlations (Strong Promoter)") + 
    scale_y_log10("Log10 of Dependent Variable") + geom_boxplot(fill = NA, outlier.shape = NA), 
    ggplot(melt(subset(ngs, Promoter == "BBaJ23108"), measure.vars = c("RNA", 
        "Count.DNA", "Prot", "Trans")), aes(y = value, x = cut(tAI, breaks = 5), 
        color = RBS)) + geom_boxplot(alpha = 0.15) + theme_bw() + facet_wrap(RBS ~ 
        variable, scale = "free") + opts(title = "tRNA adaptation correlations (Weak Promoter)") + 
        scale_y_log10("Log10 of Dependent Variable") + geom_boxplot(fill = NA, 
        outlier.shape = NA), cols = 1)

plot of chunk 07-05-plot_tAI_boxplot

So it looks like max/min is a strong effect but when I include the tAI measures for all sequences, the effect is not very strong (though still significant) when I include sequences not explicitly designed to have a high rare codon usage. What if we split it up into the extremes, the 1% and 99% quantiles:

multiplot(ggplot(melt(subset(ngs, Promoter == "BBaJ23100"), measure.vars = c("RNA", 
    "Count.DNA", "Prot", "Trans")), aes(y = value, x = cut(tAI, breaks = c(0, 
    quantile(ngs$tAI, c(0.01, 0.99)), 1), labels = c("1%", "mid", "99%")), color = RBS)) + 
    geom_boxplot(alpha = 0.15) + theme_bw() + facet_wrap(RBS ~ variable, scale = "free") + 
    opts(title = "tRNA adaptation correlations (Strong Promoter)") + scale_y_log10("Log10 of Dependent Variable") + 
    geom_boxplot(fill = NA, outlier.shape = NA), ggplot(melt(subset(ngs, Promoter == 
    "BBaJ23108"), measure.vars = c("RNA", "Count.DNA", "Prot", "Trans")), aes(y = value, 
    x = cut(tAI, breaks = c(0, quantile(ngs$tAI, c(0.01, 0.99)), 1), labels = c("1%", 
        "mid", "99%")), color = RBS)) + geom_boxplot(alpha = 0.15) + theme_bw() + 
    facet_wrap(RBS ~ variable, scale = "free") + opts(title = "tRNA adaptation correlations (Weak Promoter)") + 
    scale_y_log10("Log10 of Dependent Variable") + geom_boxplot(fill = NA, outlier.shape = NA), 
    cols = 1)

plot of chunk 07-05b-plot_tAI_extremes


quantile_list <- quantile(ngs$tAI, (1:19)/20)
quantile_labels <- c(names(quantile_list), "100%")

multiplot(ggplot(melt(subset(ngs, Promoter == "BBaJ23100"), measure.vars = c("RNA", 
    "Count.DNA", "Prot", "Trans")), aes(y = value, x = cut(tAI, breaks = c(0, 
    quantile_list, 1), labels = quantile_labels), color = RBS)) + geom_boxplot(fill = NA, 
    outlier.shape = NA) + theme_bw() + geom_jitter(alpha = 0.01) + facet_wrap(RBS ~ 
    variable, scale = "free") + opts(title = "Codon adaptation correlations (Strong Promoter)", 
    axis.text.x = theme_text(angle = -90, size = 6)) + scale_y_continuous("Log10 of Dependent Variable"), 
    ggplot(melt(subset(ngs, Promoter == "BBaJ23108"), measure.vars = c("RNA", 
        "Count.DNA", "Prot", "Trans")), aes(y = value, x = cut(tAI, breaks = c(0, 
        quantile_list, 1), labels = quantile_labels), color = RBS)) + geom_boxplot(fill = NA, 
        outlier.shape = NA) + theme_bw() + geom_jitter(alpha = 0.01) + facet_wrap(RBS ~ 
        variable, scale = "free") + opts(title = "Codon adaptation correlations (Weak Promoter)", 
        axis.text.x = theme_text(angle = -90, size = 6)) + scale_y_continuous("Log10 of Dependent Variable"), 
    cols = 1)

So even plotting the top 1% and the bottom 1% of tAI values is not as strong as the difference between min/max rare codon types. It seems as if the min/max rare constructs have some other property that tAI alone is not catching. Could it be re-use of the same tRNA for consecutive codons? Some other metric of codon usage? It is unclear, but I will have to explore further.

Relative Codon Frequencies

Instead of tRNA adaptation index, what if we calculate the geometric mean of the codon frequencies for these 10 amino acids?

I've calculated the relative freuency for each codon like this:

\[ f_{\,codon} = \frac{freq_{\,codon,\,genome}}{freq_{\,AA,\,genome}} \]

The codon score for each gene is calculated like this:

\[ F_{gene} = \exp\left( \frac{1}{peptide\,length} \sum_{codon}^{n_{codons}} \ln\left( f_{\,codon} \cdot freq_{\,codon,\,gene}\right) \right) \]


load_genomic_codon_data <- function(codon_filename = paste(getwd(), "/data/codon_usage.txt", 
    sep = "")) {

    # Let's get codon names and AAs from this table:
    codon.txt <- read.table(file = paste(getwd(), "/data/codon_usage.txt", sep = ""), 
        sep = "\t", header = T)[1:64, 1:5]

    # remove the empty level, convert U to T
    codon.txt$Codon <- factor(gsub("U", "T", codon.txt$Codon))

    # name the rows by codon name for easy lookup
    rownames(codon.txt) <- codon.txt$Codon
    return(codon.txt)
}

codon.txt <- load_genomic_codon_data()

get_codon_freq <- function(seq) {
    seq <- as.character(seq[1, "CDS.seq"])
    codons <- factor(unlist(strsplit(seq, "(?<=\\G...)", perl = T)), levels = levels(codon.txt$Codon))
    codons <- as.data.frame(table(codons))
    names(codons) <- c("Codon", "Freq")
    return(codons)
}

codon.freq <- ddply(lib_seqs, .(CDS.seq), get_codon_freq)

# compute relative codon frequencies per amino acid
codon.txt <- merge(codon.txt, ddply(codon.txt, "AA", summarize, Codon = Codon, 
    Rel.Freq = Freq.Genome/sum(Freq.Genome)), by = c("AA", "Codon"))
# add relative frequencies to per-construct codon frequency table
codon.freq <- merge(codon.freq, codon.txt[, c("Codon", "Rel.Freq")], by = "Codon")
cds.codon.freq <- ddply(subset(codon.freq, Freq > 0 & Codon != "ATG"), "CDS.seq", 
    summarize, Rel.Codon.Freq = exp(1/10 * sum(Freq * log(Rel.Freq))))

# merge the codon frequencies with lib_seqs based on the CDS sequence
lib_seqs <- merge(cds.codon.freq, lib_seqs, by = "CDS.seq")
# merge from lib_seqs to ngs based on seq name
ngs <- merge(ngs, lib_seqs[, c("Name", "Rel.Codon.Freq")], by = "Name")

ggplot(subset(ngs, Promoter == "BBaJ23100" & RBS == "BB0032"), aes(x = Gene, 
    y = CDS.type)) + geom_tile(aes(fill = Rel.Codon.Freq)) + opts(panel.background = theme_rect(fill = "gray80"), 
    axis.ticks = theme_blank()) + scale_fill_gradient2(low = "blue", mid = "beige", 
    high = "red", midpoint = mean(ngs$Rel.Codon.Freq), limits = range(ngs$Rel.Codon.Freq)) + 
    opts(plot.title = theme_text(size = 14, lineheight = 0.8, face = "bold"), 
        axis.text.x = theme_text(angle = -90))

plot of chunk 07-06-calc_codon_freq


ngs <- ddply(ngs, c("Gene", "RBS", "Promoter"), transform, Rel.Codon.Freq.Diff.Codons = Rel.Codon.Freq - 
    mean(Rel.Codon.Freq, na.rm = T))

So this measure shows much better correlation with how Sri picked the min and max rare codons. Does it match tAI?

ggplot(ngs, aes(x = tAI, y = Rel.Codon.Freq)) + geom_point(alpha = 0.2) + theme_bw()

plot of chunk 07-07-codon_freq_vs_tAI

Let's look at the percent explained variation of this measure also:

rcf.lm <- lm(Prot.FoldChange.Codons ~ Rel.Codon.Freq * Gene, data = ngs)
rcf.aov <- Anova(rcf.lm)
data.frame(Effect = rownames(rcf.aov), Pct.Explained = rcf.aov$"Sum Sq"/sum(rcf.aov$"Sum Sq") * 
    100)

##                Effect Pct.Explained
## 1      Rel.Codon.Freq        4.8597
## 2                Gene        0.9269
## 3 Rel.Codon.Freq:Gene        6.5388
## 4           Residuals       87.6746

It's a much stronger effect that tAI. (4.8% versus 3.8%). The effect is stronger for some genes than others. Let's look at fitness also:

rcf.lm <- lm(Prot.FoldChange.Codons ~ Rel.Codon.Freq * Gene, data = ngs)
rcf.aov <- Anova(rcf.lm)
data.frame(Effect = rownames(rcf.aov), Pct.Explained = rcf.aov$"Sum Sq"/sum(rcf.aov$"Sum Sq") * 
    100)

##                Effect Pct.Explained
## 1      Rel.Codon.Freq        4.8597
## 2                Gene        0.9269
## 3 Rel.Codon.Freq:Gene        6.5388
## 4           Residuals       87.6746

Correlation of Relative Codon Frequency with Expression

Does it correlate better to measures of expression? I will look at 20 5% quantile chunks.

First I will look at log10 of each measure and then the standard deviations.


quantile_list <- quantile(ngs$Rel.Codon.Freq, (1:19)/20)
quantile_labels <- c(names(quantile_list), "100%")

multiplot(ggplot(melt(subset(ngs, Promoter == "BBaJ23100"), measure.vars = c("RNA", 
    "Count.DNA", "Prot", "Trans")), aes(y = log10(value), x = cut(Rel.Codon.Freq, 
    breaks = c(0, quantile_list, 1), labels = quantile_labels), color = RBS)) + 
    geom_boxplot(fill = NA, outlier.shape = NA) + theme_bw() + geom_jitter(alpha = 0.01) + 
    facet_wrap(RBS ~ variable, scale = "free") + opts(title = "Codon adaptation correlations (Strong Promoter)", 
    axis.text.x = theme_text(angle = -90, size = 6)) + scale_y_continuous("Log10 of Dependent Variable"), 
    ggplot(melt(subset(ngs, Promoter == "BBaJ23108"), measure.vars = c("RNA", 
        "Count.DNA", "Prot", "Trans")), aes(y = log10(value), x = cut(Rel.Codon.Freq, 
        breaks = c(0, quantile_list, 1), labels = quantile_labels), color = RBS)) + 
        geom_boxplot(fill = NA, outlier.shape = NA) + theme_bw() + geom_jitter(alpha = 0.01) + 
        facet_wrap(RBS ~ variable, scale = "free") + opts(title = "Codon adaptation correlations (Weak Promoter)", 
        axis.text.x = theme_text(angle = -90, size = 6)) + scale_y_continuous("Log10 of Dependent Variable"), 
    cols = 1)

plot of chunk 7.08-corr_codon_freq


multiplot(ggplot(ddply(melt(subset(ngs, Promoter == "BBaJ23100"), measure.vars = c("RNA", 
    "Count.DNA", "Prot", "Trans")), c("variable", "RBS"), transform, scaled = scale(value)), 
    aes(y = scaled, x = cut(Rel.Codon.Freq, breaks = c(0, quantile_list, 1), 
        labels = quantile_labels), color = RBS)) + geom_boxplot(fill = NA, outlier.shape = NA) + 
    theme_bw() + geom_jitter(alpha = 0.01) + facet_wrap(RBS ~ variable, scale = "free") + 
    opts(title = "Codon adaptation correlations by SD (Strong Promoter)", axis.text.x = theme_text(angle = -90, 
        size = 6)) + scale_y_continuous("x * SD of of Dependent Variable", limits = c(-4, 
    4)), ggplot(ddply(melt(subset(ngs, Promoter == "BBaJ23108"), measure.vars = c("RNA", 
    "Count.DNA", "Prot", "Trans")), c("variable", "RBS"), transform, scaled = scale(value)), 
    aes(y = scaled, x = cut(Rel.Codon.Freq, breaks = c(0, quantile_list, 1), 
        labels = quantile_labels), color = RBS)) + geom_boxplot(fill = NA, outlier.shape = NA) + 
    theme_bw() + geom_jitter(alpha = 0.01) + facet_wrap(RBS ~ variable, scale = "free") + 
    opts(title = "Codon adaptation correlations by SD (Weak Promoter)", axis.text.x = theme_text(angle = -90, 
        size = 6)) + scale_y_continuous("x * SD of Dependent Variable", limits = c(-4, 
    4)), cols = 1)

plot of chunk 7.08-corr_codon_freq

A 'chunk' consists of 711.7 sequences each on average. Here is the same data, not log10 adjusted:


quantile_list <- quantile(ngs$Rel.Codon.Freq, (1:19)/20)
quantile_labels <- c(names(quantile_list), "100%")

multiplot(ggplot(melt(subset(ngs, Promoter == "BBaJ23100"), measure.vars = c("RNA", 
    "Count.DNA", "Prot", "Trans")), aes(y = value, x = cut(Rel.Codon.Freq, breaks = c(0, 
    quantile_list, 1), labels = quantile_labels), color = RBS)) + geom_boxplot(fill = NA, 
    outlier.shape = NA) + theme_bw() + geom_jitter(alpha = 0.01) + facet_wrap(RBS ~ 
    variable, scale = "free") + opts(title = "Codon adaptation correlations (Strong Promoter)", 
    axis.text.x = theme_text(angle = -90, size = 6)) + scale_y_continuous("Dependent Variable ((NOT log10)"), 
    ggplot(melt(subset(ngs, Promoter == "BBaJ23108"), measure.vars = c("RNA", 
        "Count.DNA", "Prot", "Trans")), aes(y = value, x = cut(Rel.Codon.Freq, 
        breaks = c(0, quantile_list, 1), labels = quantile_labels), color = RBS)) + 
        geom_boxplot(fill = NA, outlier.shape = NA) + theme_bw() + geom_jitter(alpha = 0.01) + 
        facet_wrap(RBS ~ variable, scale = "free") + opts(title = "Codon adaptation correlations (Weak Promoter)", 
        axis.text.x = theme_text(angle = -90, size = 6)) + scale_y_continuous("Dependent Variable (NOT log10)"), 
    cols = 1)

plot of chunk 07-08-corr_codon_notlog

Lets try using protein level relative to the mean for that gene:

multiplot(ggplot(melt(subset(ngs, Promoter == "BBaJ23100"), measure.vars = c("RNA", 
    "Count.DNA", "Prot.FoldChange.Codons", "Trans")), aes(y = value, x = cut(Rel.Codon.Freq, 
    breaks = c(0, quantile_list, 1), labels = quantile_labels), color = RBS)) + 
    geom_boxplot(fill = NA, outlier.shape = NA) + theme_bw() + geom_jitter(alpha = 0.01) + 
    facet_wrap(RBS ~ variable, scale = "free") + opts(title = "Codon adaptation correlations (Strong Promoter)", 
    axis.text.x = theme_text(angle = -90, size = 6)) + scale_y_continuous("Gene-Relative Fold change of Dependent Variable"), 
    ggplot(melt(subset(ngs, Promoter == "BBaJ23108"), measure.vars = c("RNA", 
        "Count.DNA", "Prot.FoldChange.Codons", "Trans")), aes(y = value, x = cut(Rel.Codon.Freq, 
        breaks = c(0, quantile_list, 1), labels = quantile_labels), color = RBS)) + 
        geom_boxplot(fill = NA, outlier.shape = NA) + theme_bw() + geom_jitter(alpha = 0.01) + 
        facet_wrap(RBS ~ variable, scale = "free") + opts(title = "Codon adaptation correlations (Weak Promoter)", 
        axis.text.x = theme_text(angle = -90, size = 6)) + scale_y_continuous("Gene-Relative Fold change of Dependent Variable"), 
    cols = 1)

So it does look like rare codons matter, but it is clear that only using lots of the rarest codons makes a difference. Maybe we can find a 'cutoff' codon value and see which codons are present much more often in those sequences?

cdnfrq.get_sum_sq <- function(cutoff_pct) data.frame(cutoff_pct = cutoff_pct, 
    sum_sq = as.data.frame(Anova(lm(Prot ~ Promoter + RBS + dG + GC + (Rel.Codon.Freq < 
        cutoff_pct), data = ngs)))$"Sum Sq"[3])

cdnfrq.cutoff <- ldply(seq(0.15, 0.25, by = 0.005), cdnfrq.get_sum_sq)

## Error: object 'dG' not found


cdnfrq.best_cutoff <- cdnfrq.cutoff[which.max(cdnfrq.cutoff$sum_sq), "cutoff_pct"]

## Error: object 'cdnfrq.cutoff' not found

cdnfrq.lm <- lm(log10(Prot) ~ GC + dG + Promoter + RBS + Rel.Codon.Freq, data = ngs)

## Error: object 'GC' not found

cdnfrq.aov <- Anova(cdnfrq.lm)

## Error: object 'cdnfrq.lm' not found


cdnfrq.aov$"Pct Explained" <- cdnfrq.aov$"Sum Sq"/sum(cdnfrq.aov$"Sum Sq")

## Error: object 'cdnfrq.aov' not found

cdnfrq.aov

## Error: object 'cdnfrq.aov' not found


cdnfrq.scaled_residuals <- rescale(cdnfrq.lm$residuals/log10(subset(ngs, !is.na(dG) & 
    !is.na(Prot))$Prot), c(-1, 1))

## Error: object 'cdnfrq.lm' not found


ggplot(subset(ngs, !is.na(dG) & !is.na(Prot)), aes(x = reorder(Gene, log10(Prot), 
    mean), y = CDS.type)) + geom_tile(aes(fill = scaled_residuals)) + opts(panel.background = theme_rect(fill = "gray80"), 
    axis.ticks = theme_blank()) + scale_fill_gradient2(low = "blue", mid = "white", 
    high = "red") + facet_grid(RBS ~ Promoter) + opts(title = "Residuals after ANOVA on all measured variables so far", 
    plot.title = theme_text(size = 14, lineheight = 0.8, face = "bold"), legend.position = NA, 
    axis.text.x = theme_text(angle = -90))

## Error: object 'dG' not found

I was hoping there would be some rhyme or reason to the remaining variation, but it doesn't appear to be the case.