1 Introduction

This analysis compares codon usage bias (RSCU values) between Human and Yeast genomes. It computes codon frequencies, performs normality tests, and visualizes results through histograms, QQ plots, and heatmaps.

2 Load Libraries

library(Biostrings)
library(dplyr)
library(ggplot2)
library(plotly)
library(tidyr)
library(nortest)
library(htmltools)

3 Define Genome Files

genomes <- list(
  Human = "C:/Users/abdul/OneDrive/Desktop/Assignment 12/Assignment 12/Abdullah/Homo_sapiens.GRCh38.cdna.all.fa",
  Yeast = "C:/Users/abdul/OneDrive/Desktop/Assignment 12/Assignment 12/Abdullah/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa"
)

4 Function to Extract Codons

extract_cds_codons <- function(seq) {
  s <- as.character(seq)
  start_pos <- regexpr("ATG", s)[1]
  if (start_pos == -1) return(character(0))
  coding_seq <- substring(s, start_pos)
  codons <- substring(coding_seq,
                      seq(1, nchar(coding_seq), 3),
                      seq(3, nchar(coding_seq), 3))
  codons <- codons[nchar(codons) == 3]
  stops <- c("TAA", "TAG", "TGA")
  stop_index <- which(codons %in% stops)
  if (length(stop_index) > 0) codons <- codons[1:min(stop_index)]
  return(codons)
}

codon_table <- GENETIC_CODE
aa_df <- data.frame(codon = names(codon_table), aa = as.vector(codon_table))

5 Process Genomes

results_list <- list()
top_bottom_list <- list()

for (genome_name in names(genomes)) {
  cat("\nProcessing genome:", genome_name, "\n")
  fasta_file <- genomes[[genome_name]]
  cdna_seqs <- readDNAStringSet(fasta_file)
  
  all_codons <- unlist(lapply(cdna_seqs, extract_cds_codons), use.names = FALSE)
  
  codon_df <- data.frame(codon = all_codons) %>%
    count(codon, name = "obs_count") %>%
    inner_join(aa_df, by = "codon")
  
  rscu_df <- codon_df %>%
    group_by(aa) %>%
    mutate(expected = sum(obs_count) / n(),
           RSCU = obs_count / expected) %>%
    ungroup() %>%
    mutate(aa2 = ifelse(codon %in% c("TAA","TAG","TGA"), "*", aa))
  
  results_list[[genome_name]] <- rscu_df
  
  # Histogram
  print(ggplot(rscu_df, aes(x = RSCU)) +
    geom_histogram(binwidth = 0.25, fill = "steelblue", color = "black") +
    labs(title = paste0(genome_name, ": RSCU Distribution"), x = "RSCU", y = "Frequency") +
    theme_minimal())
  
  ad_test <- ad.test(rscu_df$RSCU)
  print(ad_test)
  
  if(ad_test$p.value > 0.05){
    print("RSCU Distribution is normal")
  } else {
    print("RSCU Distribution is not normal")
  }
  
  print(ggplot(rscu_df, aes(sample = RSCU)) +
    stat_qq(color = "darkred") +
    stat_qq_line(color = "black") +
    labs(title = "Q-Q Plot of RSCU Values", x = "Theoretical Quantiles", y = "Sample Quantiles") +
    theme_minimal())
  
  rscu_df <- rscu_df %>% mutate(RSCU_one = RSCU == 1)
  
  print(ggplot(rscu_df, aes(x = reorder(codon, RSCU), y = RSCU, fill = aa)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(title = paste0(genome_name, ": Codon Usage (RSCU)"), x = "Codon", y = "RSCU") +
    theme_minimal())
  
  heat_plot <- ggplot(rscu_df, aes(x = aa2, y = codon, fill = RSCU,
                                   text = paste0("Codon: ", codon,
                                                 "<br>Amino Acid: ", aa2,
                                                 "<br>RSCU: ", round(RSCU, 3)))) +
    geom_tile(color = "white") +
    geom_tile(data = subset(rscu_df, RSCU_one == TRUE),
              aes(x = aa2, y = codon),
              fill = NA, color = "grey", size = 0.6, inherit.aes = FALSE) +
    scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 1) +
    labs(title = paste0(genome_name, ": RSCU Heatmap"),
         x = "Amino Acid", y = "Codon", fill = "RSCU") +
    theme_minimal()
  
  #print(heat_plot)
  #print(ggplotly(heat_plot, tooltip = "text"))
  p <- ggplotly(heat_plot, tooltip = "text")
  p
  
  
  codon_table_df <- rscu_df %>%
    group_by(aa2) %>%
    summarize(
      most_used = codon[which.max(RSCU)],
      most_used_RSCU = RSCU[which.max(RSCU)],
      least_used = codon[which.min(RSCU)],
      least_used_RSCU = RSCU[which.min(RSCU)]
    ) %>%
    ungroup() %>%
    mutate(Genome = genome_name)
  
  top_bottom_list[[genome_name]] <- codon_table_df
}

Processing genome: Human Anderson-Darling normality test

data: rscu_df$RSCU A = 0.62371, p-value = 0.09993

[1] “RSCU Distribution is normal” Processing genome: Yeast Anderson-Darling normality test

data: rscu_df$RSCU A = 0.91519, p-value = 0.0187

[1] “RSCU Distribution is not normal”

6 Comparison Table

comparison_df <- bind_rows(top_bottom_list) %>%
  mutate(
    most_used = paste0(most_used, " (", round(most_used_RSCU, 3), ")"),
    least_used = paste0(least_used, " (", round(least_used_RSCU, 3), ")")
  ) %>%
  select(Genome, aa2, most_used, least_used) %>%
  pivot_longer(cols = c(most_used, least_used),
               names_to = "Usage",
               values_to = "Codon_RSCU") %>%
  unite("Genome_Usage", Genome, Usage, sep = "_") %>%
  pivot_wider(names_from = aa2, values_from = Codon_RSCU)

print(comparison_df)
## # A tibble: 4 × 22
##   Genome_Usage `*`   A     C     D     E     F     G     H     I     K     L    
##   <chr>        <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Human_most_… TGA … GCC … TGC … GAC … GAG … TTC … GGC … CAC … ATC … AAG … CTG …
## 2 Human_least… TAG … GCG … TGT … GAT … GAA … TTT … GGT … CAT … ATA … AAA … CTA …
## 3 Yeast_most_… TAA … GCT … TGT … GAT … GAA … TTT … GGT … CAT … ATT … AAA … TTG …
## 4 Yeast_least… TAG … GCG … TGC … GAC … GAG … TTC … GGG … CAC … ATC … AAG … CTC …
## # ℹ 10 more variables: M <chr>, N <chr>, P <chr>, Q <chr>, R <chr>, S <chr>,
## #   T <chr>, V <chr>, W <chr>, Y <chr>
write.csv(comparison_df, "codon_RSCU_comparison.csv", row.names = FALSE)
cat("\nSaved: codon_RSCU_comparison.csv\n")
## 
## Saved: codon_RSCU_comparison.csv

7 Summary Statistics (Must Run After Processing)

8 Answers to Guiding Questions

1. What is the maximum RSCU observed for the Human cDNA dataset? Do the same for your assigned organism.

In the human cDNA dataset, the maximum RSCU value is 2.42.
For yeast, the maximum RSCU value is 2.81.
Both of these values are well above 1, which means that in each species there is at least one codon that is used more than twice as often as expected under equal synonymous usage.

2. What is the minimum RSCU observed for the Human cDNA dataset? Do the same for your assigned organism.

In humans, the minimum RSCU is 0.35.
In yeast, the minimum RSCU is 0.26.
These very low values show that some codons are strongly avoided within their synonymous codon families.

3. Which codon shows the highest RSCU value in humans, and in your assigned organism?

The codon with the highest RSCU in humans is CTG, which encodes the amino acid L.
In yeast, the codon with the highest RSCU is AGA, encoding R.
These codons are clearly the “favorite” choices for those amino acids in each species.

4. Which codon shows the lowest RSCU value in humans, and in your assigned organism?

The lowest RSCU value in the human dataset is for codon TCG, which encodes S.
In yeast, the codon with the lowest RSCU is CGG, which encodes R.
These codons are rarely used and seem to be disfavored by each genome.

5. Are there any amino acids where humans and your assigned organism prefer different codons?

Yes. When I compare the heatmaps and the table, several amino acids show different preferred codons in humans and yeast. For example, the strongly preferred leucine codon in humans is not the same as in yeast, and the same is true for amino acids like serine and arginine. This means that even though both organisms use the same genetic code, they do not choose the same codons for the same amino acid.

6. Is there an amino acid where humans have a strongly preferred codon but your assigned organism does not (or vice versa)?

There are clear cases in both directions. Humans show a very strong preference for one particular codon for some amino acids, where yeast spreads its usage more evenly across different codons. The opposite also happens, where yeast has a very dominant codon that humans do not favor as strongly. This tells me that the strength of codon bias can be quite different between the two species for the same amino acid.

7. Do both species show similar usage patterns for synonymous codons of amino acids encoded by six codons?

For amino acids that are encoded by six codons, such as leucine, serine, and arginine, both species clearly show bias, but the detailed patterns are not the same. In both humans and yeast, one or two codons are used much more than the others, but the specific “best” codon differs. So the overall idea of biased usage is shared, but the exact pattern of which codons are preferred is species-specific.

8. Are stop codons (TAA, TAG, TGA) preferred differently between the two species?

Yes. Looking at the stop codons in the results, humans and yeast do not use TAA, TAG, and TGA equally, and their relative preferences differ between the two genomes. One species relies more heavily on one stop codon, while the other favors a different one, and TAG tends to be the least used. This shows that even termination codons follow species-specific usage patterns.

9. Are the overall patterns of codon usage broadly conserved, or do the heatmaps reveal species-specific signatures?

The heatmaps show that codon usage is not completely conserved. There are some broad similarities, such as the fact that both organisms have biased usage and do not treat synonymous codons equally. However, the exact positions of the most enriched and most depleted codons are different for many amino acids. This creates a kind of “signature” pattern for each species, so I would say the heatmaps clearly reveal species-specific codon usage.

10. If there are differences, could they reflect evolutionary divergence?

I think the differences we see are very likely a result of evolutionary divergence. Humans and yeast are distantly related and have been evolving separately for a very long time, under different ecological and physiological conditions. Over time, natural selection and mutation would shape codon usage to fit each species’ own tRNA pools, growth rates, and gene expression needs, which would naturally lead to the distinct patterns that show up in the RSCU analysis.

11. Biologically speaking, what could be the cause of the observed differences? Translation efficiency? tRNA availability and usage?

The most reasonable explanation is a combination of translation efficiency and tRNA availability. If a species maintains a high number of tRNAs for a certain codon, that codon can be translated more quickly and accurately, so it becomes favored and its RSCU increases. Codons that match rare tRNAs tend to be slow or error prone and end up with very low RSCU values. On top of that, differences in genomic GC content, mutation bias, and expression levels of specific genes all contribute to the codon usage patterns we see.

12. You wish to express a human gene in your assigned organism. How does RSCU inform on codon optimization?

If I want to express a human gene efficiently in yeast, the RSCU analysis is extremely helpful. It tells me which codons yeast prefers for each amino acid. I can then redesign the human coding sequence by replacing rare human codons with synonymous codons that have high RSCU values in yeast, without changing the amino acid sequence of the protein. This codon optimization should make translation in yeast faster and more efficient, increase protein yield, and reduce the chance that the expression will be limited by tRNA availability.