This analysis compares codon usage bias (RSCU values) between Human and Chicken genomes. It computes codon frequencies, performs normality tests, and visualizes results through histograms, QQ plots, and heatmaps.
library(Biostrings)
library(dplyr)
library(ggplot2)
library(plotly)
library(tidyr)
library(nortest)
library(htmltools)
genomes <- list(
Human = "Homo_sapiens.GRCh38.cdna.all.fa.gz",
Chicken = "Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.cdna.all.fa.gz"
)
extract_cds_codons <- function(seq) {
s <- as.character(seq)
start_pos <- regexpr("ATG", s)[1]
if (start_pos == -1) return(character(0))
coding_seq <- substring(s, start_pos)
codons <- substring(coding_seq,
seq(1, nchar(coding_seq), 3),
seq(3, nchar(coding_seq), 3))
codons <- codons[nchar(codons) == 3]
stops <- c("TAA", "TAG", "TGA")
stop_index <- which(codons %in% stops)
if (length(stop_index) > 0) codons <- codons[1:min(stop_index)]
return(codons)
}
library(Biostrings)
codon_table <- GENETIC_CODE
aa_df <- data.frame(
codon = names(codon_table),
aa = as.vector(codon_table)
)
codon_table <- GENETIC_CODE
aa_df <- data.frame(codon = names(codon_table), aa = as.vector(codon_table))
results_list <- list()
top_bottom_list <- list()
for (genome_name in names(genomes)) {
cat("\nProcessing genome:", genome_name, "\n")
fasta_file <- genomes[[genome_name]]
cdna_seqs <- readDNAStringSet(fasta_file)
all_codons <- unlist(lapply(cdna_seqs, extract_cds_codons), use.names = FALSE)
codon_df <- data.frame(codon = all_codons) %>%
count(codon, name = "obs_count") %>%
inner_join(aa_df, by = "codon")
rscu_df <- codon_df %>%
group_by(aa) %>%
mutate(expected = sum(obs_count) / n(),
RSCU = obs_count / expected) %>%
ungroup() %>%
mutate(aa2 = ifelse(codon %in% c("TAA","TAG","TGA"), "*", aa))
results_list[[genome_name]] <- rscu_df
# Histogram
print(ggplot(rscu_df, aes(x = RSCU)) +
geom_histogram(binwidth = 0.25, fill = "steelblue", color = "black") +
labs(title = paste0(genome_name, ": RSCU Distribution"), x = "RSCU", y = "Frequency") +
theme_minimal())
ad_test <- ad.test(rscu_df$RSCU)
print(ad_test)
if(ad_test$p.value > 0.05){
print("RSCU Distribution is normal")
} else {
print("RSCU Distribution is not normal")
}
print(ggplot(rscu_df, aes(sample = RSCU)) +
stat_qq(color = "darkred") +
stat_qq_line(color = "black") +
labs(title = "Q-Q Plot of RSCU Values", x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_minimal())
rscu_df <- rscu_df %>% mutate(RSCU_one = RSCU == 1)
print(ggplot(rscu_df, aes(x = reorder(codon, RSCU), y = RSCU, fill = aa)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = paste0(genome_name, ": Codon Usage (RSCU)"), x = "Codon", y = "RSCU") +
theme_minimal())
heat_plot <- ggplot(rscu_df, aes(x = aa2, y = codon, fill = RSCU,
text = paste0("Codon: ", codon,
"<br>Amino Acid: ", aa2,
"<br>RSCU: ", round(RSCU, 3)))) +
geom_tile(color = "white") +
geom_tile(data = subset(rscu_df, RSCU_one == TRUE),
aes(x = aa2, y = codon),
fill = NA, color = "grey", size = 0.6, inherit.aes = FALSE) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 1) +
labs(title = paste0(genome_name, ": RSCU Heatmap"),
x = "Amino Acid", y = "Codon", fill = "RSCU") +
theme_minimal()
#print(heat_plot)
#print(ggplotly(heat_plot, tooltip = "text"))
p <- ggplotly(heat_plot, tooltip = "text")
p
codon_table_df <- rscu_df %>%
group_by(aa2) %>%
summarize(
most_used = codon[which.max(RSCU)],
most_used_RSCU = RSCU[which.max(RSCU)],
least_used = codon[which.min(RSCU)],
least_used_RSCU = RSCU[which.min(RSCU)]
) %>%
ungroup() %>%
mutate(Genome = genome_name)
top_bottom_list[[genome_name]] <- codon_table_df
}
Processing genome: Human
Anderson-Darling normality test
data: rscu_df$RSCU A = 0.62371, p-value = 0.09993
[1] “RSCU Distribution is normal”
Processing genome: Chicken
Anderson-Darling normality test
data: rscu_df$RSCU A = 1.2046, p-value = 0.003553
[1] “RSCU Distribution is not normal”
comparison_df <- bind_rows(top_bottom_list) %>%
mutate(
most_used = paste0(most_used, " (", round(most_used_RSCU, 3), ")"),
least_used = paste0(least_used, " (", round(least_used_RSCU, 3), ")")
) %>%
select(Genome, aa2, most_used, least_used) %>%
pivot_longer(cols = c(most_used, least_used),
names_to = "Usage",
values_to = "Codon_RSCU") %>%
unite("Genome_Usage", Genome, Usage, sep = "_") %>%
pivot_wider(names_from = aa2, values_from = Codon_RSCU)
print(comparison_df)
## # A tibble: 4 × 22
## Genome_Usage `*` A C D E F G H I K L
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Human_most_… TGA … GCC … TGC … GAC … GAG … TTC … GGC … CAC … ATC … AAG … CTG …
## 2 Human_least… TAG … GCG … TGT … GAT … GAA … TTT … GGT … CAT … ATA … AAA … CTA …
## 3 Chicken_mos… TGA … GCC … TGC … GAT … GAG … TTC … GGC … CAC … ATC … AAG … CTG …
## 4 Chicken_lea… TAG … GCG … TGT … GAC … GAA … TTT … GGT … CAT … ATA … AAA … CTA …
## # ℹ 10 more variables: M <chr>, N <chr>, P <chr>, Q <chr>, R <chr>, S <chr>,
## # T <chr>, V <chr>, W <chr>, Y <chr>
write.csv(comparison_df, "codon_RSCU_comparison.csv", row.names = FALSE)
cat("\nSaved: codon_RSCU_comparison.csv\n")
##
## Saved: codon_RSCU_comparison.csv