test genes: agamous, argonaute2, ad1, DGAT2, leafy, MYB16, pin1, shine1
all 1kp assemblies were translated-ublasted into the test gene protein set with max e-value cutoff of 1e-5.
# load test run data
data <- read.csv("all_usearch_results.csv", as.is = T, head = F)
# sane column names
names(data) <- c("genus", "species", "sample_code", "query_label", "target_label",
"percent_id", "aln_len", "no_mismatches", "no_gapopens", "q_start_pos",
"q_end_pos", "t_start_pos", "t_end_pos", "e_value", "bit_score")
# enforce 1e-5 cutoff
data <- data[data$e_value <= 1e-05, ]
# simple plot of e-value distribution for each gene
library(ggplot2)
ggplot(data, aes(factor(target_label), e_value, group = factor(target_label))) +
geom_boxplot() + scale_y_log10() + labs(x = "", y = "e value") + theme_bw()
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
Divisions were retrieved for Genera contained in dataset by scraping from NCBI Taxonomy or DBPedia. To be improved in the future (to eliminate unknowns)
# load taxonomy data
taxa <- read.csv("taxonomy.csv", as.is = T, head = T)
# annotate usearch data with division
data <- merge(data, taxa, by = "genus")
# aggregate data by counting hits per target gene
library(plyr)
by_target <- ddply(data, .(target_label), function(x) data.frame(hits = nrow(x)))
ggplot(by_target, aes(factor(target_label), hits)) + geom_bar(stat = "identity") +
labs(x = "") + theme_bw()
by_target_division <- ddply(data, .(target_label, division), function(x) data.frame(hits = nrow(x)))
ggplot(by_target_division, aes(factor(target_label), hits, fill = factor(division),
group = factor(target_label))) + labs(x = "", y = "hits", fill = "Division") +
geom_bar(stat = "identity") + theme_bw()
library(scales)
by_target_division$prop = apply(by_target_division, 1, function(x) {
as.numeric(x["hits"])/sum(as.numeric(by_target_division[by_target_division$target_label ==
x["target_label"], c("hits")]))
})
ggplot(by_target_division, aes(factor(target_label), prop, fill = factor(division),
group = factor(target_label))) + geom_bar(stat = "identity") + labs(x = "",
y = "proportion", fill = "Division") + theme_bw() + scale_y_continuous(labels = percent_format())
by_species <- ddply(data, .(target_label), function(x) data.frame(no_species = length(unique(x$sample_code))))
ggplot(by_species, aes(factor(target_label), no_species)) + geom_bar(stat = "identity") +
theme_bw()
# and per division
by_species_division <- ddply(data, .(target_label, division), function(x) data.frame(no_species = length(unique(x$sample_code))))
ggplot(by_species_division, aes(factor(target_label), no_species, fill = factor(division),
group = factor(target_label))) + geom_bar(stat = "identity") + theme_bw()
by_species_division$prop = apply(by_species_division, 1, function(x) {
as.numeric(x["no_species"])/sum(as.numeric(by_species_division[by_species_division$target_label ==
x["target_label"], c("no_species")]))
})
ggplot(by_species_division, aes(factor(target_label), prop, fill = factor(division),
group = factor(target_label))) + geom_bar(stat = "identity") + labs(x = "",
y = "proportion", fill = "Division") + theme_bw() + scale_y_continuous(labels = percent_format())