ublast e-value cutoff exploration

test genes: agamous, argonaute2, ad1, DGAT2, leafy, MYB16, pin1, shine1

all 1kp assemblies were translated-ublasted into the test gene protein set with max e-value cutoff of 1e-5.

# load test run data
data <- read.csv("all_usearch_results.csv", as.is = T, head = F)
# sane column names
names(data) <- c("genus", "species", "sample_code", "query_label", "target_label", 
    "percent_id", "aln_len", "no_mismatches", "no_gapopens", "q_start_pos", 
    "q_end_pos", "t_start_pos", "t_end_pos", "e_value", "bit_score")
# enforce 1e-5 cutoff
data <- data[data$e_value <= 1e-05, ]
# simple plot of e-value distribution for each gene
library(ggplot2)
ggplot(data, aes(factor(target_label), e_value, group = factor(target_label))) + 
    geom_boxplot() + scale_y_log10() + labs(x = "", y = "e value") + theme_bw()
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-1

Divisions were retrieved for Genera contained in dataset by scraping from NCBI Taxonomy or DBPedia. To be improved in the future (to eliminate unknowns)

# load taxonomy data
taxa <- read.csv("taxonomy.csv", as.is = T, head = T)
# annotate usearch data with division
data <- merge(data, taxa, by = "genus")

e-value cutoff exploration

number of hits per gene

# aggregate data by counting hits per target gene
library(plyr)
by_target <- ddply(data, .(target_label), function(x) data.frame(hits = nrow(x)))
ggplot(by_target, aes(factor(target_label), hits)) + geom_bar(stat = "identity") + 
    labs(x = "") + theme_bw()

plot of chunk unnamed-chunk-3

further grouped by division

by_target_division <- ddply(data, .(target_label, division), function(x) data.frame(hits = nrow(x)))
ggplot(by_target_division, aes(factor(target_label), hits, fill = factor(division), 
    group = factor(target_label))) + labs(x = "", y = "hits", fill = "Division") + 
    geom_bar(stat = "identity") + theme_bw()

plot of chunk unnamed-chunk-4

same shown as poportions

library(scales)
by_target_division$prop = apply(by_target_division, 1, function(x) {
    as.numeric(x["hits"])/sum(as.numeric(by_target_division[by_target_division$target_label == 
        x["target_label"], c("hits")]))
})
ggplot(by_target_division, aes(factor(target_label), prop, fill = factor(division), 
    group = factor(target_label))) + geom_bar(stat = "identity") + labs(x = "", 
    y = "proportion", fill = "Division") + theme_bw() + scale_y_continuous(labels = percent_format())

plot of chunk unnamed-chunk-5

number of species hit per gene

by_species <- ddply(data, .(target_label), function(x) data.frame(no_species = length(unique(x$sample_code))))
ggplot(by_species, aes(factor(target_label), no_species)) + geom_bar(stat = "identity") + 
    theme_bw()

plot of chunk unnamed-chunk-6

same further grouped by division

# and per division
by_species_division <- ddply(data, .(target_label, division), function(x) data.frame(no_species = length(unique(x$sample_code))))
ggplot(by_species_division, aes(factor(target_label), no_species, fill = factor(division), 
    group = factor(target_label))) + geom_bar(stat = "identity") + theme_bw()

plot of chunk unnamed-chunk-7

same shown as porportions

by_species_division$prop = apply(by_species_division, 1, function(x) {
    as.numeric(x["no_species"])/sum(as.numeric(by_species_division[by_species_division$target_label == 
        x["target_label"], c("no_species")]))
})
ggplot(by_species_division, aes(factor(target_label), prop, fill = factor(division), 
    group = factor(target_label))) + geom_bar(stat = "identity") + labs(x = "", 
    y = "proportion", fill = "Division") + theme_bw() + scale_y_continuous(labels = percent_format())

plot of chunk unnamed-chunk-8