# Denominator for the Proportion column below (presumably the genome length in
# the same unit as Size -- confirm against the assembly).
genome.size <- 6017035

# Read the annotation file and keep only the features whose type is "match".
gff.all <- readGff3("query.masked.gff.orig")
gff <- gff.all[gff.all$type == "match", ]

# One row per feature: its Name attribute (with "%2F" decoded to "/" and a
# single trailing "/" removed) and its size.
data <- data.frame(
  Name = sub("/$", "", gsub("%2F", "/", getGffAttribute(gff, "Name"))),
  Size = size(gff),
  stringsAsFactors = FALSE
)

# Family is the token following "genus:" in the name (names without that tag
# are kept whole); Class is the part of Family before the first "/".
data$Family <- sub(".*genus:([^ ;]*).*", "\\1", data$Name)
data$Class <- factor(sub("/.*", "", data$Family))
data$Family <- factor(data$Family)

# Per-family totals, keeping only families whose summed Size is >= 1000.
# Written as plain nested verb calls because dplyr's chain() is defunct in
# current releases. Proportion is computed from the summed Size defined just
# before it in the same summarise() call.
# The result is converted to a plain data.frame so that the row-name handling
# below keeps working when dplyr returns a tibble.
data.grouped <- droplevels(as.data.frame(filter(
  summarise(
    group_by(data, Class, Family),
    Count = n(),
    Size = sum(Size),
    Proportion = Size / genome.size
  ),
  Size >= 1000
)))

# Display table: one row per retained family plus a "Sum" row. The sums cover
# only the families that passed the Size filter above.
tab <- data.grouped[c("Count", "Size", "Proportion")]
rownames(tab) <- data.grouped$Family
tab["Sum", ] <- colSums(tab)
kable(tab)
| id | Count | Size | Proportion |
|---|---|---|---|
| LINE | 6 | 1089 | 0.0002 |
| LINE/Jockey | 105 | 31778 | 0.0053 |
| LINE/R1 | 42 | 7081 | 0.0012 |
| Low_complexity | 293 | 12781 | 0.0021 |
| LTR/Copia | 71 | 17145 | 0.0028 |
| LTR/Gypsy | 185 | 74162 | 0.0123 |
| NHF | 119 | 23363 | 0.0039 |
| rRNA | 60 | 8298 | 0.0014 |
| Simple_repeat | 3079 | 141376 | 0.0235 |
| UNK | 7 | 1754 | 0.0003 |
| Unknown | 633 | 72525 | 0.0121 |
| Sum | 4600 | 391352 | 0.0650 |
# Size of every individual feature, one strip per family; only families that
# are present in data.grouped are shown. Points are jittered to reduce overlap.
# (`jitter.data` is the full argument name; the bare `jitter` only worked via
# partial matching.)
stripplot(Family ~ Size, data, subset = Family %in% data.grouped$Family, jitter.data = TRUE)

# Summed size per family, in units of Size / 1000.
barchart(Family ~ Size / 1000, data.grouped, origin = 0, xlab = "Size (kbp)")

# Summed size per class, stacked by family (`groups` spelled out in full
# instead of relying on partial matching of `group`).
barchart(Class ~ Size / 1000, data.grouped,
  groups = Family, stack = TRUE,
  auto.key = list(space = "right"), xlab = "Size (kbp)"
)

# All retained families stacked into a single bar.
barchart(~ Size / 1000, data.grouped,
  groups = Family, stack = TRUE,
  auto.key = list(space = "right"), xlab = "Size (kbp)"
)