This example script shows you how to process data from Kraken2 and produce a taxonomic bar plot for species. You won’t all need all of these steps for 2.10 so take the parts that you need.
library(tidyverse, quietly = TRUE)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
#Read in taxonomic reports
Sample1All <-read.table('Galaxy27-[Report__Kraken2_on_data_25].tabular',
colClasses = c('character', 'numeric'),
sep='\t')
Sample2All <-read.table('Galaxy29-[Report__Kraken2_on_data_26].tabular',
colClasses = c('character', 'numeric'),
sep='\t')
#Create dataframe with just species rows using logical filter
sRows1 <- Sample1All[grepl("\\|s", Sample1All[["V1"]]),]
sRows2 <- Sample2All[grepl("\\|s", Sample2All[["V1"]]),]
#Use for loop with gsub to simplify Species name
for (i in 1:nrow(sRows1)){
sRows1[i,1]<-gsub("d__.*s__","",sRows1[i,1],perl=TRUE)
}
for (i in 1:nrow(sRows2)){
sRows2[i,1]<-gsub("d__.*s__","",sRows2[i,1],perl=TRUE)
}
#Format column names
names(sRows1) <- c('Species','Count')
names(sRows2) <- c('Species','Count')
#Dataframes above will be similar to what you read in if you
#start with csv files created in 2.8
#Sort data by count in each dataframe and create Other category
#for species not in the top 30
sRows1 <- arrange(sRows1,-Count)
otherRow1 <- data.frame(Species=character(),
Count=numeric())
otherRow1[1,1] <-'Other'
otherRow1[1,2] <- sum(sRows1[31:217,2])
topSpecies1 <- bind_rows(sRows1[1:30,],otherRow1)
sRows2 <- arrange(sRows2,-Count)
otherRow2 <- data.frame(Species=character(),
Count=numeric())
otherRow2[1,1] <-'Other'
otherRow2[1,2] <- sum(sRows2[31:217,2])
topSpecies2 <- bind_rows(sRows2[1:30,],otherRow1)
#Change the data from counts to percentages in both dataframes
totalSample1 <- sum(topSpecies1$Count)
totalSample2 <- sum(topSpecies2$Count)
topSpecies1$Count <- topSpecies1$Count/totalSample1
topSpecies2$Count <- topSpecies2$Count/totalSample2
#Tidy
Sample1<-data.frame(Species = topSpecies1$Species,
variable="Sample1",
value=topSpecies1$Count)
Sample2<-data.frame(Species = topSpecies2$Species,
variable="Sample2",
value=topSpecies2$Count)
myData <- bind_rows(Sample1,Sample2)
Embedded plot
#Create taxonomic bar plot for species
ggplot(myData, aes(x = variable, y = value, fill = Species)) +
geom_bar(stat = "identity") +
theme(legend.text = element_text(colour="red", size = 6))