R Markdown

This example script shows you how to process data from Kraken2 and produce a taxonomic bar plot for species. You won’t all need all of these steps for 2.10 so take the parts that you need.

library(tidyverse, quietly = TRUE)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
#Read in taxonomic reports
Sample1All <-read.table('Galaxy27-[Report__Kraken2_on_data_25].tabular',
                     colClasses = c('character', 'numeric'),
                     sep='\t')  
Sample2All <-read.table('Galaxy29-[Report__Kraken2_on_data_26].tabular',
                     colClasses = c('character', 'numeric'),
                     sep='\t')    

#Create dataframe with just species rows using logical filter
sRows1 <- Sample1All[grepl("\\|s", Sample1All[["V1"]]),]
sRows2 <- Sample2All[grepl("\\|s", Sample2All[["V1"]]),]

#Use for loop with gsub to simplify Species name
for (i in 1:nrow(sRows1)){
  sRows1[i,1]<-gsub("d__.*s__","",sRows1[i,1],perl=TRUE)
}

for (i in 1:nrow(sRows2)){
  sRows2[i,1]<-gsub("d__.*s__","",sRows2[i,1],perl=TRUE)
}

#Format column names
names(sRows1) <- c('Species','Count')
names(sRows2) <- c('Species','Count')

#Dataframes above will be similar to what you read in if you 
#start with csv files created in 2.8

#Sort data by count in each dataframe and create Other category
#for species not in the top 30
sRows1 <- arrange(sRows1,-Count)
otherRow1 <- data.frame(Species=character(),
                        Count=numeric())
otherRow1[1,1] <-'Other'
otherRow1[1,2] <- sum(sRows1[31:217,2])
topSpecies1 <- bind_rows(sRows1[1:30,],otherRow1)

sRows2 <- arrange(sRows2,-Count)
otherRow2 <- data.frame(Species=character(),
                        Count=numeric())
otherRow2[1,1] <-'Other'
otherRow2[1,2] <- sum(sRows2[31:217,2])
topSpecies2 <- bind_rows(sRows2[1:30,],otherRow1)

#Change the data from counts to percentages in both dataframes
totalSample1 <- sum(topSpecies1$Count)
totalSample2 <- sum(topSpecies2$Count)  
topSpecies1$Count <- topSpecies1$Count/totalSample1
topSpecies2$Count <- topSpecies2$Count/totalSample2

#Tidy
Sample1<-data.frame(Species = topSpecies1$Species, 
                    variable="Sample1", 
                    value=topSpecies1$Count)
Sample2<-data.frame(Species = topSpecies2$Species, 
                    variable="Sample2", 
                    value=topSpecies2$Count) 
myData <- bind_rows(Sample1,Sample2)

Including Plots

Embedded plot

#Create taxonomic bar plot for species
ggplot(myData, aes(x = variable, y = value, fill = Species)) + 
    geom_bar(stat = "identity") + 
  theme(legend.text = element_text(colour="red", size = 6))