# STEP 1: Read in the .tabular files from the Galaxy report,
# keep only the species-level rows, and clean up the species names.
# Alternatively, you can read in your files from 2.08 instead.
# Read one tab-separated Galaxy report. No header is requested, so
# read.table() names the columns V1, V2, ...
# quote = "" and comment.char = "" stop read.table() from treating an
# apostrophe or '#' inside a taxon name as a quoting/comment character,
# which would otherwise merge or truncate rows.
readReport <- function(path) {
  read.table(path, sep = "\t", quote = "", comment.char = "",
             stringsAsFactors = FALSE)
}

# Keep only the rows whose first column contains "d__" followed later by
# "s__" (i.e. the lineage reaches species level), strip everything up to and
# including the last "s__", and label the first two columns Species / Count.
extractSpecies <- function(report) {
  speciesRows <- report[grepl("d__.*s__", report$V1), ]
  speciesRows$V1 <- gsub("d__.*s__", "", speciesRows$V1)
  names(speciesRows)[1:2] <- c("Species", "Count")
  speciesRows
}

NoData <- readReport("NoToothbrush.tabular")
Data <- readReport("Toothbrush.tabular")

sRows1 <- extractSpecies(NoData)
sRows2 <- extractSpecies(Data)
# STEP 2: Create a tidy data frame of the top 20 species by count plus "Other"
# Number of individually reported species per sample; every remaining species
# is pooled into a single "Other" row.
topN <- 20

# Build the one-row "Other" data frame for a table already sorted by
# descending Count: its Count is the sum of every row after the first `n`.
# tail(x, -n) yields an empty vector (sum 0) when there are `n` or fewer rows,
# whereas indexing with (n + 1):nrow(x) would count backwards and give NA.
makeOtherRow <- function(sortedRows, n = topN) {
  data.frame(
    Species = "Other",
    Count = sum(as.numeric(tail(sortedRows$Count, -n)))
  )
}

# Sample 1: sort by descending count, keep the top species, append "Other".
# head() returns at most topN rows, so short tables are not padded with NAs.
sRows1 <- arrange(sRows1, desc(Count))
otherRow1 <- makeOtherRow(sRows1)
topSpecies1 <- bind_rows(head(sRows1, topN), otherRow1)

# Sample 2: same procedure. It must use its own "Other" total (otherRow2),
# not Sample 1's.
sRows2 <- arrange(sRows2, desc(Count))
otherRow2 <- makeOtherRow(sRows2)
topSpecies2 <- bind_rows(head(sRows2, topN), otherRow2)

# Convert counts to proportions of each sample's total species-level count
totalSample1 <- sum(topSpecies1$Count)
totalSample2 <- sum(topSpecies2$Count)
topSpecies1$Count <- topSpecies1$Count / totalSample1
topSpecies2$Count <- topSpecies2$Count / totalSample2

# Tidy: one row per (Species, Sample) pair with its read proportion
Sample1 <- data.frame(
  Species = topSpecies1$Species,
  Sample = "Sample1",
  ReadProportion = topSpecies1$Count
)
Sample2 <- data.frame(
  Species = topSpecies2$Species,
  Sample = "Sample2",
  ReadProportion = topSpecies2$Count
)
myData <- bind_rows(Sample1, Sample2)
# STEP 3: Plot the read proportions as one stacked bar per sample
# Stacked bar chart: one bar per sample, segment heights taken directly from
# ReadProportion, coloured by species, with small red legend labels.
p <- ggplot(
  data = myData,
  mapping = aes(x = Sample, y = ReadProportion, fill = Species)
) +
  geom_col() +
  theme(legend.text = element_text(size = 6, colour = "red"))

# Evaluating p at the console (or in a markdown chunk) draws the plot in the
# plots window, from where it can be downloaded.
p
