Details

This is an R Markdown document. The data for this analysis was collected on 25 January 2017. I have a pool of paired-end 100 sequences from Cedrela species. These sequences were obtained via hybridization capture, targeted enrichment, and short-read sequencing on the Illumina HiSeq 3000.

I’m comparing alignments for the same individuals to the baits sequences, the ced132 SPAdes de novo assembly, and the transcriptome reference that was used to select the bait sequences.

RNA probes provided by MYBaits.

These stats were obtained using bbmap.sh covstats.

Load libraries and packages

library(ggplot2)
library(dplyr)
library(gridExtra)

Load data

comp<-read.csv("comp_3refs.csv", header=1)

Graphs

reads.ref<-ggplot(comp, aes(y=Percent.Aligned,x=Reference, fill=Reference))+
  theme_bw()+
  geom_bar(stat="identity", width=.9, position="dodge")+
  scale_x_discrete(labels=c("ced132_reference_v1.fasta" = "ced132 SPAdes", "CEOD_transcriptome_v1.fasta.masked" = "Transcriptome",
                            "selected-baits-19740.fas" = "Baits"))+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())+
  labs(x = "Reference", y = "% Reads Aligned")+
  theme(legend.justification=c(1,1), legend.position="none",
        legend.title=element_blank())+
  ggtitle("% Aligned Per Reference")+
  theme(plot.title = element_text(hjust = 0, size=10),
        axis.title=element_text(size=8),
        legend.background = element_blank(),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 8),
        legend.key=element_rect(color="white"))

sam_ref<-ggplot(comp, aes(y=Percent.Aligned,x=Sample, fill=Reference))+
  theme_bw()+
  geom_bar(stat="identity", width=.9, position="dodge")+
  scale_x_discrete(labels=c("ced101" = "101", "ced103" = "103",
                            "ced132" = "132", "ced148" = "148",
                            "ced151" = "151", "ced153" = "153",
                            "ced175" = "175", "ced178" = "178",
                            "ced183" = "183", "ced280" = "280"))+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())+
  labs(x = "Reference", y = "% Reads Aligned")+
  theme(legend.justification=c(1,1), legend.position="none",
        legend.title=element_blank())+
  ggtitle("% Aligned Per Sample")+
  theme(plot.title = element_text(hjust = 0, size=10),
        axis.title=element_text(size=8),
        legend.background = element_blank(),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 8),
        legend.key=element_rect(color="white"))

spp_ref<-ggplot(comp, aes(y=Percent.Aligned,x=Reference, fill=Species))+
  theme_bw()+
  geom_bar(stat="identity", width=.9, position="dodge")+
  scale_x_discrete(labels=c("ced132_reference_v1.fasta" = "ced132 SPAdes", "CEOD_transcriptome_v1.fasta.masked" = "Transcriptome",
                            "selected-baits-19740.fas" = "Baits"))+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())+
  labs(x = "Reference", y = "% Reads Aligned")+
  theme(legend.justification=c(1,1), legend.position=c(1,1),
        legend.title=element_blank())+
  ggtitle("% Aligned per Species")+
  theme(plot.title = element_text(hjust = 0, size=10),
        axis.title=element_text(size=8),
        legend.background = element_blank(),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 8),
        legend.key=element_rect(color="white"))

scat<-ggplot(comp, aes(y=Reads.Aligned,x=Reads.Used))+
  theme_bw()+
  geom_point(aes(color=Reference))+
  scale_color_discrete(breaks=c("ced132_reference_v1.fasta", "CEOD_transcriptome_v1.fasta.masked", "selected-baits-19740.fas"),
                      labels=c("ced132 SPAdes", "Transcriptome", "Baits"))+
  geom_smooth(method=lm)+
  ggtitle("Correlation Test")+
  labs(x = "Reads Used", y = "Reads Aligned")+
  theme(legend.position=c(0.84,0.2),
        plot.title = element_text(hjust = 0, size=10),
        axis.title=element_text(size=8),
        legend.background = element_blank(),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 8),
        legend.key=element_rect(color="white"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

All together now.

comp_plot<-grid.arrange(reads.ref, scat, sam_ref, spp_ref, ncol=2)

Option: save

#ggsave("comp_summary.jpg",plot=comp_plot, width=8, height=6)

As a Boxplot

spp_ref_box<-ggplot(comp, aes(y=Percent.Aligned,x=Species, fill=Reference)) +
  theme_bw()+
  geom_boxplot()+
  scale_fill_discrete(labels=c("ced132_reference_v1.fasta" = "ced132 SPAdes", "CEOD_transcriptome_v1.fasta.masked" = "Transcriptome",
                            "selected-baits-19740.fas" = "Baits"))+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())+
  labs(x = "Species", y = "% Reads Aligned")+
  theme(legend.justification=c(1,1),
        legend.title=element_blank())+
  ggtitle("% Aligned per Reference")+
  theme(plot.title = element_text(hjust = 0, size=10),
        axis.title=element_text(size=8),
        legend.background = element_blank(),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 8),
        legend.key=element_rect(color="white"))
spp_ref_box