Details

This is an R Markdown document. The data for this analysis was collected on 13 January 2017. I have a pool of paired-end 100 sequences from Cedrela species. These sequences were obtained via hybridization capture, targeted enrichment, and short-read sequencing on the Illumina HiSeq 3000.

I am trying to quantify the proportion non-target (chloroplast) among my sequences, and address these questions: 1. How many captured and sequenced reads contained regions of the chloroplast genome? 2. If chloroplast reads were sequenced, what is the average chloroplast coverage?

These stats were obtained using bbmap.sh. Reads were aligned to my draft chloroplast genome of Cedrela odorata and a publically available draft chloroplast genome of Azadirachta indica.

Load libraries and packages

library(ggplot2)
library(dplyr)
library(gridExtra)

Load data

cp_cov<-read.csv("Chloroplast_aln_comp.csv", header=1)
names(cp_cov)
## [1] "Reference"        "File"             "Species"         
## [4] "Avg_fold"         "Covered_percent"  "Covered_bases"   
## [7] "Reads.used"       "Reads.Mapped"     "Percent.of.total"

Prepare a barplot showing the % of reads per sample that aligned to either the chloroplast genome of Cedrela odorata or Azadirachta indica.

per_cp<-ggplot(cp_cov, aes(y=Percent.of.total,x=File, fill=Reference))+
  theme_bw()+
  geom_bar(stat="identity", width=.9, position="dodge")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())+
  labs(x = "Individual", y = "% Reads = Chloroplast")+
  theme(legend.justification=c(1,1), legend.position=c(.4,1),
        legend.title=element_blank())+
  ggtitle("% Reads = Chloroplast")+
  theme(plot.title = element_text(hjust = 0, size=10),
        axis.title=element_text(size=8),
        legend.background = element_blank(),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 8),
        legend.key=element_rect(color="white"))

Prepare a barplot showing the average coverage across the chloroplast genome of Cedrela odorata or Azadirachta indica for each sample.

avg_cov_cp<-ggplot(cp_cov, aes(y=Avg_fold,x=File, fill=Reference)) +
  theme_bw()+
  geom_bar(stat="identity", width=.9, position="dodge")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())+
  labs(x = "Individual", y = "Average Coverage")+
  theme(legend.justification=c(1,1), legend.position=c(.4,1),
        legend.title=element_blank())+
  ggtitle("Average Coverage")+
  theme(plot.title = element_text(hjust = 0, size=10),
        axis.title=element_text(size=8),
        legend.background = element_blank(),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 8),
        legend.key=element_rect(color="white"))

Prepare a barplot showing the average coverage across the chloroplast genome of Cedrela odorata or Azadirachta indica for each species.

avg_cov_spp<-ggplot(cp_cov, aes(y=Avg_fold,x=Species, fill=Reference)) +
  theme_bw()+
  geom_bar(stat="identity", width=.9, position="dodge")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())+
  labs(x = "Species", y = "Average Coverage")+
  theme(legend.justification=c(1,1), legend.position=c(.35,1),
        legend.title=element_blank())+
  ggtitle("Average Coverage")+
  theme(plot.title = element_text(hjust = 0, size=10),
        axis.title=element_text(size=8),
        legend.background = element_blank(),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 8),
        legend.key=element_rect(color="white"))

Prepare a barplot showing the % of reads per sspecies that aligned to either the chloroplast genome of Cedrela odorata or Azadirachta indica.

per_spp<-ggplot(cp_cov, aes(y=Percent.of.total,x=Species, fill=Reference))+
  theme_bw()+
  geom_bar(stat="identity", width=.9, position="dodge")+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())+
  labs(x = "Species", y = "% Reads = Chloroplast")+
  theme(legend.justification=c(1,1), legend.position=c(.35,1),
        legend.title=element_blank())+
  ggtitle("% Reads = Chloroplast")+
  theme(plot.title = element_text(hjust = 0, size=10),
        axis.title=element_text(size=8),
        legend.background = element_blank(),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 8),
        legend.key=element_rect(color="white"))

Now show % reads chloroplast per species as a boxplot.

avg_cov_spp_box<-ggplot(cp_cov, aes(y=Avg_fold,x=Species, fill=Reference)) +
  theme_bw()+
  geom_boxplot()+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())+
  labs(x = "Species", y = "Average Coverage")+
  theme(legend.justification=c(1,1), legend.position=c(.35,1),
        legend.title=element_blank())+
  ggtitle("Average Coverage")+
  theme(plot.title = element_text(hjust = 0, size=10),
        axis.title=element_text(size=8),
        legend.background = element_blank(),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 8),
        legend.key=element_rect(color="white"))

Now show average coverage per species as a boxplot.

per_spp_box<-ggplot(cp_cov, aes(y=Percent.of.total,x=Species, fill=Reference))+
  theme_bw()+
  geom_boxplot()+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())+
  labs(x = "Species", y = "% Reads = Chloroplast")+
  theme(legend.justification=c(1,1), legend.position=c(.35,1),
        legend.title=element_blank())+
  ggtitle("% Reads = Chloroplast")+
  theme(plot.title = element_text(hjust = 0, size=10),
        axis.title=element_text(size=8),
        legend.background = element_blank(),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 8),
        legend.key=element_rect(color="white"))

Put them all together now.

cp_aln<-grid.arrange(per_cp,avg_cov_cp,per_spp, avg_cov_spp,per_spp_box, avg_cov_spp_box,ncol=2)

Save data to your working directory

#ggsave("cp_aln_summary_mo.jpg",plot=cp_aln, width=8.5, height=11)