This is an R Markdown document. The data for this analysis was collected on 13 January 2017. I have a pool of paired-end 100 sequences from Cedrela species. These sequences were obtained via hybridization capture, targeted enrichment, and short-read sequencing on the Illumina HiSeq 3000.
How many reads were on target?
This will help me to narrow the bait pool to baits that show high coverage.
RNA probes provided by MYBaits.
These stats were obtained using bbmap.sh. Reads were aligned to bait sequences. Load libraries and packages
library(ggplot2)
library(dplyr)
library(gridExtra)
Prepare data: Eliminate observations that showed >5% average coverage Prepare a 10% subset of observations or CRASH R.
ced101<-read.csv("ced101_2baits.csv", header=1)
ced103<-read.csv("ced103_2baits.csv", header=1)
ced151<-read.csv("ced151_2baits.csv", header=1)
ced175<-read.csv("ced175_2baits.csv", header=1)
ced280<-read.csv("ced280_2baits.csv", header=1)
ced101<-sample_n(ced101,0.1*nrow(ced101))
ced101<-ced101[order(ced101$Log.two),]
ced103<-sample_n(ced103,0.1*nrow(ced103))
ced103<-ced103[order(ced103$Log.two),]
ced151<-sample_n(ced151,0.1*nrow(ced151))
ced151<-ced151[order(ced151$Log.two),]
ced175<-sample_n(ced175,0.1*nrow(ced175))
ced175<-ced175[order(ced175$Log.two),]
ced280<-sample_n(ced280,0.1*nrow(ced280))
ced280<-ced280[order(ced280$Log.two),]
Prepare individual coverage plots:
ced101.plot<-ggplot(ced101, aes(x=sort(X.ID), y=Log.two))+
theme_bw()+
geom_bar(stat="identity")+
scale_y_continuous(limits=c(0,11))+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks.x=element_blank(),
axis.text.x=element_blank(),
plot.title = element_text(hjust = 0,size = 10),
axis.title=element_text(size=8))+
labs(x = "Individual", y = "log(Average Coverage,2)")+
ggtitle("CEOD 4 Peru (ced101) 298 observations")
ced103.plot<-ggplot(ced103, aes(x=sort(X.ID), y=Log.two))+
theme_bw()+
geom_bar(stat="identity")+
scale_y_continuous(limits=c(0,11))+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks.x=element_blank(),
axis.text.x=element_blank(),
plot.title = element_text(hjust = 0,size = 10),
axis.title=element_text(size=8))+
labs(x = "Individual", y = "log(Average Coverage,2)")+
ggtitle("CEOD 95 Peru (ced103) 620 observations")
ced151.plot<-ggplot(ced151, aes(x=sort(X.ID), y=Log.two))+
theme_bw()+
geom_bar(stat="identity")+
scale_y_continuous(limits=c(0,11))+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks.x=element_blank(),
axis.text.x=element_blank(),
plot.title = element_text(hjust = 0,size = 10),
axis.title=element_text(size=8))+
labs(x = "Individual", y = "log(Average Coverage,2)")+
ggtitle("CEOD 51 Ecuador (ced151) 1305 observations")
ced175.plot<-ggplot(ced175, aes(x=sort(X.ID), y=Log.two))+
theme_bw()+
geom_bar(stat="identity")+
scale_y_continuous(limits=c(0,11))+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks.x=element_blank(),
axis.text.x=element_blank(),
plot.title = element_text(hjust = 0,size = 10),
axis.title=element_text(size=8))+
labs(x = "Individual", y = "log(Average Coverage,2)")+
ggtitle("CEMO 55 Ecuador (ced175) 1295 observations")
ced280.plot<-ggplot(ced280, aes(x=sort(X.ID), y=Log.two))+
theme_bw()+
geom_bar(stat="identity")+
scale_y_continuous(limits=c(0,11))+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks.x=element_blank(),
axis.text.x=element_blank(),
plot.title = element_text(hjust = 0,size = 10),
axis.title=element_text(size=8))+
labs(x = "Individual", y = "log(Average Coverage,2)")+
ggtitle("CEAN 160 Bolivia (ced280) 1247 observations")
Plot all together now.
bait_cov_plot<-grid.arrange(ced101.plot,ced103.plot,ced151.plot,ced175.plot,ced280.plot, ncol=2)
Save figure.
#ggsave("bait_cov_summary.jpg",plot=bait_cov_plot, width=7.2, height=5)