This is an R Markdown document.
Dissertation Research.
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(rgdal) # used to read world map data
library(rgeos) # to fortify without needing gpclib
library(maptools)
library(scales) # for formatting ggplot scales with commas
library(gridExtra)
library(maps)
library(ggrepel)
library(devtools)
library(grid)
library(reshape)
library(stringr)
| Call | Function | Input(s) |
|---|---|---|
| get_cov_percent | Generates a dataframe with covered percent. One column per specimen. | List of data frames |
| get_depth | Generates a dataframe with depth of coverage. One column per specimen. | List of data frames |
| get_files | Generates a file list. | File pattern |
| get_reads_mapped | Generates a dataframe with sum of mapped reads. One column per specimen. | List of data frames |
| get_results | Calculates mean, min, max, and 95% confidence interval for a column of data | Note on data & column of data frame |
| get_spec_ids | Uses a regular expression to generate the specimen id list from the filenames. | File list and “header” |
| get_spp | Uses a regular expression to generate the species list from the filenames. | File list and “header” |
| get_target_ids | Saves column 1 of coverage stats life and designates it as target ids. | File list and “header” |
| make_df | Generates data frame with specimens as columns and targets as rows. | Coverage stat data & specimen ids & target ids |
| make_row | Generates a row of data from a assigned value. | Note about the value & an assigned value |
| make_tdf | Generates data frame with targets as columns and specimens as rows. | Coverage stat data & specimen ids & target ids & species list |
| readin_files | Generated a list of data frames from a file list | File list |
Generate file list.Example: get_files(“*file_ending.txt“)
get_files<-function(x){
list.files(pattern = x, full.names = TRUE)
}
Generate list of data frames by applying read.table() to all.
readin_files<-function(file_list){
lapply(file_list,read.table,header=1)
}
Generate data frame where each cell is the sum of mapped reads from an individual to a reference sequence.
For example, each cell could be the number of reads from an individual that mapped to a single gene target.
get_reads_mapped<-function(df_list){
output<-list()
for (i in 1:length(df_list)){
output[[i]]<-mutate(df_list[[i]], total = Plus_reads + Minus_reads)
}
reads_mapped<-list()
for (i in 1:length(output)){
reads_mapped[[i]]<-cbind.data.frame(output[[i]]$total)
}
covstat_data<-data.frame(reads_mapped)
covstat_data[covstat_data == "NaN"] <- 0
return(covstat_data)
}
Generate data frame where each cell is: sum of mapped reads • read length ÷ count bases of reference covered, from an individual to a reference sequence.
For example, each cell could be depth of coverage a single gene target for an individual.
get_depth<-function(df_list){
output<-list()
for (i in 1:length(df_list)){
output[[i]]<-mutate(df_list[[i]], depth = ((Plus_reads + Minus_reads)*101)/Covered_bases)
}
read_depth<-list()
for (i in 1:length(output)){
read_depth[[i]]<-cbind.data.frame(output[[i]]$depth)
}
covstat_data<-data.frame(read_depth)
covstat_data[covstat_data == "NaN"] <- 0
return(covstat_data)
}
Generate data frame where each cell is the percent of the reference that is covered by mapped reads from an individual.
For example, each cell could be the percent that a single gene target is covered by mapped reads from an individual.
get_cov_percent<-function(df_list){
cp_cov_per<-list()
for (i in 1:length(df_list)){
cp_cov_per[[i]]<-cbind.data.frame(df_list[[i]]$Covered_percent)
}
covstat_data<-data.frame(cp_cov_per)
return(covstat_data)
}
Extracts a list of specimen ids from the filenames with a regular expression.
Header must follow character syntax with double quotes: “header”
get_spec_ids<-function(file_list,header){
spec_ids<-data.frame(str_extract(file_list,"[C,G,S,T]...[_]\\d{0,3}"))
names(spec_ids)<-header
return(spec_ids)
}
Extracts a list of species from the filenames with a regular expression.
Header must follow character syntax with double quotes: “header”
get_spp<-function(file_list,header){
spp<-data.frame(str_extract(file_list,"[C,G,S,T]..."))
names(spp)<-header
return(spp)
}
Generates a list of target ids from the first column of the first file in the file list.
Header must follow character syntax with double quotes: “header”
get_target_ids<-function(file_list,header){
file_1<-read.table(file_list[1],header = 1)
target_ids<-data.frame(file_1[1])
names(target_ids)<-header
return(target_ids)
}
Generates data frame with specimens as columns and targets as rows.
Example:
| target_id | CEFI_9 | CESA_75 |
|---|---|---|
| ceod000050 | 49 | 2521 |
| ceod000075 | 38 | 1347 |
| ceod000084 | 71 | 3106 |
| ceod000163 | 103 | 5848 |
| ceod000194 | 56 | 2130 |
| ceod000233 | 69 | 4826 |
make_df<-function(covstat_data,spec_ids,target_ids){
spec_ids<-t(spec_ids)
colnames(covstat_data)<-spec_ids
df<-cbind.data.frame(target_ids,covstat_data)
return(df)
}
Generates data frame with targets as columns and specimens as rows.
Example:
| specimen | species | ceod000050 | ceod000075 |
|---|---|---|---|
| CEFI_9 | CEFI | 49 | 38 |
| CESA_75 | CESA | 2521 | 1347 |
| CEFI_112 | CEFI | 61 | 29 |
| CEFI_140 | CEFI | 1173 | 708 |
| CEOD_202 | CEOD | 540 | 217 |
| CEFI_230 | CEFI | 339 | 201 |
make_tdf<-function(covstat_data,spec_ids,target_ids,spp){
tcovstat_data<-t(covstat_data)
target_ids<-t(target_ids)
colnames(tcovstat_data)<-target_ids
tdf<-cbind.data.frame(spec_ids,spp,tcovstat_data)
return(tdf)
}
Calculates mean, min, max, and 95% confidence interval for a column of data|Note on data & column of data frame.
Example Usage: ind_percent_ontarget<-get_results(“individual percent on target”,yield_tdf_full$ind_percent_ontarget) Note_about_stat must follow character syntax with double quotes: “Note about stat”
Example Output:
| note_about_stat | mean | min | max | err | up | lo |
|---|---|---|---|---|---|---|
| individual percent on target | 19.75343 | 0.8108261 | 41.42799 | 5.41383 | 25.16726 | 14.3396 |
get_results<-function(note_about_stat,column_of_data){
mean<-mean(column_of_data,na.rm = TRUE)
min<-min(column_of_data,na.rm = TRUE)
max<-max(column_of_data,na.rm = TRUE)
err<-qt(0.975,df=length(column_of_data)-1)*sd(column_of_data,na.rm = TRUE)/sqrt(length(column_of_data))
up<-mean+err
lo<-mean-err
note_about_stat<-note_about_stat
results<-cbind.data.frame(note_about_stat,mean,min,max,err,up,lo)
return(results)
}
Generates a row of data from a assigned value.
Example Usage: make_row(“detected variants”,detected_variants) Note must follow character syntax with double quotes: “Note about value”
Example Output:
| result | stat |
|---|---|
| detected variants | 444979 |
make_row<-function(note,stat){
row<-cbind.data.frame(note, stat)
names(row)<-c("result","stat")
return(row)
}
# Data from http://thematicmapping.org/downloads/world_borders.php.
# Direct link: http://thematicmapping.org/downloads/TM_WORLD_BORDERS_SIMPL-0.3.zip
# Unpack and put the files in a dir 'data'
worldMap <- readOGR(dsn="TM_WORLD_BORDERS-0.3.shp", layer="TM_WORLD_BORDERS-0.3")
## OGR data source with driver: ESRI Shapefile
## Source: "TM_WORLD_BORDERS-0.3.shp", layer: "TM_WORLD_BORDERS-0.3"
## with 246 features
## It has 11 fields
## Integer64 fields read as strings: POP2005
# Change "data" to your path in the above!
worldMap.fort <- fortify(worldMap, region = "ISO3")
# Fortifying a map makes the data frame ggplot uses to draw the map outlines.
# "region" or "id" identifies those polygons, and links them to your data.
# Look at head(worldMap@data) to see other choices for id.
# Your data frame needs a column with matching ids to set as the map_id aesthetic in ggplot.
idList <- worldMap@data$ISO3
# "coordinates" extracts centroids of the polygons, in the order listed at worldMap@data
centroids.df <- as.data.frame(coordinates(worldMap))
names(centroids.df) <- c("Longitude", "Latitude") #more sensible column names
# This shapefile contained population data, let's plot it.
popList <- worldMap@data$POP2005
pop.df <- data.frame(id = idList, centroids.df)
#load sample data; GPS Coordinates and city labels with GPS Coordinates for the
#label position.
samples<-read.csv("20171214_ch1_mapfig.csv",header=1)
Labs<-read.csv("Labels.csv")
#make map
map<-ggplot(pop.df, aes(map_id = id)) + #"id" is col in your pop.df, not in the map object
geom_map(fill="white",colour= "black", map = worldMap.fort) +
expand_limits(x = worldMap.fort$long, y = worldMap.fort$lat) +
coord_equal(xlim = c(-90,-60), ylim = c(-25, 20)) + #let's view South America
labs(x = "Longitude", y = "Latitude") +
theme_bw()+
geom_label_repel(
aes(map_id=NA,long, lat, label = sample, color=species),
fontface="bold",
box.padding = unit(0.5, "lines"),
point.padding = unit(0.5, "lines"),
segment.color = 'grey40',
label.size = 0,
label.padding = unit(0.1, "lines"),
data=samples,
show.legend = F)+
geom_point(aes(map_id=NA,x = long, y = lat, color=species),
size = 2, data = samples)+
scale_color_brewer(palette="Dark2", name="Species")+
geom_label(aes(map_id=NA, x=LONG, y=LAT,label=lab),size=3, data=Labs)+
theme(legend.text = element_text(face = "italic"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
#make inset
inset<-ggplot(pop.df, aes(map_id = id)) + #"id" is col in your df, not in the map object
geom_map(fill="white",colour= "black", map = worldMap.fort) +
expand_limits(x = worldMap.fort$long, y = worldMap.fort$lat) +
coord_equal(xlim = c(-90,-35), ylim = c(-60, 20))+
geom_rect(aes(xmin = -90, xmax = -60, ymin = -25, ymax = 20), alpha = 0, colour = "blue", size = 1, linetype = 1)+
theme_bw() +
ggtitle("South America")+
theme(plot.background=element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = 'black', size = 1, linetype = 1),
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
vpmain_ <- viewport(width = 1, height = 1, x = 0.5, y = 0.5) # the larger map
vpinset_ <- viewport(width = 0.5, height = 0.3, x = 0.25, y = 0.25) # the inset in upper right
print(map, vp = vpmain_)
print(inset, vp = vpinset_)
We designed probes for target capture from 10,001 gene targets that represented putative low copy transcript models from the transcriptome of the reference specimen. We identified these gene targets by mapping truncated DNA sequencing reads (50 bp) to the transcriptome of the reference specimen, and using the bbmap.sh coveragestats parameter to estimate mapped read depth per 1 kb of transcript model (depth/kb).
bb_stats<-read.table("bb_stats_v1.txt",header=1)
bb_stats<-mutate(bb_stats,sum_reads=Plus_reads + Minus_reads)
bb_stats<-mutate(bb_stats,reads_kb=sum_reads/Length)
bb_stats<-bb_stats[order(bb_stats$reads_kb),]
bb_stats$depth_ind<-52181:1
bb_stats$inv_depth_ind<-1:52181
#Depth/kb for all gene models in the transcriptome.
(depth_kb_models<-get_results("mean depth/kb gene models",bb_stats$reads_kb))
## note_about_stat mean min max err up
## 1 mean depth/kb gene models 3.479284 0 1889.475 0.2329332 3.712217
## lo
## 1 3.246351
#Create gene target list.
baits5000_15000<-filter(bb_stats,inv_depth_ind >= 5000 & inv_depth_ind<= 15000)
IDlist_5000_15000<-baits5000_15000[1]
#write.table(IDlist_5000_15000,"IDlist_5000_15000.txt",quote=FALSE,row.names = FALSE)
#This file can now be used to filter the transcriptome to keep gene targets.
#Depth/kb for the selected gene targets.
(depth_kb_targets<-get_results("mean depth/kb gene targets",baits5000_15000$reads_kb))
## note_about_stat mean min max err
## 1 mean depth/kb gene targets 0.84536 0.644898 1.003774 0.001958985
## up lo
## 1 0.847319 0.843401
#Transform for plotting.
bb_stats$lsum_reads<-log2(bb_stats$sum_reads)
#ggplot(aes(x=inv_depth_ind,y=lsum_reads),data=bb_stats)+
#xlab("Depth")+
#ylab("Mapped Reads")+
#geom_line(color="darkred")+
#theme_classic()
baitscript_index<-ggplot(aes(x=inv_depth_ind,y=lsum_reads),data=bb_stats)+
xlab("Inverse Depth Index")+
ylab(expression(Log[2]*" Mapped Reads"))+
geom_bar(stat="identity")+
theme_classic()+
geom_vline(xintercept = 5000,color="darkred",size=1)+
geom_vline(xintercept = 15000,color="darkred",size=1)
Our 10,001 gene targets had an inverse depth index of 5,000 to 15,000 and ranged from 0.6 to 1.0 in depth/kb with a mean depth/kb of 0.8. Figure 2 shows the distribution of Log2(Mapped Reads) for the transcriptome and the inverse depth index range of our selected gene targets.
baitscript_index
First, we calculated sequence yield for each specimen. To do this we used the unix command grep to count the number of sequence reads for each individual (lines beginning with @), and we pasted on filenames.
Usage:
grep -c "^@" *R1.fastq.gz > yield.txt
ls -1 *R1.fastq.gz > specimens.txt
paste specimens.txt yield.txt > ch1_yield.txt
Then, we mapped the sequenced reads from each individual to the gene targets and used to coverage stats to estimate the on-target yield. For this analysis, all sequenced reads from all specimens were used.
setwd("~/Documents/Cedrela/Ch1_Genome_Resources/Ch1_DA/covstats_yield/")
filenames<-get_files("*_yield_stats.txt")
ldf<-readin_files(filenames)
spec_ids<-get_spec_ids(filenames,"specimen")
spp<-get_spp(filenames,"species")
target_ids<-get_target_ids(filenames,"target_id")
reads_ontarget<-get_reads_mapped(ldf)
yield_df<-make_df(reads_ontarget,spec_ids,target_ids)
yield_tdf<-make_tdf(reads_ontarget,spec_ids,target_ids,spp)
ontarget_depth<-get_depth(ldf)
ontarget_depth<-make_df(ontarget_depth,spec_ids,target_ids)
#add columns and combine sequenced reads and on-target data frames.
yield_df$target_totals<-rowSums(yield_df[2:44])
yield<-read.table("ch1_yield.txt")
yield_spec_ids<-data.frame(str_extract(yield$V1,"[C,G,S,T]...[_]\\d{0,3}"))
long_ids<-lapply(yield$V1,as.character)
yield_spec_ids<-get_spec_ids(long_ids,"specimen")
yield_spp<-get_spp(long_ids,"species")
yield<-cbind.data.frame(yield_spec_ids,yield_spp,yield$V2)
names(yield)[3]<-c("reads_sequenced")
yield_tdf$reads_ontarget<-rowSums(yield_tdf[3:10003])
yield_tdf_full<-full_join(yield,yield_tdf)
yield_tdf_full$ind_mean_ontarget<-apply(yield_tdf_full[c(3:10003)],1,mean)
yield_tdf_full<-mutate(yield_tdf_full,ind_percent_ontarget = (reads_ontarget/reads_sequenced)*100)
ontarget_depth$target_mean_depth<-apply(ontarget_depth[c(2:43)],1,mean)
total_paired_sequence_yield<-sum(yield_tdf_full$reads_sequenced)
total_sequence_yield<-(total_paired_sequence_yield)*2
total_bp_yield<-total_sequence_yield*101
rbind.data.frame(make_row("total paired sequence yield",total_paired_sequence_yield),make_row("total sequence yield",total_sequence_yield),make_row("total bp yield",total_bp_yield))
## result stat
## 1 total paired sequence yield 365883143
## 2 total sequence yield 731766286
## 3 total bp yield 73908394886
(sequence_yield<-get_results("sequence yield",yield_tdf_full$reads_sequenced))
## note_about_stat mean min max err up lo
## 1 sequence yield 8508910 438310 29081506 1775064 10283974 6733847
specimens_high_yield<-nrow(data.frame(filter(yield_tdf_full,reads_sequenced>=sequence_yield$up)))
specimens_low_yield<-nrow(data.frame(filter(yield_tdf_full,reads_sequenced<=sequence_yield$lo)))
rbind.data.frame(make_row("Number of specimens with significantly greater sequenced reads than mean yield",specimens_high_yield),make_row("Number of specimens with significantly fewer sequenced reads than mean yield",specimens_low_yield))
## result
## 1 Number of specimens with significantly greater sequenced reads than mean yield
## 2 Number of specimens with significantly fewer sequenced reads than mean yield
## stat
## 1 13
## 2 20
specimens_high_yield_df<-data.frame(filter(yield_tdf_full,reads_sequenced>=sequence_yield$up))
specimens_low_yield_df<-data.frame(filter(yield_tdf_full,reads_sequenced<=sequence_yield$lo))
summary(specimens_high_yield_df$species)
## CEAN CEFI CEMO CEOD CESA GUGU SWMA TRTU
## 0 3 0 2 1 4 2 1
summary(specimens_low_yield_df$species)
## CEAN CEFI CEMO CEOD CESA GUGU SWMA TRTU
## 1 6 1 4 2 2 0 4
yield_tdf_full$genus<-str_extract(yield_tdf_full$species,"[C,G,S,T]")
yield_tdf_genus<-cbind.data.frame(yield_tdf_full$genus,yield_tdf_full$reads_sequenced)
names(yield_tdf_genus)<-c("genus","reads_sequenced")
(genus_mean_sequence_yield<-yield_tdf_genus%>%group_by(genus)%>%summarise_all(funs(mean)))
## # A tibble: 4 x 2
## genus reads_sequenced
## <fctr> <dbl>
## 1 C 7535987
## 2 G 9762405
## 3 S 17513210
## 4 T 7481282
ggplot(aes(x=species,y=reads_sequenced,fill=species),data=yield_tdf_full)+geom_bar(stat="identity")+
theme_classic()+
scale_fill_brewer(palette = "Dark2")+
scale_x_discrete(labels=c("CEAN"="C. angustifolia","CEFI"="C. fissilis","CEMO"="C. montana","CEOD"="C. odorata","CESA"="C. saltensis","GUGU"="G.guidonia","SWMA"="S. mahagoni","TRTU"="T. tuberculata"))+
theme(legend.position = "none",
axis.text.x = element_text(face="italic"))+
xlab("Species")+ylab("Sequence Yield")+ggtitle("Species Yield")
ggplot(aes(x=specimen,y=reads_sequenced,fill=species),data=yield_tdf_full)+
geom_bar(stat="identity")+theme_classic()+
xlab("Specimen")+ylab("Sequence Yield")+
scale_fill_brewer(palette = "Dark2",name="Species",labels=c("CEAN"="C. angustifolia","CEFI"="C. fissilis","CEMO"="C. montana","CEOD"="C. odorata","CESA"="C. saltensis","GUGU"="G.guidonia","SWMA"="S. mahagoni","TRTU"="T. tuberculata"))+
theme(axis.text.x = element_text(angle=90),
legend.text = element_text(face = "italic"))+
geom_hline(yintercept=sequence_yield$up, color="darkred",size=1)+
geom_hline(yintercept=sequence_yield$lo, color="darkred",size=1)+
geom_hline(yintercept=sequence_yield$mean,size=1)+
ggtitle("Specimen Yield Relative to Mean and 95% C. I.")
ontarget_depth_mean<-get_results("average mean depth per target",ontarget_depth$target_mean_depth)
failed_targets_full<-nrow(filter(ontarget_depth,target_mean_depth == 0))
targets_above_av_depth_full<-nrow(filter(ontarget_depth,target_mean_depth >= ontarget_depth_mean$mean))
high_mapping_targets_full<-nrow(filter(ontarget_depth,target_mean_depth >= ontarget_depth_mean$up))
targets_below_av_depth_full<-nrow(filter(ontarget_depth,target_mean_depth <= ontarget_depth_mean$mean))
rbind.data.frame(make_row("failed targets all reads",failed_targets_full),
make_row("above average depth targets",targets_above_av_depth_full),
make_row("high depth targets",high_mapping_targets_full),
make_row("below average depth targets",targets_below_av_depth_full))
## result stat
## 1 failed targets all reads 206
## 2 above average depth targets 3812
## 3 high depth targets 3725
## 4 below average depth targets 6189
on_target_yield<-get_results("on target yield",yield_tdf_full$reads_ontarget)
specimens_high_ontarget<-nrow(data.frame(filter(yield_tdf_full,reads_ontarget>=on_target_yield$up)))
specimens_low_ontarget<-nrow(data.frame(filter(yield_tdf_full,reads_ontarget<=on_target_yield$lo)))
rbind.data.frame(make_row("Number of specimens with significantly greater on-target sequenced reads than mean on-target yield",specimens_high_ontarget),make_row("Number of specimens with significantly fewer on-target sequenced reads than mean on-target yield",specimens_low_ontarget))
## result
## 1 Number of specimens with significantly greater on-target sequenced reads than mean on-target yield
## 2 Number of specimens with significantly fewer on-target sequenced reads than mean on-target yield
## stat
## 1 10
## 2 23
specimens_high_ontarget_df<-data.frame(filter(yield_tdf_full,reads_ontarget>=on_target_yield$up))
specimens_low_ontarget_df<-data.frame(filter(yield_tdf_full,reads_ontarget<=on_target_yield$lo))
summary(specimens_high_ontarget_df$species)
## CEAN CEFI CEMO CEOD CESA GUGU SWMA TRTU
## 1 4 0 4 1 0 0 0
summary(specimens_low_ontarget_df$species)
## CEAN CEFI CEMO CEOD CESA GUGU SWMA TRTU
## 1 3 0 1 1 10 0 7
(ind_percent_ontarget<-get_results("individual percent on target",yield_tdf_full$ind_percent_ontarget))
## note_about_stat mean min max err
## 1 individual percent on target 19.75343 0.8108261 41.42799 5.41383
## up lo
## 1 25.16726 14.3396
ggplot(aes(x=species,y=reads_ontarget,fill=species),data=yield_tdf_full)+geom_bar(stat="identity")+
theme_classic()+
scale_fill_brewer(palette = "Dark2")+
scale_x_discrete(labels=c("CEAN"="C. angustifolia","CEFI"="C. fissilis","CEMO"="C. montana","CEOD"="C. odorata","CESA"="C. saltensis","GUGU"="G.guidonia","SWMA"="S. mahagoni","TRTU"="T. tuberculata"))+
theme(legend.position = "none",
axis.text.x = element_text(face="italic"))+
xlab("Species")+ylab("On-Target Sequence Yield")+ggtitle("Species On-Target Yield")
ggplot(aes(x=specimen,y=reads_ontarget,fill=species),data=yield_tdf_full)+
geom_bar(stat="identity")+theme_classic()+
xlab("Specimen")+ylab("On-Target Sequence Yield")+
scale_fill_brewer(palette = "Dark2",name="Species",labels=c("CEAN"="C. angustifolia","CEFI"="C. fissilis","CEMO"="C. montana","CEOD"="C. odorata","CESA"="C. saltensis","GUGU"="G.guidonia","SWMA"="S. mahagoni","TRTU"="T. tuberculata"))+
theme(axis.text.x = element_text(angle=90),
legend.text = element_text(face = "italic"))+
geom_hline(yintercept=on_target_yield$up, color="darkred",size=1)+
geom_hline(yintercept=on_target_yield$lo, color="darkred",size=1)+
geom_hline(yintercept=on_target_yield$mean,size=1)+
ggtitle("Specimen On-Target Yield Relative to Mean and 95% C. I.")
ggplot(aes(x=target_mean_depth),data = ontarget_depth)+geom_histogram(binwidth=1)+
theme_classic()+
xlab("Mean Depth of Coverage")+
ylab("Count of Gene Targets")+
geom_vline(xintercept=ontarget_depth_mean$lo,color="darkred")+
geom_vline(xintercept=ontarget_depth_mean$up,color="darkred")+
geom_vline(xintercept=ontarget_depth_mean$mean,color="darkred")+
#geom_vline(xintercept=10,color="forestgreen")+
ggtitle("Depth of Coverage Distribution for Gene Targets (All Reads)")
ontarget_genus<-cbind.data.frame(yield_tdf_full$genus,yield_tdf_full$reads_ontarget)
names(ontarget_genus)<-c("genus","reads_ontarget")
test_ced<-filter(ontarget_genus,genus == "C")
test_other<-filter(ontarget_genus,genus != "C")
test_ced<-test_ced[2]
test_other<-test_other[2]
t.test(test_ced, test_other, alternative = "two.sided")
##
## Welch Two Sample t-test
##
## data: test_ced and test_other
## t = 4.9531, df = 23.859, p-value = 4.755e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1519070 3690557
## sample estimates:
## mean of x mean of y
## 2814485.6 209672.1
test_ced$genus<-"C"
test_other$genus<-"O"
test<-rbind.data.frame(test_ced,test_other)
ggplot(aes(x=genus,y=reads_ontarget),data=test)+
scale_x_discrete(labels=c("C"="Cedrela","O"="Other Meliaceae"))+
geom_boxplot()+
theme_classic()+
xlab("Genus")+ylab("Reads On-Target")
trans_gf<-read.table("~/Documents/Cedrela/Ch1_Genome_Resources/Ch1_DA/CEOD_bait_trans_gf.txt",header=1,fill = TRUE)
trans_gf[1]<-NULL
ontarget_depth[c(43,45)]<-NULL
ontarget_depth[c(2:43)]<-if_else(ontarget_depth[c(2:43)] >= 10, 1, 0)
names(trans_gf)[1]<-"target_id"
ontarget_depth<-inner_join(ontarget_depth,trans_gf)
ontarget_depth$gf_id<-if_else(ontarget_depth$gf_id == "", "none","available")
cean<-data.frame(select(ontarget_depth,target_id,gf_id,contains("AN")))
cefi<-data.frame(select(ontarget_depth,target_id,gf_id,contains("FI")))
cemo<-data.frame(select(ontarget_depth,target_id,gf_id,contains("MO")))
ceod<-data.frame(select(ontarget_depth,target_id,gf_id,contains("OD")))
cesa<-data.frame(select(ontarget_depth,target_id,gf_id,contains("SA")))
gugu<-data.frame(select(ontarget_depth,target_id,gf_id,contains("GU")))
swma<-data.frame(select(ontarget_depth,target_id,gf_id,contains("SW")))
trtu<-data.frame(select(ontarget_depth,target_id,gf_id,contains("TR")))
cean$sums<-rowSums(select(cean,-target_id,-gf_id))
cefi$sums<-rowSums(select(cefi,-target_id,-gf_id))
cemo$sums<-rowSums(select(cemo,-target_id,-gf_id))
ceod$sums<-rowSums(select(ceod,-target_id,-gf_id))
cesa$sums<-rowSums(select(cesa,-target_id,-gf_id))
gugu$sums<-rowSums(select(gugu,-target_id,-gf_id))
swma$sums<-rowSums(select(swma,-target_id,-gf_id))
trtu$sums<-rowSums(select(trtu,-target_id,-gf_id))
cean$species<-"cean"
cefi$species<-"cefi"
cemo$species<-"cemo"
ceod$species<-"ceod"
cesa$species<-"cesa"
gugu$species<-"gugu"
swma$species<-"swma"
trtu$species<-"trtu"
cean<-data.frame(filter(cean,sums >=2))
cefi<-data.frame(filter(cefi,sums >=10))
cemo<-data.frame(filter(cemo,sums >=1))
ceod<-data.frame(filter(ceod,sums >=7))
cesa<-data.frame(filter(cesa,sums >=3))
gugu<-data.frame(filter(gugu,sums >=10))
swma<-data.frame(filter(swma,sums >=2))
trtu<-data.frame(filter(trtu,sums >=7))
genes_retained<-rbind.data.frame(select(cean,target_id,gf_id,sums,species),
select(cefi,target_id,gf_id,sums,species),
select(cemo,target_id,gf_id,sums,species),
select(ceod,target_id,gf_id,sums,species),
select(cesa,target_id,gf_id,sums,species),
select(gugu,target_id,gf_id,sums,species),
select(swma,target_id,gf_id,sums,species),
select(trtu,target_id,gf_id,sums,species))
ggplot(aes(x=species,fill=gf_id,color=species),data=genes_retained)+
geom_bar(size=1)+
scale_color_brewer(palette = "Dark2")+
scale_x_discrete(labels=c("cean" = "C. angustifolia", "cefi" = "C. fissilis",
"cemo" = "C. montana", "ceod" = "C. odorata",
"cesa" = "C. saltensis","swma"="S. mahagoni",
"trtu"="T. tuberculata","gugu"="G. guidonia"))+
scale_fill_manual(values=c("white","darkgray"))+ #white is available
xlab("Species")+
ylab("Genes Retained at Depth 10")+
#ylim(c(0,6050))+
theme_classic()+
theme(legend.position = "none",
axis.text.x=element_text(face="italic",size=6.5))+
geom_text(stat='count', aes(label=..count..,y=..count..+20, fill="none"), vjust=-1,color="black",size=2.5)
We generated a vcf for the Cedrela experimental specimens using the gene targets as the reference to better understand the amount and quality of information available after our target capture experiment. The estimates below are from vcftools.
vcftools --gzvcf ch1_ceds_baitscripts_w254.vcf.gz
#After filtering, kept 444979 out of a possible 444979 Sites
vcftools --gzvcf ch1_ceds_baitscripts_w254.vcf.gz --remove-indels --recode --out 20810510_temp
#After filtering, kept 416910 out of a possible 444979 Sites
vcftools --vcf 20810510_temp.recode.vcf --max-alleles 2 --min-alleles 2 --recode --out 20810510_temp2
#After filtering, kept 399117 out of a possible 416910 Sites
vcftools --vcf 20810510_temp2.recode.vcf --missing-indv --out 20810510_temp2
vcftools --vcf 20810510_temp2.recode.vcf --max-missing 1.0 --recode --out 20810510_temp3
#After filtering, kept 215799 out of a possible 399117 Sites
#Make quality, frequency, hwe files, and calculate fst.
vcftools --vcf 20810510_temp3.recode.vcf --site-quality --out 20180511_SNPs
vcftools --vcf 20810510_temp3.recode.vcf --freq --out 20180511_SNPs
sed -i 's/{ALLELE:FREQ}/major_al\tmajor_al_frq\tminor_al\tminor_al_frq/g' 20180511_SNPs.frq
sed -i 's/:/\t/g' 20180511_SNPs.frq
vcftools --vcf 20810510_temp3.recode.vcf --hardy --out 20180511_SNPs
vcftools --vcf 20810510_temp3.recode.vcf --weir-fst-pop 20180511_angustifolia.txt --weir-fst-pop 20180511_fissilis.txt --weir-fst-pop 20180511_odorata.txt --weir-fst-pop 20180511_saltensis.txt --out 20180511_global_fst
Here we will generated a table of SNPs counts and categories. We will also generate a figure that shows the distribution of FST for SNPs with 0% missing data. The distribution will be color coded to indicate if the gene target has gene family association data (TRAPID via PLAZA 2.5).
setwd("~/Documents/Cedrela/Ch1_Genome_Resources/Ch1_DA/")
#we will assign the values now and make a summary table for the end of this section.
detected_variants<-444979
non_indel_variants<-416910
indels<-detected_variants - non_indel_variants
percent_indels<-(indels/detected_variants)*100
percent_non_indel_variants<-(non_indel_variants/detected_variants)*100
biallelic<-399117
multiallelic<-non_indel_variants - biallelic
percent_multiallelic<-(multiallelic/detected_variants)*100
percent_biallelic<-(biallelic/detected_variants)*100
ind_missingness<-read.table("20180510_ind_missingness.txt",header=1)
ind_missingness<-mutate(ind_missingness,percent_missing = F_MISS*100)
(ind_missingness_result<-get_results("individual missingness",ind_missingness$percent_missing))
## note_about_stat mean min max err up
## 1 individual missingness 19.76394 7.74535 35.9258 3.590661 23.3546
## lo
## 1 16.17328
biallelic_0missing<-215799
percent_biallelic_0missing<-(biallelic_0missing/detected_variants)*100
biallelic_other<-biallelic - biallelic_0missing
percent_biallelic_other<-(biallelic_other/detected_variants)*100
quality<-read.table("20180511_SNPs.lqual",header=1)
frequency<-read.table("20180511_SNPs.frq",header=1)
fst<-read.table("20180511_global_fst.weir.fst",header=1)
snp_stats<-full_join(quality,frequency)
snp_stats<-full_join(snp_stats,fst)
snp_stats[snp_stats == "NaN"] <- NA
summary(snp_stats[c(3,6:10)])
## QUAL major_al major_al_frq minor_al minor_al_frq
## Min. : 3.011 A:56640 Min. :0.0000 A:58192 Min. :0.02174
## 1st Qu.:214.000 C:51128 1st Qu.:0.6739 C:47866 1st Qu.:0.02174
## Median :664.000 G:51934 Median :0.8913 G:49539 Median :0.10870
## Mean :592.297 T:56097 Mean :0.7780 T:60202 Mean :0.22204
## 3rd Qu.:999.000 3rd Qu.:0.9783 3rd Qu.:0.32609
## Max. :999.000 Max. :0.9783 Max. :1.00000
##
## WEIR_AND_COCKERHAM_FST
## Min. :-0.209
## 1st Qu.:-0.035
## Median : 0.057
## Mean : 0.171
## 3rd Qu.: 0.257
## Max. : 1.000
## NA's :18215
#Bring in file containing gene family associations for full transcriptome. We will join this to the vcf data so that we can find the proportion of SNPs with FST information for the distribution figures.
trans_gf<-read.table("~/Documents/Cedrela/Ch1_Genome_Resources/Ch1_DA/CEOD_bait_trans_gf.txt",header=1,fill = TRUE)
trans_gf[1]<-NULL
names(trans_gf)[1]<-"CHROM"
snp_stats_gfs<-inner_join(snp_stats,trans_gf)
snp_stats_gfs$gf_id<-if_else(snp_stats_gfs$gf_id == "", "none","available")
summary(data.frame(snp_stats_gfs$gf_id))
## snp_stats_gfs.gf_id
## available:127026
## none : 88773
#this worked if none: 88773
snp_stats_gfs_MAF5<-filter(snp_stats_gfs, minor_al_frq >= 0.05)
biallelic_0missing_maf5<-nrow(snp_stats_gfs_MAF5)
percent_biallelic_0missing_maf5<-(biallelic_0missing_maf5/detected_variants)*100
snp_stats_gfs_MAF5_qual500<-filter(snp_stats_gfs_MAF5, QUAL >=500)
biallelic_0missing_maf5_q500<-nrow(snp_stats_gfs_MAF5_qual500)
percent_biallelic_0missing_maf5_q500<-(biallelic_0missing_maf5_q500/detected_variants)*100
snp_stats_MAF5_qual500_fst.5<-filter(snp_stats_gfs_MAF5_qual500,
WEIR_AND_COCKERHAM_FST >= 0.5)
snp_stats_MAF5_qual500_fst1<-filter(snp_stats_gfs_MAF5_qual500,
WEIR_AND_COCKERHAM_FST >= 1.0)
snp_stats_MAF5_qual500_fst1_wgf_id<-filter(snp_stats_gfs_MAF5_qual500,
WEIR_AND_COCKERHAM_FST >= 1.0 & gf_id == "available")
biallelic_0missing_MAF5_qual500_fst.5<-nrow(snp_stats_MAF5_qual500_fst.5)
biallelic_0missing_MAF5_qual500_fst1<-nrow(snp_stats_MAF5_qual500_fst1)
biallelic_0missing_MAF5_qual500_fst1_wgf<-nrow(snp_stats_MAF5_qual500_fst1_wgf_id)
(vcf_table<-rbind.data.frame(make_row("detected variants",detected_variants),
make_row("non-indel variants",non_indel_variants),
make_row("indels",indels),
make_row("percent indels",percent_indels),
make_row("percent non-indel variants",percent_non_indel_variants),
make_row("biallelic variants",biallelic),
make_row("multiallelic variants",multiallelic),
make_row("percent biallelic variants",percent_biallelic),
make_row("biallelic variants with 0% missing information",biallelic_0missing),
make_row("percent biallelic variants with 0% missing information",percent_biallelic_0missing),
make_row("biallelic variants with missing information",biallelic_other),
make_row("percent biallelic variants with missing information",percent_biallelic_other),
make_row("biallelic variants with 0% missing information MAF>5%",biallelic_0missing_maf5),
make_row("percent biallelic variants with 0% missing information MAF>5%",percent_biallelic_0missing_maf5),
make_row("biallelic variants with 0% missing information MAF>5% & quality>500",biallelic_0missing_maf5_q500),
make_row("biallelic variants with 0% missing information MAF>5% & quality>500",percent_biallelic_0missing_maf5_q500),
make_row("percent biallelic SNPs with 0% missing, MAF>5% & quality>500 & FST >0.5",biallelic_0missing_MAF5_qual500_fst.5),
make_row("biallelic variants with 0% missing information MAF>5% & quality>500 & FST 1",biallelic_0missing_MAF5_qual500_fst1),
make_row("biallelic variants with 0% missing information MAF>5% & quality>500 & FST 1 & gene family information",biallelic_0missing_MAF5_qual500_fst1_wgf)))
## result
## 1 detected variants
## 2 non-indel variants
## 3 indels
## 4 percent indels
## 5 percent non-indel variants
## 6 biallelic variants
## 7 multiallelic variants
## 8 percent biallelic variants
## 9 biallelic variants with 0% missing information
## 10 percent biallelic variants with 0% missing information
## 11 biallelic variants with missing information
## 12 percent biallelic variants with missing information
## 13 biallelic variants with 0% missing information MAF>5%
## 14 percent biallelic variants with 0% missing information MAF>5%
## 15 biallelic variants with 0% missing information MAF>5% & quality>500
## 16 biallelic variants with 0% missing information MAF>5% & quality>500
## 17 percent biallelic SNPs with 0% missing, MAF>5% & quality>500 & FST >0.5
## 18 biallelic variants with 0% missing information MAF>5% & quality>500 & FST 1
## 19 biallelic variants with 0% missing information MAF>5% & quality>500 & FST 1 & gene family information
## stat
## 1 4.449790e+05
## 2 4.169100e+05
## 3 2.806900e+04
## 4 6.307938e+00
## 5 9.369206e+01
## 6 3.991170e+05
## 7 1.779300e+04
## 8 8.969345e+01
## 9 2.157990e+05
## 10 4.849645e+01
## 11 1.833180e+05
## 12 4.119700e+01
## 13 1.382450e+05
## 14 3.106776e+01
## 15 1.190200e+05
## 16 2.674733e+01
## 17 2.207400e+04
## 18 8.817000e+03
## 19 4.281000e+03
#the number formating on the table is fixed when you write it to a text file.
Combined all and highest quality distributions of FST.
ggplot()+
geom_histogram(aes(x=WEIR_AND_COCKERHAM_FST),
data=snp_stats_gfs,binwidth = 0.1,fill="lightgray",color="lightgray")+
geom_histogram(aes(x=WEIR_AND_COCKERHAM_FST,fill = gf_id,color="black"),
data=snp_stats_gfs_MAF5_qual500,binwidth = 0.1)+
theme_classic()+
theme(legend.position = "none")+
scale_fill_manual(values=c("white","darkgray"))+
scale_color_manual(values=c("black"))+
xlab(expression("Weir and Cockerham F"[ST]))+
ylab("Count of SNPs")
We tried to include C. odorata 287 in the vcf, but the usable SNPs decreased significantly with that specimen. This specimen had a high percent of missing information.
detected_variants_287<-460752
non_indel_variants_287<-431573
indels_287<-detected_variants_287 - non_indel_variants_287
percent_indels_287<-(indels_287/detected_variants_287)*100
percent_non_indel_variants_287<-(non_indel_variants_287/detected_variants_287)*100
biallelic_287<-411932
multiallelic_287<-non_indel_variants_287 - biallelic_287
percent_multiallelic_287<-(multiallelic_287/detected_variants_287)*100
percent_biallelic_287<-(biallelic_287/detected_variants_287)*100
ind_missingness_287<-read.table("20180512_ind_missingness_287.imiss",header=1)
ind_missingness_287<-mutate(ind_missingness_287,percent_missing_287 = F_MISS*100)
(ind_missingness_287_result<-get_results("individual missingness with 287",ind_missingness_287$percent_missing_287))
## note_about_stat mean min max err
## 1 individual missingness with 287 21.70685 7.5493 78.9135 6.127989
## up lo
## 1 27.83484 15.57886
biallelic_0missing_287<-80728
percent_biallelic_0missing_287<-(biallelic_0missing_287/detected_variants_287)*100
biallelic_other_287<-biallelic_287 - biallelic_0missing_287
percent_biallelic_other_287<-(biallelic_other_287/detected_variants_287)*100
(vcf_table_w287<-rbind.data.frame(make_row("detected variants",detected_variants_287),
make_row("non-indel variants",non_indel_variants_287),
make_row("indels",indels_287),
make_row("percent indels",percent_indels_287),
make_row("percent non-indel variants",percent_non_indel_variants_287),
make_row("biallelic",biallelic_287),
make_row("multiallelic",multiallelic_287),
make_row("percent multiallelic",percent_multiallelic_287),
make_row("percent biallelic",percent_biallelic_287),
make_row("biallelic & 0% missing information",biallelic_0missing_287),
make_row("percent biallelic & 0% missing information",percent_biallelic_0missing_287),
make_row("biallelic with missing information",biallelic_other_287),
make_row("percent biallelic with missing information",percent_biallelic_other_287)))
## result stat
## 1 detected variants 4.607520e+05
## 2 non-indel variants 4.315730e+05
## 3 indels 2.917900e+04
## 4 percent indels 6.332908e+00
## 5 percent non-indel variants 9.366709e+01
## 6 biallelic 4.119320e+05
## 7 multiallelic 1.964100e+04
## 8 percent multiallelic 4.262814e+00
## 9 percent biallelic 8.940428e+01
## 10 biallelic & 0% missing information 8.072800e+04
## 11 percent biallelic & 0% missing information 1.752092e+01
## 12 biallelic with missing information 3.312040e+05
## 13 percent biallelic with missing information 7.188336e+01
For this analysis, we mapped the 1 million sequence reads from each individual to the gene targets to assess target capture efficiency and potential ascertainment bias. C. odorata 287 was excluded from this analysis due to low yield (fewer than 1 million reads).
setwd("~/Documents/Cedrela/Ch1_Genome_Resources/Ch1_DA/covstats_baitscripts/")
filenames<-get_files("*_baitscripts_covstats.txt")
ldf<-readin_files(filenames)
read_depth<-get_depth(ldf)
spec_ids<-get_spec_ids(filenames,"specimen")
spp<-get_spp(filenames,"species")
target_ids<-get_target_ids(filenames,"target_id")
eff_df<-make_df(read_depth,spec_ids,target_ids)
eff_tdf<-make_tdf(read_depth,spec_ids,target_ids,spp)
#depth/ind
ind_mean_depth<-data.frame(eff_tdf[c(1:2)])
ind_mean_depth$ind_min_depth<-apply(eff_tdf[c(3:10003)],1,min)
ind_mean_depth$ind_max_depth<-apply(eff_tdf[c(3:10003)],1,max)
ind_mean_depth$ind_mean_depth<-apply(eff_tdf[c(3:10003)],1,mean)
(mean_ind_mean_depth<-get_results("average mean depth per individual",ind_mean_depth$ind_mean_depth))
## note_about_stat mean min max err
## 1 average mean depth per individual 7.496515 0.5240102 15.02406 1.908875
## up lo
## 1 9.40539 5.58764
spp_mean_depth<-ind_mean_depth[c(2:5)]%>%group_by(species)%>%summarise_all(funs(mean))
names(spp_mean_depth)<-c("species","species_min_depth","species_max_depth","species_mean_depth")
(spp_mean_depth)
## # A tibble: 8 x 4
## species species_min_depth species_max_depth species_mean_depth
## <fctr> <dbl> <dbl> <dbl>
## 1 CEAN 0 142.31078 13.3520579
## 2 CEFI 0 180.34506 12.3402162
## 3 CEMO 0 180.35714 14.4757704
## 4 CEOD 0 186.24124 13.2056185
## 5 CESA 0 105.46936 12.7056851
## 6 GUGU 0 93.97078 0.8354956
## 7 SWMA 0 111.31524 3.4072255
## 8 TRTU 0 70.90823 0.6494003
#depth/target
target_mean_depth<-data.frame(eff_df[1])
target_mean_depth$target_min_depth<-apply(eff_df[c(2:43)],1,min)
target_mean_depth$target_max_depth<-apply(eff_df[c(2:43)],1,max)
target_mean_depth$target_mean_depth<-apply(eff_df[c(2:43)],1,mean)
(mean_target_mean_depth<-get_results("average mean depth per target",target_mean_depth$target_mean_depth))
## note_about_stat mean min max err up
## 1 average mean depth per target 7.496515 0 86.52464 0.1268354 7.623351
## lo
## 1 7.36968
#data for depth figures
depths<-melt(eff_tdf)
depths$lval<-log2(depths$value)
ind_mean_depth$genus<-str_extract(ind_mean_depth$species,"[C,G,S,T]")
mean_depth_genus<-cbind.data.frame(ind_mean_depth$genus,ind_mean_depth$ind_mean_depth)
names(mean_depth_genus)<-c("genus","mean_depth")
test_ced<-filter(mean_depth_genus,genus == "C")
test_other<-filter(mean_depth_genus,genus != "C")
test_ced<-test_ced[2]
test_other<-test_other[2]
t.test(test_ced, test_other, alternative = "two.sided")
##
## Welch Two Sample t-test
##
## data: test_ced and test_other
## t = 26.915, df = 31.645, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 10.90145 12.68748
## sample estimates:
## mean of x mean of y
## 12.832106 1.037643
test_ced$genus<-"C"
test_other$genus<-"O"
test<-rbind.data.frame(test_ced,test_other)
ggplot(aes(x=genus,y=mean_depth),data=test)+
scale_x_discrete(labels=c("C"="Cedrela","O"="Other Meliaceae"))+
geom_boxplot()+
theme_classic()+
xlab("Genus")+ylab("Reads On-Target")
ggplot(aes(x=specimen,y=ind_mean_depth,fill=species),data = ind_mean_depth)+
geom_bar(stat="identity")+theme_classic()+
xlab("Specimen")+ylab("Specimen Mean Depth of Coverage")+
scale_fill_brewer(palette = "Dark2",name="Species",
labels=c("CEAN"="C. angustifolia","CEFI"="C. fissilis",
"CEMO"="C. montana","CEOD"="C. odorata",
"CESA"="C. saltensis","GUGU"="G.guidonia",
"SWMA"="S. mahagoni","TRTU"="T. tuberculata"))+
theme(axis.text.x = element_text(angle=90),
legend.text = element_text(face = "italic"))+
theme(axis.text.x = element_text(angle=90))+
geom_hline(yintercept=mean_ind_mean_depth$lo,size=1, color="darkred")+
geom_hline(yintercept=mean_ind_mean_depth$up,size=1, color="darkred")+
geom_hline(yintercept=mean_ind_mean_depth$mean,size=1)+
ggtitle("Specimen Mean Depth of Coverage to Grand Mean and 95% C. I.")
ggplot(aes(x=target_mean_depth),data = target_mean_depth)+geom_histogram(binwidth=1)+
theme_classic()+
xlab("Mean Depth of Coverage")+
ylab("Count of Gene Targets")+
geom_vline(xintercept=mean_target_mean_depth$lo,color="darkred")+
geom_vline(xintercept=mean_target_mean_depth$up,color="darkred")+
geom_vline(xintercept=mean_target_mean_depth$mean,color="darkred")+
geom_vline(xintercept=10,color="forestgreen")+
ggtitle("Depth of Coverage Distribution for Gene Targets Relative to 95% C. I.")
ggplot(aes(x=species,y=lval,color=species,fill=species),data=depths)+
geom_violin()+
scale_color_brewer(palette="Dark2")+
scale_fill_brewer(palette="Dark2")+
scale_x_discrete(labels=c("CEAN" = "C. angustifolia", "CEFI" = "C. fissilis", "CEMO" = "C. montana", "CEOD"="C. odorata", "CESA" = "C. saltensis","GUGU"="G. guidonia","SWMA"="S. mahagoni","TRTU"="T. tuberculata"))+
stat_summary(fun.y=mean, geom="point", shape=23, size=2,color="black")+
theme_classic()+theme(legend.position="none",
axis.text.x=element_text(face="italic"))+
labs(x="Species", y=expression(Log[2]*" Depth of Coverage"))+
ggtitle("Species Depth of Coverage Distributions Relative to Species Mean")
ggplot(aes(x=lval,color=species),data=depths)+
geom_density(size=1)+
scale_color_brewer(palette="Dark2")+
theme_classic()+theme(legend.position="none",
axis.text.x=element_text(face="italic"))+
labs(x=expression(Log[2]*" Depth of Coverage"),y="Kernal Density Estimate")+
ggtitle("Species Depth of Coverage Density Distributions")
Once assembled, we used the draft chloroplast genome from the reference specimen as a reference to guide the assembly of chloroplast genomes for the experimental specimens. We used the coverage statistics to estimate chloroplast coverage and other metrics.
We also used a custom python program (percent_ns.py) to calculate the percent N bases (bases with coverare ≤ 2X).
#Usage: percent_ns.py file.fasta output.txt
import io
import sys
program = sys.argv[0]
file = sys.argv[1]
output = sys.argv[2]
if len(sys.argv) != 3:
print("too many args")
quit()
class CP_genome:
def __init__(self, id, seq):
self.id = id
self.seq = seq
def per_missing(self):
counter = 0
seq_len= len(self.seq)
for index in range(0,seq_len):
base = self.seq[index]
if base == "N":
counter = counter + 1
per_base = (counter/float(seq_len))*100
return per_base
from Bio import SeqIO
fhandle = io.open(file, "rU")
outhandle = io.open(output, "w")
for record in SeqIO.parse(fhandle, "fasta"):
cpobj = CP_genome(record.id, record.seq)
percent_ns = cpobj.per_missing()
outhandle.write(unicode(record.id) + "\t" + str(percent_ns) + "\n")
setwd("~/Documents/Cedrela/Ch1_Genome_Resources/Ch1_DA/covstats_chloroplast/")
filenames<-get_files("*_cp_covstats.txt")
ldf<-readin_files(filenames)
cp_reads_mapped<-get_reads_mapped(ldf)
spec_ids<-get_spec_ids(filenames,"specimen")
spp<-get_spp(filenames,"species")
target_ids<-get_target_ids(filenames,"target_id")
#cp_df<-make_df(read_depth,spec_ids,target_ids)
cp_tdf<-make_tdf(cp_reads_mapped,spec_ids,target_ids,spp)
#cp yield
names(cp_tdf)[3]<-"cp_mapped"
cp_tdf<-full_join(yield,cp_tdf)
cp_tdf<-mutate(cp_tdf,per_cp_mapped = (cp_mapped/reads_sequenced)*100)
per_n<-read.table("20180302_ced_ch1_per_n.txt")
spec_ids<-str_extract(per_n$V1,"[C,G,S,T]...[_]\\d{0,3}")
per_n<-cbind.data.frame(spec_ids,per_n$V2)
names(per_n)<-c("specimen","per_n")
cp_tdf<-full_join(cp_tdf,per_n)
chloroplast_yield<-get_results("chloroplast yield",cp_tdf$cp_mapped)
percent_chloroplast<-get_results("percent of sequenced reads with chloroplast identity",cp_tdf$per_cp_mapped)
percent_n<-get_results("percent N bases in chloroplast genome",cp_tdf$per_n)
#cp depth
cp_depth<-get_depth(ldf)
cp_tdepth<-make_tdf(cp_depth,spec_ids,target_ids,spp)
names(cp_tdepth)<-c("specimen","species","cp_depth")
chloroplast_depth<-get_results("chloroplast depth",cp_tdepth$cp_depth)
cp_cov_percent<-get_cov_percent(ldf)
cp_cov_percent<-make_tdf(cp_cov_percent,spec_ids,target_ids,spp)
names(cp_cov_percent)<-c("specimen","species","cp_cov_percent")
chloroplast_cov_percent<-get_results("chloroplast percent covered",cp_cov_percent$cp_cov_percent)
rbind.data.frame(chloroplast_yield,percent_chloroplast,percent_n,chloroplast_depth,chloroplast_cov_percent)
## note_about_stat mean
## 1 chloroplast yield 4.317984e+05
## 2 percent of sequenced reads with chloroplast identity 5.632593e+00
## 3 percent N bases in chloroplast genome 5.531471e+00
## 4 chloroplast depth 2.796603e+02
## 5 chloroplast percent covered 9.221323e+01
## min max err up lo
## 1 3.560400e+04 3.806990e+06 1.989506e+05 6.307490e+05 2.328479e+05
## 2 8.882719e-01 2.622174e+01 1.712763e+00 7.345356e+00 3.919829e+00
## 3 5.340907e-02 2.347763e+01 2.136357e+00 7.667828e+00 3.395114e+00
## 4 2.487568e+01 2.399368e+03 1.244424e+02 4.041026e+02 1.552179e+02
## 5 6.092870e+01 1.000000e+02 3.451518e+00 9.566475e+01 8.876171e+01
ind_yield<-data.frame(yield_tdf_full[c(1:3,10005:10007)])
temp<-full_join(cp_tdf,ind_yield)
temp<-full_join(temp,cp_tdepth)
(full_result_table<-full_join(temp,cp_cov_percent))
## specimen species reads_sequenced cp_mapped per_cp_mapped per_n
## 1 CESA_75 CESA 29081506 831948 2.8607459 0.09487368
## 2 CEFI_112 CEFI 1750937 151391 8.6462848 0.39507291
## 3 CEFI_140 CEFI 12183129 403902 3.3152567 0.13817707
## 4 CEOD_202 CEOD 6372915 116791 1.8326151 0.39004071
## 5 CEFI_230 CEFI 3251712 106353 3.2706771 0.36015162
## 6 CEFI_264 CEFI 19290108 356979 1.8505806 0.12252515
## 7 CEFI_292 CEFI 7116518 1866075 26.2217421 0.08732856
## 8 CEOD_10 CEOD 4009101 273812 6.8297606 0.37514447
## 9 CEOD_52 CEOD 12254507 618594 5.0478897 0.09686144
## 10 CEAN_88 CEAN 1515526 248243 16.3799895 0.12381138
## 11 CEFI_9 CEFI 1174195 93257 7.9422072 0.24805304
## 12 CEAN_143 CEAN 8462020 901138 10.6492067 0.14450232
## 13 CEOD_162 CEOD 8942300 756725 8.4623084 0.05340907
## 14 CEOD_185 CEOD 5713214 382876 6.7015869 0.10490279
## 15 CEOD_222 CEOD 10421206 289695 2.7798606 0.30254707
## 16 CEOD_277 CEOD 6794076 309154 4.5503465 0.52869377
## 17 TRTU_1 TRTU 3823599 54256 1.4189773 17.10114577
## 18 GUGU_2 GUGU 5211243 92122 1.7677548 19.47211959
## 19 TRTU_3 TRTU 5915718 92170 1.5580526 14.23719219
## 20 GUGU_4 GUGU 6710368 129500 1.9298495 13.85054438
## 21 TRTU_6 TRTU 6159511 140155 2.2754241 11.38574053
## 22 CEMO_50 CEMO 5561468 577438 10.3828342 0.15577889
## 23 GUGU_7 GUGU 6850155 81731 1.1931263 17.80405649
## 24 TRTU_8 TRTU 6595654 59560 0.9030189 23.47762656
## 25 GUGU_9 GUGU 14213046 197446 1.3891885 13.47256192
## 26 GUGU_10 GUGU 7833412 69582 0.8882719 15.02868947
## 27 GUGU_11 GUGU 9166583 279357 3.0475587 7.26521367
## 28 TRTU_12 TRTU 11354222 283424 2.4961992 8.32653303
## 29 GUGU_13 GUGU 9909255 259749 2.6212768 7.58342218
## 30 GUGU_15 GUGU 10634632 267334 2.5138058 7.58861523
## 31 GUGU_17 GUGU 10900380 138537 1.2709373 13.30771827
## 32 TRTU_18 TRTU 10145721 225870 2.2262587 12.16526396
## 33 CESA_102 CESA 5252123 672165 12.7979676 0.10110081
## 34 GUGU_19 GUGU 16194976 342985 2.1178482 8.30678532
## 35 TRTU_20 TRTU 8374551 117705 1.4055082 13.21347111
## 36 SWMA_21 SWMA 16927562 1530129 9.0392757 1.47610676
## 37 SWMA_22 SWMA 18098858 3806990 21.0344211 0.47148461
## 38 CEFI_130 CEFI 5351355 135883 2.5392260 0.21426731
## 39 CESA_186 CESA 1829249 125974 6.8866513 0.15141710
## 40 CEFI_211 CEFI 1272124 167826 13.1925819 0.20726170
## 41 CEFI_254 CEFI 17394724 643227 3.6978281 0.10366210
## 42 CEOD_287 CEOD 438310 35604 8.1230180 7.65715196
## 43 CEFI_19 CEFI 5431374 333680 6.1435651 0.16222494
## reads_ontarget ind_mean_ontarget ind_percent_ontarget cp_depth
## 1 10692874 3977.04030 36.7686391 524.33807
## 2 376881 212.76052 21.5245323 95.58827
## 3 5047225 1722.86311 41.4279862 254.56061
## 4 2366009 873.80502 37.1260091 73.80320
## 5 1183526 443.47945 36.3970118 67.09298
## 6 7776152 2706.35536 40.3116043 225.00829
## 7 2752495 986.80262 38.6775527 1176.10014
## 8 1472131 548.06839 36.7197284 172.86435
## 9 4791273 1704.40756 39.0980478 389.87098
## 10 488207 200.35326 32.2137001 156.51069
## 11 286398 146.04470 24.3910083 58.81738
## 12 3191296 1165.21508 37.7131701 568.07291
## 13 3574795 1251.58434 39.9762365 476.99995
## 14 2177026 788.94511 38.1051016 241.30891
## 15 4017316 1443.70783 38.5494347 182.87797
## 16 2539070 933.22128 37.3718222 195.27185
## 17 39222 386.24348 1.0257875 158.54272
## 18 68501 527.92161 1.3144849 78.25466
## 19 60793 597.59134 1.0276521 76.01432
## 20 100652 681.03390 1.4999475 104.92475
## 21 63116 622.20048 1.0246917 110.34966
## 22 2026840 758.75492 36.4443345 364.05722
## 23 101252 695.07119 1.4780979 73.78158
## 24 70395 666.53825 1.0672937 50.45129
## 25 206827 1441.84312 1.4551912 139.28441
## 26 91070 792.36896 1.1625841 68.62331
## 27 123598 928.92521 1.3483541 193.79405
## 28 92063 1144.51405 0.8108261 56.12306
## 29 133845 1004.20958 1.3507070 184.12089
## 30 148498 1078.20518 1.3963624 184.90234
## 31 128635 1102.79122 1.1800965 108.05567
## 32 103682 1024.83782 1.0219284 195.66791
## 33 2107682 735.90691 40.1300960 423.63428
## 34 195256 1638.85931 1.2056579 228.85909
## 35 91146 846.48505 1.0883688 89.92863
## 36 1051671 1797.74353 6.2127730 965.93015
## 37 1113548 1921.04850 6.1525871 2399.36844
## 38 2154295 750.48995 40.2570003 85.70544
## 39 554830 238.38406 30.3310266 79.39554
## 40 259059 153.10299 20.3642884 105.77423
## 41 5583057 2297.54835 32.0962667 405.39601
## 42 6566 44.48315 1.4980265 24.87568
## 43 2122651 755.32697 39.0812896 210.48947
## cp_cov_percent
## 1 100.0000
## 2 99.8184
## 3 100.0000
## 4 99.7354
## 5 99.9051
## 6 99.9906
## 7 100.0000
## 8 99.8303
## 9 100.0000
## 10 99.9651
## 11 99.9289
## 12 99.9775
## 13 99.9850
## 14 100.0000
## 15 99.8378
## 16 99.7816
## 17 89.7899
## 18 74.1939
## 19 76.4204
## 20 77.7870
## 21 80.0484
## 22 99.9657
## 23 69.8159
## 24 74.4042
## 25 89.3431
## 26 63.9058
## 27 90.8520
## 28 60.9287
## 29 88.9132
## 30 91.1228
## 31 80.8041
## 32 91.2919
## 33 100.0000
## 34 94.4544
## 35 82.4921
## 36 99.8384
## 37 100.0000
## 38 99.9245
## 39 100.0000
## 40 99.9988
## 41 100.0000
## 42 90.2067
## 43 99.9114