This is an R Markdown document.
Dissertation Research: The data here were collected August 28, 2017 and samples were sequenced on or near August 14, 2017 at Univeristy of Oregon. I have a pool of paired-end 100 sequences from Cedrela, Swietenia, Guarea, and Trichillia species (Meliaceae). These sequences were obtained via hybridization capture, targeted enrichment, and short-read sequencing on the Illumina HiSeq 4000. Baits were designed from the transcriptome of Cedrela odorata. Here I am testing how many reads were captured by the baits across these species.
We also wanted to learn how much information we could gain for every species when we impose a depth of coverage threshold of 10X (10 reads per transcript). Then, when we have list of genes retained after the depth threshold is imposed, we want to know how many gene families are represented and how many unique gene ontology terms are available. This is a more stringent measurment of bait efficiency across these species. (KF November 17, 2017)
library(dplyr)
library(robustbase)
library(matrixStats)
library(vcfR)
library(ggplot2)
library(tidyr)
library(tibble)
library(reshape2)
library(RColorBrewer)
#load in covstats files
filenames <- list.files(pattern="*.2.txt", full.names=TRUE)
ldf <- lapply(filenames, read.table, header=1)
#make a new column for each sample that is the sum of Plus_reads and Minus_reads fields
output<-list()
l<-
for (i in 1:length(ldf)){
output[[i]]<-mutate(ldf[[i]], total = Plus_reads + Minus_reads)
}
#make a new list that only contains totals.
reads<-list()
for (i in 1:length(output)){
reads[[i]]<-cbind.data.frame(output[[i]]$total)
}
#convert list to dataframe
reads<-data.frame(reads)
reads$r.sums<-rowSums(reads)
MELI_10_CESA_75<-read.table("MELI_10_CESA_75_S10_trnxome_covstats.2.txt",header=1)
reads<-cbind.data.frame(MELI_10_CESA_75$ID,reads)
colnames(reads)<-c("transcript","CEFI_9", "CESA_75", "CEFI_112", "CEFI_140", "CEOD_202", "CEFI_230", "CEFI_264", "CEFI_292", "CEOD_10", "CEOD_52", "CEAN_88", "CEMO_50", "CEAN_143", "CEOD_162", "CEOD_185", "CEOD_222", "CEOD_277", "TRTU_1", "GUGU_2", "TRTU_3", "GUGU_4", "TRTU_6", "CESA_102", "GUGU_7", "TRTU_8", "GUGU_9", "GUGU_10", "GUGU_11", "TRTU_12", "GUGU_13", "GUGU_15", "GUGU_17", "TRTU_18", "CEFI_130", "GUGU_19", "TRTU_20", "SWMA_21", "SWMA_22", "CESA_186", "CEFI_211", "CEFI_254", "CEFI_19","sums")
#remove transcripts that had nothing mapping 9847/10001 (98%)
reads.filno0<-filter(reads, sums != 0 )
#remove transcripts with less than 1000 reads mapping to them
reads.fil1000<-filter(reads, sums >= 1000 )
#make column 1 row names
row.names(reads.fil1000)<-reads.fil1000[,1]
row.names(reads.filno0)<-reads.filno0[,1]
#now remove the column containing the transcript IDs and the column with row sums.
reads.filno0[1]<-NULL
reads.filno0[43]<-NULL
reads.fil1000[1]<-NULL
reads.fil1000[43]<-NULL
#transpose these two dataframes
treads.filno0<-data.frame(t(reads.filno0))
treads.fil1000<-data.frame(t(reads.fil1000))
#make a new dataframe containing species IDs
spp<-data.frame(c("C.fissilis", "C.saltensis", "C.fissilis","C.fissilis", "C.odorata", "C.fissilis", "C.fissilis", "C.fissilis", "C.odorata", "C.odorata", "C.angustifolia", "C.montana", "C.angustifolia", "C.odorata", "C.odorata", "C.odorata", "C.odorata", "T.tuberculata", "G.guidonia", "T.tuberculata", "G.guidonia", "T.tuberculata", "C.saltensis", "G.guidonia", "T.tuberculata", "G.guidonia", "G.guidonia", "G.guidonia", "T.tuberculata", "G.guidonia", "G.guidonia", "G.guidonia", "T.tuberculata", "C.fissilis", "G.guidonia", "T.tuberculata", "S.mahagoni", "S.mahagoni", "C.saltensis", "C.fissilis", "C.fissilis", "C.fissilis"))
#combine transposed read dataframes with the spp dataframe
df<-cbind.data.frame(sample=row.names(treads.filno0),spp,treads.filno0[,-9847])
df1000<-cbind.data.frame(sample=row.names(treads.fil1000),spp,treads.fil1000[,-6527])
#fix the names
names(df)[2]<-"spp"
names(df1000)[2]<-"spp"
#use melt function to convert wide data to long data. This is good for plotting but would be horrible to perform by hand.
mdf<-melt(reads.filno0)
## No id variables; using all as measure variables
vdf<-melt(df)
## Using sample, spp as id variables
#Log transform the data so it shows up better
vdf$lval<-log2(vdf$value)
vdf1000<-melt(df1000)
## Using sample, spp as id variables
vdf1000$lval<-log2(vdf1000$value)
xdf<-vdf
xdf1000<-vdf1000
#make a copy of the data frame but change the Inf values to 0.
xdf[xdf == -Inf] <- 0
xdf1000[xdf1000 == -Inf] <- 0
#also make a dataframe of total reads per individual
ind.totals<-data.frame(colSums(reads[2:43]))
names(ind.totals)[1]<-"reads.mapped"
names(spp)[1]<-"spp"
ind.totals<-cbind.data.frame(ind.totals,spp)
ind.totals$lval<-log2(ind.totals$reads.mapped)
#Part 2
#thresholds, gene families, and gene ontology terms----
filenames <- list.files(pattern="*.2.txt", full.names=TRUE)
ldf <- lapply(filenames, read.table, header=1)
m<-
for (i in 1:length(ldf)){
output[[i]]<-mutate(ldf[[i]], depth = ((Plus_reads + Minus_reads)*101)/Covered_bases)
}
depth.10<-list()
for (i in 1:length(output)){
depth.10[[i]]<-cbind.data.frame(output[[i]]$depth)
}
#save that as a data frame
depth.10<-data.frame(depth.10)
#change NaN's to 0.
depth.10[depth.10 == "NaN"] <- 0
#calculate row sums
depth.10$r.sums<-rowSums(depth.10)
#bring in transcript IDs from one of the files
MELI_10_CESA_75<-read.table("MELI_10_CESA_75_S10_trnxome_covstats.2.txt",header=1)
depth.10<-cbind.data.frame(MELI_10_CESA_75$ID,depth.10)
#change column names to sample IDs
colnames(depth.10)<-c("transcript","CEFI_9", "CESA_75", "CEFI_112", "CEFI_140", "CEOD_202", "CEFI_230", "CEFI_264", "CEFI_292", "CEOD_10", "CEOD_52", "CEAN_88", "CEMO_50", "CEAN_143", "CEOD_162", "CEOD_185", "CEOD_222", "CEOD_277", "TRTU_1", "GUGU_2", "TRTU_3", "GUGU_4", "TRTU_6", "CESA_102", "GUGU_7", "TRTU_8", "GUGU_9", "GUGU_10", "GUGU_11", "TRTU_12", "GUGU_13", "GUGU_15", "GUGU_17", "TRTU_18", "CEFI_130", "GUGU_19", "TRTU_20", "SWMA_21", "SWMA_22", "CESA_186", "CEFI_211", "CEFI_254", "CEFI_19","sums")
hist(depth.10$sums,breaks = 1000)
#these two steps are necessary for conversion
row.names(depth.10)<-depth.10[,1]
depth.10[1]<-NULL
#convert values greater than depth 10 to 1 and less than depth 10 to 0
depth.10[depth.10<10]<-0
depth.10[depth.10>10]<-1
names(depth.10)
## [1] "CEFI_9" "CESA_75" "CEFI_112" "CEFI_140" "CEOD_202" "CEFI_230"
## [7] "CEFI_264" "CEFI_292" "CEOD_10" "CEOD_52" "CEAN_88" "CEMO_50"
## [13] "CEAN_143" "CEOD_162" "CEOD_185" "CEOD_222" "CEOD_277" "TRTU_1"
## [19] "GUGU_2" "TRTU_3" "GUGU_4" "TRTU_6" "CESA_102" "GUGU_7"
## [25] "TRTU_8" "GUGU_9" "GUGU_10" "GUGU_11" "TRTU_12" "GUGU_13"
## [31] "GUGU_15" "GUGU_17" "TRTU_18" "CEFI_130" "GUGU_19" "TRTU_20"
## [37] "SWMA_21" "SWMA_22" "CESA_186" "CEFI_211" "CEFI_254" "CEFI_19"
## [43] "sums"
#subset into species lists
ceod<-depth.10[c(5,9:10,14:17)]
cefi<-depth.10[c(1,3:4,6:8,34,40:42)]
cesa<-depth.10[c(2,23,39)]
cemo<-depth.10[c(12)]
cean<-depth.10[c(11,13)]
swma<-depth.10[c(37:38)]
trtu<-depth.10[c(18,20,22,25,29,33,36)]
gugu<-depth.10[c(19,21,24,26:28,30:32,35)]
#find sums. this will allow me to find transcripts that are retained with the threshold for all samples in a species group.
ceod$r.sums<-rowSums(ceod)
cefi$r.sums<-rowSums(cefi)
cean$r.sums<-rowSums(cean)
cesa$r.sums<-rowSums(cesa)
swma$r.sums<-rowSums(swma)
trtu$r.sums<-rowSums(trtu)
gugu$r.sums<-rowSums(gugu)
#could the number of transcripts that are retained with depth threshold
ceod.ret<-sum(ceod$r.sums == '7')
cefi.ret<-sum(cefi$r.sums == '10')
cean.ret<-sum(cean$r.sums == '2')
cesa.ret<-sum(cesa$r.sums == '3')
cemo.ret<-sum(cemo$CEMO_50 == '1')
swma.ret<-sum(swma$r.sums == '2')
trtu.ret<-sum(trtu$r.sums == '7')
gugu.ret<-sum(gugu$r.sums == '10')
#save that information into a data frame. these are results for a table.
spp<-c("ceod","cefi","cean","cesa","cemo","swma","trtu","gugu")
genes.ret<-data.frame(rbind(ceod.ret,cefi.ret,cean.ret,cesa.ret,cemo.ret,swma.ret,trtu.ret,gugu.ret))
genes.spp<-data.frame(spp,genes.ret)
names(genes.spp)[2]<-"genes.ret"
#now convert the rownames to column
ceod<-rownames_to_column(ceod,var="transcript")
#subset the list to only include transcripts that are retained for all samples in the spp. group
ceod.ret.list<-subset(ceod, r.sums == '7')
#we just need the trancript list
ceod.ret.list<-ceod.ret.list[c("transcript")]
#save the transcript list. This can be used with seqkit to grep sequences from the fasta file containing the transcriptome.
#write.table(ceod.ret.list,"ceod_ret_list.txt",quote=FALSE,row.names=FALSE)
#repeat for all species.
cefi<-rownames_to_column(cefi,var="transcript")
cefi.ret.list<-subset(cefi, r.sums == '10')
cefi.ret.list<-cefi.ret.list[c("transcript")]
#write.table(cefi.ret.list,"cefi_ret_list.txt",quote=FALSE,row.names=FALSE)
cean<-rownames_to_column(cean,var="transcript")
cean.ret.list<-subset(cean, r.sums == '2')
cean.ret.list<-cean.ret.list[c("transcript")]
#write.table(cean.ret.list,"cean_ret_list.txt",quote=FALSE,row.names=FALSE)
cesa<-rownames_to_column(cesa,var="transcript")
cesa.ret.list<-subset(cesa, r.sums == '3')
cesa.ret.list<-cesa.ret.list[c("transcript")]
#write.table(cesa.ret.list,"cesa_ret_list.txt",quote=FALSE,row.names=FALSE)
cemo<-rownames_to_column(cemo,var="transcript")
cemo.ret.list<-subset(cemo, CEMO_50 == '1')
cemo.ret.list<-cemo.ret.list[c("transcript")]
#write.table(cemo.ret.list,"cemo_ret_list.txt",quote=FALSE,row.names=FALSE)
swma<-rownames_to_column(swma,var="transcript")
swma.ret.list<-subset(swma, r.sums == '2')
swma.ret.list<-swma.ret.list[c("transcript")]
#write.table(swma.ret.list,"swma_ret_list.txt",quote=FALSE,row.names=FALSE)
trtu<-rownames_to_column(trtu,var="transcript")
trtu.ret.list<-subset(trtu, r.sums == '7')
trtu.ret.list<-trtu.ret.list[c("transcript")]
#write.table(trtu.ret.list,"trtu_ret_list.txt",quote=FALSE,row.names=FALSE)
gugu<-rownames_to_column(gugu,var="transcript")
gugu.ret.list<-subset(gugu, r.sums == '10')
gugu.ret.list<-gugu.ret.list[c("transcript")]
#write.table(gugu.ret.list,"gugu_ret_list.txt",quote=FALSE,row.names=FALSE)
#outside of R, I used the ret lists to filter the gene family lists (from TRAPID) by name. This made a new list of transcripts IDs and the gene families they are associated with for only the transcript models that were retained after the depth threshold was imposed.
#bring those gene family files back in from the seqkit output.
ceod.gfs<-read.table("ceod_ret_gf.txt",header=1,row.names = 1)
cean.gfs<-read.table("cean_ret_gf.txt",header=1,row.names = 1)
cemo.gfs<-read.table("cemo_ret_gf.txt",header=1,row.names = 1)
cesa.gfs<-read.table("cesa_ret_gf.txt",header=1,row.names = 1)
cefi.gfs<-read.table("cefi_ret_gf.txt",header=1,row.names = 1)
swma.gfs<-read.table("swma_ret_gf.txt",header=1,row.names = 1)
trtu.gfs<-read.table("trtu_ret_gf.txt",header=1,row.names = 1)
gugu.gfs<-read.table("gugu_ret_gf.txt",header=1,row.names = 1)
#Make a dataframe that shows the number of unique gene famililies per species.
spp<-c("ceod","cefi","cean","cesa","cemo","swma","trtu","gugu")
gf.uni<-c(length(unique(ceod.gfs$gf_id)),length(unique(cefi.gfs$gf_id)),length(unique(cean.gfs$gf_id)),length(unique(cesa.gfs$gf_id)),length(unique(cemo.gfs$gf_id)),length(unique(swma.gfs$gf_id)),length(unique(trtu.gfs$gf_id)),length(unique(gugu.gfs$gf_id)))
gf.uni.spp<-cbind.data.frame(spp,gf.uni)
#bring those gene ontology files back in from the seqkit output.
ceod.gos<-read.table("ceod_gos.txt",header=1,row.names = 1)
cean.gos<-read.table("cean_gos.txt",header=1,row.names = 1)
cemo.gos<-read.table("cemo_gos.txt",header=1,row.names = 1)
cesa.gos<-read.table("cesa_gos.txt",header=1,row.names = 1)
cefi.gos<-read.table("cefi_gos.txt",header=1,row.names = 1)
swma.gos<-read.table("swma_gos.txt",header=1,row.names = 1)
trtu.gos<-read.table("trtu_gos.txt",header=1,row.names = 1)
gugu.gos<-read.table("gugu_gos.txt",header=1,row.names = 1)
#Make a dataframe that shows the number of unique gene ontology terms per species.
go.uni<-c(length(unique(ceod.gos$go)),length(unique(cefi.gos$go)),length(unique(cean.gos$go)),length(unique(cesa.gos$go)),length(unique(cemo.gos$go)),length(unique(swma.gos$go)),length(unique(trtu.gos$go)),length(unique(gugu.gos$go)))
go.uni.spp<-cbind.data.frame(spp,go.uni)
gf.go.uni.spp<-cbind.data.frame(spp,gf.uni,go.uni)
m.gf.go.uni.spp<-melt(gf.go.uni.spp)
## Using spp as id variables
ind_counts_by_transcript<-ggplot(vdf1000,aes(x=sample,y=lval,color=spp,fill=spp))+geom_violin()+
scale_color_brewer(palette="Dark2")+
scale_fill_brewer(palette="Dark2")+
stat_summary(fun.y=mean, geom="point", shape=23, size=2,color="black")+
theme_classic()+theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))+
xlab("Sample")+
ylab("Log2(read counts per transcript)")+
ggtitle("Counts/Transcript/Individual (Infs excluded)")
spp_counts_by_transcript<-ggplot(vdf1000,aes(x=spp,y=lval,color=spp,fill=spp))+
geom_violin()+
scale_color_brewer(palette="Dark2")+
scale_fill_brewer(palette="Dark2")+
stat_summary(fun.y=mean, geom="point", shape=23, size=2,color="black")+
theme_classic()+theme(legend.position="none")+
xlab("Species")+
ylab("Log2(read counts per transcript)")+
ggtitle("Counts/Transcript/Species (Infs excluded)")
ind_counts_by_transcript0s<-ggplot(xdf1000,aes(x=sample,y=lval,color=spp,fill=spp))+geom_violin()+
scale_color_brewer(palette="Dark2")+
scale_fill_brewer(palette="Dark2")+
stat_summary(fun.y=mean, geom="point", shape=23, size=2,color="black")+
theme_classic()+theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))+
xlab("Sample")+
ylab("Log2(read counts per transcript)")+
ggtitle("Counts/Transcript/Individual (Infs=0)")
spp_counts_by_transcript0s<-ggplot(xdf1000,aes(x=spp,y=lval,color=spp,fill=spp))+
geom_violin()+
scale_color_brewer(palette="Dark2")+
scale_fill_brewer(palette="Dark2")+
stat_summary(fun.y=mean, geom="point", shape=23, size=2,color="black")+
theme_classic()+theme(legend.position="none")+
xlab("Species")+
ylab("Log2(read counts per transcript)")+
ggtitle("Counts/Transcript/Species (Infs=0)")
spp_total_counts<-ggplot(ind.totals,aes(x=spp,y=reads.mapped,color=spp,fill=spp))+geom_boxplot()+
scale_color_brewer(palette="Dark2")+
scale_fill_brewer(palette="Dark2")+
stat_summary(fun.y=mean, geom="point", shape=23, size=2,color="black")+
theme_classic()+theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))+
xlab("Species")+
ylab("Total reads mapped")+
ggtitle("Reads/Species")
log_spp_total_counts<-ggplot(ind.totals,aes(x=spp,y=lval,color=spp,fill=spp))+geom_boxplot()+
scale_color_brewer(palette="Dark2")+
scale_fill_brewer(palette="Dark2")+
stat_summary(fun.y=mean, geom="point", shape=23, size=2,color="black")+
theme_classic()+theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))+
xlab("Species")+
ylab("Log2(Total mapped reads)")+
ggtitle("Log2(Reads/Species)")
#Part 2
#plot these values as a barplot to show differnces
genes.spp.plot<-ggplot(aes(spp,genes.ret,fill=spp),data=genes.spp)+geom_bar(stat = "identity")+
scale_fill_brewer(palette = "Dark2")+
scale_color_brewer(palette = "Dark2")+
theme_classic()+theme(legend.position="none",
axis.text.x=element_text(face="italic"))+
xlab("Species")+
ylab("Genes Retained at Depth 10")+
scale_x_discrete(labels=c("cean" = "C. angustifolia", "cefi" = "C. fissilis", "cemo" = "C. montana", "ceod" = "C. odorata", "cesa" = "C. saltensis","swma"="S. mahagoni","trtu"="T. tuberculata","gugu"="G. guidonia"))
#save the plot and the table
#ggsave("genes_ret_by_spp10.jpg",genes.spp.plot,width=7,height=4,units ="in")
#write.csv(genes.spp,"genes_ret_by_spp_10.csv")
gf.go<-ggplot(m.gf.go.uni.spp,aes(spp,value,fill=variable,color=spp))+
geom_bar(stat="identity",position="dodge",size=2)+
scale_color_brewer(palette = "Dark2")+
scale_fill_manual(values=c("white","darkgray"))+
theme_classic()+theme(legend.position="none",
axis.text.x=element_text(face="italic"))+
xlab("Species")+
ylab("Gene Families and Ontologies Retained at Depth 10")+
scale_x_discrete(labels=c("cean" = "C. angustifolia", "cefi" = "C. fissilis", "cemo" = "C. montana", "ceod" = "C. odorata", "cesa" = "C. saltensis","swma"="S. mahagoni","trtu"="T. tuberculata","gugu"="G. guidonia"))
#ggsave("gfsgos_ret_by_spp10.jpg",gf.go)
ind_counts_by_transcript
## Warning: Removed 71099 rows containing non-finite values (stat_ydensity).
## Warning: Removed 71099 rows containing non-finite values (stat_summary).
spp_counts_by_transcript
## Warning: Removed 71099 rows containing non-finite values (stat_ydensity).
## Warning: Removed 71099 rows containing non-finite values (stat_summary).
ind_counts_by_transcript0s
spp_counts_by_transcript0s
spp_total_counts
log_spp_total_counts
genes.spp.plot
gf.go