mec protein tree building
The goal of this pipeline is to build phylogenetic trees for each of the 10 mec cassette proteins along with the top 20 genome-based homologs.
- Gather top 20 homologs from IMG, excluding those in defined mec cassettes
- Save the blast results table for each
- define a homolog set for each as an IMG object
- download the protein sequences for each set
- download the gene info page for each set
- import the proteins into R
- reformat fasta header to read: genus_species_geneID
- import the DCM cassette proteins
- Reformat the fasta header to read: genus_species_geneID
- Add each set of DCM genes to the appropirate homolog set
- manually add the DCMF proteins to each and re-import
- make sure DIEL has proper name throughout
- build an automated MAFFT G-INS-I alignment pipeline -> trimgappy -> RaxML -f treebuild, loop over files in directory shell script
- cluster metagenome-derived mec proteins and build trees using the same procedure which includes them
0.1 Notes
MecC under default settings (1e-5) found no hits outside of DCM cassette. Switched to 1e-2.
1 Import the homolog protein sets and rename fasta headers
Since it is difficult to create named R objects inside of a loop, each set is read in along with info, fastas are refined, and then written again
library(Biostrings)
library(dplyr)
homo.files <- list.files("orthologs.faa/")
info.files <- list.files("gene.info/")
#Define a string split function which splits by space character and returns first element
splitfun <- function(x){
strsplit(x," ")[[1]][1]
}
#Define a simple function that will replace all spaces and illegal characters with underscores
spc.replc <- function(x){
gsub(" ", "_", x, fixed = TRUE)
gsub(",",".",x)
gsub(";",".",x)
gsub(":",".",x)
}
#define a funciton that will replace any annoying illegal punctuation
punct.replc <- function(x){
gsub("([[:punct:]])|\\s+","_",x)
}
#define write fasta function
writeFasta<-function(data, filename){
fastaLines = c()
for (rowNum in 1:nrow(data)){
fastaLines = c(fastaLines, as.character(paste(">", data[rowNum,"name"], sep = "")))
fastaLines = c(fastaLines,as.character(data[rowNum,"seq"]))
}
fileConn<-file(filename)
writeLines(fastaLines, fileConn)
close(fileConn)
}
for (x in c(1:length(homo.files))) {
iter <- readAAStringSet(paste("orthologs.faa/",homo.files[x],sep = ""))
Gene.ID <- names(iter)
Gene.ID <- unlist(lapply(Gene.ID, splitfun))
seq <- paste(iter)
fasta.df <- data.frame(Gene.ID, seq, stringsAsFactors = F)
fasta.df$Gene.ID <- as.character(fasta.df$Gene.ID)
info <- read.csv(paste("gene.info/",info.files[x],sep=""),header = T,sep = "\t")[c(1,5)]
info$newname <- paste(info$Genome.Name,info$Gene.ID,sep=" ")
#info$newname <- unlist(lapply(info$newname, spc.replc))
info$newname <- unlist(lapply(info$newname, punct.replc))
info$Gene.ID <- as.character(info$Gene.ID)
fasta.df <- dplyr::left_join(fasta.df, info, by="Gene.ID" )
fasta.df <- fasta.df[c(4,2)]
colnames(fasta.df)[1] <- "name"
writeFasta(fasta.df, paste("orthlogs.clean/",homo.files[x],sep = ""))
}2 Import and reformat mec cassette proteins
DCM.all <- readAAStringSet("cass.proteins/five.DCM.cassette.proteins.faa")
name.old <- names(DCM.all)
seq <- paste(DCM.all)
DCM.df <- data.frame(name.old, seq)
colnames(DCM.df)[1] <- "gene_oid"
DCM.info.all <- read.csv("cass.proteins/gene.info.IMG.all.csv",header = T,stringsAsFactors = F)[c(1,4,8)]
cass <- sort(unique(DCM.info.all$DCM.gene.name))
for (x in c(1:length(cass))) {
info <- DCM.info.all[DCM.info.all$DCM.gene.name == cass[x],]
info$newname <- paste(info$Genome.Name,info$gene_oid,sep=" ")
#info$newname <- unlist(lapply(info$newname, spc.replc))
info$newname <- unlist(lapply(info$newname, punct.replc))
info$gene_oid <- as.character(info$gene_oid)
fasta.df <- dplyr::left_join(info, DCM.df, by="gene_oid" )
fasta.df <- fasta.df[c(4,5)]
colnames(fasta.df)[1] <- "name"
writeFasta(fasta.df, paste("DCM.cass.clean/",cass[x],".clean.faa",sep = ""))
}3 Combine the cassette genes with their orthologs
DCM.files <- list.files("DCM.cass.clean/")
ortho.files <- list.files("orthlogs.clean/")
for (x in c(1:length(DCM.files))) {
DCM <- readAAStringSet(paste("DCM.cass.clean/",DCM.files[x],sep = ""))
name <- names(DCM)
seq <- paste(DCM)
DCM <- data.frame(name,seq)
DCM$name <- as.character(DCM$name)
ortho <- readAAStringSet(paste("orthlogs.clean/",ortho.files[x],sep=""))
name <- names(ortho)
seq <- paste(ortho)
ortho <- data.frame(name,seq)
ortho$name <- as.character(ortho$name)
all.fa <- rbind(DCM,ortho)
writeFasta(all.fa, paste("DCM.cass.with.ortho/",DCM.files[x],sep=""))
}4 build trees
mkdir proteins.align
for f in DCM.cass.with.ortho/*
do
name=${f##*/}
mafft --maxiterate 1000 --globalpair "$f" > proteins.align/"${name}.align"
done
mkdir proteins.trim
for f in proteins.align/*
do
name=${f##*/}
base=${name%.align}
/home/robert/Tools/trimal.v1.2rev59/trimAl/source/trimal -in "$f" -out proteins.trim/"${base}.trim" -gappyout
done
mkdir proteins.tree
for f in proteins.trim/*
do
name=${f##*/}
base=${name%.trim}
fasttree -lg -gamma "$f" > proteins.tree/"$base"
done
5 Adding mec genes from metagenomes
5.1 Sorting the fastas by mec gene
Locus tags must be used for these; the IMG locus tag for metagenomes has the genome ID incorporated.
5.1.1 Import the genes and clean fasta header
use env type instead of species name
5.1.2 Sort
The gene.info.all file generated previously, I have amended it with homolog codes for each gene. Use this as the guide.
this also removes sequences which are abnormally short, less than 70% of the average protein length
library(dplyr)
library(tidyr)
info <- read.csv("mgenome.cass/gene.info.all.with.proportions.csv", stringsAsFactors = F)
info <- info[c(2,4,5)]
prots <- left_join(prots, info)
prots <- prots[complete.cases(prots),]
prots <- unite(prots, name, Genome.source,Locus.Tag)
ortho <- sort(unique(prots$Dcm.ortholog, na.omit = T))
for (x in (1:length(ortho))) {
gene <- prots[prots$Dcm.ortholog == ortho[x], ]
gene <- gene[complete.cases(gene),]
gene <- gene[gene$len > (0.7*mean(gene$len)),]
gene <- gene[complete.cases(gene),]
writeFasta(gene, paste("DCM.cass.ortho.mgenome.only/",ortho[x],".mgenome.faa",sep=""))
}The largest set has 37 sequences, too many for a tree, especially considering that many of them are duplicates. Each has to be sent through a CD hit pipeline
5.2 Clustering the mgenome genes
They are clustered at 80%
for f in DCM.cass.ortho.mgenome.only/*
do
name=${f##*/}
cdhit -i "$f" \
-o mgenome.cluster/"${name}" -M 50000 -T 0 -d 50 -c .8 -g 1 -l 150
done
5.3 Parsing the *.clstr files
rough pipeline instructions here: http://www.rpubs.com/rmurdoch/cdhit_to_mapping_file. This pipeline has been placed into a loop. I should turn it into a function for the future.
files <- list.files("mgenome.cluster/")
numbers_only <- function(x) !grepl("\\D", x)
x=0
for (f in c(1:length(files))) {
clstr <- read.csv(paste("mgenome.cluster/",files[f],sep=""), sep = "\t", row.names = NULL, header = F, stringsAsFactors = F)
clstr2 <- clstr
for (n in c(1:nrow(clstr2))) {
if (numbers_only(clstr2[n,1]) == T) {
clstr2[n,1] <- x
}
else {NULL}
x <- clstr2[n,1]
}
library(stringr)
clstr.sums <- data.frame(dplyr::count(clstr2,V1))
switch <- clstr.sums[1,2]
clstr3 <- cbind(clstr2[1], clstr)
clstr4 <- clstr2[-which(clstr2$V2 == ""), ]
clstr5 <- lapply(clstr4, gsub, pattern='>', replacement='')
clstr5.2 <- data.frame(str_split_fixed(clstr5$V2, "aa, ", 2))
clstr5.3 <- data.frame(str_split_fixed(clstr5.2$X2, "... ", 2))
clstr6 <- cbind(clstr5[1],clstr5.2[1],clstr5.3[1:2])
colnames(clstr6) <- c("cluster","aa","gene","stat")
write.csv(clstr6,paste("mgenome.clstr.map.1/",files[f],sep=""),row.names = F)
}5.4 Building trees with inclusion of metagenome-derived mec genes
#!/bin/bash
mkdir mgenome.genome.proteins.align
for f in DCM.cass.mgenome.plus.genome/*
do
name=${f##*/}
mafft --maxiterate 1000 --globalpair "$f" > mgenome.genome.proteins.align/"${name}.align"
done
mkdir mgenome.genome.proteins.trim
for f in mgenome.genome.proteins.align/*
do
name=${f##*/}
base=${name%.align}
/home/robert/Tools/trimal.v1.2rev59/trimAl/source/trimal -in "$f" -out mgenome.genome.proteins.trim/"${base}.trim" -gappyout
done
mkdir mgenome.genome.proteins.tree
for f in mgenome.genome.proteins.trim/*
do
name=${f##*/}
base=${name%.trim}
fasttree -lg -gamma "$f" > mgenome.genome.proteins.tree/"$base"
done
Trees are then visuzlied at iTOL with mid-point rooting and custom colors