The goal of this pipeline is to build phylogenetic trees for each of the 10 mec cassette proteins along with the top 20 genome-based homologs.

Gather top 20 homologs from IMG, excluding those in defined mec cassettes
Save the blast results table for each
define a homolog set for each as an IMG object
download the protein sequences for each set
download the gene info page for each set
import the proteins into R
reformat fasta header to read: genus_species_geneID
import the DCM cassette proteins
Reformat the fasta header to read: genus_species_geneID
Add each set of DCM genes to the appropirate homolog set
manually add the DCMF proteins to each and re-import
make sure DIEL has proper name throughout
build an automated MAFFT G-INS-I alignment pipeline -> trimgappy -> RaxML -f treebuild, loop over files in directory shell script
cluster metagenome-derived mec proteins and build trees using the same procedure which includes them

0.1 Notes

MecC under default settings (1e-5) found no hits outside of DCM cassette. Switched to 1e-2.

1 Import the homolog protein sets and rename fasta headers

Since it is difficult to create named R objects inside of a loop, each set is read in along with info, fastas are refined, and then written again

library(Biostrings)
library(dplyr)
homo.files <- list.files("orthologs.faa/")
info.files <- list.files("gene.info/")

#Define a string split function which splits by space character and returns first element
splitfun <- function(x){
  strsplit(x," ")[[1]][1]
}

#Define a simple function that will replace all spaces and illegal characters with underscores
spc.replc <- function(x){
  gsub(" ", "_", x, fixed = TRUE)
  gsub(",",".",x)
  gsub(";",".",x)
  gsub(":",".",x)
}


#define a funciton that will replace any annoying illegal punctuation
punct.replc <- function(x){
  gsub("([[:punct:]])|\\s+","_",x)
}

#define write fasta function
writeFasta<-function(data, filename){
  fastaLines = c()
  for (rowNum in 1:nrow(data)){
    fastaLines = c(fastaLines, as.character(paste(">", data[rowNum,"name"], sep = "")))
    fastaLines = c(fastaLines,as.character(data[rowNum,"seq"]))
  }
  fileConn<-file(filename)
  writeLines(fastaLines, fileConn)
  close(fileConn)
}

for (x in c(1:length(homo.files))) {
  iter <- readAAStringSet(paste("orthologs.faa/",homo.files[x],sep = ""))
  Gene.ID <- names(iter)
  Gene.ID <- unlist(lapply(Gene.ID, splitfun))
  seq <- paste(iter)
  fasta.df <- data.frame(Gene.ID, seq, stringsAsFactors = F)
  fasta.df$Gene.ID <- as.character(fasta.df$Gene.ID)
  info <- read.csv(paste("gene.info/",info.files[x],sep=""),header = T,sep = "\t")[c(1,5)]
  info$newname <- paste(info$Genome.Name,info$Gene.ID,sep=" ")
  #info$newname <- unlist(lapply(info$newname, spc.replc))
  info$newname <- unlist(lapply(info$newname, punct.replc))
  info$Gene.ID <- as.character(info$Gene.ID)
  fasta.df <- dplyr::left_join(fasta.df, info, by="Gene.ID" )
  fasta.df <- fasta.df[c(4,2)]
  colnames(fasta.df)[1] <- "name"
  writeFasta(fasta.df, paste("orthlogs.clean/",homo.files[x],sep = ""))
}

2 Import and reformat mec cassette proteins

DCM.all <- readAAStringSet("cass.proteins/five.DCM.cassette.proteins.faa")
name.old <- names(DCM.all)
seq <- paste(DCM.all)
DCM.df <- data.frame(name.old, seq)
colnames(DCM.df)[1] <- "gene_oid"
DCM.info.all <- read.csv("cass.proteins/gene.info.IMG.all.csv",header = T,stringsAsFactors = F)[c(1,4,8)]
cass <- sort(unique(DCM.info.all$DCM.gene.name))

for (x in c(1:length(cass))) {
  info <- DCM.info.all[DCM.info.all$DCM.gene.name == cass[x],]
  info$newname <- paste(info$Genome.Name,info$gene_oid,sep=" ")
  #info$newname <- unlist(lapply(info$newname, spc.replc))
  info$newname <- unlist(lapply(info$newname, punct.replc))
  info$gene_oid <- as.character(info$gene_oid)
  fasta.df <- dplyr::left_join(info, DCM.df, by="gene_oid" )
  fasta.df <- fasta.df[c(4,5)]
  colnames(fasta.df)[1] <- "name"
  writeFasta(fasta.df, paste("DCM.cass.clean/",cass[x],".clean.faa",sep = ""))
}

3 Combine the cassette genes with their orthologs

DCM.files <- list.files("DCM.cass.clean/")
ortho.files <- list.files("orthlogs.clean/")

for (x in c(1:length(DCM.files))) {
  DCM <- readAAStringSet(paste("DCM.cass.clean/",DCM.files[x],sep = ""))
  name <- names(DCM)
  seq <- paste(DCM)
  DCM <- data.frame(name,seq)
  DCM$name <- as.character(DCM$name)
  ortho <- readAAStringSet(paste("orthlogs.clean/",ortho.files[x],sep=""))
  name <- names(ortho)
  seq <- paste(ortho)
  ortho <- data.frame(name,seq)
  ortho$name <- as.character(ortho$name)
  all.fa <- rbind(DCM,ortho)
  writeFasta(all.fa, paste("DCM.cass.with.ortho/",DCM.files[x],sep=""))
}

4 build trees

mkdir proteins.align

for f in DCM.cass.with.ortho/*
    do 
    name=${f##*/}
    mafft --maxiterate 1000 --globalpair "$f" > proteins.align/"${name}.align"
done


mkdir proteins.trim

for f in proteins.align/*
    do 
    name=${f##*/}
    base=${name%.align}
    /home/robert/Tools/trimal.v1.2rev59/trimAl/source/trimal -in "$f" -out proteins.trim/"${base}.trim" -gappyout
done

mkdir proteins.tree

for f in proteins.trim/*
    do 
    name=${f##*/}
    base=${name%.trim}
    fasttree -lg -gamma "$f" > proteins.tree/"$base" 
done

5 Adding mec genes from metagenomes

5.1 Sorting the fastas by mec gene

Locus tags must be used for these; the IMG locus tag for metagenomes has the genome ID incorporated.

5.1.1 Import the genes and clean fasta header

use env type instead of species name

prots <- readAAStringSet("mgenome.cass/dcm.cassette.all.proteins.mgenome.only.faa")
Locus.Tag <- names(prots)
Locus.Tag <- unlist(lapply(Locus.Tag, splitfun))
seq <- paste(prots)
len <- width(prots)
prots <- data.frame(Locus.Tag,seq, len, stringsAsFactors = F)

5.1.2 Sort

The gene.info.all file generated previously, I have amended it with homolog codes for each gene. Use this as the guide.

this also removes sequences which are abnormally short, less than 70% of the average protein length

library(dplyr)
library(tidyr)
info <- read.csv("mgenome.cass/gene.info.all.with.proportions.csv", stringsAsFactors = F)
info <- info[c(2,4,5)]
prots <- left_join(prots, info)
prots <- prots[complete.cases(prots),]
prots <- unite(prots, name, Genome.source,Locus.Tag)

ortho <- sort(unique(prots$Dcm.ortholog, na.omit = T))

for (x in (1:length(ortho))) {
  gene <- prots[prots$Dcm.ortholog == ortho[x], ]
  gene <- gene[complete.cases(gene),]
  gene <- gene[gene$len > (0.7*mean(gene$len)),]
  gene <- gene[complete.cases(gene),]
  writeFasta(gene, paste("DCM.cass.ortho.mgenome.only/",ortho[x],".mgenome.faa",sep=""))
}

The largest set has 37 sequences, too many for a tree, especially considering that many of them are duplicates. Each has to be sent through a CD hit pipeline

5.2 Clustering the mgenome genes

They are clustered at 80%

for f in DCM.cass.ortho.mgenome.only/*
    do 
    name=${f##*/}
    cdhit -i "$f" \
    -o mgenome.cluster/"${name}" -M 50000 -T 0 -d 50 -c .8 -g 1 -l 150
done

5.3 Parsing the *.clstr files

rough pipeline instructions here: http://www.rpubs.com/rmurdoch/cdhit_to_mapping_file. This pipeline has been placed into a loop. I should turn it into a function for the future.

files <- list.files("mgenome.cluster/")


numbers_only <- function(x) !grepl("\\D", x)

x=0
for (f in c(1:length(files))) {
  clstr <- read.csv(paste("mgenome.cluster/",files[f],sep=""), sep = "\t", row.names = NULL, header = F, stringsAsFactors = F)
  clstr2 <- clstr
  for (n in c(1:nrow(clstr2))) {
    if (numbers_only(clstr2[n,1]) == T) {
      clstr2[n,1] <- x
    }
    else {NULL}
    x <- clstr2[n,1]
  }
  library(stringr)
  clstr.sums <- data.frame(dplyr::count(clstr2,V1))
  switch <- clstr.sums[1,2]
  clstr3 <- cbind(clstr2[1], clstr)
  clstr4 <- clstr2[-which(clstr2$V2 == ""), ]
  clstr5 <- lapply(clstr4, gsub, pattern='>', replacement='')
  clstr5.2 <- data.frame(str_split_fixed(clstr5$V2, "aa, ", 2))
  clstr5.3 <- data.frame(str_split_fixed(clstr5.2$X2, "... ", 2))
  clstr6 <- cbind(clstr5[1],clstr5.2[1],clstr5.3[1:2])
  colnames(clstr6) <- c("cluster","aa","gene","stat")
  write.csv(clstr6,paste("mgenome.clstr.map.1/",files[f],sep=""),row.names = F)
}

5.4 Building trees with inclusion of metagenome-derived mec genes

#!/bin/bash



mkdir mgenome.genome.proteins.align

for f in DCM.cass.mgenome.plus.genome/*
    do 
    name=${f##*/}
    mafft --maxiterate 1000 --globalpair "$f" > mgenome.genome.proteins.align/"${name}.align"
done


mkdir mgenome.genome.proteins.trim

for f in mgenome.genome.proteins.align/*
    do 
    name=${f##*/}
    base=${name%.align}
    /home/robert/Tools/trimal.v1.2rev59/trimAl/source/trimal -in "$f" -out mgenome.genome.proteins.trim/"${base}.trim" -gappyout
done

mkdir mgenome.genome.proteins.tree

for f in mgenome.genome.proteins.trim/*
    do 
    name=${f##*/}
    base=${name%.trim}
    fasttree -lg -gamma "$f" > mgenome.genome.proteins.tree/"$base" 
done

Trees are then visuzlied at iTOL with mid-point rooting and custom colors

mec protein tree building