This project aims at understanding the evolutionary processes that lead to the prescence and abscence of the potential to fix nitrogen in bacterial symbionts of lucinid clams. For this, a pipeline was developed to reconcile the symbiont species tree with their respective nitrogenase gene tree.
Because the concept of species is controversial for bacteria and common methods used for sexual reproducing organisms can not be applied to them, we had to decide on a method that allowed us to get to an equivalent of a species tree. We decided on applying the GTDB workflow, that uses genome trees inferred from an aligned concatenated set of 120 single copy marker proteins for Bacteria.
input directorydRep dereplicate drep_bins -g input/*.fa -sa 0.99 -p 32
gtdbtk classify_wf --genome_dir drep_bins --out_dir gtdb_classify -x fa --cpus 48
mkdir spp_tree
cd spp_tree
iqtree2 -m TEST -s ../gtdb_classify/gtdbtk.bac120.user_msa.fasta -B 10000 --alrt 10000 -T AUTO --threads-max 64
cd ..
spp_tree/gtdbtk.bac120.user_msa.newickWe obtained gene trees from the nitrogenase operon (nifHDKT) and from recA (single copy core gene), wich was used as a control. First we annotated the MAGs to identify and extract the genes of interest.
mkdir gene_trees
cd gene_trees
conda activate DRAM
DRAM.py annotate -i 'drep_bins/*.fa' -o annotation
conda deactivate
We built the tree from a concatenated alignment of four genes (nifH, nifD, nifK and nifT) to obtain a stronger phylogenetic signal. Before concatenating the alignments, we compared the trees of the individual genes. We checked that the branches with high support were shared between the gene trees, which would indicate that the genes share an evolutionary history.
mkdir nif
grep -A1 'nitrogenase' annotation/genes.fna | grep -A1 'NifH' | sed '/^--/d' >> nif/nifH_extraction.fna
grep -A1 'NifT' annotation/genes.fna | sed '/^--/d' >> nif/nifT_extraction_drep.fna
grep -A1 'nitrogenase' annotation/genes.fna | grep -A1 'alpha' | sed '/^--/d' >> nif/nifD_extraction.fna
grep -A1 'nitrogenase' annotation/genes.fna | grep -A1 'beta' | sed '/^--/d' >> nif/nifK_extraction.fna
ls ../drep_bins/*.fa | sed 's/.fa//g' >> drep_mags.txt
FastA.rename.pl from Enveomicsmkdir nif/rename
cd nif/rename
cp ../../drep_mags.txt .
for i in H D K T;
do mv ../nif${i}_extraction.fna . ;
grep '>' nif${i}_extraction.fna | sed 's/>//g' | sort >> headers_nif${i}.txt ;
cat drep_mags.txt | while read line; do grep -o $line nif${i}_extraction.fna; done > drep_mags_nif${i}.txt;
awk '{print $0"_nif"}' drep_mags_nif${i}.txt >> new_headers_nif${i}.txt;
paste -d '\t' <(sort headers_nif${i}.txt) <(sort new_headers_nif${i}.txt) >> rename_nif${i}.txt ;
FastA.rename.pl rename_nif${i}.txt nif${i}_extraction.fna > nif${i}_renamed.fna;
done
mkdir ../alig
cd ../align
for i in H D K T;
do cp ../rename/nif${i}_renamed.fna;
seqkit rmdup -n nif${i}_renamed.fna -o nif${i}_renamed_uniq.fna;
time fsa --fast nif${i}_renamed_uniq.fna > nif${i}_fsa_renamed_uniq.fa;
done
seqkit concat nif*_fsa_renamed_uniq.fa > nifHDKT_fsa_renamed_uniq.fna
mkdir ../tree
cd ../tree
for i in H D K T ;
do cp ../align/nif${i}_fsa_renamed_uniq.fa . ;
iqtree2 -m TEST -s nif${i}_fsa_renamed_uniq.fa -B 1000 --alrt 10000 -T AUTO --threads-max 24 ;
done
cp ../align/nifHDKT_fsa_renamed_uniq.fna
iqtree2 -m TEST -s nifHDKT_fsa_renamed_uniq.fna -B 1000 --alrt 10000 -T AUTO --threads-max 24
nifHDKT_fsa_renamed_uniq.fna.treefile ) at outgroup MAG in FigTree and exported in Newick format: gene_trees/nif/tree/nifHDKT_fsa_renamed_uniq.newickcd ../..
mkdir reca
grep -A1 'RecA' annotation/genes.fna | grep -A1 'recombination' | sed '/^--/d' >> reca/reca_extraction.fna
FastA.rename.pl from Enveomicsmkdir reca/rename
cd reca/rename
cp ../../drep_mags.txt .
grep '>' reca_extraction.fna | sed 's/^>//' | sort >> headers_reca.txt
cat drep_mags.txt | while read line; do grep -o $line reca_extraction.fna; done > drep_mags_reca.txt
awk '{print $0"_recA"}' drep_mags_reca.txt >> new_headers_reca.txt
paste -d '\t' <(sort headers_reca.txt) <(sort new_headers_reca.txt) >> rename_reca.txt
FastA.rename.pl rename_reca.txt reca_extraction.fna > reca_extraction_renamed.fna
mkdir ../alig
cd ../align
seqkit rmdup reca_extraction_renamed.fna -o reca_extraction_renamed_uniq.fna
time fsa --fast ../rename/reca_extraction_renamed_uniq.fna > reca_fsa_renamed_uniq.fna
mkdir ../tree
cd ../tree
cp ../align/reca_fsa_renamed_uniq.fa .
iqtree2 -m TEST -s reca_fsa_renamed_uniq.fna -B 1000 --alrt 10000 -T AUTO --threads-max 24
reca_fsa_renamed_uniq.fna.treefile ) at outgroup MAG in FigTree and exported in Newick format: gene_trees/reca/tree/reca_fsa_renamed_uniq.newickcd ../../..
mkdir generax
cp spp_tree/gtdbtk.bac120.user_msa.newick .
cp gene_trees/nif/align/nifHDKT_fsa_renamed_uniq.fna .
cp gene_trees/nif/tree/nifHDKT_fsa_renamed_uniq.fna.newick .
cp gene_trees/reca/align/reca_fsa_renamed_uniq.fna .
cp gene_trees/reca/tree/reca_fsa_renamed_uniq.fna.newick .
grep '>' nifHDKT_fsa_renamed_uniq.fna | sed 's/>//g' >> headers_gene_nif.fa
grep '>' nifHDKT_fsa_renamed_uniq.fna | sed 's/>//g' | sed 's/_nif//g' >> headers_spp_nif.fa
grep '>' reca_fsa_renamed_uniq.fna | sed 's/>//g' >> headers_gene_reca.fa
grep '>' reca_fsa_renamed_uniq.fna | sed 's/>//g' | sed 's/_recA//g' >> headers_spp_reca.fa
paste -d ':' headers_spp_nif.fa headers_gene_nif.fa >> mapping_nif.link
paste -d ':' headers_spp_reca.fa headers_gene_reca.fa >> mapping_reca.link
families.txt file, which contains the path to the needed filesecho '[FAMILIES]
- NifHDKT
starting_gene_tree = ./nifHDKT_fsa_renamed_uniq.fna.newick
alignment = ./nifHDKT_fsa_renamed_uniq.fna
mapping = ./mapping_nif.link
subst_model = GTR+G+I
- RecA
starting_gene_tree = ./reca_fsa_renamed_uniq.fna.treefile.newick
alignment = ./reca_fsa_renamed_uniq.fna
mapping = ./mapping_reca.link
subst_model = GTR+G+I' >> families.txt
generax --families families.txt --species-tree gtdbtk.bac120.user_msa.fasta.treefile.newick --rec-model UndatedDTL --prefix correction --strategy SPR
cd correction/reconciliations
ls *.xml >> list_files
thirdkind -f list_files -m -o reca_nif_reconciliated_polish2.0.svg