For the input data, take a subsample of the full dataset.
It will also be useful to get summary statistics for the input data (number of sequences, average length, etc.):
## bbmap is a useful tool here
#https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbmap-guide/
## subsample 1000 reads; write the output as fasta (reformat.sh converts
## formats based on the file extension), since BLAST cannot read fastq
reformat.sh \
in="path/to/data.fastq" \
out="path/to/subsamples.fasta" \
samplereadstarget=1000 sampleseed=1234 overwrite=true
## get the basic statistics using seqkit
## https://bioinf.shenwei.me/seqkit/
seqkit stats path/to/data.fastq -T > path/to/basic_stats.txt
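The -T flag makes seqkit write a tab-delimited table with a header row; the two columns we rely on later are num_seqs and avg_len. A quick way to peek at it from R (a sketch, assuming data.table is installed):
library(data.table)
## read the TSV and keep just the columns the later steps use
fread("path/to/basic_stats.txt")[, .(num_seqs, avg_len)]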
Then, select a database to use (e.g., representative genomes, complete RefSeq).
Make sure to also build it into a BLAST database:
## download using this tool: https://github.com/kblin/ncbi-genome-download
# efetch also works here, or download directly from the NCBI Assembly site
ncbi-genome-download --refseq-category representative --assembly-levels complete,chromosome bacteria
## concatenate the downloaded genomes into a single fasta before indexing, e.g.:
## zcat refseq/bacteria/*/*_genomic.fna.gz > path/to/database.fasta
## store a blast database
makeblastdb \
-in path/to/database.fasta \
-out path/to/database_db \
-dbtype nucl -parse_seqids
Now BLAST the subset of sequences against the database:
blastn \
-task 'megablast' \
-db path/to/database_db \
-max_hsps 3 \
-max_target_seqs 3000 \
-perc_identity 95 \
-query path/to/subsamples.fasta \
-outfmt '10 qseqid sseqid length pident mismatch gapopen slen qlen bitscore' \
-out path/to/module_1_blast.csv \
-num_threads 8
Parse the output and write two summary CSV files.
The first summary file is sorted by the total count of hits to each reference.
The second summary file is the number of hits at each read length.
We'll use the first table to figure out which genus to investigate in the next step.
library(tidyverse)
library(data.table)
## load the stats table and the blast table
stats_table <-
fread("path/to/basic_stats.txt")
## add a column for true percent identity, and filter
blast_table_m1 <-
  fread(
    "path/to/module_1_blast.csv"
  ) %>%
  `colnames<-`(c(
    "qseqid", "sseqid", "length",
    "pident", "mismatch",
    "gapopen", "slen", "qlen", "bitscore")) %>%
  ## npid: percent identity over the whole read, not just the aligned HSP
  mutate(npid = round(length * pident / qlen)) %>%
  filter(npid >= 95)
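To see what npid adds over pident alone, consider a toy case (numbers assumed for illustration): a 250 nt read whose best HSP covers only 240 nt at 99% identity matches about 95% of the full read, landing right at the filter threshold:
## toy numbers, not real hits
hsp_length <- 240; pident <- 99; qlen <- 250
round(hsp_length * pident / qlen)
#> [1] 95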
## the full dataset depth is the number of R1 reads * 2 (paired-end data)
sample_depth <-
  (stats_table %>%
     pull(num_seqs)) * 2
mean_seq_length <-
  stats_table %>%
  pull(avg_len)
## Report a summary of each database hit
out_table_hits <-
  blast_table_m1 %>%
  ## pull the accession out of sseqid strings like "ref|NZ_XXXX.1|"
  mutate(
    sseqid =
      unlist(
        lapply(
          sseqid,
          function(x){ strsplit(x, "\\|")[[1]][2]}
        )
      )
  ) %>%
  group_by(sseqid) %>%
  summarise(
    total_count = n(),
    total_bitscore = sum(bitscore),
    slen = unique(slen),
    reads = sample_depth,
    ## scale hits from the 1000-read subsample up to full depth,
    ## convert to bases, and divide by the genome length
    exp_coverage =
      round(
        ((total_count * mean_seq_length) * (sample_depth / 1000)) /
          unique(slen)
      )
  ) %>%
  arrange(desc(total_count)) %>%
  `colnames<-`(
    c(
      "accession", "total_count",
      "total_bitscore", "genome_length", "total_reads",
      "expected_coverage"
    )
  )
write_csv(
  out_table_hits,
  "path/to/module_1_summary_table.csv"
)
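As a sanity check on expected_coverage, here is the arithmetic with toy numbers (assumed for illustration, not from real data): 50 subsample hits to a 5 Mb genome, a 250 nt mean read length, and a full depth of 2 million reads give about 5x coverage:
## toy numbers, assumed for illustration
total_count <- 50      # hits to one accession among the 1000 subsampled reads
mean_seq_length <- 250 # mean read length from seqkit
sample_depth <- 2e6    # full dataset depth (R1s * 2)
slen <- 5e6            # genome length of the hit
round((total_count * mean_seq_length) * (sample_depth / 1000) / slen)
#> [1] 5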
## then get the count of each read length
out_table_read_length <-
  blast_table_m1 %>%
  dplyr::count(qlen)
write_csv(
  out_table_read_length,
  "path/to/module_1_read_length_table.csv"
)
(This module is similar to module 1.)
After looking at the results of the last table, we now know which genus to search for a reference genome.
We can now query the 1000 subsampled reads against a database containing all sequences from that genus.
If the top genus is Salmonella, we download all of those sequences:
ncbi-genome-download --genus Salmonella --assembly-levels complete,chromosome bacteria
## concatenate the downloads into path/to/genus_database.fasta as before, then
## store a blast database
makeblastdb \
-in path/to/genus_database.fasta \
-out path/to/genus_database_db \
-dbtype nucl -parse_seqids
Then BLAST the subset of sequences against the genus database:
blastn \
-task 'megablast' \
-db path/to/genus_database_db \
-max_hsps 3 \
-max_target_seqs 3000 \
-perc_identity 95 \
-query path/to/subsamples.fasta \
-outfmt '10 qseqid sseqid length pident mismatch gapopen slen qlen bitscore' \
-out path/to/module_2_blast.csv \
-num_threads 8
Get summary tables for that database.
These will look very similar to the summary tables from the first module.
Use this table to figure out which accession to download as the reference (see the sketch after the code below).
## add a column for true percent identity, and filter
blast_table_m2 <-
  fread(
    "path/to/module_2_blast.csv"
  ) %>%
  `colnames<-`(c(
    "qseqid", "sseqid", "length",
    "pident", "mismatch",
    "gapopen", "slen", "qlen", "bitscore")) %>%
  ## npid over the whole read, as in module 1
  mutate(npid = round(length * pident / qlen)) %>%
  filter(npid >= 95)
## the full dataset depth, as in module 1
sample_depth <-
  (stats_table %>%
     pull(num_seqs)) * 2
mean_seq_length <-
  stats_table %>%
  pull(avg_len)
## get the accessions from the BLAST
out_table <-
blast_table_m2 %>%
mutate(
sseqid =
unlist(
lapply(
sseqid,
function(x){ strsplit(x, "\\|")[[1]][2]}
)
)
) %>%
group_by(sseqid) %>%
summarise(
total_count=n(),
total_bitscore = sum(bitscore),
slen = unique(slen),
reads = sample_depth,
exp_coverage =
round(
((total_count * mean_seq_length)*(sample_depth/1000))/
unique(slen)
)
) %>%
arrange(desc(total_count)) %>%
`colnames<-`(
c(
"accession", "total_count",
"total_bitscore", "genome_length", "total_reads",
"expected_coverage"
)
)
write_csv(
out_table,
"path/to/module_2_summary_table.csv"
)
## then get the count of each read length
out_table_read_length <-
  blast_table_m2 %>%
  dplyr::count(qlen)
write_csv(
  out_table_read_length,
  "path/to/module_2_read_length_table.csv"
)
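Once the summary table is written, you can pick the reference programmatically. Here is a minimal sketch that takes the accession with the most hits; the accession used in module 3 (NZ_LR134187.1) is simply whatever topped this table for our data:
## take the accession with the highest hit count from the module 2 summary
top_accession <-
  fread("path/to/module_2_summary_table.csv") %>%
  slice_max(total_count, n = 1) %>%
  pull(accession)
top_accession # e.g. "NZ_LR134187.1" in this walkthrough
Fetch that accession as fasta (efetch from NCBI EDirect works, as noted in module 1) and save it as path/to/NZ_LR134187.1.txt for module 3.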
For this step, let's say we give the option to define three read-length cutoffs, at 230, 240, and 250 nt.
For the subsampled dataset, we'll trim all reads longer than 230 nt to 230 nt, all longer than 240 nt to 240 nt, and all longer than 250 nt to 250 nt. We won't look at anything shorter than the lowest cutoff.
Write a new fasta file at each read length:
library(Biostrings)
## We'll use 230, 240, and 250
rl_cutoffs <- c(230, 240, 250)
sample_seqs <-
  readDNAStringSet("path/to/subsamples.fasta", format = "fasta")
## trim the sequences and write each set out to a new file;
## keep the number of surviving reads for the summary later
n_seqs <-
  lapply(
    rl_cutoffs,
    function(cutoff_number){
      cut_seq <-
        sample_seqs[width(sample_seqs) >= cutoff_number] %>%
        subseq(., 1, cutoff_number)
      writeXStringSet(
        cut_seq,
        sprintf(
          "path/to/cutoff_%i.txt",
          cutoff_number
        )
      )
      length(cut_seq)
    }
  ) %>%
  unlist
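A quick check that the trimming behaved as intended (assuming the files above were written where expected):
## every read in a cutoff file should now be exactly that long
cut_230 <- readDNAStringSet("path/to/cutoff_230.txt")
stopifnot(all(width(cut_230) == 230))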
Make each of these new files into a BLAST database:
makeblastdb \
-in path/to/cutoff_230.txt \
-out path/to/cutoff_230_db -dbtype nucl -parse_seqids
makeblastdb \
-in path/to/cutoff_240.txt \
-out path/to/cutoff_240_db -dbtype nucl -parse_seqids
makeblastdb \
-in path/to/cutoff_250.txt \
-out path/to/cutoff_250_db -dbtype nucl -parse_seqids
Then BLAST the chosen reference file against each of these cutoff sample databases:
blastn \
-task 'megablast' \
-db path/to/cutoff_230_db \
-max_hsps 1 \
-max_target_seqs 5000 \
-perc_identity 100 \
-query path/to/NZ_LR134187.1.txt \
-outfmt '10 qseqid sseqid length pident mismatch gapopen slen qlen bitscore' \
-out path/to/cutoff_230.csv \
-num_threads 8
blastn \
-task 'megablast' \
-db path/to/cutoff_240_db \
-max_hsps 1 \
-max_target_seqs 5000 \
-perc_identity 100 \
-query path/to/NZ_LR134187.1.txt \
-outfmt '10 qseqid sseqid length pident mismatch gapopen slen qlen bitscore' \
-out path/to/cutoff_240.csv \
-num_threads 8
blastn \
-task 'megablast' \
-db path/to/cutoff_250_db \
-max_hsps 1 \
-max_target_seqs 5000 \
-perc_identity 100 \
-query path/to/NZ_LR134187.1.txt \
-outfmt '10 qseqid sseqid length pident mismatch gapopen slen qlen bitscore' \
-out path/to/cutoff_250.csv \
-num_threads 8
Now we get the summary values for each result:
perfect_alignments <-
  lapply(
    rl_cutoffs,
    function(cutoff_number){
      fread(
        sprintf(
          "path/to/cutoff_%i.csv",
          cutoff_number
        )
      ) %>%
      `colnames<-`(c(
        "qseqid", "sseqid", "length",
        "pident", "mismatch",
        "gapopen", "slen", "qlen", "bitscore")) %>%
      ## here the reads are on the *subject* side (the query is the
      ## reference genome), so compute identity over the read length (slen)
      mutate(npid = round(length * pident / slen)) %>%
      ## keep only reads whose full trimmed length matched perfectly
      filter(npid == 100) %>%
      nrow
    }
  ) %>% unlist
m3_summary_table <-
  tibble(
    Cutoff = rl_cutoffs,
    Number_of_reads = n_seqs,
    Perfect_hits = perfect_alignments
  ) %>%
  ## product = total perfectly matching bases retained at each cutoff
  mutate(product = Perfect_hits * Cutoff)
write_csv(m3_summary_table, "path/to/read_length_summary.csv")
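The product column is what drives the final choice: presumably you want the cutoff that retains the most perfectly matching bases overall (an assumption on my part, since the selection rule isn't stated). A sketch of that selection:
## pick the cutoff with the largest Perfect_hits * Cutoff product
best_cutoff <-
  m3_summary_table %>%
  slice_max(product, n = 1) %>%
  pull(Cutoff)
best_cutoff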