# Scrape sequences

# Tutorial: download a run of consecutively numbered GenBank sequences and write them to a .fasta file.

#Use library command to make ape functions accessible by this script
library(ape)

# Build the character vector of GenBank accession numbers JQ983161 through
# JQ983255 (95 consecutive accessions). These sequences all belong to one
# genus of sticklebacks, Gasterosteus.
# NOTE(review): the tutorial note also mentions the range MT103163-MT103183,
# uploaded by previous Endicott Bioinformatics students -- presumably an
# alternative range to use here; confirm the intent.
# paste0() joins its arguments element-wise with no separator: the prefix "JQ"
# is recycled across every number, giving one string per accession (the
# result is a character vector, not a single string).
seq1 <- paste0("JQ", seq(983161, 983255))

# Fetch every accession listed in seq1 from GenBank in a single call -- this
# would be really tedious to do by hand.
# seq.names labels each downloaded sequence with its accession number.
# as.character = TRUE asks ape for plain character vectors; all downloaded
# sequences are returned together in one list object.
sequences <- read.GenBank(
  access.nb = seq1,
  seq.names = seq1,
  species.names = TRUE,
  as.character = TRUE
)

# Save the downloaded sequences to "fish.fasta" in FASTA format
write.dna(
  x = sequences,
  file = "fish.fasta",
  format = "fasta"
)

# Apply the tools above to the MT CO1 gene sequences of a single species, Bicyclus anisops.

# Read the accession numbers of the MT CO1 gene sequences from an accession
# list (.seq) file. A file is used because these accessions are not
# consecutive, so they cannot be generated from a numeric range.
# stringsAsFactors = FALSE keeps the accession column (V1) as character on
# every R version; before R 4.0 the default converted strings to factors.
# NOTE(review): the variable name `seq` is the same as the base function
# seq(); it is kept because the download step reads seq$V1.
seq <- read.table("sequence.seq", stringsAsFactors = FALSE)

# Fetch every accession listed in column V1 of the accession table from
# GenBank. Each sequence is labelled with its accession number and returned as
# a character vector. This reassigns `sequences`, replacing any earlier value.
sequences <- read.GenBank(
  access.nb = seq[["V1"]],
  seq.names = seq[["V1"]],
  species.names = TRUE,
  as.character = TRUE
)

# Save the downloaded sequences to a FASTA file.
# NOTE(review): the file name spells the genus "Bicylus"; the usual spelling
# appears to be "Bicyclus" -- left unchanged so existing paths keep working.
write.dna(
  x = sequences,
  file = "Bicylus_anisops.fasta",
  format = "fasta"
)

# Scrape sequences

# John Beliveau

# 2024-09-30