Load the required package and identify the input FASTA file. The FASTA file contains COI sequences from biodiversity sampling in Curaçao.
library(seqinr)
# Path to the multi-sequence FASTA file to sample from; point this at your
# own file to reuse the workflow.
input_file <- "../data/ALL_Atlantic_2_Feb_2017.fasta"

# Parse the FASTA file with seqinr; `sequences` holds one element per record
# and is indexed, counted and named by the steps that follow.
sequences <- read.fasta(file = input_file)
Randomly select 10 sequences from the FASTA file
# Draw a random subset of the records held in `sequences`.
# Produces `selected_indices` (positions within `sequences`) and
# `selected_sequences` (the corresponding records, names preserved by `[`).

# Fix the RNG seed so the same records are drawn on every run (optional).
set.seed(42)

# How many records to draw; change this value to resize the subset.
number_of_sequences_to_select <- 10

# If the file holds fewer records than requested, fall back to taking all of
# them. The message is built from the variable so it stays accurate when the
# requested count above is edited.
if (length(sequences) < number_of_sequences_to_select) {
  warning(
    "There are fewer than ", number_of_sequences_to_select,
    " sequences in the fasta file. All sequences will be selected."
  )
  number_of_sequences_to_select <- length(sequences)
}

# Sample positions without replacement. sample.int() makes it explicit that
# we draw from 1..length(sequences) rather than from a vector of values.
selected_indices <- sample.int(
  length(sequences),
  number_of_sequences_to_select
)
selected_sequences <- sequences[selected_indices]
Write a fasta file from the 10 randomly selected sequences
# Destination for the sampled records; change the path to write elsewhere.
output_file <- "../output/output.fasta"

# Write the sampled records, using their existing names as FASTA headers.
# open = "w" overwrites any file already at `output_file`.
write.fasta(
  sequences = selected_sequences,
  names = names(selected_sequences),
  file.out = output_file,
  open = "w"
)
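# Likely not needed: fixes a mismatch between the sequence names in the GFF and FASTA files.
# NOTE: the command below targets 10_seqs.fa, whereas this workflow writes ../output/output.fasta; adjust the path before use.
# sed -i 's/>lcl|/>/g' ../output/10_seqs.fa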
Create index
# Index the sampled FASTA with samtools faidx; the index is needed
# downstream when the file is loaded in IGV.
/home/shared/samtools-1.12/samtools faidx ../output/output.fasta
Find CG motifs
# Search the sampled sequences for the pattern CG with EMBOSS fuzznuc and
# write the report in GFF format.
fuzznuc \
  -sequence ../output/output.fasta \
  -pattern CG \
  -rformat gff \
  -outfile ../output/CGoutput.gff
## Search for patterns in nucleotide sequences
# Visualize files in IGV
Used the 10-sequence FASTA file (output.fasta) as the genome in IGV and then loaded the resulting GFF file (CGoutput.gff).
Did not find any CG motifs??