cd ../data
curl -O https://gannet.fish.washington.edu/panopea/Cg-roslin/cgigas_uk_roslin_v1_genomic-mito.faweek07 - motifs
head data/cgigas_uk_roslin_v1_genomic-mito.fa>NC_047559.1 Crassostrea gigas linkage group 1, cgigas_uk_roslin_v1, whole genome shotgun sequence
AAATAATCGGTGTTAATTTAAGGACAAATATTTCTATAAAGAAATTACATCCCTGGAAAGAAAccaatattcttcatttt
aatgttaaaataggGATATAGTGTAAATGCTCCCGAAAGCCCCCTAAAACTTTTGGGATCAATAATAACACCCGTATTTC
TCCGTTgatacattaatacaaagagAACAAAGAAATCATTAGCATTAGTTTcagataaaattctttataaaaaaaattac
acccctgagaaaactcaaaatattctgtttagcgttaaaaaaaatcagtagacTTAATTAGTTgggaaactccctaaatc
ttttgataTGAATACTTACACCCTTATTTTATCTTGATTACATGAATacgaagaaataaacaaaatcatgttaaatgaaa
aatcactttattaaaaatgacatcctcaggaaggaacccatattcttcattttaacgttcaatAATAGGTATAGTTTAAA
CGCTAAGTAGACCCCCAAAAACTTTTGGGATTAAGACTTACACCATCAATTCTCCTTTAGtgcattaaaacaaagaaaat
aatcagctttagttttaggaaaaatccttttattgagaaataacacCCCTGAGAAtactcaaaattttcattttattgtt
aaaaaaatccgCTAGGGTAGTAAATCAGTTAGGAAACTCCTTAATtatcttttggtatgaataccTACACCCTTATATAT
library(seqinr)
# Replace 'input.fasta' with the name of your multi-sequence fasta file
input_file <- "data/cgigas_uk_roslin_v1_genomic-mito.fa"
sequences <- read.fasta(input_file)# Set the seed for reproducibility (optional)
set.seed(42)
number_of_sequences_to_select <- 10
if (length(sequences) < number_of_sequences_to_select) {
warning("There are fewer than 10 sequences in the fasta file. All sequences will be selected.")
number_of_sequences_to_select <- length(sequences)
}
selected_indices <- sample(length(sequences), number_of_sequences_to_select)
selected_sequences <- sequences[selected_indices]# Replace 'output.fasta' with your desired output file name
output_file <- "output/10-seq-output.fasta"
write.fasta(selected_sequences, names(selected_sequences), output_file, open = "w")create an index for your 10-sequence fasta
index necessary for rendering in IGV. Not necessary for finding motifs.
head -1 output/10-seq-output.fasta.faiNW_022994811.1 28060 16 60 61
This means:
Sequence name: NW_022994811.1
Sequence length: 28,060 bases
Sequence data starts at byte 16 in the FASTA file
Each line contains 60 bases
Each line is 61 bytes long
#needed downstream for IGV
/home/shared/samtools-1.12/samtools faidx \
output/10-seq-output.fastaprogram that we are using to find the motifs.
fuzznuc -sequence output/10-seq-output.fasta -pattern CG -rformat gff -outfile output/10-seq-output.gff
head -6 output/10-seq-output.gff##gff-version 3
##sequence-region NW_022994811.1 1 28060
#!Date 2025-05-15
#!Type DNA
#!Source-version EMBOSS 6.6.0.0
NW_022994811.1 fuzznuc nucleotide_motif 129 130 2 + . ID=NW_022994811.1.1;note=*pat pattern:CG
This output file (.gff) provides the location for each cg motif that was found.
The 6th line describes a cg motif on sequence NW_022994811.1, generated from the fuzznuc tool, located at position 129 to 130, which a match strength of 2, found on the ‘+’ strand, the ‘.’ is not applicable for the length of this motif, and with an ID and a pattern (cg).