cd ../data
curl -O https://gannet.fish.washington.edu/panopea/Cg-roslin/cgigas_uk_roslin_v1_genomic-mito.fa

07-motifs
RPubs link:
https://rpubs.com/sr320/1310956
Screenshot #1
Screenshot #2
cd ../data
curl -O https://gannet.fish.washington.edu/panopea/Cg-roslin/cgigas_uk_roslin_v1_genomic-mito.fa

head -40 ../data/Ab_4denovo_CLC6_a.fa

>solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_1
ACACCCCACCCCAACGCACCCTCACCCCCACCCCAACAATCCATGATTGAATACTTCATC
TATCCAAGACAAACTCCTCCTACAATCCATGATAGAATTCCTCCAAAAATAATTTCACAC
TGAAACTCCGGTATCCGAGTTATTTTGTTCCCAGTAAAATGGCATCAACAAAAGTAGGTC
TGGATTAACGAACCAATGTTGCTGCGTAATATCCCATTGACATATCTTGTCGATTCCTAC
CAGGATCCGGACTGACGAGATTTCACTGTACGTTTATGCAAGTCATTTCCATATATAAAA
TTGGATCTTATTTGCACAGTTAAATGTCTCTATGCTTATTTATAAATCAATGCCCGTAAG
CTCCTAATATTTCTCTTTTCGTCCGACGAGCAAACAGTGAGTTTACTGTGGCCTTCAGCA
AAAGTATTGATGTTGTAAATCTCAGTTGTGATTGAACAATTTGCCTCACTAGAAGTAGCC
TTC
>solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2
ATTAGAGGCCTTGGGGTTGAAATATCTGACAGCAACAACCGACACAAATGCACCTGGGGT
CTTCCTAACTATCAAGGCAAAATCTGAAACTGGTAAATTTGGTATATATTCCCACTTTCT
CTCTCTGAATTAACCTCCAAACATACCTGACACAAGAAACGTCTAAAACGATCTGCCATG
TTGATGTGTGTGACTGCTTCATATATATTTAGATTAAGATACATATAGTAATATTCAAGA
ACGTTTGTC
>solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_3
TCCCGAGATAAGCACCTGCTCCTGTGGAAGCTGACACGTAATGAGACCAACTACGGCATC
CCATACAAGCAGCTCCATGGCCACAACCACTTTGTGTCTGATGTTGTGCTCTCCTCAGAT
GGCCAATTTGCTCTGTCTGCATCATGGGATGGATCCCTCAGACTCTGGGATCTTGTAACA
TGCAAGACAACACGTCACTTCGTTGGCCACGAGAAGGATGTCATGAGTGTTGCTTTCTCT
GCTGACAACAGACAGATTGTCTCTGGTTCACGGGATAAAACTGTCAGGCTGTGGAACACA
TTGGGGGTTTGCAAGTACACCATTCAGGAGGATTCACACAAAGACTGGGTGTCTTGTGTT
CGTTTCTCTCCCAACTCAACTAACCCGATCATTGTGTCCTGTGGCTGGGACAAAACCGTG
AAGGTGTGGAATCTGACTAACTGCAAGCTGAAGACCAACCACTATGGCCACACCCAGTTC
CTGAACACTGTCACCGTGTCTCCTGATGGCTCA
>solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_4
CAGCAGCCGGCCTCCTGCCAGCAGGTGCGCCAGCACTTCTAAACCCATTCCTAGTCCTAG
TAGAAACCGTTAGAATTATGGTCCGCCCAATTACATTATCTATCCGACTAGCAGCCAACA
TAAGGGCCGGACACATTGTATTGGGCCTTATCAGCACTTACCTTTGCTCAGCGATCTTCA
TTTACCCCAAAGTCACCCTCTTAACCTTAATAACCGTACAAACCTTTTACTTTATATTTG
AAATTGGCGTA
>solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_5
GAATCCTGTGGAAGAAAAATGAAGTTGCTGACTACGAAGCTACAACTTTTTCCATTTTCT
ACAACAACGCCCTCTACCTCGTGTTGCTGATAGTGGTGTCGTTCTACGCCCTGAAAAACT
TCAACCCCAATCTGAATTATGTGGTTTCCGTGGGCGCCTCAGCAGGTCTGCTGGCTCTGT
TTTCCACCAGCTCGAAGTAAAAGGTCAACGACAGCTATTTCATCATGATGTTCTAGAAAT
AGAAATTTATCAAATGTCATACATATGGTTTCAAGTAAAACCGCCCCACGGGGCGACTTA
GC
>solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_6
library(seqinr)

# Randomly pick up to `number_of_sequences_to_select` records from a
# multi-sequence FASTA file and write them out as a new, smaller FASTA file.

# Replace 'input.fasta' with the name of your multi-sequence fasta file
input_file <- "../data/Ab_4denovo_CLC6_a.fa"
sequences <- read.fasta(input_file)

# Set the seed for reproducibility (optional)
set.seed(42)

number_of_sequences_to_select <- 10L
total_sequences <- length(sequences)

if (total_sequences < number_of_sequences_to_select) {
  # Build the message from the requested count so it stays accurate if the
  # number above is changed; at the default of 10 the text is unchanged.
  warning(
    sprintf(
      paste0(
        "There are fewer than %d sequences in the fasta file. ",
        "All sequences will be selected."
      ),
      number_of_sequences_to_select
    ),
    call. = FALSE
  )
  # Fall back to taking every record the file contains.
  number_of_sequences_to_select <- total_sequences
}

# Draw the record positions without replacement, then subset by position.
selected_indices <- sample(total_sequences, number_of_sequences_to_select)
selected_sequences <- sequences[selected_indices]

# Replace 'output.fasta' with your desired output file name
output_file <- "../output/10-seq-output.fasta"
write.fasta(
  selected_sequences,
  names(selected_sequences),
  output_file,
  open = "w"
)

# likely will not need; fix issue where gff and fa name did not match
# sed -i 's/>lcl|/>/g' ../output/10_seqs.fa

# needed downstream for IGV
# Index the sampled FASTA; the index is read back below as
# ../output/10-seq-output.fasta.fai
/home/shared/samtools-1.12/samtools faidx \
../output/10-seq-output.fasta

# Search the sampled sequences for the CG pattern and write the hits as GFF.
fuzznuc -sequence ../output/10-seq-output.fasta -pattern CG -rformat gff -outfile ../output/10-seq-output.gff

# Preview the motif hits, then the FASTA index.
head ../output/10-seq-output.gff
echo "THIS IS MY INDEX:!!"
head ../output/10-seq-output.fasta.fai

##gff-version 3
##sequence-region solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609 1 551
#!Date 2025-05-15
#!Type DNA
#!Source-version EMBOSS 6.6.0.0
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609 fuzznuc nucleotide_motif 90 91 2 + . ID=solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609.1;note=*pat pattern:CG
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609 fuzznuc nucleotide_motif 175 176 2 + . ID=solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609.2;note=*pat pattern:CG
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609 fuzznuc nucleotide_motif 483 484 2 + . ID=solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609.3;note=*pat pattern:CG
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609 fuzznuc nucleotide_motif 495 496 2 + . ID=solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609.4;note=*pat pattern:CG
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609 fuzznuc nucleotide_motif 538 539 2 + . ID=solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609.5;note=*pat pattern:CG
THIS IS MY INDEX:!!
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609 551 69 60 61
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_4069 266 699 60 61
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2369 335 1039 60 61
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_5273 285 1449 60 61
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_1098 238 1808 60 61
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_1252 252 2119 60 61
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_634 237 2444 60 61
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2097 285 2754 60 61
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_5248 238 3113 60 61
solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_5423 216 3424 60 61
What is a FASTA index?
• Sequence name: solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_2609
• Length: 551 bp
• Offset: 69 (the byte offset in the FASTA file of this sequence's first base, i.e. just past the 68-character header line and its newline)
• Linebases: 60 bases per line
• Linewidth: 61 bytes per line (60 bases + 1 newline character)