# Download the genome FASTA into ../data, then list the directory to confirm.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page under the FASTA file name; && skips the download
# entirely if the cd did not succeed, so nothing lands in the wrong directory.
cd ../data && curl --fail -O https://owl.fish.washington.edu/halfshell/genomic-databank/cbai_genome_v1.01.fasta
# After a successful cd, ../data resolves back to the current directory.
ls ../data
## % Total % Received % Xferd Average Speed Time Time Time Current
## Dload Upload Total Spent Left Speed
##
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
2 18.1M 2 496k 0 0 1042k 0 0:00:17 --:--:-- 0:00:17 1040k
35 18.1M 35 6629k 0 0 4479k 0 0:00:04 0:00:01 0:00:03 4476k
100 18.1M 100 18.1M 0 0 8056k 0 0:00:02 0:00:02 --:--:-- 8056k
## 06_tview_screengrab.jpeg
## 19F_R1_val_1_bismark_bt2_pe.deduplicated.sorted.bam
## 19F_R1_val_1_bismark_bt2_pe.deduplicated.sorted.bam.bai
## Ab_4denovo_CLC6_a.fa
## cbai_genome_v1.01.fasta
## cbai_genome_v1.01.fasta.fai
## cgigas_roslin_rna.index
## cgigas_uk_roslin_v1_genomic-mito.fa
## cgigas_uk_roslin_v1_genomic-mito.fa.fai
## D54_S145_L001_R1_001.fastq.gz
## D54_S145_L002_R1_001.fastq.gz
## D55_S146_L001_R1_001.fastq.gz
## D55_S146_L002_R1_001.fastq.gz
## D56_S136_L001_R1_001.fastq.gz
## D57_S143_L001_R1_001.fastq.gz
## D58_S144_L001_R1_001.fastq.gz
## D59_S142_L001_R1_001.fastq.gz
## F143n08_R1_001.fastq.gz
## F143n08_R2_001.fastq.gz
## GCF_002022765.2_C_virginica-3.0_genomic.fa
## GCF_002022765.2_C_virginica-3.0_genomic.fa.fai
## GCF_902806645.1_cgigas_uk_roslin_v1_genomic-mito.gtf
## M45_S140_L001_R1_001.fastq.gz
## M46_S141_L001_R1_001.fastq.gz
## M48_S137_L001_R1_001.fastq.gz
## M49_S139_L001_R1_001.fastq.gz
## M89_S138_L001_R1_001.fastq.gz
## M90_S147_L001_R1_001.fastq.gz
## N48_S194_L001_R1_001.fastq.gz
## N49_S185_L001_R1_001.fastq.gz
## N50_S187_L001_R1_001.fastq.gz
## N51_S186_L001_R1_001.fastq.gz
## N52_S184_L001_R1_001.fastq.gz
## N53_S188_L001_R1_001.fastq.gz
## N54_S193_L001_R1_001.fastq.gz
## N55_S190_L001_R1_001.fastq.gz
## N56_S192_L001_R1_001.fastq.gz
## N57_S191_L001_R1_001.fastq.gz
## N58_S195_L001_R1_001.fastq.gz
## N59_S189_L001_R1_001.fastq.gz
## rna.fna
library(seqinr)
# Draw a reproducible random subset of sequences from the genome FASTA and
# write the subset to a smaller FASTA file.
input_file <- "../data/cbai_genome_v1.01.fasta"
output_file <- "../output/07-output.fa"
number_of_sequences_to_select <- 10

# Fail early with a clear message if the download step has not been run.
if (!file.exists(input_file)) {
  stop("Input FASTA not found: ", input_file, call. = FALSE)
}
sequences <- read.fasta(input_file)

# Fix the RNG seed so the same subset is drawn on every run.
set.seed(42)

# Never ask sample() for more sequences than the file contains; report the
# requested count from the variable so the message stays correct if it changes.
if (length(sequences) < number_of_sequences_to_select) {
  warning(
    "There are fewer than ", number_of_sequences_to_select,
    " sequences in the fasta file. All sequences will be selected.",
    call. = FALSE
  )
  number_of_sequences_to_select <- length(sequences)
}

# Sample positions without replacement, then pull out those sequences.
selected_indices <- sample(length(sequences), number_of_sequences_to_select)
selected_sequences <- sequences[selected_indices]

# Make sure the output directory exists before writing into it.
dir.create(dirname(output_file), recursive = TRUE, showWarnings = FALSE)

# open = "w" overwrites any previous subset file rather than appending to it.
write.fasta(
  sequences = selected_sequences,
  names = names(selected_sequences),
  file.out = output_file,
  open = "w"
)
# Download the index (.fai) file for the genome FASTA into ../data, then list
# the directory to confirm.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page under the index file name; && skips the download
# entirely if the cd did not succeed, so nothing lands in the wrong directory.
cd ../data && curl --fail -O https://owl.fish.washington.edu/halfshell/genomic-databank/cbai_genome_v1.01.fasta.fai
# After a successful cd, ../data resolves back to the current directory.
ls ../data
## % Total % Received % Xferd Average Speed Time Time Time Current
## Dload Upload Total Spent Left Speed
##
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
100 76608 100 76608 0 0 959k 0 --:--:-- --:--:-- --:--:-- 959k
## 06_tview_screengrab.jpeg
## 19F_R1_val_1_bismark_bt2_pe.deduplicated.sorted.bam
## 19F_R1_val_1_bismark_bt2_pe.deduplicated.sorted.bam.bai
## Ab_4denovo_CLC6_a.fa
## cbai_genome_v1.01.fasta
## cbai_genome_v1.01.fasta.fai
## cgigas_roslin_rna.index
## cgigas_uk_roslin_v1_genomic-mito.fa
## cgigas_uk_roslin_v1_genomic-mito.fa.fai
## D54_S145_L001_R1_001.fastq.gz
## D54_S145_L002_R1_001.fastq.gz
## D55_S146_L001_R1_001.fastq.gz
## D55_S146_L002_R1_001.fastq.gz
## D56_S136_L001_R1_001.fastq.gz
## D57_S143_L001_R1_001.fastq.gz
## D58_S144_L001_R1_001.fastq.gz
## D59_S142_L001_R1_001.fastq.gz
## F143n08_R1_001.fastq.gz
## F143n08_R2_001.fastq.gz
## GCF_002022765.2_C_virginica-3.0_genomic.fa
## GCF_002022765.2_C_virginica-3.0_genomic.fa.fai
## GCF_902806645.1_cgigas_uk_roslin_v1_genomic-mito.gtf
## M45_S140_L001_R1_001.fastq.gz
## M46_S141_L001_R1_001.fastq.gz
## M48_S137_L001_R1_001.fastq.gz
## M49_S139_L001_R1_001.fastq.gz
## M89_S138_L001_R1_001.fastq.gz
## M90_S147_L001_R1_001.fastq.gz
## N48_S194_L001_R1_001.fastq.gz
## N49_S185_L001_R1_001.fastq.gz
## N50_S187_L001_R1_001.fastq.gz
## N51_S186_L001_R1_001.fastq.gz
## N52_S184_L001_R1_001.fastq.gz
## N53_S188_L001_R1_001.fastq.gz
## N54_S193_L001_R1_001.fastq.gz
## N55_S190_L001_R1_001.fastq.gz
## N56_S192_L001_R1_001.fastq.gz
## N57_S191_L001_R1_001.fastq.gz
## N58_S195_L001_R1_001.fastq.gz
## N59_S189_L001_R1_001.fastq.gz
## rna.fna
# Run samtools faidx on ../output/07-output.fa; the result is needed
# downstream for IGV.
/home/shared/samtools-1.12/samtools faidx ../output/07-output.fa
# Search ../output/07-output.fa for the pattern CG and write the report in
# GFF format to ../output/CGoutput.gff.
fuzznuc \
  -sequence ../output/07-output.fa \
  -pattern CG \
  -rformat gff \
  -outfile ../output/CGoutput.gff
## Search for patterns in nucleotide sequences
IGV screenshot of CG motifs
IGV screenshot of CG motifs