For this assignment, I’m taking 10 sequences from reference strains of Ruegeria and analyzing their CG motifs.

The IGV view is shown below:

CG motifs in R

First, we need to download the genomes:

# Work from the data directory so the download and its extracted contents
# land in ../data.
cd ../data
# Fetch Ruegeria genomes with the NCBI datasets CLI, filtered by the
# --annotated and --reference flags. Presumably the archive is written to
# ncbi_dataset.zip (the file the next command unpacks) -- confirm if the CLI
# version changes.
/home/shared/datasets download genome taxon "ruegeria" --annotated --reference
# Unpack the archive; the R code further down reads from
# ../data/ncbi_dataset/data.
unzip ncbi_dataset.zip
library(seqinr)

# Collect every sequence record from the downloaded RefSeq assemblies.
#
# Produces:
#   fasta_files - assembly directory names (e.g. "GCF_...")
#   fasta_path  - the same directories with their relative path prepended
#   sequences   - one flat list of all records read by seqinr::read.fasta(),
#                 named by record; an empty list when nothing is found

genome_root <- "../data/ncbi_dataset/data"

# `pattern` is a regular expression, not a shell glob. "GCF*" means "GC
# followed by zero or more F", which matches any name containing "GC";
# anchor the prefix instead. `full.names = TRUE` also avoids paste() turning
# an empty listing into a single bogus path.
fasta_path <- list.files(genome_root, pattern = "^GCF", full.names = TRUE)
fasta_files <- basename(fasta_path)

# Read all nucleotide FASTA files (*.fna) inside one assembly directory and
# return their records as a single list. Files are read one at a time because
# read.fasta() accepts a single file, not a vector of paths.
read_assembly <- function(assembly_dir) {
  fna_files <- list.files(assembly_dir, pattern = "\\.fna$", full.names = TRUE)
  do.call(c, lapply(fna_files, read.fasta))
}

# Build the per-assembly lists first and concatenate once, rather than
# growing `sequences` with append() on every iteration.
sequences <- do.call(c, lapply(fasta_path, read_assembly))

# do.call(c, list()) yields NULL; keep `sequences` a list for downstream code.
if (is.null(sequences)) {
  sequences <- list()
}
# Pin the RNG state so every run draws the same subset.
set.seed(42)

# How many records to keep for the motif search.
number_of_sequences_to_select <- 10

# Never ask for more records than were loaded: fall back to taking them all.
n_available <- length(sequences)
if (n_available < number_of_sequences_to_select) {
  warning("There are fewer than 10 sequences in the fasta file. All sequences will be selected.")
  number_of_sequences_to_select <- n_available
}

# Draw the indices without replacement, then subset (names are preserved).
selected_indices <- sample(n_available, number_of_sequences_to_select)
selected_sequences <- sequences[selected_indices]
# Write the selected records to a single FASTA file, using each record's
# list name as its header. open = "w" overwrites any previous output.
output_file <- "../output/07_CGmotifs.fasta"
write.fasta(
  sequences = selected_sequences,
  names = names(selected_sequences),
  file.out = output_file,
  open = "w"
)
# Probably not needed: fixes a case where the sequence names in the GFF and
# the FASTA did not match by stripping the "lcl|" prefix from FASTA headers.
# Note the path below is a different file name from the FASTA written above.
# sed -i 's/>lcl|/>/g' ../output/10_seqs.fa
# Index the FASTA with samtools faidx; the index is needed downstream for IGV.
/home/shared/samtools-1.12/samtools faidx ../output/07_CGmotifs.fasta
# Search the selected sequences for the pattern "CG" with fuzznuc and write
# the hits in GFF format to ../output/CGoutput.gff.
fuzznuc -sequence ../output/07_CGmotifs.fasta -pattern CG -rformat gff -outfile ../output/CGoutput.gff