Rosalind’s Genome Data Analysis Exercises Using R

setwd("/Users/nnthieu/Downloads/GenomeProject1000/Blast/")
library(Biostrings)

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: XVector

## Loading required package: GenomeInfoDb

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

1. Counting DNA nucleotides

# Corrected single-line AWK command
output <- system("awk '{a=gsub(/A/, \"\", $0); t=gsub(/T/, \"\", $0); c=gsub(/C/, \"\", $0); g=gsub(/G/, \"\", $0); print \"A: \" a; print \"T: \" t; print \"C: \" c; print \"G: \" g;}' rosalind_dna.txt", intern = TRUE)

# Print the captured output
cat(output, sep = "\n")

## A: 237
## T: 223
## C: 236
## G: 234

2. Transcribe to RNA string

output <- system("sed 's/T/U/g' rosalind_rna.txt", intern = TRUE)
cat(output, sep = "\n")

## CGUUUAAGACUAUGAAUACGCGGGCGAUGUAUCCACCACACAUAACCUAUCGCAACCCCGUUGCUACCGUAACGUGAUCACGCCUGAUUCCGAGUGACUGGGGAGGUUGAGGCUAGCCAUAUCACAGCCUCUUGCUACACGUGAACCAACCAUCACUUCAAUAUCUUGGACUAUUUUUACUGUACCUUUCGAUUACAAAAUUCGAAAAAUAGCGCGGUGGAUACAACUGCCGACUCCCUUCGCAAUCCGACGGCGUGUCAUAAGUGACGAGAAGUUCAGACUUCCCAGCCCAGCGCUAGAGCGUCAUAAUCAGUAGAUACACAUUAUCGGACGAACGCGAUAGUCUAGCCCACGCGUUAACGAGUACCUGCUAGAGACGUUCUAACGUUGGGUCGUGUUGGUUGUGGUCCGUCCUGUACAUUUCAACCACCCAUUGGAUCAGAAAACCAUAACUCACCCGGAAAUUAUUGACUCGGUCGAAUUAAUAAGCUGCGCCCUGCUGGAAGCUCUUCUGAGCCUUUAGAGCAUCCGCAUCCAGUACGCGUGGAGCCUAGUUGUUGAAGCCACGGGAAUGCGCUCAGUAAGGUAAUGUCCCUGAUGUAUAGUGUACUUAAGUUGCAAGAGACACGACUCGCUGGUCUUGGAGGCGUCGAUAUCAAGGAUAGUUCAACUCACGCGUCUCGUCGUCUCGCCUUAUGAAAUGUAUGAGACUGAUCACCAGCUUCUCAAGUAACCACAAGUACCUUCAGUAACGUAUCUAUUAUUCAGUUAUGGGUUGUGGAUGUGUCUACCAUCCCUGGUAGUUCUGCCCCGGCAUAUCGUGACAAAAGGUUUUCGAGCGACCGGCAAUGUAGGGUAACAUCGACUGACCAUGUCCUGGCUCAUGACCACUGGGUCGUUUGAUGCCUGCAAGUUUCACUAAAUUUAAAUCUGUA

3. Reverse a DNA strand

# Read the content of the file
dna_string <- readLines("rosalind_revc.txt")

# Reverse the string using rev
reversed_dna <- sapply(dna_string, function(x) system(paste("echo", x, "| rev"), intern = TRUE))

# Print the reversed output
cat(reversed_dna, sep = "\n")

## GCGGACAACCGACCTTCAGCATCAAACGGACTTCTATATTAGGTTTCGTTGGCCGGCGACACTAATCCTCACGTCGCACCAAAGTTTATTTTGATAACAGGTTTCAGCCAATTTGGCCGACGCCAGGGGTGACGCAGGGGCATAACCGCACCTCGTAGCGAGTATAGCCCCTGGAAGTCGAGCGAACGAATAGGGGTCGGGTGTTTAAGGTGATTTGCTTTTTTGGTAGGTTAAATTATCGAAACCGCGTAGCAGACGGCATGGTTCCTGAACCTGTAATTGACAGCTGCTGGGACGGGTTCGTGCCGGTCATAACCGCGCACACCACGGTGGTATTCGTATATACGAACGTCCACGAACTTGTATCGAGAGCACGTAGCTCGCGGTAGTCCGATAAAGCATCTTCTGGCTCTACTTAGTTGGACTGGGGTATCTGACCCAATGTCTACACACGAGTGACGTAACCATAACCTCATCACCGGTAAAAGGTGATTTGCTCACAACAGCCCGGTACGGTAACGAGTTGATATCCATTGTAGGAGATTGTCGGTTTGGACTTTCCTTCGCCCACCGGATAAGATCTGGGTTTGCTCCGCAACTTATCATTCGAATTCTGCCATCAACTAGCTCAAATTCGGGGGGCAGGGATGGCGAAGGCTCGTTTCGGTGTCAGACAATAACTATATATGCTAAAAGACACTCCCATATCCGACAGGATTTGATTCAGCCGTCTAACTAATACATAGAAATATATTGTAGACTGCCAGCCCTTGGAGGCCGCCTGAATCTATAGCACGTGGTCAGGTCCGTCTACTCTGACTAAGTCTCTTGACGTCATCCCTTCTAAGGGGTTCAGGCACGTCACGCGAGGTGTGGCTCGACAGGACACCGGTCGGACGAGCGACATTAGAGCCGCGTCTTACGTAAGAGACGTGAAACAAACAGTCTGCTTA

3.1 Complementing (reverse then complement) a DNA strand

# Read the sequence from the file
sequence <- readLines("rosalind_revc.txt")

# Reverse the sequence
reversed_sequence <- sapply(sequence, function(x) paste(rev(strsplit(x, "")[[1]]), collapse = ""))

# Complement the sequence
complemented_sequence <- chartr("ACGT", "TGCA", reversed_sequence)

# Print the result to the console
cat(complemented_sequence, "\n")

## CGCCTGTTGGCTGGAAGTCGTAGTTTGCCTGAAGATATAATCCAAAGCAACCGGCCGCTGTGATTAGGAGTGCAGCGTGGTTTCAAATAAAACTATTGTCCAAAGTCGGTTAAACCGGCTGCGGTCCCCACTGCGTCCCCGTATTGGCGTGGAGCATCGCTCATATCGGGGACCTTCAGCTCGCTTGCTTATCCCCAGCCCACAAATTCCACTAAACGAAAAAACCATCCAATTTAATAGCTTTGGCGCATCGTCTGCCGTACCAAGGACTTGGACATTAACTGTCGACGACCCTGCCCAAGCACGGCCAGTATTGGCGCGTGTGGTGCCACCATAAGCATATATGCTTGCAGGTGCTTGAACATAGCTCTCGTGCATCGAGCGCCATCAGGCTATTTCGTAGAAGACCGAGATGAATCAACCTGACCCCATAGACTGGGTTACAGATGTGTGCTCACTGCATTGGTATTGGAGTAGTGGCCATTTTCCACTAAACGAGTGTTGTCGGGCCATGCCATTGCTCAACTATAGGTAACATCCTCTAACAGCCAAACCTGAAAGGAAGCGGGTGGCCTATTCTAGACCCAAACGAGGCGTTGAATAGTAAGCTTAAGACGGTAGTTGATCGAGTTTAAGCCCCCCGTCCCTACCGCTTCCGAGCAAAGCCACAGTCTGTTATTGATATATACGATTTTCTGTGAGGGTATAGGCTGTCCTAAACTAAGTCGGCAGATTGATTATGTATCTTTATATAACATCTGACGGTCGGGAACCTCCGGCGGACTTAGATATCGTGCACCAGTCCAGGCAGATGAGACTGATTCAGAGAACTGCAGTAGGGAAGATTCCCCAAGTCCGTGCAGTGCGCTCCACACCGAGCTGTCCTGTGGCCAGCCTGCTCGCTGTAATCTCGGCGCAGAATGCATTCTCTGCACTTTGTTTGTCAGACGAAT

4. Rabbits and Recurrence Relations

# Define the function to calculate rabbit pairs
rabbit_pairs <- function(n, k) {
  # If n is 1 or 2, return 1 as there is only 1 pair initially
  if (n == 1 || n == 2) {
    return(1)
  }
  
  # Initialize the first two months
  F1 <- 1  # Rabbit pairs in month 1
  F2 <- 1  # Rabbit pairs in month 2
  
  # Calculate the number of pairs from month 3 to n
  for (i in 3:n) {
    # Calculate the next number of pairs
    Fn <- F2 + k * F1
    # Update the previous values
    F1 <- F2
    F2 <- Fn
  }
  
  # Return the number of pairs after n months
  return(F2)
}

# Example usage
n <- 5  # Number of months
k <- 3  # Number of pairs produced by each mature pair
result <- rabbit_pairs(n, k)
print(result)  # Output: 19

## [1] 19

5. Calculating CG content

# Run the awk command to calculate CG content
output <- system("awk '/^>/ {if (seqlen){ print seqname, (c+g)/seqlen*100 \"%\"}; c=g=seqlen=0; seqname=$0; next} {seqlen += length($0); c += gsub(/C/, \"C\"); g += gsub(/G/, \"G\")} END {if (seqlen) {print seqname, (c+g)/seqlen*100 \"%\"}}' rosalind_cg.fasta", intern = TRUE)

# Print the captured output line by line
cat(output, sep = "\n")

## >Rosalind_6404 53.75%
## >Rosalind_5959 53.5714%
## >Rosalind_0808 60.9195%

6. Counting point mutations

# Read sequences from FASTA file
read_sequences <- function(fasta_file) {
  fasta_data <- readDNAStringSet(fasta_file)
  if (length(fasta_data) != 2) {
    stop("The FASTA file must contain exactly two sequences.")
  }
  return(fasta_data)
}

# Calculate point mutations
calculate_point_mutations <- function(seq1, seq2) {
  if (nchar(seq1) != nchar(seq2)) {
    stop("Sequences must be of the same length.")
  }
  mutations <- sum(strsplit(as.character(seq1), NULL)[[1]] != strsplit(as.character(seq2), NULL)[[1]])
  return(mutations)
}

# Main
fasta_file <- "rosalind_mutations.fasta"
sequences <- read_sequences(fasta_file)
seq1 <- sequences[[1]]
seq2 <- sequences[[2]]
mutations <- calculate_point_mutations(seq1, seq2)

cat("Number of point mutations:", mutations, "\n")

## Number of point mutations: 7

7. Mendel’s First Law

# Define the numbers of each type
k <- 2  # Homozygous dominant
m <- 2  # Heterozygous
n <- 2  # Homozygous recessive

# Total population
total <- k + m + n

# Probability calculations for each pairing type
# P(dominant phenotype) = Probability of selecting each pair type * probability of dominant offspring

# Probabilities of each mating type
P_kk <- (k / total) * ((k - 1) / (total - 1))
P_km <- (k / total) * (m / (total - 1)) + (m / total) * (k / (total - 1))
P_kn <- (k / total) * (n / (total - 1)) + (n / total) * (k / (total - 1))
P_mm <- (m / total) * ((m - 1) / (total - 1))
P_mn <- (m / total) * (n / (total - 1)) + (n / total) * (m / (total - 1))
P_nn <- (n / total) * ((n - 1) / (total - 1))

# Total probability of producing a dominant offspring
prob_dominant <- P_kk * 1 + P_km * 1 + P_kn * 1 + P_mm * 0.75 + P_mn * 0.5 + P_nn * 0

# Print the probability
cat("Probability of dominant phenotype:", prob_dominant, "\n")

## Probability of dominant phenotype: 0.7833333

8. Translating RNA into Protein

create a R file: rna_to_protein.R

# Read the RNA sequence from the file
rna_sequence <- readLines("rosalind_rna.txt")

# Define the RNA codon table
codon_table <- list(
  "AUG" = "M", "UGG" = "W",
  "UUU" = "F", "UUC" = "F", "UUA" = "L", "UUG" = "L",
  "CUU" = "L", "CUC" = "L", "CUA" = "L", "CUG" = "L",
  "AUU" = "I", "AUC" = "I", "AUA" = "I",
  "GUU" = "V", "GUC" = "V", "GUA" = "V", "GUG" = "V",
  "UCU" = "S", "UCC" = "S", "UCA" = "S", "UCG" = "S",
  "CCU" = "P", "CCC" = "P", "CCA" = "P", "CCG" = "P",
  "ACU" = "T", "ACC" = "T", "ACA" = "T", "ACG" = "T",
  "GCU" = "A", "GCC" = "A", "GCA" = "A", "GCG" = "A",
  "UAU" = "Y", "UAC" = "Y", "UGU" = "C", "UGC" = "C",
  "CAU" = "H", "CAC" = "H", "CAA" = "Q", "CAG" = "Q",
  "AAU" = "N", "AAC" = "N", "AAA" = "K", "AAG" = "K",
  "GAU" = "D", "GAC" = "D", "GAA" = "E", "GAG" = "E",
  "CGU" = "R", "CGC" = "R", "CGA" = "R", "CGG" = "R",
  "AGU" = "S", "AGC" = "S", "AGA" = "R", "AGG" = "R",
  "GGU" = "G", "GGC" = "G", "GGA" = "G", "GGG" = "G",
  "UAA" = "", "UAG" = "", "UGA" = "" # Stop codons
)

# Function to translate RNA sequence to protein
translate_rna_to_protein <- function(rna) {
  protein <- ""
  # Process the RNA sequence in groups of three
  for (i in seq(1, nchar(rna), by = 3)) {
    codon <- substr(rna, i, i + 2)
    # Append the corresponding amino acid, ignoring stop codons
    if (codon_table[[codon]] != "") {
      protein <- paste0(protein, codon_table[[codon]])
    }
  }
  return(protein)
}

# Translate the RNA sequence
protein_string <- translate_rna_to_protein(rna_sequence)
print(protein_string)

source("rna_to_protein.R")

## [1] "RRGRHHTATPPHARGEESTATTNHDTRQKSATADRPTAKREQQPSRATRTNAPTRRDTPTQPPGENNPGRKRGSEEPPRESHGAKKQETRGGKHAADPQTTTNQTAPADKRSDRQRTDDHQ"

9. Finding a motif ‘ATAT’ in DNA

At first, I create a find_substring.R file

# Read the DNA string from the file
dna_string <- readLines("rosalind_dna.txt")

# Define the substring to search for
substring_to_find <- "ATAT"

# Function to find all positions of the substring in the main string
find_substring_positions <- function(main_string, sub_string) {
  positions <- c() # Initialize an empty vector to store positions
  
  # Loop through the main string and find matching positions
  for (i in 1:(nchar(main_string) - nchar(sub_string) + 1)) {
    # Extract a substring of the same length as sub_string
    current_substring <- substr(main_string, i, i + nchar(sub_string) - 1)
    
    # Check if the extracted substring matches sub_string
    if (current_substring == sub_string) {
      positions <- c(positions, i) # Store the position
    }
  }
  return(positions)
}

# Find the positions of the substring in the DNA string
positions <- find_substring_positions(dna_string, substring_to_find)

# Print the positions
print(positions)

Motif ‘ATAT’ at position:

source("find_substring.R")

## [1] 679

10. Consensus and profile of DNA sequences

Create a file rosalind_dna_consensus.R

# Load necessary libraries
library(Biostrings)

# Load the DNA sequences from the FASTA file
fasta_file <- "/Users/nnthieu/Downloads/GenomeProject1000/Blast/rosalind_dna.fasta"  # Update with the correct path
sequences <- readDNAStringSet(fasta_file)

# Clean sequences by removing invalid characters
sequences_clean <- lapply(sequences, function(seq) {
  gsub("[^ACGT]", "", as.character(seq))  # Remove characters not A, C, G, T
})

# Convert sequences to a matrix
dna_matrix <- as.matrix(sequences)

# Initialize the profile matrix
profile_matrix <- matrix(0, nrow = 4, ncol = ncol(dna_matrix),
                         dimnames = list(c("A", "C", "G", "T")))

# Fill the profile matrix with counts of each nucleotide
for (i in 1:ncol(dna_matrix)) {
    nucleotides <- dna_matrix[, i]
    profile_matrix["A", i] <- sum(nucleotides == "A")
    profile_matrix["C", i] <- sum(nucleotides == "C")
    profile_matrix["G", i] <- sum(nucleotides == "G")
    profile_matrix["T", i] <- sum(nucleotides == "T")
}

# Determine the consensus sequence
consensus <- apply(profile_matrix, 2, function(column) {
    names(which.max(column))
})

# Print the consensus string
cat("Consensus String:\n")
cat(paste(consensus, collapse = ""), "\n")

# Print the profile matrix
cat("\nProfile Matrix:\n")
print(profile_matrix)

source("rosalind_dna_consensus.R")

## Warning in .Call2("fasta_index", filexp_list, nrec, skip, seek.first.rec, :
## reading FASTA file
## /Users/nnthieu/Downloads/GenomeProject1000/Blast/rosalind_dna.fasta: ignored 49
## invalid one-letter sequence codes

## Consensus String:
## ATGCAACT 
## 
## Profile Matrix:
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## A    5    1    0    0    5    5    0    0
## C    0    0    1    4    2    0    6    1
## G    1    1    6    3    0    1    0    0
## T    1    5    0    0    0    1    1    6

Rosalind’s Genome Data Analysis Exercises Using R - Part 1

Thieu Nguyen

8/29/2024

1. Counting DNA nucleotides

2. Transcribe to RNA string

3. Reverse a DNA strand

3.1 Complementing (reverse then complement) a DNA strand

4. Rabbits and Recurrence Relations

5. Calculating CG content

6. Counting point mutations

7. Mendel’s First Law

8. Translating RNA into Protein

9. Finding a motif ‘ATAT’ in DNA

10. Consensus and profile of DNA sequences