setwd("/Users/nnthieu/Downloads/GenomeProject1000/Blast/")
library(Biostrings)
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
# Corrected single-line AWK command
output <- system("awk '{a=gsub(/A/, \"\", $0); t=gsub(/T/, \"\", $0); c=gsub(/C/, \"\", $0); g=gsub(/G/, \"\", $0); print \"A: \" a; print \"T: \" t; print \"C: \" c; print \"G: \" g;}' rosalind_dna.txt", intern = TRUE)
# Print the captured output
cat(output, sep = "\n")
## A: 237
## T: 223
## C: 236
## G: 234
output <- system("sed 's/T/U/g' rosalind_rna.txt", intern = TRUE)
cat(output, sep = "\n")
## CGUUUAAGACUAUGAAUACGCGGGCGAUGUAUCCACCACACAUAACCUAUCGCAACCCCGUUGCUACCGUAACGUGAUCACGCCUGAUUCCGAGUGACUGGGGAGGUUGAGGCUAGCCAUAUCACAGCCUCUUGCUACACGUGAACCAACCAUCACUUCAAUAUCUUGGACUAUUUUUACUGUACCUUUCGAUUACAAAAUUCGAAAAAUAGCGCGGUGGAUACAACUGCCGACUCCCUUCGCAAUCCGACGGCGUGUCAUAAGUGACGAGAAGUUCAGACUUCCCAGCCCAGCGCUAGAGCGUCAUAAUCAGUAGAUACACAUUAUCGGACGAACGCGAUAGUCUAGCCCACGCGUUAACGAGUACCUGCUAGAGACGUUCUAACGUUGGGUCGUGUUGGUUGUGGUCCGUCCUGUACAUUUCAACCACCCAUUGGAUCAGAAAACCAUAACUCACCCGGAAAUUAUUGACUCGGUCGAAUUAAUAAGCUGCGCCCUGCUGGAAGCUCUUCUGAGCCUUUAGAGCAUCCGCAUCCAGUACGCGUGGAGCCUAGUUGUUGAAGCCACGGGAAUGCGCUCAGUAAGGUAAUGUCCCUGAUGUAUAGUGUACUUAAGUUGCAAGAGACACGACUCGCUGGUCUUGGAGGCGUCGAUAUCAAGGAUAGUUCAACUCACGCGUCUCGUCGUCUCGCCUUAUGAAAUGUAUGAGACUGAUCACCAGCUUCUCAAGUAACCACAAGUACCUUCAGUAACGUAUCUAUUAUUCAGUUAUGGGUUGUGGAUGUGUCUACCAUCCCUGGUAGUUCUGCCCCGGCAUAUCGUGACAAAAGGUUUUCGAGCGACCGGCAAUGUAGGGUAACAUCGACUGACCAUGUCCUGGCUCAUGACCACUGGGUCGUUUGAUGCCUGCAAGUUUCACUAAAUUUAAAUCUGUA
# Read the content of the file
dna_string <- readLines("rosalind_revc.txt")
# Reverse the string using rev
reversed_dna <- sapply(dna_string, function(x) system(paste("echo", x, "| rev"), intern = TRUE))
# Print the reversed output
cat(reversed_dna, sep = "\n")
## GCGGACAACCGACCTTCAGCATCAAACGGACTTCTATATTAGGTTTCGTTGGCCGGCGACACTAATCCTCACGTCGCACCAAAGTTTATTTTGATAACAGGTTTCAGCCAATTTGGCCGACGCCAGGGGTGACGCAGGGGCATAACCGCACCTCGTAGCGAGTATAGCCCCTGGAAGTCGAGCGAACGAATAGGGGTCGGGTGTTTAAGGTGATTTGCTTTTTTGGTAGGTTAAATTATCGAAACCGCGTAGCAGACGGCATGGTTCCTGAACCTGTAATTGACAGCTGCTGGGACGGGTTCGTGCCGGTCATAACCGCGCACACCACGGTGGTATTCGTATATACGAACGTCCACGAACTTGTATCGAGAGCACGTAGCTCGCGGTAGTCCGATAAAGCATCTTCTGGCTCTACTTAGTTGGACTGGGGTATCTGACCCAATGTCTACACACGAGTGACGTAACCATAACCTCATCACCGGTAAAAGGTGATTTGCTCACAACAGCCCGGTACGGTAACGAGTTGATATCCATTGTAGGAGATTGTCGGTTTGGACTTTCCTTCGCCCACCGGATAAGATCTGGGTTTGCTCCGCAACTTATCATTCGAATTCTGCCATCAACTAGCTCAAATTCGGGGGGCAGGGATGGCGAAGGCTCGTTTCGGTGTCAGACAATAACTATATATGCTAAAAGACACTCCCATATCCGACAGGATTTGATTCAGCCGTCTAACTAATACATAGAAATATATTGTAGACTGCCAGCCCTTGGAGGCCGCCTGAATCTATAGCACGTGGTCAGGTCCGTCTACTCTGACTAAGTCTCTTGACGTCATCCCTTCTAAGGGGTTCAGGCACGTCACGCGAGGTGTGGCTCGACAGGACACCGGTCGGACGAGCGACATTAGAGCCGCGTCTTACGTAAGAGACGTGAAACAAACAGTCTGCTTA
# Read the sequence from the file
sequence <- readLines("rosalind_revc.txt")
# Reverse the sequence
reversed_sequence <- sapply(sequence, function(x) paste(rev(strsplit(x, "")[[1]]), collapse = ""))
# Complement the sequence
complemented_sequence <- chartr("ACGT", "TGCA", reversed_sequence)
# Print the result to the console
cat(complemented_sequence, "\n")
## CGCCTGTTGGCTGGAAGTCGTAGTTTGCCTGAAGATATAATCCAAAGCAACCGGCCGCTGTGATTAGGAGTGCAGCGTGGTTTCAAATAAAACTATTGTCCAAAGTCGGTTAAACCGGCTGCGGTCCCCACTGCGTCCCCGTATTGGCGTGGAGCATCGCTCATATCGGGGACCTTCAGCTCGCTTGCTTATCCCCAGCCCACAAATTCCACTAAACGAAAAAACCATCCAATTTAATAGCTTTGGCGCATCGTCTGCCGTACCAAGGACTTGGACATTAACTGTCGACGACCCTGCCCAAGCACGGCCAGTATTGGCGCGTGTGGTGCCACCATAAGCATATATGCTTGCAGGTGCTTGAACATAGCTCTCGTGCATCGAGCGCCATCAGGCTATTTCGTAGAAGACCGAGATGAATCAACCTGACCCCATAGACTGGGTTACAGATGTGTGCTCACTGCATTGGTATTGGAGTAGTGGCCATTTTCCACTAAACGAGTGTTGTCGGGCCATGCCATTGCTCAACTATAGGTAACATCCTCTAACAGCCAAACCTGAAAGGAAGCGGGTGGCCTATTCTAGACCCAAACGAGGCGTTGAATAGTAAGCTTAAGACGGTAGTTGATCGAGTTTAAGCCCCCCGTCCCTACCGCTTCCGAGCAAAGCCACAGTCTGTTATTGATATATACGATTTTCTGTGAGGGTATAGGCTGTCCTAAACTAAGTCGGCAGATTGATTATGTATCTTTATATAACATCTGACGGTCGGGAACCTCCGGCGGACTTAGATATCGTGCACCAGTCCAGGCAGATGAGACTGATTCAGAGAACTGCAGTAGGGAAGATTCCCCAAGTCCGTGCAGTGCGCTCCACACCGAGCTGTCCTGTGGCCAGCCTGCTCGCTGTAATCTCGGCGCAGAATGCATTCTCTGCACTTTGTTTGTCAGACGAAT
# Define the function to calculate rabbit pairs
rabbit_pairs <- function(n, k) {
# If n is 1 or 2, return 1 as there is only 1 pair initially
if (n == 1 || n == 2) {
return(1)
}
# Initialize the first two months
F1 <- 1 # Rabbit pairs in month 1
F2 <- 1 # Rabbit pairs in month 2
# Calculate the number of pairs from month 3 to n
for (i in 3:n) {
# Calculate the next number of pairs
Fn <- F2 + k * F1
# Update the previous values
F1 <- F2
F2 <- Fn
}
# Return the number of pairs after n months
return(F2)
}
# Example usage
n <- 5 # Number of months
k <- 3 # Number of pairs produced by each mature pair
result <- rabbit_pairs(n, k)
print(result) # Output: 19
## [1] 19
# Run the awk command to calculate CG content
output <- system("awk '/^>/ {if (seqlen){ print seqname, (c+g)/seqlen*100 \"%\"}; c=g=seqlen=0; seqname=$0; next} {seqlen += length($0); c += gsub(/C/, \"C\"); g += gsub(/G/, \"G\")} END {if (seqlen) {print seqname, (c+g)/seqlen*100 \"%\"}}' rosalind_cg.fasta", intern = TRUE)
# Print the captured output line by line
cat(output, sep = "\n")
## >Rosalind_6404 53.75%
## >Rosalind_5959 53.5714%
## >Rosalind_0808 60.9195%
# Read sequences from FASTA file
read_sequences <- function(fasta_file) {
fasta_data <- readDNAStringSet(fasta_file)
if (length(fasta_data) != 2) {
stop("The FASTA file must contain exactly two sequences.")
}
return(fasta_data)
}
# Calculate point mutations
calculate_point_mutations <- function(seq1, seq2) {
if (nchar(seq1) != nchar(seq2)) {
stop("Sequences must be of the same length.")
}
mutations <- sum(strsplit(as.character(seq1), NULL)[[1]] != strsplit(as.character(seq2), NULL)[[1]])
return(mutations)
}
# Main
fasta_file <- "rosalind_mutations.fasta"
sequences <- read_sequences(fasta_file)
seq1 <- sequences[[1]]
seq2 <- sequences[[2]]
mutations <- calculate_point_mutations(seq1, seq2)
cat("Number of point mutations:", mutations, "\n")
## Number of point mutations: 7
# Define the numbers of each type
k <- 2 # Homozygous dominant
m <- 2 # Heterozygous
n <- 2 # Homozygous recessive
# Total population
total <- k + m + n
# Probability calculations for each pairing type
# P(dominant phenotype) = Probability of selecting each pair type * probability of dominant offspring
# Probabilities of each mating type
P_kk <- (k / total) * ((k - 1) / (total - 1))
P_km <- (k / total) * (m / (total - 1)) + (m / total) * (k / (total - 1))
P_kn <- (k / total) * (n / (total - 1)) + (n / total) * (k / (total - 1))
P_mm <- (m / total) * ((m - 1) / (total - 1))
P_mn <- (m / total) * (n / (total - 1)) + (n / total) * (m / (total - 1))
P_nn <- (n / total) * ((n - 1) / (total - 1))
# Total probability of producing a dominant offspring
prob_dominant <- P_kk * 1 + P_km * 1 + P_kn * 1 + P_mm * 0.75 + P_mn * 0.5 + P_nn * 0
# Print the probability
cat("Probability of dominant phenotype:", prob_dominant, "\n")
## Probability of dominant phenotype: 0.7833333
create a R file: rna_to_protein.R
# Read the RNA sequence from the file
rna_sequence <- readLines("rosalind_rna.txt")
# Define the RNA codon table
codon_table <- list(
"AUG" = "M", "UGG" = "W",
"UUU" = "F", "UUC" = "F", "UUA" = "L", "UUG" = "L",
"CUU" = "L", "CUC" = "L", "CUA" = "L", "CUG" = "L",
"AUU" = "I", "AUC" = "I", "AUA" = "I",
"GUU" = "V", "GUC" = "V", "GUA" = "V", "GUG" = "V",
"UCU" = "S", "UCC" = "S", "UCA" = "S", "UCG" = "S",
"CCU" = "P", "CCC" = "P", "CCA" = "P", "CCG" = "P",
"ACU" = "T", "ACC" = "T", "ACA" = "T", "ACG" = "T",
"GCU" = "A", "GCC" = "A", "GCA" = "A", "GCG" = "A",
"UAU" = "Y", "UAC" = "Y", "UGU" = "C", "UGC" = "C",
"CAU" = "H", "CAC" = "H", "CAA" = "Q", "CAG" = "Q",
"AAU" = "N", "AAC" = "N", "AAA" = "K", "AAG" = "K",
"GAU" = "D", "GAC" = "D", "GAA" = "E", "GAG" = "E",
"CGU" = "R", "CGC" = "R", "CGA" = "R", "CGG" = "R",
"AGU" = "S", "AGC" = "S", "AGA" = "R", "AGG" = "R",
"GGU" = "G", "GGC" = "G", "GGA" = "G", "GGG" = "G",
"UAA" = "", "UAG" = "", "UGA" = "" # Stop codons
)
# Function to translate RNA sequence to protein
translate_rna_to_protein <- function(rna) {
protein <- ""
# Process the RNA sequence in groups of three
for (i in seq(1, nchar(rna), by = 3)) {
codon <- substr(rna, i, i + 2)
# Append the corresponding amino acid, ignoring stop codons
if (codon_table[[codon]] != "") {
protein <- paste0(protein, codon_table[[codon]])
}
}
return(protein)
}
# Translate the RNA sequence
protein_string <- translate_rna_to_protein(rna_sequence)
print(protein_string)
source("rna_to_protein.R")
## [1] "RRGRHHTATPPHARGEESTATTNHDTRQKSATADRPTAKREQQPSRATRTNAPTRRDTPTQPPGENNPGRKRGSEEPPRESHGAKKQETRGGKHAADPQTTTNQTAPADKRSDRQRTDDHQ"
At first, I create a find_substring.R file
# Read the DNA string from the file
dna_string <- readLines("rosalind_dna.txt")
# Define the substring to search for
substring_to_find <- "ATAT"
# Function to find all positions of the substring in the main string
find_substring_positions <- function(main_string, sub_string) {
positions <- c() # Initialize an empty vector to store positions
# Loop through the main string and find matching positions
for (i in 1:(nchar(main_string) - nchar(sub_string) + 1)) {
# Extract a substring of the same length as sub_string
current_substring <- substr(main_string, i, i + nchar(sub_string) - 1)
# Check if the extracted substring matches sub_string
if (current_substring == sub_string) {
positions <- c(positions, i) # Store the position
}
}
return(positions)
}
# Find the positions of the substring in the DNA string
positions <- find_substring_positions(dna_string, substring_to_find)
# Print the positions
print(positions)
Motif ‘ATAT’ at position:
source("find_substring.R")
## [1] 679
Create a file rosalind_dna_consensus.R
# Load necessary libraries
library(Biostrings)
# Load the DNA sequences from the FASTA file
fasta_file <- "/Users/nnthieu/Downloads/GenomeProject1000/Blast/rosalind_dna.fasta" # Update with the correct path
sequences <- readDNAStringSet(fasta_file)
# Clean sequences by removing invalid characters
sequences_clean <- lapply(sequences, function(seq) {
gsub("[^ACGT]", "", as.character(seq)) # Remove characters not A, C, G, T
})
# Convert sequences to a matrix
dna_matrix <- as.matrix(sequences)
# Initialize the profile matrix
profile_matrix <- matrix(0, nrow = 4, ncol = ncol(dna_matrix),
dimnames = list(c("A", "C", "G", "T")))
# Fill the profile matrix with counts of each nucleotide
for (i in 1:ncol(dna_matrix)) {
nucleotides <- dna_matrix[, i]
profile_matrix["A", i] <- sum(nucleotides == "A")
profile_matrix["C", i] <- sum(nucleotides == "C")
profile_matrix["G", i] <- sum(nucleotides == "G")
profile_matrix["T", i] <- sum(nucleotides == "T")
}
# Determine the consensus sequence
consensus <- apply(profile_matrix, 2, function(column) {
names(which.max(column))
})
# Print the consensus string
cat("Consensus String:\n")
cat(paste(consensus, collapse = ""), "\n")
# Print the profile matrix
cat("\nProfile Matrix:\n")
print(profile_matrix)
source("rosalind_dna_consensus.R")
## Warning in .Call2("fasta_index", filexp_list, nrec, skip, seek.first.rec, :
## reading FASTA file
## /Users/nnthieu/Downloads/GenomeProject1000/Blast/rosalind_dna.fasta: ignored 49
## invalid one-letter sequence codes
## Consensus String:
## ATGCAACT
##
## Profile Matrix:
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## A 5 1 0 0 5 5 0 0
## C 0 0 1 4 2 0 6 1
## G 1 1 6 3 0 1 0 0
## T 1 5 0 0 0 1 1 6