Reverse complementary problem @rosalind
rev_comp<- function(sequ){
#split & convert the sequence into a reverse character vector
rev_seq <- rev(strsplit(sequ, "")[[1]])
#empty vector to store the reverse complementary sequences
revcom<-c()
for (i in 1:length(rev_seq)) {
if (rev_seq[i] == "A") {
revcom[i] <- "T"
} else if (rev_seq[i] == "T") {
revcom[i] <- "A"
}else if (rev_seq[i] == "G") {
revcom[i] <- "C"
}else {
revcom[i] <- "G"
}
}
#collapse to get rid of "" spaces in the vector
return(paste(revcom, collapse=""))
}
#Example:
sequence<-c("AAAACCCGGT")
rev_comp(sequence)
## [1] "ACCGGGTTTT"
abc<-c("GATATC")
rev_comp(abc)
## [1] "GATATC"
For the palindrome:
To find_reverse_palindromes function takes a DNA sequence as input, and iterates over all possible subsequences of length between 4 and 12. For each subsequence, it checks if the subsequence is a reverse palindrome by comparing it to its own reverse complementary sequence with the code above. When a reverse palindrome is found: its start positions and length are stored in a dataframe.
rev_pal<- function(seq) {
#empty list to store palindrome locations and length
pal_loc<-data.frame(matrix(ncol=2))
colnames(pal_loc)<-c("Position","Length")
for (i in 1:(nchar(seq)-4)) {
for (j in 4:12){
#To prevent end of the sequence to be checked:
if(i+j <=nchar(seq)) {
#Get a substring starting from i, ending at i+j
split_seq<-substr(seq, i, i+j)
# check if its palindrome
if ( rev_comp(split_seq) == split_seq) {
#add the positin and length to dataframe:
subdf<-data.frame(i,nchar(split_seq))
colnames(subdf)<-c("Position","Length")
#Add to the final df
pal_loc<-rbind(pal_loc, subdf)
}
j<- j+1
}
i<-i+1
}
}
#First row has NA values due to first rbind(pal_loc,subdf)
#Remove it
pal_loc<-pal_loc[-1,]
return(pal_loc)
}
example<-c("GCGCATGCGGTAGAGGGTTGGCCACAAAGAGCTCCCAATATGGTAATTTCCGCCTCAGCTAGGTGCCCAAATCATAACCTGTGTCTGCGGGATTCCCTATACTTGAAAGTTGGCGGAAACCGTATGTTGTGGTGTCCTGCTCGTACCAAATATAATATAGTTAGCTGTGCTGTAGTAGCGAGGGTTTAGTAATGCCCAGTAGAGCGACGAGCTATTTGCAAAAGTGCCCTGCGGGTGCCATACCACTTCAATGGGGATAATAACTTTGGCCCGGAAACAAGGCTCTTAGGAACAGCTCCTCCAACCCGACCCTTCGAGAACGAGAGATTGAAATCCGCGCTGTAGGAGATATACTTTCGTGGAATTACCAGCAACGGGACCGCTGGAACGAACGGGGGTTATCTAACGGTTTCTGATGATCTGGTTACTCACGTACTTCCTCCTGAGAGCACACATATATGTGACTCTGCACCGTCGCTAACTTCTTGTCTTCTATAACGTCTAATACTTTATTGTTGTCGATAAGCCAATGAGCAGGCACCTCTAAGCGTTCAATTCACTGGCCCTTAGGTCTCGTACTAACTTATGCTACCTGGTTGTGTCTCCGAACAAGGTTAGGACCGTCGTAAACTACCCACTGTTACTCGTAGTCGGATGGGTTCTCAAGGCGGGGCCAGGAAAACACGAAGTATAGGACACTGATGGCGTCTCCTGTTCACTTATCGGTTAACGCGAGTGAAGAGCCTCACGACGTTATATCGTTTACCGTACTAAGCGTGTCCGGCAAGCACAAACGCCAGAATTAGAATCAACTTTTGGCTCAAGATAGGCTAGTATAACGCCAGCTCACCCCTTTGCGTTTGTGCTGTAACAACGTTTTTCTCGATGACTCCCCGTGCATGTACGATCCATAT")
rev_pal(example)
## Position Length
## 2 3 6
## 3 19 6
## 4 29 6
## 5 215 8
## 6 216 6
## 7 452 12
## 8 453 10
## 9 454 8
## 10 455 6
## 11 726 6
## 12 873 6
Another example with multiple GCATGC in the sequence:
random<-c("AGCATGCAGCATGCTGCATGCCTGCATGCGGGCATGCTTAGCATGC")
rev_pal(random)
## Position Length
## 2 2 6
## 3 7 10
## 4 8 8
## 5 9 6
## 6 16 6
## 7 24 6
## 8 32 6
## 9 41 6