##Programming exercise from http://manuals.bioinformatics.ucr.edu/home/programming-in-r
#Assignment: Write a function that calculates for a set of DNA sequences their GC content and generates their reverse and complement.
#
# ## Generate an example data frame with ID numbers and DNA sequences
# fx <- function(test) {
#   x <- as.integer(runif(20, min=1, max=5))
#   x[x==1] <- "A"; x[x==2] <- "T"; x[x==3] <- "G"; x[x==4] <- "C"
#   paste(x, sep = "", collapse = "")
# }
# z1 <- c()
# for(i in 1:50) {
#   z1 <- c(fx(i), z1)
# }
# z1 <- data.frame(ID=seq(along=z1), Seq=z1)
# z1
#
# my_split <- strsplit(as.character(z1[1,2]),"")
# ## Calculate in the same loop the GC content for each sequence using the following command
# table(my_split[[1]])/length(my_split[[1]])
# my_rev <- rev(my_split[[1]])
# paste(my_rev, collapse="")
#First part guided:
#Using the functions provided, first we define the function to generate each sequence's content
# Generate one random DNA sequence as a single string.
#
# Arguments:
#   test  Unused. It only exists so the function can be called as fx(i) from a
#         loop or an apply-style function.
#   len   Number of nucleotides in the sequence (default 20).
#
# Returns a length-one character vector made up of the letters A, T, G and C.
fx <- function(test, len = 20) {
  # Lookup table: position k holds the nucleotide assigned to the number k.
  bases <- c("A", "T", "G", "C")
  # Draw `len` uniform random numbers between 1 and 5 and truncate them to
  # integers, which gives the values 1 to 4 with equal probability.
  idx <- as.integer(runif(n = len, min = 1, max = 5))
  # Translate the numbers into letters and join them without separators.
  paste(bases[idx], collapse = "")
}
# Fix the seed of the random number generator so the same sequences are
# produced on every run.
set.seed(9991)
# Call fx() 50 times, in order, to generate 50 random DNA sequences.
# rev() puts the most recently generated sequence first, which is the order
# obtained by prepending each new sequence to the vector.
z1 <- rev(vapply(seq_len(50), fx, character(1)))
# Build the data frame: one column of IDs (positions along the vector) and one
# column holding the sequences themselves.
z1 <- data.frame(ID = seq_along(z1), Secuencia = z1)
z1
## ID Secuencia
## 1 1 TAGATGATACTTAAGGTAGG
## 2 2 TAACGGACCTACTAGGTTAT
## 3 3 GACACGTGCTCATCATCTAG
## 4 4 CAACGCCGTCAACGTTCTTA
## 5 5 ACATCCGTAGCAGACCTGGA
## 6 6 CGGGAACAGACCGTTAGATT
## 7 7 TCCTGGACGTAGGTAGATCA
## 8 8 GTTAGGTCGTCGAGTCATAA
## 9 9 ATGAGGGATGTGCGCATTCC
## 10 10 GGCTTGGTTCGGAGCCAAGT
## 11 11 TAGGGCCAAAGGCGACGACT
## 12 12 AACCTAGGTCCTTAATTCCC
## 13 13 CGGGAGTGCGACCTGACTTC
## 14 14 TTGACTGTCTCTTGGCCGCG
## 15 15 ATGTTACGTTATAAGTGAAG
## 16 16 CAAGGGATCGCGACGCTGGC
## 17 17 AGAGCGAGGTCCCTTCCTCG
## 18 18 GCCGGCGAGTGGACGGCTGC
## 19 19 ACCCTACATGCGTCGCCACA
## 20 20 TAAGCATGGACTAACAATTT
## 21 21 CCAGTGCCACTGAAGTTTGG
## 22 22 ACACTCCAACCTATCCCGAA
## 23 23 TGATGCCCGCAACTATGCGC
## 24 24 TTGAGCCAAAAGTCGGGTAG
## 25 25 GCCAAGAGTTGTTACACGGA
## 26 26 CCGAGGGGGACAGTTCTCGC
## 27 27 TCCTCTGCTAGATCAGTTCT
## 28 28 AACGGAATCAATTGGGATGA
## 29 29 AACGCGATTGGTCCGGTAAT
## 30 30 AAAGTTTCCATCGCAGATTT
## 31 31 TATTCAACTTTGTCAAGAGT
## 32 32 TGAAGCTCGGAGTTACCGGT
## 33 33 CCCGCAGGTAAATGGCTACA
## 34 34 CAAAGAAGGCGCAGCAGCCA
## 35 35 AATGTCAAATTGACTGAGTG
## 36 36 ACTCAGGGCGTGGTGCACAA
## 37 37 ATTGTTTTTCATGACAGTCG
## 38 38 GTGGAATTCCAGTACCGCCA
## 39 39 TATAACCAGGGGATCCGGCG
## 40 40 CCGGAGAGGAGCAGCCTGTC
## 41 41 CGACGTGGTCTAAACGGAGA
## 42 42 TTTTGAAGATGTCGCACGTC
## 43 43 CTCATGAGCCGATACTGACC
## 44 44 CAAAGGCAGTCAGGGTAACC
## 45 45 CCGCAAAGCCAACGAACAAC
## 46 46 TCGACTGTCTGGTGAACAAA
## 47 47 TATCGATCACGCATAAACTT
## 48 48 TTTAAACTTTTACTGCATGA
## 49 49 TCTGGATGTTGCTATGCGAA
## 50 50 GCGCTGGTGGTCCATTACCG
#My proposed solution for the second and third parts:
#Calculate the GC content for each sequence
# Split every sequence into a vector of single characters.
# strsplit() is vectorised over its input, so a single call returns a list
# with one character vector per row of z1, in row order. This also behaves
# correctly when z1 has no rows, where a 1:nrow loop would still iterate.
# as.character() is kept in case the sequence column is stored as a factor.
my_split <- strsplit(as.character(z1[[2]]), "")
my_split
## [[1]]
## [1] "T" "A" "G" "A" "T" "G" "A" "T" "A" "C" "T" "T" "A" "A" "G" "G" "T" "A" "G"
## [20] "G"
##
## [[2]]
## [1] "T" "A" "A" "C" "G" "G" "A" "C" "C" "T" "A" "C" "T" "A" "G" "G" "T" "T" "A"
## [20] "T"
##
## [[3]]
## [1] "G" "A" "C" "A" "C" "G" "T" "G" "C" "T" "C" "A" "T" "C" "A" "T" "C" "T" "A"
## [20] "G"
##
## [[4]]
## [1] "C" "A" "A" "C" "G" "C" "C" "G" "T" "C" "A" "A" "C" "G" "T" "T" "C" "T" "T"
## [20] "A"
##
## [[5]]
## [1] "A" "C" "A" "T" "C" "C" "G" "T" "A" "G" "C" "A" "G" "A" "C" "C" "T" "G" "G"
## [20] "A"
##
## [[6]]
## [1] "C" "G" "G" "G" "A" "A" "C" "A" "G" "A" "C" "C" "G" "T" "T" "A" "G" "A" "T"
## [20] "T"
##
## [[7]]
## [1] "T" "C" "C" "T" "G" "G" "A" "C" "G" "T" "A" "G" "G" "T" "A" "G" "A" "T" "C"
## [20] "A"
##
## [[8]]
## [1] "G" "T" "T" "A" "G" "G" "T" "C" "G" "T" "C" "G" "A" "G" "T" "C" "A" "T" "A"
## [20] "A"
##
## [[9]]
## [1] "A" "T" "G" "A" "G" "G" "G" "A" "T" "G" "T" "G" "C" "G" "C" "A" "T" "T" "C"
## [20] "C"
##
## [[10]]
## [1] "G" "G" "C" "T" "T" "G" "G" "T" "T" "C" "G" "G" "A" "G" "C" "C" "A" "A" "G"
## [20] "T"
##
## [[11]]
## [1] "T" "A" "G" "G" "G" "C" "C" "A" "A" "A" "G" "G" "C" "G" "A" "C" "G" "A" "C"
## [20] "T"
##
## [[12]]
## [1] "A" "A" "C" "C" "T" "A" "G" "G" "T" "C" "C" "T" "T" "A" "A" "T" "T" "C" "C"
## [20] "C"
##
## [[13]]
## [1] "C" "G" "G" "G" "A" "G" "T" "G" "C" "G" "A" "C" "C" "T" "G" "A" "C" "T" "T"
## [20] "C"
##
## [[14]]
## [1] "T" "T" "G" "A" "C" "T" "G" "T" "C" "T" "C" "T" "T" "G" "G" "C" "C" "G" "C"
## [20] "G"
##
## [[15]]
## [1] "A" "T" "G" "T" "T" "A" "C" "G" "T" "T" "A" "T" "A" "A" "G" "T" "G" "A" "A"
## [20] "G"
##
## [[16]]
## [1] "C" "A" "A" "G" "G" "G" "A" "T" "C" "G" "C" "G" "A" "C" "G" "C" "T" "G" "G"
## [20] "C"
##
## [[17]]
## [1] "A" "G" "A" "G" "C" "G" "A" "G" "G" "T" "C" "C" "C" "T" "T" "C" "C" "T" "C"
## [20] "G"
##
## [[18]]
## [1] "G" "C" "C" "G" "G" "C" "G" "A" "G" "T" "G" "G" "A" "C" "G" "G" "C" "T" "G"
## [20] "C"
##
## [[19]]
## [1] "A" "C" "C" "C" "T" "A" "C" "A" "T" "G" "C" "G" "T" "C" "G" "C" "C" "A" "C"
## [20] "A"
##
## [[20]]
## [1] "T" "A" "A" "G" "C" "A" "T" "G" "G" "A" "C" "T" "A" "A" "C" "A" "A" "T" "T"
## [20] "T"
##
## [[21]]
## [1] "C" "C" "A" "G" "T" "G" "C" "C" "A" "C" "T" "G" "A" "A" "G" "T" "T" "T" "G"
## [20] "G"
##
## [[22]]
## [1] "A" "C" "A" "C" "T" "C" "C" "A" "A" "C" "C" "T" "A" "T" "C" "C" "C" "G" "A"
## [20] "A"
##
## [[23]]
## [1] "T" "G" "A" "T" "G" "C" "C" "C" "G" "C" "A" "A" "C" "T" "A" "T" "G" "C" "G"
## [20] "C"
##
## [[24]]
## [1] "T" "T" "G" "A" "G" "C" "C" "A" "A" "A" "A" "G" "T" "C" "G" "G" "G" "T" "A"
## [20] "G"
##
## [[25]]
## [1] "G" "C" "C" "A" "A" "G" "A" "G" "T" "T" "G" "T" "T" "A" "C" "A" "C" "G" "G"
## [20] "A"
##
## [[26]]
## [1] "C" "C" "G" "A" "G" "G" "G" "G" "G" "A" "C" "A" "G" "T" "T" "C" "T" "C" "G"
## [20] "C"
##
## [[27]]
## [1] "T" "C" "C" "T" "C" "T" "G" "C" "T" "A" "G" "A" "T" "C" "A" "G" "T" "T" "C"
## [20] "T"
##
## [[28]]
## [1] "A" "A" "C" "G" "G" "A" "A" "T" "C" "A" "A" "T" "T" "G" "G" "G" "A" "T" "G"
## [20] "A"
##
## [[29]]
## [1] "A" "A" "C" "G" "C" "G" "A" "T" "T" "G" "G" "T" "C" "C" "G" "G" "T" "A" "A"
## [20] "T"
##
## [[30]]
## [1] "A" "A" "A" "G" "T" "T" "T" "C" "C" "A" "T" "C" "G" "C" "A" "G" "A" "T" "T"
## [20] "T"
##
## [[31]]
## [1] "T" "A" "T" "T" "C" "A" "A" "C" "T" "T" "T" "G" "T" "C" "A" "A" "G" "A" "G"
## [20] "T"
##
## [[32]]
## [1] "T" "G" "A" "A" "G" "C" "T" "C" "G" "G" "A" "G" "T" "T" "A" "C" "C" "G" "G"
## [20] "T"
##
## [[33]]
## [1] "C" "C" "C" "G" "C" "A" "G" "G" "T" "A" "A" "A" "T" "G" "G" "C" "T" "A" "C"
## [20] "A"
##
## [[34]]
## [1] "C" "A" "A" "A" "G" "A" "A" "G" "G" "C" "G" "C" "A" "G" "C" "A" "G" "C" "C"
## [20] "A"
##
## [[35]]
## [1] "A" "A" "T" "G" "T" "C" "A" "A" "A" "T" "T" "G" "A" "C" "T" "G" "A" "G" "T"
## [20] "G"
##
## [[36]]
## [1] "A" "C" "T" "C" "A" "G" "G" "G" "C" "G" "T" "G" "G" "T" "G" "C" "A" "C" "A"
## [20] "A"
##
## [[37]]
## [1] "A" "T" "T" "G" "T" "T" "T" "T" "T" "C" "A" "T" "G" "A" "C" "A" "G" "T" "C"
## [20] "G"
##
## [[38]]
## [1] "G" "T" "G" "G" "A" "A" "T" "T" "C" "C" "A" "G" "T" "A" "C" "C" "G" "C" "C"
## [20] "A"
##
## [[39]]
## [1] "T" "A" "T" "A" "A" "C" "C" "A" "G" "G" "G" "G" "A" "T" "C" "C" "G" "G" "C"
## [20] "G"
##
## [[40]]
## [1] "C" "C" "G" "G" "A" "G" "A" "G" "G" "A" "G" "C" "A" "G" "C" "C" "T" "G" "T"
## [20] "C"
##
## [[41]]
## [1] "C" "G" "A" "C" "G" "T" "G" "G" "T" "C" "T" "A" "A" "A" "C" "G" "G" "A" "G"
## [20] "A"
##
## [[42]]
## [1] "T" "T" "T" "T" "G" "A" "A" "G" "A" "T" "G" "T" "C" "G" "C" "A" "C" "G" "T"
## [20] "C"
##
## [[43]]
## [1] "C" "T" "C" "A" "T" "G" "A" "G" "C" "C" "G" "A" "T" "A" "C" "T" "G" "A" "C"
## [20] "C"
##
## [[44]]
## [1] "C" "A" "A" "A" "G" "G" "C" "A" "G" "T" "C" "A" "G" "G" "G" "T" "A" "A" "C"
## [20] "C"
##
## [[45]]
## [1] "C" "C" "G" "C" "A" "A" "A" "G" "C" "C" "A" "A" "C" "G" "A" "A" "C" "A" "A"
## [20] "C"
##
## [[46]]
## [1] "T" "C" "G" "A" "C" "T" "G" "T" "C" "T" "G" "G" "T" "G" "A" "A" "C" "A" "A"
## [20] "A"
##
## [[47]]
## [1] "T" "A" "T" "C" "G" "A" "T" "C" "A" "C" "G" "C" "A" "T" "A" "A" "A" "C" "T"
## [20] "T"
##
## [[48]]
## [1] "T" "T" "T" "A" "A" "A" "C" "T" "T" "T" "T" "A" "C" "T" "G" "C" "A" "T" "G"
## [20] "A"
##
## [[49]]
## [1] "T" "C" "T" "G" "G" "A" "T" "G" "T" "T" "G" "C" "T" "A" "T" "G" "C" "G" "A"
## [20] "A"
##
## [[50]]
## [1] "G" "C" "G" "C" "T" "G" "G" "T" "G" "G" "T" "C" "C" "A" "T" "T" "A" "C" "C"
## [20] "G"
# Proportion of the nucleotide `base` among the characters of one split
# sequence.
#
# Arguments:
#   chars  Character vector with one nucleotide per element.
#   base   The nucleotide letter to count.
#
# Returns a single number between 0 and 1. A nucleotide that does not occur
# (or an empty sequence) gives 0. Missing values are not counted as matches
# but still count towards the sequence length.
base_fraction <- function(chars, base) {
  if (length(chars) == 0L) {
    return(0)
  }
  sum(chars == base, na.rm = TRUE) / length(chars)
}

# We go through the list of character vectors saved in my_split and compute,
# for each sequence, the proportion of each nucleotide. The results keep the
# order of my_split, which is the same as the row order of the data frame z1.
Ad <- vapply(my_split, base_fraction, numeric(1), base = "A")
Gu <- vapply(my_split, base_fraction, numeric(1), base = "G")
Ci <- vapply(my_split, base_fraction, numeric(1), base = "C")
Ti <- vapply(my_split, base_fraction, numeric(1), base = "T")

# Build a new data frame (leaving the original data unaltered) with four extra
# columns holding the nucleotide proportions of each sequence.
z2 <- cbind(z1, pA_orig = Ad, pG_orig = Gu, pC_orig = Ci, pT_orig = Ti)
z2
## ID Secuencia pA_orig pG_orig pC_orig pT_orig
## 1 1 TAGATGATACTTAAGGTAGG 0.35 0.30 0.05 0.30
## 2 2 TAACGGACCTACTAGGTTAT 0.30 0.20 0.20 0.30
## 3 3 GACACGTGCTCATCATCTAG 0.25 0.20 0.30 0.25
## 4 4 CAACGCCGTCAACGTTCTTA 0.25 0.15 0.35 0.25
## 5 5 ACATCCGTAGCAGACCTGGA 0.30 0.25 0.30 0.15
## 6 6 CGGGAACAGACCGTTAGATT 0.30 0.30 0.20 0.20
## 7 7 TCCTGGACGTAGGTAGATCA 0.25 0.30 0.20 0.25
## 8 8 GTTAGGTCGTCGAGTCATAA 0.25 0.30 0.15 0.30
## 9 9 ATGAGGGATGTGCGCATTCC 0.20 0.35 0.20 0.25
## 10 10 GGCTTGGTTCGGAGCCAAGT 0.15 0.40 0.20 0.25
## 11 11 TAGGGCCAAAGGCGACGACT 0.30 0.35 0.25 0.10
## 12 12 AACCTAGGTCCTTAATTCCC 0.25 0.10 0.35 0.30
## 13 13 CGGGAGTGCGACCTGACTTC 0.15 0.35 0.30 0.20
## 14 14 TTGACTGTCTCTTGGCCGCG 0.05 0.30 0.30 0.35
## 15 15 ATGTTACGTTATAAGTGAAG 0.35 0.25 0.05 0.35
## 16 16 CAAGGGATCGCGACGCTGGC 0.20 0.40 0.30 0.10
## 17 17 AGAGCGAGGTCCCTTCCTCG 0.15 0.30 0.35 0.20
## 18 18 GCCGGCGAGTGGACGGCTGC 0.10 0.50 0.30 0.10
## 19 19 ACCCTACATGCGTCGCCACA 0.25 0.15 0.45 0.15
## 20 20 TAAGCATGGACTAACAATTT 0.40 0.15 0.15 0.30
## 21 21 CCAGTGCCACTGAAGTTTGG 0.20 0.30 0.25 0.25
## 22 22 ACACTCCAACCTATCCCGAA 0.35 0.05 0.45 0.15
## 23 23 TGATGCCCGCAACTATGCGC 0.20 0.25 0.35 0.20
## 24 24 TTGAGCCAAAAGTCGGGTAG 0.30 0.35 0.15 0.20
## 25 25 GCCAAGAGTTGTTACACGGA 0.30 0.30 0.20 0.20
## 26 26 CCGAGGGGGACAGTTCTCGC 0.15 0.40 0.30 0.15
## 27 27 TCCTCTGCTAGATCAGTTCT 0.15 0.15 0.30 0.40
## 28 28 AACGGAATCAATTGGGATGA 0.40 0.30 0.10 0.20
## 29 29 AACGCGATTGGTCCGGTAAT 0.25 0.30 0.20 0.25
## 30 30 AAAGTTTCCATCGCAGATTT 0.30 0.15 0.20 0.35
## 31 31 TATTCAACTTTGTCAAGAGT 0.30 0.15 0.15 0.40
## 32 32 TGAAGCTCGGAGTTACCGGT 0.20 0.35 0.20 0.25
## 33 33 CCCGCAGGTAAATGGCTACA 0.30 0.25 0.30 0.15
## 34 34 CAAAGAAGGCGCAGCAGCCA 0.40 0.30 0.30 0.00
## 35 35 AATGTCAAATTGACTGAGTG 0.35 0.25 0.10 0.30
## 36 36 ACTCAGGGCGTGGTGCACAA 0.25 0.35 0.25 0.15
## 37 37 ATTGTTTTTCATGACAGTCG 0.20 0.20 0.15 0.45
## 38 38 GTGGAATTCCAGTACCGCCA 0.25 0.25 0.30 0.20
## 39 39 TATAACCAGGGGATCCGGCG 0.25 0.35 0.25 0.15
## 40 40 CCGGAGAGGAGCAGCCTGTC 0.20 0.40 0.30 0.10
## 41 41 CGACGTGGTCTAAACGGAGA 0.30 0.35 0.20 0.15
## 42 42 TTTTGAAGATGTCGCACGTC 0.20 0.25 0.20 0.35
## 43 43 CTCATGAGCCGATACTGACC 0.25 0.20 0.35 0.20
## 44 44 CAAAGGCAGTCAGGGTAACC 0.35 0.30 0.25 0.10
## 45 45 CCGCAAAGCCAACGAACAAC 0.45 0.15 0.40 0.00
## 46 46 TCGACTGTCTGGTGAACAAA 0.30 0.25 0.20 0.25
## 47 47 TATCGATCACGCATAAACTT 0.35 0.10 0.25 0.30
## 48 48 TTTAAACTTTTACTGCATGA 0.30 0.10 0.15 0.45
## 49 49 TCTGGATGTTGCTATGCGAA 0.20 0.30 0.15 0.35
## 50 50 GCGCTGGTGGTCCATTACCG 0.10 0.35 0.30 0.25
#Generate the reverse and complement for each sequence
# Reverse the order of the characters inside every split sequence; the result
# is a new list with one reversed character vector per sequence.
my_rev <- lapply(my_split, function(chars) rev(chars))
# Join each reversed character vector back into one string per sequence.
my_rev_paste <- lapply(my_rev, function(chars) paste(chars, collapse = ""))
library(mgsub) #install package mgsub previously if not already install
# Replace every nucleotide of the reversed sequences with its complementary
# base (G <-> C, A <-> T) and keep the results in a new list. chartr()
# translates all characters in a single pass, so a base that has already been
# swapped is never swapped back. It is part of base R, so this step does not
# need the mgsub package.
my_rev_comp <- lapply(
  my_rev_paste,
  function(rev_seq) chartr("GCAT", "CGTA", rev_seq)
)
# Flatten the list into a character vector so it can be added as a column of
# the data frame.
my_rev_comp_vec <- unlist(my_rev_comp)
# New data frame with the previous columns plus the reverse-complement
# sequence column.
z3 <- cbind(z2, Rev_comp_seq = my_rev_comp_vec)
z3
## ID Secuencia pA_orig pG_orig pC_orig pT_orig Rev_comp_seq
## 1 1 TAGATGATACTTAAGGTAGG 0.35 0.30 0.05 0.30 CCTACCTTAAGTATCATCTA
## 2 2 TAACGGACCTACTAGGTTAT 0.30 0.20 0.20 0.30 ATAACCTAGTAGGTCCGTTA
## 3 3 GACACGTGCTCATCATCTAG 0.25 0.20 0.30 0.25 CTAGATGATGAGCACGTGTC
## 4 4 CAACGCCGTCAACGTTCTTA 0.25 0.15 0.35 0.25 TAAGAACGTTGACGGCGTTG
## 5 5 ACATCCGTAGCAGACCTGGA 0.30 0.25 0.30 0.15 TCCAGGTCTGCTACGGATGT
## 6 6 CGGGAACAGACCGTTAGATT 0.30 0.30 0.20 0.20 AATCTAACGGTCTGTTCCCG
## 7 7 TCCTGGACGTAGGTAGATCA 0.25 0.30 0.20 0.25 TGATCTACCTACGTCCAGGA
## 8 8 GTTAGGTCGTCGAGTCATAA 0.25 0.30 0.15 0.30 TTATGACTCGACGACCTAAC
## 9 9 ATGAGGGATGTGCGCATTCC 0.20 0.35 0.20 0.25 GGAATGCGCACATCCCTCAT
## 10 10 GGCTTGGTTCGGAGCCAAGT 0.15 0.40 0.20 0.25 ACTTGGCTCCGAACCAAGCC
## 11 11 TAGGGCCAAAGGCGACGACT 0.30 0.35 0.25 0.10 AGTCGTCGCCTTTGGCCCTA
## 12 12 AACCTAGGTCCTTAATTCCC 0.25 0.10 0.35 0.30 GGGAATTAAGGACCTAGGTT
## 13 13 CGGGAGTGCGACCTGACTTC 0.15 0.35 0.30 0.20 GAAGTCAGGTCGCACTCCCG
## 14 14 TTGACTGTCTCTTGGCCGCG 0.05 0.30 0.30 0.35 CGCGGCCAAGAGACAGTCAA
## 15 15 ATGTTACGTTATAAGTGAAG 0.35 0.25 0.05 0.35 CTTCACTTATAACGTAACAT
## 16 16 CAAGGGATCGCGACGCTGGC 0.20 0.40 0.30 0.10 GCCAGCGTCGCGATCCCTTG
## 17 17 AGAGCGAGGTCCCTTCCTCG 0.15 0.30 0.35 0.20 CGAGGAAGGGACCTCGCTCT
## 18 18 GCCGGCGAGTGGACGGCTGC 0.10 0.50 0.30 0.10 GCAGCCGTCCACTCGCCGGC
## 19 19 ACCCTACATGCGTCGCCACA 0.25 0.15 0.45 0.15 TGTGGCGACGCATGTAGGGT
## 20 20 TAAGCATGGACTAACAATTT 0.40 0.15 0.15 0.30 AAATTGTTAGTCCATGCTTA
## 21 21 CCAGTGCCACTGAAGTTTGG 0.20 0.30 0.25 0.25 CCAAACTTCAGTGGCACTGG
## 22 22 ACACTCCAACCTATCCCGAA 0.35 0.05 0.45 0.15 TTCGGGATAGGTTGGAGTGT
## 23 23 TGATGCCCGCAACTATGCGC 0.20 0.25 0.35 0.20 GCGCATAGTTGCGGGCATCA
## 24 24 TTGAGCCAAAAGTCGGGTAG 0.30 0.35 0.15 0.20 CTACCCGACTTTTGGCTCAA
## 25 25 GCCAAGAGTTGTTACACGGA 0.30 0.30 0.20 0.20 TCCGTGTAACAACTCTTGGC
## 26 26 CCGAGGGGGACAGTTCTCGC 0.15 0.40 0.30 0.15 GCGAGAACTGTCCCCCTCGG
## 27 27 TCCTCTGCTAGATCAGTTCT 0.15 0.15 0.30 0.40 AGAACTGATCTAGCAGAGGA
## 28 28 AACGGAATCAATTGGGATGA 0.40 0.30 0.10 0.20 TCATCCCAATTGATTCCGTT
## 29 29 AACGCGATTGGTCCGGTAAT 0.25 0.30 0.20 0.25 ATTACCGGACCAATCGCGTT
## 30 30 AAAGTTTCCATCGCAGATTT 0.30 0.15 0.20 0.35 AAATCTGCGATGGAAACTTT
## 31 31 TATTCAACTTTGTCAAGAGT 0.30 0.15 0.15 0.40 ACTCTTGACAAAGTTGAATA
## 32 32 TGAAGCTCGGAGTTACCGGT 0.20 0.35 0.20 0.25 ACCGGTAACTCCGAGCTTCA
## 33 33 CCCGCAGGTAAATGGCTACA 0.30 0.25 0.30 0.15 TGTAGCCATTTACCTGCGGG
## 34 34 CAAAGAAGGCGCAGCAGCCA 0.40 0.30 0.30 0.00 TGGCTGCTGCGCCTTCTTTG
## 35 35 AATGTCAAATTGACTGAGTG 0.35 0.25 0.10 0.30 CACTCAGTCAATTTGACATT
## 36 36 ACTCAGGGCGTGGTGCACAA 0.25 0.35 0.25 0.15 TTGTGCACCACGCCCTGAGT
## 37 37 ATTGTTTTTCATGACAGTCG 0.20 0.20 0.15 0.45 CGACTGTCATGAAAAACAAT
## 38 38 GTGGAATTCCAGTACCGCCA 0.25 0.25 0.30 0.20 TGGCGGTACTGGAATTCCAC
## 39 39 TATAACCAGGGGATCCGGCG 0.25 0.35 0.25 0.15 CGCCGGATCCCCTGGTTATA
## 40 40 CCGGAGAGGAGCAGCCTGTC 0.20 0.40 0.30 0.10 GACAGGCTGCTCCTCTCCGG
## 41 41 CGACGTGGTCTAAACGGAGA 0.30 0.35 0.20 0.15 TCTCCGTTTAGACCACGTCG
## 42 42 TTTTGAAGATGTCGCACGTC 0.20 0.25 0.20 0.35 GACGTGCGACATCTTCAAAA
## 43 43 CTCATGAGCCGATACTGACC 0.25 0.20 0.35 0.20 GGTCAGTATCGGCTCATGAG
## 44 44 CAAAGGCAGTCAGGGTAACC 0.35 0.30 0.25 0.10 GGTTACCCTGACTGCCTTTG
## 45 45 CCGCAAAGCCAACGAACAAC 0.45 0.15 0.40 0.00 GTTGTTCGTTGGCTTTGCGG
## 46 46 TCGACTGTCTGGTGAACAAA 0.30 0.25 0.20 0.25 TTTGTTCACCAGACAGTCGA
## 47 47 TATCGATCACGCATAAACTT 0.35 0.10 0.25 0.30 AAGTTTATGCGTGATCGATA
## 48 48 TTTAAACTTTTACTGCATGA 0.30 0.10 0.15 0.45 TCATGCAGTAAAAGTTTAAA
## 49 49 TCTGGATGTTGCTATGCGAA 0.20 0.30 0.15 0.35 TTCGCATAGCAACATCCAGA
## 50 50 GCGCTGGTGGTCCATTACCG 0.10 0.35 0.30 0.25 CGGTAATGGACCACCAGCGC