## Programming exercise from http://manuals.bioinformatics.ucr.edu/home/programming-in-r

# Assignment: Write a function that calculates for a set of DNA sequences their GC content and generates their reverse and complement.
#
# ## Generate an example data frame with ID numbers and DNA sequences
# fx <- function(test) {
#   x <- as.integer(runif(20, min=1, max=5))
#   x[x==1] <- "A"; x[x==2] <- "T"; x[x==3] <- "G"; x[x==4] <- "C"
#   paste(x, sep="", collapse="")
# }
# z1 <- c()
# for(i in 1:50) {
#   z1 <- c(fx(i), z1)
# }
# z1 <- data.frame(ID=seq(along=z1), Seq=z1)
# z1

# ## Write each character of sequence into separate vector field and reverse its order
# my_split <- strsplit(as.character(z1[1,2]),"")
#
# ## Calculate in the same loop the GC content for each sequence using the following command
# table(my_split[[1]])/length(my_split[[1]])
# my_rev <- rev(my_split[[1]])
# paste(my_rev, collapse="")
#
# ## Generate the sequence complement by replacing G|C|A|T by C|G|T|A
#
# ## Use 'apply' or 'for loop' to apply the above operations to all sequences in sample data frame 'z1'

#First part guided:

#Using the functions provided, first we define the function to generate each sequence's content 
# Generate one random DNA sequence and return it as a single string.
#
# Arguments:
#   test  Unused. It is kept so the function can still be called as fx(i)
#         from a loop or an apply-style helper.
#   n     Number of nucleotides in the sequence. Defaults to 20, the length
#         used throughout this exercise.
#
# Returns a length-one character vector made up of the letters A, T, G and C
# (the empty string when n is 0).
fx <- function(test, n = 20) {
  # Draw n uniform values between 1 and 5 and truncate them to whole numbers,
  # which gives the codes 1 to 4 with equal probability.
  codes <- as.integer(runif(n = n, min = 1, max = 5))
  # Translate the codes to letters by position: 1 = A, 2 = T, 3 = G, 4 = C.
  nucleotides <- c("A", "T", "G", "C")
  # Join the individual letters into one string with nothing between them.
  paste(nucleotides[codes], collapse = "")
}

# Build the sample data: 50 randomly generated DNA sequences with numeric IDs.
set.seed(9991) # fixed seed so that every run produces the same sequences

# Generate the sequences one after another, then reverse the vector so that
# the most recently generated sequence comes first.
z1 <- rev(vapply(1:50, fx, character(1)))

# Two columns: ID (the position of each sequence in the vector) and Secuencia
# (the sequence itself).
z1 <- data.frame(ID = seq_along(z1), Secuencia = z1)
z1
##    ID            Secuencia
## 1   1 TAGATGATACTTAAGGTAGG
## 2   2 TAACGGACCTACTAGGTTAT
## 3   3 GACACGTGCTCATCATCTAG
## 4   4 CAACGCCGTCAACGTTCTTA
## 5   5 ACATCCGTAGCAGACCTGGA
## 6   6 CGGGAACAGACCGTTAGATT
## 7   7 TCCTGGACGTAGGTAGATCA
## 8   8 GTTAGGTCGTCGAGTCATAA
## 9   9 ATGAGGGATGTGCGCATTCC
## 10 10 GGCTTGGTTCGGAGCCAAGT
## 11 11 TAGGGCCAAAGGCGACGACT
## 12 12 AACCTAGGTCCTTAATTCCC
## 13 13 CGGGAGTGCGACCTGACTTC
## 14 14 TTGACTGTCTCTTGGCCGCG
## 15 15 ATGTTACGTTATAAGTGAAG
## 16 16 CAAGGGATCGCGACGCTGGC
## 17 17 AGAGCGAGGTCCCTTCCTCG
## 18 18 GCCGGCGAGTGGACGGCTGC
## 19 19 ACCCTACATGCGTCGCCACA
## 20 20 TAAGCATGGACTAACAATTT
## 21 21 CCAGTGCCACTGAAGTTTGG
## 22 22 ACACTCCAACCTATCCCGAA
## 23 23 TGATGCCCGCAACTATGCGC
## 24 24 TTGAGCCAAAAGTCGGGTAG
## 25 25 GCCAAGAGTTGTTACACGGA
## 26 26 CCGAGGGGGACAGTTCTCGC
## 27 27 TCCTCTGCTAGATCAGTTCT
## 28 28 AACGGAATCAATTGGGATGA
## 29 29 AACGCGATTGGTCCGGTAAT
## 30 30 AAAGTTTCCATCGCAGATTT
## 31 31 TATTCAACTTTGTCAAGAGT
## 32 32 TGAAGCTCGGAGTTACCGGT
## 33 33 CCCGCAGGTAAATGGCTACA
## 34 34 CAAAGAAGGCGCAGCAGCCA
## 35 35 AATGTCAAATTGACTGAGTG
## 36 36 ACTCAGGGCGTGGTGCACAA
## 37 37 ATTGTTTTTCATGACAGTCG
## 38 38 GTGGAATTCCAGTACCGCCA
## 39 39 TATAACCAGGGGATCCGGCG
## 40 40 CCGGAGAGGAGCAGCCTGTC
## 41 41 CGACGTGGTCTAAACGGAGA
## 42 42 TTTTGAAGATGTCGCACGTC
## 43 43 CTCATGAGCCGATACTGACC
## 44 44 CAAAGGCAGTCAGGGTAACC
## 45 45 CCGCAAAGCCAACGAACAAC
## 46 46 TCGACTGTCTGGTGAACAAA
## 47 47 TATCGATCACGCATAAACTT
## 48 48 TTTAAACTTTTACTGCATGA
## 49 49 TCTGGATGTTGCTATGCGAA
## 50 50 GCGCTGGTGGTCCATTACCG

#My proposed solution for second and third part:

#Calculate the GC content for each sequence

# Split every sequence into a vector of single characters. strsplit() is
# vectorised, so one call handles the whole sequence column and returns a list
# with one character vector per row of z1, in row order. When z1 has no rows
# the result is simply an empty list.
my_split <- strsplit(as.character(z1[[2]]), "")
my_split
## [[1]]
##  [1] "T" "A" "G" "A" "T" "G" "A" "T" "A" "C" "T" "T" "A" "A" "G" "G" "T" "A" "G"
## [20] "G"
## 
## [[2]]
##  [1] "T" "A" "A" "C" "G" "G" "A" "C" "C" "T" "A" "C" "T" "A" "G" "G" "T" "T" "A"
## [20] "T"
## 
## [[3]]
##  [1] "G" "A" "C" "A" "C" "G" "T" "G" "C" "T" "C" "A" "T" "C" "A" "T" "C" "T" "A"
## [20] "G"
## 
## [[4]]
##  [1] "C" "A" "A" "C" "G" "C" "C" "G" "T" "C" "A" "A" "C" "G" "T" "T" "C" "T" "T"
## [20] "A"
## 
## [[5]]
##  [1] "A" "C" "A" "T" "C" "C" "G" "T" "A" "G" "C" "A" "G" "A" "C" "C" "T" "G" "G"
## [20] "A"
## 
## [[6]]
##  [1] "C" "G" "G" "G" "A" "A" "C" "A" "G" "A" "C" "C" "G" "T" "T" "A" "G" "A" "T"
## [20] "T"
## 
## [[7]]
##  [1] "T" "C" "C" "T" "G" "G" "A" "C" "G" "T" "A" "G" "G" "T" "A" "G" "A" "T" "C"
## [20] "A"
## 
## [[8]]
##  [1] "G" "T" "T" "A" "G" "G" "T" "C" "G" "T" "C" "G" "A" "G" "T" "C" "A" "T" "A"
## [20] "A"
## 
## [[9]]
##  [1] "A" "T" "G" "A" "G" "G" "G" "A" "T" "G" "T" "G" "C" "G" "C" "A" "T" "T" "C"
## [20] "C"
## 
## [[10]]
##  [1] "G" "G" "C" "T" "T" "G" "G" "T" "T" "C" "G" "G" "A" "G" "C" "C" "A" "A" "G"
## [20] "T"
## 
## [[11]]
##  [1] "T" "A" "G" "G" "G" "C" "C" "A" "A" "A" "G" "G" "C" "G" "A" "C" "G" "A" "C"
## [20] "T"
## 
## [[12]]
##  [1] "A" "A" "C" "C" "T" "A" "G" "G" "T" "C" "C" "T" "T" "A" "A" "T" "T" "C" "C"
## [20] "C"
## 
## [[13]]
##  [1] "C" "G" "G" "G" "A" "G" "T" "G" "C" "G" "A" "C" "C" "T" "G" "A" "C" "T" "T"
## [20] "C"
## 
## [[14]]
##  [1] "T" "T" "G" "A" "C" "T" "G" "T" "C" "T" "C" "T" "T" "G" "G" "C" "C" "G" "C"
## [20] "G"
## 
## [[15]]
##  [1] "A" "T" "G" "T" "T" "A" "C" "G" "T" "T" "A" "T" "A" "A" "G" "T" "G" "A" "A"
## [20] "G"
## 
## [[16]]
##  [1] "C" "A" "A" "G" "G" "G" "A" "T" "C" "G" "C" "G" "A" "C" "G" "C" "T" "G" "G"
## [20] "C"
## 
## [[17]]
##  [1] "A" "G" "A" "G" "C" "G" "A" "G" "G" "T" "C" "C" "C" "T" "T" "C" "C" "T" "C"
## [20] "G"
## 
## [[18]]
##  [1] "G" "C" "C" "G" "G" "C" "G" "A" "G" "T" "G" "G" "A" "C" "G" "G" "C" "T" "G"
## [20] "C"
## 
## [[19]]
##  [1] "A" "C" "C" "C" "T" "A" "C" "A" "T" "G" "C" "G" "T" "C" "G" "C" "C" "A" "C"
## [20] "A"
## 
## [[20]]
##  [1] "T" "A" "A" "G" "C" "A" "T" "G" "G" "A" "C" "T" "A" "A" "C" "A" "A" "T" "T"
## [20] "T"
## 
## [[21]]
##  [1] "C" "C" "A" "G" "T" "G" "C" "C" "A" "C" "T" "G" "A" "A" "G" "T" "T" "T" "G"
## [20] "G"
## 
## [[22]]
##  [1] "A" "C" "A" "C" "T" "C" "C" "A" "A" "C" "C" "T" "A" "T" "C" "C" "C" "G" "A"
## [20] "A"
## 
## [[23]]
##  [1] "T" "G" "A" "T" "G" "C" "C" "C" "G" "C" "A" "A" "C" "T" "A" "T" "G" "C" "G"
## [20] "C"
## 
## [[24]]
##  [1] "T" "T" "G" "A" "G" "C" "C" "A" "A" "A" "A" "G" "T" "C" "G" "G" "G" "T" "A"
## [20] "G"
## 
## [[25]]
##  [1] "G" "C" "C" "A" "A" "G" "A" "G" "T" "T" "G" "T" "T" "A" "C" "A" "C" "G" "G"
## [20] "A"
## 
## [[26]]
##  [1] "C" "C" "G" "A" "G" "G" "G" "G" "G" "A" "C" "A" "G" "T" "T" "C" "T" "C" "G"
## [20] "C"
## 
## [[27]]
##  [1] "T" "C" "C" "T" "C" "T" "G" "C" "T" "A" "G" "A" "T" "C" "A" "G" "T" "T" "C"
## [20] "T"
## 
## [[28]]
##  [1] "A" "A" "C" "G" "G" "A" "A" "T" "C" "A" "A" "T" "T" "G" "G" "G" "A" "T" "G"
## [20] "A"
## 
## [[29]]
##  [1] "A" "A" "C" "G" "C" "G" "A" "T" "T" "G" "G" "T" "C" "C" "G" "G" "T" "A" "A"
## [20] "T"
## 
## [[30]]
##  [1] "A" "A" "A" "G" "T" "T" "T" "C" "C" "A" "T" "C" "G" "C" "A" "G" "A" "T" "T"
## [20] "T"
## 
## [[31]]
##  [1] "T" "A" "T" "T" "C" "A" "A" "C" "T" "T" "T" "G" "T" "C" "A" "A" "G" "A" "G"
## [20] "T"
## 
## [[32]]
##  [1] "T" "G" "A" "A" "G" "C" "T" "C" "G" "G" "A" "G" "T" "T" "A" "C" "C" "G" "G"
## [20] "T"
## 
## [[33]]
##  [1] "C" "C" "C" "G" "C" "A" "G" "G" "T" "A" "A" "A" "T" "G" "G" "C" "T" "A" "C"
## [20] "A"
## 
## [[34]]
##  [1] "C" "A" "A" "A" "G" "A" "A" "G" "G" "C" "G" "C" "A" "G" "C" "A" "G" "C" "C"
## [20] "A"
## 
## [[35]]
##  [1] "A" "A" "T" "G" "T" "C" "A" "A" "A" "T" "T" "G" "A" "C" "T" "G" "A" "G" "T"
## [20] "G"
## 
## [[36]]
##  [1] "A" "C" "T" "C" "A" "G" "G" "G" "C" "G" "T" "G" "G" "T" "G" "C" "A" "C" "A"
## [20] "A"
## 
## [[37]]
##  [1] "A" "T" "T" "G" "T" "T" "T" "T" "T" "C" "A" "T" "G" "A" "C" "A" "G" "T" "C"
## [20] "G"
## 
## [[38]]
##  [1] "G" "T" "G" "G" "A" "A" "T" "T" "C" "C" "A" "G" "T" "A" "C" "C" "G" "C" "C"
## [20] "A"
## 
## [[39]]
##  [1] "T" "A" "T" "A" "A" "C" "C" "A" "G" "G" "G" "G" "A" "T" "C" "C" "G" "G" "C"
## [20] "G"
## 
## [[40]]
##  [1] "C" "C" "G" "G" "A" "G" "A" "G" "G" "A" "G" "C" "A" "G" "C" "C" "T" "G" "T"
## [20] "C"
## 
## [[41]]
##  [1] "C" "G" "A" "C" "G" "T" "G" "G" "T" "C" "T" "A" "A" "A" "C" "G" "G" "A" "G"
## [20] "A"
## 
## [[42]]
##  [1] "T" "T" "T" "T" "G" "A" "A" "G" "A" "T" "G" "T" "C" "G" "C" "A" "C" "G" "T"
## [20] "C"
## 
## [[43]]
##  [1] "C" "T" "C" "A" "T" "G" "A" "G" "C" "C" "G" "A" "T" "A" "C" "T" "G" "A" "C"
## [20] "C"
## 
## [[44]]
##  [1] "C" "A" "A" "A" "G" "G" "C" "A" "G" "T" "C" "A" "G" "G" "G" "T" "A" "A" "C"
## [20] "C"
## 
## [[45]]
##  [1] "C" "C" "G" "C" "A" "A" "A" "G" "C" "C" "A" "A" "C" "G" "A" "A" "C" "A" "A"
## [20] "C"
## 
## [[46]]
##  [1] "T" "C" "G" "A" "C" "T" "G" "T" "C" "T" "G" "G" "T" "G" "A" "A" "C" "A" "A"
## [20] "A"
## 
## [[47]]
##  [1] "T" "A" "T" "C" "G" "A" "T" "C" "A" "C" "G" "C" "A" "T" "A" "A" "A" "C" "T"
## [20] "T"
## 
## [[48]]
##  [1] "T" "T" "T" "A" "A" "A" "C" "T" "T" "T" "T" "A" "C" "T" "G" "C" "A" "T" "G"
## [20] "A"
## 
## [[49]]
##  [1] "T" "C" "T" "G" "G" "A" "T" "G" "T" "T" "G" "C" "T" "A" "T" "G" "C" "G" "A"
## [20] "A"
## 
## [[50]]
##  [1] "G" "C" "G" "C" "T" "G" "G" "T" "G" "G" "T" "C" "C" "A" "T" "T" "A" "C" "C"
## [20] "G"
# Proportion of one nucleotide within a single split sequence.
#
# Arguments:
#   chars       Character vector holding one nucleotide per element.
#   nucleotide  The letter to count, e.g. "A".
#
# Returns the number of matching elements divided by the length of chars.
# An absent nucleotide or an empty sequence gives 0.
base_fraction <- function(chars, nucleotide) {
  if (length(chars) == 0L) {
    return(0)
  }
  # Missing values are not counted as matches but still count towards the
  # sequence length.
  sum(chars == nucleotide, na.rm = TRUE) / length(chars)
}

# We go through the list of character vectors stored in my_split and compute,
# for each sequence, the proportion of every nucleotide. Each result vector
# holds one value per sequence, in the same order as the rows of the data
# frame z1.
Ad <- vapply(my_split, base_fraction, numeric(1), nucleotide = "A")
Gu <- vapply(my_split, base_fraction, numeric(1), nucleotide = "G")
Ci <- vapply(my_split, base_fraction, numeric(1), nucleotide = "C")
Ti <- vapply(my_split, base_fraction, numeric(1), nucleotide = "T")

# Build a new data frame rather than modifying z1, so the original data stays
# unaltered. It contains the columns of z1 followed by four columns with the
# proportion of each nucleotide in every sequence.
z2 <- data.frame(
  z1,
  pA_orig = Ad,
  pG_orig = Gu,
  pC_orig = Ci,
  pT_orig = Ti,
  check.names = FALSE
)
z2
##    ID            Secuencia pA_orig pG_orig pC_orig pT_orig
## 1   1 TAGATGATACTTAAGGTAGG    0.35    0.30    0.05    0.30
## 2   2 TAACGGACCTACTAGGTTAT    0.30    0.20    0.20    0.30
## 3   3 GACACGTGCTCATCATCTAG    0.25    0.20    0.30    0.25
## 4   4 CAACGCCGTCAACGTTCTTA    0.25    0.15    0.35    0.25
## 5   5 ACATCCGTAGCAGACCTGGA    0.30    0.25    0.30    0.15
## 6   6 CGGGAACAGACCGTTAGATT    0.30    0.30    0.20    0.20
## 7   7 TCCTGGACGTAGGTAGATCA    0.25    0.30    0.20    0.25
## 8   8 GTTAGGTCGTCGAGTCATAA    0.25    0.30    0.15    0.30
## 9   9 ATGAGGGATGTGCGCATTCC    0.20    0.35    0.20    0.25
## 10 10 GGCTTGGTTCGGAGCCAAGT    0.15    0.40    0.20    0.25
## 11 11 TAGGGCCAAAGGCGACGACT    0.30    0.35    0.25    0.10
## 12 12 AACCTAGGTCCTTAATTCCC    0.25    0.10    0.35    0.30
## 13 13 CGGGAGTGCGACCTGACTTC    0.15    0.35    0.30    0.20
## 14 14 TTGACTGTCTCTTGGCCGCG    0.05    0.30    0.30    0.35
## 15 15 ATGTTACGTTATAAGTGAAG    0.35    0.25    0.05    0.35
## 16 16 CAAGGGATCGCGACGCTGGC    0.20    0.40    0.30    0.10
## 17 17 AGAGCGAGGTCCCTTCCTCG    0.15    0.30    0.35    0.20
## 18 18 GCCGGCGAGTGGACGGCTGC    0.10    0.50    0.30    0.10
## 19 19 ACCCTACATGCGTCGCCACA    0.25    0.15    0.45    0.15
## 20 20 TAAGCATGGACTAACAATTT    0.40    0.15    0.15    0.30
## 21 21 CCAGTGCCACTGAAGTTTGG    0.20    0.30    0.25    0.25
## 22 22 ACACTCCAACCTATCCCGAA    0.35    0.05    0.45    0.15
## 23 23 TGATGCCCGCAACTATGCGC    0.20    0.25    0.35    0.20
## 24 24 TTGAGCCAAAAGTCGGGTAG    0.30    0.35    0.15    0.20
## 25 25 GCCAAGAGTTGTTACACGGA    0.30    0.30    0.20    0.20
## 26 26 CCGAGGGGGACAGTTCTCGC    0.15    0.40    0.30    0.15
## 27 27 TCCTCTGCTAGATCAGTTCT    0.15    0.15    0.30    0.40
## 28 28 AACGGAATCAATTGGGATGA    0.40    0.30    0.10    0.20
## 29 29 AACGCGATTGGTCCGGTAAT    0.25    0.30    0.20    0.25
## 30 30 AAAGTTTCCATCGCAGATTT    0.30    0.15    0.20    0.35
## 31 31 TATTCAACTTTGTCAAGAGT    0.30    0.15    0.15    0.40
## 32 32 TGAAGCTCGGAGTTACCGGT    0.20    0.35    0.20    0.25
## 33 33 CCCGCAGGTAAATGGCTACA    0.30    0.25    0.30    0.15
## 34 34 CAAAGAAGGCGCAGCAGCCA    0.40    0.30    0.30    0.00
## 35 35 AATGTCAAATTGACTGAGTG    0.35    0.25    0.10    0.30
## 36 36 ACTCAGGGCGTGGTGCACAA    0.25    0.35    0.25    0.15
## 37 37 ATTGTTTTTCATGACAGTCG    0.20    0.20    0.15    0.45
## 38 38 GTGGAATTCCAGTACCGCCA    0.25    0.25    0.30    0.20
## 39 39 TATAACCAGGGGATCCGGCG    0.25    0.35    0.25    0.15
## 40 40 CCGGAGAGGAGCAGCCTGTC    0.20    0.40    0.30    0.10
## 41 41 CGACGTGGTCTAAACGGAGA    0.30    0.35    0.20    0.15
## 42 42 TTTTGAAGATGTCGCACGTC    0.20    0.25    0.20    0.35
## 43 43 CTCATGAGCCGATACTGACC    0.25    0.20    0.35    0.20
## 44 44 CAAAGGCAGTCAGGGTAACC    0.35    0.30    0.25    0.10
## 45 45 CCGCAAAGCCAACGAACAAC    0.45    0.15    0.40    0.00
## 46 46 TCGACTGTCTGGTGAACAAA    0.30    0.25    0.20    0.25
## 47 47 TATCGATCACGCATAAACTT    0.35    0.10    0.25    0.30
## 48 48 TTTAAACTTTTACTGCATGA    0.30    0.10    0.15    0.45
## 49 49 TCTGGATGTTGCTATGCGAA    0.20    0.30    0.15    0.35
## 50 50 GCGCTGGTGGTCCATTACCG    0.10    0.35    0.30    0.25

#Generate the reverse and complement for each sequence

# Reverse every sequence: flip the order of the characters in each vector of
# my_split and keep the results in a new list.
my_rev <- lapply(my_split, rev)

# Paste the characters of each reversed vector back together, giving one
# string per reversed sequence.
my_rev_paste <- lapply(my_rev, paste, collapse = "")

# NOTE(review): this section no longer calls mgsub. The package is still
# attached here in case code outside this section relies on it - drop this
# line if nothing else does.
library(mgsub)

# Replace each nucleotide by its complement (G <-> C, A <-> T). chartr()
# translates all characters in a single pass, so a base that has already been
# swapped is never swapped back. The result is a list with one string per
# sequence.
my_rev_comp <- lapply(my_rev_paste, chartr, old = "GCAT", new = "CGTA")

# Flatten the list into a character vector so it can be added as a column.
my_rev_comp_vec <- unlist(my_rev_comp)

# New data frame containing the previous columns plus the reverse
# complementary sequence.
z3 <- cbind(z2, Rev_comp_seq = my_rev_comp_vec)
z3
##    ID            Secuencia pA_orig pG_orig pC_orig pT_orig         Rev_comp_seq
## 1   1 TAGATGATACTTAAGGTAGG    0.35    0.30    0.05    0.30 CCTACCTTAAGTATCATCTA
## 2   2 TAACGGACCTACTAGGTTAT    0.30    0.20    0.20    0.30 ATAACCTAGTAGGTCCGTTA
## 3   3 GACACGTGCTCATCATCTAG    0.25    0.20    0.30    0.25 CTAGATGATGAGCACGTGTC
## 4   4 CAACGCCGTCAACGTTCTTA    0.25    0.15    0.35    0.25 TAAGAACGTTGACGGCGTTG
## 5   5 ACATCCGTAGCAGACCTGGA    0.30    0.25    0.30    0.15 TCCAGGTCTGCTACGGATGT
## 6   6 CGGGAACAGACCGTTAGATT    0.30    0.30    0.20    0.20 AATCTAACGGTCTGTTCCCG
## 7   7 TCCTGGACGTAGGTAGATCA    0.25    0.30    0.20    0.25 TGATCTACCTACGTCCAGGA
## 8   8 GTTAGGTCGTCGAGTCATAA    0.25    0.30    0.15    0.30 TTATGACTCGACGACCTAAC
## 9   9 ATGAGGGATGTGCGCATTCC    0.20    0.35    0.20    0.25 GGAATGCGCACATCCCTCAT
## 10 10 GGCTTGGTTCGGAGCCAAGT    0.15    0.40    0.20    0.25 ACTTGGCTCCGAACCAAGCC
## 11 11 TAGGGCCAAAGGCGACGACT    0.30    0.35    0.25    0.10 AGTCGTCGCCTTTGGCCCTA
## 12 12 AACCTAGGTCCTTAATTCCC    0.25    0.10    0.35    0.30 GGGAATTAAGGACCTAGGTT
## 13 13 CGGGAGTGCGACCTGACTTC    0.15    0.35    0.30    0.20 GAAGTCAGGTCGCACTCCCG
## 14 14 TTGACTGTCTCTTGGCCGCG    0.05    0.30    0.30    0.35 CGCGGCCAAGAGACAGTCAA
## 15 15 ATGTTACGTTATAAGTGAAG    0.35    0.25    0.05    0.35 CTTCACTTATAACGTAACAT
## 16 16 CAAGGGATCGCGACGCTGGC    0.20    0.40    0.30    0.10 GCCAGCGTCGCGATCCCTTG
## 17 17 AGAGCGAGGTCCCTTCCTCG    0.15    0.30    0.35    0.20 CGAGGAAGGGACCTCGCTCT
## 18 18 GCCGGCGAGTGGACGGCTGC    0.10    0.50    0.30    0.10 GCAGCCGTCCACTCGCCGGC
## 19 19 ACCCTACATGCGTCGCCACA    0.25    0.15    0.45    0.15 TGTGGCGACGCATGTAGGGT
## 20 20 TAAGCATGGACTAACAATTT    0.40    0.15    0.15    0.30 AAATTGTTAGTCCATGCTTA
## 21 21 CCAGTGCCACTGAAGTTTGG    0.20    0.30    0.25    0.25 CCAAACTTCAGTGGCACTGG
## 22 22 ACACTCCAACCTATCCCGAA    0.35    0.05    0.45    0.15 TTCGGGATAGGTTGGAGTGT
## 23 23 TGATGCCCGCAACTATGCGC    0.20    0.25    0.35    0.20 GCGCATAGTTGCGGGCATCA
## 24 24 TTGAGCCAAAAGTCGGGTAG    0.30    0.35    0.15    0.20 CTACCCGACTTTTGGCTCAA
## 25 25 GCCAAGAGTTGTTACACGGA    0.30    0.30    0.20    0.20 TCCGTGTAACAACTCTTGGC
## 26 26 CCGAGGGGGACAGTTCTCGC    0.15    0.40    0.30    0.15 GCGAGAACTGTCCCCCTCGG
## 27 27 TCCTCTGCTAGATCAGTTCT    0.15    0.15    0.30    0.40 AGAACTGATCTAGCAGAGGA
## 28 28 AACGGAATCAATTGGGATGA    0.40    0.30    0.10    0.20 TCATCCCAATTGATTCCGTT
## 29 29 AACGCGATTGGTCCGGTAAT    0.25    0.30    0.20    0.25 ATTACCGGACCAATCGCGTT
## 30 30 AAAGTTTCCATCGCAGATTT    0.30    0.15    0.20    0.35 AAATCTGCGATGGAAACTTT
## 31 31 TATTCAACTTTGTCAAGAGT    0.30    0.15    0.15    0.40 ACTCTTGACAAAGTTGAATA
## 32 32 TGAAGCTCGGAGTTACCGGT    0.20    0.35    0.20    0.25 ACCGGTAACTCCGAGCTTCA
## 33 33 CCCGCAGGTAAATGGCTACA    0.30    0.25    0.30    0.15 TGTAGCCATTTACCTGCGGG
## 34 34 CAAAGAAGGCGCAGCAGCCA    0.40    0.30    0.30    0.00 TGGCTGCTGCGCCTTCTTTG
## 35 35 AATGTCAAATTGACTGAGTG    0.35    0.25    0.10    0.30 CACTCAGTCAATTTGACATT
## 36 36 ACTCAGGGCGTGGTGCACAA    0.25    0.35    0.25    0.15 TTGTGCACCACGCCCTGAGT
## 37 37 ATTGTTTTTCATGACAGTCG    0.20    0.20    0.15    0.45 CGACTGTCATGAAAAACAAT
## 38 38 GTGGAATTCCAGTACCGCCA    0.25    0.25    0.30    0.20 TGGCGGTACTGGAATTCCAC
## 39 39 TATAACCAGGGGATCCGGCG    0.25    0.35    0.25    0.15 CGCCGGATCCCCTGGTTATA
## 40 40 CCGGAGAGGAGCAGCCTGTC    0.20    0.40    0.30    0.10 GACAGGCTGCTCCTCTCCGG
## 41 41 CGACGTGGTCTAAACGGAGA    0.30    0.35    0.20    0.15 TCTCCGTTTAGACCACGTCG
## 42 42 TTTTGAAGATGTCGCACGTC    0.20    0.25    0.20    0.35 GACGTGCGACATCTTCAAAA
## 43 43 CTCATGAGCCGATACTGACC    0.25    0.20    0.35    0.20 GGTCAGTATCGGCTCATGAG
## 44 44 CAAAGGCAGTCAGGGTAACC    0.35    0.30    0.25    0.10 GGTTACCCTGACTGCCTTTG
## 45 45 CCGCAAAGCCAACGAACAAC    0.45    0.15    0.40    0.00 GTTGTTCGTTGGCTTTGCGG
## 46 46 TCGACTGTCTGGTGAACAAA    0.30    0.25    0.20    0.25 TTTGTTCACCAGACAGTCGA
## 47 47 TATCGATCACGCATAAACTT    0.35    0.10    0.25    0.30 AAGTTTATGCGTGATCGATA
## 48 48 TTTAAACTTTTACTGCATGA    0.30    0.10    0.15    0.45 TCATGCAGTAAAAGTTTAAA
## 49 49 TCTGGATGTTGCTATGCGAA    0.20    0.30    0.15    0.35 TTCGCATAGCAACATCCAGA
## 50 50 GCGCTGGTGGTCCATTACCG    0.10    0.35    0.30    0.25 CGGTAATGGACCACCAGCGC