ΒΝΠ54 | ΑΛΓΟΡΙΘΜΟΙ ΣΤΗ ΜΟΡΙΑΚΗ ΒΙΟΛΟΓΙΑ ΚΑΙ ΣΤΗ ΔΟΜΙΚΗ ΒΙΟΠΛΗΡΟΦΟΡΙΚΗ - 1η Εργασία

Θέμα 3

Θα διαβάσουμε αρχικά τις αλληλουχίες με την παρακάτω συνάρτηση

readfastafile<-function(file){                    # input may be single or multi-fasta
  # Reading of file
  lines<-readLines(file)                          
  # Creation of name records
  siterecords<-grep(">",lines)                    # obtaining number of line for all headers 
  namerecords<-gsub(">", "", lines[siterecords])  # storing names of seqs in a vector
  # Case 1: Single fasta file
  if(length(namerecords)==1){
    sequence<-""
      # we loop on lines from the second line to the end of file
    for(j in 2:length(lines)){
        sequence<-paste(sequence,lines[j],sep="")
      }
  seqs<-list()
  # we name the sequence in the single-record list
  seqs[[namerecords[1]]]=sequence
  }
  # Case 2: Multi-fasta file
  # Creation of a list to hold sequences
  if(length(namerecords)>1){    
    seqs<-list()
    # Looping over all seqs
    for (i in 1:(length(namerecords)-1)){
      sequence<-""
      # looping over number of lines for each sequence
      for(j in (siterecords[i]+1):(siterecords[i+1]-1)){
        # appending sequence with line content
        sequence<-paste(sequence,lines[j],sep="")
        }
      # creation of named list record for each sequence
      seqs[[namerecords[i]]]=sequence
    }
    # appending the last record  
    sequence=""
    for(j in (siterecords[length(namerecords)]+1):length(lines)){
      sequence<-paste(sequence,lines[j],sep="")
      }
    seqs[[namerecords[length(namerecords)]]]=sequence
    }
# returning the list of sequences
return(seqs)
}

Θα χρησιμοποιήσουμε την παραπάνω συνάρτηση για να δημιουργήσουμε τη συλλογή Gata1

Gata1<-readfastafile("gata.fa")
str(Gata1)

## List of 50
##  $ Site1 : chr "TTATAG"
##  $ Site2 : chr "AGATAT"
##  $ Site3 : chr "AGATAG"
##  $ Site4 : chr "ATATCT"
##  $ Site5 : chr "AGATAG"
##  $ Site6 : chr "AGATAG"
##  $ Site7 : chr "AGATAG"
##  $ Site8 : chr "TGATAA"
##  $ Site9 : chr "AGATAA"
##  $ Site10: chr "AGATAA"
##  $ Site11: chr "CGATAG"
##  $ Site12: chr "AGAGTT"
##  $ Site13: chr "TGATAA"
##  $ Site14: chr "TGATAA"
##  $ Site15: chr "AGATGG"
##  $ Site16: chr "AGATAG"
##  $ Site17: chr "AGATTG"
##  $ Site18: chr "AGATAA"
##  $ Site19: chr "TGATAA"
##  $ Site20: chr "AGATAA"
##  $ Site21: chr "AGATAG"
##  $ Site22: chr "TGATAG"
##  $ Site23: chr "TGATCA"
##  $ Site24: chr "TTATCA"
##  $ Site25: chr "AGATGG"
##  $ Site26: chr "TGATAT"
##  $ Site27: chr "AGATAG"
##  $ Site28: chr "TGATAA"
##  $ Site29: chr "GGATAC"
##  $ Site30: chr "AGATAA"
##  $ Site31: chr "CGATAA"
##  $ Site32: chr "TGATAG"
##  $ Site33: chr "AGATAA"
##  $ Site34: chr "TGATTA"
##  $ Site35: chr "AGATAA"
##  $ Site36: chr "AGATAG"
##  $ Site37: chr "TGATAT"
##  $ Site38: chr "AGATAA"
##  $ Site39: chr "TCAGAG"
##  $ Site40: chr "AAGTAG"
##  $ Site41: chr "AGATTA"
##  $ Site42: chr "TGATAG"
##  $ Site43: chr "TGATAG"
##  $ Site44: chr "AGATAC"
##  $ Site45: chr "TGATTG"
##  $ Site46: chr "AGATTA"
##  $ Site47: chr "AGAATA"
##  $ Site48: chr "AGATAA"
##  $ Site49: chr "AGATTA"
##  $ Site50: chr "AGCTTC"

Θα υπολογίσουμε έναν PWM πάνω στη συλλογή Gata1. Μια συνάρτηση για να το κάνουμε είναι η παρακάτω:

makePWM<-function(seqs){
  # extract size of seqs (it has to be the same for all seqs)
  sizeseq<-length(strsplit(seqs[[1]],"")[[1]])
  # PWM is assumed to be DNA (nrow=4)
  motif<-matrix(1, nrow=4, ncol=sizeseq)
  # looping over sequences
  for(i in 1:length(seqs)){
    # splitting sequence string into vector site
    site<-strsplit(seqs[[i]],"")[[1]]
    # looping over each sequence residue-by-residue
    for(k in 1:length(site)){
      if (site[k]=="A") {motif[1,k]=motif[1,k]+1} 
      if (site[k]=="C") {motif[2,k]=motif[2,k]+1} 
      if (site[k]=="G") {motif[3,k]=motif[3,k]+1}
      if (site[k]=="T") {motif[4,k]=motif[4,k]+1}
      }
  }
  #  dividing over number of seqs to get probabilities 
  motif<-motif/length(seqs)
  # creating data.frame and naming rows after residues
  pwm<-as.data.frame(motif)
  row.names(pwm)=c("A","C","G","T")
  columns<-0;
  # naming columns as P[1...N]
  for(i in 1:sizeseq){
    columns[i]<-paste("P",i,sep="")
    }
  colnames(pwm)<-columns
return(pwm)
}

Eφαρμόζουμε αυτήν την συνάρτηση στη συλλογή Gata1:

pwmGata1<-makePWM(Gata1)
pwmGata1

##     P1   P2   P3   P4   P5   P6
## A 0.62 0.04 0.98 0.04 0.74 0.46
## C 0.06 0.04 0.04 0.02 0.08 0.08
## G 0.04 0.92 0.04 0.06 0.06 0.42
## T 0.36 0.08 0.02 0.96 0.20 0.12

Δημιουργία PSSM. Θα χρειαστεί να εργαστούμε σε τρία στάδια. α. Αρχικά να διαβάσουμε την αλληλουχία του E.coli με την συνάρτηση ανάγνωσης που δημιουργήσαμε παραπάνω:

Ecoli<-readfastafile("ecoli.fa")

β. Στη συνέχεια θα υπολογίσουμε τις συχνότητες εμφάνισης των τεσσάρων νουκλεοτιδίων. Μια συνάρτηση για να γίνει αυτό σε μία δεδομένη αλληλουχία είναι η παρακάτω:

nucContent<-function(sequence){
  # remove non A,G,C,T residues
  # the following command uses a Perl style regular expression (regex)
  # telling gsub to change anything that is NOT one of A, G, C, T into the empty string
  dna<-gsub("[^AGCT]", "", sequence, perl=T)
  
  # splitting the sequence string into a vector
  dna<-strsplit(dna,"")[[1]]
  # appending one count of each residue to sequence 
  # in this way we won't need to add pseudocounts and 
  # we guarantee each residue has a value  
  dna<-c(dna, "A", "C", "G", "T")
  # calculate all residues of the vector with table
  nucs<-table(dna)
  # divide all values of table over size of sequence, update table
  nucs<-nucs/length(dna)
  # transform table into data.frame with named rows (alphabetically)
  nContent<-as.data.frame(matrix(nucs))
  row.names(nContent)<-names(nucs)
return(nContent)  
}

γ. Στο επόμενο βήμα θα εφαρμόσουμε τη συνάρτηση nucContent στην αλληλουχία του Ecoli και θα δημιουργήσουμε τον PSSM με μια απλή διαίρεση του PWM με τη σύσταση του Ecoli.

nucContentEcoli<-nucContent(Ecoli)
nucContentEcoli

##          V1
## A 0.2459245
## C 0.2537242
## G 0.2537091
## T 0.2466422

# initialize PSSM as equal to PWM
pssmGata1<-pwmGata1
# adding pseudocounts as a small value
pssmGata1<-pssmGata1+0.001
# loop over columns of PWM/PSSM
for (i in 1:dim(pssmGata1)[2]){
  # update column values over 
  pssmGata1[,i]<-pssmGata1[,i]/nucContentEcoli
}
pssmGata1<-log2(pssmGata1)
pssmGata1

##           P1        P2        P3        P4         P5         P6
## A  1.3363779 -2.584520  1.996038 -2.584520  1.5912582  0.9065514
## C -2.0563801 -2.629565 -2.629565 -3.594800 -1.6472674 -1.6472674
## G -2.6294792  1.860026 -2.629479 -2.056294 -2.0562939  0.7306452
## T  0.5495789 -1.606426 -3.553959  1.962116 -0.2952244 -1.0274129

Στο τελευταίο στάδιο θα αναζητήσουμε το μοτίβο μέσω του PSSM εντός της μεγαλύτερης αλληλουχίας. Θα δημιουργήσουμε μια συνάρτηση που θα σκοράρει την αλληλουχία δεδομένου του PSSM γλιστρώντας ένα νουκλεοτίδιο κάθε φορά.

pssmSearch<-function(pssm, target, threshold){
  # pssm is a log2(ratio) matrix
  # target is a sequence agains which the pssm will be scored
  # threshold is a value in [0..1] that scales the maximum motif score
  # and sets the number of matching positions to be retained 
  if (missing(threshold)==T){threshold=1}
  if ((threshold>1) | (threshold<0)){
      stop("Provide a threshold between 0 and 1")
      }
  sizeseq<-length(strsplit(target[[1]],"")[[1]])
  sizemotif<-length(pssm)
  pssmscore<-vector("numeric",sizeseq-sizemotif+1)
  # calculating maximum motif score
  maxmotifscore<-0;
  for(k in 1:length(pssm)){
      maxmotifscore<-maxmotifscore+max(pssm[,k])
      }
  #
  sequence<-strsplit(target[[1]],"")[[1]]
  for(i in 1:(sizeseq-sizemotif+1)){
      for (j in 1:length(pssm)){
          if(sequence[i+j-1]=="A"){pssmscore[i]=pssmscore[i]+pssm[1,j]}
        if(sequence[i+j-1]=="C"){pssmscore[i]=pssmscore[i]+pssm[2,j]}
          if(sequence[i+j-1]=="G"){pssmscore[i]=pssmscore[i]+pssm[3,j]}
          if(sequence[i+j-1]=="T"){pssmscore[i]=pssmscore[i]+pssm[4,j]}
          }
      }
  pos_scores<-cbind(1:(sizeseq-sizemotif+1), pssmscore)
  # calculate which positions are >= percentile of the 
  indexes<-which(pos_scores[,2]>=threshold*maxmotifscore)
  if(length(indexes)>0){
    # returning positions of matching
    return(pos_scores[indexes,])
    }
  # Case when no matches are found
  if(length(indexes)==0){
      stop("No sites found at this threshold")  
      }
}

ως τελευταία πράξη θα αναζητήσουμε το μοτίβο του GATA1 στην αλληλουχία του Ecoli. Καθώς αναζητούμε τις θέσεις με το maxium score θα θέσουμε threshold=1.

gata1ToEcoli<-pssmSearch(pssmGata1, Ecoli, 1)

Σαν τελευταία ανάλυση, αναφέρουμε το πλήθος των θέσεων που είχαμε επιτυχία:

head(gata1ToEcoli)

##            pssmscore
## [1,]  6230  9.652368
## [2,]  8708  9.652368
## [3,]  9782  9.652368
## [4,] 17601  9.652368
## [5,] 37968  9.652368
## [6,] 38568  9.652368

length(gata1ToEcoli[,1])

## [1] 1072

Συνολικά εντοπίζουμε 1072 θέσεις με maximum score για τον GATA1 στο γονιδίωμα του E.coli. Αυτές μπορουν να παρασταθούν γραφικά παρακάτω:

plot(gata1ToEcoli[,1], gata1ToEcoli[,2], type="h", ylim=c(0, max(gata1ToEcoli[,2])), xlab="Genomic Position", ylab="PSSM Score", las=1, main="E. coli Genome")

ΒΝΠ54 | ΑΛΓΟΡΙΘΜΟΙ ΣΤΗ ΜΟΡΙΑΚΗ ΒΙΟΛΟΓΙΑ ΚΑΙ ΣΤΗ ΔΟΜΙΚΗ ΒΙΟΠΛΗΡΟΦΟΡΙΚΗ - 1η Εργασία - Ενδεικτικές Απαντήσεις

Χριστόφορος Νικολάου

11/25/2020

Θέμα 3