readfastafile<-function(file){ # input may be single or multi-fasta
# Reading of file
lines<-readLines(file)
# Creation of name records
siterecords<-grep(">",lines) # obtaining number of line for all headers
namerecords<-gsub(">", "", lines[siterecords]) # storing names of seqs in a vector
# Case 1: Single fasta file
if(length(namerecords)==1){
sequence<-""
# we loop on lines from the second line to the end of file
for(j in 2:length(lines)){
sequence<-paste(sequence,lines[j],sep="")
}
seqs<-list()
# we name the sequence in the single-record list
seqs[[namerecords[1]]]=sequence
}
# Case 2: Multi-fasta file
# Creation of a list to hold sequences
if(length(namerecords)>1){
seqs<-list()
# Looping over all seqs
for (i in 1:(length(namerecords)-1)){
sequence<-""
# looping over number of lines for each sequence
for(j in (siterecords[i]+1):(siterecords[i+1]-1)){
# appending sequence with line content
sequence<-paste(sequence,lines[j],sep="")
}
# creation of named list record for each sequence
seqs[[namerecords[i]]]=sequence
}
# appending the last record
sequence=""
for(j in (siterecords[length(namerecords)]+1):length(lines)){
sequence<-paste(sequence,lines[j],sep="")
}
seqs[[namerecords[length(namerecords)]]]=sequence
}
# returning the list of sequences
return(seqs)
}
Gata1<-readfastafile("gata.fa")
str(Gata1)
## List of 50
## $ Site1 : chr "TTATAG"
## $ Site2 : chr "AGATAT"
## $ Site3 : chr "AGATAG"
## $ Site4 : chr "ATATCT"
## $ Site5 : chr "AGATAG"
## $ Site6 : chr "AGATAG"
## $ Site7 : chr "AGATAG"
## $ Site8 : chr "TGATAA"
## $ Site9 : chr "AGATAA"
## $ Site10: chr "AGATAA"
## $ Site11: chr "CGATAG"
## $ Site12: chr "AGAGTT"
## $ Site13: chr "TGATAA"
## $ Site14: chr "TGATAA"
## $ Site15: chr "AGATGG"
## $ Site16: chr "AGATAG"
## $ Site17: chr "AGATTG"
## $ Site18: chr "AGATAA"
## $ Site19: chr "TGATAA"
## $ Site20: chr "AGATAA"
## $ Site21: chr "AGATAG"
## $ Site22: chr "TGATAG"
## $ Site23: chr "TGATCA"
## $ Site24: chr "TTATCA"
## $ Site25: chr "AGATGG"
## $ Site26: chr "TGATAT"
## $ Site27: chr "AGATAG"
## $ Site28: chr "TGATAA"
## $ Site29: chr "GGATAC"
## $ Site30: chr "AGATAA"
## $ Site31: chr "CGATAA"
## $ Site32: chr "TGATAG"
## $ Site33: chr "AGATAA"
## $ Site34: chr "TGATTA"
## $ Site35: chr "AGATAA"
## $ Site36: chr "AGATAG"
## $ Site37: chr "TGATAT"
## $ Site38: chr "AGATAA"
## $ Site39: chr "TCAGAG"
## $ Site40: chr "AAGTAG"
## $ Site41: chr "AGATTA"
## $ Site42: chr "TGATAG"
## $ Site43: chr "TGATAG"
## $ Site44: chr "AGATAC"
## $ Site45: chr "TGATTG"
## $ Site46: chr "AGATTA"
## $ Site47: chr "AGAATA"
## $ Site48: chr "AGATAA"
## $ Site49: chr "AGATTA"
## $ Site50: chr "AGCTTC"
makePWM<-function(seqs){
# extract size of seqs (it has to be the same for all seqs)
sizeseq<-length(strsplit(seqs[[1]],"")[[1]])
# PWM is assumed to be DNA (nrow=4)
motif<-matrix(1, nrow=4, ncol=sizeseq)
# looping over sequences
for(i in 1:length(seqs)){
# splitting sequence string into vector site
site<-strsplit(seqs[[i]],"")[[1]]
# looping over each sequence residue-by-residue
for(k in 1:length(site)){
if (site[k]=="A") {motif[1,k]=motif[1,k]+1}
if (site[k]=="C") {motif[2,k]=motif[2,k]+1}
if (site[k]=="G") {motif[3,k]=motif[3,k]+1}
if (site[k]=="T") {motif[4,k]=motif[4,k]+1}
}
}
# dividing over number of seqs to get probabilities
motif<-motif/length(seqs)
# creating data.frame and naming rows after residues
pwm<-as.data.frame(motif)
row.names(pwm)=c("A","C","G","T")
columns<-0;
# naming columns as P[1...N]
for(i in 1:sizeseq){
columns[i]<-paste("P",i,sep="")
}
colnames(pwm)<-columns
return(pwm)
}
pwmGata1<-makePWM(Gata1)
pwmGata1
## P1 P2 P3 P4 P5 P6
## A 0.62 0.04 0.98 0.04 0.74 0.46
## C 0.06 0.04 0.04 0.02 0.08 0.08
## G 0.04 0.92 0.04 0.06 0.06 0.42
## T 0.36 0.08 0.02 0.96 0.20 0.12
Ecoli<-readfastafile("ecoli.fa")
β. Στη συνέχεια θα υπολογίσουμε τις συχνότητες εμφάνισης των τεσσάρων νουκλεοτιδίων. Μια συνάρτηση για να γίνει αυτό σε μία δεδομένη αλληλουχία είναι η παρακάτω:
nucContent<-function(sequence){
# remove non A,G,C,T residues
# the following command uses a Perl style regular expression (regex)
# telling gsub to change anything that is NOT one of A, G, C, T into the empty string
dna<-gsub("[^AGCT]", "", sequence, perl=T)
# splitting the sequence string into a vector
dna<-strsplit(dna,"")[[1]]
# appending one count of each residue to sequence
# in this way we won't need to add pseudocounts and
# we guarantee each residue has a value
dna<-c(dna, "A", "C", "G", "T")
# calculate all residues of the vector with table
nucs<-table(dna)
# divide all values of table over size of sequence, update table
nucs<-nucs/length(dna)
# transform table into data.frame with named rows (alphabetically)
nContent<-as.data.frame(matrix(nucs))
row.names(nContent)<-names(nucs)
return(nContent)
}
γ. Στο επόμενο βήμα θα εφαρμόσουμε τη συνάρτηση nucContent στην αλληλουχία του Ecoli και θα δημιουργήσουμε τον PSSM με μια απλή διαίρεση του PWM με τη σύσταση του Ecoli.
nucContentEcoli<-nucContent(Ecoli)
nucContentEcoli
## V1
## A 0.2459245
## C 0.2537242
## G 0.2537091
## T 0.2466422
# initialize PSSM as equal to PWM
pssmGata1<-pwmGata1
# adding pseudocounts as a small value
pssmGata1<-pssmGata1+0.001
# loop over columns of PWM/PSSM
for (i in 1:dim(pssmGata1)[2]){
# update column values over
pssmGata1[,i]<-pssmGata1[,i]/nucContentEcoli
}
pssmGata1<-log2(pssmGata1)
pssmGata1
## P1 P2 P3 P4 P5 P6
## A 1.3363779 -2.584520 1.996038 -2.584520 1.5912582 0.9065514
## C -2.0563801 -2.629565 -2.629565 -3.594800 -1.6472674 -1.6472674
## G -2.6294792 1.860026 -2.629479 -2.056294 -2.0562939 0.7306452
## T 0.5495789 -1.606426 -3.553959 1.962116 -0.2952244 -1.0274129
pssmSearch<-function(pssm, target, threshold){
# pssm is a log2(ratio) matrix
# target is a sequence agains which the pssm will be scored
# threshold is a value in [0..1] that scales the maximum motif score
# and sets the number of matching positions to be retained
if (missing(threshold)==T){threshold=1}
if ((threshold>1) | (threshold<0)){
stop("Provide a threshold between 0 and 1")
}
sizeseq<-length(strsplit(target[[1]],"")[[1]])
sizemotif<-length(pssm)
pssmscore<-vector("numeric",sizeseq-sizemotif+1)
# calculating maximum motif score
maxmotifscore<-0;
for(k in 1:length(pssm)){
maxmotifscore<-maxmotifscore+max(pssm[,k])
}
#
sequence<-strsplit(target[[1]],"")[[1]]
for(i in 1:(sizeseq-sizemotif+1)){
for (j in 1:length(pssm)){
if(sequence[i+j-1]=="A"){pssmscore[i]=pssmscore[i]+pssm[1,j]}
if(sequence[i+j-1]=="C"){pssmscore[i]=pssmscore[i]+pssm[2,j]}
if(sequence[i+j-1]=="G"){pssmscore[i]=pssmscore[i]+pssm[3,j]}
if(sequence[i+j-1]=="T"){pssmscore[i]=pssmscore[i]+pssm[4,j]}
}
}
pos_scores<-cbind(1:(sizeseq-sizemotif+1), pssmscore)
# calculate which positions are >= percentile of the
indexes<-which(pos_scores[,2]>=threshold*maxmotifscore)
if(length(indexes)>0){
# returning positions of matching
return(pos_scores[indexes,])
}
# Case when no matches are found
if(length(indexes)==0){
stop("No sites found at this threshold")
}
}
ως τελευταία πράξη θα αναζητήσουμε το μοτίβο του GATA1 στην αλληλουχία του Ecoli. Καθώς αναζητούμε τις θέσεις με το maxium score θα θέσουμε threshold=1.
gata1ToEcoli<-pssmSearch(pssmGata1, Ecoli, 1)
Σαν τελευταία ανάλυση, αναφέρουμε το πλήθος των θέσεων που είχαμε επιτυχία:
head(gata1ToEcoli)
## pssmscore
## [1,] 6230 9.652368
## [2,] 8708 9.652368
## [3,] 9782 9.652368
## [4,] 17601 9.652368
## [5,] 37968 9.652368
## [6,] 38568 9.652368
length(gata1ToEcoli[,1])
## [1] 1072
Συνολικά εντοπίζουμε 1072 θέσεις με maximum score για τον GATA1 στο γονιδίωμα του E.coli. Αυτές μπορουν να παρασταθούν γραφικά παρακάτω:
plot(gata1ToEcoli[,1], gata1ToEcoli[,2], type="h", ylim=c(0, max(gata1ToEcoli[,2])), xlab="Genomic Position", ylab="PSSM Score", las=1, main="E. coli Genome")