Ας υποθέσουμε οτι έχουμε αρκετές Fasta αλληλουχίες, που όμως πιστεύουμε ότι πρέπει να έχουνε αρκετές από αυτές κάποιο μοτίβο (TFBS). Υπάρχουν αρκετοί πιθανοί λόγοι γι αυτο:
Όπως και να έχει, εμείς θέλουμε να βρούμε καποιες αλληλουχίες συγκεκριμένου μήκους που υπάρχουν σε αυτές τις ρυθμιστικές περιοχές αρκετές φορές*.
ΠΡΟΣΟΧΗ: Η κανονική ανάλυση του παραπάνω προβλήματος προβλέπει μια στατιστική ανάλυση ωστε το “αρκετές φορές” να στηρίζεται με στατιστικά επιχειρήματα. Στο συγκεκριμένο μάθημα όμως, το παρακάμπτουμε και απλά θεωρούμε με σχετική χαλαρότητα ότι απλά απαιτούμε να υπάρχει αρκετές φορές.
Η βασική ιδέα λοιπόν περιγράφεται από τα παρακάτω βήματα:
μ.κ.Πάμε λοιπόν να εφαρμόσουμε τα βήματα αυτά στην πράξη.
Πέρα απο το βιοπληροφορικό κομμάτι του σημερινού μαθήματος, θα δούμε και μερικά τεχνικά κομμάτια της R.
## Technical element 1
## you can load data that you have saved previously. To save data use the command `saveRDS`. To load them, use the command `readRDS`
datasets = readRDS("motif.RDS")
utr = datasets[[1]]
random.seq = datasets[[2]]
head(utr)
## [[1]]
## [1] "C" "T" "G" "C" "T" "C" "G" "A" "A" "A" "G" "C" "A" "A" "C" "G" "T"
## [18] "A" "A" "G" "G" "C" "G" "A" "T" "G" "A" "A" "A" "G" "G" "A" "A" "A"
## [35] "C" "C" "A" "C" "T" "C" "T" "G" "T" "T" "A" "G" "T" "G" "A" "C" "A"
## [52] "C" "G" "A" "C" "T" "C" "A" "C" "G" "C" "T" "A" "A" "T" "A" "T" "A"
## [69] "C" "A" "C" "G" "A" "A" "G" "T" "G" "T" "G" "T" "A" "A" "C" "C" "C"
## [86] "G" "G" "A" "T" "G" "T" "G" "A" "C" "G" "C" "G" "C" "G" "G" "G" "T"
## [103] "T" "G" "T" "T" "T" "T" "G" "T" "C" "T" "A" "C" "G" "A" "T" "C" "A"
## [120] "C" "T" "A" "T" "G" "C" "G" "A" "C" "A" "G" "A" "T" "G" "A" "G" "C"
## [137] "A" "A" "A" "C" "G" "C" "C" "T" "T" "C" "C" "A" "T" "G" "C" "G" "C"
## [154] "G" "A" "T" "G" "A" "A" "A" "C" "T" "A" "C" "A" "A" "A" "G" "T" "C"
## [171] "T" "A" "T" "C" "C" "C" "T" "C" "G" "G" "G" "T" "C" "T" "G" "G" "A"
## [188] "A" "G" "A" "A" "C" "G" "A" "G" "C" "C" "T" "C" "C"
##
## [[2]]
## [1] "G" "T" "A" "C" "G" "C" "G" "T" "C" "C" "C" "T" "G" "G" "T" "A" "G"
## [18] "G" "T" "G" "T" "C" "T" "T" "G" "T" "T" "G" "C" "G" "G" "C" "T" "G"
## [35] "G" "C" "A" "A" "T" "C" "T" "T" "C" "T" "G" "C" "T" "T" "G" "G" "C"
## [52] "C" "G" "T" "T" "T" "T" "C" "G" "G" "T" "C" "G" "C" "C" "A" "A" "T"
## [69] "A" "G" "C" "A" "C" "C" "G" "A" "G" "G" "C" "C" "A" "A" "C" "T" "C"
## [86] "C" "C" "T" "A" "G" "G" "C" "G" "A" "G" "A" "G" "T" "A" "C" "A" "T"
## [103] "T" "T" "A" "A" "G" "C" "A" "C" "C" "T" "G" "T" "C" "C" "G" "A" "C"
## [120] "A" "G" "G" "C" "T" "A" "T" "A" "C" "T" "C" "T" "A" "T" "G" "C" "G"
## [137] "A" "T" "A" "G" "A" "C" "A" "T" "T" "G" "T" "T" "C" "T" "T" "C" "C"
## [154] "C" "C" "T" "G" "G" "A" "T" "G" "A" "G" "C" "T" "T" "T" "G" "C" "A"
## [171] "G" "G" "G" "A" "G" "A" "G" "G" "A" "A" "A" "C" "C" "G" "C" "C" "C"
## [188] "C" "A" "C" "G" "A" "T" "T" "A" "C" "A" "C" "T" "A"
##
## [[3]]
## [1] "G" "T" "G" "C" "G" "A" "T" "G" "C" "C" "A" "T" "G" "G" "T" "C" "G"
## [18] "C" "T" "C" "A" "G" "A" "G" "A" "A" "A" "G" "T" "C" "A" "C" "G" "A"
## [35] "A" "A" "G" "C" "C" "A" "G" "C" "C" "G" "T" "C" "T" "C" "C" "T" "T"
## [52] "G" "A" "A" "T" "T" "A" "A" "A" "G" "A" "C" "A" "T" "G" "G" "G" "C"
## [69] "C" "G" "T" "T" "C" "C" "G" "A" "A" "C" "T" "C" "C" "T" "A" "A" "C"
## [86] "C" "G" "T" "T" "A" "G" "T" "C" "T" "C" "C" "T" "A" "C" "T" "G" "G"
## [103] "C" "T" "A" "A" "A" "C" "C" "C" "A" "G" "T" "T" "C" "A" "G" "G" "C"
## [120] "C" "T" "T" "G" "T" "G" "C" "G" "A" "T" "T" "G" "C" "T" "A" "G" "G"
## [137] "G" "C" "T" "T" "C" "T" "G" "T" "A" "A" "G" "G" "A" "G" "A" "T" "G"
## [154] "C" "G" "T" "C" "G" "T" "C" "A" "T" "G" "C" "G" "C" "G" "T" "C" "C"
## [171] "G" "C" "T" "G" "G" "C" "A" "A" "T" "A" "G" "T" "C" "G" "C" "C" "A"
## [188] "C" "A" "T" "G" "G" "C" "G" "G" "A" "C" "A" "A" "G"
##
## [[4]]
## [1] "A" "T" "A" "C" "T" "C" "G" "A" "G" "T" "T" "G" "C" "C" "C" "C" "C"
## [18] "T" "A" "T" "G" "C" "C" "C" "G" "A" "A" "A" "T" "T" "T" "C" "C" "C"
## [35] "C" "T" "G" "A" "A" "G" "T" "A" "T" "C" "T" "A" "C" "A" "G" "G" "T"
## [52] "C" "A" "C" "C" "C" "G" "T" "C" "C" "G" "A" "C" "A" "C" "C" "T" "C"
## [69] "C" "A" "T" "A" "T" "C" "A" "A" "T" "C" "T" "T" "A" "G" "T" "G" "A"
## [86] "G" "A" "G" "T" "A" "G" "T" "T" "G" "G" "A" "G" "T" "A" "C" "A" "T"
## [103] "T" "T" "A" "T" "C" "T" "A" "G" "C" "G" "C" "G" "A" "A" "G" "T" "C"
## [120] "G" "A" "T" "C" "A" "A" "C" "A" "G" "C" "C" "T" "A" "T" "G" "C" "G"
## [137] "A" "C" "C" "G" "C" "G" "A" "A" "G" "G" "G" "G" "G" "G" "A" "T" "C"
## [154] "C" "A" "C" "C" "A" "T" "A" "A" "C" "C" "A" "G" "A" "T" "A" "T" "A"
## [171] "A" "T" "C" "T" "T" "C" "A" "G" "G" "T" "C" "C" "C" "G" "A" "C" "A"
## [188] "G" "A" "T" "A" "A" "A" "C" "G" "T" "C" "C" "G" "C"
##
## [[5]]
## [1] "A" "A" "G" "C" "G" "C" "C" "C" "G" "G" "G" "T" "C" "A" "G" "C" "G"
## [18] "A" "G" "T" "A" "G" "T" "C" "T" "A" "G" "G" "C" "A" "T" "C" "G" "T"
## [35] "A" "T" "C" "C" "G" "T" "G" "T" "G" "G" "G" "C" "G" "A" "C" "C" "C"
## [52] "G" "T" "T" "A" "G" "G" "C" "C" "A" "T" "A" "C" "G" "G" "A" "A" "A"
## [69] "C" "T" "T" "T" "A" "A" "A" "C" "A" "C" "G" "C" "C" "C" "G" "G" "A"
## [86] "G" "G" "C" "A" "G" "C" "G" "G" "G" "C" "C" "G" "A" "T" "A" "G" "C"
## [103] "G" "C" "T" "A" "G" "A" "C" "T" "C" "A" "T" "G" "G" "G" "C" "G" "A"
## [120] "T" "A" "G" "G" "C" "T" "G" "C" "T" "C" "T" "A" "G" "G" "A" "C" "T"
## [137] "C" "G" "G" "G" "C" "C" "T" "G" "T" "T" "C" "T" "A" "G" "G" "G" "G"
## [154] "G" "C" "A" "A" "T" "C" "C" "A" "C" "G" "T" "G" "A" "A" "A" "G" "A"
## [171] "G" "C" "T" "T" "C" "C" "C" "G" "C" "C" "A" "C" "C" "A" "C" "C" "A"
## [188] "G" "G" "C" "A" "A" "A" "C" "T" "A" "T" "A" "A" "T"
##
## [[6]]
## [1] "C" "T" "A" "A" "A" "G" "A" "A" "G" "T" "A" "T" "A" "A" "G" "C" "A"
## [18] "T" "C" "C" "T" "C" "A" "A" "A" "C" "G" "C" "G" "T" "C" "C" "T" "G"
## [35] "T" "G" "T" "C" "A" "G" "C" "C" "T" "C" "C" "G" "C" "A" "A" "T" "T"
## [52] "C" "T" "A" "G" "C" "A" "A" "C" "T" "C" "C" "C" "A" "C" "T" "A" "A"
## [69] "G" "A" "G" "C" "C" "A" "T" "C" "C" "A" "G" "A" "G" "A" "A" "C" "C"
## [86] "T" "A" "A" "C" "G" "C" "T" "T" "T" "C" "T" "G" "T" "T" "T" "T" "A"
## [103] "T" "G" "G" "A" "G" "G" "G" "G" "C" "T" "A" "G" "C" "T" "C" "T" "A"
## [120] "G" "C" "T" "A" "T" "G" "C" "G" "A" "C" "A" "G" "C" "G" "A" "T" "G"
## [137] "T" "C" "T" "A" "T" "G" "C" "G" "C" "C" "A" "G" "A" "C" "A" "G" "T"
## [154] "T" "G" "T" "T" "T" "C" "T" "C" "G" "C" "A" "A" "T" "C" "G" "G" "C"
## [171] "C" "A" "G" "C" "A" "G" "A" "G" "T" "G" "T" "T" "G" "G" "C" "C" "A"
## [188] "T" "C" "G" "G" "G" "A" "T" "C" "G" "T" "G" "C" "G"
Το αντικείμενο utr, περιέχει αλληλουχίες από ρυθμιστικές περιοχές γονιδίων, ενώ το αντικείμενο random.seq περιέχει αλληλουχίες τυχαίες. Εμείς σήμερα θα χρησιμοποιήσουμε το utr, όμως βάζω και το random.seq ως αντιπαραβολή.
Κανονικά στο βήμα αυτό αναζητούμε μοτίβα σε ένα ευρος μηκών (συνήθως 6 - 15). Εμείς θα θεωρήσουμε ότι ξέρουμε το μήκος και αυτό ειναι 11.
Το βήμα 2 παρακάμπτεται επειδή έχουμε ήδη τις αλληλουχίες σε μια λιστα, και χωρισμένες ανά γράμμα. Κανονικά, σε έναν κώδικα θα πρέπει να διαβάσετε τις αλληλουχίες αυτές από ένα αρχείο, να τις χωρίσετε και η κάθε μία αλληλουχία να ειναι τελικά ένα vector από νουκλεοτίδια.
Βρίσκουμε όλα τα πιθανά μοτίβα. Θα γράψουμε μια συνάρτηση για να κάνει το βήμα αυτό. Χωρίζουμε την διαδικασία σε 2 βήματα.
# get the motifs of only a single vector s
getSeqSubstr <- function(s, mik){
res <- list()
for(i in 1:(length(s) - mik + 1)){
substring <- s[i:(i+mik-1)]
res[[i]] <- paste(substring, collapse="")
}
return(res)
}
Στην συνέχεια εφαρμόζουμε επαναληπτικά την παραπάνω συνάρτηση
getAllSubstr <- function(d, mik){
allRes <- list()
for(i in 1:length(d)){
allRes[[i]] <- getSeqSubstr(d[[i]], mik)
}
return(allRes)
}
mots <- unlist(getAllSubstr(utr, 11))
head(mots)
## [1] "CTGCTCGAAAG" "TGCTCGAAAGC" "GCTCGAAAGCA" "CTCGAAAGCAA" "TCGAAAGCAAC"
## [6] "CGAAAGCAACG"
length(mots)
## [1] 9500
Ας πάρουμε όλα τα μοναδικά μοτίβα και ας τα μετρήσουμε
sort(table(mots), decreasing=TRUE)[1:10]
## mots
## CTATGCGACAG ACTATGCGACA CGATGCGAATG GCTATGCGACA TATGCGACAGA CACTATGCGAC
## 12 5 5 5 5 4
## CGATGCGACAG CTAAGCGACTG CTGTGCGACCG TATGCGACAGC
## 4 4 4 4
Απο εδώ βλέπουμε ότι ένας καλός υποψήφιος πυρήνας μοτίβου ειναι το CTATGCGACAG. Εξάλλου και τα άλλα μοτίβα μοιάζουν αρκετά με τον πυρήνα. Θα θεωρήσουμε λοιπόν το μοτίβο μας οτι βασίζεται στην αλληλουχία CTATGCGACAG.
Οι επόμενες δύο συναρτήσεις ειναι βοηθητικές, Η πρώτη, παιρνει ως όρισμα ένα vector από μοτίβα (το κάθε ενα ένα string), και τα χωρίζει σε χαρακτήρες. Έτσι τελικά επιστρέφει μία λίστα από μοτίβα, που το καθένα έχει την μορφή ενός vector.
Η δευτερη συνάρτηση απλά βρίσκει την απόσταση δύο μοτίβων, δηλαδή τον αριθμό νουκλεοτιδίων που ειναι διαφορετικά. Προσέξτε την χρήση της συνάρτησης sum με το !=.
splitMotifs <- function(motifs){
res <- list()
for( i in 1:length(motifs)){
res[[i]] <- unlist(strsplit(motifs[i], ""))
}
return(res)
}
getDist <- function(motifA, motifB){
sum(motifA != motifB)
}
Τώρα είμαστε σε θέση να βρούμε όλα τα μοτίβα
getApproxSubs <- function(motif, allMotifs, max.d){
res <- list()
motif.list <- splitMotifs(allMotifs)
motif.vector <- unlist(strsplit(motif, ""))
j <- 1
dists <- c()
for(i in 1:length(motif.list)){
d <- getDist(motif.vector, motif.list[[i]])
dists <- c(dists,d)
if(d <= max.d){
res[[j]] <- motif.list[[i]]
j <- j+1
}
}
##print(sort(dists, decreasing=TRUE))
return(res)
}
myMot <- getApproxSubs("CTATGCGACAG", mots, 4)
head(myMot)
## [[1]]
## [1] "C" "T" "G" "C" "T" "C" "G" "A" "A" "A" "G"
##
## [[2]]
## [1] "C" "T" "A" "T" "G" "C" "G" "A" "C" "A" "G"
##
## [[3]]
## [1] "C" "C" "A" "T" "G" "C" "G" "C" "G" "A" "T"
##
## [[4]]
## [1] "C" "T" "A" "T" "C" "C" "C" "T" "C" "G" "G"
##
## [[5]]
## [1] "C" "T" "A" "G" "G" "C" "G" "A" "G" "A" "G"
##
## [[6]]
## [1] "C" "T" "G" "T" "C" "C" "G" "A" "C" "A" "G"
Αν θυμηθούμε από το προηγούμενο μάθημα πώς διαβάζουμε μοτίβα και τα μετατρέπουμε σε PWM, θα έχουμε:
getPwm = function(motifList){
alphabet=c("A", "C", "G", "T")
for(i in 1:length(motifList)){
v = motifList[[i]]
## initialize the pwm at first iteration only
if(i == 1){
pwm = matrix(0, nrow=length(alphabet), ncol=length(v))
rownames(pwm) = alphabet
}
for(j in 1:ncol(pwm)){
letter = v[j]
pwm[letter, j] = pwm[letter, j] + 1
}
}
pwm = t(t(pwm)/colSums(pwm))
pwm = log2((pwm+1e-6)/0.25)
return(pwm)
}
## the ppm is just the probability position matrix (i.e., the pwm without the log2 step)
getPpm = function(motifList){
alphabet=c("A", "C", "G", "T")
for(i in 1:length(motifList)){
v = motifList[[i]]
## initialize the pwm at first iteration only
if(i == 1){
pwm = matrix(0, nrow=length(alphabet), ncol=length(v))
rownames(pwm) = alphabet
}
for(j in 1:ncol(pwm)){
letter = v[j]
pwm[letter, j] = pwm[letter, j] + 1
}
}
ppm = t(t(pwm)/colSums(pwm))
return(ppm)
}
getPwm(myMot)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## A -1.649075 -2.3011411 1.4913904 -1.301155 -2.108500 -1.786576 -1.786576
## C 1.698832 -1.2016206 -2.1084996 -1.649075 -1.201621 1.659662 -1.523545
## G -3.108475 -0.8605865 -0.5849538 -1.786576 1.659662 -1.786576 1.711656
## T -1.649075 1.4913904 -1.7865765 1.577978 -2.523529 -1.938577 -3.523496
## [,8] [,9] [,10] [,11]
## A 1.632944 -0.8605865 1.4306365 -1.938577
## C -1.523545 1.5061874 -1.2016206 -2.108500
## G -2.301141 -2.3011411 -1.9385773 1.761842
## T -1.523545 -1.3011553 -0.7161976 -3.108475
getPpm(myMot)
## [,1] [,2] [,3] [,4] [,5] [,6]
## A 0.07971014 0.05072464 0.70289855 0.10144928 0.05797101 0.07246377
## C 0.81159420 0.10869565 0.05797101 0.07971014 0.10869565 0.78985507
## G 0.02898551 0.13768116 0.16666667 0.07246377 0.78985507 0.07246377
## T 0.07971014 0.70289855 0.07246377 0.74637681 0.04347826 0.06521739
## [,7] [,8] [,9] [,10] [,11]
## A 0.07246377 0.77536232 0.13768116 0.67391304 0.06521739
## C 0.08695652 0.08695652 0.71014493 0.10869565 0.05797101
## G 0.81884058 0.05072464 0.05072464 0.06521739 0.84782609
## T 0.02173913 0.08695652 0.10144928 0.15217391 0.02898551
library(seqLogo)
## Loading required package: grid
{pdf("positionWeightMatrix.pdf")
seqLogo(getPpm(myMot))
dev.off()}
## png
## 2
seqLogo(getPpm(myMot))