getwd()
## [1] "/home/pavlos/graduate_bioinfo/linux"
rm(list=ls())

Detecting motifs in DNA sequences

Assuming known ‘core’ motif.

Assume that you you have DNA sequences in FASTA format. These sequences may represent promoter sequences, for example 5 kb regions upstream the Transcription Start Site (TSS).

Here, we will consider the sequences of the genes participating in the formation of the ‘Proteasome’. For more information on the ‘Proteasome’ please visit Wikipedia Proteasome. Sequences for the proteasome can be downloaded from this location:

Download the Proteasome 5kb upstream sequences HERE.

wget http://139.91.162.50/bioinf2018/seq5000.txt
## --2018-11-21 22:37:16--  http://139.91.162.50/bioinf2018/seq5000.txt
## Connecting to 139.91.162.50:80... connected.
## HTTP request sent, awaiting response... 200 OK
## Length: 1275255 (1.2M) [text/plain]
## Saving to: ‘seq5000.txt.28’
## 
##      0K .......... .......... .......... .......... ..........  4% 86.1M 0s
##     50K .......... .......... .......... .......... ..........  8%  186M 0s
##    100K .......... .......... .......... .......... .......... 12%  296M 0s
##    150K .......... .......... .......... .......... .......... 16%  335M 0s
##    200K .......... .......... .......... .......... .......... 20%  352M 0s
##    250K .......... .......... .......... .......... .......... 24%  285M 0s
##    300K .......... .......... .......... .......... .......... 28%  371M 0s
##    350K .......... .......... .......... .......... .......... 32%  375M 0s
##    400K .......... .......... .......... .......... .......... 36%  383M 0s
##    450K .......... .......... .......... .......... .......... 40%  329M 0s
##    500K .......... .......... .......... .......... .......... 44%  392M 0s
##    550K .......... .......... .......... .......... .......... 48%  393M 0s
##    600K .......... .......... .......... .......... .......... 52%  398M 0s
##    650K .......... .......... .......... .......... .......... 56%  334M 0s
##    700K .......... .......... .......... .......... .......... 60%  377M 0s
##    750K .......... .......... .......... .......... .......... 64%  395M 0s
##    800K .......... .......... .......... .......... .......... 68%  388M 0s
##    850K .......... .......... .......... .......... .......... 72%  347M 0s
##    900K .......... .......... .......... .......... .......... 76%  388M 0s
##    950K .......... .......... .......... .......... .......... 80%  397M 0s
##   1000K .......... .......... .......... .......... .......... 84%  406M 0s
##   1050K .......... .......... .......... .......... .......... 88%  230M 0s
##   1100K .......... .......... .......... .......... .......... 92%  410M 0s
##   1150K .......... .......... .......... .......... .......... 96%  378M 0s
##   1200K .......... .......... .......... .......... .....     100%  338M=0.004s
## 
## 2018-11-21 22:37:16 (305 MB/s) - ‘seq5000.txt.28’ saved [1275255/1275255]

The file contains the ‘plain’ sequences without any “fasta” format symbol. No sequence names either. The beginning of the files looks like:

head -n 1 seq5000.txt
## AACAAGTCACTCAGTAGGAAAGGCACTTGGAATTGTTTTTCCAGCTTTTCTACCAATCAGATCCAAGTACTTGATGCATTTTTATGGTGAAAGGTCACTATACCCAATACTTTGTGAATACCACAACACAGGGATACTGTACCTCCATAATCAACTAGAGAAGTATTTATTGAATATGCAGGGCACTTCTGCTCTATACCAACTATGACTGGCAGGCTAAATTCAGCCCATCTCATGTTTTTGTGTGGCAAGCAAGGTACGAATGATTCTCATAATTTTAAATGGTTGGAAGAACCACCTTCAAACACTGTTAGGGAACACTGAATGGCTTTGGAAATTAACTTTTGATGCAGGCTGGATAATGTTTATTAATTAATTCAACCTAAAATTACAAGGCAAAACAATGCTTATATATGAAAAAAACTGTAGTAAAGTCATTTTGACAATATTATTTGAATCACAAGTAATGTCAAGATGCTTTATCCACTTCCCATGCTGTCACAAGTCAAAACAATAAAACCACAATTAAATACCACTTTATACCCACTAGAATGCCTATAATCAAAAAGATAGATATTAATAAGTATTGGTAAAGATATGGAGAAATTTGGCTGGATGTGGTGGCTCACGCCTATAATCCCAGCATTTTGGAAGGCCAAGGTGAGCAGATCACTTGAGGACAGGAGTTCCAGACCAGCCTGGCCAACACGGCAAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGCACGGAGGTACATGCCTGTAATCCCAGCTACTCAGGAGGCTGAGGCACAAGAATCACTAGAACCTGGGAGGCAGAGGTTGCAGTGGGCTGAGATCATGCCACTGCACTTCAGCCTGGGCGACAGAGTGAGACTTTGTCTCCAAAAAAAAAAAAAAAAGAAAGAAAAGAAAATAAGAAAAAAAATGTGGATAAATCTGAACCCTCGAACATTGCTGATGGGAAGGTAAAATGGTACGGCCACTCTTGTAAAGAGTCTGAAAGTTTCTCAAAAACTTAAACAGAGTTACGATATGACCCAGCAATTCCACTCCTAGTTAAGTACCCAGAAGAAATAAATGAAATCTCAGCTGGACATGGTGGCTCACACCTGTAATCCCAGCATTTTCTGAGACCTAGGTGGGAGGATCACTTGAACCCAGGAGTTCGAGATCAGCCTGGGCAACATAGTGAGACCCTGTATCTATAAAAAATCATAAATTGGCTGAGTGTGTTGGTGCATGCCTGTAGTTCCAGCTGCTCCAGAGGCTGAGATGACAGGATTGCTTGTGCTAGGGAGGTGGAGGCTGCAGTGAGCCGTGATCGTGCCACTGCACTCCAGCCTGGGCAACAGAAGAAGACCCTGTCTCAATAAAAAATAAAATAGGCTGGGCACAGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGATCACCTGAGGTCAGGAGTTAGAGACCAGCCTGGCCAATATGGCGAAACCCTGTCTCTACTAAAAATACAAAACTTAGCCAGGCGTGGTGGCATGCGCCTCAGGAGGCTGAGGCAGGAGAATCGCTTGAGCCTGGGAGGCAAAGATAGCAGTGTGCCAAGATCGTGCCACTGCATTCCAGCCTGGGTGACAGAGTGAGACTCTGTCTCAAAATAAAATGAAATAAAAGTTAAAACAAGAAGCAAGATATCTATTTCCATAAGAATTTGCAGCATATATATTTTCTGAATTCAGACTACAGTTCCAGCAGCAGTTTTTGAACCCTGAAGCAAGTGCAAAGGATATTCCATATTCCAAAATCCCTTTAACTGTGCAATTGAGCAGCTTTCACCAAACTTTCAATTGGAAGTGATTAACCTGCCACATAATGCTAAAAGGCACTATTAAGAGAAGAACAGAATTCTATTCCACGCAATGAATATGCTCAATTAAAATCATGTTTGAGGATAGTGTATGATAAGAATGTGGTACTTGACCTCTGCAGACTTTCTTCCAAACACATAATGCCAGTCTAAGCATAAAATTGAGGGGTTTTCTACAAAATACCTGACCAGTACTCCTTAAAACTATCAAGGTCACCAAAACAAGAAAAGTGTGAGAAAGTGTCACAAACCAGAAGAGGCTAAGGAGATACAATGACTAAACGTAATGTGGTATCCTGGATGGGACAGCATCTCTAGAGATAAACTAGTGAAACTTGAAAAAAAAAAGCATGGGGTATAGTTAATAGCAATATACCACTGTGGGCTTATTAGTTGTGACATAGATAGCATAGCAATGTAAGACGTTAATGATAAGAGATTCAGCAAATGGTGCTAGGACAATTTCTACATCCACATGCAAAAAGATGAATTTAGACTCTCACCTCACATCACAGAGAAAAAATGAACTCAAAATGAATCATAGACTTAAATGTAAGAGATGAAGTAATAAAGCTTTTAGAAGAAAACATTGGAGAAAATCACTGAGACCTTGCAATTAGGCAAAGACTTCTTAAGTACAACACCAAAAGAATGGTCCATGAAAGAAAAAAAAATGATAAATTGAACCTCACCCAAATTCAAAACGTTTTCACTTCAAAAGACACCATCAAAAAAAGAAAAAGGCAAGCAACAGACTGGGAGAATATATGTGCATATCATATATCTCTCAGGGACTTGATCTAGAATATATAAACCATCCTTACAACTCAACACTAAAAAAACAAATAACTCAATTTAAAAACAGACAAAAGATATAACAGACACTTCACTGAAGAAGATATATGAATGGCTAATAAGCACATGAAAATATGCTCACCGTATTATAAATTATAATTTATAAATTAAAACCACAATGAGACACCACTTCACACCCACTTAGAATGGCTATAACAAAAAACACAGTAAATAACAAATGTTGGCAAGGATGTGGAGAAACAGGAACTCTCATACATTGCTAGTGGAAGTGCAAAATGTTGCTGCCATTTTGGAAAACAGTTTTGCAGTTTCTTAAGAAGTTAAGCATAAATTTTCCCTGTGATCCAGTAACTCCACCCTTAAGTATCTATCAAAGAGAAATGAAAAAAATATGTCCGCACGAAGACTTACATACTAATCTTCTTAGCAGCACTAGTCATAGTAGCCAAAAATTGGAAACTGATGTGCCTATCTAATAGTGAGTAGACAGATAAAAATGTGTTATATCCATAAAATGGAATACTATTCAGCAATAAAAAAGTAAGGAAGTACTGATACATACTACAATATGGATGAATCTCAAAAATATTATGCTAAGTGAAAGAAATCAGACACAAAGACTGCATATTGTATGATTTCATTTATATAAAATGTCCAGAATAGGCAAATTTATAGAAACAAAAAAGTAGATTAGTGGTTGTCTGGGGCTGGGAATGGGACAGAGGATTGATTAACTATAAATAGGCATGAGGGACCTTACTGGAGGGATCAAAAAGTTCTAAAACTGATTTATGGTGATGGTTGCCCCACTTCAGTAAAGTTACCAAAAATCATTTAATTATGCACTTGAAATCAGTGAATTTTATGATATGTAAAATGTGCCTCATTTTGAGTGAGTGTTATACAGGAACTCTACTAACCTTGCAACTTTTCTATAAATCTACAACTATTCTGAAATTTTTGAAAAATTAAATAAAAATAAAAATAAAAAAGTATATGTTTATGAATTGATATCAGTATTTGGCAGTACCTATCTATGTAAAATGACATTTTCAAAGATTAAATAGGTAAAAATTTCCTAACAGATCAGCATTAACAGATGAACACTTAAAATCAATTTTGACAATAGGGAACACTAATTCTGAGCCCCAATTAAGCAAAATATTATCCCCTAAAAAGAATTCTATTCTTCTCATTGGTAGTCTTCCTGTATTACAAAAAAAGTACTCAATTACTATTATATTTTGAATTTCATCAATAAAAATTTGTTGAAATTTGTTTTCTCTTTTGTCATAAAAGTACCAACAAAATATTCTTGATTTTGCCTTTTGGCCTACAAAGCCTAAAATATTTACTATCTGGCTCTTTACAGAAAGTTTCCCAACTCCTGTTCTACACTATAACAGGATAAACATAAACATCATGAGTCTCTTTCTTGAATGTATTCTTATCGTTACCTAGTTGTGGCATAATGTCCTCTGGATATGCCCTATAAAGGATCAGTCTTGCATTGCTTCTGCTATGGCACTATTTGGGATTTCTATAAAAAGAAACACTATTAGCTTAATAGTTGTAGCAAAGCCTTGACAACATAGTTAATATCGTGATATTAAAATGCCAACACCACAGCCCGGTATCCTTAAGATGACCCTCATCAAGCTGTGATGTCACCAGGCTATGGACTCACCGAGGTCTCTTAGATATCAGGTGGCTCCCACGGCATCACACCCTGCCTTTGGCATCACCCTGGCACATCACCTCAGTACGGTGCCATATGCTATCACTGCTGTGGTAGAACCCTGTCCTGATGCCACACCGCCGAGATGTCAGAGTAATTAACCCCCACTTCTTTCCACTCCCTAGGCGGTGGCAGGGAGGGGGAGGGTAGCCACAGTCACCGGTTCGAGCTCAGTAGGTAGTACTCAGCTCCAACCCTTGTCCCTTCCGCCTGCCACTGCAGGTCAGGAACCCCGGGGGACAGGACCTTTAGGCTCTCATAACTCTTGCCCTGCCCCCGAACCTGTTTCATATGGCCGACTCAACGACCCGCCAGCGACTACCCACCCCGGCTCTTTCACACGGTTCCACTCAAAGCCGCTCCCCGTACCTGACTCCATTCTCCAGCTGTCATCCTGGGCCAATTCCACCGGCAACTTGACTACGGACGGACGGTCGCTAGGATCCCTGGGACTTGTAGTTCTGCACTGCTAGGGGCCAAGTCTGTGAGGGCAGCAAAGGCGCCTCCCTGCCCGGAACTGCTCTCAAACTGCAACTCCCAGAGGCGCCGCGCGACGGGAAAAGAAAAGGGAACGAGGAAGGCCGGTG

First, we will read in the sequences paying attention to the fact that the strings should be read as ‘characters’.

## First we will read in the sequences
seq <- read.table("seq5000.txt", colClasses='character')
dim(seq)
## [1] 255   1

Now, we should separate each sequence into a vector of characters. This will make our life easier when we will search for a specific motif.

seq.data <- lapply( seq[,1], ## I must take the first column, because I need a vector here 
                    function(x){unlist(strsplit(x, ""))})
## seq.data is a list
length(seq.data) ## number of elements of the list (i.e. sequences)
## [1] 255
length(seq.data[[1]]) ## length of each sequence
## [1] 5000

Next, we will write a function that will calculate the distance (here, it will be Hamming distance) between two vectors of the same length. The hamming distance just counts the number of positions (indices) where the two vectors have a different state (i.e., number of differences).

getDistance <- function(a,b){
    if(length(a) != length(b)){
        stop("ERROR sequences do not have the same length")
    }
    d <- sum(a != b)
    return(d)
}

The getDistance(a,b) function operates only on two vectors. Thus, we create a new function, that uses the getDistance(a,b) but it works on a whole text, a motif and a threshold. The function, called getDistance.vec(text, motif, thr) finds all sub-texts in the text that have maximum ‘thr’ differences with the ‘motif’. The only trick that we do in this new function is that we split the text in sub-texts, denoted by tt. Also, we use sapply to calculate the getDistance for all sub-texts of the text. If no sub-text that fullfils the criterion will be found, the function will return a NULL.

getDistance.vec <- function(text, motif, thr){
    sapply(1:(length(text)-length(motif)), function(x){
        tt <- text[x:(x+length(motif)-1)]
        ##print(tt)
        d <- getDistance(motif, tt)
        if(d <= thr){
            return(tt)
        }
        return(NULL)
    })
}

Now, we are ready to apply these functions. The motif is provided here:

motif <- c("A", "C", "T", "T",  "C", "C", "G")
## get all motifs (and NULLs if nothing is found)
get.seq.motifs <- lapply(seq.data, getDistance.vec, motif, 1)

Now, it’s easier to put all these motifs in a matrix. Luckily, R, in this application, does not care about the NULLs, thus it nicely arranges only the non-NULL elelents of the matrix.

all.motifs.matrix <- matrix(unlist(get.seq.motifs), ncol=length(motif), byrow=TRUE)
head(all.motifs.matrix)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] "A"  "C"  "T"  "T"  "C"  "T"  "G" 
## [2,] "A"  "C"  "T"  "T"  "C"  "C"  "C" 
## [3,] "A"  "C"  "T"  "T"  "C"  "A"  "G" 
## [4,] "A"  "C"  "T"  "T"  "C"  "A"  "G" 
## [5,] "C"  "C"  "T"  "T"  "C"  "C"  "G" 
## [6,] "A"  "C"  "C"  "T"  "C"  "C"  "G"

To find out the position weight matrix (PWM), we need to summarize the states for each position of the motif. For example, we want to know how many times an A, a C, a G or a T is found for every position of the motif.

Thus, we count per column the number of times we observe each state out of all possible ones (A,C,G,T). A very handy function to count the number of states in a vector (or matrix) is the function table. Thus, we apply the function table for every column of the all.motifs.matrix. The function table will count only the present states. Thus, if in a column there is no ‘G’, for example, then there will be no entry for ‘G’. To force the function table to count all states we want and put a 0 for the states that are not present we combine the table function with the factor function. Thus,

motif.counts <- apply(all.motifs.matrix, 2, function(x){ table(factor(x, levels=c("A", "C", "G", "T") ))})

The next steps are quite simple, we just transform the counts to the PWM.

motif.sums <- sum(motif.counts[,1])
motif.freqs <- motif.counts/motif.sums
motif.log2 <- log2(motif.freqs/0.25)

Assuming unknown motif

The previous case was rather simple because the motif was already given. What can we do if the motif is not given?

Probably, there is no single ‘correct’ answer for this question, but definitely we can think of some approaches.

I contructed the following pipeline:

  1. Define the length of the motif (here: 8)
  2. find all possible motifs, i.e. all possible different substrings of length 8, that exist in the sequences.
  3. Sort them by frequency
  4. Starting from the most frequent one, group them. This is, put all the motifs whose distant to the most frequent is less than t in the same group
  5. Construct PWMs
## we load the parallel library, this will be helpful for the 
## parallel execution of apply loops
library(parallel)
## Again.. read in the data and split them in vectors
raw <- read.table("seq5000.txt", colClasses="character")
raw.data <- lapply( raw[,1], function(x){unlist(strsplit(x, ""))})

Next, we will find the frequencies of each nucleotide. It’s possible that some other states (e.g. N) will be also present, denoting for example, missing data. We will need the nucleotide frequencies later, when we will construct the PWM.

nuc.freq <- table(unlist(raw.data))/sum(table(unlist(raw.data)))
nuc.freq <- nuc.freq[c("A", "C", "G", "T")]

The function scanmotifs will report all substrings (motifs) (not only the unique ones) found in a text. The text should be a vector. The length of the motif ml is given.

scanmotifs <- function(text, ml)
{
  vec <- c()
  sapply(1:(length(text)-ml+1), function(x){
      tt <- text[x:(x+ml-1)]
      vec <- c(vec, paste(tt, collapse=""))
  })
}

The next lines of code are quite important, because we exploit the ability of R to create named vectors (like assortative arrays) and the ‘table’ function which counts the occurences of states in a vector. The named array (here, it is the xxx.sorted) will be very useful for us later.

## the motif length is now user defined
motifLength <- 8
## find all possible motifs of length motifLength
xxx <- mclapply(X=raw.data, mc.cores = 4, FUN = scanmotifs, motifLength)
## cound how frequent they are
xxx.tab <- table(unlist(xxx))
## sort them according to their frequency, i.e. obtain a sorted version of the freuqnecies
xxx.sorted <- sort(xxx.tab, decreasing = TRUE)
## just have the names sorted (based on their frequency) as well
names.sorted <- names(xxx.sorted)

The next step in our analysis is to find all motifs that are “similar” to most frequent ones. For example, if we have found the motif “AAAAAAA” many many times and the motif “AAAAAAAT” fewer times, then we would like to group the “AAAAAAT” in the “AAAAAAA” family of motifs. Here, we define the “similar” as having distance equal or less than thr.1 = 2. To evaluate the distance between two strings we use the function getDistanceStr which accepts two arguments. Both of them are strings. Internally, the function constructs two vectors from the strings and compares them.

Then, we construct the function findsimilar, which will find all similar motifs using the getDistanceStr function. In fact, we apply the function to a vector of many motifs using the parallel version of lapply, the mclapply. It is important that the findsimilar function will return TRUE/FALSE. This will be very handy, because we can use its results for subsetting the vector of all motifs, as we will see in the next section. A final function in this section is the stringToMatrix which transforms an array of strings (that belong to the same motif family) into a matrix. If the motif has length motifLength and there are k motifs in the familty, then the matrix will be ( k motifLength ).

getDistanceStr <- function(a,b){
  v1 <- unlist(strsplit(a, ""))
  v2 <- unlist(strsplit(b, ""))
  d <- sum(v1 != v2)
  return(d)
}

findsimilar <- function(mot1, mot.vec, thr){
  d <- mclapply(mot.vec, getDistanceStr, mot1, mc.cores = 4)
  unlist(d) <= thr
}

stringsToMatrix <- function(stringsVector, stringsLength){
  v <- unlist(strsplit(stringsVector, ""))
  matrix(v, ncol=stringsLength, byrow = TRUE)
}

And now, it’s time to enter the main loop. This takes most of the time since its complexity may be a bit less than ( O(n^2) ), where ( n ) is the number of motifs. However, in practice it works pretty fast because every time we put some motifs in the family of most frequent ones, then they are being removed from the list.

## a similarity threshold to put motifs in the same family of motifs
thr.1 <- 2
## an empty list. We will put all motif families (in matrix format here)
motif.list <- list()
## we copy the names.sorted into a new vector that will be updated all the time
names.sorted.current <- names.sorted
## this is the index for the family of motifs we have constructed so far. 
j <- 1
## The main loop will run while there are motifs in the names.sorted.current that do not yet belong to any family
while(length(names.sorted.current > 0)){
  ## the trick here is that we always try to find
  ## similar motifs to the first one which is the most frequent. 
  ## In each iteration of the loop the most frequent and its 'mates' 
  ## are actually removed from the array.
  matches.found <- findsimilar(names.sorted.current[1],
                        names.sorted.current, thr.1)
  ## this is the vector of matches
  hits <- names.sorted.current[matches.found]
  ## since every match has a different multiplicity, I'm creating
  ## the vector that each motif is multiplied by as many times as it actually
  ## exists
  allhits <- sapply(1:length(hits), function(x){
    rep(hits[x], xxx.sorted[hits[x]])
  })  
  ## I put the matches in a matrix, and store this matrix
  ## in a list
  allhits <- unlist(allhits)
  motif.list[[j]] <- stringsToMatrix(allhits, motifLength)
  ## the index is increased to accept the next motif (for the next iteration of the while loop)
  j <- j+1
  ## this is the trick! We only keep the motifs that didn't belong
  ## to the family. Thus, the size of the motif vector is always 
  ## decreasing. 
  names.sorted.current <- names.sorted.current[!matches.found]
  ## this is just to monitor our progress. 
  ## every 300 steps it will report where we are
  if(j %% 300 == 0)
    print(c(j, length(names.sorted.current)))
}
## [1]   300 12488
## [1]  600 2072
## [1] 900  86
print(c(j, length(names.sorted.current)))
## [1] 962   0

The next steps are rather easy. Here, we just want to print out the motifs using the seqLogo library (from the Bioconductor).

## optional step in case we want the most frequent motifs
number.hits.motifs <- unlist(lapply(motif.list, nrow))
frequent.motifs <- which(number.hits.motifs > nrow(raw)/2)

## here, we will save the PWMs
log.pwms <- list()
freq.pwms <- list()
i <- 1
for(i in 1:length(motif.list)){
  motif.instances <- motif.list[[i]]
  motif.counts <- apply(motif.instances, 2, function(x){ table(factor(x, levels=c("A", "C", "G", "T") ))})
  motif.sums <- sum(motif.counts[,1])
  motif.freqs <- motif.counts/motif.sums
  motif.log2 <- log2(motif.freqs/as.numeric(nuc.freq))
  freq.pwms[[i]] <- motif.freqs
  log.pwms[[i]] <- motif.log2
}
if (!requireNamespace("BiocManager", quietly = TRUE))
     install.packages("BiocManager")
BiocManager::install("seqLogo", version = "3.8")
library(seqLogo)

Finally, we create a PDF with all motifs.

pdf("motifs.pdf", height=5, width=5)
for( i in 1:length(freq.pwms)){
  sums <- apply(freq.pwms[[i]], 2, sum)
  if(sum(sums, na.rm = TRUE) != motifLength){
    next
  }
  seqLogo(makePWM(freq.pwms[[i]]))
}
dev.off()
## png 
##   2