Position Weight Matrices

Generation of the motifs

a = "AAACACATTTTT"
l = nchar(a)
m = 0.2
a.vector = unlist(strsplit(x=a, split=""))
alphabet = c("A", "C", "G", "T")
data = vector("character", length = 1000)
for(i in 1:1000){
  mut = a.vector
  k = rbinom(1, l, prob = m)
  if(k > 0){
    x = sample(1:l, k, replace=F)
    mut[x] = sample(alphabet, k, replace=TRUE)
  }
  data[i] = paste(mut, collapse = "")
}

write.table(data, file="motifs.txt", quote=F, row.names=F, col.names=F)

Generation of a PWM

Count the elements (slow)

pfm = matrix(0, nrow=4, ncol=l)
rowsums = vector(mode="numeric", length = l)
row.names(pfm) = alphabet
a = read.table("motifs.txt", h=F, colClasses = 'character')
for(i in 1:nrow(a)){
  v = unlist(strsplit(a[i,], ""))
  for(j in 1:length(v)){
    rowsums[j] = rowsums[j] + 1
    pfm[v[j], j] = pfm[v[j], j] + 1
  }
}
ppm = pfm/rowsums
pwm = log2(ppm/0.25)

Count the elements (faster and better)

Read the file

a = read.table("motifs.txt", h=F, colClasses = 'character')
m = t(apply(a, 1, function(x){unlist(strsplit(x, ""))}))

Construct the matrix

pfm = matrix(0, nrow=4, ncol=l)
rownames(pfm) = alphabet
pfm = apply(m, 2, function(x){table(factor(x, levels=alphabet))})
ppm = pfm/as.vector(apply(pfm, 2, sum))
pwm = log2(ppm/0.25)
pwm
##         V11       V12       V13       V14       V15       V16       V17
## A  1.777367  1.792439  1.780730 -2.573467  1.768925 -2.351074  1.753605
## C -2.351074 -2.506353 -2.132894  1.805705 -2.442222  1.772308 -2.107803
## G -2.573467 -2.573467 -2.380822 -2.473931 -2.293359 -2.411195 -2.473931
## T -2.265345 -2.380822 -2.795859 -2.680382 -2.293359 -2.321928 -2.210897
##         V18       V19      V110      V111      V112
## A -2.643856 -2.237864 -2.573467 -2.184425 -2.380822
## C -2.473931 -2.035047 -2.107803 -2.293359 -2.473931
## G -2.473931 -2.608232 -2.132894 -2.795859 -2.473931
## T  1.799087  1.757023  1.753605  1.779050  1.785760