Generation of the motifs
a = "AAACACATTTTT"
l = nchar(a)
m = 0.2
a.vector = unlist(strsplit(x=a, split=""))
alphabet = c("A", "C", "G", "T")
data = vector("character", length = 1000)
for(i in 1:1000){
mut = a.vector
k = rbinom(1, l, prob = m)
if(k > 0){
x = sample(1:l, k, replace=F)
mut[x] = sample(alphabet, k, replace=TRUE)
}
data[i] = paste(mut, collapse = "")
}
write.table(data, file="motifs.txt", quote=F, row.names=F, col.names=F)
Generation of a PWM
Count the elements (slow)
pfm = matrix(0, nrow=4, ncol=l)
rowsums = vector(mode="numeric", length = l)
row.names(pfm) = alphabet
a = read.table("motifs.txt", h=F, colClasses = 'character')
for(i in 1:nrow(a)){
v = unlist(strsplit(a[i,], ""))
for(j in 1:length(v)){
rowsums[j] = rowsums[j] + 1
pfm[v[j], j] = pfm[v[j], j] + 1
}
}
ppm = pfm/rowsums
pwm = log2(ppm/0.25)
Count the elements (faster and better)
Read the file
a = read.table("motifs.txt", h=F, colClasses = 'character')
m = t(apply(a, 1, function(x){unlist(strsplit(x, ""))}))
Construct the matrix
pfm = matrix(0, nrow=4, ncol=l)
rownames(pfm) = alphabet
pfm = apply(m, 2, function(x){table(factor(x, levels=alphabet))})
ppm = pfm/as.vector(apply(pfm, 2, sum))
pwm = log2(ppm/0.25)
pwm
## V11 V12 V13 V14 V15 V16 V17
## A 1.777367 1.792439 1.780730 -2.573467 1.768925 -2.351074 1.753605
## C -2.351074 -2.506353 -2.132894 1.805705 -2.442222 1.772308 -2.107803
## G -2.573467 -2.573467 -2.380822 -2.473931 -2.293359 -2.411195 -2.473931
## T -2.265345 -2.380822 -2.795859 -2.680382 -2.293359 -2.321928 -2.210897
## V18 V19 V110 V111 V112
## A -2.643856 -2.237864 -2.573467 -2.184425 -2.380822
## C -2.473931 -2.035047 -2.107803 -2.293359 -2.473931
## G -2.473931 -2.608232 -2.132894 -2.795859 -2.473931
## T 1.799087 1.757023 1.753605 1.779050 1.785760