January 27, 2015

A sample from the Zipf law

set.seed(1234); M <- 10000; N <- 1000; beta <- 2/3     # vocabulary size, sample size, exponent
probs <- 1/(1:M)^beta; probs <- probs/sum(probs)       # normalised Zipf probabilities
data <- sample(M, size = N, prob = probs, replace = TRUE)
trueCounts <- tabulate(data, nbins = M)                # how often each of the M words was drawn
head(trueCounts, n = 20); tail(trueCounts, n = 20)
##  [1] 13 15  5  6  5  7  6  0  4  0  6  8  6  1  3  3  5  3  3  3
##  [1] 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0
tab <- table(table(data)); tab[length(tab):1]          # frequencies of frequencies, largest first
## 
##  15  13   8   7   6   5   4   3   2   1 
##   1   1   1   1   4   4   4  12  51 759

Plot the data

plot(trueCounts); lines(N * probs)                     # observed counts with expected counts N * probs

Double logarithmic plot

plot(trueCounts/N, log = "xy"); lines(probs)           # zero counts are dropped on the log scale

But true ordering of words unknown

obsCounts <- sort(trueCounts, decreasing = TRUE)       # counts by rank (rank-frequency view)
obsCounts <- obsCounts[obsCounts > 0]                  # keep only the words that were observed
plot(obsCounts); lines(N * probs)

Double logarithmic

plot(obsCounts/N, log = "xy")
lines(probs)

Research question

Notice how beautifully the true probability law "fits" this plot.

Is that just luck, or is there a theorem here?

Can this be used to estimate the power in the Zipf law?
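
One very naive idea, sketched below, is to estimate the exponent by least squares on the double logarithmic rank-frequency plot. The helper zipfExponent is only a hypothetical illustration, not a method from these notes; it ignores the bias coming from sorting the counts and from discarding the unobserved words, which is precisely what the question above is about.

# Hypothetical helper: regress log counts on log ranks and return minus the slope.
zipfExponent <- function(counts) {
  ranks <- seq_along(counts)
  -unname(coef(lm(log(counts) ~ log(ranks)))[2])
}
zipfExponent(obsCounts)   # compare with the true beta = 2/3 used in the simulation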

Mao tomato gene expression

freqs <- c(27, 23, 16, 14:1)                           # distinct counts occurring in the data
reps <- c(1, 1, 2, 1, 1, 1, 2, 2, 1, 3, 2, 6, 11, 33, 71, 253, 1434)   # multiplicity of each count
patternMao <- rep(freqs, reps); plot(patternMao, log = "xy")

Mao (continued)

N <- sum(patternMao); K <- length(patternMao)
plot(patternMao, log = "xy"); lines(1:K, N/(65*(1:K)^(2/3)))   # overlay the power law N/(65 k^(2/3))
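
Out of curiosity, the hypothetical zipfExponent helper from above can be applied to the Mao counts as a rough check of the exponent used in the overlay.

zipfExponent(patternMao)   # compare with the 2/3 in the curve above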

NFI Y-STR data

freqs <- c(13, 7:1); reps <- c(1, 1, 2, 5, 7, 30, 130, 1650)   # distinct counts and their multiplicities
patternNFI <- rep(freqs, reps); plot(patternNFI, log = "xy")

NFI (continued)

N <- sum(patternNFI); K <- length(patternNFI)
plot(patternNFI, log = "xy"); lines(1:K, N/(170*(1:K)^(1/2)))   # overlay the power law N/(170 k^(1/2))
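
And likewise for the NFI counts, again only as a rough, hypothetical check.

zipfExponent(patternNFI)   # compare with the 1/2 in the curve above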