Normal Distribution in Corpus Linguistics?

The normal distribution occurs frequently in various fields of (social) science.
For example, weight, height, intelligence, and income are often treated as approximately normally distributed:

# Left panel: density of the standard normal distribution on [-4, 4].
x <- seq(-4, 4, by = 0.1)
y <- dnorm(x)
par(mfrow = c(1, 2))  # one row, two columns: draw the two plots side by side
plot(x, y, type = "l")
# Right panel: histogram of the weights in the built-in chickwts data set.
library(MASS)  # provides truehist()
# Extract the column explicitly instead of attach()-ing the whole data frame:
# nothing else is put on the search path, and `weight` remains available to
# the summary calls that follow.
weight <- chickwts$weight
truehist(weight)

plot of chunk unnamed-chunk-1

# Sample mean of the chickwts weights.
mean(weight)

## [1] 261.3

# Sample standard deviation of the chickwts weights.
sd(weight)

## [1] 78.07

# Shapiro-Wilk normality test on the weights; a small p-value is evidence
# against normality.
shapiro.test(weight)

## 
##  Shapiro-Wilk normality test
## 
## data:  weight 
## W = 0.9767, p-value = 0.2101

It would be surprising if nothing in language were normally distributed.
So can we find such a distribution in a Chinese corpus?
Let’s take sentence length in part of the ASBC as a first investigation:

# Read the corpus, one sentence per line.
corpus.file <- scan("sentences.txt", what = character(), sep = "\n")
# Split every sentence into tokens on the full-width space (U+3000).
# Each element of word.list is a one-element list holding that sentence's
# token vector (strsplit() is applied to one sentence at a time).
word.list <- lapply(corpus.file, strsplit, split = "　")
# lapply()/vapply() instead of sapply(): sapply() would collapse the result
# into a matrix if every sentence happened to have the same number of tokens,
# and the "lengths" computed from that matrix would all be 1.
word.vector <- lapply(word.list, unlist)
sentence.length <- vapply(word.vector, length, integer(1))
truehist(sentence.length)

plot of chunk unnamed-chunk-2

# Mean number of tokens per sentence.
mean(sentence.length)

## [1] 8.15

# Standard deviation of the number of tokens per sentence.
sd(sentence.length)

## [1] 4.39

# Shapiro-Wilk normality test on the sentence lengths; a small p-value is
# evidence against normality.
shapiro.test(sentence.length)

## 
##  Shapiro-Wilk normality test
## 
## data:  sentence.length 
## W = 0.913, p-value < 2.2e-16

Unfortunately, sentence length is not normally distributed.
How about the frequency of a specific word across the chunks of a balanced corpus?

# Flatten the per-sentence token lists into one token vector for the corpus.
word.vector <- unlist(word.list)
# Assign every token, in corpus order, to one of 40 consecutive chunks.
asbc <- data.frame(
    word = word.vector,
    chunk = cut(seq_along(word.vector), breaks = 40, labels = FALSE)
)
# Flag the occurrences of the target word and count them per chunk.
asbc$DE <- asbc$word == "的(DE)"
countofDE <- tapply(asbc$DE, asbc$chunk, sum)
truehist(countofDE)

plot of chunk unnamed-chunk-3

# Shapiro-Wilk normality test on the per-chunk counts; a small p-value is
# evidence against normality.
shapiro.test(countofDE)

## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9878, p-value = 0.9374

The counts seem to be normally distributed, but is that just chance?
Let's re-examine normality while varying the corpus size:

# Test whether the per-chunk frequency of one word is normally distributed.
#
# Arguments:
#   word.vector  character vector of corpus tokens, in corpus order.
#   word         the token whose occurrences are counted (default "的(DE)").
#   b            number of consecutive chunks the corpus is cut into
#                (default 40).
#
# Prints the corpus size (number of tokens) followed by the Shapiro-Wilk test
# of the per-chunk counts. The value of the final print() call, i.e. the test
# result, is returned invisibly.
a <- function(word.vector, word = "的(DE)", b = 40) {
    # Chunk index (1..b) for every token position.
    chunk <- cut(seq_along(word.vector), breaks = b, labels = FALSE)
    # Compare against the `word` argument rather than a hard-coded token, and
    # compare the vector directly instead of going through a data-frame column
    # that was only reachable via `$` partial matching.
    # The name `countofDE` is kept because shapiro.test() echoes it in the
    # "data:" line of its printed output.
    countofDE <- tapply(word.vector == word, chunk, sum)
    print(length(word.vector))
    print(shapiro.test(countofDE))
}

# Re-run the normality test on growing prefixes of the corpus:
# 1/40, 2/40, ..., 40/40 of all tokens.
len <- length(word.vector)
for (i in seq(len / 40, len, by = len / 40)) {
    # len / 40 need not be a whole number; take the integer part explicitly
    # rather than relying on `1:i` to truncate a fractional upper bound.
    a(word.vector[seq_len(floor(i))])
}

## [1] 1018
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.7066, p-value = 1.251e-07
## 
## [1] 2037
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.8836, p-value = 0.000659
## 
## [1] 3055
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9265, p-value = 0.01243
## 
## [1] 4074
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9507, p-value = 0.08017
## 
## [1] 5092
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9659, p-value = 0.2653
## 
## [1] 6111
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9608, p-value = 0.1789
## 
## [1] 7129
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9554, p-value = 0.1167
## 
## [1] 8148
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9398, p-value = 0.03406
## 
## [1] 9166
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.8994, p-value = 0.001843
## 
## [1] 10185
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9493, p-value = 0.07187
## 
## [1] 11203
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9707, p-value = 0.3796
## 
## [1] 12222
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.947, p-value = 0.05989
## 
## [1] 13240
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9609, p-value = 0.1791
## 
## [1] 14259
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9438, p-value = 0.04634
## 
## [1] 15277
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9466, p-value = 0.0579
## 
## [1] 16296
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9213, p-value = 0.00847
## 
## [1] 17314
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9425, p-value = 0.04187
## 
## [1] 18333
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9046, p-value = 0.002606
## 
## [1] 19351
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9148, p-value = 0.005303
## 
## [1] 20370
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9141, p-value = 0.005063
## 
## [1] 21388
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.914, p-value = 0.004998
## 
## [1] 22407
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9128, p-value = 0.004616
## 
## [1] 23425
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9273, p-value = 0.01315
## 
## [1] 24444
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9237, p-value = 0.01009
## 
## [1] 25462
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.952, p-value = 0.08918
## 
## [1] 26481
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9459, p-value = 0.05491
## 
## [1] 27499
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9386, p-value = 0.03112
## 
## [1] 28518
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9243, p-value = 0.01051
## 
## [1] 29536
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.8958, p-value = 0.001444
## 
## [1] 30555
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.923, p-value = 0.009599
## 
## [1] 31573
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9284, p-value = 0.01434
## 
## [1] 32592
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9126, p-value = 0.004554
## 
## [1] 33610
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9318, p-value = 0.01843
## 
## [1] 34629
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9146, p-value = 0.005243
## 
## [1] 35647
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9296, p-value = 0.01562
## 
## [1] 36666
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9664, p-value = 0.2751
## 
## [1] 37684
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9515, p-value = 0.08564
## 
## [1] 38703
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9474, p-value = 0.0618
## 
## [1] 39721
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.977, p-value = 0.5808
## 
## [1] 40740
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9878, p-value = 0.9374

We can also examine the normality of smaller corpora, including those with fewer than 2,000 words:

# Histogram of the per-chunk counts for the first 1,500 tokens only.
# The sample gets its own name so that the full `word.vector` is not
# overwritten: later code takes longer prefixes from it.
word.sample <- head(word.vector, 1500)
# Name the token column explicitly; `asbc$word` must not depend on `$`
# partial matching against an automatically derived column name.
asbc <- data.frame(
    word = word.sample,
    chunk = cut(seq_along(word.sample), breaks = 40, labels = FALSE)
)
asbc$DE <- asbc$word == "的(DE)"
countofDE <- tapply(asbc$DE, asbc$chunk, sum)
truehist(countofDE)

plot of chunk unnamed-chunk-5


# Take the first 5,000 tokens from the full corpus (rebuilt from word.list)
# rather than from `word.vector`, which an earlier chunk may have shortened:
# indexing past the end of a shorter vector pads the sample with NA, and the
# chunks that contain NA are then silently dropped by shapiro.test().
word.vector <- head(unlist(word.list), 5000)
len <- length(word.vector)
# Test prefixes of 1/10, 2/10, ..., 10/10 of the sample.
for (i in seq(len / 10, len, by = len / 10)) {
    a(word.vector[seq_len(floor(i))])
}

## [1] 500
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.3994, p-value = 1.321e-11
## 
## [1] 1000
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.7528, p-value = 8.067e-07
## 
## [1] 1500
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.8472, p-value = 7.683e-05
## 
## [1] 2000
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.8419, p-value = 0.0005215
## 
## [1] 2500
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.8768, p-value = 0.00718
## 
## [1] 3000
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.8712, p-value = 0.01234
## 
## [1] 3500
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.8668, p-value = 0.01952
## 
## [1] 4000
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.871, p-value = 0.03494
## 
## [1] 4500
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.8189, p-value = 0.01153
## 
## [1] 5000
## 
##  Shapiro-Wilk normality test
## 
## data:  countofDE 
## W = 0.9036, p-value = 0.1766

In conclusion, the frequency of a specific word (e.g., 的(DE)) across the chunks of a Chinese balanced corpus with more than 2,000 words appears to be normally distributed.
In the future, 10-fold cross-validation of the corpus and tests on other words should be performed.
If this holds, normality could be regarded as one of the criteria for a good-enough corpus.