Analyzing Linguistic Data: Chapter 3

29/05/2013

library(languageR)

The text of Lewis Carroll’s Alice’s Adventures in Wonderland is available as the data set alice. The vector alice contains all words (defined as sequences of non-space characters) in this novel.

# Fold every token to lower case so later string comparisons ignore
# capitalisation, then show the first five tokens.
alice <- tolower(alice)
alice[seq_len(5)]

## [1] "alice"      "s"          "adventures" "in"         "wonderland"

In this exercise, we study the distribution of three words in this book, Alice, very, and Hare (the second noun of the collocation March Hare). Our goal is to partition this text into 40 equal-sized text chunks, and to study the frequencies with which our three target words occur in these 40 chunks.

We therefore restrict ourselves to the first 27240 tokens, and use cut() to partition the sequence of tokens into 40 equally sized chunks. The output of cut() is a factor with as levels the successive equal-sized chunks of data. For each element in its input vector, i.e. for each word, it specifies the chunk to which that word belongs. We combine the words and the information about their chunks into a data frame with the function data.frame():

# Size of the sample and number of chunks, named once instead of repeating
# the literals (27240 = 40 * 681).
nTokens <- 27240
nChunks <- 40
# One row per token: the token itself and the index of the chunk it falls in.
# labels = FALSE makes cut() return integer chunk codes rather than a factor
# with interval labels; FALSE is spelled out because F can be reassigned.
wonderland <- data.frame(word = alice[seq_len(nTokens)],
    chunk = cut(seq_len(nTokens), breaks = nChunks, labels = FALSE))
wonderland[1:5, ]

##         word chunk
## 1      alice     1
## 2          s     1
## 3 adventures     1
## 4         in     1
## 5 wonderland     1

We now add a vector of truth values to this data frame to indicate which rows contain the exact string “alice”:

# Logical column: TRUE on the rows whose token is exactly "alice".
wonderland[["alice"]] <- wonderland[["word"]] == "alice"
wonderland[seq_len(5), ]

##         word chunk alice
## 1      alice     1  TRUE
## 2          s     1 FALSE
## 3 adventures     1 FALSE
## 4         in     1 FALSE
## 5 wonderland     1 FALSE

We count how often the word Alice occurs in each chunk:

# Summing the logical flags within each chunk counts the "alice" tokens there.
countOfAlice <- tapply(X = wonderland$alice, INDEX = wonderland$chunk, FUN = sum)
countOfAlice

##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
## 10  7 10  9  4 10  8  8 12  6  9  8  8 14  9 11  6 11 11 15 13 13 18 10 10 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 
## 13 12  9 15 14 17  9 13  7  8  3  7 10  4  7

Finally, we make a frequency table of these counts with xtabs():

# Tabulate the per-chunk counts: for each observed count, the number of
# chunks in which "alice" occurs that many times.
countOfAlice.tab <- xtabs(~countOfAlice)
countOfAlice.tab

## countOfAlice
##  3  4  6  7  8  9 10 11 12 13 14 15 17 18 
##  1  2  2  4  5  5  6  3  2  4  2  2  1  1

1. Create similar tables for the words hare and very.

# hare: flag the tokens equal to "hare", then total the flags per chunk
wonderland[["hare"]] <- wonderland[["word"]] == "hare"
countOfhare <- tapply(X = wonderland[["hare"]], INDEX = wonderland[["chunk"]],
    FUN = sum)
countOfhare

##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3  8  6  6  2  0 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 
##  0  0  0  0  0  0  0  0  0  0  5  0  0  0  1

# For each observed count, the number of chunks in which "hare" occurs that
# many times.
countOfhare.tab <- xtabs(~countOfhare)
countOfhare.tab

## countOfhare
##  0  1  2  3  5  6  8 
## 33  1  1  1  1  2  1

# very: flag the tokens equal to "very", then total the flags per chunk
wonderland[["very"]] <- wonderland[["word"]] == "very"
countOfvery <- tapply(X = wonderland[["very"]], INDEX = wonderland[["chunk"]],
    FUN = sum)
countOfvery

##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  9  3  9  2  4  3  2  2  4  3  1  0  5  9  3  2  3  3  2  3  5  1  5  5  4 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 
##  7  3  7  2  5  1  6  4  1  5  3  3  4  0  1

# For each observed count, the number of chunks in which "very" occurs that
# many times.
countOfvery.tab <- xtabs(~countOfvery)
countOfvery.tab

## countOfvery
##  0  1  2  3  4  5  6  7  9 
##  2  5  6 10  5  6  1  2  3

2. Make a plot that displays by means of high-density lines how often Alice occurs in the successive chunks. Make similar plots for very and hare. What do you see?

# Three stacked panels, one per word: a vertical line per chunk whose height
# is the number of occurrences in that chunk.
par(mfrow = c(3, 1))
# names() of the count vectors are character chunk labels; convert them to
# numbers explicitly instead of relying on plot() to coerce strings.
plot(as.numeric(names(countOfAlice)), countOfAlice, type = "h", xlab = "chunk",
    ylab = "frequency", main = "alice")
plot(as.numeric(names(countOfhare)), countOfhare, type = "h", xlab = "chunk",
    ylab = "frequency", main = "hare")
plot(as.numeric(names(countOfvery)), countOfvery, type = "h", xlab = "chunk",
    ylab = "frequency", main = "very")

plot of chunk unnamed-chunk-8

3. Make a plot with the number of times Alice occurs in the chunks on the horizontal axis (i.e. as.numeric(names(alice.tab))), and with the proportion of chunks with that count on the vertical axis. Use high-density lines. Make similar sample density plots for very and for hare.

# Proportion of chunks showing each count of "alice". The denominator is the
# number of chunks; length() is used because it works whether the counts are
# held in a plain vector or in a 1-d array.
alice.probs <- xtabs(~countOfAlice) / length(countOfAlice)
alice.probs

## countOfAlice
##     3     4     6     7     8     9    10    11    12    13    14    15 
## 0.025 0.050 0.050 0.100 0.125 0.125 0.150 0.075 0.050 0.100 0.050 0.050 
##    17    18 
## 0.025 0.025

# Proportion of chunks showing each count of "hare". The denominator is the
# number of chunks; length() is used because it works whether the counts are
# held in a plain vector or in a 1-d array.
hare.probs <- xtabs(~countOfhare) / length(countOfhare)
hare.probs

## countOfhare
##     0     1     2     3     5     6     8 
## 0.825 0.025 0.025 0.025 0.025 0.050 0.025

# Proportion of chunks showing each count of "very". The denominator is the
# number of chunks; length() is used because it works whether the counts are
# held in a plain vector or in a 1-d array.
very.probs <- xtabs(~countOfvery) / length(countOfvery)
very.probs

## countOfvery
##     0     1     2     3     4     5     6     7     9 
## 0.050 0.125 0.150 0.250 0.125 0.150 0.025 0.050 0.075


# Sample densities, one panel per word: x is the number of occurrences in a
# chunk, y is the proportion of chunks with that number of occurrences.
par(mfrow = c(3, 1))
plot(as.numeric(names(alice.probs)), alice.probs, type = "h", xlab = "occurrences",
    ylab = "relative frequencies", main = "alice")
plot(as.numeric(names(hare.probs)), hare.probs, type = "h", xlab = "occurrences",
    ylab = "relative frequencies", main = "hare")
plot(as.numeric(names(very.probs)), very.probs, type = "h", xlab = "occurrences",
    ylab = "relative frequencies", main = "very")

plot of chunk unnamed-chunk-9

4. Also plot the corresponding densities under the assumption that these words follow a Poisson distribution with an estimated rate parameter λ equal to the mean of the counts in the chunks. Compare the Poisson densities with the sample densities.

# Poisson rate estimate for "alice": the mean number of occurrences per chunk.
alice.lambda <- mean(countOfAlice)
alice.lambda

## [1] 9.95


# Top panel: observed proportion of chunks per count of "alice".
# Bottom panel: Poisson probability of those same counts, with the rate set to
# alice.lambda. Both y axes are proportions/probabilities, not numbers of
# chunks, and are labelled accordingly.
par(mfrow = c(2, 1))
aliceCounts <- as.numeric(names(alice.probs))
plot(aliceCounts, alice.probs, type = "h", xlab = "counts",
    ylab = "proportion of chunks", main = "alice: observations")
plot(aliceCounts, dpois(aliceCounts, alice.lambda), type = "h", xlab = "counts",
    ylab = "probability", main = paste0("Poisson(", alice.lambda, ")"))

plot of chunk unnamed-chunk-10

# Poisson rate estimate for "hare": the mean number of occurrences per chunk.
hare.lambda <- mean(countOfhare)
hare.lambda

## [1] 0.775


# Top panel: observed proportion of chunks per count of "hare".
# Bottom panel: Poisson probability of those same counts, with the rate set to
# hare.lambda. Both y axes are proportions/probabilities, not numbers of
# chunks, and are labelled accordingly.
par(mfrow = c(2, 1))
hareCounts <- as.numeric(names(hare.probs))
plot(hareCounts, hare.probs, type = "h", xlab = "counts",
    ylab = "proportion of chunks", main = "hare: observations")
plot(hareCounts, dpois(hareCounts, hare.lambda), type = "h", xlab = "counts",
    ylab = "probability", main = paste0("Poisson(", hare.lambda, ")"))

plot of chunk unnamed-chunk-11

# Poisson rate estimate for "very": the mean number of occurrences per chunk.
very.lambda <- mean(countOfvery)
very.lambda

## [1] 3.6


# Top panel: observed proportion of chunks per count of "very".
# Bottom panel: Poisson probability of those same counts, with the rate set to
# very.lambda. Both y axes are proportions/probabilities, not numbers of
# chunks, and are labelled accordingly.
par(mfrow = c(2, 1))
veryCounts <- as.numeric(names(very.probs))
plot(veryCounts, very.probs, type = "h", xlab = "counts",
    ylab = "proportion of chunks", main = "very: observations")
plot(veryCounts, dpois(veryCounts, very.lambda), type = "h", xlab = "counts",
    ylab = "probability", main = paste0("Poisson(", very.lambda, ")"))

plot of chunk unnamed-chunk-12

5. Make quantile-quantile plots for graphical inspection of whether Alice, very, and hare might follow a Poisson distribution. First create the vector of theoretical quantiles for the X -coordinates, using as percentage points 5%, 10%, 15%, … , 100%. Supply the percentage points as a vector of proportions as first argument to qpois(). The second argument is λ, estimated by the mean count. The sample quantiles are obtained with quantile().

# Percentage points 5%, 10%, ..., 100% as proportions, as the exercise asks
# (the grid previously started at 0%, adding a point that was not requested).
# Note: qpois(1, lambda) is Inf, so the 100% point has no finite x coordinate.
qnts <- seq(0.05, 1, by = 0.05)
par(mfrow = c(1, 1))
# Q-Q plot for "alice": theoretical Poisson quantiles on x, sample quantiles
# of the per-chunk counts on y.
plot(qpois(qnts, alice.lambda), quantile(countOfAlice, qnts),
    xlab = paste0("quantiles for Poisson(", alice.lambda, ")"),
    ylab = "sample quantiles", main = "alice")

plot of chunk unnamed-chunk-13


# Q-Q plot for "hare": Poisson quantiles at the probabilities in qnts on x,
# sample quantiles of the per-chunk counts on y.
hareTheoretical <- qpois(qnts, hare.lambda)
hareObserved <- quantile(countOfhare, qnts)
plot(hareTheoretical, hareObserved,
    xlab = paste0("quantiles for Poisson(", hare.lambda, ")"),
    ylab = "sample quantiles", main = "hare")

plot of chunk unnamed-chunk-13

# Q-Q plot for "very": Poisson quantiles at the probabilities in qnts on x,
# sample quantiles of the per-chunk counts on y.
veryTheoretical <- qpois(qnts, very.lambda)
veryObserved <- quantile(countOfvery, qnts)
plot(veryTheoretical, veryObserved,
    xlab = paste0("quantiles for Poisson(", very.lambda, ")"),
    ylab = "sample quantiles", main = "very")

plot of chunk unnamed-chunk-13

6. The mean count of Alice is 9.95. In chunk 39, Alice is observed only 4 times. Suppose we only have this chunk of text available. Calculate the likelihood of observing Alice more than 10 times in another chunk of similar size. Assume that Alice follows a Poisson distribution.

# P(X > 10) for X ~ Poisson(4), with the rate taken from the single chunk in
# which Alice occurs 4 times. lower.tail = FALSE asks for the upper tail
# directly instead of subtracting a value close to 1 from 1.
ppois(10, lambda = 4, lower.tail = FALSE)

## [1] 0.00284

Recalculate this probability on the basis of the mean count, and compare the expected number of chunks in which Alice occurs more than 10 times with the actual number of chunks.

# P(X > 10) for X ~ Poisson(alice.lambda), i.e. with the rate estimated from
# the mean count over all chunks. lower.tail = FALSE asks for the upper tail
# directly instead of computing 1 minus the lower tail.
ppois(10, lambda = alice.lambda, lower.tail = FALSE)

## [1] 0.4107