Overview

This report documents the initial exploratory analysis done on an English language corpus of documents from twitter, from blogs and from the news. It uses the tm package in R along with RWeka to pull out n-gram frequencies in the corpus in preparation for building an n-gram language model for the purposes of a predictive text application.

The original data files were extremely large (around 200M each) so in order to have a set of documents that my limited computing resources could handle, I sampled 10% from each file. The code for reading the data and extracting these samples is in the appendix.

From the sampled data, I extract a corpus of English language documents and preprocess it, including removing profanites. The profanities list used here came from http://www.bannedwordlist.com/.

en_corpus <- extract_corpus("", "en", "en_US")

# Does some preprocessing on the corpus to remove punctuation,
# remove profanities, convert all characters to lowercase, etc.
preProcessCorpus <- function(raw_corpus) {
  docs <- tm_map(raw_corpus, content_transformer(removePunctuation))
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, content_transformer(removeNumbers))
  badwords <- get_bad_words("swearWords.txt")
  docs <- tm_map(docs, removeWords, badwords)
  docs <- tm_map(docs, content_transformer(stripWhitespace))
  docs <- tm_map(docs, PlainTextDocument)
  docs
}

# Extracts the swear words from the specified file into
# a character vector.
get_bad_words <- function(wordsfile) {
  con <- file(wordsfile, "r")
  i = 1
  words = character()
  while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
    words <- c(words, oneLine)
  }
  close(con) 
  words
}

en_corpus <- preProcessCorpus(en_corpus)

Now I can create a document-term matrix to get a feel for the word frequencies.

dtm <- DocumentTermMatrix(en_corpus)
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
wf <- data.frame(word=names(freq), freq=freq)
stopwords <- stopwords("english")
wf <- filter(wf, !word %in% stopwords)

Here’s a word cloud of the top 500 most frequent words in the corpus. As we can see, it is all everyday language - this is not a specialized corpus.

Frequency Distribution

I had some difficulty using RWeka’s NGramTokenizer on a corpus containing multiple documents, so in order to obtain unigram, bigram and trigram frequencies, I wrote a function that pulls out these frequencies one by one from each document and collates them.

# Define some tokenizers using RWeka's NgramTokenizer.
UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# Returns a data frame of ngram frequencies, given the
# specified corpus and tokenizer.
getFrequencies <- function(corpus, tokenizer) {
  for (i in 1:length(corpus)) {
    tf <- termFreq(corpus[[i]], control = list(tokenize = tokenizer))
    wf <- data.frame(word=as.character(names(tf)), freq=as.numeric(tf))
    rownames(wf) <- names(tf)
    if (i == 1) {
      main_wf <- wf
    }
    else {
      intersection <- intersect(wf$word, main_wf$word)
      main_wf[intersection,]$freq <- main_wf[intersection,]$freq + wf[intersection,]$freq
      difference <- setdiff(wf$word, main_wf$word)
      new_words <- as.character(wf[difference,]$word)
      new_freqs <- as.numeric(wf[difference,]$freq)
      newdf <- data.frame(word = c(main_wf$word, new_words), freq = c(main_wf$freq, new_freqs))
      rownames(newdf) <- newdf$word
      main_df = newdf
    }
  }
  main_wf
}

unigram_freqs <- getFrequencies(en_corpus, UnigramTokenizer)

The distribution of unigram frequencies looks almost exponential. In fact, plotting the frequencies themselves results in a very unbalanced plot so here I am plotting a histogram of the log of the unigram frequencies.

qplot(log(unigram_freqs$freq), geom="histogram", binwidth=0.5, xlab="Log of unigram frequency")

As we can see, the vast majority of the words occur very infrequently and as we get to higher word frequencies we see there are fewer words at this end of the scale. This stands to reason. There are many many words that occur only once in the entire corpus. I decided to remove these as this is a good sign that they are either not real words, or words in a foreign language.

unigram_freqs_filtered <- filter(unigram_freqs, freq > 1)
qplot(log(unigram_freqs_filtered$freq), geom="histogram", binwidth=0.5, xlab="Log of unigram frequency")

Question: How many unique words cover 50% of the instances?

In order to answer this question I wrote a function that takes a data frame of word frequencies and the desire fraction of instances to cover.

# Returns the fraction of the dictionary of words need to cover
# the specified fraction of the word instances in the corpus.
fractionOfDictToCoverFractionOfInstances <- function(word_freqs, fraction) {
  if (fraction > 1) {
    fraction <- 1
  }
  if (fraction < 0) {
    fraction <- .01
  }
  dict_length <- nrow(word_freqs)
  sorted_unigrams <- arrange(word_freqs, desc(freq))
  num_word_instances <- sum(sorted_unigrams$freq)
  coverage <- 0
  count <- 1
  while(coverage < fraction) {
    sumToCount <- sum(sorted_unigrams[1:count,]$freq)
    coverage <- sumToCount / num_word_instances
    count <- count+1
  }
  count / dict_length
}

fractionOfDictToCoverFractionOfInstances(unigram_freqs, .5)
## [1] 0.002726993

So it takes only a tiny fraction of all the words in the dictionay to account for 50% of the word instances found in the corpus. We can get a better feeling for this by plotting the cumulative coverage of the unigrams as ordered by frequency. For this plot I am using the filtered list, i.e. excluding unigrams that occur only once:

sorted_unigrams <- arrange(unigram_freqs_filtered, desc(freq))
sorted_unigrams$cum_freq <- cumsum(sorted_unigrams$freq)
sorted_unigrams$cum_coverage <- sorted_unigrams$cum_freq / sum(sorted_unigrams$freq)
sorted_unigrams$ID<-seq.int(nrow(sorted_unigrams))
qplot(sorted_unigrams$ID, sorted_unigrams$cum_coverage, xlab="Unigram index (sorted by frequency)", ylab="Cumulative fraction of total instances covered")

This plot shows the coverage getting closer and closer to 1 (or 100% coverage of the word instances) as it accumlates the coverage provided by each unigram. Since they are ordered highest to lowest frequency, we get close to full coverage very early on and after a while each subsequent unigram adds very little coverage.

Now let’s have a look at bigram frequencies.

bigram_freqs <- getFrequencies(en_corpus, BigramTokenizer)
trigram_freqs <- getFrequencies(en_corpus, TrigramTokenizer)
plt1 <- qplot(log(bigram_freqs$freq), geom="histogram", binwidth=0.5, xlab="Log of bigram frequency")
plt2 <- qplot(log(trigram_freqs$freq), geom="histogram", binwidth=0.5, xlab="Log of trigram frequency")
grid.arrange(plt1, plt2, ncol=2)

From these plots it looks like the frequencies drop more rapidly to negligle levels the higher the n-gram. To see if this is correct, let’s plot 4-gram and 5-gram frequencies.

FourgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
FivegramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 5, max = 5))
fourgram_freqs <- getFrequencies(en_corpus, FourgramTokenizer)
fivegram_freqs <- getFrequencies(en_corpus, FivegramTokenizer)
plt3 <- qplot(log(fourgram_freqs$freq), geom="histogram", binwidth=0.5, xlab="Log of 4-gram frequency")
plt4 <- qplot(log(fivegram_freqs$freq), geom="histogram", binwidth=0.5, xlab="Log of 5-gram frequency")
grid.arrange(plt3, plt4, ncol=2)

As expected, the higher the n-gram the lower the probability of finding it within the corpus. This suggests that beyond a certain n-gram length, the value they provide in a probabilistic model declines.

Towards a language model

The unigram, bigram and trigram counts will form the basis of an n-gram language model that can be used to predict the next word a user wants to type, based on the previous word or words. The Markov assumption allows us to generalize the probability of a word given its history by the probability of that word give a very short portion of its history. Whereas unigram counts will only give us the probability of a word without taking previous words into account, bigrams allow us to take one previous word into account, trigrams 2 previous words, etc.

Appendix

The functions for extracting the data into smaller samples.

library(dplyr)
library(plyr)
library(ggplot2)
library(tm)
library(RWeka)
library(gridExtra)
suppressMessages(library(wordcloud))
# Returns the path to either the directory for a particular
# language, or to a specific file in that directory.
get_path_to_data <- function(rootdir, lang, lang_full, version = "final", create_dir_if_not_exists = FALSE, type="") {
  if (nchar(rootdir) > 0 && rootdir != ".") {
    version_dir = paste0(rootdir, "/", version)
  }
  else {
    version_dir = version
  }
  language_dir = paste0(version_dir, "/", lang_full)
  if (create_dir_if_not_exists) {
    dir.create(language_dir, showWarnings = FALSE, recursive = TRUE)
  }
  if (nchar(type) == 0) {
    path <- language_dir
  }
  else {
    path <- paste0(language_dir, "/", lang_full, ".", type, ".txt")
  }
  path
}

# Extracts the files for a particular language into a Corpus.
extract_corpus <- function(rootdir, lang, lang_full) {
  path = get_path_to_data(rootdir, lang, lang_full, "samples")
  src <- DirSource(path)
  Corpus(src, readerControl = list(language=lang))
}

# Given the directory within which the extracted "final" files are,
# goes through the files for the given language and extracts a
# 10% sample into a new file.
extract_samples <- function(rootdir, lang, lang_full, type) {
  input_path = get_path_to_data(rootdir, lang, lang_full, "final", FALSE, type)
  con <- file(input_path, "r")
  i = 1
  lines = character()
  while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
    stripped <- iconv(oneLine, "latin1", "ASCII", sub="")
    # If removing non-ASCII characters meant removing more than
    # half the line, don't use this line.
    if (nchar(stripped) / nchar(oneLine) < 0.5) {
      next
    }
    # We'll sample the text at 10%
    addLine = rbinom(1, 1, 0.1)
    if (addLine) {
      lines <- c(lines, stripped)
    }
  }
  close(con)  
  output_path = get_path_to_data(rootdir, lang, lang_full, "samples", TRUE, type)
  if (!file.exists(output_path)) {
    result <- file.create(output_path)
  }
  con_out  <- file(output_path, open = "w+")
  writeLines(lines, con_out)
  close(con_out)
}