Capstone: Milestone Report

Todd Rimes

6/13/2018

Week 2 Milestone Report

Load the three data sets, sample and combine the data, and finally calculate and plot the frequency distributions of individual words (unigrams), two-word combinations (bigrams), and three-word combinations (trigrams).

  1. Load the required libraries and set up parallel processing
# load the required libraries
library(NLP) # for n-grams
library(tm) # for VCorpus
library(RWeka) # for NGramTokenizer
library(ggplot2) # for histograms
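The step title above mentions parallel processing, but the chunk only loads libraries. A minimal sketch of what that setup might look like, assuming the doParallel backend (the backend and core count are assumptions, not shown in the original):

# set up parallel processing (assumed backend: doParallel)
library(parallel)   # for detectCores and makeCluster
library(doParallel) # for registerDoParallel

cores <- max(1, detectCores() - 1) # leave one core free for the OS
cluster <- makeCluster(cores)
registerDoParallel(cluster)
# ... run the heavy tokenization work ...
# stopCluster(cluster) # release the workers when finished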
Count the number of lines in the source text files
$ wc -l *.txt
  899288 en_US.blogs.txt
 1010242 en_US.news.txt
 2360148 en_US.twitter.txt
 4269678 total
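The same counts can be reproduced from inside R, avoiding the shell dependency; a minimal sketch (paths assumed to match the readLines() calls below; note readLines() is slow on files this large):

# count the lines of each source file from R (sketch)
files <- c("./final/en_US/en_US.blogs.txt",
           "./final/en_US/en_US.twitter.txt",
           "./final/en_US/en_US.news.txt")
sapply(files, function(f) length(readLines(f, skipNul = TRUE)))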
  2. Create a “sample corpus” from the three original files: Twitter, blogs, and news.
# initialize the sample sizes so they exist even when the cached corpus is loaded from disk
blogSampleSize <- 0
twitterSampleSize <- 0
newsSampleSize <- 0

# If the previously combined sample data exists on disk, load it from disk
if(file.exists("inMemory.rda")) {
        ## load the cached sample corpus
        load("inMemory.rda")
} else {
        # The combined sample data does not exist, so create it now
        blogText <- readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
        twitterText <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
        newsText <- readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
        
        # sample 0.1% of the lines in each original file
        sample_pct <- 0.001

        blogSampleSize <- round(length(blogText) * sample_pct, 0)
        blogSample <- blogText[sample(length(blogText), blogSampleSize)]

        twitterSampleSize <- round(length(twitterText) * sample_pct, 0)
        twitterSample <- twitterText[sample(length(twitterText), twitterSampleSize)]

        newsSampleSize <- round(length(newsText) * sample_pct, 0)
        newsSample <- newsText[sample(length(newsText), newsSampleSize)]
        
        # remove the large source texts from memory
        rm(blogText,twitterText,newsText)
        
        # inspect a few lines of each sample
        head(blogSample)
        head(twitterSample)
        head(newsSample)
        
        # Combine the three sample vectors into one
        allSample <- c(blogSample, twitterSample, newsSample)
        # build an in-memory corpus from the combined sample, then clean the text
        inMemory <- VCorpus(VectorSource(allSample))
        # lowercase
        inMemory <- tm_map(inMemory, content_transformer(tolower))
        # remove punctuation
        inMemory <- tm_map(inMemory, removePunctuation)
        # remove numbers
        inMemory <- tm_map(inMemory, removeNumbers)
        # remove 1+ whitespace
        inMemory <- tm_map(inMemory, stripWhitespace)
        # save to disk
        save(inMemory, file="inMemory.rda")
}
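The counts printed below come from print() calls that do not appear in the chunk above; a sketch of what they likely were (note that blogSampleSize and the other sizes are only computed on the first run, when the samples are built from scratch):

print(paste("Blog sample count is ", blogSampleSize))
print(paste("Twitter sample count is ", twitterSampleSize))
print(paste("News sample count is ", newsSampleSize))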
## [1] "Blog sample count is  899"
## [1] "Twitter sample count is  2360"
## [1] "News sample count is  1010"
  3. Calculate and plot the one-word n-grams, AKA unigrams
if(file.exists("oneWordMatrix.rda")) {
        ## load the cached unigram matrix
        load("oneWordMatrix.rda")
} else {
        oneWordTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
        oneWordMatrix <- TermDocumentMatrix(inMemory, control = list(tokenize = oneWordTokenizer))
        save(oneWordMatrix, file="oneWordMatrix.rda")
}
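Before filtering for frequent terms, it can help to sanity-check the matrix; a small sketch using tm's accessors:

# sanity-check the term-document matrix (sketch)
dim(oneWordMatrix)               # terms x documents
inspect(oneWordMatrix[1:5, 1:5]) # a small corner of the matrix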

if(file.exists("topTerms1.rda")) {
        ## load the cached unigram frequency data
        load("topTerms1.rda")
        load("wordCounts1.rda")
} else {
        topTerms1 <- findFreqTerms(oneWordMatrix, lowfreq = 300)
        save(topTerms1, file="topTerms1.rda")
        wordCounts1 <- rowSums(as.matrix(oneWordMatrix[topTerms1,]))
        wordCounts1 <- data.frame(unigram=names(wordCounts1), frequency=wordCounts1)
        save(wordCounts1, file="wordCounts1.rda")
}

g1 <- ggplot(wordCounts1, aes(x=reorder(unigram, frequency), y=frequency)) +
    geom_bar(stat = "identity") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Unigram") + ylab("Frequency") +
    labs(title = "Top unigrams by frequency")
print(g1)
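To see the exact numbers behind the plot, the frequency table can be sorted directly; a small usage sketch (the same pattern applies to wordCounts2 and wordCounts3 below):

# top 10 unigrams by raw count (sketch)
head(wordCounts1[order(-wordCounts1$frequency), ], 10)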

  4. Calculate and plot the two-word n-grams, AKA bigrams
if(file.exists("twoWordMatrix.rda")) {
        ## load the cached bigram matrix
        load("twoWordMatrix.rda")
} else {
        twoWordTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
        twoWordMatrix <- TermDocumentMatrix(inMemory, control = list(tokenize = twoWordTokenizer))
        save(twoWordMatrix, file="twoWordMatrix.rda")
}

if(file.exists("topTerms2.rda")) {
        ## load the cached bigram frequency data
        load("topTerms2.rda")
        load("wordCounts2.rda")
} else {
        topTerms2 <- findFreqTerms(twoWordMatrix, lowfreq = 125)
        save(topTerms2, file="topTerms2.rda")
        wordCounts2 <- rowSums(as.matrix(twoWordMatrix[topTerms2,]))
        wordCounts2 <- data.frame(bigram=names(wordCounts2), frequency=wordCounts2)
        save(wordCounts2, file="wordCounts2.rda")
}

g2 <- ggplot(wordCounts2, aes(x=reorder(bigram, frequency), y=frequency)) +
    geom_bar(stat = "identity") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Bigram") + ylab("Frequency") +
    labs(title = "Top bigrams by frequency")
print(g2)

  5. Calculate and plot the three-word n-grams, AKA trigrams
if(file.exists("threeWordMatrix.rda")) {
        ## load the cached trigram matrix
        load("threeWordMatrix.rda")
} else {
        threeWordTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
        threeWordMatrix <- TermDocumentMatrix(inMemory, control = list(tokenize = threeWordTokenizer))
        save(threeWordMatrix, file="threeWordMatrix.rda")
}

if(file.exists("topTerms3.rda")) {
        ## load the cached trigram frequency data
        load("topTerms3.rda")
        load("wordCounts3.rda")
} else {
        topTerms3 <- findFreqTerms(threeWordMatrix, lowfreq = 20)
        save(topTerms3, file="topTerms3.rda")
        wordCounts3 <- rowSums(as.matrix(threeWordMatrix[topTerms3,]))
        wordCounts3 <- data.frame(trigram=names(wordCounts3), frequency=wordCounts3)
        save(wordCounts3, file="wordCounts3.rda")
}

g3 <- ggplot(wordCounts3, aes(x=reorder(trigram, frequency), y=frequency)) +
    geom_bar(stat = "identity") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Trigram") + ylab("Frequency") +
    labs(title = "Top trigrams by frequency")
print(g3)