Capstone: Milestone Report

Todd Rimes

6/13/2018

Week 2 Milestone Report

Load the three data sets, sample and combine the data, and finally calculate and plot the frequency distributions of individual words (unigrams), two-word combinations (bigrams), and three-word combinations (trigrams).

  1. Load the required libraries and set up parallel processing
# load the required libraries
library(NLP) # for n-grams
library(tm) # for VCorpus
library(RWeka) # for NGramTokenizer
library(ggplot2) # for histograms
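The step title above mentions parallel processing, but the chunk only loads libraries. A minimal sketch of what that setup might look like, assuming the doParallel backend (the backend and core count are assumptions, not shown in the original):

# set up parallel processing (assumed backend: doParallel)
library(parallel)   # for detectCores and makeCluster
library(doParallel) # for registerDoParallel

cores <- max(1, detectCores() - 1) # leave one core free for the OS
cluster <- makeCluster(cores)
registerDoParallel(cluster)
# ... run the heavy tokenization work ...
# stopCluster(cluster) # release the workers when finished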
Count the number of lines in the source text files
$ wc -l *.txt
  899288 en_US.blogs.txt
 1010242 en_US.news.txt
 2360148 en_US.twitter.txt
 4269678 total
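The same counts can be reproduced from inside R, avoiding the shell dependency; a minimal sketch (paths assumed to match the readLines() calls below; note readLines() is slow on files this large):

# count the lines of each source file from R (sketch)
files <- c("./final/en_US/en_US.blogs.txt",
           "./final/en_US/en_US.twitter.txt",
           "./final/en_US/en_US.news.txt")
sapply(files, function(f) length(readLines(f, skipNul = TRUE)))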
  2. Create a “sample corpus” from the three original files: Twitter, blogs, and news.
# initialize the sample sizes so they exist even when the cached corpus is loaded from disk
blogSampleSize <- 0
twitterSampleSize <- 0
newsSampleSize <- 0

# If the previously combined sample data exists on disk, load it from disk
if(file.exists("inMemory.rda")) {
        ## load the cached sample corpus
        load("inMemory.rda")
} else {
        # The combined sample data does not exist, so create it now
        blogText <- readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
        twitterText <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
        newsText <- readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
        
        # sample 0.1% of the lines in each original file
        sample_pct <- 0.001

        blogSampleSize <- round(length(blogText) * sample_pct, 0)
        blogSample <- blogText[sample(length(blogText), blogSampleSize)]

        twitterSampleSize <- round(length(twitterText) * sample_pct, 0)
        twitterSample <- twitterText[sample(length(twitterText), twitterSampleSize)]

        newsSampleSize <- round(length(newsText) * sample_pct, 0)
        newsSample <- newsText[sample(length(newsText), newsSampleSize)]
        
        # remove the large source texts from memory
        rm(blogText,twitterText,newsText)
        
        # inspect a few lines of each sample
        head(blogSample)
        head(twitterSample)
        head(newsSample)
        
        # Combine the three sample vectors into one
        allSample <- c(blogSample, twitterSample, newsSample)
        # build an in-memory corpus from the combined sample, then clean the text
        inMemory <- VCorpus(VectorSource(allSample))
        # lowercase
        inMemory <- tm_map(inMemory, content_transformer(tolower))
        # remove punctuation
        inMemory <- tm_map(inMemory, removePunctuation)
        # remove numbers
        inMemory <- tm_map(inMemory, removeNumbers)
        # remove 1+ whitespace
        inMemory <- tm_map(inMemory, stripWhitespace)
        # save to disk
        save(inMemory, file="inMemory.rda")
}
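The counts printed below come from print() calls that do not appear in the chunk above; a sketch of what they likely were (note that blogSampleSize and the other sizes are only computed on the first run, when the samples are built from scratch):

print(paste("Blog sample count is ", blogSampleSize))
print(paste("Twitter sample count is ", twitterSampleSize))
print(paste("News sample count is ", newsSampleSize))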
## [1] "Blog sample count is  899"
## [1] "Twitter sample count is  2360"
## [1] "News sample count is  1010"
  3. Calculate and plot the one-word n-grams, AKA unigrams
if(file.exists("oneWordMatrix.rda")) {
        ## load the cached unigram matrix
        load("oneWordMatrix.rda")
} else {
        oneWordTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
        oneWordMatrix <- TermDocumentMatrix(inMemory, control = list(tokenize = oneWordTokenizer))
        save(oneWordMatrix, file="oneWordMatrix.rda")
}
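Before filtering for frequent terms, it can help to sanity-check the matrix; a small sketch using tm's accessors:

# sanity-check the term-document matrix (sketch)
dim(oneWordMatrix)               # terms x documents
inspect(oneWordMatrix[1:5, 1:5]) # a small corner of the matrix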

if(file.exists("topTerms1.rda")) {
        ## load the cached unigram frequency data
        load("topTerms1.rda")
        load("wordCounts1.rda")
} else {
        topTerms1 <- findFreqTerms(oneWordMatrix, lowfreq = 300)
        save(topTerms1, file="topTerms1.rda")
        wordCounts1 <- rowSums(as.matrix(oneWordMatrix[topTerms1,]))
        wordCounts1 <- data.frame(unigram=names(wordCounts1), frequency=wordCounts1)
        save(wordCounts1, file="wordCounts1.rda")
}

g1 <- ggplot(wordCounts1, aes(x=reorder(unigram, frequency), y=frequency)) +
    geom_bar(stat = "identity") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Unigram") + ylab("Frequency") +
    labs(title = "Top unigrams by frequency")
print(g1)
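To see the exact numbers behind the plot, the frequency table can be sorted directly; a small usage sketch (the same pattern applies to wordCounts2 and wordCounts3 below):

# top 10 unigrams by raw count (sketch)
head(wordCounts1[order(-wordCounts1$frequency), ], 10)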

  4. Calculate and plot the two-word n-grams, AKA bigrams
if(file.exists("twoWordMatrix.rda")) {
        ## load the cached bigram matrix
        load("twoWordMatrix.rda")
} else {
        twoWordTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
        twoWordMatrix <- TermDocumentMatrix(inMemory, control = list(tokenize = twoWordTokenizer))
        save(twoWordMatrix, file="twoWordMatrix.rda")
}

if(file.exists("topTerms2.rda")) {
        ## load the cached bigram frequency data
        load("topTerms2.rda")
        load("wordCounts2.rda")
} else {
        topTerms2 <- findFreqTerms(twoWordMatrix, lowfreq = 125)
        save(topTerms2, file="topTerms2.rda")
        wordCounts2 <- rowSums(as.matrix(twoWordMatrix[topTerms2,]))
        wordCounts2 <- data.frame(bigram=names(wordCounts2), frequency=wordCounts2)
        save(wordCounts2, file="wordCounts2.rda")
}

g2 <- ggplot(wordCounts2, aes(x=reorder(bigram, frequency), y=frequency)) +
    geom_bar(stat = "identity") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Bigram") + ylab("Frequency") +
    labs(title = "Top bigrams by frequency")
print(g2)

  5. Calculate and plot the three-word n-grams, AKA trigrams
if(file.exists("threeWordMatrix.rda")) {
        ## load the cached trigram matrix
        load("threeWordMatrix.rda")
} else {
        threeWordTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
        threeWordMatrix <- TermDocumentMatrix(inMemory, control = list(tokenize = threeWordTokenizer))
        save(threeWordMatrix, file="threeWordMatrix.rda")
}

if(file.exists("topTerms3.rda")) {
        ## load the cached trigram frequency data
        load("topTerms3.rda")
        load("wordCounts3.rda")
} else {
        topTerms3 <- findFreqTerms(threeWordMatrix, lowfreq = 20)
        save(topTerms3, file="topTerms3.rda")
        wordCounts3 <- rowSums(as.matrix(threeWordMatrix[topTerms3,]))
        wordCounts3 <- data.frame(trigram=names(wordCounts3), frequency=wordCounts3)
        save(wordCounts3, file="wordCounts3.rda")
}

g3 <- ggplot(wordCounts3, aes(x=reorder(trigram, frequency), y=frequency)) +
    geom_bar(stat = "identity") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Trigram") + ylab("Frequency") +
    labs(title = "Top trigrams by frequency")
print(g3)