Summary

This document is a concise report describing the major features of three text files and briefly summarizing the plans for building a word-prediction algorithm and Shiny app.

The data was obtained from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. This work uses the three files in the en_US folder, which contain text from blogs, news articles and Twitter.

The final objective is to create an app that takes a text input and, based on the last 1, 2, or 3 words entered, predicts the next word. As a first step toward that goal, this report presents the initial exploratory analysis.

Data Cleaning

Data cleaning is a crucial step in text mining. The steps we followed are described in the subsections below.

Getting the data

First we load the three files in full:

blogs <- readLines('en_US.blogs.txt')
news <- readLines('en_US.news.txt')
twitter <- readLines('en_US.twitter.txt')
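
Depending on the platform and locale, readLines() may warn about embedded nul characters or an incomplete final line in these files. If that happens, a slightly more defensive read is possible (an optional variant, not the one used for the counts below):

blogs   <- readLines('en_US.blogs.txt',   encoding="UTF-8", skipNul=TRUE) # declare UTF-8, skip embedded nuls
news    <- readLines('en_US.news.txt',    encoding="UTF-8", skipNul=TRUE)
twitter <- readLines('en_US.twitter.txt', encoding="UTF-8", skipNul=TRUE)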

Then we gather general information about the loaded files:

library(stringi) # provides stri_count()
blogs_lines <- length(blogs)
blogs_words <- sum(stri_count(blogs, regex="\\S+"))
news_lines <- length(news)
news_words <- sum(stri_count(news, regex="\\S+"))
twitter_lines <- length(twitter)
twitter_words <- sum(stri_count(twitter, regex="\\S+"))
total_lines <- blogs_lines + news_lines + twitter_lines
total_words <- blogs_words + news_words + twitter_words
lines_words <- c(blogs_lines, blogs_words, news_lines, news_words, twitter_lines, twitter_words, total_lines, total_words)
names(lines_words) <- c('blogs_lines', 'blogs_words', 'news_lines', 'news_words', 'twitter_lines', 'twitter_words', 'total_lines', 'total_words')
format(lines_words, decimal.mark=",", big.mark=".",small.mark=".")
##   blogs_lines   blogs_words    news_lines    news_words twitter_lines twitter_words 
## "    899.288" " 37.334.131" "  1.010.242" " 34.372.530" "  2.360.148" " 30.373.543" 
##   total_lines   total_words 
## "  4.269.678" "102.080.204"

In total, across the three files, we have 102.080.204 words and 4.269.678 lines.

Sampling

Since we do not have enough CPU and memory to process all the data, we take a random sample of 1% of the lines from each of the three files.

blogs_ss <- sample(blogs, length(blogs)*0.01, replace=FALSE)       # 1% of the blog lines
news_ss <- sample(news, length(news)*0.01, replace=FALSE)          # 1% of the news lines
twitter_ss <- sample(twitter, length(twitter)*0.01, replace=FALSE) # 1% of the tweets

alldocs <- paste(blogs_ss, news_ss, twitter_ss) # paste() combines the three samples element-wise, recycling the shorter vectors
length(alldocs)
head(alldocs, 1)
## [1] 23601
## [1] "You can contact me back here (spike222@e-mail.ua) LOS ANGELES -- MetLife Inc. will pay nearly $500 million in a settlement involving more than 30 states that claimed it didn't provide life insurance benefits to some of its policyholders, the company said Monday. Blaring Sandwitches in my room with the door closed -Damien Wolfe"

As shown above, the combined sample contains 23.601 lines. Because paste() combines the three samples element-wise, a single sample line can mix text from the blog, news and Twitter sources, as in the example above.

Sentence separation

We then separate the texts into sentences.

library(qdap)
library(doMC)          # provides the parallel backend for %dopar%
registerDoMC(cores=12) # register the available cores

alldocs_sent <- foreach(n = 1:length(alldocs), .combine = c) %dopar% {
  sent_detect(alldocs[n], language = "en", model = NULL) # split each line into sentences
}
length(alldocs_sent)
head(alldocs_sent, 3)
## [1] 156277
## [1] "You can contact me back here LOS ANGELES -- MetLife Inc."                                                                                                                                
## [2] "will pay nearly $500 million in a settlement involving more than 30 states that claimed it didn't provide life insurance benefits to some of its policyholders, the company said Monday."
## [3] "Blaring Sandwitches in my room with the door closed -Damien Wolfe"

Thus, we went from 23.601 lines before sentence separation to 156.277 sentences after.

Tokenization, Profanity Filtering and Other Transformations

We then build our corpus from the sentence-separated text. Using 12 cores of an Amazon instance, we remove profanity (using the word list obtained from http://www.cs.cmu.edu/~biglou/resources/bad-words.txt) and apply other general transformations such as converting to lower case and removing punctuation, numbers and extra whitespace.

library(doMC)
registerDoMC(cores=12)

library(tm)
corpus <- VCorpus(VectorSource(alldocs_sent))

badwords <- readLines("bad-words.txt") # http://www.cs.cmu.edu/~biglou/resources/bad-words.txt

corpus_good <- foreach(n = 1:length(corpus), .combine = c) %dopar% {
  corpus_n <- corpus[n]
  corpus_n <- tm_map(corpus_n, content_transformer(tolower)) # Transform to lower case first so that the
  corpus_n <- tm_map(corpus_n, removeWords, badwords)        # profanity filter matches regardless of capitalization
  corpus_n <- tm_map(corpus_n, removePunctuation)            # Remove punctuation
  corpus_n <- tm_map(corpus_n, removeNumbers)                # Remove digits
  corpus_n <- tm_map(corpus_n, stripWhitespace)              # Strip extra spaces
  corpus_n
}
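
For reference, the same cleaning can be written more compactly by letting tm_map run over the whole corpus in one pass. This is only a sketch of an equivalent formulation; the per-sentence loop above is what was actually used, since it spreads the work across the registered cores:

# Sketch: the same transformations applied to the full corpus at once
corpus_good <- tm_map(corpus, content_transformer(tolower))
corpus_good <- tm_map(corpus_good, removeWords, badwords)
corpus_good <- tm_map(corpus_good, removePunctuation)
corpus_good <- tm_map(corpus_good, removeNumbers)
corpus_good <- tm_map(corpus_good, stripWhitespace)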

Next we tokenize the cleaned corpus into single words (one gram), word pairs (bi gram) and word triples (tri gram).

library(RWeka)

one_gram <- foreach(n = 1:length(corpus_good), .combine = c) %dopar% { # one-grams
  NGramTokenizer(corpus_good[[n]], Weka_control(min=1, max=1, delimiters=" \\r\\n\\t.,;:\"()?!"))
}
bi_gram <- foreach(n = 1:length(corpus_good), .combine = c) %dopar% {  # bi-grams
  NGramTokenizer(corpus_good[[n]], Weka_control(min=2, max=2, delimiters=" \\r\\n\\t.,;:\"()?!"))
}
tri_gram <- foreach(n = 1:length(corpus_good), .combine = c) %dopar% { # tri-grams
  NGramTokenizer(corpus_good[[n]], Weka_control(min=3, max=3, delimiters=" \\r\\n\\t.,;:\"()?!"))
}

Exploratory Analysis

To explore the data, we analyze the three tokenized forms separately: single words (one gram), word pairs (bi gram) and word triples (tri gram). For each, we plot the 10 grams with the highest frequencies and a wordcloud.

One Word

For the one gram, we plot the 10 words with the highest frequencies and a wordcloud.

one_gram <- data.frame(table(one_gram))
one_gram <- one_gram[order(one_gram$Freq, decreasing = TRUE),]
names(one_gram) <- c('OneGram', 'Frequency')

library(ggplot2)
ggplot(one_gram[1:10, ], aes(x=reorder(OneGram, Frequency), y=Frequency)) +
  geom_bar(stat="identity", fill="gray") +
  geom_text(aes(label=Frequency), hjust=-0.1) +
  coord_flip() + xlab("One Gram") + labs(title="Top 10 One Gram")

library(wordcloud)
wordcloud(one_gram$OneGram, one_gram$Frequency, min.freq=500, random.order=FALSE, rot.per=0.2, colors=brewer.pal(6, "Dark2"), use.r.layout=FALSE)

Two Words

For the bi gram, we plot the 10 two-word combinations with the highest frequencies and a wordcloud.

bi_gram <- data.frame(table(bi_gram))
bi_gram <- bi_gram[order(bi_gram$Freq, decreasing = TRUE),]
names(bi_gram) <- c('BiGram', 'Frequency')

library(ggplot2)
ggplot(bi_gram[1:10, ], aes(x=reorder(BiGram, Frequency), y=Frequency)) +
  geom_bar(stat="identity", fill="gray") +
  geom_text(aes(label=Frequency), hjust=-0.1) +
  coord_flip() + xlab("Bi Gram") + labs(title="Top 10 Bi Gram")

library(wordcloud)
wordcloud(bi_gram$BiGram, bi_gram$Frequency, min.freq=400, random.order=FALSE, rot.per=0.2, colors=brewer.pal(6, "Dark2"), use.r.layout=FALSE)

Three Words

For the tri gram, we plot the 10 three-word combinations with the highest frequencies and a wordcloud.

tri_gram <- data.frame(table(tri_gram))
tri_gram <- tri_gram[order(tri_gram$Freq, decreasing = TRUE),]
names(tri_gram) <- c('TriGram', 'Frequency')

library(ggplot2)
ggplot(tri_gram[1:10, ], aes(x=reorder(TriGram, Frequency), y=Frequency)) +
  geom_bar(stat="identity", fill="gray") +
  geom_text(aes(label=Frequency), hjust=-0.1) +
  coord_flip() + xlab("Tri Gram") + labs(title="Top 10 Tri Gram")

library(wordcloud)
wordcloud(tri_gram$TriGram, tri_gram$Frequency, min.freq=200, random.order=FALSE, rot.per=0.2, colors=brewer.pal(6, "Dark2"), use.r.layout=FALSE)

Plans for App

The intended Shiny app will have a text input and, based on the last 1, 2, or 3 words entered, will predict the next word.

To achieve this, we need to do the following:

  1. Get, clean and adjust the Corpus
  2. Extract the 2-gram, 3-gram and 4-gram frequencies
  3. Create a model that uses the frequencies to match the input
  4. Choose and implement a back-off strategy for the model (a rough sketch of steps 3 and 4 is given below)
  5. Perform adjustments to optimize memory size and runtime
  6. Deploy and test app on a shiny server
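
As a rough illustration of steps 3 and 4, the sketch below shows one possible back-off lookup: it first searches the tri-gram table for the last two words of the input, backs off to the bi-gram table, and finally falls back to the most frequent single word. The helper and function names (split_ngram, predict_next) are hypothetical and only assume frequency tables shaped like the one_gram, bi_gram and tri_gram data frames built above.

# Sketch only: split each n-gram into its prefix and its last word
split_ngram <- function(ngram_df, gram_col) {
  words <- strsplit(as.character(ngram_df[[gram_col]]), " ", fixed=TRUE)
  data.frame(prefix = sapply(words, function(w) paste(head(w, -1), collapse=" ")),
             next_word = sapply(words, function(w) tail(w, 1)),
             Frequency = ngram_df$Frequency,
             stringsAsFactors = FALSE)
}
bi_lookup  <- split_ngram(bi_gram, "BiGram")   # prefix = 1 word
tri_lookup <- split_ngram(tri_gram, "TriGram") # prefix = 2 words

# Hypothetical predictor: try the tri-gram table, back off to the bi-gram table,
# then to the single most frequent word overall
predict_next <- function(input) {
  words <- tolower(unlist(strsplit(input, "\\s+")))
  for (lvl in list(list(tab=tri_lookup, n=2), list(tab=bi_lookup, n=1))) {
    if (length(words) >= lvl$n) {
      prefix <- paste(tail(words, lvl$n), collapse=" ")
      hits <- lvl$tab[lvl$tab$prefix == prefix, ]
      if (nrow(hits) > 0) return(hits$next_word[which.max(hits$Frequency)])
    }
  }
  as.character(one_gram$OneGram[which.max(one_gram$Frequency)])
}
predict_next("thanks for the") # looks up the last two words ("for the") in the tri-gram table first

In the actual app, the tables would be built from larger 2-, 3- and 4-gram counts, pruned to the most frequent entries and saved to disk (for example with saveRDS) so that memory use and response time stay low (steps 5 and 6).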