Executive Summary

This is a milestone report for the JHU Data Science Capstone project, laying out a plan to develop a prediction algorithm from the given text dataset. The exploratory data analysis below examines word frequencies for 1- to 3-grams; its findings, detailed in the following sections, drive the next steps in building an app on shinyapps.io that predicts the next word from the user's input.

Exploratory Data Analysis

Data source

Per the course instructions, a zipped file containing the necessary data is downloaded. The unzipped folder, named “final”, contains four subfolders holding data in four languages: en_US, de_DE, ru_RU and fi_FI. This assignment uses the English data in the “en_US” folder, which contains three files gathered by web crawlers from blogs, news sites and Twitter.
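The download itself is not shown in the assignment, so a minimal sketch is included below for reproducibility. The dataset URL is an assumption taken from the course materials and should be verified against the assignment page.

# Sketch: download and unzip the Capstone dataset (URL assumed from the course page).
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- path.expand("~/Documents/coursera/capstone/Coursera-SwiftKey.zip")

if (!file.exists(zip_file)) {
  download.file(zip_url, zip_file, mode = "wb")
  unzip(zip_file, exdir = path.expand("~/Documents/coursera/capstone"))  # creates the "final" folder
}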

blogfile <- "~/Documents/coursera/capstone/final/en_US/en_US.blogs.txt"
newsfile <- "~/Documents/coursera/capstone/final/en_US/en_US.news.txt"
twitterfile <- "~/Documents/coursera/capstone/final/en_US/en_US.twitter.txt"

blogdata <- readLines(blogfile, skipNul = TRUE)
newsdata <- readLines(newsfile, skipNul = TRUE)
twitterdata <- readLines(twitterfile, skipNul = TRUE)

Data summary

# File Size in MB
blogdata_size <- file.size(blogfile)/1024/1024
newsdata_size <- file.size(newsfile)/1024/1024
twitterdata_size <- file.size(twitterfile)/1024/1024

# # of Lines
blogdata_length <- length(blogdata)
newsdata_length <- length(newsdata)
twitterdata_length <- length(twitterdata)

# # of Characters
blogdata_chars <- sum(nchar(blogdata))
newsdata_chars <- sum(nchar(newsdata))
twitterdata_chars <- sum(nchar(twitterdata))

# # of Words
blogdata_words <- sum(sapply(strsplit(blogdata, "\\W+"), length))
newsdata_words <- sum(sapply(strsplit(newsdata, "\\W+"), length))
twitterdata_words <- sum(sapply(strsplit(twitterdata, "\\W+"), length))

data_summary_table <- data.frame(
  item = c("blog", "news", "twitter"), 
  filename = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"), 
  filesize = c(blogdata_size, newsdata_size, twitterdata_size), 
  lines = c(blogdata_length, newsdata_length, twitterdata_length), 
  letters = c(blogdata_chars, newsdata_chars, twitterdata_chars), 
  words = c(blogdata_words, newsdata_words, twitterdata_words)
)

data_summary_table
##      item          filename filesize   lines   letters    words
## 1    blog   en_US.blogs.txt 200.4242  899288 206824505 38371858
## 2    news    en_US.news.txt 196.2775 1010242 203223159 35783087
## 3 twitter en_US.twitter.txt 159.3641 2360148 162096241 31149791

Sampling

I started with a 30% sample (roughly 10% from each of the blog, news and Twitter files), but unfortunately cleaning that much data took too long on my computer. After a few trial runs, the results from a 10% sample and a 1% sample were very similar, so a 1% sample of each file is used below.

set.seed(123)

blogdata_samplesize <- round(blogdata_length * 0.01)
newsdata_samplesize <- round(newsdata_length * 0.01)
twitterdata_samplesize <- round(twitterdata_length * 0.01)

blogdata_sample <- sample(blogdata, blogdata_samplesize, replace = FALSE)
newsdata_sample <- sample(newsdata, newsdata_samplesize, replace = FALSE)
twitterdata_sample <- sample(twitterdata, twitterdata_samplesize, replace = FALSE)

sample <- c(blogdata_sample, newsdata_sample, twitterdata_sample)
writeLines(sample, "~/Documents/coursera/capstone/final/en_US/sample.txt")

Data cleaning / Building Corpus

Per the instructions, the “tm” package is used.

Following the top search result for “bad words list for corpus”, a list of profane words (bad-words.txt) was downloaded from https://www.cs.cmu.edu/~biglou/resources/ and used for profanity filtering.
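For completeness, a minimal sketch of fetching that list is shown below; the exact file URL (bad-words.txt directly under the resources directory) is an assumption based on the link above.

# Sketch: fetch the profanity list (file URL assumed from the resources page above).
profanity_url  <- "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
profanity_file <- path.expand("~/Documents/coursera/capstone/final/en_US/bad-words.txt")

if (!file.exists(profanity_file)) {
  download.file(profanity_url, profanity_file)
}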

library(tm)

profanity_file <- "~/Documents/coursera/capstone/final/en_US/bad-words.txt"
profanity_words <- readLines(profanity_file)

clean_corpus <- VCorpus(VectorSource(sample))
clean_corpus <- tm_map(clean_corpus, content_transformer(tolower))
clean_corpus <- tm_map(clean_corpus, removeNumbers)
clean_corpus <- tm_map(clean_corpus, removePunctuation)
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("en"))
clean_corpus <- tm_map(clean_corpus, removeWords, profanity_words)
clean_corpus <- tm_map(clean_corpus, stripWhitespace)

saveRDS(clean_corpus, "~/Documents/coursera/capstone/final/en_US/clean_corpus.rds")
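The cleaned corpus is saved so it can be reloaded in later sessions with readRDS() instead of repeating the cleaning:

clean_corpus <- readRDS("~/Documents/coursera/capstone/final/en_US/clean_corpus.rds")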

Word frequency

The DTM (Document-Term Matrix) built from the clean corpus has a sparsity of 100%: almost every term is absent from almost every document, so the full matrix is far too large to work with. removeSparseTerms() with a threshold of 0.995 keeps only terms that appear in at least 0.5% of the documents, reducing the matrix to a manageable size.

library(knitr)
library(dplyr)

dtm <- DocumentTermMatrix(clean_corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 42695, terms: 58654)>>
## Non-/sparse entries: 504366/2503728164
## Sparsity           : 100%
## Maximal term length: 105
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   can get good just like new one said time will
##   1388   1   0    0    0    0   0   1    0    1    2
##   1860   1   0    0    0    0   1   1    0    0    0
##   2288   1   0    0    0    0   0   0    0    1    0
##   2328   2   2    0    2    0   1   2    1    1    0
##   2661   7   0    0    1    0   0   3    0    0    4
##   2851   0   1    0    1    0   1   0    0    3    0
##   4912   0   0    0    1    0   0   0    2    0    3
##   591    0   2    0    0    2   0   2    2    4    3
##   7078   1   0    2    0    1   0   0    1    0    1
##   8279   0   0    0    0    0   0   1    0    0    0
clean_dtm <- removeSparseTerms(dtm, 0.995)
inspect(clean_dtm)
## <<DocumentTermMatrix (documents: 42695, terms: 335)>>
## Non-/sparse entries: 164914/14137911
## Sparsity           : 99%
## Maximal term length: 11
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   can get good just like new one said time will
##   1736   1   1    0    0    1   0   1    0    1    2
##   2288   1   0    0    0    0   0   0    0    1    0
##   2328   2   2    0    2    0   1   2    1    1    0
##   235    1   0    1    1    0   0   1    0    1    1
##   2851   0   1    0    1    0   1   0    0    3    0
##   5240   0   1    0    5    0   0   3    0    0    0
##   591    0   2    0    0    2   0   2    2    4    3
##   7078   1   0    2    0    1   0   0    1    0    1
##   8502   2   4    0    0    1   0   2    0    0    2
##   9168   0   2    4    3    3   0   1    1    2    0
words_frequency <- colSums(as.matrix(clean_dtm))
ord <- order(words_frequency, decreasing=TRUE)
words_frequency[head(ord, 25)] %>% kable()
word      freq
will      3126
said      3028
just      3004
one       2969
like      2628
can       2394
get       2314
time      2100
new       1869
good      1773
now       1767
day       1697
love      1640
know      1614
people    1574
back      1466
see       1450
also      1380
dont      1361
first     1358
going     1302
make      1295
think     1275
great     1243
much      1190

Tokenization and N-grams

As with the single-word frequencies above, the sparsity of each n-gram document-term matrix has to be reduced before frequencies can be computed; because bigrams and trigrams are sparser still, higher thresholds (0.9995 and 0.9999) are used for them.

library(RWeka)
library(ggplot2)

UniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BiGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TriGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

unigram_dtm <- DocumentTermMatrix(clean_corpus, control = list(tokenize = UniGramTokenizer))
clean_unigram_dtm <- removeSparseTerms(unigram_dtm, 0.995)
unigram_frequency <- colSums(as.matrix(clean_unigram_dtm))
ord_unigram <- order(unigram_frequency, decreasing=TRUE)
unigram_frequency_25 <- unigram_frequency[head(ord_unigram, 25)]
unigram_frequency_25_table <- data.frame(word = names(unigram_frequency_25), freq = unigram_frequency_25)
unigram_frequency_25_table %>% 
  ggplot(aes(reorder(word, freq), freq)) + 
  geom_col() + 
  coord_flip() +
  labs(x="Words", y="Frequency", title = "Top 25 Unigram Words")

bigram_dtm <- DocumentTermMatrix(clean_corpus, control = list(tokenize = BiGramTokenizer))
clean_bigram_dtm <- removeSparseTerms(bigram_dtm, 0.9995)
bigram_frequency <- colSums(as.matrix(clean_bigram_dtm))
ord_bigram <- order(bigram_frequency, decreasing=TRUE)
bigram_frequency_25 <- bigram_frequency[head(ord_bigram, 25)]
bigram_frequency_25_table <- data.frame(word = names(bigram_frequency_25), freq = bigram_frequency_25)
bigram_frequency_25_table %>% 
  ggplot(aes(reorder(word, freq), freq)) + 
  geom_col() + 
  coord_flip() +
  labs(x="Words", y="Frequency", title = "Top 25 Bigram Words")

trigram_dtm <- DocumentTermMatrix(clean_corpus, control = list(tokenize = TriGramTokenizer))
clean_trigram_dtm <- removeSparseTerms(trigram_dtm, 0.9999)
trigram_frequency <- colSums(as.matrix(clean_trigram_dtm))
ord_trigram <- order(trigram_frequency, decreasing=TRUE)
trigram_frequency_25 <- trigram_frequency[head(ord_trigram, 25)]
trigram_frequency_25_table <- data.frame(word = names(trigram_frequency_25), freq = trigram_frequency_25)
trigram_frequency_25_table %>% 
  ggplot(aes(reorder(word, freq), freq)) + 
  geom_col() + 
  coord_flip() +
  labs(x="Words", y="Frequency", title = "Top 25 Trigram Words")

References

Profanity word list (bad-words.txt): https://www.cs.cmu.edu/~biglou/resources/

End of document.