Executive Summary

This is a milestone report for the JHU Data Science Capstone project, laying out a plan to develop a prediction algorithm from the given text dataset. The exploratory data analysis below examines word frequencies for 1- to 3-grams; its findings, detailed in the following sections, drive the next steps in building an app on shinyapps.io that predicts the next word from the user's input.

Exploratory Data Analysis

Data source

Per the course instructions, a zipped file containing the necessary data is downloaded. The unzipped folder, named “final”, contains four subfolders holding data in four languages: en_US, de_DE, ru_RU and fi_FI. This assignment uses the English data in the “en_US” folder, which contains three files gathered by web crawlers from blogs, news sites and Twitter.
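The download itself is not shown in the assignment, so a minimal sketch is included below for reproducibility. The dataset URL is an assumption taken from the course materials and should be verified against the assignment page.

# Sketch: download and unzip the Capstone dataset (URL assumed from the course page).
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- path.expand("~/Documents/coursera/capstone/Coursera-SwiftKey.zip")

if (!file.exists(zip_file)) {
  download.file(zip_url, zip_file, mode = "wb")
  unzip(zip_file, exdir = path.expand("~/Documents/coursera/capstone"))  # creates the "final" folder
}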

blogfile <- "~/Documents/coursera/capstone/final/en_US/en_US.blogs.txt"
newsfile <- "~/Documents/coursera/capstone/final/en_US/en_US.news.txt"
twitterfile <- "~/Documents/coursera/capstone/final/en_US/en_US.twitter.txt"

blogdata <- readLines(blogfile, skipNul = TRUE)
newsdata <- readLines(newsfile, skipNul = TRUE)
twitterdata <- readLines(twitterfile, skipNul = TRUE)

Data summary

# File Size in MB
blogdata_size <- file.size(blogfile)/1024/1024
newsdata_size <- file.size(newsfile)/1024/1024
twitterdata_size <- file.size(twitterfile)/1024/1024

# # of Lines
blogdata_length <- length(blogdata)
newsdata_length <- length(newsdata)
twitterdata_length <- length(twitterdata)

# # of Characters
blogdata_chars <- sum(nchar(blogdata))
newsdata_chars <- sum(nchar(newsdata))
twitterdata_chars <- sum(nchar(twitterdata))

# # of Words
blogdata_words <- sum(sapply(strsplit(blogdata, "\\W+"), length))
newsdata_words <- sum(sapply(strsplit(newsdata, "\\W+"), length))
twitterdata_words <- sum(sapply(strsplit(twitterdata, "\\W+"), length))

data_summary_table <- data.frame(
  item = c("blog", "news", "twitter"), 
  filename = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"), 
  filesize = c(blogdata_size, newsdata_size, twitterdata_size), 
  lines = c(blogdata_length, newsdata_length, twitterdata_length), 
  letters = c(blogdata_chars, newsdata_chars, twitterdata_chars), 
  words = c(blogdata_words, newsdata_words, twitterdata_words)
)

data_summary_table
##      item          filename filesize   lines   letters    words
## 1    blog   en_US.blogs.txt 200.4242  899288 206824505 38371858
## 2    news    en_US.news.txt 196.2775 1010242 203223159 35783087
## 3 twitter en_US.twitter.txt 159.3641 2360148 162096241 31149791

Sampling

I started with a 30% sample (roughly 10% from each of the blog, news and Twitter files), but unfortunately cleaning that much data took too long on my computer. After a few trial runs, the results from a 10% sample and a 1% sample were very similar, so a 1% sample of each file is used below.

set.seed(123)

blogdata_samplesize <- round(blogdata_length * 0.01)
newsdata_samplesize <- round(newsdata_length * 0.01)
twitterdata_samplesize <- round(twitterdata_length * 0.01)

blogdata_sample <- sample(blogdata, blogdata_samplesize, replace = FALSE)
newsdata_sample <- sample(newsdata, newsdata_samplesize, replace = FALSE)
twitterdata_sample <- sample(twitterdata, twitterdata_samplesize, replace = FALSE)

sample <- c(blogdata_sample, newsdata_sample, twitterdata_sample)
writeLines(sample, "~/Documents/coursera/capstone/final/en_US/sample.txt")

Data cleaning / Building Corpus

Per the instructions, the “tm” package is used.

Following the top search result for “bad words list for corpus”, a list of profane words (bad-words.txt) was downloaded from https://www.cs.cmu.edu/~biglou/resources/ and used for profanity filtering.
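For completeness, a minimal sketch of fetching that list is shown below; the exact file URL (bad-words.txt directly under the resources directory) is an assumption based on the link above.

# Sketch: fetch the profanity list (file URL assumed from the resources page above).
profanity_url  <- "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
profanity_file <- path.expand("~/Documents/coursera/capstone/final/en_US/bad-words.txt")

if (!file.exists(profanity_file)) {
  download.file(profanity_url, profanity_file)
}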

library(tm)

profanity_file <- "~/Documents/coursera/capstone/final/en_US/bad-words.txt"
profanity_words <- readLines(profanity_file)

clean_corpus <- VCorpus(VectorSource(sample))
clean_corpus <- tm_map(clean_corpus, content_transformer(tolower))
clean_corpus <- tm_map(clean_corpus, removeNumbers)
clean_corpus <- tm_map(clean_corpus, removePunctuation)
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("en"))
clean_corpus <- tm_map(clean_corpus, removeWords, profanity_words)
clean_corpus <- tm_map(clean_corpus, stripWhitespace)

saveRDS(clean_corpus, "~/Documents/coursera/capstone/final/en_US/clean_corpus.rds")
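The cleaned corpus is saved so it can be reloaded in later sessions with readRDS() instead of repeating the cleaning:

clean_corpus <- readRDS("~/Documents/coursera/capstone/final/en_US/clean_corpus.rds")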

Word frequency

The DTM (Document-Term Matrix) built from the clean corpus has a sparsity of 100%: almost every term is absent from almost every document, so the full matrix is far too large to work with. removeSparseTerms() with a threshold of 0.995 keeps only terms that appear in at least 0.5% of the documents, reducing the matrix to a manageable size.

library(knitr)
library(dplyr)

dtm <- DocumentTermMatrix(clean_corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 42695, terms: 58654)>>
## Non-/sparse entries: 504366/2503728164
## Sparsity           : 100%
## Maximal term length: 105
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   can get good just like new one said time will
##   1388   1   0    0    0    0   0   1    0    1    2
##   1860   1   0    0    0    0   1   1    0    0    0
##   2288   1   0    0    0    0   0   0    0    1    0
##   2328   2   2    0    2    0   1   2    1    1    0
##   2661   7   0    0    1    0   0   3    0    0    4
##   2851   0   1    0    1    0   1   0    0    3    0
##   4912   0   0    0    1    0   0   0    2    0    3
##   591    0   2    0    0    2   0   2    2    4    3
##   7078   1   0    2    0    1   0   0    1    0    1
##   8279   0   0    0    0    0   0   1    0    0    0
clean_dtm <- removeSparseTerms(dtm, 0.995)
inspect(clean_dtm)
## <<DocumentTermMatrix (documents: 42695, terms: 335)>>
## Non-/sparse entries: 164914/14137911
## Sparsity           : 99%
## Maximal term length: 11
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   can get good just like new one said time will
##   1736   1   1    0    0    1   0   1    0    1    2
##   2288   1   0    0    0    0   0   0    0    1    0
##   2328   2   2    0    2    0   1   2    1    1    0
##   235    1   0    1    1    0   0   1    0    1    1
##   2851   0   1    0    1    0   1   0    0    3    0
##   5240   0   1    0    5    0   0   3    0    0    0
##   591    0   2    0    0    2   0   2    2    4    3
##   7078   1   0    2    0    1   0   0    1    0    1
##   8502   2   4    0    0    1   0   2    0    0    2
##   9168   0   2    4    3    3   0   1    1    2    0
words_frequency <- colSums(as.matrix(clean_dtm))
ord <- order(words_frequency, decreasing=TRUE)
words_frequency[head(ord, 25)] %>% kable()
word      freq
will      3126
said      3028
just      3004
one       2969
like      2628
can       2394
get       2314
time      2100
new       1869
good      1773
now       1767
day       1697
love      1640
know      1614
people    1574
back      1466
see       1450
also      1380
dont      1361
first     1358
going     1302
make      1295
think     1275
great     1243
much      1190

Tokenization and N-grams

As with the single-word frequencies above, the sparsity of each n-gram document-term matrix has to be reduced before frequencies can be computed; because bigrams and trigrams are sparser still, higher thresholds (0.9995 and 0.9999) are used for them.

library(RWeka)
library(ggplot2)

UniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BiGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TriGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

unigram_dtm <- DocumentTermMatrix(clean_corpus, control = list(tokenize = UniGramTokenizer))
clean_unigram_dtm <- removeSparseTerms(unigram_dtm, 0.995)
unigram_frequency <- colSums(as.matrix(clean_unigram_dtm))
ord_unigram <- order(unigram_frequency, decreasing=TRUE)
unigram_frequency_25 <- unigram_frequency[head(ord_unigram, 25)]
unigram_frequency_25_table <- data.frame(word = names(unigram_frequency_25), freq = unigram_frequency_25)
unigram_frequency_25_table %>% 
  ggplot(aes(reorder(word, freq), freq)) + 
  geom_col() + 
  coord_flip() +
  labs(x="Words", y="Frequency", title = "Top 25 Unigram Words")

bigram_dtm <- DocumentTermMatrix(clean_corpus, control = list(tokenize = BiGramTokenizer))
clean_bigram_dtm <- removeSparseTerms(bigram_dtm, 0.9995)
bigram_frequency <- colSums(as.matrix(clean_bigram_dtm))
ord_bigram <- order(bigram_frequency, decreasing=TRUE)
bigram_frequency_25 <- bigram_frequency[head(ord_bigram, 25)]
bigram_frequency_25_table <- data.frame(word = names(bigram_frequency_25), freq = bigram_frequency_25)
bigram_frequency_25_table %>% 
  ggplot(aes(reorder(word, freq), freq)) + 
  geom_col() + 
  coord_flip() +
  labs(x="Words", y="Frequency", title = "Top 25 Bigram Words")

trigram_dtm <- DocumentTermMatrix(clean_corpus, control = list(tokenize = TriGramTokenizer))
clean_trigram_dtm <- removeSparseTerms(trigram_dtm, 0.9999)
trigram_frequency <- colSums(as.matrix(clean_trigram_dtm))
ord_trigram <- order(trigram_frequency, decreasing=TRUE)
trigram_frequency_25 <- trigram_frequency[head(ord_trigram, 25)]
trigram_frequency_25_table <- data.frame(word = names(trigram_frequency_25), freq = trigram_frequency_25)
trigram_frequency_25_table %>% 
  ggplot(aes(reorder(word, freq), freq)) + 
  geom_col() + 
  coord_flip() +
  labs(x="Words", y="Frequency", title = "Top 25 Trigram Words")

References

Profanity word list (bad-words.txt): https://www.cs.cmu.edu/~biglou/resources/

End of document.