This is a milestone report for the JHU Capstone project, whose goal is to build a plan for developing a next-word prediction algorithm from the given text dataset. The exploratory data analysis (EDA) below examines word frequencies for 1- to 3-grams; its findings drive the next steps in building an app, to be deployed on shinyapps.io, that predicts the next word from user input.
Per the instructions, the zipped file containing the necessary data was downloaded. The unzipped folder, named "final", contains four subfolders with data in four languages: en_US, de_DE, ru_RU and fi_FI. This report uses the English data in the "en_US" folder, which holds three files collected by web crawlers from blogs, news sites and Twitter.
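For reproducibility, the download step can be scripted as in the sketch below. The dataset URL shown is the one from the course instructions (an assumption here, since the report does not record it), and the destination directory is assumed to match the paths used in the rest of this report.
# Download and unzip the Coursera-SwiftKey dataset (run once).
# NOTE: URL and destination directory are assumptions for illustration.
destdir <- path.expand("~/Documents/coursera/capstone")
zipfile <- file.path(destdir, "Coursera-SwiftKey.zip")
if (!file.exists(zipfile)) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = zipfile, mode = "wb")
}
unzip(zipfile, exdir = destdir)  # creates the "final" folder with the four language subfolders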
# Paths to the three English-language source files
blogfile <- "~/Documents/coursera/capstone/final/en_US/en_US.blogs.txt"
newsfile <- "~/Documents/coursera/capstone/final/en_US/en_US.news.txt"
twitterfile <- "~/Documents/coursera/capstone/final/en_US/en_US.twitter.txt"
# Read the raw text; skipNul = TRUE skips embedded NUL characters instead of erroring
blogdata <- readLines(blogfile, skipNul = TRUE)
newsdata <- readLines(newsfile, skipNul = TRUE)
twitterdata <- readLines(twitterfile, skipNul = TRUE)
# File Size in MB
blogdata_size <- file.size(blogfile)/1024/1024
newsdata_size <- file.size(newsfile)/1024/1024
twitterdata_size <- file.size(twitterfile)/1024/1024
# # of Lines
blogdata_length <- length(blogdata)
newsdata_length <- length(newsdata)
twitterdata_length <- length(twitterdata)
# # of Letters
blogdata_chars <- sum(nchar(blogdata))
newsdata_chars <- sum(nchar(newsdata))
twitterdata_chars <- sum(nchar(twitterdata))
# # of Words
blogdata_words <- sum(sapply(strsplit(blogdata, "\\W+"), length))
newsdata_words <- sum(sapply(strsplit(newsdata, "\\W+"), length))
twitterdata_words <- sum(sapply(strsplit(twitterdata, "\\W+"), length))
data_summary_table <- data.frame(
  item = c("blog", "news", "twitter"),
  filename = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
  filesize = c(blogdata_size, newsdata_size, twitterdata_size),   # in MB
  lines = c(blogdata_length, newsdata_length, twitterdata_length),
  letters = c(blogdata_chars, newsdata_chars, twitterdata_chars),
  words = c(blogdata_words, newsdata_words, twitterdata_words)
)
data_summary_table
## item filename filesize lines letters words
## 1 blog en_US.blogs.txt 200.4242 899288 206824505 38371858
## 2 news en_US.news.txt 196.2775 1010242 203223159 35783087
## 3 twitter en_US.twitter.txt 159.3641 2360148 162096241 31149791
I started with a larger sample (roughly 10% from each of the blog, news and Twitter files), but cleaning that much data took too long on my computer. After a few trial runs, the analysis results were similar for 10% and 1% samples, so a 1% sample of each file is used below.
set.seed(123)
# 1% sample size for each source
blogdata_samplesize <- blogdata_length * 0.01
newsdata_samplesize <- newsdata_length * 0.01
twitterdata_samplesize <- twitterdata_length * 0.01
# Sample lines without replacement from each source
blogdata_sample <- sample(blogdata, blogdata_samplesize, replace = FALSE)
newsdata_sample <- sample(newsdata, newsdata_samplesize, replace = FALSE)
twitterdata_sample <- sample(twitterdata, twitterdata_samplesize, replace = FALSE)
# Combine the three samples and save them for reuse
sample <- c(blogdata_sample, newsdata_sample, twitterdata_sample)
writeLines(sample, "~/Documents/coursera/capstone/final/en_US/sample.txt")
Per the instructions, the "tm" package is used for cleaning. Following the top search result for "bad words list for corpus", a list of profane words (bad-words.txt) was downloaded from https://www.cs.cmu.edu/~biglou/resources/ and used for profanity filtering.
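The profanity list can be fetched the same scripted way. A minimal sketch, assuming the file name bad-words.txt at the site cited above and the local path used in the cleaning code below:
# Download the profanity list (run once).
# NOTE: the exact file URL is an assumption based on the resources page cited above.
badwords_url  <- "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
badwords_file <- "~/Documents/coursera/capstone/final/en_US/bad-words.txt"
if (!file.exists(badwords_file)) {
  download.file(badwords_url, destfile = badwords_file)
}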
library(tm)
profanity_file <- "~/Documents/coursera/capstone/final/en_US/bad-words.txt"
profanity_words <- readLines(profanity_file)
# Build a corpus from the sample and clean it step by step
clean_corpus <- VCorpus(VectorSource(sample))
clean_corpus <- tm_map(clean_corpus, content_transformer(tolower))   # lowercase
clean_corpus <- tm_map(clean_corpus, removeNumbers)                  # drop digits
clean_corpus <- tm_map(clean_corpus, removePunctuation)              # drop punctuation
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("en"))   # drop English stopwords
clean_corpus <- tm_map(clean_corpus, removeWords, profanity_words)   # profanity filtering
clean_corpus <- tm_map(clean_corpus, stripWhitespace)                # collapse extra spaces
# Save the cleaned corpus so later chunks can reload it without re-cleaning
saveRDS(clean_corpus, "~/Documents/coursera/capstone/final/en_US/clean_corpus.rds")
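Because the cleaned corpus is saved to disk, a later session can reload it instead of repeating the cleaning, for example:
# Reload the previously cleaned corpus in a later session
clean_corpus <- readRDS("~/Documents/coursera/capstone/final/en_US/clean_corpus.rds")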
The DTM (Document Term Matrix) built from the cleaned corpus has a reported sparsity of 100% (after rounding), meaning almost every term-document entry is zero because most terms appear in only a handful of documents. Terms that are absent from more than 99.5% of the documents are therefore dropped with removeSparseTerms, which shrinks the matrix to a workable size.
library(knitr)
library(dplyr)
# Document Term Matrix of the cleaned corpus
dtm <- DocumentTermMatrix(clean_corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 42695, terms: 58654)>>
## Non-/sparse entries: 504366/2503728164
## Sparsity : 100%
## Maximal term length: 105
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs can get good just like new one said time will
## 1388 1 0 0 0 0 0 1 0 1 2
## 1860 1 0 0 0 0 1 1 0 0 0
## 2288 1 0 0 0 0 0 0 0 1 0
## 2328 2 2 0 2 0 1 2 1 1 0
## 2661 7 0 0 1 0 0 3 0 0 4
## 2851 0 1 0 1 0 1 0 0 3 0
## 4912 0 0 0 1 0 0 0 2 0 3
## 591 0 2 0 0 2 0 2 2 4 3
## 7078 1 0 2 0 1 0 0 1 0 1
## 8279 0 0 0 0 0 0 1 0 0 0
clean_dtm <- removeSparseTerms(dtm, 0.995)  # drop terms absent from more than 99.5% of documents
inspect(clean_dtm)
## <<DocumentTermMatrix (documents: 42695, terms: 335)>>
## Non-/sparse entries: 164914/14137911
## Sparsity : 99%
## Maximal term length: 11
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs can get good just like new one said time will
## 1736 1 1 0 0 1 0 1 0 1 2
## 2288 1 0 0 0 0 0 0 0 1 0
## 2328 2 2 0 2 0 1 2 1 1 0
## 235 1 0 1 1 0 0 1 0 1 1
## 2851 0 1 0 1 0 1 0 0 3 0
## 5240 0 1 0 5 0 0 3 0 0 0
## 591 0 2 0 0 2 0 2 2 4 3
## 7078 1 0 2 0 1 0 0 1 0 1
## 8502 2 4 0 0 1 0 2 0 0 2
## 9168 0 2 4 3 3 0 1 1 2 0
# Total frequency of each remaining term, sorted descending; show the top 25
words_frequency <- colSums(as.matrix(clean_dtm))
ord <- order(words_frequency, decreasing = TRUE)
words_frequency[head(ord, 25)] %>% kable()
|      |    x|
|:-----|----:|
| will | 3126 |
| said | 3028 |
| just | 3004 |
| one | 2969 |
| like | 2628 |
| can | 2394 |
| get | 2314 |
| time | 2100 |
| new | 1869 |
| good | 1773 |
| now | 1767 |
| day | 1697 |
| love | 1640 |
| know | 1614 |
| people | 1574 |
| back | 1466 |
| see | 1450 |
| also | 1380 |
| dont | 1361 |
| first | 1358 |
| going | 1302 |
| make | 1295 |
| think | 1275 |
| great | 1243 |
| much | 1190 |
As with the matrix above, the sparsity threshold had to be tuned for each n-gram DTM below (0.995 for unigrams, 0.9995 for bigrams, 0.9999 for trigrams) so that the matrices stay small enough to convert to dense form for the frequency counts.
library(RWeka)
library(ggplot2)
# RWeka tokenizers for 1-, 2- and 3-grams
UniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BiGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TriGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# Unigram DTM, top-25 frequencies and plot
unigram_dtm <- DocumentTermMatrix(clean_corpus, control = list(tokenize = UniGramTokenizer))
clean_unigram_dtm <- removeSparseTerms(unigram_dtm, 0.995)
unigram_frequency <- colSums(as.matrix(clean_unigram_dtm))
ord_unigram <- order(unigram_frequency, decreasing = TRUE)
unigram_frequency_25 <- unigram_frequency[head(ord_unigram, 25)]
unigram_frequency_25_table <- data.frame(word = names(unigram_frequency_25), freq = unigram_frequency_25)
unigram_frequency_25_table %>%
  ggplot(aes(reorder(word, freq), freq)) +
  geom_col() +
  coord_flip() +
  labs(x = "Words", y = "Frequency", title = "Top 25 Unigram Words")
# Bigram DTM, top-25 frequencies and plot
bigram_dtm <- DocumentTermMatrix(clean_corpus, control = list(tokenize = BiGramTokenizer))
clean_bigram_dtm <- removeSparseTerms(bigram_dtm, 0.9995)
bigram_frequency <- colSums(as.matrix(clean_bigram_dtm))
ord_bigram <- order(bigram_frequency, decreasing = TRUE)
bigram_frequency_25 <- bigram_frequency[head(ord_bigram, 25)]
bigram_frequency_25_table <- data.frame(word = names(bigram_frequency_25), freq = bigram_frequency_25)
bigram_frequency_25_table %>%
  ggplot(aes(reorder(word, freq), freq)) +
  geom_col() +
  coord_flip() +
  labs(x = "Words", y = "Frequency", title = "Top 25 Bigram Words")
# Trigram DTM, top-25 frequencies and plot
trigram_dtm <- DocumentTermMatrix(clean_corpus, control = list(tokenize = TriGramTokenizer))
clean_trigram_dtm <- removeSparseTerms(trigram_dtm, 0.9999)
trigram_frequency <- colSums(as.matrix(clean_trigram_dtm))
ord_trigram <- order(trigram_frequency, decreasing = TRUE)
trigram_frequency_25 <- trigram_frequency[head(ord_trigram, 25)]
trigram_frequency_25_table <- data.frame(word = names(trigram_frequency_25), freq = trigram_frequency_25)
trigram_frequency_25_table %>%
  ggplot(aes(reorder(word, freq), freq)) +
  geom_col() +
  coord_flip() +
  labs(x = "Words", y = "Frequency", title = "Top 25 Trigram Words")
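Looking ahead to the prediction app, the bigram and trigram frequency vectors above already contain what a simple frequency-based lookup needs. Below is a minimal sketch of that idea, not the final algorithm: the helper predict_next is a hypothetical name introduced here, the lookup only covers n-grams that survived the sparsity filter, and because stopwords and profanity were removed during cleaning, the user input would need the same pre-processing in the final app. The real model will also need smoothing or backoff for unseen phrases.
# Hypothetical sketch: predict the next word from the last one or two words
# of the user input, using the n-gram frequency vectors computed above.
predict_next <- function(input, n = 3) {
  words <- tolower(strsplit(trimws(input), "\\s+")[[1]])
  # Try trigrams first (match the last two words), then back off to bigrams
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- trigram_frequency[startsWith(names(trigram_frequency), paste0(prefix, " "))]
    if (length(hits) > 0) {
      top <- names(sort(hits, decreasing = TRUE))[seq_len(min(n, length(hits)))]
      return(sapply(strsplit(top, " "), tail, 1))   # last word of each matching trigram
    }
  }
  prefix <- tail(words, 1)
  hits <- bigram_frequency[startsWith(names(bigram_frequency), paste0(prefix, " "))]
  if (length(hits) == 0) return(character(0))
  top <- names(sort(hits, decreasing = TRUE))[seq_len(min(n, length(hits)))]
  sapply(strsplit(top, " "), tail, 1)               # last word of each matching bigram
}

predict_next("happy new")   # returns up to 3 candidate next words from the sampled corpus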
End of the document