This document is a concise report on the major features of three text files and briefly summarizes the plan for creating a prediction algorithm and Shiny app.
The data was obtained from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. This work uses the three files under the en_US folder, which contain text from blogs, news, and Twitter.
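The archive can be downloaded and unpacked directly from R. The sketch below assumes the zip's usual final/en_US/ layout and extracts the three English files into the working directory:

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
    download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")  # download the archive once
}
unzip("Coursera-SwiftKey.zip",
      files = c("final/en_US/en_US.blogs.txt",
                "final/en_US/en_US.news.txt",
                "final/en_US/en_US.twitter.txt"),
      junkpaths = TRUE)  # drop the folder structure so the files land in the working directory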
The final objective is an app that takes 1, 2, or 3 words as input and predicts the next word. To this end, this report presents the initial exploratory analysis.
Data cleaning is a very important step in text mining, so we divide the work into the subsections that follow.
First we load the entire data set:
blogs   <- readLines('en_US.blogs.txt',   encoding = "UTF-8", skipNul = TRUE)
news    <- readLines('en_US.news.txt',    encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines('en_US.twitter.txt', encoding = "UTF-8", skipNul = TRUE)  # skipNul guards against embedded nul characters in the raw files
Then we gather general information about the loaded files:
library(stringi)  # stri_count() for word counting

blogs_lines   <- length(blogs)
blogs_words   <- sum(stri_count(blogs, regex="\\S+"))
news_lines    <- length(news)
news_words    <- sum(stri_count(news, regex="\\S+"))
twitter_lines <- length(twitter)
twitter_words <- sum(stri_count(twitter, regex="\\S+"))

total_lines <- blogs_lines + news_lines + twitter_lines
total_words <- blogs_words + news_words + twitter_words

lines_words <- c(blogs_lines, blogs_words, news_lines, news_words, twitter_lines, twitter_words, total_lines, total_words)
names(lines_words) <- c('blogs_lines', 'blogs_words', 'news_lines', 'news_words', 'twitter_lines', 'twitter_words', 'total_lines', 'total_words')
format(lines_words, decimal.mark=",", big.mark=".",small.mark=".")
## blogs_lines blogs_words news_lines news_words twitter_lines twitter_words
## " 899.288" " 37.334.131" " 1.010.242" " 34.372.530" " 2.360.148" " 30.373.543"
## total_lines total_words
## " 4.269.678" "102.080.204"
Therefore, considering all three files, we have 102.080.204 words and 4.269.678 lines.
Since we do not have enough CPU and memory to process all the data, we take a random sample of 1% of the lines from each of the three files.
# a call to set.seed() here would make the sample reproducible
blogs_ss   <- sample(blogs,   length(blogs)*0.01,   replace=FALSE)
news_ss    <- sample(news,    length(news)*0.01,    replace=FALSE)
twitter_ss <- sample(twitter, length(twitter)*0.01, replace=FALSE)
alldocs <- paste(blogs_ss, news_ss, twitter_ss)  # combines the samples element-wise, recycling the shorter vectors
length(alldocs)
head(alldocs, 1)
## [1] 23601
## [1] "You can contact me back here (spike222@e-mail.ua) LOS ANGELES -- MetLife Inc. will pay nearly $500 million in a settlement involving more than 30 states that claimed it didn't provide life insurance benefits to some of its policyholders, the company said Monday. Blaring Sandwitches in my room with the door closed -Damien Wolfe"
Above we can see that our sample has 23.601 lines.
We then separate the texts into sentences.
library(qdap)   # sent_detect() for sentence splitting
library(doMC)   # parallel backend for foreach/%dopar%
registerDoMC(cores=12)

alldocs_sent <- foreach(n = 1:length(alldocs), .combine = c) %dopar% {
    sent_detect(alldocs[n], language = "en", model = NULL)  # separate into sentences
}
length(alldocs_sent)
head(alldocs_sent, 3)
## [1] 156277
## [1] "You can contact me back here LOS ANGELES -- MetLife Inc."
## [2] "will pay nearly $500 million in a settlement involving more than 30 states that claimed it didn't provide life insurance benefits to some of its policyholders, the company said Monday."
## [3] "Blaring Sandwitches in my room with the door closed -Damien Wolfe"
Thus, we went from 23.601 lines before sentence separation to 156.277 lines after.
We then build our corpus from the sentence-separated text. Using 12 cores of an Amazon instance, we remove profanity (using the word list obtained from http://www.cs.cmu.edu/~biglou/resources/bad-words.txt) and apply other general transformations such as removing punctuation, removing digits, stripping extra whitespace, and converting to lower case.
library(tm)  # text-mining framework (the doMC parallel backend was registered above)
corpus <- VCorpus(VectorSource(alldocs_sent))
badwords <- readLines("bad-words.txt")  # profanity list from http://www.cs.cmu.edu/~biglou/resources/bad-words.txt

corpus_good <- foreach(n = 1:length(corpus), .combine = c) %dopar% {
    corpus_n <- corpus[n]
    corpus_n <- tm_map(corpus_n, removeWords, badwords)         # remove profanity words
    corpus_n <- tm_map(corpus_n, removePunctuation)             # remove punctuation
    corpus_n <- tm_map(corpus_n, content_transformer(tolower))  # transform to lower case
    corpus_n <- tm_map(corpus_n, removeNumbers)                 # remove digits
    corpus_n <- tm_map(corpus_n, stripWhitespace)               # strip extra spaces
    corpus_n                                                    # chunks are combined with c()
}
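As a quick sanity check on the cleaning (not part of the pipeline itself), a few of the transformed documents can be inspected:

inspect(corpus_good[1:3])   # summary view of the first three cleaned sentences
content(corpus_good[[1]])   # raw text of the first cleaned sentence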
Next we perform tokenization to obtain single-word frequencies (one gram), two-word frequencies (bi gram), and three-word frequencies (tri gram).
library(RWeka)

one_gram <- foreach(n = 1:length(corpus_good), .combine = c) %dopar% {
    NGramTokenizer(corpus_good[[n]], Weka_control(min=1, max=1, delimiters=" \\r\\n\\t.,;:\"()?!"))  # single words
}
bi_gram <- foreach(n = 1:length(corpus_good), .combine = c) %dopar% {
    NGramTokenizer(corpus_good[[n]], Weka_control(min=2, max=2, delimiters=" \\r\\n\\t.,;:\"()?!"))  # word pairs
}
tri_gram <- foreach(n = 1:length(corpus_good), .combine = c) %dopar% {
    NGramTokenizer(corpus_good[[n]], Weka_control(min=3, max=3, delimiters=" \\r\\n\\t.,;:\"()?!"))  # word triples
}
To explore the data, we analyze the text separately in its three tokenized forms: one word (one gram), two words (bi gram) and three words (tri gram). For each form, we plot the top 10 grams with the highest frequencies and a wordcloud.
For the one gram, we plot the top 10 words with the highest frequencies and a wordcloud.
one_gram <- data.frame(table(one_gram))
one_gram <- one_gram[order(one_gram$Freq, decreasing = TRUE), ]
names(one_gram) <- c('OneGram', 'Frequency')

library(ggplot2)
ggplot(one_gram[1:10, ], aes(x=reorder(OneGram, Frequency), y=Frequency)) +
    geom_bar(stat="identity", fill="gray") +
    geom_text(aes(label=Frequency), hjust=-0.1) +
    coord_flip() + xlab("One Gram") + labs(title="Top 10 One Gram")

library(wordcloud)
library(RColorBrewer)  # brewer.pal()
wordcloud(one_gram$OneGram, one_gram$Frequency, min.freq=500, random.order=FALSE,
          rot.per=0.2, colors=brewer.pal(6, "Dark2"), use.r.layout=FALSE)
For the bi gram, we plot the top 10 word pairs with the highest frequencies and a wordcloud.
bi_gram <- data.frame(table(bi_gram))
bi_gram <- bi_gram[order(bi_gram$Freq, decreasing = TRUE), ]
names(bi_gram) <- c('BiGram', 'Frequency')

library(ggplot2)
ggplot(bi_gram[1:10, ], aes(x=reorder(BiGram, Frequency), y=Frequency)) +
    geom_bar(stat="identity", fill="gray") +
    geom_text(aes(label=Frequency), hjust=-0.1) +
    coord_flip() + xlab("Bi Gram") + labs(title="Top 10 Bi Gram")

library(wordcloud)
wordcloud(bi_gram$BiGram, bi_gram$Frequency, min.freq=400, random.order=FALSE,
          rot.per=0.2, colors=brewer.pal(6, "Dark2"), use.r.layout=FALSE)
For the tri gram, we plot the top 10 word triples with the highest frequencies and a wordcloud.
tri_gram <- data.frame(table(tri_gram))
tri_gram <- tri_gram[order(tri_gram$Freq, decreasing = TRUE), ]
names(tri_gram) <- c('TriGram', 'Frequency')

library(ggplot2)
ggplot(tri_gram[1:10, ], aes(x=reorder(TriGram, Frequency), y=Frequency)) +
    geom_bar(stat="identity", fill="gray") +
    geom_text(aes(label=Frequency), hjust=-0.1) +
    coord_flip() + xlab("Tri Gram") + labs(title="Top 10 Tri Gram")

library(wordcloud)
wordcloud(tri_gram$TriGram, tri_gram$Frequency, min.freq=200, random.order=FALSE,
          rot.per=0.2, colors=brewer.pal(6, "Dark2"), use.r.layout=FALSE)
The intended Shiny app will have an input field and, based on the 1, 2, or 3 words entered, will predict the next word.
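As an illustration of how the frequency tables above could drive this prediction, here is a minimal sketch of a backoff lookup. The function name predict_next_word and the backoff rules are illustrative choices, not the final algorithm; since we only built tables up to tri grams, a three-word input is matched on its last two words.

predict_next_word <- function(input, one_gram, bi_gram, tri_gram) {
    # normalise the input: lower case, split on whitespace
    words <- tolower(unlist(strsplit(trimws(input), "\\s+")))

    # return the last word of the most frequent n-gram starting with 'prefix ', or NULL
    match_gram <- function(gram_df, prefix) {
        hits <- gram_df[startsWith(as.character(gram_df[[1]]), paste0(prefix, " ")), ]
        if (nrow(hits) == 0) return(NULL)
        best <- as.character(hits[[1]][which.max(hits$Frequency)])
        tail(unlist(strsplit(best, " ")), 1)
    }

    if (length(words) >= 2) {  # try the tri-gram table on the last two words
        pred <- match_gram(tri_gram, paste(tail(words, 2), collapse = " "))
        if (!is.null(pred)) return(pred)
    }
    if (length(words) >= 1) {  # back off to the bi-gram table on the last word
        pred <- match_gram(bi_gram, tail(words, 1))
        if (!is.null(pred)) return(pred)
    }
    # final fallback: the most frequent single word overall
    as.character(one_gram$OneGram[which.max(one_gram$Frequency)])
}

predict_next_word("thanks for", one_gram, bi_gram, tri_gram)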
Therefore, to achieve this, we need to do the following: