Clean the environment.

# Remove every object that ls() reports in the current environment, so the
# analysis below starts from a clean workspace.
rm(list = ls())

Introduction.

The goal of the capstone project (the SwiftKey project on Natural Language Processing, NLP) is to develop a predictive text model using a large corpus of plain-text documents (written in English) as training data. NLP techniques are used to analyze the text (word sequences) and to build the predictive model.

The present “Milestone report” includes a summary of exploratory data analysis of the training data. Also, the report briefly summarizes the plans for developing the predictive model.

Initial exploratory data analysis of the plain-text files.

The exploratory data analysis of the training data includes the following steps:

Reading the text lines from the selected files: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.

# Directory that holds the three English plain-text training files.
data.dir <- "I:/Coursera/Data Science Specialization/Course10_Capstone Project/Project/Project Data"

# Read all lines of one text file into a character vector.
#
# path: path of the file to read.
# Returns a character vector with one element per line, marked as UTF-8.
#
# The connection is opened in binary mode ("rb") on purpose: the statistics
# recorded further down show en_US.news.txt yielding only ~15.7 million
# characters for a ~196 MB file, i.e. a text-mode read stops long before the
# end of the file (most likely at an embedded SUB/0x1A control character).
# A binary-mode connection reads through such bytes. `skipNul = TRUE` drops
# embedded NUL bytes instead of truncating the affected line.
read.text.lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read.text.lines(file.path(data.dir, "en_US.blogs.txt"))

news <- read.text.lines(file.path(data.dir, "en_US.news.txt"))

twitter <- read.text.lines(file.path(data.dir, "en_US.twitter.txt"))

Finding the overall size of the selected plain-text files.

# Size on disk (in bytes) of each of the three input files:
# m = blogs, n = news, p = twitter.
m <- file.size("I:/Coursera/Data Science Specialization/Course10_Capstone Project/Project/Project Data/en_US.blogs.txt")

n <- file.size("I:/Coursera/Data Science Specialization/Course10_Capstone Project/Project/Project Data/en_US.news.txt")

p <- file.size("I:/Coursera/Data Science Specialization/Course10_Capstone Project/Project/Project Data/en_US.twitter.txt")
# Report each size in MB (bytes / 1024^2) with three decimals.
size.blogs <- sprintf("%s%.3f", "The size of the file en_US.blogs.txt (in Megabyte (MB)): ", m / 1024^2)
print(size.blogs)
## [1] "The size of the file en_US.blogs.txt (in Megabyte (MB)): 200.424"
size.news <- sprintf("%s%.3f", "The size of the file en_US.news.txt (in Megabyte (MB)): ", n / 1024^2)
print(size.news)
## [1] "The size of the file en_US.news.txt (in Megabyte (MB)): 196.278"
size.twitter <- sprintf("%s%.3f", "The size of the file en_US.twitter.txt (in Megabyte (MB)): ", p / 1024^2)
print(size.twitter)
## [1] "The size of the file en_US.twitter.txt (in Megabyte (MB)): 159.364"

Determining the nr. of text lines and other features of the selected plain-text files.

# One way to determine the nr. of text lines, for example in the file <en_US.twitter.txt>, is
# to take the length of the character vector read from that file (one element per line).
# The twitter vector is the one that has to be measured here, not the news vector.
nr.of.lines.twitter <- length(twitter)
print(nr.of.lines.twitter)
## [1] 2360148
# A second, more comprehensive way to obtain the nr. of text lines together with
# other features of the selected plain-text files:
require(stringi)
## Loading required package: stringi
#-----------------------------------------------------------------------------------------------------------
# Note: <stri_stats_general> returns a named vector with these entries:
#
# Lines       - number of lines (number of non-missing strings in the vector);
# LinesNEmpty - number of lines with at least one non-WHITE_SPACE character;
# Chars       - total number of Unicode code points detected;
# CharsNWhite - number of Unicode code points that are not WHITE_SPACEs;
#-----------------------------------------------------------------------------------------------------------
# Compute the statistics for all three files first, then print them one by one.
var1 <- stri_stats_general(str = blogs)
var2 <- stri_stats_general(str = news)
var3 <- stri_stats_general(str = twitter)

# Features of <en_US.blogs.txt>:
print(var1)
##       Lines LinesNEmpty       Chars CharsNWhite 
##      899288      899288   208361438   171926076
# Features of <en_US.news.txt>:
print(var2)
##       Lines LinesNEmpty       Chars CharsNWhite 
##       77259       77259    15683765    13117038
# Features of <en_US.twitter.txt>:
print(var3)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     2360148     2360148   162384825   134370864

Counting the words per line from all three files.

# Word count of every single line, one integer per line, for each source.
words.in.blogs <- stri_count_words(str = blogs)
words.in.news <- stri_count_words(str = news)
words.in.twitter <- stri_count_words(str = twitter)

# Five-number summary (plus mean) of the words-per-line distribution: blogs.
summary(words.in.blogs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   29.00   42.43   61.00 6726.00
# Same summary for news.
summary(words.in.news)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.87   46.00 1123.00
# Same summary for twitter.
summary(words.in.twitter)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0     7.0    12.0    12.8    18.0    60.0

**A better summary of the previous findings, and more, for all three selected files.**

# Summary table with one row per source file:
#   source           - short name of the source;
#   size.file.in.MB  - size on disk in MB, rounded to three decimals;
#   nr.of.lines      - number of text lines read;
#   nr.of.words      - total number of words over all lines;
#   mean.nr.of.words - mean number of words per line.
#
# The size column is built from the byte sizes m, n and p (blogs, news, twitter)
# rather than from the size.* message strings, so that it holds plain numbers
# instead of whole sentences and the table prints on a single set of rows.
data.summarized <- data.frame(
  source = c("blogs", "news", "twitter"),
  size.file.in.MB = round(c(m, n, p) / 1024^2, 3),
  nr.of.lines = c(length(blogs), length(news), length(twitter)),
  nr.of.words = c(sum(words.in.blogs), sum(words.in.news), sum(words.in.twitter)),
  mean.nr.of.words = c(mean(words.in.blogs), mean(words.in.news), mean(words.in.twitter))
)
print(data.summarized)
##    source size.file.in.MB nr.of.lines nr.of.words mean.nr.of.words
## 1   blogs         200.424      899288    38154238         42.42716
## 2    news         196.278       77259     2693898         34.86840
## 3 twitter         159.364     2360148    30218125         12.80349

Sampling the data so that the code can be run on a personal computer.

require(stringr)
# Draw a random subset of lines from each source (without replacement).
sampling.1 <- blogs[sample.int(length(blogs), 20000)]
sampling.2 <- news[sample.int(length(news), 20000)]
sampling.3 <- twitter[sample.int(length(twitter), 10000)]

# The new sample is composed of 50,000 lines: 20,000 from blogs, 20,000 from news,
# and 10,000 from twitter. All three samples must be combined here; repeating the
# blogs sample three times would leave news and twitter out of the analysis.
sampling.overall <- c(sampling.1, sampling.2, sampling.3)

# Keep only ASCII characters: stri_enc_toascii() substitutes every non-ASCII
# character with the SUB control character (\032), which is then stripped.
# The replacement has to run on the converted text (ascllen), otherwise the
# ASCII conversion is thrown away.
ascllen <- stri_enc_toascii(sampling.overall)
ascllen <- stri_replace_all_regex(ascllen, '\032', '')

rm(sampling.1, sampling.2, sampling.3, sampling.overall) # release memory

Defining individual corpus from each data set. Defining an overall corpus for analysis

To reduce the computational effort and the long waiting times, only 1% of the data from the final corpus will be taken. This 1% will be used for the extended exploratory analysis.

require(tm)
# Build the overall corpus from the cleaned sample lines.
# Corpus must be used here instead of VCorpus: with VCorpus the N-gram search gives
# bad results. See the link:
# https://stackoverflow.com/questions/42757183/creating-n-grams-with-tm-rweka-works-with-vcorpus-but-not-corpus
corpus.unique <- Corpus(x = VectorSource(x = ascllen))
# Keep a random 1% of the documents for the extended exploratory analysis.
corpus.sample <- sample(x = corpus.unique, size = length(corpus.unique) * 0.01)

Cleaning process of the text.

require(tm)
# Ordered clean-up steps. Each step takes a corpus and returns the transformed
# corpus; the steps are applied one after another, top to bottom.
# NOTE(review): stop words are removed last, i.e. after stemming and after
# punctuation has been stripped - confirm that this order is intended.
cleaning.steps <- list(
  # Considering only the plain text.
  function(corp) tm_map(corp, PlainTextDocument),
  # Making all words in lowercase versions.
  function(corp) tm_map(corp, content_transformer(tolower)),
  # Remove numbers.
  function(corp) tm_map(corp, content_transformer(removeNumbers)),
  # Removing the white spaces.
  function(corp) tm_map(corp, content_transformer(stripWhitespace)),
  # Stemming.
  function(corp) tm_map(corp, content_transformer(stemDocument)),
  # Removing punctuation.
  function(corp) tm_map(corp, content_transformer(removePunctuation)),
  # Removing the stopwords.
  function(corp) tm_map(corp, content_transformer(removeWords), stopwords("english"))
)

# Thread the sampled corpus through every step; the last result is the cleaned corpus.
corpus.cleaned.final <- Reduce(function(corp, step) step(corp), cleaning.steps, corpus.sample)

rm(corpus.unique, corpus.sample, cleaning.steps) # release memory

Second exploratory data analysis of the <corpus.cleaned>. Plotting the N-grams

Finding words that occur at least 50 times in the <corpus.cleaned>.

require(tm)
# Term-document matrix of the cleaned corpus (terms in rows, documents in columns).
# Terms are stemmed and only terms of at least 3 characters are kept.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dtm.sample <- TermDocumentMatrix(
  corpus.cleaned.final,
  control = list(stemming = TRUE, wordLengths = c(3, Inf))
)
# Terms whose total frequency over all documents is at least 50.
findFreqTerms(dtm.sample, lowfreq = 50)
##  [1] "also"  "can"   "day"   "get"   "just"  "know"  "like"  "look" 
##  [9] "make"  "now"   "one"   "peopl" "thing" "think" "time"  "use"  
## [17] "will"  "year"

Finding words that occur at least 20 times in the <corpus.cleaned>.

After building a term-document matrix, we can visualize the importance of words (by frequency) with a word cloud (also known as a tag cloud).

require(wordcloud)
require(RColorBrewer)

# Total frequency of every term: row sums of the (dense) term-document matrix,
# sorted from the most to the least frequent term.
wcloud <- as.matrix(dtm.sample)
v <- sort(rowSums(wcloud), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# Draw only the terms that occur at least 20 times, most frequent in the centre.
# Every argument is named explicitly (the full name `colors` instead of a
# partially matched `col`) so the call does not depend on argument positions.
wordcloud(
  words = d$word,
  freq = d$freq,
  scale = c(4, 0.3),
  min.freq = 20,
  random.order = FALSE,
  colors = rainbow(500)
)

rm(dtm.sample) # release memory

Second exploratory data analysis of the <corpus.cleaned>. Plotting the N-grams

Plotting 1-gram (from <corpus.cleaned>).

require(tm)
require(RWeka)
require(data.table)
require(dplyr)
require(ggplot2)

# Tokenize the cleaned corpus into 1-grams and count how often each one occurs.
# The tokenizer control must be a Weka_control object (as in the 2-gram and
# 3-gram sections). `stemming` and `wordLengths` are term-document-matrix
# options, not tokenizer options, so they are not passed here; wrapping them in
# a list together with Weka_control does not form a valid tokenizer control.
unigram <- NGramTokenizer(corpus.cleaned.final, control = Weka_control(min = 1, max = 1))
unigram <- data.frame(table(unigram))
unigram <- unigram[order(unigram$Freq, decreasing = TRUE), ]
colnames(unigram) <- c("word.meaning", "word.frequency")
write.csv(unigram, file = "./unigram.csv")

# Number of distinct unigrams.
# NOTE(review): the value recorded from the earlier run (32702) was obtained with
# the list-style control and is larger than the recorded bigram and trigram counts,
# so it probably did not count unigrams only - re-run to refresh it.
nrow(unigram)

# Bar chart of the 15 most frequent unigrams (head() copes with fewer than 15 rows).
par(mar = c(5,4,2,2), las = 2)
barplot(height = head(unigram$word.frequency, 15), names.arg = head(unigram$word.meaning, 15), horiz = FALSE, col = heat.colors(15), main = "Top 15 word-unigram" , ylab = "Frequency")

# Coverage curve: cumulative share (%) of all word occurrences covered by the
# `number` most frequent unigrams. cumsum() gives the running total in one pass
# instead of re-summing the prefix for every row.
unigram$ratio <- cumsum(unigram$word.frequency)
unigram$ratio <- unigram$ratio * 100 / sum(unigram$word.frequency)
unigram$number <- seq_len(nrow(unigram))

g <- ggplot(unigram, aes(x = ratio, y = number))
g + geom_line() + labs(title = "The number of word unigrams vs. Total number of words") + labs(x = "Total number of words (%)", y = "The number of word unigrams") + coord_trans(y = "log10")

Plotting 2-gram (from <corpus.cleaned>).

library(rJava)
require(tm)
require(RWeka)
require(data.table)
require(dplyr)
require(ggplot2)

# Tokenize the cleaned corpus into 2-grams and count how often each one occurs,
# most frequent first; the table is also saved to ./bigram.csv.
bigram <- NGramTokenizer(corpus.cleaned.final, control = Weka_control(min = 2, max = 2))
bigram <- data.frame(table(bigram))
bigram <- bigram[order(bigram$Freq, decreasing = TRUE), ]
colnames(bigram) <- c("word.meaning", "word.frequency")
write.csv(bigram, file = "./bigram.csv")

# Number of distinct bigrams.
nrow(bigram)
## [1] 13550
# Bar chart of the 15 most frequent bigrams (head() copes with fewer than 15 rows).
par(mar = c(8,4,2,2), las = 2)
barplot(height = head(bigram$word.frequency, 15), names.arg = head(bigram$word.meaning, 15), horiz = FALSE, col = heat.colors(15), main = "Top 15 word-bigram" , ylab = "Frequency")

# Coverage curve: cumulative share (%) of all bigram occurrences covered by the
# `number` most frequent bigrams. cumsum() gives the running total in one pass
# instead of re-summing the prefix for every row.
bigram$ratio <- cumsum(bigram$word.frequency)
bigram$ratio <- bigram$ratio * 100 / sum(bigram$word.frequency)
bigram$number <- seq_len(nrow(bigram))

g <- ggplot(bigram, aes(x = ratio, y = number))
g + geom_line() + labs(title = "The number of word bigrams vs. Total number of words") + labs(x = "Total number of words (%)", y = "The number of word bigrams") + coord_trans(y = "log10")

Plotting 3-gram (from <corpus.cleaned>).

require(tm)
require(RWeka)
require(data.table)
require(dplyr)
require(ggplot2)

# Tokenize the cleaned corpus into 3-grams and count how often each one occurs,
# most frequent first; the table is also saved to ./trigram.csv.
trigram <- NGramTokenizer(corpus.cleaned.final, control = Weka_control(min = 3, max = 3))
trigram <- data.frame(table(trigram))
trigram <- trigram[order(trigram$Freq, decreasing = TRUE), ]
colnames(trigram) <- c("word.meaning", "word.frequency")
write.csv(trigram, file = "./trigram.csv")

# Number of distinct trigrams.
nrow(trigram)
## [1] 13980
# Bar chart of the 30 most frequent trigrams (head() copes with fewer than 30 rows).
# The palette has 30 colours, one per bar, so the gradient is not recycled half-way.
par(mar = c(10,4,2,2), las = 2)
barplot(height = head(trigram$word.frequency, 30), names.arg = head(trigram$word.meaning, 30), horiz = FALSE, col = heat.colors(30), main = "Top 30 word-trigram" , ylab = "Frequency")

# Coverage curve: cumulative share (%) of all trigram occurrences covered by the
# `number` most frequent trigrams. cumsum() gives the running total in one pass
# instead of re-summing the prefix for every row.
trigram$ratio <- cumsum(trigram$word.frequency)
trigram$ratio <- trigram$ratio * 100 / sum(trigram$word.frequency)
trigram$number <- seq_len(nrow(trigram))

g <- ggplot(trigram, aes(x = ratio, y = number))
g + geom_line() + labs(title = "The number of word trigrams vs. Total number of words") + labs(x = "Total number of words (%)", y = "The number of word trigrams") + coord_trans(y = "log10")

Conclusions

The next steps will include building a “Shiny app” that lets the user obtain a suggestion of the most probable bigram when entering a unigram (and so on for longer inputs), developing the prediction algorithm implemented in the Shiny app, and preparing a pitch about the app and publishing it.