Clean the environment.
# Remove every object that ls() lists in the current environment so the
# analysis starts from an empty workspace. Objects whose names start with a
# dot are not listed by ls() and therefore stay; attached packages and
# options are not touched either.
rm(list = ls())
The goal of the capstone project (the SwiftKey project on Natural Language Processing, NLP) is to develop a predictive text model using a large corpus of plain-text documents (written in English) as training data. NLP techniques are used to analyze the text (word sequences) and to build the predictive model.
The present “Milestone report” includes a summary of exploratory data analysis of the training data. Also, the report briefly summarizes the plans for developing the predictive model.
The exploratory data analysis of the training data includes the following steps:
# Load the three English training files and record their sizes on disk.
#
# The files are read through a connection opened in binary mode ("rb").
# In text mode on Windows, readLines() stops at the first Ctrl-Z (0x1A) byte,
# silently truncating any file that contains one; the news line count recorded
# further down (77,259 lines for a ~196 MB file) is consistent with such a
# truncation. skipNul = TRUE skips embedded NUL bytes instead of cutting the
# line at them, and encoding = "UTF-8" marks the resulting strings as UTF-8.
data.dir <- "I:/Coursera/Data Science Specialization/Course10_Capstone Project/Project/Project Data"
blogs.path <- file.path(data.dir, "en_US.blogs.txt")
news.path <- file.path(data.dir, "en_US.news.txt")
twitter.path <- file.path(data.dir, "en_US.twitter.txt")

# Read all lines of one text file; the connection is closed even on error.
read.text.lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read.text.lines(blogs.path)
news <- read.text.lines(news.path)
twitter <- read.text.lines(twitter.path)

# File sizes in bytes (m: blogs, n: news, p: twitter).
m <- file.size(blogs.path)
n <- file.size(news.path)
p <- file.size(twitter.path)
# Build and print one message per file with its size in MB.
# m, n and p hold the sizes in bytes; 2^20 bytes (= 1024^2) make one MB, and
# the value is rendered in fixed notation with three decimals.
size.blogs <- paste0(
  "The size of the file en_US.blogs.txt (in Megabyte (MB)): ",
  formatC(m / 2^20, format = "f", digits = 3)
)
print(size.blogs)
## [1] "The size of the file en_US.blogs.txt (in Megabyte (MB)): 200.424"
size.news <- paste0(
  "The size of the file en_US.news.txt (in Megabyte (MB)): ",
  formatC(n / 2^20, format = "f", digits = 3)
)
print(size.news)
## [1] "The size of the file en_US.news.txt (in Megabyte (MB)): 196.278"
size.twitter <- paste0(
  "The size of the file en_US.twitter.txt (in Megabyte (MB)): ",
  formatC(p / 2^20, format = "f", digits = 3)
)
print(size.twitter)
## [1] "The size of the file en_US.twitter.txt (in Megabyte (MB)): 159.364"
# One way to determine the number of text lines, for example in the file <en_US.twitter.txt>:
# readLines() returns one element per line, so the length of the vector is the
# line count. The twitter vector is measured here (not the news vector).
nr.of.lines.twitter <- length(twitter)
print(nr.of.lines.twitter)
## [1] 2360148
# A more comprehensive way to obtain the number of lines, together with other
# features of the selected plain-text files, is stri_stats_general() from stringi:
require(stringi)
## Loading required package: stringi
#-----------------------------------------------------------------------------------------------------------
# Meaning of the values printed by <stri_stats_general>:
#   Lines       - number of lines (non-missing strings in the vector);
#   LinesNEmpty - number of lines holding at least one non-WHITE_SPACE character;
#   Chars       - total number of Unicode code points detected;
#   CharsNWhite - number of Unicode code points that are not WHITE_SPACEs.
#-----------------------------------------------------------------------------------------------------------
# General statistics for <en_US.blogs.txt>:
var1 <- stringi::stri_stats_general(blogs)
print(var1)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 208361438 171926076
# General statistics for <en_US.news.txt>:
var2 <- stringi::stri_stats_general(news)
print(var2)
## Lines LinesNEmpty Chars CharsNWhite
## 77259 77259 15683765 13117038
# General statistics for <en_US.twitter.txt>:
var3 <- stringi::stri_stats_general(twitter)
print(var3)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162384825 134370864
# Count the words on every line of each file (one count per line), then print
# the six-number summary of those per-line counts.
words.in.blogs <- stringi::stri_count_words(blogs)
print(summary(words.in.blogs))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 29.00 42.43 61.00 6726.00
words.in.news <- stringi::stri_count_words(news)
print(summary(words.in.news))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.00 32.00 34.87 46.00 1123.00
words.in.twitter <- stringi::stri_count_words(twitter)
print(summary(words.in.twitter))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 7.0 12.0 12.8 18.0 60.0
# Summary table with one row per source file.
# size.file.in.MB is a numeric column computed from the byte sizes m, n and p
# (1 MB = 1024^2 bytes, three decimals), so it can be sorted and compared.
# size.blogs / size.news / size.twitter are full sentences meant for printing
# and do not belong in a table column.
data.summarized <- data.frame(
  source = c("blogs", "news", "twitter"),
  size.file.in.MB = round(c(m, n, p) / 1024^2, 3),
  nr.of.lines = c(length(blogs), length(news), length(twitter)),
  nr.of.words = c(sum(words.in.blogs), sum(words.in.news), sum(words.in.twitter)),
  mean.nr.of.words = c(mean(words.in.blogs), mean(words.in.news), mean(words.in.twitter)),
  stringsAsFactors = FALSE
)
print(data.summarized)
##    source size.file.in.MB nr.of.lines nr.of.words mean.nr.of.words
## 1   blogs         200.424      899288    38154238         42.42716
## 2    news         196.278       77259     2693898         34.86840
## 3 twitter         159.364     2360148    30218125         12.80349
require(stringr)
# Fix the random seed so the same lines are drawn every time the report is built.
set.seed(1234)
# Draw lines without replacement from each source.
sampling.1 <- blogs[sample.int(length(blogs), 20000)]
sampling.2 <- news[sample.int(length(news), 20000)]
sampling.3 <- twitter[sample.int(length(twitter), 10000)]
# The new sample is composed of 50,000 lines: 20,000 from blogs, 20,000 from
# news and 10,000 from twitter. All three samples are combined.
sampling.overall <- c(sampling.1, sampling.2, sampling.3)
# Keep ASCII characters only. stri_enc_toascii() replaces every non-ASCII
# code point with the SUBSTITUTE character (0x1A, written "\032" in R); the
# second step deletes those substitutes. It must work on the converted text
# (ascllen), otherwise the conversion is thrown away.
ascllen <- stri_enc_toascii(sampling.overall)
ascllen <- stri_replace_all_regex(ascllen, '\032', '')
rm(sampling.1, sampling.2, sampling.3, sampling.overall) # release memory
To reduce the computational effort and the long waiting times, a 1% sample of the final corpus is taken. This 1% sample is used for the extended exploratory analysis.
library(tm)
# Corpus is used here instead of VCorpus; VCorpus gave bad results when finding
# the N-grams. See the link: https://stackoverflow.com/questions/42757183/creating-n-grams-with-tm-rweka-works-with-vcorpus-but-not-corpus
corpus.unique <- Corpus(VectorSource(ascllen))
# Keep a random 1% of the documents (rounded down to a whole number).
corpus.sample <- sample(corpus.unique, floor(length(corpus.unique) * 0.01))

# Cleaning pipeline. The order matters:
#   1. lower-case, so stop words match regardless of capitalisation;
#   2. remove stop words while they are still intact -- the stop-word list
#      contains unstemmed forms with apostrophes (e.g. "don't", "because"),
#      which no longer match once the text is stemmed or punctuation is gone;
#   3. remove punctuation and numbers;
#   4. collapse the whitespace left behind by the removals above;
#   5. stem what remains.
# No PlainTextDocument conversion is applied: the corpus is built from a
# character vector, so its content is already plain text, and mapping
# PlainTextDocument over such a corpus can collapse it into a single document
# object (tm then warns "transformation drops documents").
corpus.cleaned.final <- tm_map(corpus.sample, content_transformer(tolower))
corpus.cleaned.final <- tm_map(corpus.cleaned.final, removeWords, stopwords("english"))
corpus.cleaned.final <- tm_map(corpus.cleaned.final, removePunctuation)
corpus.cleaned.final <- tm_map(corpus.cleaned.final, removeNumbers)
corpus.cleaned.final <- tm_map(corpus.cleaned.final, stripWhitespace)
corpus.cleaned.final <- tm_map(corpus.cleaned.final, stemDocument)
rm(corpus.unique, corpus.sample) # release memory
require(tm)
# Build the term-document matrix of the cleaned sample. Terms are stemmed and
# only terms with at least three characters are kept.
dtm.sample <- TermDocumentMatrix(
  corpus.cleaned.final,
  control = list(stemming = TRUE, wordLengths = c(3, Inf))
)
# List the terms whose frequency reaches the lower bound of 50.
print(findFreqTerms(dtm.sample, lowfreq = 50))
## [1] "also" "can" "day" "get" "just" "know" "like" "look"
## [9] "make" "now" "one" "peopl" "thing" "think" "time" "use"
## [17] "will" "year"
After building the term-document matrix, we can visualize the importance of words (by frequency) with a word cloud (also known as a tag cloud).
require(wordcloud)
require(RColorBrewer)
# Turn the term-document matrix into an ordinary matrix and add up each row to
# get one total per term, sorted from most to least frequent.
wcloud <- as.matrix(dtm.sample)
v <- sort(rowSums(wcloud), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# Draw the cloud: font scale from 4 down to 0.3, only words with a total of at
# least 20, most frequent words placed first (centre), rainbow palette.
wordcloud(
  words = d$word,
  freq = d$freq,
  scale = c(4, 0.3),
  min.freq = 20,
  random.order = FALSE,
  colors = rainbow(500)
)
rm(dtm.sample) # release memory
library(tm)
library(RWeka)
require(data.table)
require(dplyr)
library(ggplot2)
# NGramTokenizer() works on a character vector, so the text is pulled out of
# the corpus first (one string per document). Handing it the corpus object
# makes it tokenize a deparsed dump of the whole object, metadata included.
corpus.text <- unlist(lapply(corpus.cleaned.final, as.character), use.names = FALSE)
# The tokenizer options must be given as a Weka_control object. Wrapped in a
# plain list they are not passed on as -min/-max, and the tokenizer falls back
# to its default n-gram range. `stemming` and `wordLengths` are options of
# the term-document-matrix builder, not of the tokenizer, and are not
# passed here (the corpus is already stemmed).
unigram <- NGramTokenizer(corpus.text, control = Weka_control(min = 1, max = 1))
# Frequency table of the tokens, most frequent first.
unigram <- data.frame(table(unigram))
unigram <- unigram[order(unigram$Freq, decreasing = TRUE), ]
colnames(unigram) <- c("word.meaning", "word.frequency")
write.csv(unigram, file = "./unigram.csv")
# Number of distinct unigrams.
# NOTE(review): the value recorded below comes from the earlier call whose
# min/max options were not applied; it equals the recorded bigram and trigram
# counts plus 5,172, i.e. it also counted 2- and 3-grams. Re-knit to refresh.
length(unigram$word.meaning)
## [1] 32702
# Bar plot of the most frequent unigrams (at most 15; head() avoids NA bars
# when fewer are available). las = 2 turns the axis labels perpendicular.
par(mar = c(5,4,2,2), las = 2)
top.unigram <- head(unigram, 15)
barplot(height = top.unigram$word.frequency, names.arg = as.character(top.unigram$word.meaning), horiz = FALSE, col = heat.colors(15), main = "Top 15 word-unigram" , ylab = "Frequency")
# Coverage curve: cumulative share (%) of all word instances covered by the
# k most frequent unigrams. cumsum() computes the running total in one pass.
unigram$ratio <- cumsum(unigram$word.frequency) * 100 / sum(unigram$word.frequency)
unigram$number <- seq_len(nrow(unigram))
g <- ggplot(unigram, aes(x = ratio, y = number))
g + geom_line() + labs(title = "The number of word unigrams vs. Total number of words") + labs(x = "Total number of words (%)", y = "The number of word unigrams") + coord_trans(y = "log10")
library(rJava)
library(tm)
library(RWeka)
require(data.table)
require(dplyr)
library(ggplot2)
# NGramTokenizer() works on a character vector, so the text is pulled out of
# the corpus first (one string per document). Handing it the corpus object
# makes it tokenize a deparsed dump of the whole object, metadata included.
corpus.text <- unlist(lapply(corpus.cleaned.final, as.character), use.names = FALSE)
bigram <- NGramTokenizer(corpus.text, control = Weka_control(min = 2, max = 2))
# Frequency table of the bigrams, most frequent first.
bigram <- data.frame(table(bigram))
bigram <- bigram[order(bigram$Freq, decreasing = TRUE), ]
colnames(bigram) <- c("word.meaning", "word.frequency")
write.csv(bigram, file = "./bigram.csv")
# Number of distinct bigrams (the value below is the output recorded when the
# report was last knitted).
length(bigram$word.meaning)
## [1] 13550
# Bar plot of the most frequent bigrams (at most 15; head() avoids NA bars
# when fewer are available). The larger bottom margin leaves room for the
# perpendicular (las = 2) two-word labels.
par(mar = c(8,4,2,2), las = 2)
top.bigram <- head(bigram, 15)
barplot(height = top.bigram$word.frequency, names.arg = as.character(top.bigram$word.meaning), horiz = FALSE, col = heat.colors(15), main = "Top 15 word-bigram" , ylab = "Frequency")
# Coverage curve: cumulative share (%) of all bigram instances covered by the
# k most frequent bigrams. cumsum() computes the running total in one pass.
bigram$ratio <- cumsum(bigram$word.frequency) * 100 / sum(bigram$word.frequency)
bigram$number <- seq_len(nrow(bigram))
g <- ggplot(bigram, aes(x = ratio, y = number))
g + geom_line() + labs(title = "The number of word bigrams vs. Total number of words") + labs(x = "Total number of words (%)", y = "The number of word bigrams") + coord_trans(y = "log10")
library(tm)
library(RWeka)
require(data.table)
require(dplyr)
library(ggplot2)
# NGramTokenizer() works on a character vector, so the text is pulled out of
# the corpus first (one string per document). Handing it the corpus object
# makes it tokenize a deparsed dump of the whole object, metadata included.
corpus.text <- unlist(lapply(corpus.cleaned.final, as.character), use.names = FALSE)
trigram <- NGramTokenizer(corpus.text, control = Weka_control(min = 3, max = 3))
# Frequency table of the trigrams, most frequent first.
trigram <- data.frame(table(trigram))
trigram <- trigram[order(trigram$Freq, decreasing = TRUE), ]
colnames(trigram) <- c("word.meaning", "word.frequency")
write.csv(trigram, file = "./trigram.csv")
# Number of distinct trigrams (the value below is the output recorded when the
# report was last knitted).
length(trigram$word.meaning)
## [1] 13980
# Bar plot of the most frequent trigrams (at most 30; head() avoids NA bars
# when fewer are available). One colour per bar, so the palette is not
# recycled across the 30 bars.
par(mar = c(10,4,2,2), las = 2)
top.trigram <- head(trigram, 30)
barplot(height = top.trigram$word.frequency, names.arg = as.character(top.trigram$word.meaning), horiz = FALSE, col = heat.colors(30), main = "Top 30 word-trigram" , ylab = "Frequency")
# Coverage curve: cumulative share (%) of all trigram instances covered by the
# k most frequent trigrams. cumsum() computes the running total in one pass.
trigram$ratio <- cumsum(trigram$word.frequency) * 100 / sum(trigram$word.frequency)
trigram$number <- seq_len(nrow(trigram))
g <- ggplot(trigram, aes(x = ratio, y = number))
g + geom_line() + labs(title = "The number of word trigrams vs. Total number of words") + labs(x = "Total number of words (%)", y = "The number of word trigrams") + coord_trans(y = "log10")
The next steps will include building a “Shiny app” that lets the user obtain a suggestion for the most probable next word (for example, the most probable bigram when a unigram is entered), developing the prediction algorithm implemented in the Shiny app, preparing a pitch about the app, and publishing it.