The purpose of this document is to initiate the process that will lead to the creation of an NLP application that predicts the next word in a sentence as it is being typed. This report does not represent the final project content; it provides exploratory analysis to guide further investigation. First, the data is randomly sampled and analyzed word by word (unigrams) to obtain occurrence distributions. The data is then analyzed in phrases of two or three words (bigrams and trigrams). It was found that a limited set of frequently occurring phrases covers a significant share of the data (up to 94% with three-word phrases).
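For reference, a coverage figure of this kind can be checked with a small helper like the sketch below, where freq is assumed to be a full named vector of n-gram counts (for example, the colSums of a document-term matrix as built later in this report).

# Sketch: number of distinct n-grams needed to cover a given share of all
# n-gram occurrences.
coverage <- function(freq, target = 0.9) {
  shares <- cumsum(sort(freq, decreasing = TRUE)) / sum(freq)
  which(shares >= target)[1]
}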
The Coursera dataset comes from the HC Corpora. Among the different corpora available, we will process the English one, which is composed of three text files: Twitter posts, news articles, and blog posts.
The files have been downloaded into my working directory. Their summary statistics are:

| File | Size | Lines | Word count |
|------|------|-------|------------|
| en_US.twitter.txt | 301.4 MB | 2,360,148 | 30,373,792 |
| en_US.news.txt | 19.2 MB | 77,259 | 2,643,972 |
| en_US.blogs.txt | 248.5 MB | 899,288 | 37,334,441 |
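Comparable statistics can be computed along the following lines (a sketch only: the whitespace-based word count is an approximation, and the reported sizes may reflect in-memory objects rather than files on disk, so figures may differ slightly from the table above).

# Sketch: size, line count and approximate word count for one of the files.
summarise_file <- function(path) {
  lines <- readLines(path, skipNul = TRUE)  # skip embedded nul characters, if any
  data.frame(file    = basename(path),
             size_mb = round(file.size(path) / 1024^2, 1),
             lines   = length(lines),
             words   = sum(lengths(strsplit(lines, "\\s+"))))
}
summarise_file("en_US.twitter.txt")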
# Load the required libraries
library(tm)
library(SnowballC)   # provides the Snowball stemmer used by stemDocument()
library(caTools)
library(wordcloud)
library(RColorBrewer)
library(RWeka)
# Fix the random seed so the sampling below is reproducible
set.seed(144)
#Loading the 3 files
twitter <- readLines("en_US.twitter.txt")
news <- readLines("en_US.news.txt")
blogs <- readLines("en_US.blogs.txt")
# Function to clean a character vector and return a tm corpus:
# lower-case, remove punctuation, drop English stop words, and stem.
ready <- function(char) {
  corpus <- Corpus(VectorSource(char))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stemDocument)
  return(corpus)
}
# Randomly sample 0.1% of the Twitter lines, 3% of the news lines, and 0.2% of the blog lines to keep processing time manageable.
spl = sample.split(twitter, 0.001)
sampleTwitter = subset(twitter, spl == TRUE)
spl2 = sample.split(news, 0.03)
sampleNews = subset(news, spl2 == TRUE)
spl3 = sample.split(blogs, 0.002)
sampleBlogs = subset(blogs, spl3 == TRUE)
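# Note (sketch): an equivalent random sub-sample could also be drawn with base R
# alone, e.g. sampleTwitter <- sample(twitter, round(0.001 * length(twitter))),
# without relying on caTools::sample.split().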
# Put the three samples in a list so they can be passed to the ready() function
listCorpus <- list(sampleTwitter, sampleNews, sampleBlogs)
# Remove the objects that are no longer needed
rm("twitter", "news", "blogs", "spl", "spl2", "spl3", "sampleTwitter", "sampleNews", "sampleBlogs")
# Clean each sample with the ready() function
corpusTwitter <- ready(listCorpus[[1]])
corpusNews <- ready(listCorpus[[2]])
corpusBlogs <- ready(listCorpus[[3]])
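# Build a document-term matrix for each cleaned corpus, drop terms absent from
# more than 97% of documents, and convert the result to a data frame for the
# word clouds below.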
dtmTwitter = DocumentTermMatrix(corpusTwitter)
dtmTwitter = removeSparseTerms(dtmTwitter, 0.97)
dfTwitter = as.data.frame(as.matrix(dtmTwitter))
dtmBlogs = DocumentTermMatrix(corpusBlogs)
dtmBlogs = removeSparseTerms(dtmBlogs, 0.97)
dfBlogs = as.data.frame(as.matrix(dtmBlogs))
dtmNews = DocumentTermMatrix(corpusNews)
dtmNews = removeSparseTerms(dtmNews, 0.97)
dfNews = as.data.frame(as.matrix(dtmNews))
rm("listCorpus", "ready")
The word clouds give visual insight into the most frequent words in each corpus.
par(mfrow = c(1,3))
wordcloud(colnames(dfTwitter),
colSums(dfTwitter),
scale=c(3, .25),
colors=brewer.pal(9, "YlOrRd"),
rot.per=0.35,
random.order=FALSE)
title("Twitter")
wordcloud(colnames(dfNews),
colSums(dfNews),
scale=c(3, .5),
colors=brewer.pal(9, "YlOrRd"),
rot.per=0.35,
random.order=FALSE)
title("News")
wordcloud(colnames(dfBlogs),
colSums(dfBlogs),
scale=c(2, .25),
colors=brewer.pal(9, "YlOrRd"),
rot.per=0.35,
random.order=FALSE)
title("Blogs")
## n-Grams

N-gram charts for the three corpora. Only the 20 most frequent tokens are displayed.
UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
par(mfrow = c(3,1))
dtmTwitter <- DocumentTermMatrix(corpusTwitter, control=list(tokenize=UnigramTokenizer))
freqTwitter <- sort(colSums(as.matrix(dtmTwitter)), decreasing=TRUE)[1:20]
barTwitter <- barplot(freqTwitter, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="red", main=paste0("Frequency of 1-Gram for Twitter"))
text(barTwitter, par("usr")[3], labels=names(freqTwitter), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)
dtmNews <- DocumentTermMatrix(corpusNews, control=list(tokenize=UnigramTokenizer))
freqNews <- sort(colSums(as.matrix(dtmNews)), decreasing=TRUE)[1:20]
barNews <- barplot(freqNews, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="red", main=paste0("Frequency of 1-Gram for News"))
text(barNews, par("usr")[3], labels=names(freqNews), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)
dtmBlogs <- DocumentTermMatrix(corpusBlogs, control=list(tokenize=UnigramTokenizer))
freqBlogs <- sort(colSums(as.matrix(dtmBlogs)), decreasing=TRUE)[1:20]
barBlogs <- barplot(freqBlogs, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="red", main=paste0("Frequency of 1-Gram for Blogs"))
text(barBlogs, par("usr")[3], labels=names(freqBlogs), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
par(mfrow = c(3,1))
dtmTwitter <- DocumentTermMatrix(corpusTwitter, control=list(tokenize=BigramTokenizer))
freqTwitter <- sort(colSums(as.matrix(dtmTwitter)), decreasing=TRUE)[1:20]
barTwitter <- barplot(freqTwitter, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="blue", main=paste0("Frequency of 2-Grams for Twitter"))
text(barTwitter, par("usr")[3], labels=names(freqTwitter), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)
dtmNews <- DocumentTermMatrix(corpusNews, control=list(tokenize=BigramTokenizer))
freqNews <- sort(colSums(as.matrix(dtmNews)), decreasing=TRUE)[1:20]
barNews <- barplot(freqNews, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="blue", main=paste0("Frequency of 2-Grams for News"))
text(barNews, par("usr")[3], labels=names(freqNews), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)
dtmBlogs <- DocumentTermMatrix(corpusBlogs, control=list(tokenize=BigramTokenizer))
freqBlogs <- sort(colSums(as.matrix(dtmBlogs)), decreasing=TRUE)[1:20]
barBlogs <- barplot(freqBlogs, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="blue", main=paste0("Frequency of 2-Grams for Blogs"))
text(barBlogs, par("usr")[3], labels=names(freqBlogs), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
par(mfrow = c(3,1))
dtmTwitter <- DocumentTermMatrix(corpusTwitter, control=list(tokenize=TrigramTokenizer))
freqTwitter <- sort(colSums(as.matrix(dtmTwitter)), decreasing=TRUE)[1:20]
barTwitter <- barplot(freqTwitter, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="yellow", main=paste0("Frequency of 3-Grams for Twitter"))
text(barTwitter, par("usr")[3], labels=names(freqTwitter), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)
dtmNews <- DocumentTermMatrix(corpusNews, control=list(tokenize=TrigramTokenizer))
freqNews <- sort(colSums(as.matrix(dtmNews)), decreasing=TRUE)[1:20]
barNews <- barplot(freqNews, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="yellow", main=paste0("Frequency of 3-Grams for News"))
text(barNews, par("usr")[3], labels=names(freqNews), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)
dtmBlogs <- DocumentTermMatrix(corpusBlogs, control=list(tokenize=TrigramTokenizer))
freqBlogs <- sort(colSums(as.matrix(dtmBlogs)), decreasing=TRUE)[1:20]
barBlogs <- barplot(freqBlogs, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="yellow", main=paste0("Frequency of 3-Grams for Blogs"))
text(barBlogs, par("usr")[3], labels=names(freqBlogs), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)
As the charts show, even the most frequent bigrams and trigrams occur far less often than the most frequent unigrams: individual multi-word phrases account for only a small fraction of the text.
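One way to quantify this is to compare the share of all token occurrences captured by the 20 most frequent terms for each n-gram size. The sketch below assumes a full DocumentTermMatrix for a given n-gram size (the freq vectors above were already truncated to their top 20 entries, so they cannot be reused directly).

# Sketch: share of all token occurrences captured by the 20 most frequent terms
# of a document-term matrix.
top20_share <- function(dtm) {
  freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
  sum(freq[1:20]) / sum(freq)
}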
This data will be used to build the NLP application. The remaining steps are:

- Take a subset of the data, which will allow us to train and test a prediction model with reasonable performance (see the sketch below for one possible prediction approach).
- Develop and publish a Shiny app.
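As a preview of how the n-gram counts could eventually drive prediction, a minimal back-off lookup might look like the sketch below. The objects triFreq and biFreq are hypothetical full named count vectors of 3-grams and 2-grams (not the truncated top-20 vectors used for the charts), and this is only one possible approach, not the final model.

# Sketch: predict the next word from the two previous words with a simple
# back-off: try matching trigrams first, then bigrams, otherwise return NA.
predict_next <- function(prev2, prev1, triFreq, biFreq) {
  tri <- triFreq[startsWith(names(triFreq), paste(prev2, prev1, ""))]
  if (length(tri) > 0) {
    return(tail(strsplit(names(tri)[which.max(tri)], " ")[[1]], 1))
  }
  bi <- biFreq[startsWith(names(biFreq), paste0(prev1, " "))]
  if (length(bi) > 0) {
    return(tail(strsplit(names(bi)[which.max(bi)], " ")[[1]], 1))
  }
  NA_character_
}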