Introduction

The purpose of this document is to start the work that will lead to the creation of an NLP application that predicts the next word of a sentence as it is being typed. This report does not represent the final project content; it provides exploratory analysis to guide further investigation. The data is first randomly sampled, then analyzed word by word (unigrams) for occurrence distributions, and finally analyzed as two- and three-word phrases (bigrams and trigrams). The analysis suggests that a small number of frequent phrases covers a large share of the data (up to 94% with three-word phrases).
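To make the coverage idea concrete, here is a minimal sketch of how such a figure can be computed from a named vector of phrase frequencies (the name phraseCounts is illustrative and not part of the analysis code further below):

# Sketch: how many of the most frequent phrases are needed to cover a given
# share of all phrase occurrences. phraseCounts is a named frequency vector,
# e.g. colSums(as.matrix(dtm)) for an n-gram document-term matrix.
coverage <- function(phraseCounts, share = 0.9) {
        freq <- sort(phraseCounts, decreasing = TRUE)
        cumShare <- cumsum(freq) / sum(freq)
        which(cumShare >= share)[1]   # number of phrases needed to reach `share`
}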

The Data

The Coursera dataset comes from the HC Corpora. Among the corpora available, we process the English one, which consists of three text files: tweets, news articles, and blog posts.
The files have been downloaded to the working directory.

Twitter

File size: 301.4 MB. Lines: 2,360,148. Words: 30,373,792.

News

File size: 19.2 MB. Lines: 77,259. Words: 2,643,972.

Blogs

File size: 248.5 MB. Lines: 899,288. Words: 37,334,441.
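These figures can be reproduced along the following lines (a sketch, assuming the three files sit in the working directory; the exact word counts depend on how tokens are split):

# Sketch: file size, line count and a simple whitespace-based word count.
fileStats <- function(path) {
        txt <- readLines(path, encoding = "UTF-8", skipNul = TRUE)
        data.frame(file   = basename(path),
                   sizeMB = round(file.info(path)$size / 1024^2, 1),
                   lines  = length(txt),
                   words  = sum(lengths(strsplit(txt, "\\s+"))))
}
do.call(rbind, lapply(c("en_US.twitter.txt", "en_US.news.txt", "en_US.blogs.txt"), fileStats))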

Loading and Cleaning the data

#Loading libraries
library(tm)
library(caTools)
library(wordcloud)
library(RColorBrewer)
library(RWeka)
set.seed(144)

# Load the three files (skipNul avoids problems with embedded nul characters)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
news    <- readLines("en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
blogs   <- readLines("en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)

# Function to clean a character vector and return a tm corpus ready for analysis
ready <- function(char) {
        corpus <- Corpus(VectorSource(char))
        corpus <- tm_map(corpus, content_transformer(tolower))       # lower-case
        corpus <- tm_map(corpus, removePunctuation)                  # drop punctuation
        corpus <- tm_map(corpus, removeWords, stopwords("english"))  # drop stop words
        corpus <- tm_map(corpus, stemDocument)                       # stem words

        return(corpus)
}

# We randomly sample 0.1% of the Twitter lines, 3% of the news lines, and 0.2% of the blog lines to keep processing manageable.
spl = sample.split(twitter, 0.001)
sampleTwitter = subset(twitter, spl == TRUE)
spl2 = sample.split(news, 0.03)
sampleNews = subset(news, spl2 == TRUE)
spl3 = sample.split(blogs, 0.002)
sampleBlogs = subset(blogs, spl3 == TRUE)

# List of the sampled texts to pass to the ready function
listCorpus <- list(sampleTwitter, sampleNews, sampleBlogs)
# Remove the full datasets and intermediate objects to free memory
rm("twitter", "news", "blogs", "spl", "spl2", "spl3", "sampleTwitter", "sampleNews", "sampleBlogs")

# Call the ready function to build the cleaned corpora
corpusTwitter <- ready(listCorpus[[1]])
corpusNews <- ready(listCorpus[[2]])
corpusBlogs <- ready(listCorpus[[3]])

# Build document-term matrices and drop very sparse terms
# (a 0.97 threshold keeps terms appearing in at least ~3% of the documents)
dtmTwitter = DocumentTermMatrix(corpusTwitter)
dtmTwitter = removeSparseTerms(dtmTwitter, 0.97)        
dfTwitter = as.data.frame(as.matrix(dtmTwitter))

dtmBlogs = DocumentTermMatrix(corpusBlogs)
dtmBlogs = removeSparseTerms(dtmBlogs, 0.97)        
dfBlogs = as.data.frame(as.matrix(dtmBlogs))

dtmNews = DocumentTermMatrix(corpusNews)
dtmNews = removeSparseTerms(dtmNews, 0.97)        
dfNews = as.data.frame(as.matrix(dtmNews))

rm("listCorpus", "ready")

Exploratory analysis

Word cloud

The word clouds give a visual overview of the most frequent words in each corpus.

par(mfrow = c(1,3))

wordcloud(colnames(dfTwitter), 
          colSums(dfTwitter), 
          scale=c(3, .25),
          colors=brewer.pal(9, "YlOrRd"),
          rot.per=0.35,
          random.order=FALSE)
title("Twitter")

wordcloud(colnames(dfNews), 
          colSums(dfNews), 
          scale=c(3, .5),
          colors=brewer.pal(9, "YlOrRd"),
          rot.per=0.35,
          random.order=FALSE)
title("News")

wordcloud(colnames(dfBlogs), 
          colSums(dfBlogs), 
          scale=c(2, .25),
          colors=brewer.pal(9, "YlOrRd"),
          rot.per=0.35,
          random.order=FALSE)
title("Blogs")

n-Grams

N-gram charts for the three corpora. Only the 20 most frequent tokens are displayed.

1-Grams

UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
par(mfrow = c(3,1))


dtmTwitter <- DocumentTermMatrix(corpusTwitter, control=list(tokenize=UnigramTokenizer))
freqTwitter <- sort(colSums(as.matrix(dtmTwitter)), decreasing=TRUE)[1:20]    
barTwitter <- barplot(freqTwitter, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="red", main=paste0("Frequency of 1-Gram for Twitter"))
text(barTwitter, par("usr")[3], labels=names(freqTwitter), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)

dtmNews <- DocumentTermMatrix(corpusNews, control=list(tokenize=UnigramTokenizer))
freqNews <- sort(colSums(as.matrix(dtmNews)), decreasing=TRUE)[1:20]    
barNews <- barplot(freqNews, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="red", main=paste0("Frequency of 1-Gram for News"))
text(barNews, par("usr")[3], labels=names(freqNews), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)

dtmBlogs <- DocumentTermMatrix(corpusBlogs, control=list(tokenize=UnigramTokenizer))
freqBlogs <- sort(colSums(as.matrix(dtmBlogs)), decreasing=TRUE)[1:20]    
barBlogs <- barplot(freqBlogs, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="red", main=paste0("Frequency of 1-Gram for Blogs"))
text(barBlogs, par("usr")[3], labels=names(freqBlogs), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)

2-Grams

BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
par(mfrow = c(3,1))


dtmTwitter <- DocumentTermMatrix(corpusTwitter, control=list(tokenize=BigramTokenizer))
freqTwitter <- sort(colSums(as.matrix(dtmTwitter)), decreasing=TRUE)[1:20]    
barTwitter <- barplot(freqTwitter, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="blue", main=paste0("Frequency of 2-Grams for Twitter"))
text(barTwitter, par("usr")[3], labels=names(freqTwitter), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)

dtmNews <- DocumentTermMatrix(corpusNews, control=list(tokenize=BigramTokenizer))
freqNews <- sort(colSums(as.matrix(dtmNews)), decreasing=TRUE)[1:20]    
barNews <- barplot(freqNews, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="blue", main=paste0("Frequency of 2-Grams for News"))
text(barNews, par("usr")[3], labels=names(freqNews), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)

dtmBlogs <- DocumentTermMatrix(corpusBlogs, control=list(tokenize=BigramTokenizer))
freqBlogs <- sort(colSums(as.matrix(dtmBlogs)), decreasing=TRUE)[1:20]    
barBlogs <- barplot(freqBlogs, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="blue", main=paste0("Frequency of 2-Grams for Blogs"))
text(barBlogs, par("usr")[3], labels=names(freqBlogs), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)

3-Grams

TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
par(mfrow = c(3,1))


dtmTwitter <- DocumentTermMatrix(corpusTwitter, control=list(tokenize=TrigramTokenizer))
freqTwitter <- sort(colSums(as.matrix(dtmTwitter)), decreasing=TRUE)[1:20]    
barTwitter <- barplot(freqTwitter, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="yellow", main=paste0("Frequency of 3-Grams for Twitter"))
text(barTwitter, par("usr")[3], labels=names(freqTwitter), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)

dtmNews <- DocumentTermMatrix(corpusNews, control=list(tokenize=TrigramTokenizer))
freqNews <- sort(colSums(as.matrix(dtmNews)), decreasing=TRUE)[1:20]    
barNews <- barplot(freqNews, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="yellow", main=paste0("Frequency of 3-Grams for News"))
text(barNews, par("usr")[3], labels=names(freqNews), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)

dtmBlogs <- DocumentTermMatrix(corpusBlogs, control=list(tokenize=TrigramTokenizer))
freqBlogs <- sort(colSums(as.matrix(dtmBlogs)), decreasing=TRUE)[1:20]    
barBlogs <- barplot(freqBlogs, axes=FALSE, axisnames=FALSE, ylab="Frequency", col="yellow", main=paste0("Frequency of 3-Grams for Blogs"))
text(barBlogs, par("usr")[3], labels=names(freqBlogs), srt=60, adj=c(1.1,1.1), xpd=TRUE, cex=0.9)
axis(2)

As the charts show, individual bigrams and trigrams occur far less often than single words: the space of possible phrases grows with n, so the counts are spread over many more distinct tokens.
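This sparsity can be quantified directly from the n-gram document-term matrices, for example by checking what fraction of the distinct n-grams occurs only once (a sketch; apply it to dtmTwitter, dtmNews and dtmBlogs as rebuilt with each tokenizer above):

# Sketch: size and sparsity of an n-gram vocabulary.
ngramStats <- function(dtm) {
        freq <- colSums(as.matrix(dtm))
        c(distinctNgrams   = length(freq),
          totalOccurrences = sum(freq),
          shareSeenOnce    = round(mean(freq == 1), 3))
}
# Example: ngramStats(dtmTwitter)   # compare the 1-, 2- and 3-gram matrices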

Remaining tasks

This data will be used to build the NLP application. The remaining steps are:

Take a subset of the data so that a prediction model can be trained and tested with reasonable performance (a first sketch of the prediction lookup is given below).
Develop and publish a Shiny app.
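
As a first, hypothetical sketch of the prediction step, the function below looks up the last two typed words in a trigram frequency table and falls back to bigrams when no trigram matches. It assumes named frequency vectors such as freqTrigram and freqBigram, built like the freq* vectors above but without the [1:20] truncation.

# Hypothetical next-word lookup using n-gram frequency tables with a simple back-off.
predictNext <- function(lastTwoWords, freqTrigram, freqBigram) {
        # Trigrams whose first two words match the typed context
        hits <- freqTrigram[startsWith(names(freqTrigram), paste0(lastTwoWords, " "))]
        if (length(hits) == 0) {
                # Back off to bigrams starting with the last typed word
                lastWord <- tail(strsplit(lastTwoWords, " ")[[1]], 1)
                hits <- freqBigram[startsWith(names(freqBigram), paste0(lastWord, " "))]
        }
        if (length(hits) == 0) return(NA_character_)
        # Return the final word of the most frequent matching n-gram
        tail(strsplit(names(which.max(hits)), " ")[[1]], 1)
}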