Executive Summary

This milestone report presents an exploratory analysis of the data set provided by SwiftKey, in preparation for developing an n-gram-based prediction algorithm. The input text data comes in several languages; English is the language chosen for this study. The analysis identifies the major features of the training data and summarizes the plan for building a “next word prediction” model.

# Work around rJava loading issues on macOS by pointing to the R framework libraries
Sys.setenv(DYLD_FALLBACK_LIBRARY_PATH="/Library/Frameworks/R.framework/Resources/lib:/Users/Jayme/lib:/usr/local/lib:/usr/lib")
library(rJava)

Load the required packages

library(stringi)
library(tm)
## Loading required package: NLP
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(RWeka)
library(wordcloud)
## Loading required package: RColorBrewer

Getting the data

Download and extract the text document files.

The data sets come in three different types (news, blogs and Twitter feeds) and in four natural languages: English (en_US), German (de_DE), Russian (ru_RU) and Finnish (fi_FI).

# library(RCurl)
# data_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# if (!file.exists("SwiftKey_data.zip")) {
#   download.file(data_url, "SwiftKey_data.zip", method = "libcurl")
#   # extract the zip file
#   unzip("SwiftKey_data.zip", exdir = "./final/en_US")
# }

Read the English (en_US) text files

setwd("~/DS/datasciencecoursera/Capstone/en_US")
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
list.files()
## [1] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"

Get the file sizes and display summary information about the files

blogs.size <- file.info("en_US.blogs.txt")$size / 1024 ^ 2
news.size <- file.info("en_US.news.txt")$size / 1024 ^ 2
twitter.size <- file.info("en_US.twitter.txt")$size / 1024 ^ 2

# Get the numbers of words in the files
blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)

# Summary of the data sets
summaryTable <- data.frame(source = c("blogs", "news", "twitter"),
           file.size.MB = c(blogs.size, news.size, twitter.size),
           num.lines = c(length(blogs), length(news), length(twitter)),
           num.words = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
           mean.num.words = c(mean(blogs.words), mean(news.words), mean(twitter.words)))
summaryTable
##    source file.size.MB num.lines num.words mean.num.words
## 1   blogs           NA    899288  37546246       41.75108
## 2    news           NA   1010242  34762395       34.40997
## 3 twitter           NA   2360148  30093410       12.75065

Sampling

Since the full data set is large, we randomly select 1% of each file.

set.seed(456)
# We combine the three samples.
sampleText <- c(sample(blogs, length(blogs) * 0.01),
                 sample(news, length(news) * 0.01),
                 sample(twitter, length(twitter) * 0.01))
#number of lines
length(sampleText) 
## [1] 42695

Create the corpus and then clean it

We apply several pre-processing steps to the text data before analyzing it.

corpus <- VCorpus(VectorSource(sampleText))

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") # to remove urls
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                     # to remove htags
corpus <- tm_map(corpus, toSpace, "#\\w+")                        # remove twitter handles 
corpus <- tm_map(corpus, toSpace, "/|@|\\|")
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, PlainTextDocument)
corpus1 <- tm_map(corpus, stemDocument)

# The corpus object below has the English stop words removed
corpus <- tm_map(corpus1, removeWords, stopwords("english"))

# Note that corpus1 keeps the stop words: in the Shiny application the next word
# to predict might itself be a stop word

Exploratory Analysis

# print the first 10 lines of the corpus
for (i in 1:10){
  print(corpus[[i]]$content)
}
## [1] "meanwhil  th annual bc human resourc manag associ confer tradeshow  take place   popular new vancouv convent centr  seem somewhat unconvent canada’ comedian rick mercer’ resourc will  doubt  deleg laughing…"
## [1] " north korea launch  intercontinent ballist missil call  taepodong japan    unit state  shoot   news report suggest   occur  earli  pm april"
## [1] "  born   small villag call mexbrough south yorkshir  now live   wife  two young daughter  southampton"
## [1] " allur  beach metal detect"
## [1] " also  particular logic  even intellig   way  execut  bad guy  grab  daughter   pureheart politician  order  forc     bid   somehow feel  need  move  across europ use  guy  doesnt work    stupid plan   doom  fail whi even move  anywher   first place  whi   eeeeeeevil industrialist insist    cargoship full  toxic wast unload   ukrain whi  just let somali pirat hijack   just dump  wast  somalia   one will  abl  oppos     equal corrupt  disorgan hell hole  world  full  "
## [1] "god bless us everi one sourc tini tim"
## [1] "  don’t know   tell "
## [1] "along london’ south bank last august"
## [1] " think"
## [1] "sara megibow   newer agent build  client list  bio can  found    look  via agentquerycom literari fiction scienc fiction chick lit commerci fiction fantasi women fiction romanc histor fiction young adult multicultur middl grade"
# Save the final corpus
saveRDS(corpus1, file = "corpus1.RData")
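The saved corpus can be restored in a later session (for example, when building the prediction model) with the matching readRDS call; it is shown here only for reference.

# Reload the stemmed corpus (stop words retained) saved above
corpus1 <- readRDS("corpus1.RData")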

## Compute the document term matrix and keep words that are between 3 and 15 characters in length
dtm <- DocumentTermMatrix(corpus, control=list(wordLengths=c(3,15)))

# display the new numbers of documents(rows) and terms (columns) 
dim(dtm)
## [1] 42695 42152
# Sparse terms will be removed later with removeSparseTerms to reduce the number of columns
# Convert the document term matrix to a term document matrix
tdm <- t(dtm)
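As an illustration of that sparsity reduction, a threshold can be applied directly to the document term matrix. The object name dtm_dense below is introduced here only for illustration, and the resulting dimensions depend on the random sample.

# Illustration: drop terms that are absent from more than 99% of the documents
dtm_dense <- removeSparseTerms(dtm, 0.99)
dim(dtm_dense)   # far fewer columns than the full dtm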

Another way to explore the data is to visualize the most frequent terms in the corpus using the wordcloud package.

# The wordcloud package plots the words sized according to their frequency counts.
# The word with the highest count receives the largest plotting space (font size).
wordcloud(corpus)
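The call above uses the package defaults. As an optional, purely illustrative refinement (the arguments below are not part of the original analysis), the cloud can be limited to the 100 most frequent terms and colored with an RColorBrewer palette, which is already loaded as a wordcloud dependency.

# Optional: cap the number of words and use a color palette
wordcloud(corpus, max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))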

N-gram tokenization

Unigram <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
Bigram <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
Trigram <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
Quadgram <- function(x) NGramTokenizer(x, Weka_control(min=4, max=4))
options(mc.cores=1)  # to avoid hanging on macOS
# function to compute the N-gram term frequency 
computeFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
# function to plot the 30 most frequent terms
wordsPlot <- function(data, label) {
  ggplot(data[1:30,], aes(reorder(word, -freq), freq)) +
         labs(x = label, y = "Frequency") +
         theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
         geom_bar(stat = "identity", fill = "green", col="red") + coord_flip()
}

# Call the computeFreq function to compute the frequencies of the most common n-grams in the corpus
freqU <- computeFreq(removeSparseTerms(TermDocumentMatrix(corpus), 0.99))
freqB <- computeFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = Bigram)), 0.999))
freqT <- computeFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = Trigram)), 0.9999))
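As a quick sanity check before plotting, the top of each frequency table can be inspected (output omitted here, since it depends on the random sample).

# Peek at the most frequent unigrams, bigrams and trigrams
head(freqU, 10)
head(freqB, 10)
head(freqT, 10)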

Exploratory Analysis through graphs

Note that stop words were removed from the corpus used for this exploratory analysis, but they are kept in corpus1 (saved above), since in the prediction model the next word to predict might itself be a stop word.

Plotting the 30 most frequent unigrams

wordsPlot(freqU, "30 Most Common Unigrams")

Plotting the 30 most frequent bigrams

wordsPlot(freqB, "30 Most Common Bigrams")

Plotting the 30 most frequent trigrams

wordsPlot(freqT, "30 Most Common Trigrams")

What’s next

After this exploratory analysis of the text data, the next steps of this capstone project are to finalize the predictive model and to implement it as a Shiny app.

The Shiny app will provide an input text area where the user can enter a sentence, and the n-gram predictive model will suggest the most likely next word or a short list of candidate words to choose from.
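As a rough sketch of how such a model could use the frequency tables built above, the function below implements a simple back-off lookup: it tries to match the last two words of the input against the trigram table and, failing that, the last word against the bigram table. The function name predictNextWord and all of its internals are illustrative assumptions, not the final algorithm.

# Illustrative back-off lookup (assumes n-gram tables shaped like freqB and freqT above,
# i.e. data frames with columns word and freq, sorted by decreasing frequency)
predictNextWord <- function(phrase, trigrams, bigrams) {
  tokens <- tolower(unlist(strsplit(phrase, "\\s+")))
  # Try the trigram table first, matching on the last two words of the input
  if (length(tokens) >= 2) {
    prefix <- paste(tail(tokens, 2), collapse = " ")
    hits <- trigrams[grepl(paste0("^", prefix, " "), trigrams$word), ]
    if (nrow(hits) > 0)
      return(tail(strsplit(as.character(hits$word[1]), " ")[[1]], 1))
  }
  # Back off to the bigram table, matching on the last word only
  prefix <- tail(tokens, 1)
  hits <- bigrams[grepl(paste0("^", prefix, " "), bigrams$word), ]
  if (nrow(hits) > 0)
    return(tail(strsplit(as.character(hits$word[1]), " ")[[1]], 1))
  NA_character_   # no match found
}

# Example call (result depends on the sampled corpus):
# predictNextWord("thanks for the", freqT, freqB)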