Overview

The overall objective of the capstone project is to build a predictive model that retrieves the word most likely to appear next in a sequence of words within a sentence. The objective of this week's report is to demonstrate familiarity with the data and to show that we are on track to create the prediction algorithm.

The data is made available at https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

The Data - Data Import

blogs   <-  readLines("final/en_US/en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
news    <-  readLines("final/en_US/en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("final/en_US/en_US.news.txt", encoding =
## "UTF-8", skipNul = TRUE): incomplete final line found on 'final/en_US/
## en_US.news.txt'
twitter <-  readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

Summary Statistics

We now report, for each file, its size, the number of lines and words, the mean number of words per line, and the maximum number of characters in a single line:

fsize_blogs   <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
fsize_news    <- file.info("final/en_US/en_US.news.txt")$size / 1024^2
fsize_twitter <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
line_cnt_blogs   <- length(blogs)    # number of lines in each file
line_cnt_news    <- length(news)
line_cnt_twitter <- length(twitter)
word_cnt_blogs   <- sum(sapply(gregexpr("\\W+", blogs),   length)) + 1    # approximate word count per file
word_cnt_news    <- sum(sapply(gregexpr("\\W+", news),    length)) + 1
word_cnt_twitter <- sum(sapply(gregexpr("\\W+", twitter), length)) + 1
longest_blogs    <- max(nchar(blogs))    # maximum characters in a single line
longest_news     <- max(nchar(news))
longest_twitter  <- max(nchar(twitter))

knitr::kable(data.frame(
  DataSources     = c("Blogs", "News", "Twitter"),
  file.size.MB    = format(c(fsize_blogs, fsize_news, fsize_twitter), digits = 5),
  total.Lines     = format(c(line_cnt_blogs, line_cnt_news, line_cnt_twitter), big.mark = ","),
  total.Word.count   = format(c(word_cnt_blogs, word_cnt_news, word_cnt_twitter), big.mark = ","),
  avg.words.per.line = format(c(word_cnt_blogs/line_cnt_blogs, 
                                word_cnt_news/line_cnt_news, 
                                word_cnt_twitter/line_cnt_twitter),
                              digits = 5),
  max.char.per.line  = format(c(longest_blogs, longest_news, longest_twitter), big.mark = ",")))
DataSources   file.size.MB   total.Lines   total.Word.count   avg.words.per.line   max.char.per.line
Blogs         NA                 899,288         38,222,279               42.503              40,833
News          NA                  77,259          2,748,071               35.570               5,760
Twitter       NA               2,360,148         30,433,285               12.895                 140
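
As a sanity check, the regex-based word counts above can be cross-checked with stringi::stri_count_words(), which applies proper word-boundary rules; small differences between the two approaches are expected. A minimal sketch, assuming the blogs, news, and twitter vectors are still in memory:

library(stringi)
# Count words per line with ICU word-boundary rules, then sum over the file
alt_word_cnt_blogs   <- sum(stri_count_words(blogs))
alt_word_cnt_news    <- sum(stri_count_words(news))
alt_word_cnt_twitter <- sum(stri_count_words(twitter))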

Exploratory Analysis - Data Cleaning, Merging, & Sampling

library(textmineR)
blogs   <- iconv(blogs,   "UTF-8", "ASCII", "byte")
news    <- iconv(news,    "UTF-8", "ASCII", "byte")
twitter <- iconv(twitter, "UTF-8", "ASCII", "byte")

set.seed(1234)
data.sample <- c(sample(blogs,   length(blogs) * 0.01),
                 sample(news,    length(news) * 0.01),
                 sample(twitter, length(twitter) * 0.01))
rm(blogs)
rm(news)
rm(twitter)
library(tm)
library(NLP)
library(RColorBrewer)
library(ggplot2)
corpus <- VCorpus(VectorSource(data.sample))
corpus <- tm_map(corpus, content_transformer(tolower))   # lower-case all text
corpus <- tm_map(corpus, removePunctuation)              # strip punctuation
corpus <- tm_map(corpus, removeNumbers)                  # strip digits
corpus <- tm_map(corpus, stripWhitespace)                # collapse repeated whitespace
corpus <- tm_map(corpus, PlainTextDocument)
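
It is worth spot-checking a few documents after cleaning to confirm the transformations behaved as expected. A minimal sketch, assuming the corpus built above:

# Print the first three cleaned documents
for (i in 1:3) {
  writeLines(as.character(corpus[[i]]))
}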

The Word Cloud

Most Common Words in Sampled Text

library(wordcloud)
wordcloud(corpus, max.words=100, random.order=FALSE, colors=brewer.pal(8,"Dark2"))

Word Cloud After Removing Stop Words

tcorpus <- tm_map(corpus, removeWords, stopwords("english"))
wordcloud(tcorpus, max.words=100, random.order=FALSE, colors=brewer.pal(8,"Dark2"))

The N-Gram Frequencies

Next, we list the most frequently occurring terms in the sampled data using unigrams, bigrams, and trigrams:

library(ngram)
library(RWeka)
options(mc.cores=1)

unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram  <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}

makePlot <- function(data, label) {
  ggplot(data[1:50,], aes(reorder(word, -freq), freq)) +
         labs(x = label, y = "Frequency") +
         theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("grey50"))
}

50 Most Common Unigrams in Sample

We plot the 50 most frequent unigrams, bigrams, and trigrams as bar charts:

freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = unigram)), 0.99))
makePlot(freq1, "50 Most Common unigrams")

freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.999))
makePlot(freq2, "50 Most Common bigrams")

freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
makePlot(freq3, "50 Most Common trigrams")
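
These frequency tables are the raw material for the prediction model, so it is convenient to persist them rather than rebuild the term-document matrices each time. A minimal sketch, assuming freq1, freq2, and freq3 as computed above (file names are illustrative):

# Save the n-gram frequency tables for reuse in the prediction step
saveRDS(freq1, "unigram_freq.rds")
saveRDS(freq2, "bigram_freq.rds")
saveRDS(freq3, "trigram_freq.rds")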

Conclusion & Future Work

The next step of the capstone project will be to fine-tune the predictive algorithm and then deploy it as a Shiny app. The algorithm will use n-gram models, similar to those explored above, to predict the next word in a sequence. We will also compare results with and without stop words included.

For example, when the trigram model is used to predict the next word and no matching trigram is found, the algorithm will back off to the bigram model; if a match is still not found, it will back off further to the unigram model.
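
A minimal sketch of that back-off lookup, assuming the frequency tables have been reshaped into data frames in which each n-gram is split into a prefix and a predicted word and sorted by decreasing frequency (the column names prefix, word, and freq are illustrative):

# Back-off prediction: try the trigram table first, then bigram, then unigram.
predict_next <- function(phrase, tri_tab, bi_tab, uni_tab) {
  tokens <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(tokens)
  if (n >= 2) {                                   # look up the last two words in the trigram table
    prefix2 <- paste(tokens[(n - 1):n], collapse = " ")
    hit <- tri_tab$word[tri_tab$prefix == prefix2]
    if (length(hit) > 0) return(as.character(hit[1]))
  }
  if (n >= 1) {                                   # back off: look up the last word in the bigram table
    prefix1 <- tokens[n]
    hit <- bi_tab$word[bi_tab$prefix == prefix1]
    if (length(hit) > 0) return(as.character(hit[1]))
  }
  as.character(uni_tab$word[1])                   # final fallback: the most frequent unigram
}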

For the Shiny app, the user interface will provide a text input box; the app will then apply the predictive algorithm to the entered text and return the word most likely to occur next in the sequence.
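
A minimal sketch of that interface, assuming a predict_next() function and frequency tables prepared as in the previous sketch (object names and labels are placeholders):

library(shiny)

ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("user_text", "Enter a phrase:"),
  verbatimTextOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    # Predict only when the user has typed something
    if (nchar(trimws(input$user_text)) == 0) return("")
    predict_next(input$user_text, trigram_tab, bigram_tab, unigram_tab)
  })
}

shinyApp(ui = ui, server = server)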