Overview

The overall objective of the capstone project is to build a predictive model that retrieves the word most likely to appear next in a sequence of words within a sentence. The objective of this week's report is to demonstrate familiarity with the data and to show that we are on track to create the prediction algorithm.

The data is made available at https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

The Data - Data Import

blogs   <-  readLines("final/en_US/en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
news    <-  readLines("final/en_US/en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("final/en_US/en_US.news.txt", encoding =
## "UTF-8", skipNul = TRUE): incomplete final line found on 'final/en_US/
## en_US.news.txt'
twitter <-  readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

Summary Statistics

We now report, for each file, its size, the number of lines and words, the mean number of words per line, and the maximum number of characters in a single line:

fsize_blogs   <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
fsize_news    <- file.info("final/en_US/en_US.news.txt")$size / 1024^2
fsize_twitter <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
line_cnt_blogs   <- length(blogs)    # number of lines in each file
line_cnt_news    <- length(news)
line_cnt_twitter <- length(twitter)
word_cnt_blogs   <- sum(sapply(gregexpr("\\W+", blogs),   length)) + 1    # approximate word count per file
word_cnt_news    <- sum(sapply(gregexpr("\\W+", news),    length)) + 1
word_cnt_twitter <- sum(sapply(gregexpr("\\W+", twitter), length)) + 1
longest_blogs    <- max(nchar(blogs))    # maximum characters in a single line
longest_news     <- max(nchar(news))
longest_twitter  <- max(nchar(twitter))

knitr::kable(data.frame(
  DataSources     = c("Blogs", "News", "Twitter"),
  file.size.MB    = format(c(fsize_blogs, fsize_news, fsize_twitter), digits = 5),
  total.Lines     = format(c(line_cnt_blogs, line_cnt_news, line_cnt_twitter), big.mark = ","),
  total.Word.count   = format(c(word_cnt_blogs, word_cnt_news, word_cnt_twitter), big.mark = ","),
  avg.words.per.line = format(c(word_cnt_blogs/line_cnt_blogs, 
                                word_cnt_news/line_cnt_news, 
                                word_cnt_twitter/line_cnt_twitter),
                              digits = 5),
  max.char.per.line  = format(c(longest_blogs, longest_news, longest_twitter), big.mark = ",")))
DataSources   file.size.MB   total.Lines   total.Word.count   avg.words.per.line   max.char.per.line
Blogs         NA                 899,288         38,222,279               42.503              40,833
News          NA                  77,259          2,748,071               35.570               5,760
Twitter       NA               2,360,148         30,433,285               12.895                 140
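
As a sanity check, the regex-based word counts above can be cross-checked with stringi::stri_count_words(), which applies proper word-boundary rules; small differences between the two approaches are expected. A minimal sketch, assuming the blogs, news, and twitter vectors are still in memory:

library(stringi)
# Count words per line with ICU word-boundary rules, then sum over the file
alt_word_cnt_blogs   <- sum(stri_count_words(blogs))
alt_word_cnt_news    <- sum(stri_count_words(news))
alt_word_cnt_twitter <- sum(stri_count_words(twitter))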

Exploratory Analysis - Data Cleaning, Merging, & Sampling

library(textmineR)
blogs   <- iconv(blogs,   "UTF-8", "ASCII", "byte")
news    <- iconv(news,    "UTF-8", "ASCII", "byte")
twitter <- iconv(twitter, "UTF-8", "ASCII", "byte")

set.seed(1234)
data.sample <- c(sample(blogs,   length(blogs) * 0.01),
                 sample(news,    length(news) * 0.01),
                 sample(twitter, length(twitter) * 0.01))
rm(blogs)
rm(news)
rm(twitter)
library(tm)
library(NLP)
library(RColorBrewer)
library(ggplot2)
corpus <- VCorpus(VectorSource(data.sample))
corpus <- tm_map(corpus, content_transformer(tolower))   # lower-case all text
corpus <- tm_map(corpus, removePunctuation)              # strip punctuation
corpus <- tm_map(corpus, removeNumbers)                  # strip digits
corpus <- tm_map(corpus, stripWhitespace)                # collapse repeated whitespace
corpus <- tm_map(corpus, PlainTextDocument)
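
It is worth spot-checking a few documents after cleaning to confirm the transformations behaved as expected. A minimal sketch, assuming the corpus built above:

# Print the first three cleaned documents
for (i in 1:3) {
  writeLines(as.character(corpus[[i]]))
}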

The Word Cloud

Most Common Words in Sampled Text

library(wordcloud)
wordcloud(corpus, max.words=100, random.order=FALSE, colors=brewer.pal(8,"Dark2"))

Word Cloud After Removing Stop Words

tcorpus <- tm_map(corpus, removeWords, stopwords("english"))
wordcloud(tcorpus, max.words=100, random.order=FALSE, colors=brewer.pal(8,"Dark2"))

The N-Gram Frequencies

Next, we list the most frequently occurring terms in the sampled data using unigrams, bigrams, and trigrams:

library(ngram)
library(RWeka)
options(mc.cores=1)

unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram  <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}

makePlot <- function(data, label) {
  ggplot(data[1:50,], aes(reorder(word, -freq), freq)) +
         labs(x = label, y = "Frequency") +
         theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("grey50"))
}

50 Most Common Unigrams in Sample

We plot the 50 most frequent unigrams, bigrams, and trigrams as bar charts:

freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = unigram)), 0.99))
makePlot(freq1, "50 Most Common unigrams")

freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.999))
makePlot(freq2, "50 Most Common bigrams")

freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
makePlot(freq3, "50 Most Common trigrams")
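
These frequency tables are the raw material for the prediction model, so it is convenient to persist them rather than rebuild the term-document matrices each time. A minimal sketch, assuming freq1, freq2, and freq3 as computed above (file names are illustrative):

# Save the n-gram frequency tables for reuse in the prediction step
saveRDS(freq1, "unigram_freq.rds")
saveRDS(freq2, "bigram_freq.rds")
saveRDS(freq3, "trigram_freq.rds")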

Conclusion & Future Work

The next step of the capstone project will be to fine-tune the predictive algorithm and then deploy it as a Shiny app. The algorithm will use n-gram models, similar to those explored above, to predict the next word in a sequence. We will also compare results with and without stop words included.

For example, when the trigram model is used to predict the next word and no matching trigram is found, the algorithm will back off to the bigram model; if a match is still not found, it will back off further to the unigram model.
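
A minimal sketch of that back-off lookup, assuming the frequency tables have been reshaped into data frames in which each n-gram is split into a prefix and a predicted word and sorted by decreasing frequency (the column names prefix, word, and freq are illustrative):

# Back-off prediction: try the trigram table first, then bigram, then unigram.
predict_next <- function(phrase, tri_tab, bi_tab, uni_tab) {
  tokens <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(tokens)
  if (n >= 2) {                                   # look up the last two words in the trigram table
    prefix2 <- paste(tokens[(n - 1):n], collapse = " ")
    hit <- tri_tab$word[tri_tab$prefix == prefix2]
    if (length(hit) > 0) return(as.character(hit[1]))
  }
  if (n >= 1) {                                   # back off: look up the last word in the bigram table
    prefix1 <- tokens[n]
    hit <- bi_tab$word[bi_tab$prefix == prefix1]
    if (length(hit) > 0) return(as.character(hit[1]))
  }
  as.character(uni_tab$word[1])                   # final fallback: the most frequent unigram
}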

For the Shiny app, the user interface will provide a text input box; the app will then apply the predictive algorithm to the entered text and return the word most likely to occur next in the sequence.
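
A minimal sketch of that interface, assuming a predict_next() function and frequency tables prepared as in the previous sketch (object names and labels are placeholders):

library(shiny)

ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("user_text", "Enter a phrase:"),
  verbatimTextOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    # Predict only when the user has typed something
    if (nchar(trimws(input$user_text)) == 0) return("")
    predict_next(input$user_text, trigram_tab, bigram_tab, unigram_tab)
  })
}

shinyApp(ui = ui, server = server)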