Import Data

blogs <- readLines("~/DataScienceSpecialization/Capstone/data/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("~/DataScienceSpecialization/Capstone/data/en_us.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("~/DataScienceSpecialization/Capstone/data/en_us.news.txt", :
##   incomplete final line found on '~/DataScienceSpecialization/Capstone/data/en_us.news.txt'
twitter <- readLines("~/DataScienceSpecialization/Capstone/data/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

Preprocess Data

# Remove non-ASCII characters by converting to ASCII
blogs <- iconv(blogs, "latin1", "ASCII", sub = "")
news <- iconv(news, "latin1", "ASCII", sub = "")
twitter <- iconv(twitter, "latin1", "ASCII", sub = "")

# Drop the incomplete final line flagged by readLines
news <- news[-length(news)]

Summary Statistics

fsize_blogs <- file.size("~/DataScienceSpecialization/Capstone/data/en_US.blogs.txt") / 1024^2
fsize_news <- file.size("~/DataScienceSpecialization/Capstone/data/en_us.news.txt") / 1024^2
fsize_twitter <- file.size("~/DataScienceSpecialization/Capstone/data/en_US.twitter.txt") / 1024^2

lines_blogs <- length(blogs)
lines_news <- length(news)
lines_twitter <- length(twitter)

knitr::kable(data.frame(
    DataSources = c("Blogs", "News", "Twitter"),
    FileSize_MB = format(c(fsize_blogs, fsize_news, fsize_twitter), digits = 5),
    TotalLines = format(c(lines_blogs, lines_news, lines_twitter), big.mark = ",")))
DataSources   FileSize_MB   TotalLines
Blogs              200.42      899,288
News               196.28       77,258
Twitter            159.36    2,360,148

Sample Data

# Sample 1% of each data source
set.seed(54321)
sampleData <- c(sample(blogs, length(blogs) * 0.01),
                sample(news, length(news) * 0.01),
                sample(twitter, length(twitter) * 0.01))

Build and Clean Corpus Using tm Package

library(tm)

corpus <- VCorpus(VectorSource(sampleData))
# content_transformer() keeps each document a PlainTextDocument, so the
# corpus remains compatible with TermDocumentMatrix() later on
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

Word Cloud

A word cloud of the 100 most common words in the corpus.

library(wordcloud)
library(RColorBrewer)

wordcloud(corpus, max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))

Tokenize and Construct Frequencies

library(RWeka)

# Tokenizers for unigrams, bigrams, and trigrams
uni_token <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bi_token <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tri_token <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# Sum each term's counts across documents and sort by frequency
getFreq <- function(tdm) {
    freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
    return(data.frame(word = names(freq), freq = freq))
}

library(ggplot2)

# Bar chart of the 50 most frequent n-grams in a frequency table
makePlot <- function(data, label) {
    ggplot(data[1:50, ], aes(reorder(word, -freq), freq)) +
        labs(x = label, y = "Frequency") +
        theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
        geom_bar(stat = "identity", fill = I("grey50"))
}

Frequency Plots

# removeSparseTerms() drops n-grams that occur in very few documents;
# the sparsity threshold is relaxed for longer n-grams
freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = uni_token)), 0.99))
makePlot(freq1, "50 Most Common Unigrams")

freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bi_token)), 0.999))
makePlot(freq2, "50 Most Common Bigrams")

freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = tri_token)), 0.9999))
makePlot(freq3, "50 Most Common Trigrams")

Next Steps

Further work will focus on fine-tuning the Katz back-off prediction algorithm and deploying the Shiny app.

Prediction Algorithm

The prediction algorithm will use n-grams and the Katz back-off model to predict the next word in a sentence. The back-off algorithm looks up the entered phrase in the largest n-gram model first and falls back to successively smaller models until it finds a match (e.g., 4-gram, then 3-gram, then 2-gram, then 1-gram).
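
To make the back-off idea concrete, here is a minimal sketch of the lookup, assuming the freq1, freq2, and freq3 tables built above (where word holds a space-separated n-gram). The helper name predictNextWord is hypothetical, and the sketch omits the count-discounting step of the full Katz model.

# Minimal back-off sketch, not the full Katz model (no count discounting).
# Assumes freq1/freq2/freq3 are sorted by decreasing frequency, as above.
predictNextWord <- function(phrase, freq1, freq2, freq3) {
    tokens <- tolower(unlist(strsplit(phrase, "\\s+")))
    n <- length(tokens)

    # Try the trigram table first, conditioning on the last two words
    if (n >= 2) {
        prefix <- paste(tokens[n - 1], tokens[n], "")
        hits <- freq3[startsWith(as.character(freq3$word), prefix), ]
        if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
    }

    # Back off to the bigram table, conditioning on the last word only
    if (n >= 1) {
        prefix <- paste(tokens[n], "")
        hits <- freq2[startsWith(as.character(freq2$word), prefix), ]
        if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
    }

    # Fall back to the single most frequent unigram
    as.character(freq1$word[1])
}

predictNextWord("thanks for the", freq1, freq2, freq3)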

Shiny App

A simple Shiny app will be created with a text box where the user enters a phrase and an output pane that displays the predicted next word.
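
As a rough sketch of that interface, assuming the hypothetical predictNextWord() helper above and the n-gram frequency tables are available in the app's environment:

library(shiny)

# Skeleton UI: one text input for the phrase, one output for the prediction
ui <- fluidPage(
    titlePanel("Next Word Prediction"),
    textInput("phrase", "Enter a phrase:"),
    h4("Predicted next word:"),
    textOutput("prediction")
)

# predictNextWord() is the hypothetical helper sketched earlier
server <- function(input, output) {
    output$prediction <- renderText({
        if (nchar(trimws(input$phrase)) == 0) return("")
        predictNextWord(input$phrase, freq1, freq2, freq3)
    })
}

shinyApp(ui = ui, server = server)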