Synopsis

The goal of the project is to build a prediction model to find the next probable word based on the sequence of words before it. We are using data from a corpus called HC Corpora to build our prediction model.

Data Exploration

en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt are the files in the corpus for US English language. Let us find the word counts and line counts for the above files first.

# load libraries, set seed
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(RWeka)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
set.seed(1234321)

# constants
TRAIN_SET_RATIO <- 0.05
TEST_SET_RATIO <- 0.5
MAX_NUM_PREDICTIONS <- 10

MIN_TDM_SPARSE <- 0.9

TOP_NGRAM_WORD_COUNT <- 10
TOP_NGRAM_HISTOGRAM_COUNT <- 25
TOP_NGRAM_WORDCLOUD_COUNT <- 100

RAW_DATA_DIR_NAME <- 'RawData'
TRAIN_DIR_NAME <- 'TrainData'
TEST_DIR_NAME <- 'TestData'
VALIDATE_DIR_NAME <- 'ValidateData'

BLOGS_FILE_NAME <- 'en_US.blogs.txt'
NEWS_FILE_NAME <- 'en_US.news_mod.txt'   # modified file to fix an issue with original file
NEWS_ORIGINAL_FILE_NAME <- 'en_US.news.txt'
TWITTER_FILE_NAME <- 'en_US.twitter.txt'

MODEL_DIR_NAME <- 'Model'
UNIGRAM_FILE_NAME <- 'Unigram.Rds'
BIGRAM_FILE_NAME <- 'Bigram.Rds'
TRIGRAM_FILE_NAME <- 'Trigram.Rds'
QUADGRAM_FILE_NAME <- 'Quadgram.Rds'
PENTAGRAM_FILE_NAME <- 'Pentagram.Rds'

USE_PENTAGRAM <- FALSE
# fixes erros in the news file where ASCII code O(32) can be interpreted as EOF
FixFileError <- function()
{
    originalNewsFileName <- paste0(RAW_DATA_DIR_NAME, '/', NEWS_ORIGINAL_FILE_NAME)
    modifiedNewsFileName <- paste0(RAW_DATA_DIR_NAME, '/', NEWS_FILE_NAME)
    if (file.exists(modifiedNewsFileName) == FALSE)
    {
        # read original contents
        con <- file(originalNewsFileName, "rb")
        fileLines <- readLines(con)
        close(con)
        
        # clean up the error
        updatedFileLines <- gsub("\032", "", fileLines, perl = TRUE)
        
        # save the modified file
        con <- file(modifiedNewsFileName, "w")
        writeLines(updatedFileLines, con)
        close(con)
        
        # move the file to parent directory
        file.rename(originalNewsFileName, NEWS_ORIGINAL_FILE_NAME)
        
        # cleanup big objects
        rm(list = c('fileLines', 'updatedFileLines'))
    }
}

# gets lines from a given file
GetLinesFromFile <- function(fileName, dirName)
{
    con <- file(paste0(dirName, '/', fileName), "r")
    fileLines <- readLines(con, skipNul = TRUE, encoding = "UTF-8")
    close(con)
    return (fileLines)
}

# gets words from a given file
GetWordsFromFile <- function(fileName, dirName)
{
    fullFileName <- paste0(dirName, '/', fileName)
    fileWords <- scan(fullFileName, character(0), encoding = "UTF-8")
    return (fileWords)
}

FixFileError()
blogLines <- GetLinesFromFile(BLOGS_FILE_NAME, RAW_DATA_DIR_NAME)
newsLines <- GetLinesFromFile(NEWS_FILE_NAME, RAW_DATA_DIR_NAME)
twitterLines <- GetLinesFromFile(TWITTER_FILE_NAME, RAW_DATA_DIR_NAME)

lcBlog <- length(blogLines)
lcNews <- length(newsLines)
lcTwitter <- length(twitterLines)

blogWords <- GetWordsFromFile(BLOGS_FILE_NAME, RAW_DATA_DIR_NAME)
newsWords <- GetWordsFromFile(NEWS_FILE_NAME, RAW_DATA_DIR_NAME)
twitterWords <- GetWordsFromFile(TWITTER_FILE_NAME, RAW_DATA_DIR_NAME)
## Warning in scan(fullFileName, character(0), encoding = "UTF-8"): EOF within
## quoted string
## Warning in scan(fullFileName, character(0), encoding = "UTF-8"): embedded
## nul(s) found in input
wcBlog <- length(blogWords)
wcNews <- length(newsWords)
wcTwitter <- length(twitterWords)

statStr <- paste0(BLOGS_FILE_NAME, ": Line count is ", lcBlog,
                        ", Word count is ", wcBlog, "\n",
                  NEWS_FILE_NAME, ": Line count is ", lcNews,
                        ", Word count is ", wcNews, "\n",
                  TWITTER_FILE_NAME, ": Line count is ", lcTwitter,
                        ", Word count is ", wcTwitter)
cat(statStr)
## en_US.blogs.txt: Line count is 899288, Word count is 35314678
## en_US.news_mod.txt: Line count is 1010242, Word count is 29313525
## en_US.twitter.txt: Line count is 2360148, Word count is 9141571
# clear the big objects
rm(list = c('blogLines', 'newsLines', 'twitterLines', 'blogWords', 'newsWords', 'twitterWords'))

Datasets Creation

Our goal is to use 50% of the content for training data set, 25% of the content for test data set and 25% of the content for validation data set. For now, we are keeping blogs, news and twitter as separate documents. It is possible to combine all of them and use one document. This will be considered as a possible alternative approach as we build our models.

Given the large size of the original files and the processing time, we will use 5% of the content for training data set, 50% for test data set and 45% for validation data set for now. This will be revised as we explore various options further!

# function for dividing the dataset
DivideDataset <- function (x, fileName)
{
   # figure the indices
   trainLen <- round(TRAIN_SET_RATIO * length(x))
   testLen <- round(TEST_SET_RATIO * length(x))
   testEndIndex <- trainLen + testLen
   
   # divide the data set
   train <- x[1:trainLen]
   test <- x[(trainLen + 1):testEndIndex]
   validate <- x[(testEndIndex + 1):length(x)]
   
   # write the data sets
   write(train, paste0(TRAIN_DIR_NAME, '/', fileName))
   write(test, paste0(TEST_DIR_NAME, '/', fileName))
   write(validate, paste0(VALIDATE_DIR_NAME, '/', fileName))
}

# create corpus, sample the data for randomizing the lines
docs <- Corpus(DirSource(RAW_DATA_DIR_NAME, encoding = "UTF-8"), readerControl = list(language = "en"))
blogs <- sample(docs[[1]][[1]])
news <- sample(docs[[2]][[1]])
twitter <- sample(docs[[3]][[1]])

# divide the data sets and save them
DivideDataset(blogs, BLOGS_FILE_NAME)
DivideDataset(news, NEWS_FILE_NAME)
DivideDataset(twitter, TWITTER_FILE_NAME)

# clear the big objects
rm(list = c('docs', 'blogs', 'news', 'twitter'))

Data Cleanup and TDM Analysis

Let us clean up the data before analysis.

corpus <- Corpus(DirSource(TRAIN_DIR_NAME, encoding = "UTF-8"), readerControl = list(language = "en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, PlainTextDocument)

Unigrams

Let us create a Term Document Matrix for Unigrams.

unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigramTDM <- TermDocumentMatrix(corpus, control = list(tokenizer = unigramTokenizer))
unigramTDM <- removeSparseTerms(unigramTDM, MIN_TDM_SPARSE)

Top 10 unigrams and their frequency is shown followed by a simple histogram for top 25 words followed by a wordcloud for top 100 words. All of this is based on Training Data.

# top 10 unigrams
unigramFreq <- sort(rowSums(as.matrix(unigramTDM)), decreasing = TRUE)
unigramDF = data.frame(Term = names(unigramFreq), Frequency = unigramFreq,
                       stringsAsFactors = FALSE)
head(unigramDF, n = TOP_NGRAM_WORD_COUNT)
##      Term Frequency
## the   the    238726
## and   and    120018
## for   for     55037
## that that     52030
## you   you     46861
## with with     35866
## was   was     31521
## this this     27212
## have have     26600
## are   are     24566
# histogram (top 25 unigrams)
g <- ggplot(unigramDF[1:TOP_NGRAM_HISTOGRAM_COUNT,], aes(Term, Frequency)) + 
     geom_bar(stat = "identity") +
     theme(axis.text.x = element_text(angle = 45, hjust = 1))
g

# wordcloud (top 100 unigrams)
wordcloud(names(unigramFreq), unigramFreq, scale = c(8, 0.5),
          max.words = TOP_NGRAM_WORDCLOUD_COUNT, random.color = TRUE,
          rot.per = 0.25, colors = brewer.pal(6, 'Dark2'))

# clear the big objects
rm(list = c('unigramTDM', 'unigramFreq'))

Bigrams

Let us create a Term Document Matrix for Bigrams.

bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2, delimiters = " \\n\\t\\r"))
bigramTDM <- TermDocumentMatrix(corpus, control = list(tokenizer = bigramTokenizer))
bigramTDM <- removeSparseTerms(bigramTDM, MIN_TDM_SPARSE)

Top 10 bigrams and their frequency is shown followed by a simple histogram for top 25 words followed by a wordcloud for top 100 words. All of this is based on Training Data.

# top 10 bigrams
bigramFreq <- sort(rowSums(as.matrix(bigramTDM)), decreasing = TRUE)
bigramDF = data.frame(Term = names(bigramFreq), Frequency = bigramFreq,
                      stringsAsFactors = FALSE)
head(bigramDF, n = TOP_NGRAM_WORD_COUNT)
##              Term Frequency
## of the     of the     21457
## in the     in the     20588
## to the     to the     10710
## for the   for the     10207
## on the     on the      9783
## to be       to be      8014
## at the     at the      7041
## and the   and the      6354
## in a         in a      6046
## with the with the      5420
# histogram (top 25 bigrams)
g <- ggplot(bigramDF[1:TOP_NGRAM_HISTOGRAM_COUNT,], aes(Term, Frequency)) + 
     geom_bar(stat = "identity") +
     theme(axis.text.x = element_text(angle = 45, hjust = 1))
g

# wordcloud (top 100 bigrams)
wordcloud(names(bigramFreq), bigramFreq, scale = c(4, 0.5),
          max.words = TOP_NGRAM_WORDCLOUD_COUNT, random.color = TRUE,
          rot.per = 0.25, colors = brewer.pal(6, 'Dark2'))

# clear the big objects
rm(list = c('bigramTDM', 'bigramFreq'))

Trigrams

Let us create a Term Document Matrix for Trigrams.

trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3, delimiters = " \\n\\t\\r"))
trigramTDM <- TermDocumentMatrix(corpus, control = list(tokenizer = trigramTokenizer))
trigramTDM <- removeSparseTerms(trigramTDM, MIN_TDM_SPARSE)

Top 10 trigrams and their frequency is shown followed by a simple histogram for top 25 words followed by a wordcloud for top 100 words. All of this is based on Training Data.

# top 10 trigrams
trigramFreq <- sort(rowSums(as.matrix(trigramTDM)), decreasing = TRUE)
trigramDF = data.frame(Term = names(trigramFreq), Frequency = trigramFreq,
                       stringsAsFactors = FALSE)
head(trigramDF, n = TOP_NGRAM_WORD_COUNT)
##                          Term Frequency
## one of the         one of the      1693
## a lot of             a lot of      1470
## thanks for the thanks for the      1158
## going to be       going to be       864
## to be a               to be a       843
## i want to           i want to       746
## it was a             it was a       743
## the end of         the end of       719
## out of the         out of the       713
## some of the       some of the       671
# histogram (top 25 trigrams)
g <- ggplot(trigramDF[1:TOP_NGRAM_HISTOGRAM_COUNT,], aes(Term, Frequency)) + 
     geom_bar(stat = "identity") +
     theme(axis.text.x = element_text(angle = 45, hjust = 1))
g

# wordcloud (top 100 trigrams)
wordcloud(names(trigramFreq), trigramFreq, scale = c(2, 0.5),
          max.words = TOP_NGRAM_WORDCLOUD_COUNT, random.color = TRUE,
          rot.per = 0.25, colors = brewer.pal(6, 'Dark2'))

# clear the big objects
rm(list = c('trigramTDM', 'trigramFreq'))

Quadgrams

Let us create a Term Document Matrix for Quadgrams.

quadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4, delimiters = " \\n\\t\\r"))
quadgramTDM <- TermDocumentMatrix(corpus, control = list(tokenizer = quadgramTokenizer))
quadgramTDM <- removeSparseTerms(quadgramTDM, MIN_TDM_SPARSE)

Top 10 quadgrams and their frequency is shown followed by a simple histogram for top 25 words followed by a wordcloud for top 100 words. All of this is based on Training Data.

# top 10 quadgrams
quadgramFreq <- sort(rowSums(as.matrix(quadgramTDM)), decreasing = TRUE)
quadgramDF = data.frame(Term = names(quadgramFreq), Frequency = quadgramFreq,
                       stringsAsFactors = FALSE)
head(quadgramDF, n = TOP_NGRAM_WORD_COUNT)
##                                        Term Frequency
## the end of the               the end of the       393
## the rest of the             the rest of the       346
## for the first time       for the first time       328
## at the end of                 at the end of       322
## thanks for the follow thanks for the follow       298
## at the same time           at the same time       254
## is going to be               is going to be       215
## in the middle of           in the middle of       210
## when it comes to           when it comes to       206
## is one of the                 is one of the       204
# histogram (top 25 quadgrams)
g <- ggplot(quadgramDF[1:TOP_NGRAM_HISTOGRAM_COUNT,], aes(Term, Frequency)) + 
     geom_bar(stat = "identity") +
     theme(axis.text.x = element_text(angle = 45, hjust = 1))
g

# wordcloud (top 100 quadgrams)
wordcloud(names(quadgramFreq), quadgramFreq, scale = c(1.5, 0.5),
          max.words = TOP_NGRAM_WORDCLOUD_COUNT, random.color = TRUE,
          rot.per = 0.25, colors = brewer.pal(6, 'Dark2'))

# clear the big objects
rm(list = c('quadgramTDM', 'quadgramFreq'))

Pentagrams

Let us create a Term Document Matrix for Pentagrams.

pentagramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 5, max = 5, delimiters = " \\n\\t\\r"))
pentagramTDM <- TermDocumentMatrix(corpus, control = list(tokenizer = pentagramTokenizer))
pentagramTDM <- removeSparseTerms(pentagramTDM, MIN_TDM_SPARSE)

Top 10 pentagrams and their frequency is shown followed by a simple histogram for top 25 words followed by a wordcloud for top 100 words. All of this is based on Training Data.

# top 10 pentagrams
pentagramFreq <- sort(rowSums(as.matrix(pentagramTDM)), decreasing = TRUE)
pentagramDF = data.frame(Term = names(pentagramFreq), Frequency = pentagramFreq,
                       stringsAsFactors = FALSE)
head(pentagramDF, n = TOP_NGRAM_WORD_COUNT)
##                                        Term Frequency
## at the end of the         at the end of the       186
## in the middle of the   in the middle of the       102
## for the first time in for the first time in        86
## the end of the day       the end of the day        61
## by the end of the         by the end of the        60
## thank you so much for thank you so much for        58
## its going to be a         its going to be a        57
## there are a lot of       there are a lot of        56
## for the rest of the     for the rest of the        47
## i cant wait to see       i cant wait to see        47
# histogram (top 25 pentagrams)
g <- ggplot(pentagramDF[1:TOP_NGRAM_HISTOGRAM_COUNT,], aes(Term, Frequency)) + 
     geom_bar(stat = "identity") +
     theme(axis.text.x = element_text(angle = 45, hjust = 1))
g

# wordcloud (top 100 pentagrams)
wordcloud(names(pentagramFreq), pentagramFreq, scale = c(1.0, 0.5),
          max.words = TOP_NGRAM_WORDCLOUD_COUNT, random.color = TRUE,
          rot.per = 0.25, colors = brewer.pal(6, 'Dark2'))

# clear the big objects
rm(list = c('pentagramTDM', 'pentagramFreq', 'corpus'))

Modeling

Let us extract the phrase part and the prediction part from the N-Grams. For pentagram, phrase is the first four words and prediction is the fifth word. For quadgram, phrase is the first three words and prediction is the fourth word. For trigram, phrase is the first two words and prediction is the third word. For bigram, phrase is the first word and prediction is the second word. For unigram, there is no phrase and prediction is the word.

unigramDF <- rename(unigramDF, Prediction = Term)
rownames(unigramDF) <- NULL # reduces dataframe size
# function to create additional columns in data frame for Bigrams
AddProbabilityToBigrams<-function(x)
{
    # extract the bigram Term
    term <- x[['Term']]
    Phrase <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+)", t))})
    Prediction <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+$)", t))})
    return(cbind(x, Phrase, Prediction, stringsAsFactors = FALSE))
}
bigramDF <- AddProbabilityToBigrams(bigramDF)
bigramDF <- arrange(bigramDF, Phrase, desc(Frequency))
bigramDF <- select(bigramDF, Phrase, Frequency, Prediction) # reduces dataframe size
# function to create additional columns in data frame for Trigrams
AddProbabilityToTrigrams<-function(x)
{
    # extract the trigram Term
    term <- x[['Term']]
    Phrase <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+) ([a-z]+)", t))})
    Prediction <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+$)", t))})
    return(cbind(x, Phrase, Prediction, stringsAsFactors = FALSE))
}
trigramDF <- AddProbabilityToTrigrams(trigramDF)
trigramDF <- arrange(trigramDF, Phrase, desc(Frequency))
trigramDF <- select(trigramDF, Phrase, Frequency, Prediction) # reduces dataframe size
# function to create additional columns in data frame for Quadgrams
AddProbabilityToQuadgrams<-function(x)
{
    # extract the quadgram Term
    term <- x[['Term']]
    #Phrase <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+) ([a-z]+) ([a-z]+)", t))})
    #Prediction <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+$)", t))})
    Phrase <- sapply(term, function(t) { w<-strsplit(t, ' ')[[1]];paste(w[1], w[2], w[3]) })
    Prediction <- sapply(term, function(t) { w<-strsplit(t, ' ')[[1]];w[4]})
    return(cbind(x, Phrase, Prediction, stringsAsFactors = FALSE))
}
quadgramDF <- AddProbabilityToQuadgrams(quadgramDF)
quadgramDF <- arrange(quadgramDF, Phrase, desc(Frequency))
quadgramDF <- select(quadgramDF, Phrase, Frequency, Prediction) # reduces dataframe size
# function to create additional columns in data frame for Pentagrams
AddProbabilityToPentagrams<-function(x)
{
    # extract the pentagram Terms
    term <- x[['Term']]
    Phrase <- sapply(term, function(t) { w<-strsplit(t, ' ')[[1]];paste(w[1], w[2], w[3], w[4]) })
    Prediction <- sapply(term, function(t) { w<-strsplit(t, ' ')[[1]];w[5]})
    return(cbind(x, Phrase, Prediction, stringsAsFactors = FALSE))
}
pentagramDF <- AddProbabilityToPentagrams(pentagramDF)
pentagramDF <- arrange(pentagramDF, Phrase, desc(Frequency))
pentagramDF <- select(pentagramDF, Phrase, Frequency, Prediction) # reduces dataframe size

Save N-Grams

We save the N-Grams so that it is faster to load and get to prediction!

# form the file names
fullUnigramFile <- paste0(MODEL_DIR_NAME, '/', UNIGRAM_FILE_NAME)
fullBigramFile <- paste0(MODEL_DIR_NAME, '/', BIGRAM_FILE_NAME)
fullTrigramFile <- paste0(MODEL_DIR_NAME, '/', TRIGRAM_FILE_NAME)
fullQuadgramFile <- paste0(MODEL_DIR_NAME, '/', QUADGRAM_FILE_NAME)
fullPentagramFile <- paste0(MODEL_DIR_NAME, '/', PENTAGRAM_FILE_NAME)

# save the files
saveRDS(unigramDF, fullUnigramFile)
saveRDS(bigramDF, fullBigramFile)
saveRDS(trigramDF, fullTrigramFile)
saveRDS(quadgramDF, fullQuadgramFile)
saveRDS(pentagramDF, fullPentagramFile)

Predictor

Our approach to prediction is as follows. We look up the phrase in Pentagrams first and return the top candidates if there is a match. If there is no match in Pentagrams, we try it in Quadgrams for a match If there is no match in Quadgrams, we try it in Trigrams for a match If there is no match in Trigrams, we try it in bigrams for a match. If there is no match in bigrams, we return the top candidates in unigrams.

# function to clean the input where inputPhrase is a character vector
CleanInput <- function(inputPhrase)
{
    # create corpus and follow the exact cleaning steps
    # as was done with training data
    inputCorpus <- VCorpus(VectorSource(c(inputPhrase)))
    inputCorpus <- tm_map(inputCorpus, removePunctuation)
    inputCorpus <- tm_map(inputCorpus, removeNumbers)
    inputCorpus <- tm_map(inputCorpus, tolower)
    inputCorpus <- tm_map(inputCorpus, PlainTextDocument)
    
    return(as.character(inputCorpus[[1]]))
}

# function to predict the next word given an input phrase,
# udf, bdf, tdf, qdf and pdf are unigram, bigram, trigram,
# quadgram and pentagram dataframes, the maximum number of
# words to be predicted and whether prediction should happen
# for partial words in progress
#
# Note: if the last letter in the input is not a space, it is
# taken as a word in progress
#
# returns a vector of predictions
PredictWord <- function(inputPhrase, udf, bdf, tdf, qdf, pdf, maxNumPredictions,
                        predictWordsInProgress)
{
    # is the last word a word in progress?
    isLastWordInProgress <- TRUE
    wordInProgress <- ""
    len <- nchar(inputPhrase)
    if (substr(inputPhrase, len, len) == ' ')
        isLastWordInProgress <- FALSE
    
    # clean the input first and count the words in the phrase
    inputPhrase <- CleanInput(inputPhrase)
    inputWords <- strsplit(inputPhrase, ' ')[[1]]
    numInputWords <- length(inputWords)
    
    if (isLastWordInProgress == TRUE && predictWordsInProgress == TRUE)
    {
        # find the word in progress and decrement the total number of words
        if (numInputWords > 0)
        {
            wordInProgress <- inputWords[numInputWords]
            numInputWords <- numInputWords - 1
        }
    }
    
    # pentagrams
    if (USE_PENTAGRAM == TRUE && numInputWords >= 4)
    {
        phrase <- paste(inputWords[numInputWords - 3], inputWords[numInputWords - 2],
                        inputWords[numInputWords - 1], inputWords[numInputWords])
        matchDF <- filter(pdf, Phrase == phrase)
        if (nrow(matchDF) > 0)
        {
            predictions <- matchDF[['Prediction']]
            if (nchar(wordInProgress) > 0)
            {
                pattern <- paste0("^", wordInProgress)
                predictions <- grep(pattern, predictions, value = TRUE)
            }
            
            # is there anything to return?
            if (length(predictions) > 0)
            {
                if (length(predictions) > maxNumPredictions)
                    predictions <- predictions[1:maxNumPredictions]
                #cat("5-PG", fill = TRUE)
                return(predictions)
            }
        }
    }
    
    # quadgrams
    if (numInputWords >= 3)
    {
        phrase <- paste(inputWords[numInputWords - 2], inputWords[numInputWords - 1],
                        inputWords[numInputWords])
        matchDF <- filter(qdf, Phrase == phrase)
        if (nrow(matchDF) > 0)
        {
            predictions <- matchDF[['Prediction']]
            if (nchar(wordInProgress) > 0)
            {
                pattern <- paste0("^", wordInProgress)
                predictions <- grep(pattern, predictions, value = TRUE)
            }
            
            # is there anything to return?
            if (length(predictions) > 0)
            {
                if (length(predictions) > maxNumPredictions)
                    predictions <- predictions[1:maxNumPredictions]
                #cat("4-QG", fill = TRUE)
                return(predictions)
            }
        }
    }
    
    # trigrams
    if (numInputWords >= 2)
    {
        phrase <- paste(inputWords[numInputWords - 1], inputWords[numInputWords])
        matchDF <- filter(tdf, Phrase == phrase)
        if (nrow(matchDF) > 0)
        {
            predictions <- matchDF[['Prediction']]
            if (nchar(wordInProgress) > 0)
            {
                pattern <- paste0("^", wordInProgress)
                predictions <- grep(pattern, predictions, value = TRUE)
            }
            
            # is there anything to return?
            if (length(predictions) > 0)
            {
                if (length(predictions) > maxNumPredictions)
                    predictions <- predictions[1:maxNumPredictions]
                #cat("3-TG", fill = TRUE)
                return(predictions)
            }
        }
    }
    
    # bigrams
    if (numInputWords == 1)
    {
        phrase <- inputWords[numInputWords]
        matchDF <- filter(bdf, Phrase == phrase)
        if (nrow(matchDF) > 0)
        {
            predictions <- matchDF[['Prediction']]
            if (nchar(wordInProgress) > 0)
            {
                pattern <- paste0("^", wordInProgress)
                predictions <- grep(pattern, predictions, value = TRUE)
            }
            
            # is there anything to return?
            if (length(predictions) > 0)
            {
                if (length(predictions) > maxNumPredictions)
                    predictions <- predictions[1:maxNumPredictions]
                #cat("2-BG", fill = TRUE)
                return(predictions)
            }
        }
    }
    
    # unigrams
    predictions <- udf[['Prediction']]
    
    if (nchar(wordInProgress) > 0)
    {
        pattern <- paste0("^", wordInProgress)
        predictions <- grep(pattern, predictions, value = TRUE)
    }
    
    if (length(predictions) > maxNumPredictions)
        predictions <- predictions[1:maxNumPredictions]
    #cat("1-UG", fill = TRUE)
    return(predictions)
}
# load the N-Grams
unigramDF <- readRDS(fullUnigramFile)
bigramDF <- readRDS(fullBigramFile)
trigramDF <- readRDS(fullTrigramFile)
quadgramDF <- readRDS(fullQuadgramFile)

pentagramDF <- NULL
if (USE_PENTAGRAM == TRUE)
{
    pentagramDF <- readRDS(fullPentagramFile)
}
x <- "Let us go to "
z <- PredictWord(x, unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF, MAX_NUM_PREDICTIONS, FALSE)
z
##  [1] "the"     "bed"     "sleep"   "a"       "school"  "work"    "my"     
##  [8] "college" "hell"    "church"

Accuracy

Let us do some tests to understand the accuracy of prediction. The scneario of usage of this model involves user interaction to pick the next word from a set of choices or type their own. In our case, we have three categories for analysis. Accuracy of the top word, accuracy of being in the top 5 words and accuracy of being in the top 10 words.

ACCURACY_ANALYSIS_NUM_ITEMS <- 300

# gets N lines from a given file
GetNLinesFromFile <- function(fileName, dirName, numLines)
{
    con <- file(paste0(dirName, '/', fileName), "r")
    fileLines <- readLines(con, n = numLines, skipNul = TRUE, encoding = "UTF-8")
    close(con)
    return (fileLines)
}

# returns accuracy count in a data frame
GetAccuracyCount <- function (fileName, dirName, numLines, udf, bdf, tdf, qdf, pdf)
{
    # some counters
    sampleCount <- 0
    top1Predictions <- 0
    top5Predictions <- 0
    top10Predictions <- 0
    
    df <- data.frame(sample = numeric(1), top1 = numeric(1), top5 = numeric(1), top10 = numeric(1))
    
    # read the file and process each line
    lines <- GetNLinesFromFile(fileName, dirName, numLines)
    for (lineIdx in 1:length(lines))
    {
        if (length(lines) == 0) break
        
        # clean the data and split into words
        line <- CleanInput(lines[[lineIdx]])
        words <- strsplit(line, ' ')[[1]]
        numWords <- length(words)
        
        # predict each word starting with a phrase of 1 word and up
        maxWordIdx <- numWords - 1
        for (wordIdx in 1:maxWordIdx)
        {
            if (maxWordIdx < 1) break
            
            inputPhrase <- paste0(words[1:wordIdx], " ", collapse = "")
            predictions <- PredictWord(inputPhrase, udf, bdf, tdf, qdf, pdf, 10, FALSE)
            
            # is the expected word in predictions?
            expectedWord <- words[[wordIdx + 1]]
            expectedWordIdx <- 0
            
            for (idx in 1:length(predictions))
            {
                if (length(predictions) == 0) break
                
                if (predictions[[idx]] == expectedWord)
                {
                    expectedWordIdx <- idx
                    break
                }
            }
            
            # do the statistics
            sampleCount <- sampleCount + 1
            if (expectedWordIdx != 0)
            {
                if (expectedWordIdx == 1) {top1Predictions <- top1Predictions + 1}
                if (expectedWordIdx <= 5) {top5Predictions <- top5Predictions + 1}
                if (expectedWordIdx <= 10) {top10Predictions <- top10Predictions + 1}
            }
        }
    }
    
    # update the dataframe
    df$sample[1] <- sampleCount
    df$top1[1] <- top1Predictions
    df$top5[1] <- top5Predictions
    df$top10[1] <- top10Predictions
    
    return(df)
}

# predictions
df <- GetAccuracyCount(BLOGS_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
                       unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(df) <- "Blogs"

dfNews <- GetAccuracyCount(NEWS_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
                       unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(dfNews) <- "News"

dfTwitter <- GetAccuracyCount(TWITTER_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
                       unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(dfTwitter) <- "Twitter"

dfc <- rbind(df, dfNews, dfTwitter)

The following shows the sample size and the number of correct predictions made. Prediction is made after a word is completed.

dfc
##         sample top1 top5 top10
## Blogs    11161 1250 2244  2631
## News     10352 1165 2016  2346
## Twitter   3480  398  724   863

The following table shows the accuracy in terms of percentage values:

Top 1 % Top 5 % Top 10 %
Blogs 11.2% 20.1% 23.6%
News 11.3% 19.5% 22.7%
Twitter 11.4% 20.8% 24.8%
ALL 11.3% 19.9% 23.4%

The following code performs prediction as each and every letter is being typed.

ACCURACY_ANALYSIS_NUM_ITEMS <- 100

# returns accuracy count in a data frame for WIP (words in progress)
GetAccuracyCountWIP <- function (fileName, dirName, numLines, udf, bdf, tdf, qdf, pdf)
{
    # some counters
    sampleCount <- 0
    top1Predictions <- 0
    top5Predictions <- 0
    top10Predictions <- 0
    
    df <- data.frame(sample = numeric(1), top1 = numeric(1), top5 = numeric(1), top10 = numeric(1))
    
    # read the file and process each line
    lines <- GetNLinesFromFile(fileName, dirName, numLines)
    for (lineIdx in 1:length(lines))
    {
        if (length(lines) == 0) break
        
        # clean the data and split into words
        line <- CleanInput(lines[[lineIdx]])
        words <- strsplit(line, ' ')[[1]]
        numWords <- length(words)
        
        # predict each word assuming each letter is being typed
        inputPhrase <- ""
        for (wordIdx in 1:numWords)
        {
            if (numWords < 1) break
            
            expectedWord <- words[[wordIdx]]
            wordLen <- nchar(expectedWord)
            
            for (charIdx in 0:wordLen)
            {
                if (charIdx == 0)
                {
                    if (wordIdx > 1) {inputPhrase <- paste0(inputPhrase, " ")}
                }
                else
                {
                    inputPhrase <- paste0(inputPhrase, substring(expectedWord, charIdx, charIdx))
                }
                #cat(inputPhrase, fill = TRUE)
                predictions <- PredictWord(inputPhrase, udf, bdf, tdf, qdf, pdf, 10, TRUE)
                
                # is the expected word in predictions?
                expectedWordIdx <- 0
                
                for (idx in 1:length(predictions))
                {
                    if (length(predictions) == 0) break
                    
                    if (predictions[[idx]] == expectedWord)
                    {
                        expectedWordIdx <- idx
                        break
                    }
                }
                
                # do the statistics
                sampleCount <- sampleCount + 1
                if (expectedWordIdx != 0)
                {
                    if (expectedWordIdx == 1) {top1Predictions <- top1Predictions + 1}
                    if (expectedWordIdx <= 5) {top5Predictions <- top5Predictions + 1}
                    if (expectedWordIdx <= 10) {top10Predictions <- top10Predictions + 1}
                }
            }
        }
    }
    
    # update the dataframe
    df$sample[1] <- sampleCount
    df$top1[1] <- top1Predictions
    df$top5[1] <- top5Predictions
    df$top10[1] <- top10Predictions
    
    return(df)
}

# predictions
df <- GetAccuracyCountWIP(BLOGS_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
                       unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(df) <- "Blogs"

dfNews <- GetAccuracyCountWIP(NEWS_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
                       unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(dfNews) <- "News"

dfTwitter <- GetAccuracyCountWIP(TWITTER_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
                       unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(dfTwitter) <- "Twitter"

dfi <- rbind(df, dfNews, dfTwitter)

The following shows the sample size and the number of correct predictions made. Prediction is made as each letter is being typed.

dfi
##         sample top1  top5 top10
## Blogs    18719 8161 10989 11700
## News     20781 9522 12526 13348
## Twitter   6844 2951  3989  4263

The following table shows the accuracy in terms of percentage values:

Top 1 % Top 5 % Top 10 %
Blogs 43.6% 58.7% 62.5%
News 45.8% 60.3% 64.2%
Twitter 43.1% 58.3% 62.3%
ALL 44.5% 59.3% 63.2%

Resources, Accuracy and Performance

This application has been deployed to Shinyapps environment. Due to 1 GB memory size limits, Pentagram was not included.

Memory The following table shows the number of rows and memory size of various N-Gram objects in the R environment.

Rows Memory (MB)
Unigram 143,691 9
Bigram 1,562,907 50
Trigram 3,343,187 166
Quadgram 4,057,408 297
Pentagram 4,093,723 362
TOTAL 13,200,916 884

Accuracy

Performance

Future Possibilities

Data sets: Cross-validation technique is used for creation of training data set, test data set and validation data set. The percentages envsioned (from the original corpus) are 5% for training, 50% for testing and 45% for validation. Due to memory and processing constraints, testing was done on much smaller dataset.

Data Cleaning: Punctuation charcaters need to be studied more carefully. For example, period and exclamation indicate end of a sentence which is valuable information. Numbers have been removed and its impact need to be studied. Removing stopwords can increase the predictability in some cases.

Modeling: After researching various approaches, paper by Daniel Jurafsky & James H. Martin is used as the basis for building prediction model. 1-gram, 2-gram, 3-gram, 4-gram and 5-gram are the N-grams that have been built. Analysis needs to be made to see if the larger N-grams yield better results. Also, further optimizations are possible which requires more research and experimentation.

Resource Optimization: N-Grams memory usage can be reduced further by using TRIE data structures or even a simple lookup of words using a fixed lexicon. This has the side benefit of validation of misspelled words in the model.

Conclusions

N-Grams as a method of prediction works! Using the model to make predictions as the user is typing increases the accuracy while significantly enhancing the end user experience. Further refinements can be made to make this model even better!!!

Appendix 1: Trimming N-Grams

The following code can be used to trim the N-Grams knowing our interest is limited to number of words to be predicted. The value is set to 10 now and an experiment was run to understand the effect of various MAX_NUM_PREDICTIONS (10, 5, 1). The drawback to this approach is that we can make predictions only after a word is completed by the use. If we keep the entire N-Gram, we can predictions even as the user is typing a partial word.

Given the savings in memory is not significant, we are not trimming the N-Grams. Please see the table below for reduction % of number of rows. NP stands for Number of Predictions. Red% stands for Reduction %.

Rows Memory (MB) NP=10 Rows NP=10 Red% NP=5 Rows NP=5 Red% NP=1 Rows NP=1 Red%
Unigram 143,691 9 10 100% 5 100% 1 100%
Bigram 1,562,907 50 403,761 74% 298,071 81% 135,904 91%
Trigram 3,343,187 166 2,485,389 26% 2,214,177 34% 1,489,168 55%
Quadgram 4,057,408 297 3,878,993 4% 3,752,510 8% 3,198,945 21%
Pentagram 4,093,723 362 4,077,302 0% 4,055,580 1% 3,878,756 5%
TOTAL 13,200,916 884 10,845,455 18% 10,320,343 22% 8,702,774 34%
# function to find the size of the trimmed data frame
# x is the input data frame
FindTrimmedDataFrameSize <- function(x, numPredictions)
{
    curPhrase <- ''
    numCurPhrasePredictions <- 0
    dfNumRows <- 0
    
    for (idx in 1:nrow(x))
    {
        # new cur phrase?
        if (x[idx, "Phrase"] != curPhrase)
        {
            # start a new cur phrase
            curPhrase <- x[idx, "Phrase"]
            numCurPhrasePredictions <- 0
        }
        
        # are we within limits?
        if (numCurPhrasePredictions < numPredictions)
        {
            dfNumRows <- dfNumRows + 1
            numCurPhrasePredictions <- numCurPhrasePredictions + 1
        }
    }
    
    return(dfNumRows)
}

# function to trim the N-Gram data frame to have results that matter
# and used. For a given Phrase, only the top numPredictions are kept
# x is the input data frame
TrimNgramDataFrame <- function(x, numPredictions)
{
    # find the trimmed data frame size (faster to create fixed size data frame)
    dfNumRows <- FindTrimmedDataFrameSize(x, numPredictions)
    
    # create a new data frame for results
    df <- data.frame(Phrase = character(dfNumRows), Frequency = numeric(dfNumRows),
                     Prediction = character(dfNumRows), stringsAsFactors = FALSE)
    curPhrase <- ''
    numCurPhrasePredictions <- 0
    dfRowIdx <- 1
    
    for (idx in 1:nrow(x))
    {
        # new cur phrase?
        if (x[idx, "Phrase"] != curPhrase)
        {
            # start a new cur phrase
            curPhrase <- x[idx, "Phrase"]
            numCurPhrasePredictions <- 0
        }
        
        # are we within limits?
        if (numCurPhrasePredictions < numPredictions)
        {
            df[dfRowIdx,] <- x[idx,]
            dfRowIdx <- dfRowIdx + 1
            numCurPhrasePredictions <- numCurPhrasePredictions + 1
        }
    }
    
    return(df)
}

References

Daniel Jurafsky & James H. Martin: Speech and Language Processing. Draft of September 1, 2014.

Kailash Awati, Sensanalytics Consulting: A gentle introduction to text mining using R.