The goal of the project is to build a prediction model to find the next probable word based on the sequence of words before it. We are using data from a corpus called HC Corpora to build our prediction model.
en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt are the files in the corpus for US English language. Let us find the word counts and line counts for the above files first.
# load libraries, set seed
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(RWeka)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
set.seed(1234321)
# constants
TRAIN_SET_RATIO <- 0.05
TEST_SET_RATIO <- 0.5
MAX_NUM_PREDICTIONS <- 10
MIN_TDM_SPARSE <- 0.9
TOP_NGRAM_WORD_COUNT <- 10
TOP_NGRAM_HISTOGRAM_COUNT <- 25
TOP_NGRAM_WORDCLOUD_COUNT <- 100
RAW_DATA_DIR_NAME <- 'RawData'
TRAIN_DIR_NAME <- 'TrainData'
TEST_DIR_NAME <- 'TestData'
VALIDATE_DIR_NAME <- 'ValidateData'
BLOGS_FILE_NAME <- 'en_US.blogs.txt'
NEWS_FILE_NAME <- 'en_US.news_mod.txt' # modified file to fix an issue with original file
NEWS_ORIGINAL_FILE_NAME <- 'en_US.news.txt'
TWITTER_FILE_NAME <- 'en_US.twitter.txt'
MODEL_DIR_NAME <- 'Model'
UNIGRAM_FILE_NAME <- 'Unigram.Rds'
BIGRAM_FILE_NAME <- 'Bigram.Rds'
TRIGRAM_FILE_NAME <- 'Trigram.Rds'
QUADGRAM_FILE_NAME <- 'Quadgram.Rds'
PENTAGRAM_FILE_NAME <- 'Pentagram.Rds'
USE_PENTAGRAM <- FALSE
# fixes erros in the news file where ASCII code O(32) can be interpreted as EOF
FixFileError <- function()
{
originalNewsFileName <- paste0(RAW_DATA_DIR_NAME, '/', NEWS_ORIGINAL_FILE_NAME)
modifiedNewsFileName <- paste0(RAW_DATA_DIR_NAME, '/', NEWS_FILE_NAME)
if (file.exists(modifiedNewsFileName) == FALSE)
{
# read original contents
con <- file(originalNewsFileName, "rb")
fileLines <- readLines(con)
close(con)
# clean up the error
updatedFileLines <- gsub("\032", "", fileLines, perl = TRUE)
# save the modified file
con <- file(modifiedNewsFileName, "w")
writeLines(updatedFileLines, con)
close(con)
# move the file to parent directory
file.rename(originalNewsFileName, NEWS_ORIGINAL_FILE_NAME)
# cleanup big objects
rm(list = c('fileLines', 'updatedFileLines'))
}
}
# gets lines from a given file
GetLinesFromFile <- function(fileName, dirName)
{
con <- file(paste0(dirName, '/', fileName), "r")
fileLines <- readLines(con, skipNul = TRUE, encoding = "UTF-8")
close(con)
return (fileLines)
}
# gets words from a given file
GetWordsFromFile <- function(fileName, dirName)
{
fullFileName <- paste0(dirName, '/', fileName)
fileWords <- scan(fullFileName, character(0), encoding = "UTF-8")
return (fileWords)
}
FixFileError()
blogLines <- GetLinesFromFile(BLOGS_FILE_NAME, RAW_DATA_DIR_NAME)
newsLines <- GetLinesFromFile(NEWS_FILE_NAME, RAW_DATA_DIR_NAME)
twitterLines <- GetLinesFromFile(TWITTER_FILE_NAME, RAW_DATA_DIR_NAME)
lcBlog <- length(blogLines)
lcNews <- length(newsLines)
lcTwitter <- length(twitterLines)
blogWords <- GetWordsFromFile(BLOGS_FILE_NAME, RAW_DATA_DIR_NAME)
newsWords <- GetWordsFromFile(NEWS_FILE_NAME, RAW_DATA_DIR_NAME)
twitterWords <- GetWordsFromFile(TWITTER_FILE_NAME, RAW_DATA_DIR_NAME)
## Warning in scan(fullFileName, character(0), encoding = "UTF-8"): EOF within
## quoted string
## Warning in scan(fullFileName, character(0), encoding = "UTF-8"): embedded
## nul(s) found in input
wcBlog <- length(blogWords)
wcNews <- length(newsWords)
wcTwitter <- length(twitterWords)
statStr <- paste0(BLOGS_FILE_NAME, ": Line count is ", lcBlog,
", Word count is ", wcBlog, "\n",
NEWS_FILE_NAME, ": Line count is ", lcNews,
", Word count is ", wcNews, "\n",
TWITTER_FILE_NAME, ": Line count is ", lcTwitter,
", Word count is ", wcTwitter)
cat(statStr)
## en_US.blogs.txt: Line count is 899288, Word count is 35314678
## en_US.news_mod.txt: Line count is 1010242, Word count is 29313525
## en_US.twitter.txt: Line count is 2360148, Word count is 9141571
# clear the big objects
rm(list = c('blogLines', 'newsLines', 'twitterLines', 'blogWords', 'newsWords', 'twitterWords'))
Our goal is to use 50% of the content for training data set, 25% of the content for test data set and 25% of the content for validation data set. For now, we are keeping blogs, news and twitter as separate documents. It is possible to combine all of them and use one document. This will be considered as a possible alternative approach as we build our models.
Given the large size of the original files and the processing time, we will use 5% of the content for training data set, 50% for test data set and 45% for validation data set for now. This will be revised as we explore various options further!
# function for dividing the dataset
DivideDataset <- function (x, fileName)
{
# figure the indices
trainLen <- round(TRAIN_SET_RATIO * length(x))
testLen <- round(TEST_SET_RATIO * length(x))
testEndIndex <- trainLen + testLen
# divide the data set
train <- x[1:trainLen]
test <- x[(trainLen + 1):testEndIndex]
validate <- x[(testEndIndex + 1):length(x)]
# write the data sets
write(train, paste0(TRAIN_DIR_NAME, '/', fileName))
write(test, paste0(TEST_DIR_NAME, '/', fileName))
write(validate, paste0(VALIDATE_DIR_NAME, '/', fileName))
}
# create corpus, sample the data for randomizing the lines
docs <- Corpus(DirSource(RAW_DATA_DIR_NAME, encoding = "UTF-8"), readerControl = list(language = "en"))
blogs <- sample(docs[[1]][[1]])
news <- sample(docs[[2]][[1]])
twitter <- sample(docs[[3]][[1]])
# divide the data sets and save them
DivideDataset(blogs, BLOGS_FILE_NAME)
DivideDataset(news, NEWS_FILE_NAME)
DivideDataset(twitter, TWITTER_FILE_NAME)
# clear the big objects
rm(list = c('docs', 'blogs', 'news', 'twitter'))
Let us clean up the data before analysis.
corpus <- Corpus(DirSource(TRAIN_DIR_NAME, encoding = "UTF-8"), readerControl = list(language = "en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, PlainTextDocument)
Let us create a Term Document Matrix for Unigrams.
unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigramTDM <- TermDocumentMatrix(corpus, control = list(tokenizer = unigramTokenizer))
unigramTDM <- removeSparseTerms(unigramTDM, MIN_TDM_SPARSE)
Top 10 unigrams and their frequency is shown followed by a simple histogram for top 25 words followed by a wordcloud for top 100 words. All of this is based on Training Data.
# top 10 unigrams
unigramFreq <- sort(rowSums(as.matrix(unigramTDM)), decreasing = TRUE)
unigramDF = data.frame(Term = names(unigramFreq), Frequency = unigramFreq,
stringsAsFactors = FALSE)
head(unigramDF, n = TOP_NGRAM_WORD_COUNT)
## Term Frequency
## the the 238726
## and and 120018
## for for 55037
## that that 52030
## you you 46861
## with with 35866
## was was 31521
## this this 27212
## have have 26600
## are are 24566
# histogram (top 25 unigrams)
g <- ggplot(unigramDF[1:TOP_NGRAM_HISTOGRAM_COUNT,], aes(Term, Frequency)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g
# wordcloud (top 100 unigrams)
wordcloud(names(unigramFreq), unigramFreq, scale = c(8, 0.5),
max.words = TOP_NGRAM_WORDCLOUD_COUNT, random.color = TRUE,
rot.per = 0.25, colors = brewer.pal(6, 'Dark2'))
# clear the big objects
rm(list = c('unigramTDM', 'unigramFreq'))
Let us create a Term Document Matrix for Bigrams.
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2, delimiters = " \\n\\t\\r"))
bigramTDM <- TermDocumentMatrix(corpus, control = list(tokenizer = bigramTokenizer))
bigramTDM <- removeSparseTerms(bigramTDM, MIN_TDM_SPARSE)
Top 10 bigrams and their frequency is shown followed by a simple histogram for top 25 words followed by a wordcloud for top 100 words. All of this is based on Training Data.
# top 10 bigrams
bigramFreq <- sort(rowSums(as.matrix(bigramTDM)), decreasing = TRUE)
bigramDF = data.frame(Term = names(bigramFreq), Frequency = bigramFreq,
stringsAsFactors = FALSE)
head(bigramDF, n = TOP_NGRAM_WORD_COUNT)
## Term Frequency
## of the of the 21457
## in the in the 20588
## to the to the 10710
## for the for the 10207
## on the on the 9783
## to be to be 8014
## at the at the 7041
## and the and the 6354
## in a in a 6046
## with the with the 5420
# histogram (top 25 bigrams)
g <- ggplot(bigramDF[1:TOP_NGRAM_HISTOGRAM_COUNT,], aes(Term, Frequency)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g
# wordcloud (top 100 bigrams)
wordcloud(names(bigramFreq), bigramFreq, scale = c(4, 0.5),
max.words = TOP_NGRAM_WORDCLOUD_COUNT, random.color = TRUE,
rot.per = 0.25, colors = brewer.pal(6, 'Dark2'))
# clear the big objects
rm(list = c('bigramTDM', 'bigramFreq'))
Let us create a Term Document Matrix for Trigrams.
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3, delimiters = " \\n\\t\\r"))
trigramTDM <- TermDocumentMatrix(corpus, control = list(tokenizer = trigramTokenizer))
trigramTDM <- removeSparseTerms(trigramTDM, MIN_TDM_SPARSE)
Top 10 trigrams and their frequency is shown followed by a simple histogram for top 25 words followed by a wordcloud for top 100 words. All of this is based on Training Data.
# top 10 trigrams
trigramFreq <- sort(rowSums(as.matrix(trigramTDM)), decreasing = TRUE)
trigramDF = data.frame(Term = names(trigramFreq), Frequency = trigramFreq,
stringsAsFactors = FALSE)
head(trigramDF, n = TOP_NGRAM_WORD_COUNT)
## Term Frequency
## one of the one of the 1693
## a lot of a lot of 1470
## thanks for the thanks for the 1158
## going to be going to be 864
## to be a to be a 843
## i want to i want to 746
## it was a it was a 743
## the end of the end of 719
## out of the out of the 713
## some of the some of the 671
# histogram (top 25 trigrams)
g <- ggplot(trigramDF[1:TOP_NGRAM_HISTOGRAM_COUNT,], aes(Term, Frequency)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g
# wordcloud (top 100 trigrams)
wordcloud(names(trigramFreq), trigramFreq, scale = c(2, 0.5),
max.words = TOP_NGRAM_WORDCLOUD_COUNT, random.color = TRUE,
rot.per = 0.25, colors = brewer.pal(6, 'Dark2'))
# clear the big objects
rm(list = c('trigramTDM', 'trigramFreq'))
Let us create a Term Document Matrix for Quadgrams.
quadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4, delimiters = " \\n\\t\\r"))
quadgramTDM <- TermDocumentMatrix(corpus, control = list(tokenizer = quadgramTokenizer))
quadgramTDM <- removeSparseTerms(quadgramTDM, MIN_TDM_SPARSE)
Top 10 quadgrams and their frequency is shown followed by a simple histogram for top 25 words followed by a wordcloud for top 100 words. All of this is based on Training Data.
# top 10 quadgrams
quadgramFreq <- sort(rowSums(as.matrix(quadgramTDM)), decreasing = TRUE)
quadgramDF = data.frame(Term = names(quadgramFreq), Frequency = quadgramFreq,
stringsAsFactors = FALSE)
head(quadgramDF, n = TOP_NGRAM_WORD_COUNT)
## Term Frequency
## the end of the the end of the 393
## the rest of the the rest of the 346
## for the first time for the first time 328
## at the end of at the end of 322
## thanks for the follow thanks for the follow 298
## at the same time at the same time 254
## is going to be is going to be 215
## in the middle of in the middle of 210
## when it comes to when it comes to 206
## is one of the is one of the 204
# histogram (top 25 quadgrams)
g <- ggplot(quadgramDF[1:TOP_NGRAM_HISTOGRAM_COUNT,], aes(Term, Frequency)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g
# wordcloud (top 100 quadgrams)
wordcloud(names(quadgramFreq), quadgramFreq, scale = c(1.5, 0.5),
max.words = TOP_NGRAM_WORDCLOUD_COUNT, random.color = TRUE,
rot.per = 0.25, colors = brewer.pal(6, 'Dark2'))
# clear the big objects
rm(list = c('quadgramTDM', 'quadgramFreq'))
Let us create a Term Document Matrix for Pentagrams.
pentagramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 5, max = 5, delimiters = " \\n\\t\\r"))
pentagramTDM <- TermDocumentMatrix(corpus, control = list(tokenizer = pentagramTokenizer))
pentagramTDM <- removeSparseTerms(pentagramTDM, MIN_TDM_SPARSE)
Top 10 pentagrams and their frequency is shown followed by a simple histogram for top 25 words followed by a wordcloud for top 100 words. All of this is based on Training Data.
# top 10 pentagrams
pentagramFreq <- sort(rowSums(as.matrix(pentagramTDM)), decreasing = TRUE)
pentagramDF = data.frame(Term = names(pentagramFreq), Frequency = pentagramFreq,
stringsAsFactors = FALSE)
head(pentagramDF, n = TOP_NGRAM_WORD_COUNT)
## Term Frequency
## at the end of the at the end of the 186
## in the middle of the in the middle of the 102
## for the first time in for the first time in 86
## the end of the day the end of the day 61
## by the end of the by the end of the 60
## thank you so much for thank you so much for 58
## its going to be a its going to be a 57
## there are a lot of there are a lot of 56
## for the rest of the for the rest of the 47
## i cant wait to see i cant wait to see 47
# histogram (top 25 pentagrams)
g <- ggplot(pentagramDF[1:TOP_NGRAM_HISTOGRAM_COUNT,], aes(Term, Frequency)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g
# wordcloud (top 100 pentagrams)
wordcloud(names(pentagramFreq), pentagramFreq, scale = c(1.0, 0.5),
max.words = TOP_NGRAM_WORDCLOUD_COUNT, random.color = TRUE,
rot.per = 0.25, colors = brewer.pal(6, 'Dark2'))
# clear the big objects
rm(list = c('pentagramTDM', 'pentagramFreq', 'corpus'))
Let us extract the phrase part and the prediction part from the N-Grams. For pentagram, phrase is the first four words and prediction is the fifth word. For quadgram, phrase is the first three words and prediction is the fourth word. For trigram, phrase is the first two words and prediction is the third word. For bigram, phrase is the first word and prediction is the second word. For unigram, there is no phrase and prediction is the word.
unigramDF <- rename(unigramDF, Prediction = Term)
rownames(unigramDF) <- NULL # reduces dataframe size
# function to create additional columns in data frame for Bigrams
AddProbabilityToBigrams<-function(x)
{
# extract the bigram Term
term <- x[['Term']]
Phrase <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+)", t))})
Prediction <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+$)", t))})
return(cbind(x, Phrase, Prediction, stringsAsFactors = FALSE))
}
bigramDF <- AddProbabilityToBigrams(bigramDF)
bigramDF <- arrange(bigramDF, Phrase, desc(Frequency))
bigramDF <- select(bigramDF, Phrase, Frequency, Prediction) # reduces dataframe size
# function to create additional columns in data frame for Trigrams
AddProbabilityToTrigrams<-function(x)
{
# extract the trigram Term
term <- x[['Term']]
Phrase <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+) ([a-z]+)", t))})
Prediction <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+$)", t))})
return(cbind(x, Phrase, Prediction, stringsAsFactors = FALSE))
}
trigramDF <- AddProbabilityToTrigrams(trigramDF)
trigramDF <- arrange(trigramDF, Phrase, desc(Frequency))
trigramDF <- select(trigramDF, Phrase, Frequency, Prediction) # reduces dataframe size
# function to create additional columns in data frame for Quadgrams
AddProbabilityToQuadgrams<-function(x)
{
# extract the quadgram Term
term <- x[['Term']]
#Phrase <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+) ([a-z]+) ([a-z]+)", t))})
#Prediction <- sapply(term, function(t) { regmatches(t, regexpr("+([a-z]+$)", t))})
Phrase <- sapply(term, function(t) { w<-strsplit(t, ' ')[[1]];paste(w[1], w[2], w[3]) })
Prediction <- sapply(term, function(t) { w<-strsplit(t, ' ')[[1]];w[4]})
return(cbind(x, Phrase, Prediction, stringsAsFactors = FALSE))
}
quadgramDF <- AddProbabilityToQuadgrams(quadgramDF)
quadgramDF <- arrange(quadgramDF, Phrase, desc(Frequency))
quadgramDF <- select(quadgramDF, Phrase, Frequency, Prediction) # reduces dataframe size
# function to create additional columns in data frame for Pentagrams
AddProbabilityToPentagrams<-function(x)
{
# extract the pentagram Terms
term <- x[['Term']]
Phrase <- sapply(term, function(t) { w<-strsplit(t, ' ')[[1]];paste(w[1], w[2], w[3], w[4]) })
Prediction <- sapply(term, function(t) { w<-strsplit(t, ' ')[[1]];w[5]})
return(cbind(x, Phrase, Prediction, stringsAsFactors = FALSE))
}
pentagramDF <- AddProbabilityToPentagrams(pentagramDF)
pentagramDF <- arrange(pentagramDF, Phrase, desc(Frequency))
pentagramDF <- select(pentagramDF, Phrase, Frequency, Prediction) # reduces dataframe size
We save the N-Grams so that it is faster to load and get to prediction!
# form the file names
fullUnigramFile <- paste0(MODEL_DIR_NAME, '/', UNIGRAM_FILE_NAME)
fullBigramFile <- paste0(MODEL_DIR_NAME, '/', BIGRAM_FILE_NAME)
fullTrigramFile <- paste0(MODEL_DIR_NAME, '/', TRIGRAM_FILE_NAME)
fullQuadgramFile <- paste0(MODEL_DIR_NAME, '/', QUADGRAM_FILE_NAME)
fullPentagramFile <- paste0(MODEL_DIR_NAME, '/', PENTAGRAM_FILE_NAME)
# save the files
saveRDS(unigramDF, fullUnigramFile)
saveRDS(bigramDF, fullBigramFile)
saveRDS(trigramDF, fullTrigramFile)
saveRDS(quadgramDF, fullQuadgramFile)
saveRDS(pentagramDF, fullPentagramFile)
Our approach to prediction is as follows. We look up the phrase in Pentagrams first and return the top candidates if there is a match. If there is no match in Pentagrams, we try it in Quadgrams for a match If there is no match in Quadgrams, we try it in Trigrams for a match If there is no match in Trigrams, we try it in bigrams for a match. If there is no match in bigrams, we return the top candidates in unigrams.
# function to clean the input where inputPhrase is a character vector
CleanInput <- function(inputPhrase)
{
# create corpus and follow the exact cleaning steps
# as was done with training data
inputCorpus <- VCorpus(VectorSource(c(inputPhrase)))
inputCorpus <- tm_map(inputCorpus, removePunctuation)
inputCorpus <- tm_map(inputCorpus, removeNumbers)
inputCorpus <- tm_map(inputCorpus, tolower)
inputCorpus <- tm_map(inputCorpus, PlainTextDocument)
return(as.character(inputCorpus[[1]]))
}
# function to predict the next word given an input phrase,
# udf, bdf, tdf, qdf and pdf are unigram, bigram, trigram,
# quadgram and pentagram dataframes, the maximum number of
# words to be predicted and whether prediction should happen
# for partial words in progress
#
# Note: if the last letter in the input is not a space, it is
# taken as a word in progress
#
# returns a vector of predictions
PredictWord <- function(inputPhrase, udf, bdf, tdf, qdf, pdf, maxNumPredictions,
predictWordsInProgress)
{
# is the last word a word in progress?
isLastWordInProgress <- TRUE
wordInProgress <- ""
len <- nchar(inputPhrase)
if (substr(inputPhrase, len, len) == ' ')
isLastWordInProgress <- FALSE
# clean the input first and count the words in the phrase
inputPhrase <- CleanInput(inputPhrase)
inputWords <- strsplit(inputPhrase, ' ')[[1]]
numInputWords <- length(inputWords)
if (isLastWordInProgress == TRUE && predictWordsInProgress == TRUE)
{
# find the word in progress and decrement the total number of words
if (numInputWords > 0)
{
wordInProgress <- inputWords[numInputWords]
numInputWords <- numInputWords - 1
}
}
# pentagrams
if (USE_PENTAGRAM == TRUE && numInputWords >= 4)
{
phrase <- paste(inputWords[numInputWords - 3], inputWords[numInputWords - 2],
inputWords[numInputWords - 1], inputWords[numInputWords])
matchDF <- filter(pdf, Phrase == phrase)
if (nrow(matchDF) > 0)
{
predictions <- matchDF[['Prediction']]
if (nchar(wordInProgress) > 0)
{
pattern <- paste0("^", wordInProgress)
predictions <- grep(pattern, predictions, value = TRUE)
}
# is there anything to return?
if (length(predictions) > 0)
{
if (length(predictions) > maxNumPredictions)
predictions <- predictions[1:maxNumPredictions]
#cat("5-PG", fill = TRUE)
return(predictions)
}
}
}
# quadgrams
if (numInputWords >= 3)
{
phrase <- paste(inputWords[numInputWords - 2], inputWords[numInputWords - 1],
inputWords[numInputWords])
matchDF <- filter(qdf, Phrase == phrase)
if (nrow(matchDF) > 0)
{
predictions <- matchDF[['Prediction']]
if (nchar(wordInProgress) > 0)
{
pattern <- paste0("^", wordInProgress)
predictions <- grep(pattern, predictions, value = TRUE)
}
# is there anything to return?
if (length(predictions) > 0)
{
if (length(predictions) > maxNumPredictions)
predictions <- predictions[1:maxNumPredictions]
#cat("4-QG", fill = TRUE)
return(predictions)
}
}
}
# trigrams
if (numInputWords >= 2)
{
phrase <- paste(inputWords[numInputWords - 1], inputWords[numInputWords])
matchDF <- filter(tdf, Phrase == phrase)
if (nrow(matchDF) > 0)
{
predictions <- matchDF[['Prediction']]
if (nchar(wordInProgress) > 0)
{
pattern <- paste0("^", wordInProgress)
predictions <- grep(pattern, predictions, value = TRUE)
}
# is there anything to return?
if (length(predictions) > 0)
{
if (length(predictions) > maxNumPredictions)
predictions <- predictions[1:maxNumPredictions]
#cat("3-TG", fill = TRUE)
return(predictions)
}
}
}
# bigrams
if (numInputWords == 1)
{
phrase <- inputWords[numInputWords]
matchDF <- filter(bdf, Phrase == phrase)
if (nrow(matchDF) > 0)
{
predictions <- matchDF[['Prediction']]
if (nchar(wordInProgress) > 0)
{
pattern <- paste0("^", wordInProgress)
predictions <- grep(pattern, predictions, value = TRUE)
}
# is there anything to return?
if (length(predictions) > 0)
{
if (length(predictions) > maxNumPredictions)
predictions <- predictions[1:maxNumPredictions]
#cat("2-BG", fill = TRUE)
return(predictions)
}
}
}
# unigrams
predictions <- udf[['Prediction']]
if (nchar(wordInProgress) > 0)
{
pattern <- paste0("^", wordInProgress)
predictions <- grep(pattern, predictions, value = TRUE)
}
if (length(predictions) > maxNumPredictions)
predictions <- predictions[1:maxNumPredictions]
#cat("1-UG", fill = TRUE)
return(predictions)
}
# load the N-Grams
unigramDF <- readRDS(fullUnigramFile)
bigramDF <- readRDS(fullBigramFile)
trigramDF <- readRDS(fullTrigramFile)
quadgramDF <- readRDS(fullQuadgramFile)
pentagramDF <- NULL
if (USE_PENTAGRAM == TRUE)
{
pentagramDF <- readRDS(fullPentagramFile)
}
x <- "Let us go to "
z <- PredictWord(x, unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF, MAX_NUM_PREDICTIONS, FALSE)
z
## [1] "the" "bed" "sleep" "a" "school" "work" "my"
## [8] "college" "hell" "church"
Let us do some tests to understand the accuracy of prediction. The scneario of usage of this model involves user interaction to pick the next word from a set of choices or type their own. In our case, we have three categories for analysis. Accuracy of the top word, accuracy of being in the top 5 words and accuracy of being in the top 10 words.
ACCURACY_ANALYSIS_NUM_ITEMS <- 300
# gets N lines from a given file
GetNLinesFromFile <- function(fileName, dirName, numLines)
{
con <- file(paste0(dirName, '/', fileName), "r")
fileLines <- readLines(con, n = numLines, skipNul = TRUE, encoding = "UTF-8")
close(con)
return (fileLines)
}
# returns accuracy count in a data frame
GetAccuracyCount <- function (fileName, dirName, numLines, udf, bdf, tdf, qdf, pdf)
{
# some counters
sampleCount <- 0
top1Predictions <- 0
top5Predictions <- 0
top10Predictions <- 0
df <- data.frame(sample = numeric(1), top1 = numeric(1), top5 = numeric(1), top10 = numeric(1))
# read the file and process each line
lines <- GetNLinesFromFile(fileName, dirName, numLines)
for (lineIdx in 1:length(lines))
{
if (length(lines) == 0) break
# clean the data and split into words
line <- CleanInput(lines[[lineIdx]])
words <- strsplit(line, ' ')[[1]]
numWords <- length(words)
# predict each word starting with a phrase of 1 word and up
maxWordIdx <- numWords - 1
for (wordIdx in 1:maxWordIdx)
{
if (maxWordIdx < 1) break
inputPhrase <- paste0(words[1:wordIdx], " ", collapse = "")
predictions <- PredictWord(inputPhrase, udf, bdf, tdf, qdf, pdf, 10, FALSE)
# is the expected word in predictions?
expectedWord <- words[[wordIdx + 1]]
expectedWordIdx <- 0
for (idx in 1:length(predictions))
{
if (length(predictions) == 0) break
if (predictions[[idx]] == expectedWord)
{
expectedWordIdx <- idx
break
}
}
# do the statistics
sampleCount <- sampleCount + 1
if (expectedWordIdx != 0)
{
if (expectedWordIdx == 1) {top1Predictions <- top1Predictions + 1}
if (expectedWordIdx <= 5) {top5Predictions <- top5Predictions + 1}
if (expectedWordIdx <= 10) {top10Predictions <- top10Predictions + 1}
}
}
}
# update the dataframe
df$sample[1] <- sampleCount
df$top1[1] <- top1Predictions
df$top5[1] <- top5Predictions
df$top10[1] <- top10Predictions
return(df)
}
# predictions
df <- GetAccuracyCount(BLOGS_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(df) <- "Blogs"
dfNews <- GetAccuracyCount(NEWS_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(dfNews) <- "News"
dfTwitter <- GetAccuracyCount(TWITTER_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(dfTwitter) <- "Twitter"
dfc <- rbind(df, dfNews, dfTwitter)
The following shows the sample size and the number of correct predictions made. Prediction is made after a word is completed.
dfc
## sample top1 top5 top10
## Blogs 11161 1250 2244 2631
## News 10352 1165 2016 2346
## Twitter 3480 398 724 863
The following table shows the accuracy in terms of percentage values:
| Top 1 % | Top 5 % | Top 10 % | |
|---|---|---|---|
| Blogs | 11.2% | 20.1% | 23.6% |
| News | 11.3% | 19.5% | 22.7% |
| 11.4% | 20.8% | 24.8% | |
| ALL | 11.3% | 19.9% | 23.4% |
The following code performs prediction as each and every letter is being typed.
ACCURACY_ANALYSIS_NUM_ITEMS <- 100
# returns accuracy count in a data frame for WIP (words in progress)
GetAccuracyCountWIP <- function (fileName, dirName, numLines, udf, bdf, tdf, qdf, pdf)
{
# some counters
sampleCount <- 0
top1Predictions <- 0
top5Predictions <- 0
top10Predictions <- 0
df <- data.frame(sample = numeric(1), top1 = numeric(1), top5 = numeric(1), top10 = numeric(1))
# read the file and process each line
lines <- GetNLinesFromFile(fileName, dirName, numLines)
for (lineIdx in 1:length(lines))
{
if (length(lines) == 0) break
# clean the data and split into words
line <- CleanInput(lines[[lineIdx]])
words <- strsplit(line, ' ')[[1]]
numWords <- length(words)
# predict each word assuming each letter is being typed
inputPhrase <- ""
for (wordIdx in 1:numWords)
{
if (numWords < 1) break
expectedWord <- words[[wordIdx]]
wordLen <- nchar(expectedWord)
for (charIdx in 0:wordLen)
{
if (charIdx == 0)
{
if (wordIdx > 1) {inputPhrase <- paste0(inputPhrase, " ")}
}
else
{
inputPhrase <- paste0(inputPhrase, substring(expectedWord, charIdx, charIdx))
}
#cat(inputPhrase, fill = TRUE)
predictions <- PredictWord(inputPhrase, udf, bdf, tdf, qdf, pdf, 10, TRUE)
# is the expected word in predictions?
expectedWordIdx <- 0
for (idx in 1:length(predictions))
{
if (length(predictions) == 0) break
if (predictions[[idx]] == expectedWord)
{
expectedWordIdx <- idx
break
}
}
# do the statistics
sampleCount <- sampleCount + 1
if (expectedWordIdx != 0)
{
if (expectedWordIdx == 1) {top1Predictions <- top1Predictions + 1}
if (expectedWordIdx <= 5) {top5Predictions <- top5Predictions + 1}
if (expectedWordIdx <= 10) {top10Predictions <- top10Predictions + 1}
}
}
}
}
# update the dataframe
df$sample[1] <- sampleCount
df$top1[1] <- top1Predictions
df$top5[1] <- top5Predictions
df$top10[1] <- top10Predictions
return(df)
}
# predictions
df <- GetAccuracyCountWIP(BLOGS_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(df) <- "Blogs"
dfNews <- GetAccuracyCountWIP(NEWS_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(dfNews) <- "News"
dfTwitter <- GetAccuracyCountWIP(TWITTER_FILE_NAME, TEST_DIR_NAME, ACCURACY_ANALYSIS_NUM_ITEMS,
unigramDF, bigramDF, trigramDF, quadgramDF, pentagramDF)
row.names(dfTwitter) <- "Twitter"
dfi <- rbind(df, dfNews, dfTwitter)
The following shows the sample size and the number of correct predictions made. Prediction is made as each letter is being typed.
dfi
## sample top1 top5 top10
## Blogs 18719 8161 10989 11700
## News 20781 9522 12526 13348
## Twitter 6844 2951 3989 4263
The following table shows the accuracy in terms of percentage values:
| Top 1 % | Top 5 % | Top 10 % | |
|---|---|---|---|
| Blogs | 43.6% | 58.7% | 62.5% |
| News | 45.8% | 60.3% | 64.2% |
| 43.1% | 58.3% | 62.3% | |
| ALL | 44.5% | 59.3% | 63.2% |
This application has been deployed to Shinyapps environment. Due to 1 GB memory size limits, Pentagram was not included.
Memory The following table shows the number of rows and memory size of various N-Gram objects in the R environment.
| Rows | Memory (MB) | |
|---|---|---|
| Unigram | 143,691 | 9 |
| Bigram | 1,562,907 | 50 |
| Trigram | 3,343,187 | 166 |
| Quadgram | 4,057,408 | 297 |
| Pentagram | 4,093,723 | 362 |
| TOTAL | 13,200,916 | 884 |
Accuracy
Using completed words, accuracy is 11% for the top word.
Accuracy is 20% when examining the top five predicted words.
Using in-progress words, accuracy is 45% for the top word.
Accuracy is 60% when examining the top five prdicted words.
Performance
All measurements done at https://mgravi7.shinyapps.io/TypeAhead/
Initial N-Gram load time is about 17 seconds
Response time for prediction is typically under 100 milliseconds!
UI displays the time take for prediction
Data sets: Cross-validation technique is used for creation of training data set, test data set and validation data set. The percentages envsioned (from the original corpus) are 5% for training, 50% for testing and 45% for validation. Due to memory and processing constraints, testing was done on much smaller dataset.
Data Cleaning: Punctuation charcaters need to be studied more carefully. For example, period and exclamation indicate end of a sentence which is valuable information. Numbers have been removed and its impact need to be studied. Removing stopwords can increase the predictability in some cases.
Modeling: After researching various approaches, paper by Daniel Jurafsky & James H. Martin is used as the basis for building prediction model. 1-gram, 2-gram, 3-gram, 4-gram and 5-gram are the N-grams that have been built. Analysis needs to be made to see if the larger N-grams yield better results. Also, further optimizations are possible which requires more research and experimentation.
Resource Optimization: N-Grams memory usage can be reduced further by using TRIE data structures or even a simple lookup of words using a fixed lexicon. This has the side benefit of validation of misspelled words in the model.
N-Grams as a method of prediction works! Using the model to make predictions as the user is typing increases the accuracy while significantly enhancing the end user experience. Further refinements can be made to make this model even better!!!
The following code can be used to trim the N-Grams knowing our interest is limited to number of words to be predicted. The value is set to 10 now and an experiment was run to understand the effect of various MAX_NUM_PREDICTIONS (10, 5, 1). The drawback to this approach is that we can make predictions only after a word is completed by the use. If we keep the entire N-Gram, we can predictions even as the user is typing a partial word.
Given the savings in memory is not significant, we are not trimming the N-Grams. Please see the table below for reduction % of number of rows. NP stands for Number of Predictions. Red% stands for Reduction %.
| Rows | Memory (MB) | NP=10 Rows | NP=10 Red% | NP=5 Rows | NP=5 Red% | NP=1 Rows | NP=1 Red% | |
|---|---|---|---|---|---|---|---|---|
| Unigram | 143,691 | 9 | 10 | 100% | 5 | 100% | 1 | 100% |
| Bigram | 1,562,907 | 50 | 403,761 | 74% | 298,071 | 81% | 135,904 | 91% |
| Trigram | 3,343,187 | 166 | 2,485,389 | 26% | 2,214,177 | 34% | 1,489,168 | 55% |
| Quadgram | 4,057,408 | 297 | 3,878,993 | 4% | 3,752,510 | 8% | 3,198,945 | 21% |
| Pentagram | 4,093,723 | 362 | 4,077,302 | 0% | 4,055,580 | 1% | 3,878,756 | 5% |
| TOTAL | 13,200,916 | 884 | 10,845,455 | 18% | 10,320,343 | 22% | 8,702,774 | 34% |
# function to find the size of the trimmed data frame
# x is the input data frame
FindTrimmedDataFrameSize <- function(x, numPredictions)
{
curPhrase <- ''
numCurPhrasePredictions <- 0
dfNumRows <- 0
for (idx in 1:nrow(x))
{
# new cur phrase?
if (x[idx, "Phrase"] != curPhrase)
{
# start a new cur phrase
curPhrase <- x[idx, "Phrase"]
numCurPhrasePredictions <- 0
}
# are we within limits?
if (numCurPhrasePredictions < numPredictions)
{
dfNumRows <- dfNumRows + 1
numCurPhrasePredictions <- numCurPhrasePredictions + 1
}
}
return(dfNumRows)
}
# function to trim the N-Gram data frame to have results that matter
# and used. For a given Phrase, only the top numPredictions are kept
# x is the input data frame
TrimNgramDataFrame <- function(x, numPredictions)
{
# find the trimmed data frame size (faster to create fixed size data frame)
dfNumRows <- FindTrimmedDataFrameSize(x, numPredictions)
# create a new data frame for results
df <- data.frame(Phrase = character(dfNumRows), Frequency = numeric(dfNumRows),
Prediction = character(dfNumRows), stringsAsFactors = FALSE)
curPhrase <- ''
numCurPhrasePredictions <- 0
dfRowIdx <- 1
for (idx in 1:nrow(x))
{
# new cur phrase?
if (x[idx, "Phrase"] != curPhrase)
{
# start a new cur phrase
curPhrase <- x[idx, "Phrase"]
numCurPhrasePredictions <- 0
}
# are we within limits?
if (numCurPhrasePredictions < numPredictions)
{
df[dfRowIdx,] <- x[idx,]
dfRowIdx <- dfRowIdx + 1
numCurPhrasePredictions <- numCurPhrasePredictions + 1
}
}
return(df)
}
Daniel Jurafsky & James H. Martin: Speech and Language Processing. Draft of September 1, 2014.
Kailash Awati, Sensanalytics Consulting: A gentle introduction to text mining using R.