Import Corpus

Before we start building the prediction model, we will import the tokenized corpora from “blogtokenized.txt”, “twittertokenized.txt”, and “newstokenized.txt”.

#Word counts
con <- file("blogtokenized.txt", "r")
blogchar <- readLines(con)
blogword <- length(blogchar)
close(con)

con <- file("twittertokenized.txt", "r")
twitterchar <- readLines(con)
twitterword <- length(twitterchar)
close(con)

con <- file("newstokenized.txt", "r")
newschar <- readLines(con)
newsword <- length(newschar)
close(con)

N-gram Prediction Model

In this project, we will build a word prediction model that suggests the most likely next word given the previous words. This kind of model is called an n-gram model, where an n-gram is a sequence of \(n\) words.
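
For example, quanteda’s tokens_ngrams() (which we will use throughout this report) joins consecutive words with an underscore. A minimal sketch with a made-up sentence:

library(quanteda)

example <- tokens("thanks for the follow")   #hypothetical example sentence
tokens_ngrams(example, n = 2)[[1]]           #bigrams: "thanks_for" "for_the" "the_follow"
tokens_ngrams(example, n = 3)[[1]]           #trigrams: "thanks_for_the" "for_the_follow"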

Maximum Likelihood Estimate (MLE)

An n-gram model predicts \(x_i\) based on \(x_{i-(n-1)},\ldots,x_{i-1}\). The MLE assigns a probability to each word that exists in the corpus: the count of a particular n-gram divided by the count of its \((n-1)\)-word prefix.

In the n-gram model, an independence assumption (the Markov assumption) is made so that each word depends only on the previous \(n-1\) words. In probability terms, this is \(P(x_{i}\mid x_{i-(n-1)},...,x_{i-1})\). Simply put, we count up the occurrences of a particular word or combination of words and divide by the count of its context.

In equation form, the MLE can be written as \[P_{ML}(x_i \mid x_{i-(n-1)},\ldots,x_{i-1}) = \frac{c(x_{i-(n-1)},\ldots,x_{i-1},x_i)}{c(x_{i-(n-1)},\ldots,x_{i-1})}\] where \(c(\cdot)\) denotes the number of times a word sequence occurs in the corpus.
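
For example, if the trigram “i love you” occurred 50 times in the corpus and its bigram prefix “i love” occurred 200 times (hypothetical counts for illustration), the MLE would be \(P(\text{you} \mid \text{i love}) = 50/200 = 0.25\).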

However, this overestimates the probability of each observed word because no probability mass is assigned to unobserved n-grams. Therefore we will use a discounting technique to “steal” some of the probability assigned to the observed n-grams and distribute it to unobserved n-grams.

Discounting

Using discounting, some of the probability assigned to the observed n-grams will be distributed to unobserved n-grams. In this project, we will perform absolute discounting with a discount \(\gamma\) of 0.5, the value of gamma used in the code below.
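
As a minimal sketch with hypothetical counts, subtracting the discount \(\gamma\) from each observed count frees up a small amount of probability mass that can later be given to unobserved n-grams:

#hypothetical counts: three observed trigrams sharing one bigram prefix
gamma <- 0.5
trig_counts <- c(20, 10, 5)
prefix_count <- sum(trig_counts)                 #count of the shared bigram prefix
(trig_counts - gamma) / prefix_count             #discounted probabilities of the observed trigrams
1 - sum((trig_counts - gamma) / prefix_count)    #leftover probability mass for unobserved trigrams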

Katz’s back-off model

Combining the MLE estimates with discounting leads to the model we will construct, Katz’s back-off model. Katz’s back-off model estimates the probabilities of unseen n-grams by redistributing some of the probability mass from observed n-grams to unobserved ones through discounting. We will build Katz’s 3-gram (trigram) model using absolute discounting with \(\gamma = 0.5\).

Step 1. Save unigram, bigram and trigram info

First we need to generate unigrams, bigrams, and trigrams from the corpus. These n-grams will be used to construct the Katz back-off model throughout this project. The code below creates the unigram, bigram, and trigram tables with their frequencies.

library(quanteda)   #tokens(), tokens_ngrams()
library(stringr)    #str_c(), str_split()
library(plyr)       #count(), which returns a data frame with columns x and freq

#Tokenize the combined corpus once and reuse it for all three n-gram sizes
alltokens <- tokens(str_c(c(blogchar, twitterchar, newschar), collapse = " "))

unigs <- count(tokens_ngrams(alltokens, n = 1)[[1]])
unigs <- unigs[order(-unigs$freq),]
row.names(unigs) <- 1:nrow(unigs)

bigrs <- count(tokens_ngrams(alltokens, n = 2)[[1]])
bigrs <- bigrs[order(-bigrs$freq),]
row.names(bigrs) <- 1:nrow(bigrs)

trigs <- count(tokens_ngrams(alltokens, n = 3)[[1]])
trigs <- trigs[order(-trigs$freq),]
row.names(trigs) <- 1:nrow(trigs)
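
Each of the resulting tables has an x column holding the n-gram (words joined by “_”) and a freq column holding its count, sorted in descending order of frequency. For example:

head(unigs, 3)   #most frequent single words
head(trigs, 3)   #most frequent trigrams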

Step 2. Assign probability for observed trigram

In this step, we will estimate the probability of words completing an observed trigram. We first calculate the MLE and then discount it by \(\gamma = 0.5\) to get the backed-off probability of the observed trigram, \(q_{BO}\). The equation for calculating \(q_{BO}\) is \[q_{BO}(w_i \mid w_{i-2}, w_{i-1}) = \frac{c(w_{i-2},w_{i-1},w_i)-\gamma}{c(w_{i-2},w_{i-1})}\] Below is the R code for calculating \(q_{BO}\). As an example, we will later use “settle_the” as the bigram prefix to predict the third word.

#getObservedTrig returns the observed trigrams in the corpus that start with bigPre
getObservedTrig <- function(bigPre=bigPre, trigs=trigs){
  output <- data.frame(ngrams=vector(mode='character', length=0), freq=vector(mode='integer', length=0))
  trigram_index <- grep(paste0("^", bigPre,"_"), trigs$x)
  output <- trigs[trigram_index,]
  return(output)
}

getObservedTrigramProb <- function(ObservedTrig=ObservedTrig, bigrs=bigrs, bigPre=bigPre, gamma=gamma){
  if(nrow(ObservedTrig)<1) return(NULL)
  obCount <- bigrs[(bigrs$x ==bigPre),]$freq
  obTrigProbs <- (ObservedTrig$freq-gamma)/obCount
  output <- cbind(as.data.frame(ObservedTrig$x), obTrigProbs)
  colnames(output) <- c("ngram", "prob")
  row.names(output) <- 1:nrow(output)
  return(output)
}
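
A quick check of these two functions, using the “settle_the” bigram prefix that appears in the full example in Step 6 (a sketch; the rows returned depend on the corpus):

bigPre <- "settle_the"
ObservedTrig <- getObservedTrig(bigPre, trigs)        #observed trigrams starting with "settle_the"
ObservedTrigramProb <- getObservedTrigramProb(ObservedTrig, bigrs, bigPre, gamma = 0.5)
head(ObservedTrigramProb)                             #discounted probability of each observed trigram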

Step 3. Assign probability for unobserved trigram

The next task is to estimate the probability of a word completing an unobserved trigram. The back-off probability of an unobserved trigram is \[q_{BO}(w_i\mid w_{i-2}, w_{i-1}) = \alpha(w_{i-2}, w_{i-1})\frac{q_{BO}(w_i\mid w_{i-1})}{\sum_{w \in B(w_{i-2}, w_{i-1})}q_{BO}(w \mid w_{i-1})}\] where \(B(w_{i-2}, w_{i-1})\) is the set of words that do not complete an observed trigram starting with \((w_{i-2}, w_{i-1})\), \(\alpha(w_{i-2}, w_{i-1})\) is the discounted probability mass taken from the observed trigrams (Step 4), and \(q_{BO}(w_i \mid w_{i-1})\) is the backed-off bigram probability calculated in Step 3.3.

This is more complicated than handling the observed trigrams, so we will break it up into several substeps.

Step 3.1. Find words that complete unobserved trigrams

getUnobservedTrigs <- function(ObservedTrig=ObservedTrig, unigs=unigs){
  observedlast <- sapply(ObservedTrig$x, FUN= function(y) paste(tail(strsplit(as.character(y), "_")[[1]],1)))
  output <- unigs[!(unigs$x %in% observedlast),]$x
  return(output)
}

Step 3.2. Calculate \(\alpha(w_{i-1})\)
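
In parallel with the trigram-level equation in Step 4, the quantity computed by the function below is the probability mass left over at the bigram level after discounting:

\[\alpha(w_{i-1}) = 1 - \sum_{w \in A(w_{i-1})} \frac{c(w_{i-1}, w)-\gamma}{c(w_{i-1})}\]

where \(A(w_{i-1})\) is the set of words observed immediately after \(w_{i-1}\) in the corpus.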

getAlphaBigr <- function(bigPre=bigPre, bigrs=bigrs, gamma=0.5){
  #note: looks up the unigram count of w_{i-1} from the global `unigs` table
  w_i_1 <- strsplit(bigPre, "_")[[1]][2]
  w_i_1 <- unigs[unigs$x==w_i_1,]
  bigramcount <- bigrs[grep(paste0("^",w_i_1$x,"_"), bigrs$x),]
  if (nrow(bigramcount)<1) return(1)
  output <- 1- sum((bigramcount$freq-gamma)/w_i_1$freq)
  return(output)
}

Step 3.3. Calculate backed-off probabilities \(q_{BO}\) for bigrams
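
In effect, the functions below assign to each candidate word \(w_i\) whose back-off bigram \((w_{i-1}, w_i)\) is observed in the corpus the discounted probability

\[q_{BO}(w_i \mid w_{i-1}) = \frac{c(w_{i-1}, w_i)-\gamma}{c(w_{i-1})}\]

and to each candidate word whose back-off bigram is unobserved the probability

\[q_{BO}(w_i \mid w_{i-1}) = \alpha(w_{i-1})\frac{c(w_i)}{\sum_{w} c(w)}\]

where the sum in the denominator runs over the candidate words \(w\) whose bigram \((w_{i-1}, w)\) is unobserved.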

getBObigrams <- function(bigPre=bigPre, UnobservedTrigs=UnobservedTrigs){
  w_i_1 <- strsplit(bigPre, "_")[[1]][2]
  output <- paste0(w_i_1,"_" ,UnobservedTrigs)
  return(output)
}


getObsBOBigrams <- function(bigrs=bigrs, BObigrams=BObigrams){
  output <- bigrs[bigrs$x %in% BObigrams,]
  return(output)
}

getUnObsBOBigrams <- function(bigrs=bigrs, BObigrams=BObigrams, ObsBOBigrams=ObsBOBigrams){
  output <- BObigrams[!(BObigrams %in% ObsBOBigrams$x)]
  return(output)
}

getObsBOBigramsProbs <- function(bigPre=bigPre, ObsBOBigrams=ObsBOBigrams, unigs=unigs, gamma=gamma){
  w_i_1 <- strsplit(bigPre, "_")[[1]][2]
  w_i_1 <- unigs[unigs$x == w_i_1,]
  output <- (ObsBOBigrams$freq-gamma)/w_i_1$freq
  output <- data.frame(x=ObsBOBigrams$x, prob=output)
  return(output)
}



getUnObsBOBigramsProbs <- function(UnObsBOBigrams=UnObsBOBigrams, unigs=unigs, AlphaBigr=AlphaBigr){
  #get the unobserved bigram tails
  UnObsBOBigramsTails <- sapply(UnObsBOBigrams, FUN= function(y) paste(tail(strsplit(as.character(y), "_")[[1]],1)))
  UnObsBOBigramsTails <- unigs[unigs$x %in% UnObsBOBigramsTails,]
  denom <- sum(UnObsBOBigramsTails$freq)
  output <- data.frame(x=UnObsBOBigrams, prob=(AlphaBigr*UnObsBOBigramsTails$freq/denom))
  return(output)
}
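
Putting the pieces of Step 3 together for the “settle_the” example (a sketch that assumes the objects created in the Step 2 sketch above; Step 6’s predictNextWord() wraps these same calls):

UnobservedTrigs <- getUnobservedTrigs(ObservedTrig, unigs)          #words not completing an observed trigram
AlphaBigr <- getAlphaBigr(bigPre, bigrs, gamma = 0.5)               #leftover bigram probability mass
BObigrams <- getBObigrams(bigPre, UnobservedTrigs)                  #candidate back-off bigrams "the_<word>"
ObsBOBigrams <- getObsBOBigrams(bigrs, BObigrams)
UnObsBOBigrams <- getUnObsBOBigrams(bigrs, BObigrams, ObsBOBigrams)
QboBigrams <- rbind(getObsBOBigramsProbs(bigPre, ObsBOBigrams, unigs, gamma = 0.5),
                    getUnObsBOBigramsProbs(UnObsBOBigrams, unigs, AlphaBigr))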

Step 4. Calculate the discounted probability mass that will be distributed to unobserved trigrams, \(\alpha(w_{i-2}, w_{i-1})\)

\[\alpha(w_{i-2}, w_{i-1}) = 1- \sum_{w \in A(w_{i-2}, w_{i-1})} \frac{c(w_{i-2}, w_{i-1}, w)-\gamma}{c(w_{i-2}, w_{i-1})}\] where \(A(w_{i-2}, w_{i-1})\) is the set of words that complete an observed trigram starting with \((w_{i-2}, w_{i-1})\).

getAlphaTrig <- function(bigPre=bigPre, trigs=trigs, gamma=0.5){
  #note: looks up the bigram-prefix count from the global `bigrs` table
  trigscount <- trigs[grep(paste0("^",bigPre,"_"), trigs$x),]
  bigPrecount <- bigrs[bigrs$x==bigPre,]
  if (nrow(trigscount)<1) return(1)
  output <- 1- sum((trigscount$freq-gamma)/bigPrecount$freq)
  return(output)
}

Step 5. Calculate unobserved trigram probabilities \(q_{BO}(w_i \mid w_{i-2}, w_{i-1})\)

\[q_{BO}(w_i\mid w_{i-2}, w_{i-1}) = \alpha(w_{i-2}, w_{i-1})\frac{q_{BO}(w_i\mid w_{i-1})}{\sum_{w \in B(w_{i-2}, w_{i-1})}q_{BO}(w \mid w_{i-1})}\]

getUnObsTrigramProbs <- function(bigPre=bigPre, QboBigrams=QboBigrams, AlphaTrig=AlphaTrig){
  sumQboBigrams <- sum(QboBigrams$prob)
  UnObsTrigrams <- paste(str_split(bigPre, "_")[[1]][1], QboBigrams$x, sep="_")
  output <- AlphaTrig*QboBigrams$prob/sumQboBigrams
  output <- data.frame(ngram=UnObsTrigrams, prob=output)
  return(output)
}

Step 6. Select \(w_i\) with the highest \(q_{BO}(w_i \mid w_{i-2}, w_{i-1})\)

getNextWord <- function(ObservedTrigramProb=ObservedTrigramProb, UnObsTrigramProb=UnObsTrigramProb, choices){
  QboTrigrams <- rbind(ObservedTrigramProb, UnObsTrigramProb)
  QboTrigrams <- QboTrigrams[order(-QboTrigrams$prob),]
  QboTrigrams$ngram <- sapply(QboTrigrams$ngram, FUN= function(y) paste(tail(strsplit(as.character(y), "_")[[1]],1)))
  QboTrigrams <- QboTrigrams[QboTrigrams$ngram %in% choices,]
  output <- QboTrigrams[order(-QboTrigrams$prob),]
  return(output)
}
predictNextWord <- function(gamma, bigPre, choices, unigs, bigrs,trigs){
  ObservedTrig <- getObservedTrig(bigPre,trigs)
  ObservedTrigramProb <- getObservedTrigramProb(ObservedTrig, bigrs, bigPre,gamma)
  UnobservedTrigs <- getUnobservedTrigs(ObservedTrig, unigs)
  AlphaBigr <- getAlphaBigr(bigPre, bigrs, gamma)
  BObigrams <- getBObigrams(bigPre,UnobservedTrigs)
  ObsBOBigrams <- getObsBOBigrams(bigrs, BObigrams)
  UnObsBOBigrams <- getUnObsBOBigrams(bigrs=bigrs, BObigrams=BObigrams, ObsBOBigrams=ObsBOBigrams)
  ObsBOBigramsProbs <- getObsBOBigramsProbs(bigPre, ObsBOBigrams, unigs, gamma)
  UnObsBOBigramsProbs <- getUnObsBOBigramsProbs(UnObsBOBigrams=UnObsBOBigrams, unigs=unigs, AlphaBigr=AlphaBigr)
  QboBigrams <- rbind(ObsBOBigramsProbs, UnObsBOBigramsProbs)
  AlphaTrig <- getAlphaTrig(bigPre, trigs, gamma)
  UnObsTrigramProbs <- getUnObsTrigramProbs(bigPre, QboBigrams, AlphaTrig)
  output <- getNextWord(ObservedTrigramProb, UnObsTrigramProbs, choices)
  return(output)
} 

gamma <- 0.5
bigPre <- "settle_the"
choices <- c('account', 'case', 'incident', 'matter')
NextWord <- predictNextWord(gamma, bigPre, choices, unigs, bigrs,trigs)
NextWord
##         ngram         prob
## 37       case 2.116521e-03
## 442  incident 3.631901e-04
## 567    matter 2.880473e-04
## 1619  account 8.766657e-05

Assessing model performance

Now that the model is constructed, we will assess how well it performs.

Speed

We measure how fast the model runs, timing a single prediction with “i_would” as bigPre and ‘give’, ‘die’, ‘sleep’, and ‘eat’ as the candidate words to follow.

gamma <- 0.5
bigPre <- "i_would"
choices <- c('give','die', 'sleep', 'eat')

start_time <- Sys.time()
NextWord <- predictNextWord(gamma, bigPre, choices, unigs, bigrs,trigs)
end_time <- Sys.time()
round(end_time - start_time, 2)
## Time difference of 9.76 secs

You can see it takes about 9.76 seconds to complete. This is not horribly slow, but still slower than the text recommendations we see on our phones or tablets.

Accuracy

Now we will measure accuracy, i.e. the proportion of correct predictions provided by the model. We randomly select 4 words from the 100 most frequent unigrams in the test dataset, and then sample 50 random trigrams from the test dataset that end with one of those 4 words.

First, we will import the test data.

#Word counts
con <- file("blogtokenized_test.txt", "r")
blogchar_test <- readLines(con)
blogword_test <- length(blogchar_test)
close(con)

con <- file("twittertokenized_test.txt", "r")
twitterchar_test <- readLines(con)
twitterword_test <- length(twitterchar_test)
close(con)

con <- file("newstokenized_test.txt", "r")
newschar_test <- readLines(con)
newsword_test <- length(newschar_test)
close(con)

#Tokenize the combined test corpus once and build the test n-gram tables
alltokens_test <- tokens(str_c(c(blogchar_test, twitterchar_test, newschar_test), collapse = " "))

unigs_test <- count(tokens_ngrams(alltokens_test, n = 1)[[1]])
unigs_test <- unigs_test[order(-unigs_test$freq),]
row.names(unigs_test) <- 1:nrow(unigs_test)

bigrs_test <- count(tokens_ngrams(alltokens_test, n = 2)[[1]])
bigrs_test <- bigrs_test[order(-bigrs_test$freq),]
row.names(bigrs_test) <- 1:nrow(bigrs_test)

trigs_test <- count(tokens_ngrams(alltokens_test, n = 3)[[1]])
trigs_test <- trigs_test[order(-trigs_test$freq),]
row.names(trigs_test) <- 1:nrow(trigs_test)

The R code below estimates the accuracy of the Katz back-off model using the test data.

set.seed(1234)
SampleWords<-sample(unigs_test$x[1:100], 4)

TestTrigrams <- trigs_test[(sapply(trigs_test$x, FUN= function(y) paste(tail(strsplit(as.character(y), "_")[[1]],1))) %in% SampleWords),]
set.seed(1234)
SampleTrigs <- sample(TestTrigrams$x, 50)

BigPreList<- sapply(SampleTrigs, FUN= function(y) paste(strsplit(as.character(y), "_")[[1]][1], strsplit(as.character(y), "_")[[1]][2],sep='_')) 

prediction <- sapply(BigPreList, FUN = function(y) predictNextWord(gamma, y, SampleWords,unigs_test, bigrs_test,trigs_test)$ngram[1])
sum(paste0(BigPreList,'_', prediction)==SampleTrigs)/length(SampleTrigs)
## [1] 0.98

We can see that the accuracy of this model is about 98%. This is a lot better than the previous Stupid Back-off model.