N-Gram Language Model in R

JBob
August 2025

Overview & Libraries

The following libraries are required to run the model effectively:

library(stringr)
library(dplyr)

Helper Functions

Functions for tokenizing text, extracting the last words, and removing the final word from strings.

# Lower-case the input and break it into word tokens.
#
# lines: character vector of raw text.
# Returns one flat character vector holding the tokens of every element of
# `lines`; a token is a maximal run of the letters a-z and apostrophes.
tokenizer <- function(lines) {
  # Any run of characters other than a-z or ' acts as a separator.
  pieces <- strsplit(tolower(lines), "[^a-z']+")
  words <- unlist(pieces)
  # Text that starts with a separator yields a leading "" piece; drop those.
  words[words != ""]
}

# Return the trailing `words` tokens of each string.
#
# string: character vector whose tokens are runs of a-z/' separated by
#         single spaces.
# words:  number of trailing tokens to keep; must be >= 1.
# Returns one value per input string: the substring starting at the first
# of the last `words` tokens, or NA when the string does not end in that
# many tokens.
getLastWords <- function(string, words) {
  # A value below 1 would build an invalid quantifier such as "{-1}".
  stopifnot(is.numeric(words), all(words >= 1))
  # Integer arithmetic keeps the quantifier free of "1e+05"-style formatting.
  repeats <- as.integer(words) - 1L
  pattern <- paste0("[a-z']+( [a-z']+){", repeats, "}$")
  substring(string, str_locate(string, pattern)[, "start"])
}

# Strip the final word from each string.
#
# string: character vector of space-separated words.
# Returns `string` with a trailing " <word>" (a space followed by a run of
# a-z/') removed; strings without such a suffix come back unchanged.
removeLastWord <- function(string) {
  trailing_word <- " [a-z']+$"
  sub(pattern = trailing_word, replacement = "", x = string)
}

Load, Sample, and Tokenize Data

Read 2% from each text source, combine samples, and tokenize into unigrams, bigrams, and trigrams.

# Fix the RNG state so the random sampling below is reproducible.
set.seed(123)

# Read a text file and return a random subset of its lines.
#
# path: file to read.
# pct:  fraction of the lines to keep, e.g. 0.02 for 2%.
# Returns round(n * pct) of the file's n lines, drawn without replacement
# and in random order.
read_sample <- function(path, pct) {
  all_lines <- readLines(path, warn = FALSE, encoding = "UTF-8")
  n_keep <- round(length(all_lines) * pct)
  sample(all_lines, size = n_keep)
}

# Draw a 2% sample from each corpus. The three reads and the shuffle below
# consume the RNG in this order, so the order must not change.
tweets <- read_sample(path = "/home/rstudio/en_US.twitter.txt", pct = 0.02)
news <- read_sample(path = "/home/rstudio/en_US.news.txt", pct = 0.02)
blogs <- read_sample(path = "/home/rstudio/en_US.blogs.txt", pct = 0.02)

# Pool the three samples and shuffle them into one character vector.
samplelines <- sample(c(tweets, news, blogs))

# One flat token stream for the whole sample.
# NOTE(review): because the stream is flat, the n-grams built below can pair
# the last word of one line with the first word of the next - confirm that
# this is intended.
tokens <- tokenizer(samplelines)
unigrams <- tokens

# tokens2 / tokens3 are the stream shifted left by one / two positions and
# padded with "<eos>" so that all three vectors have the same length.
tokens2 <- c(tokens[-1], "<eos>")
tokens3 <- c(tokens2[-1], "<eos>")

# Element i of each vector is the n-gram that starts at token i.
bigrams <- paste(tokens, tokens2, sep = " ")
trigrams <- paste(tokens, tokens2, tokens3, sep = " ")

Calculate Probabilities & Create Data Frames

Keep n-grams that occur more than twice, compute their MLE (relative-frequency) probabilities over the retained n-grams, and store the results in data frames.

# Turn a vector of n-grams into a table of relative frequencies.
#
# ngrams:    vector with one element per observed n-gram.
# threshold: an n-gram is kept only if it occurs strictly more than this
#            many times (default 2, i.e. at least three occurrences).
# Returns a 1-d table named by n-gram and sorted from most to least
# frequent. Each value is count / (total count of the kept n-grams), so the
# values sum to 1 over the kept n-grams, not over everything observed.
ngram_probs <- function(ngrams, threshold = 2) {
  stopifnot(
    is.numeric(threshold),
    length(threshold) == 1L,
    !is.na(threshold)
  )
  counts <- table(ngrams)
  kept <- counts[counts > threshold]
  sort(kept / sum(kept), decreasing = TRUE)
}

# Relative-frequency tables for each n-gram order, most frequent first.
unigramProbs <- ngram_probs(unigrams)
bigramProbs <- ngram_probs(bigrams)
trigramProbs <- ngram_probs(trigrams)

# Unigram lookup: one row per word.
unigramDF <- data.frame(
  Words = names(unigramProbs),
  Probability = as.numeric(unigramProbs),
  stringsAsFactors = FALSE
)

# Bigram lookup: each bigram is split into its leading word (FirstWords)
# and its final word (LastWord).
bigramDF <- local({
  grams <- names(bigramProbs)
  data.frame(
    FirstWords = removeLastWord(grams),
    LastWord = getLastWords(grams, 1),
    Probability = as.numeric(bigramProbs),
    stringsAsFactors = FALSE
  )
})

# Trigram lookup: FirstWords holds the two leading words, LastWord the third.
trigramDF <- local({
  grams <- names(trigramProbs)
  data.frame(
    FirstWords = removeLastWord(grams),
    LastWord = getLastWords(grams, 1),
    Probability = as.numeric(trigramProbs),
    stringsAsFactors = FALSE
  )
})

Prediction Function & Example

Predict the three most likely next words for the input text, backing off from trigrams to bigrams to unigrams.

# Suggest the three most likely next words for a piece of text.
#
# input: character string typed so far.
# Returns a character vector of length 3, best candidate first. Candidates
# come from the trigram table (keyed on the last two input words), then the
# bigram table (keyed on the last word), then the overall unigram ranking.
# Positions are NA only if all three tables together hold fewer than three
# distinct words.
predictor <- function(input) {
  # Normalise the input with the tokenizer used to build the tables, so
  # that case, punctuation, digits and stray whitespace cannot prevent a
  # context match.
  words <- tokenizer(input)
  words <- words[!is.na(words)]
  n <- length(words)
  prediction <- character(0)

  if (n >= 2) {
    trigram_context <- paste(words[n - 1], words[n])
    prediction <- trigramDF %>%
      filter(FirstWords == trigram_context) %>%
      arrange(desc(Probability)) %>%
      pull(LastWord)
  }

  if (length(prediction) < 3 && n >= 1) {
    bigram_context <- words[n]
    backoff <- bigramDF %>%
      filter(FirstWords == bigram_context) %>%
      arrange(desc(Probability)) %>%
      pull(LastWord)
    # De-duplicate now: otherwise words repeated across the trigram and
    # bigram stages inflate the count and wrongly skip the unigram stage.
    prediction <- unique(c(prediction, backoff))
  }

  if (length(prediction) < 3) {
    prediction <- unique(c(prediction, unigramDF$Words))
  }

  prediction[1:3]
}

# Example: ask for the top three next-word candidates after "I am going".
predictor("I am going")
[1] "to"      "on"      "through"