James S
August 2025
The following libraries are required to effectively run the model:
library(stringr)
library(dplyr)
Functions for tokenizing text, extracting the last words, and removing the final word from strings.
#' Split raw text into lowercase word tokens.
#'
#' Text is lower-cased, then cut at every run of characters that are not
#' `a`-`z` or an apostrophe. Tokens from all input lines are returned as one
#' flat vector, in input order.
#'
#' @param lines Character vector of text lines.
#' @return Character vector of non-empty tokens.
tokenizer <- function(lines) {
  pieces <- strsplit(tolower(lines), "[^a-z']+")
  words <- unlist(pieces)
  # A separator at the start of a line leaves an empty leading piece; drop it.
  keep <- words != ""
  words[keep]
}
#' Extract the trailing `words` words from each string.
#'
#' A word is a run of lowercase letters and apostrophes. Words must be
#' separated by single spaces and the last word must end the string.
#'
#' @param string Character vector of lower-cased, space-separated text.
#' @param words Number of trailing words to extract; a single number >= 1.
#' @return Character vector the same length as `string`, holding the last
#'   `words` words of each element, or `NA` where the element does not end in
#'   that many words.
getLastWords <- function(string, words) {
  # A count below 1 would build a malformed quantifier such as "{-1}".
  stopifnot(
    is.numeric(words), length(words) == 1L, !is.na(words), words >= 1
  )
  pattern <- paste0("[a-z']+( [a-z']+){", words - 1, "}$")
  start <- regexpr(pattern, string, perl = TRUE)
  found <- !is.na(start) & start > 0
  result <- rep(NA_character_, length(string))
  # The pattern is anchored at the end, so the match runs from `start` to the
  # end of the string.
  result[found] <- substring(string[found], start[found])
  result
}
#' Drop the final word (and the space before it) from each string.
#'
#' Strings that do not end in a space followed by a run of lowercase letters
#' or apostrophes are returned unchanged.
#'
#' @param string Character vector of space-separated words.
#' @return Character vector the same length as `string`.
removeLastWord <- function(string) {
  trailing_word <- " [a-z']+$"
  sub(pattern = trailing_word, replacement = "", x = string)
}
Read 2% from each text source, combine samples, and tokenize into unigrams, bigrams, and trigrams.
# Fix the RNG seed so the random line samples drawn below are reproducible.
set.seed(123)

#' Read a text file and return a random subset of its lines.
#'
#' @param path Path to the text file.
#' @param pct Fraction of lines to keep; the count is
#'   `round(number_of_lines * pct)`.
#' @return Character vector of the sampled lines, in random order.
read_sample <- function(path, pct) {
  all_lines <- readLines(con = path, warn = FALSE, encoding = "UTF-8")
  n_total <- length(all_lines)
  n_keep <- round(n_total * pct)
  all_lines[sample.int(n_total, n_keep)]
}
# Draw a 2% line sample from each corpus. The draws are made in this order
# (Twitter, news, blogs) and each one consumes random numbers.
tweets <- read_sample(path = "/home/rstudio/en_US.twitter.txt", pct = 0.02)
news <- read_sample(path = "/home/rstudio/en_US.news.txt", pct = 0.02)
blogs <- read_sample(path = "/home/rstudio/en_US.blogs.txt", pct = 0.02)

# Pool the three samples and shuffle them into one set of lines.
samplelines <- sample(c(tweets, news, blogs))

# One flat token stream for the whole sample.
tokens <- tokenizer(samplelines)

# Copies of the stream shifted left by one and by two positions, padded with
# "<eos>" so all three vectors keep the same length.
tokens2 <- c(tail(tokens, -1), "<eos>")
tokens3 <- c(tail(tokens2, -1), "<eos>")

# Pasting the aligned streams position by position yields the n-grams.
unigrams <- tokens
bigrams <- paste(tokens, tokens2, sep = " ")
trigrams <- paste(tokens, tokens2, tokens3, sep = " ")
Filter n-grams with frequency > 2, compute MLE probabilities, and store results in data frames.
#' Relative frequencies of the n-grams that occur more than twice.
#'
#' Counts each distinct n-gram, discards those seen two times or fewer, and
#' divides the remaining counts by their total so they sum to 1.
#'
#' @param ngrams Character vector of n-grams, one element per occurrence.
#' @return A one-dimensional table named by n-gram, sorted from most to least
#'   frequent.
ngram_probs <- function(ngrams) {
  counts <- table(ngrams)
  frequent <- counts[counts > 2]
  sort(prop.table(frequent), decreasing = TRUE)
}
# Unigrams have no preceding context, so the table is just word/probability.
unigramProbs <- ngram_probs(unigrams)
unigramDF <- data.frame(
  Words = names(unigramProbs),
  Probability = as.numeric(unigramProbs),
  stringsAsFactors = FALSE
)

# Split each n-gram into its leading words (the context) and its final word,
# keeping the probability alongside.
context_table <- function(probs) {
  grams <- names(probs)
  data.frame(
    FirstWords = removeLastWord(grams),
    LastWord = getLastWords(grams, 1),
    Probability = as.numeric(probs),
    stringsAsFactors = FALSE
  )
}

bigramProbs <- ngram_probs(bigrams)
bigramDF <- context_table(bigramProbs)

trigramProbs <- ngram_probs(trigrams)
trigramDF <- context_table(trigramProbs)
Predict the next three likely words based on input text using trigram, bigram, and unigram back-off.
#' Suggest the three most likely next words for a piece of text.
#'
#' Looks up the last two words of `input` in `trigramDF`; if that yields fewer
#' than three candidates, adds matches for the last word from `bigramDF`; if
#' there are still fewer than three, appends the words of `unigramDF`.
#'
#' The input is normalised with `tokenizer()`, the same function used to build
#' the n-grams, so capital letters, punctuation, digits and extra or trailing
#' spaces do not prevent the context from matching.
#'
#' @param input A single character string.
#' @return Character vector of length 3 with distinct suggestions, best first.
#'   Positions that cannot be filled are `NA`.
predictor <- function(input) {
  stopifnot(is.character(input), length(input) == 1L)

  words <- tokenizer(input)
  words <- words[!is.na(words)]
  n <- length(words)
  prediction <- character(0)

  if (n >= 2) {
    trigram_context <- paste(tail(words, 2), collapse = " ")
    prediction <- trigramDF %>%
      filter(FirstWords == trigram_context) %>%
      arrange(desc(Probability)) %>%
      pull(LastWord)
  }

  if (length(prediction) < 3 && n >= 1) {
    bigram_context <- tail(words, 1)
    from_bigrams <- bigramDF %>%
      filter(FirstWords == bigram_context) %>%
      arrange(desc(Probability)) %>%
      pull(LastWord)
    prediction <- c(prediction, from_bigrams)
  }

  if (length(prediction) < 3) {
    prediction <- c(prediction, unigramDF$Words)
  }

  # Indexing with 1:3 keeps the result at exactly three elements.
  unique(prediction)[1:3]
}
# Example: ask the model for its three next-word suggestions after the
# phrase "I am going".
predictor("I am going")
[1] "to" "on" "through"