library(quanteda)
library(quanteda.textstats)
library(data.table)
library(ggplot2)
library(stringr)
library(stringi)
library(knitr)
blogs <- readLines("en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)stats <- data.frame(
File = c("Blogs", "News", "Twitter"),
Lines = c(length(blogs),
length(news),
length(twitter)),
Words = c(sum(stri_count_words(blogs)),
sum(stri_count_words(news)),
sum(stri_count_words(twitter))),
Max_Line_Chars = c(max(nchar(blogs)),
max(nchar(news)),
max(nchar(twitter))),
Size_MB = round(c(object.size(blogs),
object.size(news),
object.size(twitter)) / 1e6, 1)
)
kable(stats,
format.args = list(big.mark = ","),
caption = "Table 1: Summary Statistics for en_US Corpus Files")| File | Lines | Words | Max_Line_Chars | Size_MB |
|---|---|---|---|---|
| Blogs | 899,288 | 37,546,250 | 40,833 | 267.8 |
| News | 1,010,242 | 34,762,395 | 11,384 | 269.8 |
| 2,360,148 | 30,093,413 | 140 | 334.5 |
The full corpus is too large to process efficiently in memory for EDA. We sample 5% of each file, combine them into a single corpus, and clean the tokens before analysis.
set.seed(42)
sample_pct <- 0.05
sample_text <- c(
sample(blogs, round(length(blogs) * sample_pct)),
sample(news, round(length(news) * sample_pct)),
sample(twitter, round(length(twitter) * sample_pct))
)
# Free memory from full files
rm(blogs, news, twitter)
gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2672953 142.8 7761071 414.5 NA 6739866 360.0
## Vcells 12709130 97.0 171483085 1308.4 16384 214013247 1632.8
## Total lines in sample: 213483
# Build quanteda corpus and clean tokens
corp <- corpus(sample_text)
toks <- tokens(corp,
remove_punct = TRUE,
remove_numbers = TRUE,
remove_symbols = TRUE,
remove_url = TRUE) |>
tokens_tolower()dfm1 <- dfm(toks)
freq1 <- textstat_frequency(dfm1, n = 40)
ggplot(freq1, aes(x = reorder(feature, frequency), y = frequency)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(
title = "Top 40 Most Frequent Words (Unigrams)",
x = "Word",
y = "Frequency"
) +
theme_minimal(base_size = 12)toks2 <- tokens_ngrams(toks, n = 2)
dfm2 <- dfm(toks2)
freq2 <- textstat_frequency(dfm2, n = 40)
ggplot(freq2, aes(x = reorder(feature, frequency), y = frequency)) +
geom_col(fill = "darkorange") +
coord_flip() +
labs(
title = "Top 40 Most Frequent Word Pairs (Bigrams)",
x = "Bigram",
y = "Frequency"
) +
theme_minimal(base_size = 12)toks3 <- tokens_ngrams(toks, n = 3)
dfm3 <- dfm(toks3)
freq3 <- textstat_frequency(dfm3, n = 40)
ggplot(freq3, aes(x = reorder(feature, frequency), y = frequency)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(
title = "Top 40 Most Frequent Word Triples (Trigrams)",
x = "Trigram",
y = "Frequency"
) +
theme_minimal(base_size = 12)How many unique words are needed to cover X% of all word instances in the corpus?
# Sorted unigram frequencies (descending)
all_freq <- sort(colSums(dfm1), decreasing = TRUE)
total <- sum(all_freq)
cumulative <- cumsum(all_freq) / total
cover50 <- which(cumulative >= 0.50)[1]
cover90 <- which(cumulative >= 0.90)[1]
cat("Unique words needed to cover 50% of all instances:", cover50, "\n")## Unique words needed to cover 50% of all instances: 145
## Unique words needed to cover 90% of all instances: 8227
# Plot coverage curve (limit to first 20,000 words for readability)
n_plot <- min(length(cumulative), 20000)
cover_df <- data.frame(
n_words = 1:n_plot,
cumulative = cumulative[1:n_plot]
)
ggplot(cover_df, aes(x = n_words, y = cumulative)) +
geom_line(color = "purple", linewidth = 1) +
geom_hline(yintercept = c(0.5, 0.9),
linetype = "dashed", color = "red") +
annotate("text", x = cover50 + 200, y = 0.52,
label = paste0("50% coverage\n@ ", cover50, " words"),
hjust = 0, size = 3.5) +
annotate("text", x = cover90 + 200, y = 0.88,
label = paste0("90% coverage\n@ ", cover90, " words"),
hjust = 0, size = 3.5) +
scale_y_continuous(labels = scales::percent) +
labs(
title = "Word Coverage Curve",
x = "Number of Unique Words (ranked by frequency)",
y = "Cumulative % of All Word Instances"
) +
theme_minimal(base_size = 12)To estimate how many words come from foreign languages, we can compare the corpus vocabulary against a known English word list. Words not found in the list are either foreign, misspelled, or highly informal (slang, abbreviations).
library(quanteda)
library(words)
# Get all unique tokens from the sample
vocab <- featnames(dfm1)
english_words <- tolower(words::words$word)
# Flag tokens NOT in the English dictionary
oov <- vocab[!vocab %in% english_words]
oov_pct <- round(length(oov) / length(vocab) * 100, 1)
cat("Total unique tokens in sample: ", length(vocab), "\n")## Total unique tokens in sample: 148234
## Tokens not in English dictionary: 100067
## OOV percentage: 67.5 %
# Preview the most frequent OOV tokens
oov_freq <- freq1[freq1$feature %in% oov, ]
head(oov_freq, 20) |> kable(caption = "Table 2: Most Frequent Out-of-Vocabulary Tokens")| feature | frequency | rank | docfreq | group | |
|---|---|---|---|---|---|
| 4 | a | 118799 | 4 | 72326 | all |
| 7 | i | 81935 | 7 | 47756 | all |
Many OOV tokens are not truly foreign — they include usernames, hashtags, contractions, abbreviations, and internet slang.
library(data.table)
# Helper: tokens_ngrams -> data.table with prefix | target | freq
make_ngram_table <- function(toks, n) {
ng <- tokens_ngrams(toks, n = n)
d <- dfm(ng)
freq <- colSums(d)
dt <- data.table(ngram = names(freq), freq = as.integer(freq))
dt <- dt[freq >= 2] # prune singletons
dt[, target := sub(".*_", "", ngram)] # last word
dt[, prefix := sub("_[^_]+$", "", ngram)] # all but last word
dt[, ngram := NULL]
setkey(dt, prefix)
dt
}
bigram_dt <- make_ngram_table(toks, 2)
trigram_dt <- make_ngram_table(toks, 3)
fourgram_dt <- make_ngram_table(toks, 4)
# Unigram table for final fallback — built from freq1 (already in memory)
uni_dt <- data.table(
target = freq1$feature,
freq = freq1$frequency
)
setorder(uni_dt, -freq)
cat("Bigram pairs: ", nrow(bigram_dt), "\n")## Bigram pairs: 371545
## Trigram pairs: 345673
## Fourgram pairs: 139413
When the user types a sequence of words not seen in training, we back off to a shorter context. The Stupid Backoff algorithm (Brants et al., 2007) does this with a fixed penalty of λ = 0.4 per backoff step:
\[S(w \mid \text{context}) = \begin{cases} \dfrac{f(\text{context},\, w)}{f(\text{context})} & \text{if context+w was seen} \\[8pt] 0.4 \times S(w \mid \text{shorter context}) & \text{otherwise} \end{cases}\]
The function below implements this in four lines of logic:
predict_next <- function(input, n_results = 3) {
# Clean input the same way we cleaned training data
words <- input |>
tolower() |>
str_replace_all("[^a-z ]", "") |>
str_squish() |>
str_split(" ") |>
unlist()
w <- words # all cleaned words
n <- length(w)
# 4-gram lookup: use last 3 words as prefix
if (n >= 3) {
prefix <- paste(tail(w, 3), collapse = "_")
hits <- fourgram_dt[prefix][order(-freq)][, target][1:n_results]
if (!all(is.na(hits))) return(na.omit(hits))
}
# 3-gram lookup: use last 2 words as prefix
if (n >= 2) {
prefix <- paste(tail(w, 2), collapse = "_")
hits <- trigram_dt[prefix][order(-freq)][, target][1:n_results]
if (!all(is.na(hits))) return(na.omit(hits))
}
# 2-gram lookup: use last 1 word as prefix
if (n >= 1) {
prefix <- tail(w, 1)
hits <- bigram_dt[prefix][order(-freq)][, target][1:n_results]
if (!all(is.na(hits))) return(na.omit(hits))
}
# Final fallback: return most frequent unigrams
return(uni_dt[1:n_results, target])
}test_inputs <- c("I want to", "happy new", "the", "xkqzw")
results <- data.frame(
Input = test_inputs,
Prediction = sapply(test_inputs, function(x) {
paste(predict_next(x), collapse = " | ")
})
)
kable(results, caption = "Table 3: Sample Predictions from the Backoff Model")| Input | Prediction | |
|---|---|---|
| I want to | I want to | be | see | go |
| happy new | happy new | year | years | year’s |
| the | the | first | same | best |
| xkqzw | xkqzw | the | to | and |