1. Loading the Data

library(quanteda)
library(quanteda.textstats)
library(data.table)
library(ggplot2)
library(stringr)
library(stringi)
library(knitr)

blogs   <- readLines("en_US/en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
news    <- readLines("en_US/en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

2. Basic Summary Statistics

stats <- data.frame(
  File           = c("Blogs", "News", "Twitter"),
  Lines          = c(length(blogs),
                     length(news),
                     length(twitter)),
  Words          = c(sum(stri_count_words(blogs)),
                     sum(stri_count_words(news)),
                     sum(stri_count_words(twitter))),
  Max_Line_Chars = c(max(nchar(blogs)),
                     max(nchar(news)),
                     max(nchar(twitter))),
  Size_MB        = round(c(object.size(blogs),
                            object.size(news),
                            object.size(twitter)) / 1e6, 1)
)

kable(stats,
      format.args = list(big.mark = ","),
      caption = "Table 1: Summary Statistics for en_US Corpus Files")
Table 1: Summary Statistics for en_US Corpus Files
File Lines Words Max_Line_Chars Size_MB
Blogs 899,288 37,546,250 40,833 267.8
News 1,010,242 34,762,395 11,384 269.8
Twitter 2,360,148 30,093,413 140 334.5

3. Sampling & Cleaning

The full corpus is too large to process efficiently in memory for EDA. We sample 5% of each file, combine them into a single corpus, and clean the tokens before analysis.

set.seed(42)
sample_pct <- 0.05

sample_text <- c(
  sample(blogs,   round(length(blogs)   * sample_pct)),
  sample(news,    round(length(news)    * sample_pct)),
  sample(twitter, round(length(twitter) * sample_pct))
)

# Free memory from full files
rm(blogs, news, twitter)
gc()
##            used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells  2672953 142.8    7761071  414.5         NA   6739866  360.0
## Vcells 12709130  97.0  171483085 1308.4      16384 214013247 1632.8
cat("Total lines in sample:", length(sample_text), "\n")
## Total lines in sample: 213483
# Build quanteda corpus and clean tokens
corp <- corpus(sample_text)

toks <- tokens(corp,
               remove_punct   = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE,
               remove_url     = TRUE) |>
        tokens_tolower()

4. Unigram Frequency Analysis

dfm1  <- dfm(toks)
freq1 <- textstat_frequency(dfm1, n = 40)

ggplot(freq1, aes(x = reorder(feature, frequency), y = frequency)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 40 Most Frequent Words (Unigrams)",
    x     = "Word",
    y     = "Frequency"
  ) +
  theme_minimal(base_size = 12)


5. Bigram (2-gram) Frequency Analysis

toks2 <- tokens_ngrams(toks, n = 2)
dfm2  <- dfm(toks2)
freq2 <- textstat_frequency(dfm2, n = 40)

ggplot(freq2, aes(x = reorder(feature, frequency), y = frequency)) +
  geom_col(fill = "darkorange") +
  coord_flip() +
  labs(
    title = "Top 40 Most Frequent Word Pairs (Bigrams)",
    x     = "Bigram",
    y     = "Frequency"
  ) +
  theme_minimal(base_size = 12)


6. Trigram (3-gram) Frequency Analysis

toks3 <- tokens_ngrams(toks, n = 3)
dfm3  <- dfm(toks3)
freq3 <- textstat_frequency(dfm3, n = 40)

ggplot(freq3, aes(x = reorder(feature, frequency), y = frequency)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(
    title = "Top 40 Most Frequent Word Triples (Trigrams)",
    x     = "Trigram",
    y     = "Frequency"
  ) +
  theme_minimal(base_size = 12)


7. Coverage Analysis — How Many Words Do You Need?

How many unique words are needed to cover X% of all word instances in the corpus?

# Sorted unigram frequencies (descending)
all_freq   <- sort(colSums(dfm1), decreasing = TRUE)
total      <- sum(all_freq)
cumulative <- cumsum(all_freq) / total

cover50 <- which(cumulative >= 0.50)[1]
cover90 <- which(cumulative >= 0.90)[1]

cat("Unique words needed to cover 50% of all instances:", cover50, "\n")
## Unique words needed to cover 50% of all instances: 145
cat("Unique words needed to cover 90% of all instances:", cover90, "\n")
## Unique words needed to cover 90% of all instances: 8227
# Plot coverage curve (limit to first 20,000 words for readability)
n_plot     <- min(length(cumulative), 20000)
cover_df   <- data.frame(
  n_words    = 1:n_plot,
  cumulative = cumulative[1:n_plot]
)

ggplot(cover_df, aes(x = n_words, y = cumulative)) +
  geom_line(color = "purple", linewidth = 1) +
  geom_hline(yintercept = c(0.5, 0.9),
             linetype = "dashed", color = "red") +
  annotate("text", x = cover50 + 200, y = 0.52,
           label = paste0("50% coverage\n@ ", cover50, " words"),
           hjust = 0, size = 3.5) +
  annotate("text", x = cover90 + 200, y = 0.88,
           label = paste0("90% coverage\n@ ", cover90, " words"),
           hjust = 0, size = 3.5) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Word Coverage Curve",
    x     = "Number of Unique Words (ranked by frequency)",
    y     = "Cumulative % of All Word Instances"
  ) +
  theme_minimal(base_size = 12)


8. Foreign Language & Out-of-Vocabulary Words

To estimate how many words come from foreign languages, we can compare the corpus vocabulary against a known English word list. Words not found in the list are either foreign, misspelled, or highly informal (slang, abbreviations).

library(quanteda)
library(words)

# Get all unique tokens from the sample
vocab <- featnames(dfm1)

english_words <- tolower(words::words$word)

# Flag tokens NOT in the English dictionary
oov        <- vocab[!vocab %in% english_words]
oov_pct    <- round(length(oov) / length(vocab) * 100, 1)

cat("Total unique tokens in sample:  ", length(vocab), "\n")
## Total unique tokens in sample:   148234
cat("Tokens not in English dictionary:", length(oov),  "\n")
## Tokens not in English dictionary: 100067
cat("OOV percentage:                  ", oov_pct, "%\n")
## OOV percentage:                   67.5 %
# Preview the most frequent OOV tokens
oov_freq <- freq1[freq1$feature %in% oov, ]
head(oov_freq, 20) |> kable(caption = "Table 2: Most Frequent Out-of-Vocabulary Tokens")
Table 2: Most Frequent Out-of-Vocabulary Tokens
feature frequency rank docfreq group
4 a 118799 4 72326 all
7 i 81935 7 47756 all

Many OOV tokens are not truly foreign — they include usernames, hashtags, contractions, abbreviations, and internet slang.

9. Building the N-gram Model

library(data.table)

# Helper: tokens_ngrams -> data.table with prefix | target | freq
make_ngram_table <- function(toks, n) {
  ng   <- tokens_ngrams(toks, n = n)
  d    <- dfm(ng)
  freq <- colSums(d)

  dt <- data.table(ngram = names(freq), freq = as.integer(freq))
  dt <- dt[freq >= 2]                                 # prune singletons
  dt[, target := sub(".*_",    "",  ngram)]           # last word
  dt[, prefix := sub("_[^_]+$", "", ngram)]           # all but last word
  dt[, ngram  := NULL]
  setkey(dt, prefix)
  dt
}

bigram_dt   <- make_ngram_table(toks, 2)
trigram_dt  <- make_ngram_table(toks, 3)
fourgram_dt <- make_ngram_table(toks, 4)

# Unigram table for final fallback — built from freq1 (already in memory)
uni_dt <- data.table(
  target = freq1$feature,
  freq   = freq1$frequency
)
setorder(uni_dt, -freq)

cat("Bigram pairs:    ", nrow(bigram_dt),   "\n")
## Bigram pairs:     371545
cat("Trigram pairs:   ", nrow(trigram_dt),  "\n")
## Trigram pairs:    345673
cat("Fourgram pairs:  ", nrow(fourgram_dt), "\n")
## Fourgram pairs:   139413

10. Handling Unseen N-grams — Stupid Backoff

When the user types a sequence of words not seen in training, we back off to a shorter context. The Stupid Backoff algorithm (Brants et al., 2007) does this with a fixed penalty of λ = 0.4 per backoff step:

\[S(w \mid \text{context}) = \begin{cases} \dfrac{f(\text{context},\, w)}{f(\text{context})} & \text{if context+w was seen} \\[8pt] 0.4 \times S(w \mid \text{shorter context}) & \text{otherwise} \end{cases}\]

The function below implements this in four lines of logic:

predict_next <- function(input, n_results = 3) {

  # Clean input the same way we cleaned training data
  words <- input |>
    tolower() |>
    str_replace_all("[^a-z ]", "") |>
    str_squish() |>
    str_split(" ") |>
    unlist()

  w  <- words                    # all cleaned words
  n  <- length(w)

  # 4-gram lookup: use last 3 words as prefix
  if (n >= 3) {
    prefix <- paste(tail(w, 3), collapse = "_")
    hits   <- fourgram_dt[prefix][order(-freq)][, target][1:n_results]
    if (!all(is.na(hits))) return(na.omit(hits))
  }

  # 3-gram lookup: use last 2 words as prefix
  if (n >= 2) {
    prefix <- paste(tail(w, 2), collapse = "_")
    hits   <- trigram_dt[prefix][order(-freq)][, target][1:n_results]
    if (!all(is.na(hits))) return(na.omit(hits))
  }

  # 2-gram lookup: use last 1 word as prefix
  if (n >= 1) {
    prefix <- tail(w, 1)
    hits   <- bigram_dt[prefix][order(-freq)][, target][1:n_results]
    if (!all(is.na(hits))) return(na.omit(hits))
  }

  # Final fallback: return most frequent unigrams
  return(uni_dt[1:n_results, target])
}

Quick Test

test_inputs <- c("I want to", "happy new", "the", "xkqzw")

results <- data.frame(
  Input      = test_inputs,
  Prediction = sapply(test_inputs, function(x) {
    paste(predict_next(x), collapse = " | ")
  })
)

kable(results, caption = "Table 3: Sample Predictions from the Backoff Model")
Table 3: Sample Predictions from the Backoff Model
Input Prediction
I want to I want to be | see | go
happy new happy new year | years | year’s
the the first | same | best
xkqzw xkqzw the | to | and