Load libraries

0) Setting up the corpus using en_US.twitter.txt

# # Read 1000 lines from the "en_US.twitter.txt" file
# con <- file("data/final/en_US/en_US.twitter.txt", "r")
# corpus <- readLines(con, 1000)
# close(con)

0) Or use a small sample corpus
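If the full Twitter file is not available, a small in-memory sample can stand in. The snippet below is an illustrative placeholder only; the actual sample corpus behind the statistics further down is not reproduced in this report.

# Illustrative placeholder corpus (hypothetical sentences), not the exact sample
# used to produce the summary numbers below
corpus <- c(
    "The quick brown fox jumps over the lazy dog.",
    "A quick fox and a lazy dog became friends.",
    "The dog sleeps on the warm rug.",
    "The cat runs fast and jumps high over the blue rug."
)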

1) Text cleaning and tokenization

Cleans each line and tokenizes it into a list of per-line word vectors (`tokens`), wrapping each line in <s>/</s> sentence-boundary markers that later sections rely on.
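The tokenization chunk itself is not shown in this report. The sketch below is a minimal reconstruction of that step, assuming lowercasing, removal of non-letter characters, whitespace splitting, and <s>/</s> boundary markers; the helper name `clean_tokenize` is an assumption, not the original code.

# Minimal sketch of the cleaning/tokenization step (assumed, not the original chunk)
clean_tokenize <- function(line) {
    line <- tolower(line)
    line <- gsub("[^a-z\\s]", " ", line)  # keep only letters and whitespace
    line <- gsub("\\s+", " ", line)       # collapse runs of whitespace
    line <- trimws(line)
    words <- unlist(strsplit(line, " "))
    words <- words[words != ""]
    c("<s>", words, "</s>")               # wrap each line in boundary tokens
}
tokens <- lapply(corpus, clean_tokenize)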

Summary Statistics

# Basic summary report for corpus

# Number of lines
n_lines <- length(corpus)

# Characters per line and total
chars_per_line <- nchar(corpus, type = "chars", allowNA = TRUE)
total_chars <- sum(chars_per_line, na.rm = TRUE)

# Token stats (uses the existing `tokens` list)
# Exclude the <s>/</s> boundary tokens from all token statistics
tokens_no_bound <- lapply(tokens, function(t) t[!(t %in% c("<s>", "</s>"))])
tokens_per_line <- vapply(tokens_no_bound, length, integer(1))
tokens_flat <- unlist(tokens_no_bound, use.names = FALSE)

total_tokens <- length(tokens_flat)
avg_tokens_per_line <- mean(tokens_per_line)
vocab <- unique(tokens_flat)
vocab_size <- length(vocab)

# Top 20 most frequent tokens
token_freq <- sort(table(tokens_flat), decreasing = TRUE)
top_tokens <- head(token_freq, 20)

cat("=== Corpus Summary ===\n")
## === Corpus Summary ===
cat(sprintf("Lines: %d\n", n_lines))
## Lines: 4
cat(sprintf("Total characters: %d\n", total_chars))
## Total characters: 167
cat(sprintf("Avg characters per line: %.2f\n", mean(chars_per_line, na.rm = TRUE)))
## Avg characters per line: 41.75
cat(sprintf("Total tokens (excluding boundaries): %d\n", total_tokens))
## Total tokens (excluding boundaries): 36
cat(sprintf("Avg tokens per line: %.2f\n", avg_tokens_per_line))
## Avg tokens per line: 9.00
cat(sprintf("Vocabulary size: %d\n\n", vocab_size))
## Vocabulary size: 20
cat("Top 20 tokens:\n")
## Top 20 tokens:
print(data.frame(token = names(top_tokens), count = as.integer(top_tokens), row.names = NULL))
##      token count
## 1      the     6
## 2      dog     3
## 3      fox     3
## 4    quick     3
## 5        a     2
## 6      and     2
## 7    jumps     2
## 8     lazy     2
## 9     over     2
## 10  became     1
## 11    blue     1
## 12   brown     1
## 13     cat     1
## 14    fast     1
## 15 friends     1
## 16    high     1
## 17     rug     1
## 18    runs     1
## 19  sleeps     1
## 20    warm     1
# Plots for the summary statistics
library(ggplot2)

# Plot: Distribution of tokens per line
ggplot(data.frame(tokens_per_line = tokens_per_line), aes(x = tokens_per_line)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  labs(title = "Distribution of Tokens per Line", x = "Tokens per Line", y = "Frequency")

# Plot: Top 20 most frequent tokens
top_tokens_df <- data.frame(token = names(top_tokens), count = as.integer(top_tokens))
ggplot(top_tokens_df, aes(x = reorder(token, -count), y = count)) +
  geom_col(fill = "darkorange") +
  labs(title = "Top 20 Most Frequent Tokens", x = "Token", y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2) Build n-gram counts

Builds unigram, bigram, and trigram counts from the tokenized corpus

# -------------------------------
# 2. Build n-gram counts
# -------------------------------
# Helper: make n-grams from a vector of tokens
make_ngrams <- function(tokens, n) {
    if (length(tokens) < n) return(character(0))
    vapply(seq_len(length(tokens) - n + 1), function(i) {
        paste(tokens[i:(i + n - 1)], collapse = " ")
    }, character(1))
}

# Build counts for given n (e.g., 1,2,3)
build_ngram_counts <- function(tokens_list, n) {
    all_ngrams <- unlist(lapply(tokens_list, make_ngrams, n = n), use.names = FALSE)
    # Tabulate once, sort by frequency, and return a named integer vector
    counts <- sort(table(all_ngrams), decreasing = TRUE)
    setNames(as.integer(counts), names(counts))
}

# Example: build uni-, bi-, tri-gram counts
unigram_counts <- build_ngram_counts(tokens, 1)
bigram_counts  <- build_ngram_counts(tokens, 2)
trigram_counts <- build_ngram_counts(tokens, 3)
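A quick illustration of what `make_ngrams` returns for a single tokenized line (the token vector here is hypothetical):

# Trigrams from one hypothetical token vector
make_ngrams(c("<s>", "the", "quick", "brown", "fox", "</s>"), 3)
# -> "<s> the quick"  "the quick brown"  "quick brown fox"  "brown fox </s>"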

3) Convert counts to conditional probabilities

Converts counts to probabilities (with Laplace smoothing)

# -------------------------------
# 3. Convert counts to conditional probabilities
#    P(next | history) with add-1 (Laplace) smoothing
# -------------------------------
# Vocabulary size for smoothing, taken from the unigram table.
# Note: unlike `vocab` in the summary above, this includes the <s>/</s> boundary tokens.
vocab <- names(unigram_counts)
V <- length(vocab)


# Build conditional probability tables from n-gram counts
# For bigrams: P(w2 | w1)
build_bigram_probs <- function(bigram_counts, unigram_counts, V) {
    # Split bigrams into history and next
    parts <- strsplit(names(bigram_counts), " ")
    history <- vapply(parts, function(p) p[1], character(1))
    nextw   <- vapply(parts, function(p) p[2], character(1))
    # Add-1 smoothing: (count(w1,w2)+1)/(count(w1)+V)
    num <- as.numeric(bigram_counts) + 1
    denom <- unigram_counts[history]
    denom[is.na(denom)] <- 0
    denom <- denom + V
    probs <- num / denom
    # Build data.frame
    data.frame(history = history, next_word = nextw, prob = probs, stringsAsFactors = FALSE)
}


# For trigrams: P(w3 | w1 w2)
build_trigram_probs <- function(trigram_counts, bigram_counts, V) {
    parts <- strsplit(names(trigram_counts), " ")
    history <- vapply(parts, function(p) paste(p[1], p[2]), character(1))
    nextw   <- vapply(parts, function(p) p[3], character(1))
    num <- as.numeric(trigram_counts) + 1
    denom <- bigram_counts[history]
    denom[is.na(denom)] <- 0
    denom <- denom + V
    probs <- num / denom
    data.frame(history = history, next_word = nextw, prob = probs, stringsAsFactors = FALSE)
}

bigram_probs  <- build_bigram_probs(bigram_counts, unigram_counts, V)
trigram_probs <- build_trigram_probs(trigram_counts, bigram_counts, V)
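As a sanity check, the smoothed probabilities for one history can be inspected (illustrative; exact values depend on the corpus). Under add-1 smoothing the rows for a history sum to less than 1 whenever some vocabulary words never follow it, because the remaining mass is implicitly reserved for those unseen continuations.

# Inspect P(next | "the"); `the_rows` is just a temporary name for this check
the_rows <- subset(bigram_probs, history == "the")
the_rows[order(-the_rows$prob), ]
sum(the_rows$prob)  # typically < 1: the rest of the mass covers unseen next words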

4) Predict next word

Predicts the next word given a prefix, backing off from trigram to bigram to unigram matches

# -------------------------------
# 4. Predict next word
#    Backoff: try trigram -> bigram -> unigram
# -------------------------------
predict_next <- function(prefix, k = 5) {
    # Clean and tokenize prefix (no boundary tokens)
    prefix <- tolower(prefix)
    prefix <- gsub("[^a-z\\s]", " ", prefix)
    prefix <- gsub("\\s+", " ", prefix)
    prefix <- trimws(prefix)
    words <- unlist(strsplit(prefix, " "))
    words <- words[words != ""]
    
    # Try trigram history if we have at least 2 words
    if (length(words) >= 2) {
        hist3 <- paste(words[length(words)-1], words[length(words)])
        cand3 <- subset(trigram_probs, history == hist3)
        if (nrow(cand3) > 0) {
            cand3 <- cand3[order(-cand3$prob), ]
            return(head(cand3$next_word, k))
        }
    }
    # Fall back to bigram using last word
    if (length(words) >= 1) {
        hist2 <- words[length(words)]
        cand2 <- subset(bigram_probs, history == hist2)
        if (nrow(cand2) > 0) {
            cand2 <- cand2[order(-cand2$prob), ]
            return(head(cand2$next_word, k))
        }
    }
    # Final backoff: most frequent unigrams (excluding boundary tokens)
    uni <- data.frame(token = names(unigram_counts), count = as.numeric(unigram_counts))
    uni <- subset(uni, !(token %in% c("<s>", "</s>")))
    uni <- uni[order(-uni$count), ]
    head(uni$token, k)
}
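The demo below does not exercise the final unigram backoff, so here is a hedged example: when neither the trigram nor the bigram history has been seen, the function returns the overall most frequent non-boundary words (which words those are depends on the corpus).

# Unseen history -> falls back to the most frequent unigrams
predict_next("completely unseen words", k = 3)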

5) Demo

# -------------------------------
# 5. Demo
# -------------------------------
cat("Top predictions for 'the quick':\n")
## Top predictions for 'the quick':
print(predict_next("the quick", k = 5))
## [1] "blue"  "brown"
cat("\nTop predictions for 'the':\n")
## 
## Top predictions for 'the':
print(predict_next("the", k = 5))
## [1] "quick" "dog"   "fox"   "lazy"  "warm"
cat("\nTop predictions for 'fox jumps':\n")
## 
## Top predictions for 'fox jumps':
print(predict_next("fox jumps", k = 5))
## [1] "over"