1) Text cleaning and tokenization
Cleans the raw text and tokenizes each line, adding <s>/</s> sentence-boundary markers.
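The summary below relies on two objects produced by this step: `corpus`, a character vector with one line of raw text per element, and `tokens`, a list of per-line token vectors wrapped in `<s>`/`</s>` boundary markers. A minimal sketch of that cleaning/tokenization is shown here for context; the exact rules (which characters are dropped, how whitespace is handled) are assumptions, and the original chunk may differ.
# Illustrative sketch only: lowercase, keep letters/apostrophes, collapse
# whitespace, split on spaces, and wrap each line in <s>/</s> boundary tokens.
clean_line <- function(x) {
  x <- tolower(x)
  x <- gsub("[^a-z' ]", " ", x)       # assumed rule: drop digits/punctuation
  gsub("\\s+", " ", trimws(x))        # collapse repeated whitespace
}
tokens <- lapply(corpus, function(line) {
  words <- strsplit(clean_line(line), " ", fixed = TRUE)[[1]]
  c("<s>", words, "</s>")
})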
Summary Statistics
# Basic summary report for corpus
# Number of lines
n_lines <- length(corpus)
# Characters per line and total
chars_per_line <- nchar(corpus, type = "chars", allowNA = TRUE)
total_chars <- sum(chars_per_line, na.rm = TRUE)
# Token stats (uses existing `tokens`); exclude <s>/</s> boundary tokens
tokens_no_bound <- lapply(tokens, function(t) t[!(t %in% c("<s>", "</s>"))])
tokens_per_line <- vapply(tokens_no_bound, length, integer(1))
tokens_flat <- unlist(tokens_no_bound, use.names = FALSE)
total_tokens <- length(tokens_flat)
avg_tokens_per_line <- mean(tokens_per_line)
vocab <- unique(tokens_flat)
vocab_size <- length(vocab)
# Top 20 most frequent tokens
token_freq <- sort(table(tokens_flat), decreasing = TRUE)
top_tokens <- head(token_freq, 20)
cat("=== Corpus Summary ===\n")
## === Corpus Summary ===
cat(sprintf("Lines: %d\n", n_lines))
## Lines: 4
cat(sprintf("Total characters: %d\n", total_chars))
## Total characters: 167
cat(sprintf("Avg characters per line: %.2f\n", mean(chars_per_line, na.rm = TRUE)))
## Avg characters per line: 41.75
cat(sprintf("Total tokens (excluding boundaries): %d\n", total_tokens))
## Total tokens (excluding boundaries): 36
cat(sprintf("Avg tokens per line: %.2f\n", avg_tokens_per_line))
## Avg tokens per line: 9.00
cat(sprintf("Vocabulary size: %d\n\n", vocab_size))
## Vocabulary size: 20
cat("Top 20 tokens:\n")
## Top 20 tokens:
print(data.frame(token = names(top_tokens), count = as.integer(top_tokens), row.names = NULL))
## token count
## 1 the 6
## 2 dog 3
## 3 fox 3
## 4 quick 3
## 5 a 2
## 6 and 2
## 7 jumps 2
## 8 lazy 2
## 9 over 2
## 10 became 1
## 11 blue 1
## 12 brown 1
## 13 cat 1
## 14 fast 1
## 15 friends 1
## 16 high 1
## 17 rug 1
## 18 runs 1
## 19 sleeps 1
## 20 warm 1
# Plots for the summary statistics
library(ggplot2)
# Plot: distribution of tokens per line (boundary tokens excluded)
ggplot(data.frame(tokens_per_line = tokens_per_line), aes(x = tokens_per_line)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  labs(title = "Distribution of Tokens per Line", x = "Tokens per Line", y = "Frequency")

# Plot: Top 20 most frequent tokens
top_tokens_df <- data.frame(token = names(top_tokens), count = as.integer(top_tokens))
ggplot(top_tokens_df, aes(x = reorder(token, -count), y = count)) +
  geom_col(fill = "darkorange") +
  labs(title = "Top 20 Most Frequent Tokens", x = "Token", y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

3) Convert counts to conditional probabilities
Converts n-gram counts to conditional probabilities, using add-1 (Laplace) smoothing.
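The code below assumes the count tables `unigram_counts`, `bigram_counts`, and `trigram_counts` from the earlier counting step (not shown here). Purely as an illustration of what those inputs look like, a sketch that builds them from `tokens` might be:
# Hypothetical sketch of the assumed count tables; the document's own counting
# step may differ (e.g., in how boundary tokens are handled; they are kept here).
make_ngrams <- function(toks, n) {
  unlist(lapply(toks, function(t) {
    if (length(t) < n) return(character(0))
    vapply(seq_len(length(t) - n + 1),
           function(i) paste(t[i:(i + n - 1)], collapse = " "),
           character(1))
  }), use.names = FALSE)
}
unigram_counts <- table(make_ngrams(tokens, 1))
bigram_counts  <- table(make_ngrams(tokens, 2))
trigram_counts <- table(make_ngrams(tokens, 3))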
# -------------------------------
# 3. Convert counts to conditional probabilities
# P(next | history) with add-1 (Laplace) smoothing
# -------------------------------
# Extract vocabulary size from unigrams (for smoothing)
vocab <- unique(names(unigram_counts))
V <- length(vocab)
# Build conditional probability tables from n-gram counts
# For bigrams: P(w2 | w1)
build_bigram_probs <- function(bigram_counts, unigram_counts, V) {
  # Split each bigram "w1 w2" into history (w1) and next word (w2)
  parts <- strsplit(names(bigram_counts), " ")
  history <- vapply(parts, function(p) p[1], character(1))
  nextw <- vapply(parts, function(p) p[2], character(1))
  # Add-1 smoothing: P(w2 | w1) = (count(w1, w2) + 1) / (count(w1) + V)
  num <- as.numeric(bigram_counts) + 1
  denom <- unigram_counts[history]
  denom[is.na(denom)] <- 0
  denom <- denom + V
  probs <- num / denom
  # Return one row per observed bigram
  data.frame(history = history, next_word = nextw, prob = probs, stringsAsFactors = FALSE)
}
# For trigrams: P(w3 | w1 w2)
build_trigram_probs <- function(trigram_counts, bigram_counts, V) {
  # Split each trigram "w1 w2 w3" into history (w1 w2) and next word (w3)
  parts <- strsplit(names(trigram_counts), " ")
  history <- vapply(parts, function(p) paste(p[1], p[2]), character(1))
  nextw <- vapply(parts, function(p) p[3], character(1))
  # Add-1 smoothing: P(w3 | w1 w2) = (count(w1, w2, w3) + 1) / (count(w1, w2) + V)
  num <- as.numeric(trigram_counts) + 1
  denom <- bigram_counts[history]
  denom[is.na(denom)] <- 0
  denom <- denom + V
  probs <- num / denom
  data.frame(history = history, next_word = nextw, prob = probs, stringsAsFactors = FALSE)
}
bigram_probs <- build_bigram_probs(bigram_counts, unigram_counts, V)
trigram_probs <- build_trigram_probs(trigram_counts, bigram_counts, V)
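A quick, illustrative way to sanity-check the tables (not part of the original pipeline) is to pull out the smoothed distribution for one history and confirm that the frequent continuations rank highest:
# Illustrative check: smoothed next-word distribution for the history "the"
the_dist <- bigram_probs[bigram_probs$history == "the", ]
head(the_dist[order(-the_dist$prob), c("next_word", "prob")], 5)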
5) Demo
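The demo calls `predict_next()`, which is defined in the earlier prediction step (not shown here). As a hedged illustration only, a backoff-style lookup over `trigram_probs` and `bigram_probs` consistent with the outputs below might look like this sketch; the actual function may differ.
# Hypothetical backoff sketch (the document's predict_next() may differ):
# use trigram continuations for a two-word history, otherwise (or if the
# trigram history was never seen) fall back to bigram continuations.
predict_next_sketch <- function(history, k = 5) {
  words <- strsplit(trimws(tolower(history)), "\\s+")[[1]]
  cand <- if (length(words) >= 2) {
    trigram_probs[trigram_probs$history == paste(tail(words, 2), collapse = " "), ]
  } else {
    bigram_probs[bigram_probs$history == tail(words, 1), ]
  }
  if (nrow(cand) == 0 && length(words) >= 2) {
    cand <- bigram_probs[bigram_probs$history == tail(words, 1), ]
  }
  head(cand$next_word[order(-cand$prob)], k)
}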
# -------------------------------
# 5. Demo
# -------------------------------
cat("Top predictions for 'the quick':\n")
## Top predictions for 'the quick':
print(predict_next("the quick", k = 5))
## [1] "blue" "brown"
cat("\nTop predictions for 'the':\n")
##
## Top predictions for 'the':
print(predict_next("the", k = 5))
## [1] "quick" "dog" "fox" "lazy" "warm"
cat("\nTop predictions for 'fox jumps':\n")
##
## Top predictions for 'fox jumps':
print(predict_next("fox jumps", k = 5))
## [1] "over"