The goal remains to build a next-word prediction model using a 600 MB English corpus in en_nlp.db, and deploy it via a Shiny app. We’ve completed a trigram model and now aim to explore its statistical properties to optimize the prediction algorithm.
Four SQLite databases were created, one per language; the English database is en_nlp.db, and so on.
-- The files table originally included a language column, but the database was split by language to keep file sizes manageable.
CREATE TABLE files (
file_id INTEGER PRIMARY KEY AUTOINCREMENT,
file_name TEXT, -- full file path
file_size_mb REAL
)
-- text loaded into sentences tables
CREATE TABLE sentences (
sentence_id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER,
position INTEGER,
text_content TEXT,
word_count INTEGER,
FOREIGN KEY (file_id) REFERENCES files(file_id)
)
-- ngrams table
CREATE TABLE ngrams (
prefix TEXT,
next_word TEXT,
count INTEGER,
PRIMARY KEY (prefix, next_word)
)
To refine the model, we’ll analyze word and n-gram frequencies, dictionary coverage, foreign language influence, and coverage enhancement strategies. Below are the new questions, approaches, and proposed outputs.
library(RSQLite)
library(dplyr)
library(tokenizers)
library(ggplot2)
library(hunspell) # For foreign word detection
library(progress)
# Connect to database
conn <- dbConnect(SQLite(), "en_nlp.db")
sample_sentences <- function(conn, sample_size) {
query <- "
SELECT CAST(text_content AS TEXT) AS sentence
FROM sentences
WHERE sentence_id IN (
SELECT sentence_id
FROM sentences
ORDER BY RANDOM()
LIMIT ?
)"
data <- dbGetQuery(conn, query, params = list(sample_size))
data$sentence <- as.character(data$sentence)
data$sentence[is.na(data$sentence)] <- ""
data
}
# Sample 100,000 sentences
sample_size <- 100000
sampled_sentences <- sample_sentences(conn, sample_size)
cat("Sampled", nrow(sampled_sentences), "sentences\n")
Sampled 100000 sentences
# Tokenize all sentences into words
words_list <- lapply(sampled_sentences$sentence, tokenize_words, lowercase = TRUE, strip_punct = TRUE)
words <- unlist(words_list)
# Count word frequencies
word_freq <- table(words) %>% as.data.frame() %>%
rename(word = words, freq = Freq) %>%
arrange(desc(freq)) %>%
mutate(rank = row_number())
# Plot: Log-Log Word Frequency vs. Rank (Zipf’s Law)
p1 <- ggplot(word_freq, aes(x = rank, y = freq)) +
geom_line() +
scale_x_log10("Rank (log scale)") +
scale_y_log10("Frequency (log scale)") +
ggtitle("Word Frequency Distribution (Zipf’s Law)") +
theme_minimal()
print(p1)
#ggsave("word_freq_zipf.png", p1, width = 8, height = 6)
# Table: Top 10 Frequent Words
top_words <- head(word_freq, 10) %>% select(word, freq)
print(top_words)
#write.csv(top_words, "top_words.csv", row.names = FALSE)
# Functions for bigram and trigram extraction
extract_bigrams <- function(sentence) {
words <- tokenize_words(sentence, lowercase = TRUE, strip_punct = TRUE)[[1]]
if (length(words) < 2) return(data.frame(prefix = character(), next_word = character()))
data.frame(
prefix = words[1:(length(words)-1)],
next_word = words[2:length(words)]
)
}
extract_trigrams <- function(sentence) {
words <- tokenize_words(sentence, lowercase = TRUE, strip_punct = TRUE)[[1]]
if (length(words) < 3) return(data.frame(prefix = character(), next_word = character()))
n <- length(words) - 2
data.frame(
prefix = sapply(1:n, function(i) paste(words[i], words[i+1], sep=" ")),
next_word = words[(1:n) + 2]
)
}
# Generate bigrams and trigrams with progress bar
pb <- progress_bar$new(total = nrow(sampled_sentences), format = "[:bar] :percent eta: :eta")
bigrams_list <- list()
trigrams_list <- list()
for (i in 1:nrow(sampled_sentences)) {
pb$tick()  # advance the progress bar for each processed sentence
sentence <- sampled_sentences$sentence[i]
bigrams_list[[i]] <- extract_bigrams(sentence)
trigrams_list[[i]] <- extract_trigrams(sentence)
}
# Combine and count
bigrams <- bind_rows(bigrams_list) %>%
group_by(prefix, next_word) %>%
summarise(freq = n(), .groups = "drop") %>%
arrange(desc(freq))
trigrams <- bind_rows(trigrams_list) %>%
group_by(prefix, next_word) %>%
summarise(freq = n(), .groups = "drop") %>%
arrange(desc(freq))
# Plot: Top 20 Bigrams and Trigrams Bar Plot
top_bigrams <- head(bigrams, 20) %>% mutate(n_gram = paste(prefix, next_word))
top_trigrams <- head(trigrams, 20) %>% mutate(n_gram = paste(prefix, next_word))
p2 <- ggplot(bind_rows(mutate(top_bigrams, type = "Bigram"), mutate(top_trigrams, type = "Trigram")),
aes(x = reorder(n_gram, freq), y = freq, fill = type)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(x = "N-Gram", y = "Frequency", title = "Top 20 Bigrams and Trigrams") +
facet_wrap(~type, scales = "free_y") +
theme_minimal()
print(p2)
#ggsave("ngram_freq_bar.png", p2, width = 10, height = 6)
# Table: N-Gram Frequency Summary
ngram_summary <- data.frame(
"N-Gram Type" = c("Bigram", "Trigram"),
"Unique Count" = c(nrow(bigrams), nrow(trigrams)),
"Avg Frequency" = c(mean(bigrams$freq), mean(trigrams$freq)),
"Max Frequency" = c(max(bigrams$freq), max(trigrams$freq)),
check.names = FALSE # keep the readable column labels
)
print("N-Gram Frequency Summary:")
[1] "N-Gram Frequency Summary:"
print(ngram_summary)
#write.csv(ngram_summary, "ngram_summary.csv", row.names = FALSE)
# Calculate cumulative coverage
total_instances <- sum(word_freq$freq)
word_freq <- word_freq %>%
mutate(cum_freq = cumsum(freq),
cum_percent = cum_freq / total_instances * 100)
cutoff_50 <- word_freq %>% filter(cum_percent >= 50) %>% slice(1)
cutoff_90 <- word_freq %>% filter(cum_percent >= 90) %>% slice(1)
# Plot: Cumulative Frequency Curve
p3 <- ggplot(word_freq, aes(x = rank, y = cum_percent)) +
geom_line() +
geom_vline(xintercept = cutoff_50$rank, linetype = "dashed", color = "blue") +
geom_vline(xintercept = cutoff_90$rank, linetype = "dashed", color = "red") +
labs(x = "Number of Unique Words", y = "Cumulative % of Instances",
title = "Dictionary Coverage") +
annotate("text", x = cutoff_50$rank, y = 60, label = "50%", color = "blue") +
annotate("text", x = cutoff_90$rank, y = 95, label = "90%", color = "red") +
theme_minimal()
print(p3)
#ggsave("coverage_curve.png", p3, width = 8, height = 6)
# Table: Coverage Thresholds
coverage_table <- data.frame(
Coverage = c("50%", "90%"),
"Unique Words" = c(cutoff_50$rank, cutoff_90$rank),
"Example Words" = c(cutoff_50$word, cutoff_90$word),
check.names = FALSE # keep the readable column labels
)
print(coverage_table)
Pruning words that occur fewer than 5 times seems to improve "foreign" word detection (expected range: 5%-10%). However, "foreign" is misleading here, as the top "foreign" words are mostly misspelled English words. I used the SCOWL word list to match English words, and tokenizing words into stems proved beneficial.
# Tokenize and clean with regex
words_list <- lapply(sampled_sentences$sentence, tokenize_word_stems, language = "english")
words <- unlist(words_list)
words_clean <- gsub("[[:punct:]]|[0-9]", "", words)
words_clean <- words_clean[words_clean != ""]  # drop tokens that were only punctuation or digits
word_freq <- table(words_clean) %>% as.data.frame() %>%
rename(word = words_clean, freq = Freq) %>%
arrange(desc(freq)) %>%
mutate(rank = row_number())
# Load SCOWL English word list
english_words <- readLines("scowl_words.txt", encoding = "UTF-8", warn = FALSE)
# Add minimal extras (slang, abbreviations)
extra_words <- c("dont", "wont", "cant", "gonna", "yall", "aint", "im", "ive", "id", "youre",
"its", "theyre", "wasnt", "isnt", "didnt", "thats", "heres", "theres",
"blog", "rt", "lol", "haha", "dr")
english_words_clean <- c(english_words, extra_words)
# Classify words
word_freq$word <- as.character(word_freq$word)
word_freq$is_english <- word_freq$word %in% english_words_clean
foreign_words <- word_freq %>% filter(!is_english)
# Proportion check (unfiltered)
english_count <- sum(word_freq$freq[word_freq$is_english])
foreign_count <- sum(word_freq$freq[!word_freq$is_english])
cat("English instances:", english_count, "\n")
English instances: 1306877
cat("Foreign instances:", foreign_count, "\n")
Foreign instances: 245355
cat("Foreign proportion:", foreign_count / (english_count + foreign_count) * 100, "%\n")
Foreign proportion: 15.80659 %
# Filter high-frequency "foreign" words
foreign_words_filtered <- foreign_words %>% filter(freq < 5)
# Proportion check (filtered)
english_count_filtered <- english_count + sum(foreign_words$freq[foreign_words$freq >= 5])
foreign_count_filtered <- sum(foreign_words_filtered$freq)
cat("English instances (pruned):", english_count_filtered, "\n")
English instances (pruned): 1516671
cat("Foreign instances (pruned):", foreign_count_filtered, "\n")
Foreign instances (pruned): 35561
cat("Foreign proportion (pruned):", foreign_count_filtered / (english_count_filtered + foreign_count_filtered) * 100, "%\n")
Foreign proportion (pruned): 2.290959 %
# Plot: Pie Chart (filtered)
pie_data <- data.frame(category = c("English", "Foreign"),
count = c(english_count_filtered, foreign_count_filtered))
p4 <- ggplot(pie_data, aes(x = "", y = count, fill = category)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
labs(title = "English vs. Foreign Word Instances (Filtered)") +
theme_void() +
scale_fill_manual(values = c("English" = "blue", "Foreign" = "red"))
print(p4)
# Table: Top 10 Suspected Foreign Words
top_foreign <- head(foreign_words_filtered, 10) %>% select(word, freq)
print("Top 10 Suspected Foreign Words:")
[1] "Top 10 Suspected Foreign Words:"
print(top_foreign)
Using a reduced vocabulary, in which rare words (frequency < 5) are collapsed into a single <UNK> token, we can compare coverage against the full vocabulary:
# Simulate clustering (simplified: assume rare words grouped as <UNK>)
total_instances <- sum(word_freq$freq)  # recompute for the stemmed/cleaned word frequencies
vocab_full <- word_freq %>%
group_by(word) %>%
summarise(freq = sum(freq)) %>%
arrange(desc(freq)) %>%
mutate(cum_freq = cumsum(freq), cum_percent = cum_freq / total_instances * 100)
vocab_reduced <- word_freq %>%
mutate(word = ifelse(freq < 5, "<UNK>", word)) %>%
group_by(word) %>%
summarise(freq = sum(freq)) %>%
arrange(desc(freq)) %>%
mutate(cum_freq = cumsum(freq), cum_percent = cum_freq / total_instances * 100)
# Plot: Coverage vs. Dictionary Size
p5 <- ggplot() +
geom_line(data = vocab_full, aes(x = seq_along(cum_percent), y = cum_percent, color = "Full Vocab")) +
geom_line(data = vocab_reduced, aes(x = seq_along(cum_percent), y = cum_percent, color = "Reduced Vocab")) +
labs(x = "Dictionary Size", y = "Cumulative % of Instances",
title = "Coverage: Full vs. Reduced Vocabulary") +
scale_color_manual(values = c("Full Vocab" = "blue", "Reduced Vocab" = "red")) +
theme_minimal()
print(p5)
Goal: Develop an efficient, accurate next-word prediction system informed by the exploratory insights above, and deploy it via Shiny.
Core: Trigram lookup with Add-K smoothing, returning the top 5 predictions. Backoff: Fall back to bigrams and then unigrams using frequency data. Storage: Full model in SQLite, pruned set in memory (a sketch of this predictor follows below).
UI: Text input, top-5 prediction dropdown, stats tab with frequency plots/tables.
Server: Load pruned ngrams, predict reactively, display exploratory plots (e.g., Zipf, coverage curves).
Features: Real-time prediction, toggles for smoothing and backoff options (a minimal app skeleton is sketched at the end of this section).
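As a concrete starting point, here is a minimal sketch of the planned predictor. It assumes the pruned n-grams are held in memory as the trigrams, bigrams, and word_freq tables built above (columns prefix/next_word/freq and word/freq); predict_next and its parameters are placeholder names, and the Add-K step is applied only over the candidates matching the prefix rather than the full vocabulary.
# Sketch: trigram lookup with Add-K smoothing and backoff to bigrams/unigrams
predict_next <- function(phrase, trigrams, bigrams, word_freq, k = 1, top_n = 5) {
  tokens <- tokenize_words(phrase, lowercase = TRUE, strip_punct = TRUE)[[1]]
  vocab_size <- nrow(word_freq)
  score_candidates <- function(cands) {
    # Add-K smoothed relative score over the candidates sharing the prefix
    cands$score <- (cands$freq + k) / (sum(cands$freq) + k * vocab_size)
    head(cands[order(-cands$score), c("next_word", "score")], top_n)
  }
  # 1. Trigram lookup: last two typed words form the prefix
  if (length(tokens) >= 2) {
    hits <- trigrams[trigrams$prefix == paste(tail(tokens, 2), collapse = " "), ]
    if (nrow(hits) > 0) return(score_candidates(hits))
  }
  # 2. Backoff to bigrams: last word only
  if (length(tokens) >= 1) {
    hits <- bigrams[bigrams$prefix == tail(tokens, 1), ]
    if (nrow(hits) > 0) return(score_candidates(hits))
  }
  # 3. Final backoff: most frequent unigrams
  top <- head(word_freq[order(-word_freq$freq), ], top_n)
  data.frame(next_word = top$word, score = top$freq / sum(word_freq$freq))
}
# Example call with a made-up phrase:
# predict_next("the end of", trigrams, bigrams, word_freq)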
Current State: Functional trigram model from 1,000 sentences, expandable to 20,000 (~10%), stored in ngrams table (prefix, next_word, count). Ready for exploratory analysis and prediction development.
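To make the UI/Server outline concrete, below is a minimal Shiny skeleton under the same assumptions (the pruned n-gram tables and a predict_next() like the sketch above are loaded into the app's environment); names and layout are placeholders rather than the final design.
library(shiny)
ui <- fluidPage(
  titlePanel("Next-Word Prediction"),
  tabsetPanel(
    tabPanel("Predict",
             textInput("phrase", "Type a phrase:", value = ""),
             tableOutput("predictions")),
    tabPanel("Stats",
             plotOutput("zipf_plot"))  # e.g. the Zipf or coverage plots built above
  )
)
server <- function(input, output, session) {
  output$predictions <- renderTable({
    req(nchar(input$phrase) > 0)
    predict_next(input$phrase, trigrams, bigrams, word_freq)
  })
  output$zipf_plot <- renderPlot({
    print(p1)  # reuse the word-frequency (Zipf) plot from the exploration
  })
}
shinyApp(ui, server)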