# Load the actual data files
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", warn = FALSE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", warn = FALSE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
# Calculate actual statistics
calc_stats <- function(text_lines, filename) {
  n_lines <- length(text_lines)
  n_chars <- sum(nchar(text_lines))
  n_words <- sum(sapply(strsplit(text_lines, "\\s+"), length))
  size_mb <- round(as.numeric(object.size(text_lines)) / (1024^2), 1)  # in-memory size
  return(c(Lines = n_lines, Words = n_words, Chars = n_chars, Size_MB = size_mb))
}
summary_df <- data.frame(
  File = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
  rbind(
    calc_stats(blogs, "blogs"),
    calc_stats(news, "news"),
    calc_stats(twitter, "twitter")
  )
)
# Format numbers for readability
summary_df$Lines <- format(summary_df$Lines, big.mark = ",")
summary_df$Words <- format(summary_df$Words, big.mark = ",")
summary_df$Chars <- format(summary_df$Chars, big.mark = ",")
summary_df
## File Lines Words Chars Size_MB
## 1 en_US.blogs.txt 899,288 37,334,131 206,824,505 255.4
## 2 en_US.news.txt 1,010,206 34,371,031 203,214,543 257.3
## 3 en_US.twitter.txt 2,360,148 30,373,583 162,096,241 319.0
What changed: Instead of hardcoded numbers, this actually reads your files and calculates the real statistics. This ensures accuracy and lets reviewers verify your numbers.
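Note that Size_MB above is the in-memory object size, which differs from the size on disk. As a quick optional cross-check (assuming the three files sit in the working directory), base R's file.size() reports the on-disk size in bytes:
# On-disk size in MB, for comparison with the in-memory Size_MB column
round(file.size(c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")) / 1024^2, 1)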
set.seed(42) # For reproducibility
# Take actual random samples
blogs_sample <- sample(blogs, 3)
news_sample <- sample(news, 3)
twitter_sample <- sample(twitter, 3)
cat("## Sample Blogs\n")
## ## Sample Blogs
for(i in 1:3) cat(i, ": ", blogs_sample[i], "\n", sep="")
## 1: his taste on your tongue
## 2: What's new from the old is the pale palette that has replaced the moonlight and magnolias style of overwrought red bordello rooms once associated with New Orleans. Pale rooms have had a foothold here for the past ten years in the Uptown homes that once dripped with ornate window treatments, Tester beds, and Victorian parlor sets.
## 3: Be yourself. Especially do not feign affection.
cat("\n## Sample News\n")
##
## ## Sample News
for(i in 1:3) cat(i, ": ", news_sample[i], "\n", sep="")
## 1: Now, off to Island Peak.
## 2: -- Assemble stuffing or dressing, wrap tightly and refrigerate.
## 3: "When you are reaching a half-million and up for salary, is that appropriate for a non-profit?" said Miniutti. "Some put a stake in the ground and say they shouldn't be earning more than the president."
cat("\n## Sample Twitter\n")
##
## ## Sample Twitter
for(i in 1:3) cat(i, ": ", twitter_sample[i], "\n", sep="")
## 1: the joy of small town life cuz there ain't much to do lol
## 2: Stick to baseball Brian Anderson, you're a terrible bball announcer.
## 3: are you sure you didn't look into a full size mirror? Haha
What changed: Real samples from your actual data files. This will show encoding issues, special characters, URLs, hashtags - all the messy reality you’ll need to handle.
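As a rough, illustrative check of how common some of these artifacts are, the snippet below counts a few patterns in the first 10,000 Twitter lines (tw_head and the regexes here are exploratory heuristics, not a cleaning pipeline):
# Heuristic artifact counts on the first 10,000 Twitter lines
tw_head <- head(twitter, 10000)
c(
  urls      = sum(grepl("https?://|www\\.", tw_head)),
  hashtags  = sum(grepl("#\\w+", tw_head)),
  mentions  = sum(grepl("@\\w+", tw_head)),
  non_ascii = sum(grepl("[^\\x20-\\x7E]", tw_head, perl = TRUE))
)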
library(ggplot2)
# Sample 1000 lines from each dataset for visualization
set.seed(123)
sample_size <- 1000
blogs_samp <- sample(blogs, min(sample_size, length(blogs)))
news_samp <- sample(news, min(sample_size, length(news)))
twitter_samp <- sample(twitter, min(sample_size, length(twitter)))
# Calculate actual line lengths
line_lengths <- data.frame(
  Dataset = c(
    rep("Blogs", length(blogs_samp)),
    rep("News", length(news_samp)),
    rep("Twitter", length(twitter_samp))
  ),
  Characters = c(
    nchar(blogs_samp),
    nchar(news_samp),
    nchar(twitter_samp)
  )
)
ggplot(line_lengths, aes(x=Characters, fill=Dataset)) +
  geom_histogram(bins=30, alpha=0.6, position="identity") +
  facet_wrap(~Dataset, scales = "free_y") +
  labs(title="Line Length Distribution (Actual Data)",
       x="Characters per Line",
       y="Frequency") +
  theme_minimal() +
  theme(legend.position = "none")
# Calculate actual words per line
words_per_line <- data.frame(
  Dataset = c(
    rep("Blogs", length(blogs_samp)),
    rep("News", length(news_samp)),
    rep("Twitter", length(twitter_samp))
  ),
  Words = c(
    sapply(strsplit(blogs_samp, "\\s+"), length),
    sapply(strsplit(news_samp, "\\s+"), length),
    sapply(strsplit(twitter_samp, "\\s+"), length)
  )
)
ggplot(words_per_line, aes(x=Words, fill=Dataset)) +
  geom_histogram(bins=30, alpha=0.6, position="identity") +
  facet_wrap(~Dataset, scales = "free_y") +
  labs(title="Words per Line Distribution (Actual Data)",
       x="Words per Line",
       y="Frequency") +
  theme_minimal() +
  theme(legend.position = "none")
# Summary statistics
cat("\n## Summary Statistics:\n")
##
## ## Summary Statistics:
aggregate(Words ~ Dataset, data = words_per_line,
FUN = function(x) c(Mean = mean(x), Median = median(x), SD = sd(x)))
## Dataset Words.Mean Words.Median Words.SD
## 1 Blogs 41.248000 27.000000 46.200725
## 2 News 34.337000 33.000000 21.862649
## 3 Twitter 12.902000 12.000000 6.916374
What changed: Real calculations from sampled data. You’ll see the actual distributions, which might surprise you. Added scales = "free_y" because Twitter has far more short lines than the other sources.
# =========================================
# ngram_model_build.R
# SwiftKey Project — N-gram Model Build
# =========================================
library(quanteda)
library(quanteda.textstats)
library(dplyr)
# -------------------------
# Load data
# -------------------------
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", warn = FALSE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", warn = FALSE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
all_text <- c(blogs, news, twitter)
# -------------------------
# Sample proportionally (50k lines)
# -------------------------
set.seed(123)
n_total <- 50000
n_blogs <- round(length(blogs)/length(all_text) * n_total)
n_news <- round(length(news)/length(all_text) * n_total)
n_twitter <- n_total - n_blogs - n_news
sample_text <- c(
  sample(blogs, n_blogs),
  sample(news, n_news),
  sample(twitter, n_twitter)
)
# -------------------------
# Create corpus and clean text
# -------------------------
corpus <- corpus(sample_text)
# Basic tokenization (KEEP stopwords for all n-grams)
tokens_clean <- tokens(
  corpus,
  what = "word",
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE
) %>%
  tokens_tolower() %>%
  tokens_remove("")
# Load profanity list (create your own or download from GitHub)
# profanity <- readLines("profanity_list.txt", warn = FALSE)
# tokens_clean <- tokens_remove(tokens_clean, profanity)
# -------------------------
# Build N-grams (consistent approach)
# -------------------------
# Unigrams
dfm_uni <- dfm(tokens_clean)
freq_uni <- textstat_frequency(dfm_uni)
freq_uni <- freq_uni[freq_uni$frequency > 2, ]
saveRDS(freq_uni, "unigrams.rds")
# Bigrams
tokens_bi <- tokens_ngrams(tokens_clean, n = 2, concatenator = " ")
dfm_bi <- dfm(tokens_bi)
freq_bi <- textstat_frequency(dfm_bi)
freq_bi <- freq_bi[freq_bi$frequency > 2, ]
saveRDS(freq_bi, "bigrams.rds")
# Trigrams
tokens_tri <- tokens_ngrams(tokens_clean, n = 3, concatenator = " ")
dfm_tri <- dfm(tokens_tri)
freq_tri <- textstat_frequency(dfm_tri)
freq_tri <- freq_tri[freq_tri$frequency > 2, ]
saveRDS(freq_tri, "trigrams.rds")
# Fourgrams (for better context)
tokens_four <- tokens_ngrams(tokens_clean, n = 4, concatenator = " ")
dfm_four <- dfm(tokens_four)
freq_four <- textstat_frequency(dfm_four)
freq_four <- freq_four[freq_four$frequency > 2, ]
saveRDS(freq_four, "fourgrams.rds")
# -------------------------
# Print verification
# -------------------------
cat("\n=== MODEL STATISTICS ===\n")
cat("Unigrams:", nrow(freq_uni), "\n")
cat("Bigrams:", nrow(freq_bi), "\n")
cat("Trigrams:", nrow(freq_tri), "\n")
cat("Fourgrams:", nrow(freq_four), "\n")
cat("\n=== Top 20 Unigrams ===\n")
print(head(freq_uni, 20))
cat("\n=== Top 20 Bigrams ===\n")
print(head(freq_bi, 20))
cat("\n=== Top 20 Trigrams ===\n")
print(head(freq_tri, 20))
Key changes:
1. Consistent stopword handling: kept stopwords in ALL n-grams so backoff works correctly.
2. Added concatenator = " ": uses a space instead of an underscore for easier splitting (illustrated below).
3. Added 4-grams: gives better predictions for longer context.
4. Added URL removal: Twitter data has lots of URLs.
5. Added a profanity filter placeholder: important for public-facing apps.
6. Added model statistics: shows the size of each n-gram table.
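To illustrate change 2: with a space concatenator, each stored feature splits directly into its context and the word to predict (the trigram string below is just an example):
# Split a stored trigram into context + next word
parts     <- strsplit("thanks for the", " ", fixed = TRUE)[[1]]
context   <- paste(head(parts, -1), collapse = " ")   # "thanks for"
next_word <- tail(parts, 1)                           # "the"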
library(quanteda)
library(quanteda.textstats)
# -------------------------
# Load pre-computed n-gram models
# -------------------------
unigrams <- readRDS("unigrams.rds")
bigrams <- readRDS("bigrams.rds")
trigrams <- readRDS("trigrams.rds")
fourgrams <- readRDS("fourgrams.rds")
# -------------------------
# Prediction function with Stupid Backoff
# -------------------------
predict_next_word <- function(text, n = 3) {
  # Handle empty input
  if (is.null(text) || nchar(trimws(text)) == 0) {
    return(head(unigrams$feature, n))
  }
  # Clean input text
  text <- tolower(trimws(text))
  words <- unlist(strsplit(text, "\\s+"))
  words <- words[words != ""]  # Remove empty strings
  if (length(words) == 0) {
    return(head(unigrams$feature, n))
  }
  # -------------------------
  # Try Fourgrams (last 3 words)
  # -------------------------
  if (length(words) >= 3) {
    context <- paste(tail(words, 3), collapse = " ")
    pattern <- paste0("^", gsub("([.|()\\^{}+$*?])", "\\\\\\1", context), " ")
    matches <- fourgrams[grepl(pattern, fourgrams$feature), ]
    if (nrow(matches) > 0) {
      # Extract just the last word (fixed = TRUE: the context is a literal prefix)
      matches$next_word <- sub(paste0(context, " "), "", matches$feature, fixed = TRUE)
      matches <- matches[order(-matches$frequency), ]
      return(head(matches$next_word, n))
    }
  }
  # -------------------------
  # Try Trigrams (last 2 words)
  # -------------------------
  if (length(words) >= 2) {
    context <- paste(tail(words, 2), collapse = " ")
    pattern <- paste0("^", gsub("([.|()\\^{}+$*?])", "\\\\\\1", context), " ")
    matches <- trigrams[grepl(pattern, trigrams$feature), ]
    if (nrow(matches) > 0) {
      matches$next_word <- sub(paste0(context, " "), "", matches$feature, fixed = TRUE)
      matches <- matches[order(-matches$frequency), ]
      return(head(matches$next_word, n))
    }
  }
  # -------------------------
  # Try Bigrams (last 1 word)
  # -------------------------
  if (length(words) >= 1) {
    context <- tail(words, 1)
    pattern <- paste0("^", gsub("([.|()\\^{}+$*?])", "\\\\\\1", context), " ")
    matches <- bigrams[grepl(pattern, bigrams$feature), ]
    if (nrow(matches) > 0) {
      matches$next_word <- sub(paste0(context, " "), "", matches$feature, fixed = TRUE)
      matches <- matches[order(-matches$frequency), ]
      return(head(matches$next_word, n))
    }
  }
  # -------------------------
  # Final fallback: Most common unigrams
  # -------------------------
  return(head(unigrams$feature[order(-unigrams$frequency)], n))
}
# -------------------------
# Test the function
# -------------------------
cat("\n=== PREDICTION EXAMPLES ===\n\n")
test_phrases <- c(
  "I would like to",
  "The president of the",
  "How are you",
  "Looking forward to",
  "Thank you for"
)
for (phrase in test_phrases) {
  predictions <- predict_next_word(phrase, n = 5)
  cat("Input: '", phrase, "'\n", sep = "")
  cat("Predictions:", paste(predictions, collapse = ", "), "\n\n")
}
Key changes:
1. Fixed pattern matching: now correctly escapes special regex characters.
2. Added error handling: handles empty input, whitespace, etc.
3. Added 4-gram support: better predictions with more context.
4. Proper word extraction: uses the space separator to extract the next word.
5. Added test examples: shows the function actually works.
6. Better variable names: context and next_word are clearer.
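The function above backs off level by level and ranks candidates by raw frequency. Canonical Stupid Backoff (Brants et al., 2007) instead scores candidates by relative frequency and multiplies lower-order scores by a fixed factor, commonly 0.4. Below is a minimal sketch of that scoring, assuming the same four frequency tables; the helper names and the relative-frequency approximation over the pruned tables are illustrative, not part of the code above.
# Score continuations of a given context at one n-gram order.
# lambda is the accumulated backoff penalty for that order.
score_candidates <- function(context, table, lambda) {
  prefix  <- paste0(context, " ")
  matches <- table[startsWith(table$feature, prefix), ]
  if (nrow(matches) == 0) return(NULL)
  data.frame(
    word  = substring(matches$feature, nchar(prefix) + 1),
    # Relative frequency within the (pruned) table approximates f(context w) / f(context)
    score = lambda * matches$frequency / sum(matches$frequency),
    stringsAsFactors = FALSE
  )
}

predict_stupid_backoff <- function(text, n = 3, lambda = 0.4) {
  words <- unlist(strsplit(tolower(trimws(text)), "\\s+"))
  words <- words[nzchar(words)]
  cands <- rbind(
    if (length(words) >= 3) score_candidates(paste(tail(words, 3), collapse = " "), fourgrams, 1),
    if (length(words) >= 2) score_candidates(paste(tail(words, 2), collapse = " "), trigrams, lambda),
    if (length(words) >= 1) score_candidates(tail(words, 1), bigrams, lambda^2)
  )
  if (is.null(cands) || nrow(cands) == 0) return(head(unigrams$feature, n))
  cands <- cands[!duplicated(cands$word), ]   # keep the highest-order score for each word
  head(cands$word[order(-cands$score)], n)
}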
✓ Loaded and explored all three datasets
✓ Calculated summary statistics and visualized distributions
✓ Designed n-gram model architecture with consistent preprocessing
✓ Implemented Stupid Backoff prediction algorithm
⚙ Building full n-gram frequency tables on the complete dataset
⚙ Testing prediction accuracy on a held-out validation set
⚙ Profiling memory usage and prediction speed
□ Implement Kneser-Ney smoothing for better handling of unseen n-grams
□ Add profanity filter and sanitization
□ Optimize model size by pruning low-frequency n-grams (see the sketch below)
□ Build Shiny app interface with autocomplete widget
□ Deploy app and conduct user testing
What changed: Realistic breakdown of what’s done vs. what’s next, with actual technical considerations.
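One concrete way to shrink the model further, beyond the frequency > 2 cutoff already applied, is to keep only the top few continuations per context, since that is all the prediction function ever consults. A sketch using dplyr, assuming the freq_* tables from the model-build script (prune_ngrams is an illustrative helper, not part of the code above):
library(dplyr)
# Keep only the k most frequent continuations for each context
prune_ngrams <- function(freq_table, k = 5) {
  freq_table %>%
    mutate(context = sub(" \\S+$", "", feature)) %>%   # drop the last word to get the context
    group_by(context) %>%
    slice_max(frequency, n = k, with_ties = FALSE) %>%
    ungroup() %>%
    select(feature, frequency)
}
# Example (bigrams and up; unigrams have no context to group on):
# freq_bi  <- prune_ngrams(freq_bi,  k = 5)
# freq_tri <- prune_ngrams(freq_tri, k = 5)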
# -------------------------
# Model Evaluation (run after building model)
# -------------------------
# Build a test set (sampled independently with a different seed; for a strictly
# held-out set, exclude the lines used in the training sample)
set.seed(456)
test_size <- 5000
test_text <- c(
  sample(blogs, round(test_size * 0.3)),
  sample(news, round(test_size * 0.2)),
  sample(twitter, round(test_size * 0.5))
)
# Evaluate prediction accuracy
evaluate_model <- function(test_sentences, k = 3) {
  correct <- 0
  total <- 0
  for (sentence in test_sentences) {
    words <- unlist(strsplit(tolower(sentence), "\\s+"))
    # Need at least 2 words (1 for context, 1 for prediction)
    if (length(words) < 2) next
    # Test each word prediction
    for (i in 2:length(words)) {
      context <- paste(words[1:(i-1)], collapse = " ")
      actual_word <- words[i]
      predictions <- predict_next_word(context, n = k)
      if (actual_word %in% predictions) {
        correct <- correct + 1
      }
      total <- total + 1
    }
  }
  accuracy <- correct / total
  return(list(
    accuracy = accuracy,
    correct = correct,
    total = total,
    coverage = accuracy * 100
  ))
}
# Run evaluation
cat("\nEvaluating model on test set...\n")
results_top1 <- evaluate_model(head(test_text, 100), k = 1)
results_top3 <- evaluate_model(head(test_text, 100), k = 3)
results_top5 <- evaluate_model(head(test_text, 100), k = 5)
cat("\n=== MODEL PERFORMANCE ===\n")
cat(sprintf("Top-1 Accuracy: %.2f%%\n", results_top1$coverage))
cat(sprintf("Top-3 Accuracy: %.2f%%\n", results_top3$coverage))
cat(sprintf("Top-5 Accuracy: %.2f%%\n", results_top5$coverage))
What this adds: Actual performance metrics so you can report how well your model works.
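If you also want numbers for the in-progress profiling step (prediction speed and memory footprint), a quick sketch using the objects defined above; the 100-phrase sample is illustrative, and the results will vary with sample size and pruning:
# Average prediction latency over 100 test phrases
timing <- system.time(
  for (phrase in head(test_text, 100)) predict_next_word(phrase, n = 3)
)
cat(sprintf("Avg prediction time: %.1f ms\n", 1000 * timing[["elapsed"]] / 100))
# Total in-memory size of the four n-gram tables
model_mb <- sum(sapply(list(unigrams, bigrams, trigrams, fourgrams),
                       function(x) as.numeric(object.size(x)))) / 1024^2
cat(sprintf("Total model size in memory: %.1f MB\n", model_mb))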