# Load required packages
library(stringi)             # word counts
library(knitr)               # kable tables
library(dplyr)               # data manipulation and %>%
library(quanteda)            # corpus, tokens, dfm
library(quanteda.textstats)  # textstat_frequency()
library(ggplot2)             # plots

# Define file paths
files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
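# Optional: if the raw files are not yet present, they could be fetched first.
# The URL below is the SwiftKey archive distributed with the course materials
# (an assumption; verify it, and the in-zip paths, before running).
if (!all(file.exists(files))) {
  zip_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(zip_url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip", files = paste0("final/en_US/", files), junkpaths = TRUE)
}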
# Load data with error handling
if (all(file.exists(files))) {
  blogs   <- readLines("en_US.blogs.txt", encoding = "UTF-8", warn = FALSE)
  news    <- readLines("en_US.news.txt", encoding = "UTF-8", warn = FALSE)
  twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
} else {
  stop("Data files not found. Please ensure files are in the working directory.")
}
# Function to calculate summary statistics for a single file
calc_stats <- function(text_lines, filename) {
  n_lines <- length(text_lines)
  n_words <- sum(stri_count_words(text_lines))
  n_chars <- sum(nchar(text_lines))
  size_mb <- round(as.numeric(object.size(text_lines)) / (1024^2), 1)  # drop the object_size class so kable prints a plain number
  return(data.frame(File = filename, Lines = n_lines, Words = n_words,
                    Chars = n_chars, Size_MB = size_mb))
}
summary_df <- rbind(
  calc_stats(blogs, "Blogs"),
  calc_stats(news, "News"),
  calc_stats(twitter, "Twitter")
)
kable(summary_df, caption = "Summary Statistics of Input Data", format.args = list(big.mark = ","))
| File    | Lines     | Words      | Chars       | Size_MB |
|---------|-----------|------------|-------------|---------|
| Blogs   | 899,288   | 37,546,806 | 206,824,505 | 255.4   |
| News    | 1,010,206 | 34,761,151 | 203,214,543 | 257.3   |
| Twitter | 2,360,148 | 30,096,690 | 162,096,241 | 319.0   |
#3 Data Sampling and Preprocessing
set.seed(123)
sample_pct <- 0.05  # sample 5% of each source

# Draw a random sample of lines from each source
sample_text <- c(
  sample(blogs, floor(length(blogs) * sample_pct)),
  sample(news, floor(length(news) * sample_pct)),
  sample(twitter, floor(length(twitter) * sample_pct))
)
# Free up memory immediately after sampling
rm(blogs, news, twitter); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2643732 141.2 8064720 430.8 6784801 362.4
## Vcells 12647931 96.5 91663959 699.4 114558574 874.1
# Clean and Tokenize
tokens_clean <- tokens(
  corpus(sample_text),
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE
) %>%
  tokens_tolower()
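As a quick sanity check, the size of the cleaned sample can be inspected before building the n-gram tables (the exact counts depend on the random sample, so no output is shown here):

# Number of sampled documents and total token count after cleaning
ndoc(tokens_clean)
sum(ntoken(tokens_clean))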
#4 Building N-Gram Models
get_freq_table <- function(tokens, n) {
  if (n > 1) {
    toks <- tokens_ngrams(tokens, n = n, concatenator = " ")
  } else {
    toks <- tokens
  }
  dfm_obj <- dfm(toks)
  freq <- textstat_frequency(dfm_obj) %>%
    select(feature, frequency) %>%
    filter(frequency > 1)  # remove singletons to save memory
  if (n > 1) {
    # Split each n-gram into its context (first n-1 words) and last word for fast lookup
    freq$context <- sub(" [^ ]+$", "", freq$feature)
    freq$prediction <- sub("^.* ", "", freq$feature)
  }
  return(freq)
}
unigrams <- get_freq_table(tokens_clean, 1)
bigrams <- get_freq_table(tokens_clean, 2)
trigrams <- get_freq_table(tokens_clean, 3)
fourgrams <- get_freq_table(tokens_clean, 4)
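The prediction step below repeatedly filters these tables on the `context` column. For larger samples it can be worth converting them to keyed `data.table` objects so that context lookups are indexed rather than scanned; the snippet below is an optional optimization sketch (it assumes the `data.table` package is available) and is not required by the prediction function that follows.

library(data.table)
# Keyed tables make repeated context lookups much faster
bigrams_dt   <- as.data.table(bigrams);   setkey(bigrams_dt, context)
trigrams_dt  <- as.data.table(trigrams);  setkey(trigrams_dt, context)
fourgrams_dt <- as.data.table(fourgrams); setkey(fourgrams_dt, context)
# Example lookup: all observed continuations of the context "one of the"
fourgrams_dt["one of the", nomatch = 0L]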
#5 Prediction Logic: Stupid Backoff
predict_next_word <- function(input_text, n_results = 3) {
  # Standardize input cleaning to match the training tokenization
  input_tokens <- tokens(tolower(input_text), remove_punct = TRUE) %>%
    as.character()
  len <- length(input_tokens)

  # Step 1: 4-gram lookup
  if (len >= 3) {
    ctx <- paste(tail(input_tokens, 3), collapse = " ")
    res <- fourgrams %>% filter(context == ctx) %>% head(n_results)
    if (nrow(res) >= n_results) return(res$prediction)
  }

  # Step 2: 3-gram lookup (backoff)
  if (len >= 2) {
    ctx <- paste(tail(input_tokens, 2), collapse = " ")
    res <- trigrams %>% filter(context == ctx) %>% head(n_results)
    if (nrow(res) > 0) {
      preds <- res$prediction
      if (length(preds) < n_results) {
        # Recursive call on a shorter context for additional candidates
        preds <- unique(c(preds, predict_next_word(tail(input_tokens, 1), n_results)))
      }
      return(head(preds, n_results))  # head() avoids padding with NA when fewer candidates exist
    }
  }

  # Step 3: 2-gram lookup (backoff)
  if (len >= 1) {
    ctx <- tail(input_tokens, 1)
    res <- bigrams %>% filter(context == ctx) %>% head(n_results)
    if (nrow(res) > 0) return(head(res$prediction, n_results))
  }

  # Step 4: unigram default (most frequent words overall)
  return(head(unigrams$feature, n_results))
}
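The function above returns the most frequent continuations at the highest order that matches, without scoring candidates. In the original Stupid Backoff formulation, lower-order candidates are instead kept and given a relative-frequency score multiplied by 0.4 per backoff step. The sketch below illustrates that scoring on the tables built in section 4; `score_candidates` is a hypothetical helper, "one of the" is just an example context, and the relative frequencies are normalised over the observed (non-singleton) continuations, which only approximates the true context count.

# Sketch: Stupid Backoff-style scoring over one frequency table.
# lambda = 1 for the highest order, then 0.4, 0.4^2, ... for each backoff step.
score_candidates <- function(tbl, ctx, lambda = 1) {
  hits <- tbl %>% filter(context == ctx)
  if (nrow(hits) == 0) return(NULL)
  data.frame(prediction = hits$prediction,
             score = lambda * hits$frequency / sum(hits$frequency))
}

# Example: merge 4-gram and discounted 3-gram candidates for the context "one of the"
cand <- bind_rows(
  score_candidates(fourgrams, "one of the", lambda = 1),
  score_candidates(trigrams,  "of the",     lambda = 0.4)
)
cand %>%
  group_by(prediction) %>%
  summarise(score = max(score), .groups = "drop") %>%
  arrange(desc(score)) %>%
  head(3)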
#6 Testing and Visualizations
ggplot(head(bigrams, 15), aes(x = reorder(feature, frequency), y = frequency)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Most Frequent Bigrams", x = "Bigram", y = "Frequency") +
  theme_minimal()
#7 Model Prediction Test
test_phrases <- c("How are", "I want to", "The end of", "See you")
results <- lapply(test_phrases, function(p) {
  preds <- predict_next_word(p, 3)
  data.frame(Input = p, Predictions = paste(preds, collapse = ", "))
})
kable(bind_rows(results))
| Input | Predictions |
|---|---|
| How are | you, u, ya |
| I want to | be, do, go |
| The end of | the, this, my |
| See you | there, in, at |