This project implements a text prediction system using n-gram language models with backoff smoothing. It processes text corpora, builds frequency tables for unigrams, bigrams, and trigrams, and predicts the next word based on input text.
# ===============================
# Load required packages
# ===============================
library(stringr) # string manipulation: cleaning, splitting
library(dplyr) # data manipulation
library(tidyr) # splitting n-grams
library(tidytext) # tokenization (unigrams, bigrams, trigrams)
library(data.table) # fast data manipulation and memory efficiency
library(knitr) # for nice table output in R Markdown
library(ggplot2) # for visualizations
Explanation:

- stringr: string manipulation functions for cleaning and processing text
- dplyr: data manipulation operations (filtering, arranging, summarizing)
- tidyr: reshaping data, in particular separating n-grams into their component words
- tidytext: specialized package for text mining and tokenization into n-grams
- data.table: high-performance data manipulation for large datasets
- knitr: formatted table output in R Markdown
- ggplot2: visualizations
# ===============================
# Function: Read and preprocess corpus (with optional text input)
# ===============================
read_and_clean_corpus <- function(file_path, text_input = NULL) {
  if (!is.null(text_input)) {
    # Use text_input if provided
    corpus <- text_input
  } else {
    # Otherwise read from file
    # Open file connection
    con <- file(file_path, "r")
    # Initialize empty vector
    corpus <- character()
    # Read file in chunks to avoid memory overload
    repeat {
      lines <- readLines(con, n = 10000, warn = FALSE)
      if (length(lines) == 0) break
      corpus <- c(corpus, lines)
    }
    close(con)
  }
  # Text cleaning
  corpus <- tolower(corpus)                            # lowercase
  corpus <- str_replace_all(corpus, "[^a-z\\s]", " ")  # remove punctuation/numbers
  corpus <- str_squish(corpus)                         # remove extra spaces
  return(corpus)
}
What it does:

1. File Reading: opens a connection to the text file and reads it in chunks of 10,000 lines to prevent memory issues with large files.
2. Text Normalization:
   - Converts all text to lowercase (tolower())
   - Removes all non-alphabetic characters (punctuation, numbers) using the regex [^a-z\\s]
   - Replaces multiple spaces with single spaces using str_squish()

Purpose: Creates a clean, standardized text corpus for analysis by removing noise and inconsistencies.
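For example, the cleaning branch can be exercised directly through the text_input argument (a minimal sketch; the expected result in the comment is an assumption that follows from the regex and str_squish() steps above):

# Quick sanity check of the cleaning pipeline on an inline string
sample_text <- c("Hello, World!! It's 2024... right?")
read_and_clean_corpus(file_path = NULL, text_input = sample_text)
# Expected: "hello world it s right"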
# ===============================
# Function: Build n-grams and frequency tables
# ===============================
build_ngrams <- function(corpus) {
  df <- tibble(text = corpus)
  # Unigrams
  unigrams <- df %>% unnest_tokens(word, text)
  unigram_counts <- unigrams %>% count(word, sort = TRUE)
  # Bigrams
  bigrams <- df %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
  bigram_counts <- bigrams %>% count(bigram, sort = TRUE)
  # Trigrams
  trigrams <- df %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)
  trigram_counts <- trigrams %>% count(trigram, sort = TRUE)
  list(unigram_counts = unigram_counts,
       bigram_counts = bigram_counts,
       trigram_counts = trigram_counts)
}
# Test with a small example
cat("Testing n-gram generation with example:\n")
## Testing n-gram generation with example:
example_corpus <- c("the quick brown fox jumps over the lazy dog")
ngram_test <- build_ngrams(example_corpus)
cat("\nTop 3 Unigrams:\n")
##
## Top 3 Unigrams:
print(head(ngram_test$unigram_counts, 3))
## # A tibble: 3 × 2
## word n
## <chr> <int>
## 1 the 2
## 2 brown 1
## 3 dog 1
cat("\nTop 3 Bigrams:\n")
##
## Top 3 Bigrams:
print(head(ngram_test$bigram_counts, 3))
## # A tibble: 3 × 2
## bigram n
## <chr> <int>
## 1 brown fox 1
## 2 fox jumps 1
## 3 jumps over 1
cat("\nTop 3 Trigrams:\n")
##
## Top 3 Trigrams:
print(head(ngram_test$trigram_counts, 3))
## # A tibble: 3 × 2
## trigram n
## <chr> <int>
## 1 brown fox jumps 1
## 2 fox jumps over 1
## 3 jumps over the 1
What it does:

1. Data Structure: converts the corpus into a tibble (enhanced data frame).
2. Unigram Extraction:
   - Tokenizes text into individual words using unnest_tokens()
   - Counts the frequency of each word, sorted by most frequent
3. Bigram Extraction:
   - Extracts word pairs (2-grams) using token = "ngrams", n = 2
   - Counts the frequency of each bigram
4. Trigram Extraction:
   - Extracts word triples (3-grams) using token = "ngrams", n = 3
   - Counts the frequency of each trigram
Purpose: Creates frequency tables that form the foundation of the language model.
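As a bridge to the smoothed model in the next section, the unsmoothed (maximum-likelihood) conditional probabilities implied by these counts can be computed directly, i.e. P(w2 | w1) = count(w1 w2) / count(w1). A minimal sketch using the test counts from above (the column name prob_mle is illustrative):

# Unsmoothed bigram probabilities from the example frequency table
ngram_test$bigram_counts %>%
  tidyr::separate(bigram, into = c("w1", "w2"), sep = " ") %>%
  group_by(w1) %>%
  mutate(prob_mle = n / sum(n)) %>%   # P(w2 | w1) without smoothing
  ungroup()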
# ===============================
# Function: Prepare bigram lookup table
# ===============================
prepare_bigram_model <- function(bigram_counts) {
  # Split bigram into w1 and w2
  bigram_dt <- as.data.table(
    tidyr::separate(bigram_counts, bigram, into = c("w1", "w2"), sep = " ")
  )
  # Create list of second words keyed by first word
  bigram_model <- bigram_dt[, list(next_words = list(.SD)), by = w1, .SDcols = c("w2", "n")]
  bigram_list <- setNames(bigram_model$next_words, bigram_model$w1)
  # Apply add-one (Laplace-style) smoothing over the observed continuations of each w1
  bigram_dt[, prob := (n + 1) / sum(n + 1), by = w1]
  list(bigram_dt = bigram_dt, bigram_list = bigram_list)
}
# Test with example
cat("Testing bigram model preparation:\n")
## Testing bigram model preparation:
example_bigrams <- tibble(
bigram = c("the quick", "quick brown", "brown fox", "the lazy"),
n = c(2, 1, 1, 1)
)
bigram_model_test <- prepare_bigram_model(example_bigrams)
cat("\nBigram table with probabilities:\n")
##
## Bigram table with probabilities:
print(bigram_model_test$bigram_dt)
## w1 w2 n prob
## <char> <char> <num> <num>
## 1: the quick 2 0.6
## 2: quick brown 1 1.0
## 3: brown fox 1 1.0
## 4: the lazy 1 0.4
What it does:

1. Bigram Splitting: separates each bigram into two columns, w1 and w2.
2. Data Organization: groups by w1 and creates a list of possible next words (w2) with their frequencies.
3. Add-one Smoothing: computes P(w2|w1) = (count(w1, w2) + 1) / Σ(count(w1, ·) + 1), where the sum runs over the continuations of w1 observed in the data. Adding 1 to each count dampens the estimates for rare bigrams; note that the code normalizes over observed continuations only, not over the full vocabulary size V as in textbook Laplace smoothing.
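The prediction step described in the opening paragraph is not shown in this section. A minimal sketch, assuming the bigram_dt table produced above and backing off to the overall most frequent unigrams when the last input word has no observed continuation (the function name predict_next_word and its arguments are illustrative, not the project's final predictor):

# Minimal sketch of next-word prediction with a simple backoff:
# look up the last input word in the bigram table; if it was never seen
# as a first word, fall back to the most frequent unigrams overall.
predict_next_word <- function(input_text, bigram_dt, unigram_counts, k = 3) {
  words <- str_split(str_squish(tolower(input_text)), " ")[[1]]
  last_word <- tail(words, 1)
  candidates <- bigram_dt[w1 == last_word][order(-prob)]
  if (nrow(candidates) > 0) {
    head(candidates$w2, k)          # top-k continuations by smoothed probability
  } else {
    head(unigram_counts$word, k)    # backoff: most frequent words overall
  }
}

# Example with the toy tables built earlier:
predict_next_word("the quick", bigram_model_test$bigram_dt, ngram_test$unigram_counts)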
# ===============================
# Exploratory Data Analysis
# ===============================
# List of datasets
files <- c(
twitter = "C:/user/rui/Doutoramento/DataScienceSpecialization/Developing data products/DDP/final/en_US/en_US.twitter.txt",
blogs = "C:/user/rui/Doutoramento/DataScienceSpecialization/Developing data products/DDP/final/en_US/en_US.blogs.txt",
news = "C:/user/rui/Doutoramento/DataScienceSpecialization/Developing data products/DDP/final/en_US/en_US.news.txt"
)
# Initialize storage for summaries
dataset_summaries <- list()
word_distributions <- list()
for (name in names(files)) {
  # Read and clean
  corpus <- read_and_clean_corpus(files[[name]])
  # Basic summaries
  lines <- length(corpus)
  words_per_line <- str_count(corpus, "[^\\s]+")  # count non-whitespace tokens per line
  chars_per_line <- nchar(corpus)
  dataset_summaries[[name]] <- tibble(
    dataset = name,
    lines = lines,
    mean_words_per_line = mean(words_per_line),
    median_words_per_line = median(words_per_line),
    mean_chars_per_line = mean(chars_per_line),
    median_chars_per_line = median(chars_per_line),
    max_chars_line = max(chars_per_line)
  )
  # Store for plotting
  word_distributions[[name]] <- tibble(
    dataset = name,
    words_per_line = words_per_line
  )
}
# Combine summary tables
eda_summary <- bind_rows(dataset_summaries)
# Display as a table
knitr::kable(eda_summary, caption = "Basic summary statistics for each dataset")
| dataset | lines | mean_words_per_line | median_words_per_line | mean_chars_per_line | median_chars_per_line | max_chars_line |
|---|---|---|---|---|---|---|
| twitter | 2360148 | 12.94711 | 12 | 64.9002 | 60 | 140 |
| blogs | 899288 | 42.12252 | 29 | 222.1567 | 150 | 39069 |
| news | 77259 | 34.45879 | 32 | 193.3049 | 177 | 3373 |
# ===============================
# Combine word distributions
# ===============================
all_words <- bind_rows(word_distributions)
ggplot(all_words, aes(x = words_per_line, fill = dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "identity") +
  labs(title = "Distribution of Words per Line Across Datasets",
       x = "Words per Line",
       y = "Frequency") +
  theme_minimal()
# ===============================
# Top Words per Dataset
# ===============================
top_words_list <- list()
for (name in names(files)) {
  corpus <- read_and_clean_corpus(files[[name]])
  df <- tibble(text = corpus)
  unigrams <- df %>% unnest_tokens(word, text)
  top_words_list[[name]] <- unigrams %>%
    count(word, sort = TRUE) %>%
    slice_head(n = 10) %>%
    mutate(dataset = name)
}
top_words_all <- bind_rows(top_words_list)
ggplot(top_words_all, aes(x = reorder(word, n), y = n, fill = dataset)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~dataset, scales = "free_y") +
  coord_flip() +
  labs(title = "Top 10 Words per Dataset",
       x = "Word",
       y = "Frequency") +
  theme_minimal()