Overview

This project implements a text prediction system using n-gram language models with backoff smoothing. It processes text corpora, builds frequency tables for unigrams, bigrams, and trigrams, and predicts the next word based on input text.
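As a rough illustration of how backoff prediction works with the tables built below: the last two words of the input are looked up in the trigram table; if nothing matches, the last word is looked up in the bigram table; and if that also fails, the most frequent unigram is returned. The following is a minimal sketch only, assuming frequency tables shaped like those returned by build_ngrams() in Section 3; the project's actual prediction function may differ.

library(stringr)  # for str_squish() and word(); also loaded in Section 1

predict_next_word <- function(input, counts) {
    words <- strsplit(str_squish(tolower(input)), " ")[[1]]
    n <- length(words)
    
    # 1) Try trigrams whose first two words match the last two input words
    if (n >= 2) {
        prefix <- paste(words[n - 1], words[n])
        idx <- which(startsWith(counts$trigram_counts$trigram, paste0(prefix, " ")))
        if (length(idx) > 0) return(word(counts$trigram_counts$trigram[idx[1]], 3))
    }
    
    # 2) Back off to bigrams whose first word matches the last input word
    if (n >= 1) {
        idx <- which(startsWith(counts$bigram_counts$bigram, paste0(words[n], " ")))
        if (length(idx) > 0) return(word(counts$bigram_counts$bigram[idx[1]], 2))
    }
    
    # 3) Fall back to the most frequent word overall
    counts$unigram_counts$word[1]
}

# Example, using the ngram_test tables built in Section 3:
# predict_next_word("the quick brown", ngram_test)   # expected: "fox"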

1. Load Required Packages

# ===============================
# Load required packages
# ===============================
library(stringr)    # string manipulation: cleaning, splitting
library(dplyr)      # data manipulation
library(tidyr)      # splitting n-grams
library(tidytext)   # tokenization (unigrams, bigrams, trigrams)
library(data.table) # fast data manipulation and memory efficiency
library(knitr)      # for nice table output in R Markdown
library(ggplot2)    # for visualizations 

Explanation:

- stringr: Provides string manipulation functions for cleaning and processing text
- dplyr: Enables data manipulation operations (filtering, arranging, summarizing)
- tidyr: Helps in reshaping data, particularly for separating n-grams into their component words
- tidytext: Specialized package for text mining and tokenization into n-grams
- data.table: High-performance data manipulation for large datasets
- knitr: Renders the summary tables in the report
- ggplot2: Produces the exploratory visualizations

2. Text Preprocessing Function

# ===============================
# Function: Read and preprocess corpus (with optional text input)
# ===============================
read_and_clean_corpus <- function(file_path, text_input = NULL) {
    if (!is.null(text_input)) {
        # Use text_input if provided
        corpus <- text_input
    } else {
        # Otherwise read from file
        # Open file connection
        con <- file(file_path, "r")
        
        # Initialize empty vector
        corpus <- character()
        
        # Read file in chunks to avoid memory overload
        repeat {
            lines <- readLines(con, n = 10000, warn = FALSE)
            if (length(lines) == 0) break
            corpus <- c(corpus, lines)
        }
        close(con)
    }
    
    # Text cleaning
    corpus <- tolower(corpus)  # lowercase
    corpus <- str_replace_all(corpus, "[^a-z\\s]", " ")  # remove punctuation/numbers
    corpus <- str_squish(corpus)  # remove extra spaces
    
    return(corpus)
}

What it does:

1. File Reading: Opens a connection to the text file and reads it in chunks of 10,000 lines to prevent memory issues with large files; if text_input is supplied, that character vector is used directly instead.
2. Text Normalization:
   - Converts all text to lowercase (tolower())
   - Replaces all non-alphabetic characters (punctuation, numbers) with spaces using the regex [^a-z\\s]
   - Collapses repeated whitespace and trims leading/trailing spaces with str_squish()

Purpose: Creates a clean, standardized text corpus for analysis by removing noise and inconsistencies.
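The optional text_input argument makes the cleaning rules easy to verify without a file on disk. For example (illustrative call, with the expected result shown as a comment):

# Quick check of the cleaning rules using text_input instead of a file
read_and_clean_corpus(file_path = NULL,
                      text_input = c("Hello, World!! It's 2024...", "  SO   many    SPACES "))
## Expected: [1] "hello world it s" "so many spaces"

Note that apostrophes are replaced along with other punctuation, so contractions such as "it's" are split into separate tokens.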

3. N-gram Generation Function

# ===============================
# Function: Build n-grams and frequency tables
# ===============================
build_ngrams <- function(corpus) {
    df <- tibble(text = corpus)
    
    # Unigrams
    unigrams <- df %>% unnest_tokens(word, text)
    unigram_counts <- unigrams %>% count(word, sort = TRUE)
    
    # Bigrams
    bigrams <- df %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
    bigram_counts <- bigrams %>% count(bigram, sort = TRUE)
    
    # Trigrams
    trigrams <- df %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)
    trigram_counts <- trigrams %>% count(trigram, sort = TRUE)
    
    list(unigram_counts = unigram_counts,
         bigram_counts = bigram_counts,
         trigram_counts = trigram_counts)
}

# Test with a small example
cat("Testing n-gram generation with example:\n")
## Testing n-gram generation with example:
example_corpus <- c("the quick brown fox jumps over the lazy dog")
ngram_test <- build_ngrams(example_corpus)

cat("\nTop 3 Unigrams:\n")
## 
## Top 3 Unigrams:
print(head(ngram_test$unigram_counts, 3))
## # A tibble: 3 × 2
##   word      n
##   <chr> <int>
## 1 the       2
## 2 brown     1
## 3 dog       1
cat("\nTop 3 Bigrams:\n")
## 
## Top 3 Bigrams:
print(head(ngram_test$bigram_counts, 3))
## # A tibble: 3 × 2
##   bigram         n
##   <chr>      <int>
## 1 brown fox      1
## 2 fox jumps      1
## 3 jumps over     1
cat("\nTop 3 Trigrams:\n")
## 
## Top 3 Trigrams:
print(head(ngram_test$trigram_counts, 3))
## # A tibble: 3 × 2
##   trigram             n
##   <chr>           <int>
## 1 brown fox jumps     1
## 2 fox jumps over      1
## 3 jumps over the      1

What it does:

1. Data Structure: Converts the corpus into a tibble (enhanced data frame).
2. Unigram Extraction:
   - Tokenizes text into individual words using unnest_tokens()
   - Counts the frequency of each word, sorted from most to least frequent
3. Bigram Extraction:
   - Extracts word pairs (2-grams) using token = "ngrams", n = 2
   - Counts the frequency of each bigram
4. Trigram Extraction:
   - Extracts word triples (3-grams) using token = "ngrams", n = 3
   - Counts the frequency of each trigram

Purpose: Creates frequency tables that form the foundation of the language model.
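One practical detail: on real corpora, unnest_tokens() can emit NA n-grams for lines that contain fewer words than n, so the bigram and trigram tables may include an NA row. An optional, defensive variant of the counting steps inside build_ngrams() (not part of the original function) drops these first:

# Optional: drop NA n-grams produced by lines shorter than the n-gram size
bigram_counts  <- bigrams  %>% filter(!is.na(bigram))  %>% count(bigram,  sort = TRUE)
trigram_counts <- trigrams %>% filter(!is.na(trigram)) %>% count(trigram, sort = TRUE)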

4. Bigram Model Preparation with Smoothing

# ===============================
# Function: Prepare bigram lookup table
# ===============================
prepare_bigram_model <- function(bigram_counts) {
    # Split bigram into w1 and w2
    bigram_dt <- as.data.table(
        tidyr::separate(bigram_counts, bigram, into = c("w1", "w2"), sep = " ")
    )
    
    # Create list of second words keyed by first word
    bigram_model <- bigram_dt[, list(next_words = list(.SD)), by = w1, .SDcols = c("w2","n")]
    bigram_list <- setNames(bigram_model$next_words, bigram_model$w1)
    
    # Add-one smoothing over the observed continuations of each w1
    bigram_dt[, prob := (n + 1) / sum(n + 1), by = w1]
    
    list(bigram_dt = bigram_dt, bigram_list = bigram_list)
}

# Test with example
cat("Testing bigram model preparation:\n")
## Testing bigram model preparation:
example_bigrams <- tibble(
  bigram = c("the quick", "quick brown", "brown fox", "the lazy"),
  n = c(2, 1, 1, 1)
)

bigram_model_test <- prepare_bigram_model(example_bigrams)
cat("\nBigram table with probabilities:\n")
## 
## Bigram table with probabilities:
print(bigram_model_test$bigram_dt)
##        w1     w2     n  prob
##    <char> <char> <num> <num>
## 1:    the  quick     2   0.6
## 2:  quick  brown     1   1.0
## 3:  brown    fox     1   1.0
## 4:    the   lazy     1   0.4
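The bigram_list component gives direct access to the observed continuations of a given first word; for example, with the test model above:

# Look up the candidate next words (and their counts) after "the"
bigram_model_test$bigram_list[["the"]]
## Expected: a two-row table with w2 = "quick" (n = 2) and w2 = "lazy" (n = 1)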

What it does:

1. Bigram Splitting: Separates each bigram into two columns, w1 and w2.
2. Data Organization: Groups by w1 and stores, for each first word, a table of its possible next words (w2) with their frequencies.
3. Add-one Smoothing: Computes P(w2|w1) = (count(w1,w2) + 1) / (count(w1,·) + K), where K is the number of distinct words observed after w1. Adding 1 to every observed count flattens the distribution slightly; note that the denominator covers only the continuations actually observed after w1, not the full vocabulary, so this is a simplified form of Laplace smoothing.
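For comparison, textbook Laplace smoothing divides by count(w1,·) + V, where V is the size of the full vocabulary, so that unseen continuations also receive some probability mass. A minimal sketch of that variant, assuming V is taken from the unigram_counts table returned by build_ngrams() (this function is not part of the original model):

# Sketch only: full Laplace smoothing with vocabulary size V; this is not
# part of the original prepare_bigram_model()
prepare_bigram_model_laplace <- function(bigram_counts, unigram_counts) {
    bigram_dt <- as.data.table(
        tidyr::separate(bigram_counts, bigram, into = c("w1", "w2"), sep = " ")
    )
    V <- nrow(unigram_counts)  # vocabulary size
    # P(w2 | w1) = (count(w1, w2) + 1) / (count(w1, .) + V)
    bigram_dt[, prob := (n + 1) / (sum(n) + V), by = w1]
    bigram_dt
}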

5. Exploratory Data Analysis

# ===============================
# Exploratory Data Analysis
# ===============================

# List of datasets
files <- c(
    twitter = "C:/user/rui/Doutoramento/DataScienceSpecialization/Developing data products/DDP/final/en_US/en_US.twitter.txt",
    blogs   = "C:/user/rui/Doutoramento/DataScienceSpecialization/Developing data products/DDP/final/en_US/en_US.blogs.txt",
    news    = "C:/user/rui/Doutoramento/DataScienceSpecialization/Developing data products/DDP/final/en_US/en_US.news.txt"
)

# Initialize storage for summaries
dataset_summaries <- list()
word_distributions <- list()

for (name in names(files)) {
    # Read and clean
    corpus <- read_and_clean_corpus(files[[name]])
    
    # Basic summaries
    lines <- length(corpus)
    words_per_line <- str_count(corpus, "[^\\s]+")  # count non-whitespace tokens per line
    chars_per_line <- nchar(corpus)
    
    dataset_summaries[[name]] <- tibble(
        dataset = name,
        lines = lines,
        mean_words_per_line = mean(words_per_line),
        median_words_per_line = median(words_per_line),
        mean_chars_per_line = mean(chars_per_line),
        median_chars_per_line = median(chars_per_line),
        max_chars_line = max(chars_per_line)
    )
    
    # Store for plotting
    word_distributions[[name]] <- tibble(
        dataset = name,
        words_per_line = words_per_line
    )
}

# Combine summary tables
eda_summary <- bind_rows(dataset_summaries)

# Display as a table
knitr::kable(eda_summary, caption = "Basic summary statistics for each dataset")
Basic summary statistics for each dataset

| dataset | lines   | mean_words_per_line | median_words_per_line | mean_chars_per_line | median_chars_per_line | max_chars_line |
|---------|---------|---------------------|-----------------------|---------------------|-----------------------|----------------|
| twitter | 2360148 | 12.94711            | 12                    | 64.9002             | 60                    | 140            |
| blogs   | 899288  | 42.12252            | 29                    | 222.1567            | 150                   | 39069          |
| news    | 77259   | 34.45879            | 32                    | 193.3049            | 177                   | 3373           |

6. Word Distributions Across Datasets

# ===============================
# Combine word distributions
# ===============================
all_words <- bind_rows(word_distributions)

ggplot(all_words, aes(x = words_per_line, fill = dataset)) +
    geom_histogram(binwidth = 1, alpha = 0.5, position = "identity") +
    labs(title = "Distribution of Words per Line Across Datasets",
         x = "Words per Line",
         y = "Frequency") +
    theme_minimal()
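Because the blogs file contains some very long lines (see max_chars_line in the summary table above), the long right tail stretches the x axis. An optional tweak, not part of the original figure, is to zoom in on the bulk of the distribution:

# Optional: limit the view to lines with up to 100 words; coord_cartesian()
# zooms without dropping the underlying data
ggplot(all_words, aes(x = words_per_line, fill = dataset)) +
    geom_histogram(binwidth = 1, alpha = 0.5, position = "identity") +
    coord_cartesian(xlim = c(0, 100)) +
    labs(title = "Distribution of Words per Line (limited to 100 words)",
         x = "Words per Line",
         y = "Frequency") +
    theme_minimal()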

7. Top Words per Dataset

# ===============================
# Top Words per Dataset
# ===============================

top_words_list <- list()

for (name in names(files)) {
    corpus <- read_and_clean_corpus(files[[name]])
    df <- tibble(text = corpus)
    unigrams <- df %>% unnest_tokens(word, text)
    
    top_words_list[[name]] <- unigrams %>%
        count(word, sort = TRUE) %>%
        slice_head(n = 10) %>%
        mutate(dataset = name)
}

top_words_all <- bind_rows(top_words_list)

ggplot(top_words_all, aes(x = reorder(word, n), y = n, fill = dataset)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~dataset, scales = "free_y") +
    coord_flip() +
    labs(title = "Top 10 Words per Dataset",
         x = "Word",
         y = "Frequency") +
    theme_minimal()
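As with most English corpora, the top-10 lists are typically dominated by high-frequency function words ("the", "to", "and", and so on). An optional variant, not part of the original analysis, removes them with tidytext's built-in stop_words lexicon before counting:

# Optional variant: filter out common English stop words with anti_join()
# (the stop_words data frame ships with tidytext)
top_content_words_list <- list()

for (name in names(files)) {
    corpus <- read_and_clean_corpus(files[[name]])
    unigrams <- tibble(text = corpus) %>% unnest_tokens(word, text)
    
    top_content_words_list[[name]] <- unigrams %>%
        anti_join(stop_words, by = "word") %>%
        count(word, sort = TRUE) %>%
        slice_head(n = 10) %>%
        mutate(dataset = name)
}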