This report presents an exploratory data analysis of three English language text corpora (blogs, news, and Twitter) to inform the development of a word prediction algorithm. The analysis examines file characteristics, vocabulary distributions, n-gram patterns, and text properties that are essential for building an effective predictive text application.
Most Significant Findings:

- A small, high-frequency vocabulary dominates usage: roughly 141 unique words cover 50% of all word occurrences in the sample, and about 15,000 cover 95%.
- The three sources differ markedly in structure; Twitter lines average about 13 words versus roughly 42 for blogs and 34 for news.
- Common bigrams and trigrams ("in the", "one of the") recur across all corpora, supporting an n-gram approach to next-word prediction.
# Load required libraries
library(tidyverse)
library(stringi)
library(knitr)
library(gridExtra)
# Set seed for reproducibility
set.seed(123)
files <- c(
blogs = "en_US.blogs.txt",
news = "en_US.news.txt",
twitter = "en_US.twitter.txt"
)
# Function to get file statistics
get_file_stats <- function(filepath) {
# Check if file exists
if (!file.exists(filepath)) {
return(list(
size_mb = NA,
lines = NA,
words = NA,
chars = NA,
lines_sample = character(0)
))
}
# Get file size
size_mb <- file.info(filepath)$size / 1024^2
# Read file
con <- file(filepath, "r")
lines <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
# Calculate statistics
n_lines <- length(lines)
n_words <- sum(stri_count_words(lines))
n_chars <- sum(stri_length(lines))
list(
size_mb = size_mb,
lines = n_lines,
words = n_words,
chars = n_chars,
lines_sample = lines  # full corpus text; sampled later in the analysis
)
}
# Collect statistics for all files
stats_list <- lapply(files, get_file_stats)
# Create summary table
summary_table <- data.frame(
Source = names(files),
Size_MB = sapply(stats_list, function(x) round(x$size_mb, 2)),
Total_Lines = sapply(stats_list, function(x) format(x$lines, big.mark = ",")),
Total_Words = sapply(stats_list, function(x) format(x$words, big.mark = ",")),
Total_Characters = sapply(stats_list, function(x) format(x$chars, big.mark = ",")),
Avg_Words_Per_Line = sapply(stats_list, function(x)
round(x$words / x$lines, 1))
)
kable(summary_table, caption = "Table 1: Corpus File Statistics",
align = c('l', 'r', 'r', 'r', 'r', 'r'))
| Source | Size_MB | Total_Lines | Total_Words | Total_Characters | Avg_Words_Per_Line |
|---|---|---|---|---|---|
| blogs | 200.42 | 899,288 | 37,546,250 | 206,824,505 | 41.8 |
| news | 196.28 | 1,010,242 | 34,762,395 | 203,223,159 | 34.4 |
| twitter | 159.36 | 2,360,148 | 30,093,413 | 162,096,241 | 12.8 |
The table above presents the fundamental characteristics of each corpus. Twitter entries average far fewer words per line (12.8) than blogs (41.8) or news (34.4), reflecting the platform's character limit and conversational nature.
For efficient analysis and model development, we’ll work with representative samples from each corpus.
# Function to sample lines from corpus
sample_corpus <- function(lines, sample_rate = 0.01) {
if (length(lines) == 0) return(character(0))
n_sample <- max(1, floor(length(lines) * sample_rate))
sample(lines, n_sample)
}
# Create samples (1% of each corpus for demonstration)
samples <- lapply(stats_list, function(x) sample_corpus(x$lines_sample, 0.01))
# Combine all samples
all_samples <- unlist(samples)
cat(sprintf("Total sampled lines: %s\n", format(length(all_samples), big.mark = ",")))
## Total sampled lines: 42,695
cat(sprintf("Sample from blogs: %s lines\n", format(length(samples$blogs), big.mark = ",")))
## Sample from blogs: 8,992 lines
cat(sprintf("Sample from news: %s lines\n", format(length(samples$news), big.mark = ",")))
## Sample from news: 10,102 lines
cat(sprintf("Sample from twitter: %s lines\n", format(length(samples$twitter), big.mark = ",")))
## Sample from twitter: 23,601 lines
# Function to clean text using regular expressions
clean_text <- function(text) {
text %>%
# Convert to lowercase
tolower() %>%
# Remove URLs
str_replace_all("http\\S+|www\\S+", "") %>%
# Remove email addresses
str_replace_all("\\S+@\\S+", "") %>%
# Keep only letters, apostrophes, and spaces
str_replace_all("[^a-z' ]", " ") %>%
# Remove extra spaces
str_replace_all("\\s+", " ") %>%
str_trim()
}
# Clean the samples
clean_samples <- lapply(samples, clean_text)
# Example: show before and after cleaning
cat("Original text sample:\n")
## Original text sample:
cat(samples$twitter[1], "\n\n")
## you guys have that also?? It runs our campus for a week
cat("Cleaned text sample:\n")
## Cleaned text sample:
cat(clean_samples$twitter[1], "\n")
## you guys have that also it runs our campus for a week
# Function to get word frequencies
get_word_freq <- function(text_vector) {
words <- unlist(strsplit(text_vector, "\\s+"))
words <- words[words != ""]
word_freq <- table(words)
data.frame(
word = names(word_freq),
frequency = as.numeric(word_freq),
stringsAsFactors = FALSE
) %>%
arrange(desc(frequency))
}
# Get word frequencies for each corpus
word_freqs <- lapply(clean_samples, get_word_freq)
# Top 20 words by corpus
top_words_comparison <- bind_rows(
word_freqs$blogs %>% head(20) %>% mutate(source = "Blogs"),
word_freqs$news %>% head(20) %>% mutate(source = "News"),
word_freqs$twitter %>% head(20) %>% mutate(source = "Twitter")
)
# Plot top words by source
ggplot(top_words_comparison, aes(x = reorder(word, frequency), y = frequency, fill = source)) +
geom_col() +
coord_flip() +
facet_wrap(~source, scales = "free_y", ncol = 3) +
labs(title = "Figure 1: Top 20 Most Frequent Words by Corpus",
x = "Word", y = "Frequency") +
theme_minimal() +
theme(legend.position = "none")
# Calculate vocabulary statistics
vocab_stats <- data.frame(
Source = c("Blogs", "News", "Twitter", "Combined"),
Unique_Words = c(
nrow(word_freqs$blogs),
nrow(word_freqs$news),
nrow(word_freqs$twitter),
length(unique(unlist(lapply(clean_samples, function(x)
unlist(strsplit(x, "\\s+"))))))
),
Total_Words = c(
sum(word_freqs$blogs$frequency),
sum(word_freqs$news$frequency),
sum(word_freqs$twitter$frequency),
sum(sapply(word_freqs, function(x) sum(x$frequency)))
)
)
vocab_stats$Type_Token_Ratio <- round(
vocab_stats$Unique_Words / vocab_stats$Total_Words, 4
)
kable(vocab_stats, caption = "Table 2: Vocabulary Statistics",
format.args = list(big.mark = ","))
| Source | Unique_Words | Total_Words | Type_Token_Ratio |
|---|---|---|---|
| Blogs | 27,713 | 374,449 | 0.0740 |
| News | 28,866 | 340,639 | 0.0847 |
| Twitter | 24,383 | 297,318 | 0.0820 |
| Combined | 51,534 | 1,012,406 | 0.0509 |
Vocabulary coverage (the fraction of all word occurrences accounted for by the N most frequent unique words) determines how large a dictionary the prediction model must carry, so understanding it is crucial for algorithm efficiency.
# Function to calculate coverage
calculate_coverage <- function(word_freq_df) {
word_freq_df <- word_freq_df %>% arrange(desc(frequency))
word_freq_df$cumulative_freq <- cumsum(word_freq_df$frequency)
word_freq_df$coverage <- word_freq_df$cumulative_freq / sum(word_freq_df$frequency)
word_freq_df$rank <- 1:nrow(word_freq_df)
word_freq_df
}
# Calculate coverage for combined corpus
combined_words <- bind_rows(word_freqs) %>%
group_by(word) %>%
summarise(frequency = sum(frequency), .groups = "drop")
coverage_data <- calculate_coverage(combined_words)
# Find words needed for different coverage levels
coverage_levels <- c(0.5, 0.75, 0.9, 0.95)
coverage_summary <- data.frame(
Coverage = paste0(coverage_levels * 100, "%"),
Words_Needed = sapply(coverage_levels, function(level) {
min(which(coverage_data$coverage >= level))
})
)
kable(coverage_summary, caption = "Table 3: Words Required for Coverage Levels")
| Coverage | Words_Needed |
|---|---|
| 50% | 141 |
| 75% | 1433 |
| 90% | 6928 |
| 95% | 15163 |
# Plot coverage curve
ggplot(coverage_data %>% filter(rank <= 5000),
aes(x = rank, y = coverage * 100)) +
geom_line(color = "#2C3E50", size = 1.2) +
geom_hline(yintercept = c(50, 75, 90, 95),
linetype = "dashed", color = "red", alpha = 0.5) +
labs(title = "Figure 2: Cumulative Word Coverage",
subtitle = "Percentage of total words covered by top N unique words",
x = "Number of Unique Words (Ranked by Frequency)",
y = "Coverage (%)") +
theme_minimal() +
scale_y_continuous(breaks = seq(0, 100, 10))
Approximately 141 unique words provide 50% coverage of the sampled corpus, while 15,163 words are needed for 95% coverage. This demonstrates the Zipfian distribution typical of natural language: word frequency falls off roughly as the inverse of frequency rank.
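A quick way to check this claim is to plot word frequency against frequency rank on log-log axes, where a Zipfian distribution appears as an approximately straight line with slope near -1. Below is a minimal sketch using the coverage_data frame built above; the linear fit is only a rough diagnostic and is not part of the original analysis.

# Zipf check: frequency vs. rank on log-log axes (approximately linear if Zipfian)
ggplot(coverage_data, aes(x = rank, y = frequency)) +
  geom_line(color = "#2C3E50") +
  scale_x_log10() +
  scale_y_log10() +
  labs(title = "Word Frequency vs. Rank (log-log scale)",
       x = "Frequency Rank", y = "Frequency") +
  theme_minimal()

# Rough estimate of the Zipf exponent via a linear fit on the log scale
zipf_fit <- lm(log10(frequency) ~ log10(rank), data = coverage_data)
round(coef(zipf_fit), 2)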
N-grams (sequences of n words) are fundamental to word prediction algorithms.
# Function to generate n-grams
generate_ngrams <- function(text_vector, n = 2) {
ngrams <- character()
for (line in text_vector) {
words <- unlist(strsplit(line, "\\s+"))
words <- words[words != ""]
if (length(words) >= n) {
for (i in 1:(length(words) - n + 1)) {
ngram <- paste(words[i:(i + n - 1)], collapse = " ")
ngrams <- c(ngrams, ngram)
}
}
}
ngram_freq <- table(ngrams)
data.frame(
ngram = names(ngram_freq),
frequency = as.numeric(ngram_freq),
stringsAsFactors = FALSE
) %>%
arrange(desc(frequency))
}
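# Note: the loop above grows the ngrams vector one element at a time, which is
# why only a small sample is used below. A vectorized sketch of the same idea
# (an illustrative alternative, not the version used for the tables in this report):
generate_ngrams_fast <- function(text_vector, n = 2) {
  ngrams <- unlist(lapply(strsplit(text_vector, "\\s+"), function(words) {
    words <- words[words != ""]
    m <- length(words) - n + 1
    if (m < 1) return(character(0))
    # Paste shifted copies of the word vector to get all n-grams of a line at once
    do.call(paste, lapply(0:(n - 1), function(k) words[(1 + k):(m + k)]))
  }))
  if (length(ngrams) == 0) {
    return(data.frame(ngram = character(0), frequency = numeric(0)))
  }
  ngram_freq <- table(ngrams)
  data.frame(
    ngram = names(ngram_freq),
    frequency = as.numeric(ngram_freq),
    stringsAsFactors = FALSE
  ) %>%
    arrange(desc(frequency))
}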
# Generate bigrams and trigrams (using smaller sample for speed)
sample_for_ngrams <- sample(all_samples, min(length(all_samples), 1000))
clean_for_ngrams <- clean_text(sample_for_ngrams)
bigrams <- generate_ngrams(clean_for_ngrams, 2)
trigrams <- generate_ngrams(clean_for_ngrams, 3)
# Display top n-grams
cat("Top 10 Bigrams:\n")
## Top 10 Bigrams:
kable(head(bigrams, 10), caption = "Table 4: Most Frequent Bigrams")
| ngram | frequency |
|---|---|
| in the | 91 |
| of the | 85 |
| to the | 60 |
| on the | 54 |
| for the | 37 |
| at the | 35 |
| to be | 34 |
| and the | 28 |
| but i | 27 |
| is a | 26 |
cat("\n\nTop 10 Trigrams:\n")
##
##
## Top 10 Trigrams:
kable(head(trigrams, 10), caption = "Table 5: Most Frequent Trigrams")
| ngram | frequency |
|---|---|
| one of the | 10 |
| w sunset blvd | 10 |
| the u s | 7 |
| is one of | 6 |
| a bit of | 5 |
| a lot of | 5 |
| i have to | 5 |
| thanks for the | 5 |
| there is a | 5 |
| a good thing | 4 |
# Visualize top bigrams
top_bigrams <- head(bigrams, 15)
ggplot(top_bigrams, aes(x = reorder(ngram, frequency), y = frequency)) +
geom_col(fill = "#3498DB") +
coord_flip() +
labs(title = "Figure 3: Top 15 Most Frequent Bigrams",
x = "Bigram", y = "Frequency") +
theme_minimal()
Based on this exploratory analysis, here are some recommendations for building a word prediction application:

1. N-gram Model with Backoff: predict the next word from trigram counts, backing off to bigram and then unigram counts when a longer context has not been observed (see the sketch after this list).
2. Vocabulary Management: since roughly 15,000 words already provide 95% coverage, prune rare words and map them to a single unknown token to keep the lookup tables small.
3. Corpus-Specific Models: the three sources differ noticeably in style and line length, so consider context-aware predictions that weight blog, news, and Twitter n-grams according to the target use case.
4. Performance Optimization: precompute the n-gram frequency tables and discard low-frequency n-grams so the model stays fast and small enough for an interactive application.
Core Functionality: the application's central operation is a next-word lookup against precomputed n-gram tables.
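To illustrate how these pieces fit together, here is a minimal sketch of such a lookup with simple backoff: match the last two words of the input against the trigram table, fall back to the bigram table on the last word, and finally fall back to the most frequent unigrams. The function name predict_next_word and the use of the small demonstration tables bigrams and trigrams from the n-gram section are illustrative assumptions, not the final model.

# Minimal next-word predictor: try trigrams first, then back off to bigrams,
# then to the most frequent unigrams overall.
predict_next_word <- function(phrase, bigrams, trigrams, top_n = 3) {
  words <- unlist(strsplit(clean_text(phrase), "\\s+"))
  words <- words[words != ""]

  # Try the last two words against the trigram table
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- trigrams %>%
      filter(str_detect(ngram, paste0("^", prefix, " "))) %>%
      head(top_n)
    if (nrow(hits) > 0) return(word(hits$ngram, 3))
  }

  # Back off to the bigram table using only the last word
  if (length(words) >= 1) {
    prefix <- tail(words, 1)
    hits <- bigrams %>%
      filter(str_detect(ngram, paste0("^", prefix, " "))) %>%
      head(top_n)
    if (nrow(hits) > 0) return(word(hits$ngram, 2))
  }

  # Final fallback: the most frequent unigrams from the coverage table
  head(coverage_data$word, top_n)
}

# Illustrative usage (output depends on the sampled data):
# predict_next_word("one of", bigrams, trigrams)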
This exploratory analysis reveals several key insights for word prediction:
- Language follows predictable patterns: common words and phrases appear with high frequency across all three corpora.
- Efficient coverage: a relatively small vocabulary covers most everyday usage; roughly 7,000 words reach 90% coverage and about 15,000 reach 95%.
- Context matters: the text sources show distinct linguistic characteristics that could be used to improve prediction accuracy.