# Function to summarize tokenized data
summarize_token_data <- function(token_data, source_name) {
linecount <- length(token_data)
wordcounts <- sapply(token_data, length)
total_words <- sum(wordcounts)
avg_words <- mean(wordcounts)
max_words <- max(wordcounts)
data.frame(
Source = source_name,
LineCount = linecount,
TotalWordCount = total_words,
AverageWordCount = avg_words,
MaxWordCount = max_words
)
}
# Generate summaries for each dataset
summary_blog <- summarize_token_data(blogs_tokens_clean, "Blog")
summary_news <- summarize_token_data(news_tokens_clean, "News")
summary_twitter <- summarize_token_data(twitter_tokens_clean, "Twitter")
summary_combined <- summarize_token_data(all_tokens_clean, "Combined")
# Combine into one data frame
summary_df <- rbind(summary_blog, summary_news, summary_twitter, summary_combined)
rownames(summary_df) <- NULL
# Display result
print(summary_df)
## Source LineCount TotalWordCount AverageWordCount MaxWordCount
## 1 Blog 10000 412494 41.2494 681
## 2 News 10000 347931 34.7931 302
## 3 Twitter 10000 126048 12.6048 34
## 4 Combined 30000 886473 29.5491 681
# Function to get word frequency table
get_word_freq <- function(token_data) {
word_vector <- unlist(token_data)
freq_table <- table(tolower(word_vector))
sort(freq_table, decreasing = TRUE)
}
# Function to plot word frequency histogram
plot_freq_distribution <- function(freq_table, source_name) {
freq_counts <- as.numeric(freq_table)
df <- data.frame(Frequency = freq_counts)
ggplot(df, aes(x = Frequency)) +
geom_histogram(bins = 50, fill = "steelblue", color = "black") +
scale_x_log10() +
scale_y_log10() +
labs(
title = paste("Word Frequency Distribution -", source_name),
x = "Word Frequency (log10)",
y = "Number of Words (log10)"
)
}
# Function to plot top N words
plot_top_words <- function(freq_table, source_name, top_n = 15) {
top_words <- head(freq_table, top_n)
df <- data.frame(Word = names(top_words), Frequency = as.numeric(top_words))
ggplot(df, aes(x = reorder(Word, Frequency), y = Frequency)) +
geom_bar(stat = "identity", fill = "darkorange") +
coord_flip() +
labs(
title = paste("Top", top_n, "Words -", source_name),
x = "Word",
y = "Frequency"
)
}
# Analyze each dataset
datasets <- list(
Blog = blogs_tokens_clean,
News = news_tokens_clean,
Twitter = twitter_tokens_clean,
Combined = c(blogs_tokens_clean, news_tokens_clean, twitter_tokens_clean)
)
# Run the analysis
for (source_name in names(datasets)) {
freq_table <- get_word_freq(datasets[[source_name]])
print(plot_freq_distribution(freq_table, source_name)) # Histogram
print(plot_top_words(freq_table, source_name)) # Top 15 words
}
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Removed 6 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# N-gram Generator
generate_ngrams <- function(tokens_list, n = 2) {
ngram_vec <- unlist(lapply(tokens_list, function(tokens) {
if (length(tokens) >= n) {
sapply(1:(length(tokens) - n + 1), function(i) {
paste(tokens[i:(i + n - 1)], collapse = " ")
})
} else {
character(0)
}
}))
return(ngram_vec)
}
# Plot Top N N-grams
plot_top_ngrams <- function(ngram_vec, source_name, ngram_type = "Bigram", top_n = 15) {
if (length(ngram_vec) == 0) {
message("No ", tolower(ngram_type), "s found in ", source_name, " dataset.")
return(NULL)
}
freq_table <- sort(table(ngram_vec), decreasing = TRUE)
top_ngrams <- head(freq_table, top_n)
df <- data.frame(
Ngram = factor(names(top_ngrams), levels = rev(names(top_ngrams))),
Frequency = as.numeric(top_ngrams),
stringsAsFactors = FALSE
)
ggplot(df, aes(x = Ngram, y = Frequency)) +
geom_bar(stat = "identity", fill = "darkblue") +
coord_flip() +
labs(
title = paste("Top", top_n, ngram_type, "s -", source_name),
x = ngram_type,
y = "Frequency"
) +
theme_minimal()
}
# Plot Histogram of N-gram Frequency Distribution
plot_ngram_histogram <- function(ngram_vec, source_name, ngram_type = "Bigram") {
if (length(ngram_vec) == 0) return(NULL)
freq_table <- table(ngram_vec)
freq_df <- data.frame(Frequency = as.numeric(freq_table))
ggplot(freq_df, aes(x = Frequency)) +
geom_histogram(bins = 50, fill = "steelblue", color = "black") +
scale_x_log10() +
scale_y_log10() +
labs(
title = paste(ngram_type, "Frequency Distribution -", source_name),
x = "Frequency (log scale)",
y = "Count (log scale)"
) +
theme_minimal()
}
# Dataset List
datasets <- list(
Blog = blogs_tokens_clean,
News = news_tokens_clean,
Twitter = twitter_tokens_clean,
Combined = c(blogs_tokens_clean, news_tokens_clean, twitter_tokens_clean)
)
# Run Analysis
for (source_name in names(datasets)) {
message("\nProcessing ", source_name, "...")
# Bigrams
bigrams <- generate_ngrams(datasets[[source_name]], n = 2)
message(" Found ", length(bigrams), " bigrams.")
plot_bi <- plot_top_ngrams(bigrams, source_name, "Bigram")
plot_bi_hist <- plot_ngram_histogram(bigrams, source_name, "Bigram")
if (!is.null(plot_bi)) print(plot_bi)
if (!is.null(plot_bi_hist)) print(plot_bi_hist)
# Trigrams
trigrams <- generate_ngrams(datasets[[source_name]], n = 3)
message(" Found ", length(trigrams), " trigrams.")
plot_tri <- plot_top_ngrams(trigrams, source_name, "Trigram")
plot_tri_hist <- plot_ngram_histogram(trigrams, source_name, "Trigram")
if (!is.null(plot_tri)) print(plot_tri)
if (!is.null(plot_tri_hist)) print(plot_tri_hist)
}
##
## Processing Blog...
## Found 402494 bigrams.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Found 392615 trigrams.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_bar()`).
##
## Processing News...
## Found 337931 bigrams.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Found 327959 trigrams.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_bar()`).
##
## Processing Twitter...
## Found 116048 bigrams.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Found 106054 trigrams.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 23 rows containing missing values or values outside the scale range
## (`geom_bar()`).
##
## Processing Combined...
## Found 856473 bigrams.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Found 826628 trigrams.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Function to calculate cumulative coverage
calculate_coverage <- function(token_data, source_name) {
word_vector <- unlist(token_data)
freq_table <- sort(table(tolower(word_vector)), decreasing = TRUE)
total_words <- sum(freq_table)
cumulative_freq <- cumsum(freq_table)
coverage_percent <- cumulative_freq / total_words
idx_50 <- which(coverage_percent >= 0.50)[1]
idx_90 <- which(coverage_percent >= 0.90)[1]
coverage_df <- data.frame(
Rank = seq_along(freq_table),
Coverage = coverage_percent,
Source = source_name
)
list(
summary = data.frame(
Source = source_name,
UniqueWords = length(freq_table),
WordsToCover50 = idx_50,
WordsToCover90 = idx_90
),
coverage_df = coverage_df
)
}
# Run for each dataset
results_blog <- calculate_coverage(blogs_tokens_clean, "Blog")
results_news <- calculate_coverage(news_tokens_clean, "News")
results_twitter <- calculate_coverage(twitter_tokens_clean, "Twitter")
results_combined <- calculate_coverage(
c(blogs_tokens_clean, news_tokens_clean, twitter_tokens_clean),
"Combined"
)
# Combine summary data
coverage_summary <- bind_rows(
results_blog$summary,
results_news$summary,
results_twitter$summary,
results_combined$summary
)
# Print summary table
print(kable(coverage_summary, caption = "Unique Words Needed to Reach 50% and 90% Coverage"))
##
##
## Table: Unique Words Needed to Reach 50% and 90% Coverage
##
## | |Source | UniqueWords| WordsToCover50| WordsToCover90|
## |:-----|:--------|-----------:|--------------:|--------------:|
## |too |Blog | 30934| 115| 6239|
## |set |News | 31076| 219| 7711|
## |yes |Twitter | 15478| 129| 4576|
## |three |Combined | 51227| 156| 7788|
# Combine data for plotting
coverage_curves <- bind_rows(
results_blog$coverage_df,
results_news$coverage_df,
results_twitter$coverage_df,
results_combined$coverage_df
)
# Plot cumulative coverage curves
ggplot(coverage_curves, aes(x = Rank, y = Coverage, color = Source)) +
geom_line(size = 1) +
scale_x_log10() +
labs(
title = "Cumulative Word Coverage by Frequency",
x = "Number of Unique Words (log scale)",
y = "Cumulative Coverage"
) +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "gray") +
geom_hline(yintercept = 0.9, linetype = "dashed", color = "gray") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Stemming What it does: Reduces words to their root or base form by chopping off suffixes. Example: runs, running, runner become run Why it’s useful: Groups different grammatical forms into a single representation, boosting coverage without needing full dictionaries.
Lemmatization What it does: Maps words to their dictionary base form using vocabulary and context. Example: better becomes good, running becomes run Why it’s better than stemming: More accurate and context-aware, avoids issues like stemming university to univers.
Spelling Correction What it does: Fixes common typos, OCR errors, or informal spelling variations. Example: recieve becomes receive, teh becomes the Why it helps: Noisy or user-generated data (like tweets) often contains misspellings that reduce effective coverage and inflate vocabulary size.
Subword Tokenization What it does: Breaks unknown or rare words into meaningful subword units (such as Byte Pair Encoding or WordPiece). Example: unhappiness becomes un, happi, ness; reposting becomes re, post, ing Why it works: Helps handle out-of-vocabulary words by building them from frequent subword components. This is crucial in modern NLP models.
Learn from Real Language (Training) We start by analyzing large amounts of real English text — like blogs, news articles, and tweets — to understand how people naturally write. We do this by looking at common combinations of 2 or 3 words (called bigrams and trigrams). Think of it as building a “dictionary of word patterns” that we can search later.
Build the Prediction Logic We create a system that: