Load Data

To speed up the process the first 10,000 lines of each file are used
For better readability the output is suppressed

Basic Summaries

# Function to summarize tokenized data
summarize_token_data <- function(token_data, source_name) {
  linecount <- length(token_data)
  wordcounts <- sapply(token_data, length)
  total_words <- sum(wordcounts)
  avg_words <- mean(wordcounts)
  max_words <- max(wordcounts)
  
  data.frame(
    Source = source_name,
    LineCount = linecount,
    TotalWordCount = total_words,
    AverageWordCount = avg_words,
    MaxWordCount = max_words
  )
}

# Generate summaries for each dataset
summary_blog    <- summarize_token_data(blogs_tokens_clean, "Blog")
summary_news    <- summarize_token_data(news_tokens_clean, "News")
summary_twitter <- summarize_token_data(twitter_tokens_clean, "Twitter")
summary_combined <- summarize_token_data(all_tokens_clean, "Combined")

# Combine into one data frame
summary_df <- rbind(summary_blog, summary_news, summary_twitter, summary_combined)
rownames(summary_df) <- NULL

# Display result
print(summary_df)

##     Source LineCount TotalWordCount AverageWordCount MaxWordCount
## 1     Blog     10000         412494          41.2494          681
## 2     News     10000         347931          34.7931          302
## 3  Twitter     10000         126048          12.6048           34
## 4 Combined     30000         886473          29.5491          681

Word Frequency Analysis

Analysis of word usage across Blog, News, Twitter, and Combined datasets.
Plot word frequency distributions (log-log scale) and top 15 most common words.
A small set of words (e.g. “the”, “and”, “to”) dominate across all sources.
Twitter showed shorter, more repetitive language due to character limits.
Results provide a baseline understanding for further text modeling and filtering.

# Function to get word frequency table
get_word_freq <- function(token_data) {
  word_vector <- unlist(token_data)
  freq_table <- table(tolower(word_vector))
  sort(freq_table, decreasing = TRUE)
}

# Function to plot word frequency histogram
plot_freq_distribution <- function(freq_table, source_name) {
  freq_counts <- as.numeric(freq_table)
  df <- data.frame(Frequency = freq_counts)

  ggplot(df, aes(x = Frequency)) +
    geom_histogram(bins = 50, fill = "steelblue", color = "black") +
    scale_x_log10() +
    scale_y_log10() +
    labs(
      title = paste("Word Frequency Distribution -", source_name),
      x = "Word Frequency (log10)",
      y = "Number of Words (log10)"
    )
}

# Function to plot top N words
plot_top_words <- function(freq_table, source_name, top_n = 15) {
  top_words <- head(freq_table, top_n)
  df <- data.frame(Word = names(top_words), Frequency = as.numeric(top_words))

  ggplot(df, aes(x = reorder(Word, Frequency), y = Frequency)) +
    geom_bar(stat = "identity", fill = "darkorange") +
    coord_flip() +
    labs(
      title = paste("Top", top_n, "Words -", source_name),
      x = "Word",
      y = "Frequency"
    )
}

# Analyze each dataset
datasets <- list(
  Blog = blogs_tokens_clean,
  News = news_tokens_clean,
  Twitter = twitter_tokens_clean,
  Combined = c(blogs_tokens_clean, news_tokens_clean, twitter_tokens_clean)
)

# Run the analysis
for (source_name in names(datasets)) {
  freq_table <- get_word_freq(datasets[[source_name]])
  
  print(plot_freq_distribution(freq_table, source_name))  # Histogram
  print(plot_top_words(freq_table, source_name))          # Top 15 words
}

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Removed 6 rows containing missing values or values outside the scale range
## (`geom_bar()`).

N-gram Frequency Analysis

Analysis of frequent word pairs (bigrams) and triples (trigrams) in Blog, News, Twitter, and Combined datasets.
Identified and visualized the top 15 most common bigrams and trigrams in each dataset.
Plotted n-gram frequency distributions using log-log histograms to reveal usage patterns.
Observed that:
- Most n-grams occur very rarely, with a few repeated often (typical of natural language).
- Common bigrams and trigrams reflect standard phrasing and topic-specific language in each source.
- Twitter shows shorter, more repetitive phrases compared to Blogs and News.
Results support downstream tasks like language modeling, predictive text, and content classification.

# N-gram Generator
generate_ngrams <- function(tokens_list, n = 2) {
  ngram_vec <- unlist(lapply(tokens_list, function(tokens) {
    if (length(tokens) >= n) {
      sapply(1:(length(tokens) - n + 1), function(i) {
        paste(tokens[i:(i + n - 1)], collapse = " ")
      })
    } else {
      character(0)
    }
  }))
  return(ngram_vec)
}

# Plot Top N N-grams
plot_top_ngrams <- function(ngram_vec, source_name, ngram_type = "Bigram", top_n = 15) {
  if (length(ngram_vec) == 0) {
    message("No ", tolower(ngram_type), "s found in ", source_name, " dataset.")
    return(NULL)
  }

  freq_table <- sort(table(ngram_vec), decreasing = TRUE)
  top_ngrams <- head(freq_table, top_n)

  df <- data.frame(
    Ngram = factor(names(top_ngrams), levels = rev(names(top_ngrams))),
    Frequency = as.numeric(top_ngrams),
    stringsAsFactors = FALSE
  )

  ggplot(df, aes(x = Ngram, y = Frequency)) +
    geom_bar(stat = "identity", fill = "darkblue") +
    coord_flip() +
    labs(
      title = paste("Top", top_n, ngram_type, "s -", source_name),
      x = ngram_type,
      y = "Frequency"
    ) +
    theme_minimal()
}

# Plot Histogram of N-gram Frequency Distribution
plot_ngram_histogram <- function(ngram_vec, source_name, ngram_type = "Bigram") {
  if (length(ngram_vec) == 0) return(NULL)

  freq_table <- table(ngram_vec)
  freq_df <- data.frame(Frequency = as.numeric(freq_table))

  ggplot(freq_df, aes(x = Frequency)) +
    geom_histogram(bins = 50, fill = "steelblue", color = "black") +
    scale_x_log10() +
    scale_y_log10() +
    labs(
      title = paste(ngram_type, "Frequency Distribution -", source_name),
      x = "Frequency (log scale)",
      y = "Count (log scale)"
    ) +
    theme_minimal()
}

# Dataset List
datasets <- list(
  Blog = blogs_tokens_clean,
  News = news_tokens_clean,
  Twitter = twitter_tokens_clean,
  Combined = c(blogs_tokens_clean, news_tokens_clean, twitter_tokens_clean)
)

# Run Analysis
for (source_name in names(datasets)) {
  message("\nProcessing ", source_name, "...")

  # Bigrams
  bigrams <- generate_ngrams(datasets[[source_name]], n = 2)
  message("  Found ", length(bigrams), " bigrams.")
  plot_bi <- plot_top_ngrams(bigrams, source_name, "Bigram")
  plot_bi_hist <- plot_ngram_histogram(bigrams, source_name, "Bigram")
  if (!is.null(plot_bi)) print(plot_bi)
  if (!is.null(plot_bi_hist)) print(plot_bi_hist)

  # Trigrams
  trigrams <- generate_ngrams(datasets[[source_name]], n = 3)
  message("  Found ", length(trigrams), " trigrams.")
  plot_tri <- plot_top_ngrams(trigrams, source_name, "Trigram")
  plot_tri_hist <- plot_ngram_histogram(trigrams, source_name, "Trigram")
  if (!is.null(plot_tri)) print(plot_tri)
  if (!is.null(plot_tri_hist)) print(plot_tri_hist)
}

## 
## Processing Blog...

##   Found 402494 bigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_bar()`).

##   Found 392615 trigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## 
## Processing News...

##   Found 337931 bigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_bar()`).

##   Found 327959 trigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## 
## Processing Twitter...

##   Found 116048 bigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_bar()`).

##   Found 106054 trigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 23 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## 
## Processing Combined...

##   Found 856473 bigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).

##   Found 826628 trigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Vocabulary Coverage Summary

Analysis of how many unique words are needed to cover 50% and 90% of all word occurrences in each dataset (Blog, News, Twitter, Combined).
Found that a small number of high-frequency words account for the majority of total word usage.
Plot of cumulative word coverage curves to visualize how vocabulary size relates to overall coverage.
Results highlight the efficiency of using a frequency-sorted dictionary for compression, modeling, or predictive text applications.

# Function to calculate cumulative coverage
calculate_coverage <- function(token_data, source_name) {
  word_vector <- unlist(token_data)
  freq_table <- sort(table(tolower(word_vector)), decreasing = TRUE)
  total_words <- sum(freq_table)
  cumulative_freq <- cumsum(freq_table)
  coverage_percent <- cumulative_freq / total_words

  idx_50 <- which(coverage_percent >= 0.50)[1]
  idx_90 <- which(coverage_percent >= 0.90)[1]

  coverage_df <- data.frame(
    Rank = seq_along(freq_table),
    Coverage = coverage_percent,
    Source = source_name
  )

  list(
    summary = data.frame(
      Source = source_name,
      UniqueWords = length(freq_table),
      WordsToCover50 = idx_50,
      WordsToCover90 = idx_90
    ),
    coverage_df = coverage_df
  )
}

# Run for each dataset
results_blog     <- calculate_coverage(blogs_tokens_clean, "Blog")
results_news     <- calculate_coverage(news_tokens_clean, "News")
results_twitter  <- calculate_coverage(twitter_tokens_clean, "Twitter")
results_combined <- calculate_coverage(
  c(blogs_tokens_clean, news_tokens_clean, twitter_tokens_clean),
  "Combined"
)

# Combine summary data
coverage_summary <- bind_rows(
  results_blog$summary,
  results_news$summary,
  results_twitter$summary,
  results_combined$summary
)

# Print summary table
print(kable(coverage_summary, caption = "Unique Words Needed to Reach 50% and 90% Coverage"))

## 
## 
## Table: Unique Words Needed to Reach 50% and 90% Coverage
## 
## |      |Source   | UniqueWords| WordsToCover50| WordsToCover90|
## |:-----|:--------|-----------:|--------------:|--------------:|
## |too   |Blog     |       30934|            115|           6239|
## |set   |News     |       31076|            219|           7711|
## |yes   |Twitter  |       15478|            129|           4576|
## |three |Combined |       51227|            156|           7788|

# Combine data for plotting
coverage_curves <- bind_rows(
  results_blog$coverage_df,
  results_news$coverage_df,
  results_twitter$coverage_df,
  results_combined$coverage_df
)

# Plot cumulative coverage curves
ggplot(coverage_curves, aes(x = Rank, y = Coverage, color = Source)) +
  geom_line(size = 1) +
  scale_x_log10() +
  labs(
    title = "Cumulative Word Coverage by Frequency",
    x = "Number of Unique Words (log scale)",
    y = "Cumulative Coverage"
  ) +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "gray") +
  geom_hline(yintercept = 0.9, linetype = "dashed", color = "gray") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Increasing Coverage Strategies

Stemming What it does: Reduces words to their root or base form by chopping off suffixes. Example: runs, running, runner become run Why it’s useful: Groups different grammatical forms into a single representation, boosting coverage without needing full dictionaries.
Lemmatization What it does: Maps words to their dictionary base form using vocabulary and context. Example: better becomes good, running becomes run Why it’s better than stemming: More accurate and context-aware, avoids issues like stemming university to univers.
Spelling Correction What it does: Fixes common typos, OCR errors, or informal spelling variations. Example: recieve becomes receive, teh becomes the Why it helps: Noisy or user-generated data (like tweets) often contains misspellings that reduce effective coverage and inflate vocabulary size.
Subword Tokenization What it does: Breaks unknown or rare words into meaningful subword units (such as Byte Pair Encoding or WordPiece). Example: unhappiness becomes un, happi, ness; reposting becomes re, post, ing Why it works: Helps handle out-of-vocabulary words by building them from frequent subword components. This is crucial in modern NLP models.

Next steps

Learn from Real Language (Training) We start by analyzing large amounts of real English text — like blogs, news articles, and tweets — to understand how people naturally write. We do this by looking at common combinations of 2 or 3 words (called bigrams and trigrams). Think of it as building a “dictionary of word patterns” that we can search later.
Build the Prediction Logic We create a system that:

Looks at what a user is typing
Checks our word pattern data for likely completions
If it can’t find a match, it “backs off” and looks for simpler patterns or common words
Even if a sentence is unique, the system still tries its best to guess using more general knowledge. This makes the tool more flexible and realistic.

Create a Shiny App (User Interface) We will wrap this prediction system in a simple web app using Shiny, a tool for interactive applications in R.

EDA NLP

Dominique Mühlbauer

2025-05-23

Load Data

Basic Summaries

Word Frequency Analysis

N-gram Frequency Analysis

Vocabulary Coverage Summary

Increasing Coverage Strategies

Next steps