Load Data

Basic Summaries

# Function to summarize tokenized data
summarize_token_data <- function(token_data, source_name) {
  linecount <- length(token_data)
  wordcounts <- sapply(token_data, length)
  total_words <- sum(wordcounts)
  avg_words <- mean(wordcounts)
  max_words <- max(wordcounts)
  
  data.frame(
    Source = source_name,
    LineCount = linecount,
    TotalWordCount = total_words,
    AverageWordCount = avg_words,
    MaxWordCount = max_words
  )
}

# Generate summaries for each dataset
summary_blog    <- summarize_token_data(blogs_tokens_clean, "Blog")
summary_news    <- summarize_token_data(news_tokens_clean, "News")
summary_twitter <- summarize_token_data(twitter_tokens_clean, "Twitter")
summary_combined <- summarize_token_data(all_tokens_clean, "Combined")

# Combine into one data frame
summary_df <- rbind(summary_blog, summary_news, summary_twitter, summary_combined)
rownames(summary_df) <- NULL

# Display result
print(summary_df)
##     Source LineCount TotalWordCount AverageWordCount MaxWordCount
## 1     Blog     10000         412494          41.2494          681
## 2     News     10000         347931          34.7931          302
## 3  Twitter     10000         126048          12.6048           34
## 4 Combined     30000         886473          29.5491          681

Word Frequency Analysis

# Function to get word frequency table
get_word_freq <- function(token_data) {
  word_vector <- unlist(token_data)
  freq_table <- table(tolower(word_vector))
  sort(freq_table, decreasing = TRUE)
}

# Function to plot word frequency histogram
plot_freq_distribution <- function(freq_table, source_name) {
  freq_counts <- as.numeric(freq_table)
  df <- data.frame(Frequency = freq_counts)

  ggplot(df, aes(x = Frequency)) +
    geom_histogram(bins = 50, fill = "steelblue", color = "black") +
    scale_x_log10() +
    scale_y_log10() +
    labs(
      title = paste("Word Frequency Distribution -", source_name),
      x = "Word Frequency (log10)",
      y = "Number of Words (log10)"
    )
}

# Function to plot top N words
plot_top_words <- function(freq_table, source_name, top_n = 15) {
  top_words <- head(freq_table, top_n)
  df <- data.frame(Word = names(top_words), Frequency = as.numeric(top_words))

  ggplot(df, aes(x = reorder(Word, Frequency), y = Frequency)) +
    geom_bar(stat = "identity", fill = "darkorange") +
    coord_flip() +
    labs(
      title = paste("Top", top_n, "Words -", source_name),
      x = "Word",
      y = "Frequency"
    )
}

# Analyze each dataset
datasets <- list(
  Blog = blogs_tokens_clean,
  News = news_tokens_clean,
  Twitter = twitter_tokens_clean,
  Combined = c(blogs_tokens_clean, news_tokens_clean, twitter_tokens_clean)
)

# Run the analysis
for (source_name in names(datasets)) {
  freq_table <- get_word_freq(datasets[[source_name]])
  
  print(plot_freq_distribution(freq_table, source_name))  # Histogram
  print(plot_top_words(freq_table, source_name))          # Top 15 words
}
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Removed 6 rows containing missing values or values outside the scale range
## (`geom_bar()`).

N-gram Frequency Analysis

# N-gram Generator
generate_ngrams <- function(tokens_list, n = 2) {
  ngram_vec <- unlist(lapply(tokens_list, function(tokens) {
    if (length(tokens) >= n) {
      sapply(1:(length(tokens) - n + 1), function(i) {
        paste(tokens[i:(i + n - 1)], collapse = " ")
      })
    } else {
      character(0)
    }
  }))
  return(ngram_vec)
}

# Plot Top N N-grams
plot_top_ngrams <- function(ngram_vec, source_name, ngram_type = "Bigram", top_n = 15) {
  if (length(ngram_vec) == 0) {
    message("No ", tolower(ngram_type), "s found in ", source_name, " dataset.")
    return(NULL)
  }

  freq_table <- sort(table(ngram_vec), decreasing = TRUE)
  top_ngrams <- head(freq_table, top_n)

  df <- data.frame(
    Ngram = factor(names(top_ngrams), levels = rev(names(top_ngrams))),
    Frequency = as.numeric(top_ngrams),
    stringsAsFactors = FALSE
  )

  ggplot(df, aes(x = Ngram, y = Frequency)) +
    geom_bar(stat = "identity", fill = "darkblue") +
    coord_flip() +
    labs(
      title = paste("Top", top_n, ngram_type, "s -", source_name),
      x = ngram_type,
      y = "Frequency"
    ) +
    theme_minimal()
}

# Plot Histogram of N-gram Frequency Distribution
plot_ngram_histogram <- function(ngram_vec, source_name, ngram_type = "Bigram") {
  if (length(ngram_vec) == 0) return(NULL)

  freq_table <- table(ngram_vec)
  freq_df <- data.frame(Frequency = as.numeric(freq_table))

  ggplot(freq_df, aes(x = Frequency)) +
    geom_histogram(bins = 50, fill = "steelblue", color = "black") +
    scale_x_log10() +
    scale_y_log10() +
    labs(
      title = paste(ngram_type, "Frequency Distribution -", source_name),
      x = "Frequency (log scale)",
      y = "Count (log scale)"
    ) +
    theme_minimal()
}

# Dataset List
datasets <- list(
  Blog = blogs_tokens_clean,
  News = news_tokens_clean,
  Twitter = twitter_tokens_clean,
  Combined = c(blogs_tokens_clean, news_tokens_clean, twitter_tokens_clean)
)

# Run Analysis
for (source_name in names(datasets)) {
  message("\nProcessing ", source_name, "...")

  # Bigrams
  bigrams <- generate_ngrams(datasets[[source_name]], n = 2)
  message("  Found ", length(bigrams), " bigrams.")
  plot_bi <- plot_top_ngrams(bigrams, source_name, "Bigram")
  plot_bi_hist <- plot_ngram_histogram(bigrams, source_name, "Bigram")
  if (!is.null(plot_bi)) print(plot_bi)
  if (!is.null(plot_bi_hist)) print(plot_bi_hist)

  # Trigrams
  trigrams <- generate_ngrams(datasets[[source_name]], n = 3)
  message("  Found ", length(trigrams), " trigrams.")
  plot_tri <- plot_top_ngrams(trigrams, source_name, "Trigram")
  plot_tri_hist <- plot_ngram_histogram(trigrams, source_name, "Trigram")
  if (!is.null(plot_tri)) print(plot_tri)
  if (!is.null(plot_tri_hist)) print(plot_tri_hist)
}
## 
## Processing Blog...
##   Found 402494 bigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_bar()`).

##   Found 392615 trigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## 
## Processing News...
##   Found 337931 bigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_bar()`).

##   Found 327959 trigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## 
## Processing Twitter...
##   Found 116048 bigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_bar()`).

##   Found 106054 trigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 23 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## 
## Processing Combined...
##   Found 856473 bigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).

##   Found 826628 trigrams.

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Vocabulary Coverage Summary

  • Analysis of how many unique words are needed to cover 50% and 90% of all word occurrences in each dataset (Blog, News, Twitter, Combined).
  • Found that a small number of high-frequency words account for the majority of total word usage.
  • Plot of cumulative word coverage curves to visualize how vocabulary size relates to overall coverage.
  • Results highlight the efficiency of using a frequency-sorted dictionary for compression, modeling, or predictive text applications.
# Function to calculate cumulative coverage
calculate_coverage <- function(token_data, source_name) {
  word_vector <- unlist(token_data)
  freq_table <- sort(table(tolower(word_vector)), decreasing = TRUE)
  total_words <- sum(freq_table)
  cumulative_freq <- cumsum(freq_table)
  coverage_percent <- cumulative_freq / total_words

  idx_50 <- which(coverage_percent >= 0.50)[1]
  idx_90 <- which(coverage_percent >= 0.90)[1]

  coverage_df <- data.frame(
    Rank = seq_along(freq_table),
    Coverage = coverage_percent,
    Source = source_name
  )

  list(
    summary = data.frame(
      Source = source_name,
      UniqueWords = length(freq_table),
      WordsToCover50 = idx_50,
      WordsToCover90 = idx_90
    ),
    coverage_df = coverage_df
  )
}

# Run for each dataset
results_blog     <- calculate_coverage(blogs_tokens_clean, "Blog")
results_news     <- calculate_coverage(news_tokens_clean, "News")
results_twitter  <- calculate_coverage(twitter_tokens_clean, "Twitter")
results_combined <- calculate_coverage(
  c(blogs_tokens_clean, news_tokens_clean, twitter_tokens_clean),
  "Combined"
)

# Combine summary data
coverage_summary <- bind_rows(
  results_blog$summary,
  results_news$summary,
  results_twitter$summary,
  results_combined$summary
)

# Print summary table
print(kable(coverage_summary, caption = "Unique Words Needed to Reach 50% and 90% Coverage"))
## 
## 
## Table: Unique Words Needed to Reach 50% and 90% Coverage
## 
## |      |Source   | UniqueWords| WordsToCover50| WordsToCover90|
## |:-----|:--------|-----------:|--------------:|--------------:|
## |too   |Blog     |       30934|            115|           6239|
## |set   |News     |       31076|            219|           7711|
## |yes   |Twitter  |       15478|            129|           4576|
## |three |Combined |       51227|            156|           7788|
# Combine data for plotting
coverage_curves <- bind_rows(
  results_blog$coverage_df,
  results_news$coverage_df,
  results_twitter$coverage_df,
  results_combined$coverage_df
)

# Plot cumulative coverage curves
ggplot(coverage_curves, aes(x = Rank, y = Coverage, color = Source)) +
  geom_line(size = 1) +
  scale_x_log10() +
  labs(
    title = "Cumulative Word Coverage by Frequency",
    x = "Number of Unique Words (log scale)",
    y = "Cumulative Coverage"
  ) +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "gray") +
  geom_hline(yintercept = 0.9, linetype = "dashed", color = "gray") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Increasing Coverage Strategies

Next steps

  1. Learn from Real Language (Training) We start by analyzing large amounts of real English text — like blogs, news articles, and tweets — to understand how people naturally write. We do this by looking at common combinations of 2 or 3 words (called bigrams and trigrams). Think of it as building a “dictionary of word patterns” that we can search later.

  2. Build the Prediction Logic We create a system that:

  1. Create a Shiny App (User Interface) We will wrap this prediction system in a simple web app using Shiny, a tool for interactive applications in R.