Introduction

The objective of this analysis is to explore text data from three sources (blogs, news, and Twitter) and to build a predictive model that handles both seen and unseen n-grams. The model predicts the next word in a given sequence of words, which is useful for applications such as text autocompletion.

Basic Summaries

Word Counts and Line Counts

# Load necessary libraries
library(dplyr)
library(tidyr)
library(tidytext)
library(ggplot2)
library(wordcloud)
library(stringr)
library(textstem)
library(SnowballC)

# Function to read file and summarize
summarize_file <- function(file) {
  lines <- readLines(file, warn = FALSE)
  num_lines <- length(lines)
  num_words <- sum(str_count(lines, '\\w+'))
  list(lines = num_lines, words = num_words)
}

# Summarize each file
file_names <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
summaries <- lapply(file_names, summarize_file)

# Print summaries
names(summaries) <- file_names
summaries
## $en_US.blogs.txt
## $en_US.blogs.txt$lines
## [1] 899288
## 
## $en_US.blogs.txt$words
## [1] 38309620
## 
## 
## $en_US.news.txt
## $en_US.news.txt$lines
## [1] 1010242
## 
## $en_US.news.txt$words
## [1] 35624454
## 
## 
## $en_US.twitter.txt
## $en_US.twitter.txt$lines
## [1] 2360148
## 
## $en_US.twitter.txt$words
## [1] 31003501

Exploratory Data Analysis (EDA)
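
The code in this and the following sections operates on data_sample, a sampled data frame with one document per row in a text column. The sampling step itself is not shown above; a minimal sketch, assuming a random 1% of lines from each file (the sample_lines helper and the 1% fraction are illustrative assumptions), is:

# Assumed sampling step: draw a small random sample of lines from each source
# and combine them into a single data frame with a `text` column, which is
# what the tokenization code below expects.
set.seed(123)

sample_lines <- function(file, frac = 0.01) {  # `frac` is an illustrative sampling fraction
  lines <- readLines(file, warn = FALSE)
  sample(lines, size = ceiling(length(lines) * frac))
}

data_sample <- bind_rows(lapply(file_names, function(f) {
  tibble(source = f, text = sample_lines(f))
}))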

Distribution of Word Counts per Document
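
Document lengths differ sharply across the three sources: the per-file totals above imply roughly 13 words per line for Twitter versus about 35 for news and 43 for blogs. A minimal sketch of one way to examine this distribution, assuming the data_sample frame sketched above (the doc_word_counts name is illustrative):

# Words per document (line), plotted on a log10 x-axis because lengths are
# heavily right-skewed
doc_word_counts <- data_sample %>%
  mutate(word_count = str_count(text, "\\w+")) %>%
  filter(word_count > 0)

ggplot(doc_word_counts, aes(x = word_count)) +
  geom_histogram(bins = 50, fill = "steelblue") +
  scale_x_log10() +
  theme_minimal() +
  labs(title = "Distribution of Word Counts per Document",
       x = "Words per document (log10 scale)", y = "Number of documents")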

Most Frequent Words, Bigrams, and Trigrams

# Tokenize the sample into single words, drop stop words, and keep a stemmed
# form alongside the raw word for reference
tokenized_data <- data_sample %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word) %>%
  mutate(stemmed_word = wordStem(word))

# Unigram Analysis: counts are based on the raw word form
unigrams <- tokenized_data %>%
  count(word, sort = TRUE)

top_unigrams <- unigrams %>%
  top_n(20, n)

ggplot(top_unigrams, aes(x = reorder(word, n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top 20 Most Frequent Unigrams",
       x = "Unigram", y = "Frequency")

# Bigram Analysis
bigrams <- data_sample %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!is.na(word1) & !is.na(word2)) %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)

top_bigrams <- bigrams %>%
  top_n(20, n)

ggplot(top_bigrams, aes(x = reorder(paste(word1, word2, sep = " "), n), y = n)) +
  geom_bar(stat = "identity", fill = "coral") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top 20 Most Frequent Bigrams",
       x = "Bigram", y = "Frequency")

# Trigram Analysis
trigrams <- data_sample %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>%
  filter(!is.na(word1) & !is.na(word2) & !is.na(word3)) %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word, !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)

top_trigrams <- trigrams %>%
  top_n(20, n)

ggplot(top_trigrams, aes(x = reorder(paste(word1, word2, word3, sep = " "), n), y = n)) +
  geom_bar(stat = "identity", fill = "purple") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top 20 Most Frequent Trigrams",
       x = "Trigram", y = "Frequency")

Word Cloud Visualization

# Tokenize the sample into single words for the word cloud
tokenized_words <- data_sample %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)

# Word Cloud for Most Common Words
word_freq <- tokenized_words %>%
  count(word, sort = TRUE)

set.seed(123)
wordcloud::wordcloud(words = word_freq$word, freq = word_freq$n, min.freq = 50,
          max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))

Modeling

N-gram Model Building

# Rebuild the unigram, bigram, and trigram frequency tables for the model
# (same tokenization as in the EDA, without the stemmed column)
tokenized_data <- data_sample %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)

unigrams <- tokenized_data %>%
  count(word, sort = TRUE)

bigrams <- data_sample %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!is.na(word1) & !is.na(word2)) %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)

trigrams <- data_sample %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>%
  filter(!is.na(word1) & !is.na(word2) & !is.na(word3)) %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word, !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)

# Save n-gram models to files for later use
save(unigrams, file = "unigrams.RData")
save(bigrams, file = "bigrams.RData")
save(trigrams, file = "trigrams.RData")
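
Saving the tables means a later session (for example, a prediction app) can reload them with load() instead of re-tokenizing the corpus:

# Reload the frequency tables in a later session or app
load("unigrams.RData")
load("bigrams.RData")
load("trigrams.RData")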

Handling Unseen N-grams with Stupid Backoff

# Simplified Stupid Backoff prediction: return the most frequent continuation
# from the highest-order n-gram table that matches the context. The `alpha`
# argument is the Stupid Backoff discount; it only matters when scores from
# different orders are compared, so it has no effect on this greedy variant
# (see the scored sketch below).
predict_next_word_stupid_backoff <- function(previous_words, unigrams, bigrams, trigrams, alpha = 0.4) {
  # Lower-case the context to match the lower-cased n-gram tables
  words <- unlist(strsplit(tolower(previous_words), "\\s+"))
  len <- length(words)
  
  if (len >= 2) {
    # Try the trigram table: last two words as context
    trigram <- filter(trigrams, word1 == words[len-1], word2 == words[len])
    if (nrow(trigram) > 0) {
      trigram <- trigram %>% arrange(desc(n))
      return(trigram$word3[1])
    }
  }
  
  if (len >= 1) {
    # Back off to the bigram table: last word as context
    bigram <- filter(bigrams, word1 == words[len])
    if (nrow(bigram) > 0) {
      bigram <- bigram %>% arrange(desc(n))
      return(bigram$word2[1])
    }
  }
  
  # Back off to the most frequent unigram
  unigram <- unigrams %>% arrange(desc(n))
  return(unigram$word[1])
}

# Example prediction with Stupid Backoff model
previous_words <- "the quick"
next_word <- predict_next_word_stupid_backoff(previous_words, unigrams, bigrams, trigrams, alpha = 0.4)
cat("Predicted next word with Stupid Backoff:", next_word, "\n")
## Predicted next word with Stupid Backoff: check
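
The function above simply returns the most frequent continuation from the highest-order n-gram table that matches, so alpha never comes into play. In full Stupid Backoff, candidates from lower-order models compete with discounted scores: trigram continuations are scored by relative frequency, bigram continuations by alpha times their relative frequency, and unigrams by alpha squared times theirs. A minimal sketch of that scoring over the same tables (the score_stupid_backoff name and top_k argument are illustrative, and relative frequencies are computed within the matched context):

# Full Stupid Backoff scoring: trigram candidates use relative frequency,
# bigram candidates are discounted by alpha, unigram candidates by alpha^2.
score_stupid_backoff <- function(previous_words, unigrams, bigrams, trigrams,
                                 alpha = 0.4, top_k = 5) {
  words <- unlist(strsplit(tolower(previous_words), "\\s+"))
  len <- length(words)
  candidates <- list()

  if (len >= 2) {
    # Trigram candidates: relative frequency within the matched two-word context
    context <- filter(trigrams, word1 == words[len - 1], word2 == words[len])
    if (nrow(context) > 0) {
      candidates$tri <- context %>% transmute(word = word3, score = n / sum(n))
    }
  }

  if (len >= 1) {
    # Bigram candidates: discounted once by alpha
    context <- filter(bigrams, word1 == words[len])
    if (nrow(context) > 0) {
      candidates$bi <- context %>% transmute(word = word2, score = alpha * n / sum(n))
    }
  }

  # Unigram candidates: discounted by alpha^2
  candidates$uni <- unigrams %>% transmute(word = word, score = alpha^2 * n / sum(n))

  # Keep each word's best score across orders and return the top candidates
  bind_rows(candidates) %>%
    group_by(word) %>%
    summarise(score = max(score), .groups = "drop") %>%
    arrange(desc(score)) %>%
    slice_head(n = top_k)
}

# Example: ranked candidates for the same context
score_stupid_backoff("the quick", unigrams, bigrams, trigrams)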

Conclusion

Summary of Findings:

We conducted an exploratory analysis of text data from blogs, news, and Twitter. We examined the distribution of word counts per document and the most frequent words, bigrams, and trigrams, and visualized the results. We then built unigram, bigram, and trigram frequency tables and implemented a Stupid Backoff strategy to handle unseen n-grams.

Next Steps:

Further refine the model by experimenting with more sophisticated smoothing techniques such as Kneser-Ney smoothing. Evaluate the model’s performance on a larger and more diverse dataset. Explore the use of advanced machine learning techniques for text prediction.