NLP Capstone: Exploratory Data Analysis

Overview

This milestone report presents an exploratory data analysis (EDA) of the HC Corpora dataset provided by SwiftKey. The ultimate goal is to build a next-word prediction algorithm and deploy it as a Shiny web application. This report summarizes the key features of the data, notable findings, and our planned approach for the prediction model.

1. Loading the Data

# Directory holding the en_US corpus files (trailing slash is required,
# because file names are appended with paste0)
path <- "final/en_US/"

# Read each corpus line by line through an explicit UTF-8 text connection.
# skipNul = TRUE skips embedded NUL bytes; each connection is closed as soon
# as its lines are in memory.
con <- file(paste0(path, "en_US.blogs.txt"), "r", encoding = "UTF-8")
blogs <- readLines(con, skipNul = TRUE)
close(con)

con <- file(paste0(path, "en_US.news.txt"), "r", encoding = "UTF-8")
news <- readLines(con, skipNul = TRUE)
close(con)

con <- file(paste0(path, "en_US.twitter.txt"), "r", encoding = "UTF-8")
twitter <- readLines(con, skipNul = TRUE)
close(con)

cat("Data loaded successfully!\n")

## Data loaded successfully!

cat("Blogs lines:", length(blogs), "\n")

## Blogs lines: 899288

cat("News lines:", length(news), "\n")

## News lines: 1010206

cat("Twitter lines:", length(twitter), "\n")

## Twitter lines: 2360148

2. Basic Summary Statistics

library(stringr)
library(knitr)

# Total number of whitespace-delimited tokens across a character vector
word_count <- function(x) {
  tokens_per_line <- str_count(x, "\\S+")
  sum(tokens_per_line)
}

# File sizes in MB
#
# Returns the on-disk size of `filename` located in `dir`, expressed in
# units of 1024^2 bytes and rounded to one decimal place.
#   filename: file name relative to `dir`
#   dir:      directory prefix; defaults to the global `path`, so existing
#             one-argument calls behave exactly as before
# A file that does not exist yields NA (as reported by file.size()).
file_size <- function(filename, dir = path) {
  # An empty prefix or one that already ends in a separator is pasted as-is
  # (the original behaviour); otherwise let file.path() insert the separator.
  needs_separator <- nzchar(dir) && !grepl("[/\\\\]$", dir)
  full_path <- if (needs_separator) {
    file.path(dir, filename)
  } else {
    paste0(dir, filename)
  }
  round(file.size(full_path) / 1024^2, 1)
}

# Per-corpus line and word totals, computed once and reused below.
# Each word_count() call scans an entire corpus, so the counts are not
# recomputed for the average.
n_lines <- c(length(blogs), length(news), length(twitter))
n_words <- c(word_count(blogs), word_count(news), word_count(twitter))

# One row per corpus: file size, line/word totals, mean words per line and
# the length in characters of the longest line
summary_df <- data.frame(
  File     = c("Blogs", "News", "Twitter"),
  Size_MB  = c(file_size("en_US.blogs.txt"),
               file_size("en_US.news.txt"),
               file_size("en_US.twitter.txt")),
  Lines    = n_lines,
  Words    = n_words,
  Avg_Words_Per_Line = round(n_words / n_lines, 1),
  Max_Chars = c(max(nchar(blogs)), max(nchar(news)), max(nchar(twitter)))
)

kable(summary_df, caption = "Table 1: Summary Statistics of the Three Corpora")

Table 1: Summary Statistics of the Three Corpora
File	Size_MB	Lines	Words	Avg_Words_Per_Line	Max_Chars
Blogs	200.4	899288	37334131	41.5	40833
News	196.3	1010206	34371031	34.0	11384
Twitter	159.4	2360148	30373583	12.9	144

3. Sampling the Data

Since the files are very large, we work with a random 1% sample for efficiency.

# Fix the RNG seed so the same lines are drawn on every run
set.seed(42)
sample_pct <- 0.01

# Draw a sample_pct share of the lines of one corpus, without replacement
draw_sample <- function(lines) {
  n_keep <- round(length(lines) * sample_pct)
  sample(lines, n_keep)
}

# Call order matters: every draw advances the shared RNG stream
blogs_s   <- draw_sample(blogs)
news_s    <- draw_sample(news)
twitter_s <- draw_sample(twitter)

# Pool the three samples into one character vector
all_text <- c(blogs_s, news_s, twitter_s)
cat("Sample size:", length(all_text), "lines\n")

## Sample size: 42696 lines

4. Data Cleaning

library(tm)

# Build a corpus with one document per sampled line
corpus <- VCorpus(VectorSource(all_text))

# Cleaning steps, applied strictly in the order listed: lower-case, strip
# punctuation, strip digits, drop English stop words, collapse whitespace.
# NOTE(review): punctuation is stripped before stop words are removed, so
# stop words that contain an apostrophe presumably no longer match -- confirm
# whether the two steps should be swapped.
cleaning_steps <- list(
  function(docs) tm_map(docs, content_transformer(tolower)),
  function(docs) tm_map(docs, removePunctuation),
  function(docs) tm_map(docs, removeNumbers),
  function(docs) tm_map(docs, removeWords, stopwords("en")),
  function(docs) tm_map(docs, stripWhitespace)
)
for (apply_step in cleaning_steps) {
  corpus <- apply_step(corpus)
}

cat("Corpus cleaned.\n")

## Corpus cleaned.

5. Exploratory Visualizations

5a. Distribution of Line Lengths (Characters)

library(ggplot2)

# Character length of every sampled line, labelled with its source corpus
sample_sizes <- c(length(blogs_s), length(news_s), length(twitter_s))
len_df <- data.frame(
  chars  = nchar(c(blogs_s, news_s, twitter_s)),
  source = rep(c("Blogs", "News", "Twitter"), times = sample_sizes)
)

# One histogram panel per source; axes are free because the three sources
# are faceted separately and the fill legend would only repeat the facet labels
line_length_plot <- ggplot(len_df, aes(x = chars, fill = source)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  facet_wrap(~source, scales = "free") +
  ggtitle("Distribution of Line Lengths by Source") +
  xlab("Characters per Line") +
  ylab("Count") +
  theme_minimal() +
  theme(legend.position = "none")
line_length_plot

5b. Top 20 Most Frequent Words (Unigrams)

library(tidytext)
library(dplyr)

# One row per cleaned document. vapply() guarantees a character vector,
# whereas sapply() would return a list for an empty corpus or for any
# document whose content is not a single string.
text_df <- data.frame(
  text = vapply(corpus, as.character, character(1)),
  stringsAsFactors = FALSE
)

# The 20 most frequent words of the cleaned sample (stop words were removed
# when the corpus was cleaned)
unigrams <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  top_n(20, n)

# Horizontal bars, most frequent word on top
ggplot(unigrams, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words (after stopword removal)",
       x = "Word", y = "Frequency") +
  theme_minimal()

5c. Top 15 Bigrams

# The 15 most frequent two-word sequences in the cleaned text.
# text_df holds the text after stop-word removal, so a bigram here can join
# two words that were separated by stop words in the raw text; the plot
# title says so, matching the unigram chart.
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%   # documents shorter than two words yield NA
  count(bigram, sort = TRUE) %>%
  top_n(15, n)

# Horizontal bars, most frequent bigram on top
ggplot(bigrams, aes(x = reorder(bigram, n), y = n)) +
  geom_col(fill = "tomato") +
  coord_flip() +
  labs(title = "Top 15 Bigrams (after stopword removal)",
       x = "Bigram", y = "Frequency") +
  theme_minimal()

5d. Top 15 Trigrams

# The 15 most frequent three-word sequences in the cleaned text.
# text_df holds the text after stop-word removal, so a trigram here can join
# words that were separated by stop words in the raw text; the plot title
# says so, matching the unigram chart.
trigrams <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%   # documents shorter than three words yield NA
  count(trigram, sort = TRUE) %>%
  top_n(15, n)

# Horizontal bars, most frequent trigram on top
ggplot(trigrams, aes(x = reorder(trigram, n), y = n)) +
  geom_col(fill = "seagreen") +
  coord_flip() +
  labs(title = "Top 15 Trigrams (after stopword removal)",
       x = "Trigram", y = "Frequency") +
  theme_minimal()

5e. Word Cloud

library(wordcloud)
library(RColorBrewer)

# Rank every word of the cleaned sample and keep the 100 most frequent.
# The `unigrams` table holds only the top 20 words, which is too few to
# fill a cloud limited by max.words = 100.
cloud_words <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  head(100)

# Most frequent words are placed first (random.order = FALSE)
wordcloud(words = cloud_words$word, freq = cloud_words$n,
          min.freq = 2, max.words = 100,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
title("Word Cloud - Most Frequent Terms")

6. Coverage Analysis

How many unique words are needed to cover 50% and 90% of all word instances in the cleaned sample (i.e., after stop-word removal)?

# Frequency-ranked vocabulary of the cleaned sample, with the running share
# of word instances covered by the words up to each rank
# (text_df holds the text after stop-word removal)
word_tokens <- unnest_tokens(text_df, word, text)
all_words <- count(word_tokens, word, sort = TRUE)
all_words <- mutate(all_words, cumulative_pct = cumsum(n) / sum(n) * 100)

# Smallest rank at which the cumulative share reaches `target` percent
words_to_cover <- function(target) {
  min(which(all_words$cumulative_pct >= target))
}

cover_50 <- words_to_cover(50)
cover_90 <- words_to_cover(90)

cat("Words needed to cover 50% of instances:", cover_50, "\n")

## Words needed to cover 50% of instances: 987

cat("Words needed to cover 90% of instances:", cover_90, "\n")

## Words needed to cover 90% of instances: 15004

# Plot coverage curve
# Show at least the 5000 most frequent words, and extend the x-range a little
# past cover_90 so the curve actually reaches the dashed 90% reference line.
rank_limit <- 5000
if (is.finite(cover_90)) {
  rank_limit <- max(rank_limit, ceiling(cover_90 * 1.1))
}

# head() stays correct when all_words has fewer rows than rank_limit, or
# none at all (1:min(...) would index rows 1 and 0 on an empty table)
coverage_df <- head(all_words, rank_limit)
coverage_df$word_rank <- seq_len(nrow(coverage_df))

ggplot(coverage_df, aes(x = word_rank, y = cumulative_pct)) +
  geom_line(color = "steelblue", size = 1) +
  geom_hline(yintercept = 50, linetype = "dashed", color = "orange") +
  geom_hline(yintercept = 90, linetype = "dashed", color = "red") +
  labs(title = "Word Coverage Curve",
       x = "Number of Unique Words (Ranked by Frequency)",
       y = "Cumulative % of Word Instances") +
  theme_minimal()

7. Key Findings

Twitter has the highest number of lines but the shortest average line length; lines are nominally capped at 140 characters (the longest line in our data is 144 characters).
Blogs have the longest lines on average, reflecting long-form writing.
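A small vocabulary (~987 words) covers 50% of the word instances in the cleaned sample (after stop-word removal), while about 15,004 words are needed to cover 90% — Zipf’s Law in action.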
N-gram patterns show strong common phrases; these will directly inform the prediction model.
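After stop-word removal, the most frequent terms in the combined sample are content words; comparing them per source (blogs, news, Twitter) is a natural next step to reveal topic-specific vocabulary.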

8. Plan for Prediction Algorithm & Shiny App

Algorithm

We will build an N-gram language model (unigram, bigram, trigram, and optionally 4-gram) with Stupid Backoff smoothing:

Tokenize and build frequency tables for N-grams (N = 1 to 4)
Given user input, look up the last 1–3 words in the N-gram tables
Return the top-3 most probable next words
Fall back to lower-order N-grams if no match is found

Shiny App

The app will feature:

A text input box where users type a sentence
Real-time prediction of the next word (top 3 suggestions)
A clean, mobile-friendly UI

Optimization

Store N-gram tables as compressed .rds files for fast loading
Remove low-frequency N-grams (those seen fewer than 2 times, i.e., singletons) to reduce memory usage

9. Conclusion

This EDA confirms that the dataset is rich and well-suited for building an NLP prediction model. The next steps are to build optimized N-gram frequency tables and deploy the Shiny app with a responsive prediction engine.