Overview

This milestone report presents an exploratory data analysis (EDA) of the HC Corpora dataset provided by SwiftKey. The ultimate goal is to build a next-word prediction algorithm and deploy it as a Shiny web application. This report summarizes the key features of the data, notable findings, and our planned approach for the prediction model.


1. Loading the Data

# Folder holding the en_US corpus files. The trailing slash is required
# because file names are appended with paste0().
path <- "final/en_US/"

# Read one corpus file into a character vector (one element per line).
#   filename: name of a file inside `path`.
# Returns the lines of the file as a character vector.
# The connection is closed through on.exit(), so it is released even when
# readLines() stops with an error.
read_corpus <- function(filename) {
  con <- file(paste0(path, filename), "r", encoding = "UTF-8")
  on.exit(close(con), add = TRUE)
  # skipNul = TRUE: skip embedded NUL bytes while reading
  readLines(con, skipNul = TRUE)
}

# Read files (using readLines for efficiency)
blogs   <- read_corpus("en_US.blogs.txt")
news    <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")

cat("Data loaded successfully!\n")
## Data loaded successfully!
cat("Blogs lines:",   length(blogs),   "\n")
## Blogs lines: 899288
cat("News lines:",    length(news),    "\n")
## News lines: 1010206
cat("Twitter lines:", length(twitter), "\n")
## Twitter lines: 2360148

2. Basic Summary Statistics

library(stringr)
library(knitr)

# Total number of whitespace-delimited tokens in a character vector.
#   x: character vector (one element per line).
# Returns a single number: the per-element token counts added together.
word_count <- function(x) {
  tokens_per_line <- str_count(x, "\\S+")
  sum(tokens_per_line)
}

# Size of a corpus file in megabytes (bytes / 1024^2), rounded to 1 decimal.
#   filename: name of a file inside the global `path` folder.
file_size <- function(filename) {
  full_name <- paste0(path, filename)
  bytes <- file.size(full_name)
  round(bytes / 1024^2, 1)
}

# Per-corpus line and word totals, computed once and reused below.
# word_count() scans every line of a corpus, so calling it a second time just
# to derive the average would double the most expensive step of this chunk.
line_counts <- c(length(blogs), length(news), length(twitter))
word_counts <- c(word_count(blogs), word_count(news), word_count(twitter))

# One row per corpus: file size on disk, number of lines, number of words,
# mean words per line, and the length of the longest line in characters.
summary_df <- data.frame(
  File     = c("Blogs", "News", "Twitter"),
  Size_MB  = c(file_size("en_US.blogs.txt"),
               file_size("en_US.news.txt"),
               file_size("en_US.twitter.txt")),
  Lines    = line_counts,
  Words    = word_counts,
  Avg_Words_Per_Line = round(word_counts / line_counts, 1),
  Max_Chars = c(max(nchar(blogs)), max(nchar(news)), max(nchar(twitter)))
)

kable(summary_df, caption = "Table 1: Summary Statistics of the Three Corpora")
Table 1: Summary Statistics of the Three Corpora
File Size_MB Lines Words Avg_Words_Per_Line Max_Chars
Blogs 200.4 899288 37334131 41.5 40833
News 196.3 1010206 34371031 34.0 11384
Twitter 159.4 2360148 30373583 12.9 144

3. Sampling the Data

Since the files are very large, we work with a random 1% sample for efficiency.

# Fixed seed so the same lines are drawn on every run.
set.seed(42)
sample_pct <- 0.01

# Number of lines to keep from each corpus (1% of its length, rounded).
n_blogs   <- round(length(blogs)   * sample_pct)
n_news    <- round(length(news)    * sample_pct)
n_twitter <- round(length(twitter) * sample_pct)

# Draw without replacement. The order blogs -> news -> twitter is kept
# because each draw advances the random number stream.
blogs_s   <- sample(blogs,   size = n_blogs)
news_s    <- sample(news,    size = n_news)
twitter_s <- sample(twitter, size = n_twitter)

# Pool the three samples into one character vector.
all_text <- c(blogs_s, news_s, twitter_s)
cat("Sample size:", length(all_text), "lines\n")
## Sample size: 42696 lines

4. Data Cleaning

library(tm)

# Wrap every sampled line in its own document.
corpus <- VCorpus(VectorSource(all_text))

# Normalisation steps that take no extra arguments, applied in this order:
# lower-case, strip punctuation, strip digits.
basic_steps <- list(
  content_transformer(tolower),
  removePunctuation,
  removeNumbers
)
for (step in basic_steps) {
  corpus <- tm_map(corpus, step)
}

# Drop English stop words, then collapse the whitespace left behind.
# NOTE(review): stopwords("en") presumably contains contractions written with
# an apostrophe; since punctuation is stripped first, those forms may no longer
# match here - confirm whether stop-word removal should run before
# removePunctuation.
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)

cat("Corpus cleaned.\n")
## Corpus cleaned.

5. Exploratory Visualizations

5a. Distribution of Line Lengths (Characters)

library(ggplot2)

# Character length of every sampled line, labelled with its source corpus.
sampled_sets <- list(Blogs = blogs_s, News = news_s, Twitter = twitter_s)

len_df <- data.frame(
  chars  = unlist(lapply(sampled_sets, nchar), use.names = FALSE),
  source = rep(names(sampled_sets), times = lengths(sampled_sets))
)

# One histogram panel per source. Axes are free because the three corpora
# have very different line-length ranges.
ggplot(data = len_df, mapping = aes(x = chars, fill = source)) +
  geom_histogram(position = "identity", bins = 50, alpha = 0.7) +
  facet_wrap(~source, scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(x = "Characters per Line", y = "Count",
       title = "Distribution of Line Lengths by Source")

5b. Top 20 Most Frequent Words (Unigrams)

library(tidytext)
library(dplyr)

# One row per cleaned document. vapply() (instead of sapply()) guarantees a
# plain character vector and stops with an error if any document does not
# reduce to exactly one string, rather than silently returning a list.
text_df <- data.frame(
  text = vapply(corpus, as.character, character(1)),
  stringsAsFactors = FALSE
)

# Split into single words, count each word, keep the 20 most frequent
# (top_n keeps ties, so more than 20 rows are possible).
unigrams <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  top_n(20, n)

# Horizontal bar chart, most frequent word at the top.
# geom_col() is the direct form of geom_bar(stat = "identity").
ggplot(unigrams, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words (after stopword removal)",
       x = "Word", y = "Frequency") +
  theme_minimal()

5c. Top 15 Bigrams

# Tokenise the cleaned text into two-word sequences. Documents too short to
# form a bigram yield NA, which is dropped before counting.
bigram_tokens <- unnest_tokens(text_df, bigram, text, token = "ngrams", n = 2)
bigram_counts <- count(filter(bigram_tokens, !is.na(bigram)), bigram, sort = TRUE)

# Keep the 15 most frequent bigrams (ties included).
bigrams <- top_n(bigram_counts, 15, n)

# Horizontal bar chart, most frequent bigram at the top.
ggplot(data = bigrams, mapping = aes(x = reorder(bigram, n), y = n)) +
  geom_col(fill = "tomato") +
  coord_flip() +
  theme_minimal() +
  labs(x = "Bigram", y = "Frequency",
       title = "Top 15 Bigrams")

5d. Top 15 Trigrams

# Tokenise the cleaned text into three-word sequences. Documents too short to
# form a trigram yield NA, which is dropped before counting.
trigram_tokens <- unnest_tokens(text_df, trigram, text, token = "ngrams", n = 3)
trigram_counts <- count(filter(trigram_tokens, !is.na(trigram)), trigram, sort = TRUE)

# Keep the 15 most frequent trigrams (ties included).
trigrams <- top_n(trigram_counts, 15, n)

# Horizontal bar chart, most frequent trigram at the top.
ggplot(data = trigrams, mapping = aes(x = reorder(trigram, n), y = n)) +
  geom_col(fill = "seagreen") +
  coord_flip() +
  theme_minimal() +
  labs(x = "Trigram", y = "Frequency",
       title = "Top 15 Trigrams")

5e. Word Cloud

library(wordcloud)
library(RColorBrewer)

# Build the cloud from the full word-frequency table. `unigrams` has already
# been cut down to the top 20 words, so feeding it to wordcloud() would make
# max.words = 100 and min.freq = 2 ineffective and cap the cloud at 20 terms.
cloud_words <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)

# random.order = FALSE plots the most frequent words first (toward the centre).
wordcloud(words = cloud_words$word, freq = cloud_words$n,
          min.freq = 2, max.words = 100,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
title("Word Cloud - Most Frequent Terms")


6. Coverage Analysis

How many unique words are needed to cover 50% and 90% of all word instances?

# Frequency table of every word in the cleaned sample, most frequent first,
# with the running share (in %) of all word instances covered so far.
word_freq <- count(unnest_tokens(text_df, word, text), word, sort = TRUE)
all_words <- mutate(word_freq, cumulative_pct = cumsum(n) / sum(n) * 100)

# Rank of the first word at which the cumulative share reaches `target_pct`,
# i.e. how many of the most frequent words are needed for that coverage.
words_for_coverage <- function(target_pct) {
  min(which(all_words$cumulative_pct >= target_pct))
}

cover_50 <- words_for_coverage(50)
cover_90 <- words_for_coverage(90)

cat("Words needed to cover 50% of instances:", cover_50, "\n")
## Words needed to cover 50% of instances: 987
cat("Words needed to cover 90% of instances:", cover_90, "\n")
## Words needed to cover 90% of instances: 15004
# Plot coverage curve for the 5000 most frequent words (or all of them when
# fewer exist). head() is used instead of 1:min(5000, nrow(all_words)) because
# the 1:n form evaluates to c(1, 0) when the table is empty and would select a
# spurious row.
coverage_top <- head(all_words, 5000)

# Dashed reference lines mark the 50% (orange) and 90% (red) coverage levels.
ggplot(coverage_top,
       aes(x = seq_along(word), y = cumulative_pct)) +
  geom_line(color = "steelblue", size = 1) +
  geom_hline(yintercept = 50, linetype = "dashed", color = "orange") +
  geom_hline(yintercept = 90, linetype = "dashed", color = "red") +
  labs(title = "Word Coverage Curve",
       x = "Number of Unique Words (Ranked by Frequency)",
       y = "Cumulative % of Word Instances") +
  theme_minimal()


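7. Key Findings

  • The three files are of comparable size (159.4–200.4 MB) and each contains roughly 30–37 million words.
  • Twitter has by far the most lines (2,360,148) but the shortest ones: 12.9 words per line on average and at most 144 characters. Blog entries are the longest (41.5 words per line on average, up to 40,833 characters), followed by news (34.0 words per line).
  • In the 1% sample (42,696 lines), after stop-word removal, only 987 unique words account for 50% of all word instances, while 15,004 are needed to reach 90%. Word frequencies are therefore heavily skewed, with a long tail of rare words.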


8. Plan for Prediction Algorithm & Shiny App

Algorithm

We will build an N-gram language model (unigram, bigram, trigram, and optionally 4-gram) with Stupid Backoff smoothing:

  1. Tokenize and build frequency tables for N-grams (N = 1 to 4)
  2. Given user input, look up the last 1–3 words in the N-gram tables
  3. Return the top-3 most probable next words
  4. Fall back to lower-order N-grams if no match is found

Shiny App

The app will feature:

  • A text input box where users type a sentence
  • Real-time prediction of the next word (top 3 suggestions)
  • A clean, mobile-friendly UI

Optimization

  • Store N-gram tables as compressed .rds files for fast loading
  • Filter low-frequency N-grams (frequency < 2) to reduce memory usage

9. Conclusion

This EDA confirms that the dataset is rich and well-suited for building an NLP prediction model. The next steps are to build optimized N-gram frequency tables and deploy the Shiny app with a responsive prediction engine.