This milestone report presents an exploratory data analysis (EDA) of the HC Corpora dataset provided by SwiftKey. The ultimate goal is to build a next-word prediction algorithm and deploy it as a Shiny web application. This report summarizes the key features of the data, notable findings, and our planned approach for the prediction model.
# Set path data folder
path <- "final/en_US/"

# Read one corpus file located under `path` and return its lines as a
# character vector. The file is opened as a UTF-8 text connection and
# embedded NUL bytes are skipped. on.exit() guarantees the connection is
# closed even when readLines() fails part-way through a file.
read_corpus <- function(filename) {
  con <- file(paste0(path, filename), "r", encoding = "UTF-8")
  on.exit(close(con), add = TRUE)
  readLines(con, skipNul = TRUE)
}

# Read files (using readLines for efficiency)
blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")
cat("Data loaded successfully!\n")
## Data loaded successfully!
cat("Blogs lines:", length(blogs), "\n")
## Blogs lines: 899288
cat("News lines:", length(news), "\n")
## News lines: 1010206
cat("Twitter lines:", length(twitter), "\n")
## Twitter lines: 2360148
library(stringr)
library(knitr)
# Total number of whitespace-delimited tokens across all elements of the
# character vector `x` (each maximal run of non-whitespace counts as one).
word_count <- function(x) {
  tokens_per_element <- str_count(x, "\\S+")
  sum(tokens_per_element)
}
# Size on disk of `filename` (looked up under the data folder `path`),
# expressed in megabytes (bytes / 1024^2) and rounded to one decimal place.
file_size <- function(filename) {
  size_bytes <- file.info(paste0(path, filename))$size
  round(size_bytes / 1024^2, 1)
}
# Summary table of the three full corpora: file size, line count, word
# count, average words per line and the longest line in characters.
#
# Line and word totals are computed once and reused for the average;
# word_count() scans every line of a corpus, so calling it a second time
# per corpus would double the most expensive step of this table.
line_counts <- c(length(blogs), length(news), length(twitter))
word_counts <- c(word_count(blogs), word_count(news), word_count(twitter))

summary_df <- data.frame(
  File = c("Blogs", "News", "Twitter"),
  Size_MB = c(
    file_size("en_US.blogs.txt"),
    file_size("en_US.news.txt"),
    file_size("en_US.twitter.txt")
  ),
  Lines = line_counts,
  Words = word_counts,
  Avg_Words_Per_Line = round(word_counts / line_counts, 1),
  Max_Chars = c(max(nchar(blogs)), max(nchar(news)), max(nchar(twitter)))
)
kable(summary_df, caption = "Table 1: Summary Statistics of the Three Corpora")
| File | Size_MB | Lines | Words | Avg_Words_Per_Line | Max_Chars |
|---|---|---|---|---|---|
| Blogs | 200.4 | 899288 | 37334131 | 41.5 | 40833 |
| News | 196.3 | 1010206 | 34371031 | 34.0 | 11384 |
| Twitter | 159.4 | 2360148 | 30373583 | 12.9 | 144 |
Since the files are very large, we work with a random 1% sample for efficiency.
# Fixed seed so the same lines are drawn on every run.
set.seed(42)
sample_pct <- 0.01

# Draw round(length(lines) * sample_pct) elements from one corpus.
draw_sample <- function(lines) {
  sample(lines, round(length(lines) * sample_pct))
}

# The draw order (blogs, news, twitter) determines which lines each call
# receives from the random stream, so it must stay as is.
blogs_s <- draw_sample(blogs)
news_s <- draw_sample(news)
twitter_s <- draw_sample(twitter)

all_text <- c(blogs_s, news_s, twitter_s)
cat("Sample size:", length(all_text), "lines\n")
## Sample size: 42696 lines
library(tm)
# Build a corpus from the sampled lines and clean it.
# The cleaning steps run strictly in the order listed: lower-case,
# strip punctuation, strip digits, drop English stop words, then collapse
# the whitespace left behind by the removals.
# NOTE(review): punctuation is removed before the stop-word pass, so any
# stop-word entries that contain an apostrophe would presumably no longer
# match by then - confirm whether that is intended.
cleaning_steps <- list(
  function(docs) tm_map(docs, content_transformer(tolower)),
  function(docs) tm_map(docs, removePunctuation),
  function(docs) tm_map(docs, removeNumbers),
  function(docs) tm_map(docs, removeWords, stopwords("en")),
  function(docs) tm_map(docs, stripWhitespace)
)
corpus <- VCorpus(VectorSource(all_text))
for (apply_step in cleaning_steps) {
  corpus <- apply_step(corpus)
}
cat("Corpus cleaned.\n")
## Corpus cleaned.
library(ggplot2)
# Character length of every sampled line, labelled with the corpus it was
# drawn from.
source_sizes <- c(
  Blogs = length(blogs_s),
  News = length(news_s),
  Twitter = length(twitter_s)
)
len_df <- data.frame(
  chars = nchar(c(blogs_s, news_s, twitter_s)),
  source = rep(names(source_sizes), times = source_sizes)
)

# One histogram panel per source; axes are free per panel, and the legend
# is hidden since the panel titles already name the source.
length_hist <- ggplot(len_df, aes(x = chars, fill = source)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  facet_wrap(~source, scales = "free") +
  ggtitle("Distribution of Line Lengths by Source") +
  xlab("Characters per Line") +
  ylab("Count") +
  theme_minimal() +
  theme(legend.position = "none")
length_hist
library(tidytext)
library(dplyr)
# Flatten the cleaned corpus into a one-column data frame of text.
# vapply() pins the result to exactly one string per document, so an
# unexpected document shape fails loudly here instead of silently turning
# the column into a list the way sapply() would.
text_df <- data.frame(
  text = vapply(corpus, as.character, character(1)),
  stringsAsFactors = FALSE
)

# Single-word frequencies, keeping the 20 most frequent words.
unigrams <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  top_n(20, n)

# Horizontal bar chart with the most frequent word at the top.
ggplot(unigrams, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words (after stopword removal)",
       x = "Word", y = "Frequency") +
  theme_minimal()
# Frequencies of two-word sequences in the cleaned text; rows where no
# bigram could be formed (NA) are dropped before counting.
bigram_counts <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)
bigrams <- top_n(bigram_counts, 15, n)

# Horizontal bar chart, bars ordered by frequency.
bigram_plot <- ggplot(bigrams, aes(x = reorder(bigram, n), y = n)) +
  geom_col(fill = "tomato") +
  coord_flip() +
  ggtitle("Top 15 Bigrams") +
  xlab("Bigram") +
  ylab("Frequency") +
  theme_minimal()
bigram_plot
# Frequencies of three-word sequences in the cleaned text; rows where no
# trigram could be formed (NA) are dropped before counting.
trigram_counts <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  count(trigram, sort = TRUE)
trigrams <- top_n(trigram_counts, 15, n)

# Horizontal bar chart, bars ordered by frequency.
trigram_plot <- ggplot(trigrams, aes(x = reorder(trigram, n), y = n)) +
  geom_col(fill = "seagreen") +
  coord_flip() +
  ggtitle("Top 15 Trigrams") +
  xlab("Trigram") +
  ylab("Frequency") +
  theme_minimal()
trigram_plot
library(wordcloud)
library(RColorBrewer)
# Word cloud of the most frequent terms in the cleaned sample.
# `unigrams` was already cut down to the top 20 words, so feeding it here
# would make max.words = 100 and min.freq = 2 meaningless. The cloud is
# drawn from the full frequency table instead, pre-trimmed to the 100 most
# frequent words so the call stays cheap.
cloud_terms <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  head(100)
wordcloud(words = cloud_terms$word, freq = cloud_terms$n,
          min.freq = 2, max.words = 100,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
title("Word Cloud - Most Frequent Terms")
How many unique words are needed to cover 50% and 90% of all word instances?
# Word frequencies in descending order, with the running percentage of all
# word instances covered by the words up to and including each row.
# text_df holds the cleaned sample, so coverage is measured on that text.
all_words <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(cumulative_pct = cumsum(n) / sum(n) * 100)
# First rank at which the running coverage reaches each threshold.
cover_50 <- min(which(all_words$cumulative_pct >= 50))
cover_90 <- min(which(all_words$cumulative_pct >= 90))
cat("Words needed to cover 50% of instances:", cover_50, "\n")
## Words needed to cover 50% of instances: 987
cat("Words needed to cover 90% of instances:", cover_90, "\n")
## Words needed to cover 90% of instances: 15004
# Plot coverage curve
# head() keeps at most the 5000 most frequent words and, unlike indexing
# with 1:min(5000, nrow(...)), stays correct when the table has no rows.
coverage_top <- head(all_words, 5000)
coverage_top$rank <- seq_len(nrow(coverage_top))
ggplot(coverage_top, aes(x = rank, y = cumulative_pct)) +
  geom_line(color = "steelblue", size = 1) +
  geom_hline(yintercept = 50, linetype = "dashed", color = "orange") +
  geom_hline(yintercept = 90, linetype = "dashed", color = "red") +
  labs(title = "Word Coverage Curve",
       x = "Number of Unique Words (Ranked by Frequency)",
       y = "Cumulative % of Word Instances") +
  theme_minimal()
We will build an N-gram language model (unigram, bigram, trigram, and optionally 4-gram) with Stupid Backoff smoothing.
The app will feature:

- A text input box where users type a sentence
- Real-time prediction of the next word (top 3 suggestions)
- A clean, mobile-friendly UI
The N-gram frequency tables will be stored as .rds files for fast loading.

This EDA confirms that the dataset is rich and well-suited for building an NLP prediction model. The next steps are to build optimized N-gram frequency tables and deploy the Shiny app with a responsive prediction engine.