This report summarises the exploratory analysis of the HC Corpora dataset provided for the Coursera Data Science Capstone. The goal is to build a predictive text model — similar to the suggestions on a smartphone keyboard — that predicts the next word a user is likely to type.
Key findings:
The dataset comprises text from three sources in four languages. This analysis focuses on the English (en_US) corpus.
data_dir <- "final"
sources <- c("blogs", "news", "twitter")
en_files <- file.path(data_dir, "en_US", paste0("en_US.", sources, ".txt"))
file_stats <- rbindlist(lapply(en_files, function(f) {
con <- file(f, open = "rb")
lines <- readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
close(con)
words <- stri_count_words(lines)
data.table(
Source = gsub("en_US\\.", "", basename(f)) |> gsub(pattern = "\\.txt", replacement = ""),
`Size (MB)` = round(file.info(f)$size / 1024^2, 1),
Lines = format(length(lines), big.mark = ","),
Words = format(sum(words), big.mark = ","),
`Avg words/line` = round(mean(words), 1),
`Max words/line` = format(max(words), big.mark = ",")
)
}))
kbl(file_stats, caption = "English corpus file summary") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Source | Size (MB) | Lines | Words | Avg words/line | Max words/line |
|---|---|---|---|---|---|
| blogs | 200.4 | 899,288 | 37,581,907 | 41.8 | 6,780 |
| news | 196.3 | 1,010,242 | 34,858,293 | 34.5 | 1,803 |
| 159.4 | 2,360,148 | 30,162,832 | 12.8 | 47 |
Observations:
Working with the full 555 MB English corpus is unnecessary for
exploratory analysis. We drew a reproducible 5% random
sample (~213,000 lines) using a coin-flip approach
(rbinom). The cleaning pipeline includes:
After cleaning, the sample contains ~5 million tokens with a vocabulary of ~134,000 unique words.
uni_freq <- readRDS("data/unigrams.rds")
bi_freq <- readRDS("data/bigrams.rds")
tri_freq <- readRDS("data/trigrams.rds")
top30 <- head(uni_freq, 30)
top30[, ngram := factor(ngram, levels = rev(ngram))]
ggplot(top30, aes(x = ngram, y = count)) +
geom_col(fill = "#3575b5", width = 0.7) +
geom_text(aes(label = comma(count)), hjust = -0.08, size = 3.2) +
coord_flip() +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
labs(title = "Top 30 Most Frequent Words",
subtitle = "English corpus, 5% sample (~5M tokens)",
x = NULL, y = "Frequency") +
theme_cap
The top words are function words (the, to, and, a, of), which is typical of English text.
freq_of_freq <- uni_freq[, .(n_words = .N), by = count][order(count)]
p1 <- ggplot(uni_freq[count <= 50], aes(x = count)) +
geom_histogram(binwidth = 1, fill = "#3575b5", color = "white") +
scale_y_continuous(labels = comma) +
labs(title = "Word Frequency Distribution",
subtitle = "Number of words appearing exactly k times (k <= 50)",
x = "Frequency (k)", y = "Unique words") +
theme_cap
p2 <- ggplot(freq_of_freq, aes(x = count, y = n_words)) +
geom_point(alpha = 0.3, size = 0.8, color = "#3575b5") +
scale_x_log10(labels = comma) + scale_y_log10(labels = comma) +
labs(title = "Frequency-of-Frequency (log-log)",
subtitle = "Classic Zipfian long-tail pattern",
x = "Word frequency (log)", y = "# words (log)") +
theme_cap
gridExtra::grid.arrange(p1, p2, ncol = 2)
Over 54% of the vocabulary consists of words that appear only once (hapax legomena), yet they account for less than 1.5% of all tokens. This extreme skew is the foundation of our model design.
top30_bi <- head(bi_freq, 30)
top30_bi[, ngram := factor(ngram, levels = rev(ngram))]
ggplot(top30_bi, aes(x = ngram, y = count)) +
geom_col(fill = "#e07b39", width = 0.7) +
geom_text(aes(label = comma(count)), hjust = -0.08, size = 3.2) +
coord_flip() +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
labs(title = "Top 30 Bigrams (Word Pairs)",
x = NULL, y = "Frequency") +
theme_cap
top30_tri <- head(tri_freq, 30)
top30_tri[, ngram := factor(ngram, levels = rev(ngram))]
ggplot(top30_tri, aes(x = ngram, y = count)) +
geom_col(fill = "#4aad5b", width = 0.7) +
geom_text(aes(label = comma(count)), hjust = -0.08, size = 3.2) +
coord_flip() +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
labs(title = "Top 30 Trigrams (Word Triples)",
x = NULL, y = "Frequency") +
theme_cap
kbl(data.table(
`N-gram` = c("Unigrams", "Bigrams", "Trigrams"),
`Unique count` = format(c(nrow(uni_freq), nrow(bi_freq), nrow(tri_freq)), big.mark = ","),
`Top phrase` = c(uni_freq$ngram[1], bi_freq$ngram[1], tri_freq$ngram[1]),
`Top count` = format(c(uni_freq$count[1], bi_freq$count[1], tri_freq$count[1]), big.mark = ",")
), caption = "N-gram summary") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| N-gram | Unique count | Top phrase | Top count |
|---|---|---|---|
| Unigrams | 133,664 | the | 236,983 |
| Bigrams | 1,526,999 | of the | 21,283 |
| Trigrams | 3,295,460 | i do not | 2,989 |
A key question: how large does our dictionary need to be?
uni_freq[, rank := .I]
total_tokens <- sum(uni_freq$count)
uni_freq[, cum_pct := cumsum(as.numeric(count)) / total_tokens * 100]
milestones <- c(50, 75, 90, 95)
ms_dt <- data.table(
pct = milestones,
n_words = sapply(milestones, function(p) uni_freq[cum_pct >= p, rank[1]])
)
ms_dt[, label := sprintf("%d%% -> %s words", pct, comma(n_words))]
ggplot(uni_freq, aes(x = rank, y = cum_pct)) +
geom_line(color = "#3575b5", linewidth = 0.8) +
geom_hline(yintercept = milestones, linetype = "dashed", color = "grey60") +
geom_point(data = ms_dt, aes(x = n_words, y = pct), color = "red", size = 3) +
geom_label(data = ms_dt, aes(x = n_words, y = pct, label = label),
hjust = -0.05, size = 3.5, fill = "white") +
scale_x_continuous(labels = comma) +
labs(title = "Dictionary Coverage Curve",
subtitle = "Cumulative % of all text covered by the top-N most frequent words",
x = "Dictionary size (# unique words)", y = "% of text covered") +
theme_cap
ms_dt[, `% of vocabulary` := sprintf("%.1f%%", 100 * n_words / nrow(uni_freq))]
kbl(ms_dt[, .(`Coverage` = sprintf("%d%%", pct),
`Words needed` = comma(n_words),
`% of vocabulary`)],
caption = "Words needed for coverage milestones") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Coverage | Words needed | % of vocabulary |
|---|---|---|
| 50% | 123 | 0.1% |
| 75% | 1,388 | 1.0% |
| 90% | 7,346 | 5.5% |
| 95% | 17,947 | 13.4% |
Just 123 words cover half of all text. A dictionary of ~7,300 words covers 90%. This means our prediction model can be surprisingly compact.
dat <- readRDS("data/tokens_clean.rds")
src_words <- rbindlist(lapply(seq_len(nrow(dat)), function(i) {
data.table(word = dat$tokens[[i]], source = dat$source[i])
}))
top20 <- uni_freq$ngram[1:20]
src_top <- src_words[word %in% top20, .(count = .N), by = .(word, source)]
src_top[, word := factor(word, levels = rev(top20))]
ggplot(src_top, aes(x = word, y = count, fill = source)) +
geom_col(position = "dodge", width = 0.7) +
coord_flip() +
scale_y_continuous(labels = comma) +
scale_fill_manual(values = c(blogs = "#3575b5", news = "#e07b39", twitter = "#4aad5b")) +
labs(title = "Word Usage Differs by Source",
subtitle = "Top 20 words: frequency comparison across blogs, news, and Twitter",
x = NULL, y = "Frequency", fill = "Source") +
theme_cap
rm(dat, src_words); gc(verbose = FALSE)
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6430416 343.5 13258860 708.1 NA 13258860 708.1
## Vcells 28427922 216.9 485576950 3704.7 65536 524237080 3999.7
We have already built a working 4-gram Stupid Backoff model that:
| Feature | Detail |
|---|---|
| Method | Stupid Backoff (Brants et al. 2007) |
| N-gram depth | 4-gram → trigram → bigram → unigram |
| Vocabulary | ~40,000 words (pruned from 118K) |
| Model size | 5 MB on disk, 61 MB in RAM |
| Speed | ~6 ms per prediction |
| Accuracy | 16% top-1, 31% top-5 (held-out test) |
Planned improvements:
The app will have a simple, clean interface:
The app will be deployed on shinyapps.io and designed to run within the free-tier memory limit.
Report generated on 2026-04-25. Source code and data processing scripts are available in the project repository.