This report summarises exploratory analysis on the HC Corpora / SwiftKey dataset for the Coursera Data Science Capstone. The goal is to build a next-word prediction algorithm and deploy it as a Shiny web app — similar to smartphone keyboard prediction.
This document covers:
# ---- Package setup ----
# Packages this report depends on. Anything missing is installed from CRAN,
# then every package is attached, because the later chunks call kable(),
# %>%, ggplot(), unnest_tokens() and friends without a namespace prefix.
packages <- c("stringi", "tidytext", "tidyr", "dplyr",
              "ggplot2", "knitr", "kableExtra", "stopwords")
installed_pkgs <- rownames(installed.packages())
to_install <- packages[!packages %in% installed_pkgs]
if (length(to_install) > 0) {
  install.packages(to_install, repos = "https://cran.rstudio.com/")
}
# library() is a no-op for packages that are already attached, so this is
# safe to re-run.
invisible(lapply(packages, library, character.only = TRUE))
## package 'ISOcodes' successfully unpacked and MD5 sums checked
## package 'stopwords' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\vkpar\AppData\Local\Temp\RtmpsrORDl\downloaded_packages
# ── Set your path here ────────────────────────────────────────────────────────
# Directory holding the en_US.*.txt corpus files; read_safe() prepends it to
# every file name it is given.
DATA_DIR <- "C:/Users/vkpar/Downloads/Coursera-SwiftKey/final/en_US/" # <- change to your local path
# Read one corpus file from DATA_DIR.
#
# fname: file name relative to DATA_DIR.
#
# Returns the file's lines as a character vector; strings are declared as
# UTF-8 and embedded NUL bytes are skipped.
read_safe <- function(fname) {
  corpus_path <- file.path(DATA_DIR, fname)
  readLines(con = corpus_path, encoding = "UTF-8", skipNul = TRUE)
}
# Load the three English corpora in full, one vector element per line.
blogs <- read_safe("en_US.blogs.txt")
news <- read_safe("en_US.news.txt")
twitter <- read_safe("en_US.twitter.txt")
# Report how many lines were read from each source.
cat("Loaded -- blogs:", length(blogs),
"| news:", length(news),
"| twitter:", length(twitter), "lines\n")## Loaded -- blogs: 899288 | news: 1010206 | twitter: 2360148 lines
# Build a one-row summary of a corpus.
#
# lines: character vector, one element per line of text.
# label: value shown in the Source column.
#
# Returns a data.frame with the line, word and character totals, the length
# of the longest line (in characters) and the mean number of words per line.
# Every count except the average is pre-formatted as a string with ","
# thousands separators.
file_summary <- function(lines, label) {
  words_per_line <- stringi::stri_count_words(lines)
  chars_per_line <- nchar(lines)
  with_commas <- function(value) format(value, big.mark = ",")

  data.frame(
    Source = label,
    Lines = with_commas(length(lines)),
    Words = with_commas(sum(words_per_line)),
    Chars = with_commas(sum(chars_per_line)),
    Max_Line_Length = with_commas(max(chars_per_line)),
    Avg_Words_Per_Line = round(mean(words_per_line), 1)
  )
}
# Stack the per-source summaries into one table, one row per corpus, in the
# order Blogs, News, Twitter.
summary_df <- do.call(
  rbind,
  Map(file_summary,
      list(blogs, news, twitter),
      c("Blogs", "News", "Twitter"))
)
# Render the corpus summary as Table 1: the Source column is left-aligned,
# the five numeric columns are right-aligned, and the table is kept at its
# natural width instead of stretching across the page.
kable(summary_df,
      caption = "Table 1 - Corpus Summary Statistics",
      align = "lrrrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

| Source | Lines | Words | Chars | Max_Line_Length | Avg_Words_Per_Line |
|---|---|---|---|---|---|
| Blogs | 899,288 | 37,546,806 | 206,824,505 | 40,833 | 41.8 |
| News | 1,010,206 | 34,761,151 | 203,214,543 | 11,384 | 34.4 |
| Twitter | 2,360,148 | 30,096,690 | 162,096,241 | 140 | 12.8 |
Key observations:
# Long-format table with one row per corpus line: which source it came from
# and how many characters it contains.
len_df <- data.frame(
  source = rep(c("Blogs", "News", "Twitter"),
               times = c(length(blogs), length(news), length(twitter))),
  len = c(nchar(blogs), nchar(news), nchar(twitter))
)
# Histogram of characters per line, one facet per source. Each facet gets its
# own axes (scales = "free"), and the legend is dropped because the facet
# titles already name the source.
ggplot(data = len_df, mapping = aes(x = len, fill = source)) +
  geom_histogram(bins = 60, alpha = 0.8) +
  scale_fill_manual(values = c("#0984e3", "#00b894", "#e17055")) +
  facet_wrap(~source, scales = "free") +
  labs(x = "Characters per Line", y = "Count",
       title = "Distribution of Line Lengths by Source") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 6736951 359.8 10317100 551.0 9294192 496.4
## Vcells 99629407 760.2 183458216 1399.7 182582246 1393.0
The full corpus is too large to process in memory all at once. We sample 0.5% of each source for n-gram analysis.
# Fix the RNG seed so the same lines are drawn on every run.
set.seed(2024)
SAMPLE_PCT <- 0.005 # 0.5% -- safe for 8GB RAM machines
# Draw the same fraction of lines from each source and pool them into one
# vector; floor() turns the fractional target into a whole number of lines.
# The three draws share one RNG stream, so their order affects the result.
sample_lines <- c(
sample(blogs, size = floor(length(blogs) * SAMPLE_PCT)),
sample(news, size = floor(length(news) * SAMPLE_PCT)),
sample(twitter, size = floor(length(twitter) * SAMPLE_PCT))
)
# Free originals immediately
rm(blogs, news, twitter); gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2534499 135.4 8253680 440.8 9294192 496.4
## Vcells 15343936 117.1 146766573 1119.8 182582246 1393.0
## Sample size: 21347 lines
# Normalise raw text ahead of tokenisation.
#
# x: character vector of raw lines.
#
# Lower-cases the text, removes URLs, replaces every character other than
# a-z, whitespace and the apostrophe with a space, collapses runs of
# whitespace to a single space and trims both ends.
#
# Returns a character vector of the same length as x.
clean_text <- function(x) {
  x <- tolower(x)
  x <- gsub("http[s]?://\\S+|www\\.\\S+", " ", x)
  # Whitespace is spelled [:space:] because R's default regex engine does not
  # treat "\\s" as an escape inside [...]; written that way the class would
  # admit a literal backslash and leave backslashes in the cleaned text.
  x <- gsub("[^a-z[:space:]']", " ", x)
  x <- gsub("\\s+", " ", x)
  trimws(x)
}
# Clean the pooled sample, then remove the raw copy and run the garbage
# collector so its memory can be reclaimed.
cleaned_lines <- clean_text(sample_lines)
rm(sample_lines); gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2534429 135.4 8253680 440.8 9294192 496.4
## Vcells 15327402 117.0 117413259 895.8 182582246 1393.0
# One row per cleaned line: `line` is its position in the sample and `text`
# the cleaned string, kept as character rather than factor.
tidy_df <- data.frame(
  line = seq_len(length(cleaned_lines)),
  text = cleaned_lines,
  stringsAsFactors = FALSE
)
# The text now lives in tidy_df, so the standalone vector is removed.
rm(cleaned_lines)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2534454 135.4 8253680 440.8 9294192 496.4
## Vcells 15327449 117.0 93930608 716.7 182582246 1393.0
## Tidy data frame ready: 21347 rows
# English stopword list, reused by the unigram and bigram filters.
stop_words_en <- stopwords::stopwords("en")

# Word frequencies, most frequent first, after dropping stopwords and
# single-character tokens.
unigram_df <- tidy_df %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!(word %in% stop_words_en), nchar(word) > 1) %>%
  count(word, sort = TRUE)
cat("Unique words (after stopword removal):", nrow(unigram_df), "\n")
## Unique words (after stopword removal): 35180
# Horizontal bar chart of the 30 most frequent non-stopword words; bars are
# ordered by count and shaded by count.
ggplot(head(unigram_df, 30),
       aes(x = reorder(word, n), y = n, fill = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_gradient(low = "#74b9ff", high = "#0984e3") +
  labs(title = "Top 30 Most Frequent Words (stopwords removed)",
       x = NULL, y = "Count") +
  theme_minimal(base_size = 13)

# Bigram frequencies, most frequent first. Each bigram is split into its two
# words so that pairs containing a stopword, or a missing word, can be
# dropped; the survivors are then joined back into a single string.
bigram_df <- tidy_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("w1", "w2"), sep = " ") %>%
  filter(!w1 %in% stop_words_en,
         !w2 %in% stop_words_en,
         !is.na(w1), !is.na(w2)) %>%
  unite(bigram, w1, w2, sep = " ") %>%
  count(bigram, sort = TRUE)
# Horizontal bar chart of the 20 most frequent bigrams.
ggplot(head(bigram_df, 20),
       aes(x = reorder(bigram, n), y = n, fill = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_gradient(low = "#55efc4", high = "#00b894") +
  labs(title = "Top 20 Bigrams", x = NULL, y = "Count") +
  theme_minimal(base_size = 13)

# Trigram frequencies, most frequent first (stopwords are kept here).
# Lines with fewer than three words yield an NA trigram; those rows are
# dropped, as in the bigram table, so that "NA" is never counted or plotted
# as if it were a phrase.
trigram_df <- tidy_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  count(trigram, sort = TRUE)
# Horizontal bar chart of the 20 most frequent trigrams.
ggplot(head(trigram_df, 20),
       aes(x = reorder(trigram, n), y = n, fill = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_gradient(low = "#fd79a8", high = "#e84393") +
  labs(title = "Top 20 Trigrams", x = NULL, y = "Count") +
  theme_minimal(base_size = 13)

How many unique words are needed to cover X% of all word instances?
# Frequency table of every word (stopwords included), most frequent first,
# with the cumulative share of all word instances and the frequency rank.
unigram_all <- tidy_df %>%
  unnest_tokens(output = word, input = text) %>%
  count(word, sort = TRUE) %>%
  mutate(cum_pct = cumsum(n) / sum(n) * 100,
         rank = row_number())

# Rank of the first word at which cumulative coverage reaches 50% and 90%.
cover_50 <- with(unigram_all, rank[match(TRUE, cum_pct >= 50)])
cover_90 <- with(unigram_all, rank[match(TRUE, cum_pct >= 90)])
cat("Words needed for 50% coverage:", cover_50, "\n")
## Words needed for 50% coverage: 141
## Words needed for 90% coverage: 6631
# Cumulative coverage curve over the 8000 most frequent words (or all of
# them if there are fewer). head() is used instead of 1:min(...) so an empty
# table yields an empty plot input rather than the index vector c(1, 0).
# Dashed lines mark the 50% and 90% levels; each label sits just right of,
# and slightly below, the point where the curve reaches that level.
ggplot(head(unigram_all, 8000),
       aes(x = rank, y = cum_pct)) +
  geom_line(color = "#6c5ce7", linewidth = 1) +
  geom_hline(yintercept = c(50, 90),
             linetype = "dashed",
             color = c("#e17055", "#d63031")) +
  annotate("text", x = cover_50 + 200, y = 47,
           label = paste0(cover_50, " words -> 50%"),
           color = "#e17055", size = 4) +
  annotate("text", x = cover_90 + 200, y = 87,
           label = paste0(cover_90, " words -> 90%"),
           color = "#d63031", size = 4) +
  labs(title = "Cumulative Word Coverage Curve",
       x = "Unique Words (ranked by frequency)",
       y = "Cumulative Coverage (%)") +
  theme_minimal(base_size = 13)

We can cover 90% of all word usage with just the top ~6631 unique words — this enables aggressive model compression without sacrificing much accuracy.
| Finding | Detail |
|---|---|
| Corpus scale | Hundreds of millions of words across blogs, news, and Twitter |
| Source diversity | Three distinct writing styles improve model generalisation |
| 50% coverage | ~141 unique words cover half of all word usage |
| 90% coverage | ~6631 unique words cover 90% — enables compact model |
| Bigram signal | Common 2-word phrases are strong predictors of the next word |
| Style variation | Twitter is short and informal; blogs are long-form — both are valuable training signals |
The next-word predictor will use a Stupid Back-off n-gram model:
data.table.

The app will work like a smartphone keyboard:

`.rds` n-gram tables ensure under 200ms response

Report generated with R Markdown · Coursera Data Science Capstone