This report presents an exploratory analysis of the SwiftKey corpus provided for the Johns Hopkins Data Science Capstone. The dataset contains text from three English-language sources: blogs, news articles, and Twitter. The goal is to understand the basic properties of the data and plan the development of a predictive text algorithm.
We load a 5% random sample of each file to keep memory usage manageable while still capturing meaningful patterns.
# Folder holding the en_US corpus files -- adjust path as needed
data_dir <- "C:/Users/irine/Documents/Coursera-SwiftKey/final/en_US/"
files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
# Fraction of lines to keep from each file
sample_pct <- 0.05
# Seed the RNG so the random draws that follow are reproducible
set.seed(42)
#' Read a text file and return a random sample of its lines.
#'
#' @param filepath Path of the text file to read.
#' @param sample_pct Fraction of lines to keep, a single number between 0 and 1
#'   (default 0.05).
#' @return A character vector holding `floor(n_lines * sample_pct)` lines drawn
#'   without replacement. The draw uses R's random number generator, so call
#'   `set.seed()` beforehand for a reproducible sample.
load_sample <- function(filepath, sample_pct = 0.05) {
  stopifnot(
    is.numeric(sample_pct), length(sample_pct) == 1L,
    !is.na(sample_pct), sample_pct >= 0, sample_pct <= 1
  )
  con <- file(filepath, "r")
  # Registered immediately so the connection is released even if readLines()
  # fails part-way through the file
  on.exit(close(con), add = TRUE)
  lines <- readLines(con, warn = FALSE)
  sample(lines, size = floor(length(lines) * sample_pct))
}
# Draw the sample from each source file; the order of the calls matters
# because all three share one RNG stream
blogs <- load_sample(filepath = file.path(data_dir, files[[1]]), sample_pct = sample_pct)
news <- load_sample(filepath = file.path(data_dir, files[[2]]), sample_pct = sample_pct)
twitter <- load_sample(filepath = file.path(data_dir, files[[3]]), sample_pct = sample_pct)
# Per-source totals: lines, whitespace-delimited words, characters
count_words <- function(lines) {
  sum(str_count(lines, "\\S+"))
}
count_chars <- function(lines) {
  sum(nchar(lines))
}
summary_df <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  # Unnamed list on purpose: named vectors would turn into row names
  Lines = lengths(list(blogs, news, twitter)),
  Words = vapply(list(blogs, news, twitter), count_words, integer(1)),
  Characters = vapply(list(blogs, news, twitter), count_chars, integer(1))
)
summary_df$Avg_Words_per_Line <- round(with(summary_df, Words / Lines), 1)
knitr::kable(
  summary_df,
  format.args = list(big.mark = ","),
  caption = "Table 1: Basic summary of the sampled corpus (5% sample)"
)
| Source | Lines | Words | Characters | Avg_Words_per_Line |
|---|---|---|---|---|
| Blogs | 44,964 | 1,861,521 | 10,317,646 | 41.4 |
| News | 50,510 | 1,719,709 | 10,167,077 | 34.0 |
| Twitter | 118,007 | 1,517,016 | 8,101,736 | 12.9 |
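Key observations:

- Twitter contributes the most lines (118,007) but by far the shortest ones, averaging 12.9 words per line.
- Blog entries are the longest on average (41.4 words per line), followed by news articles (34.0).
- Blogs and news each contribute roughly 1.7–1.9 million words to the sample; Twitter contributes about 1.5 million despite having more than twice as many lines.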
# Pool the three sources, lowercase them, blank out every character that is
# not a letter, whitespace or an apostrophe, then collapse repeated whitespace
corpus <- str_squish(
  str_replace_all(tolower(c(blogs, news, twitter)), "[^a-z\\s']", " ")
)
# Split into word tokens and tabulate how often each token occurs
all_words <- unlist(tokenize_words(corpus))
word_freq <- as.data.table(table(word = all_words))
setnames(word_freq, old = "N", new = "freq")
setorder(word_freq, -freq)
# Bar chart of the 20 most frequent words
top20 <- head(word_freq, n = 20)
ggplot(data = top20, mapping = aes(x = reorder(word, freq), y = freq)) +
  geom_col(fill = "#2c7bb6") +
  labs(
    title = "Figure 1: Top 20 Most Frequent Words",
    x = "Word",
    y = "Frequency"
  ) +
  coord_flip() +
  theme_minimal()
The most frequent words are function words (the, and, to…), which is expected. These are known as stop words and carry little semantic meaning but are essential for grammatical structure in n-gram models.
# word_freq is sorted by decreasing frequency, so the row number is the rank
word_freq[, rank := seq_len(.N)]
# Log-log plot of frequency against rank for the 5,000 top-ranked words
ggplot(
  data = word_freq[rank <= 5000],
  mapping = aes(x = log10(rank), y = log10(freq))
) +
  geom_line(color = "#d7191c", linewidth = 0.8) +
  labs(
    title = "Figure 2: Zipf's Law — Word Rank vs Frequency (log-log scale)",
    x = "log10(Rank)",
    y = "log10(Frequency)"
  ) +
  theme_minimal()
The near-linear relationship on a log-log scale is consistent with Zipf’s Law: a small number of words account for the vast majority of occurrences. This has important implications for model efficiency — a vocabulary restricted to the most frequent words can still cover most of the corpus.
# Cumulative share (in %) of all word occurrences covered by the top-k words
total_words <- sum(word_freq$freq)
word_freq[, cum_pct := cumsum(freq) / total_words * 100]
# Words needed to cover 50% and 90% of the corpus.
# The position must be looked up inside the full table. Filtering in `i` and
# then reading `.I[1]` numbers the surviving rows from 1, so that form
# reported 1 for every threshold.
cover_50 <- word_freq[, which(cum_pct >= 50)[1]]
cover_90 <- word_freq[, which(cum_pct >= 90)[1]]
cat("Words needed to cover 50% of corpus:", cover_50, "\n")
## NOTE(review): the output previously rendered here read 1, an artifact of the indexing bug fixed above; re-knit to refresh
cat("Words needed to cover 90% of corpus:", cover_90, "\n")
## NOTE(review): the output previously rendered here read 1, an artifact of the indexing bug fixed above; re-knit to refresh
cat("Total unique words in sample: ", nrow(word_freq), "\n")
## Total unique words in sample: 117887
# Cumulative coverage curve for the 20,000 top-ranked words, with dashed
# reference lines at 50% and 90%
ggplot(
  data = word_freq[rank <= 20000],
  mapping = aes(x = rank, y = cum_pct)
) +
  geom_line(color = "#1a9641", linewidth = 0.8) +
  geom_hline(
    yintercept = c(50, 90),
    linetype = "dashed",
    color = "gray40"
  ) +
  annotate("text", x = 15000, y = 52, label = "50% coverage", size = 3) +
  annotate("text", x = 15000, y = 92, label = "90% coverage", size = 3) +
  labs(
    title = "Figure 3: Cumulative Word Coverage",
    x = "Number of Unique Words (by rank)",
    y = "Cumulative % of Corpus"
  ) +
  theme_minimal()
This analysis directly informs how we can reduce model size: by keeping only the top words needed to cover 90% of the corpus, we can cut memory usage substantially while giving up only the rarest words, which contribute little to prediction quality.
#' Count the n-grams in a character vector of documents.
#'
#' @param corpus_vec Character vector. Each element is tokenized on its own,
#'   so an n-gram never spans two elements.
#' @param n Number of words per n-gram; a single whole number >= 1.
#' @return A data.table with columns `ngram` (the words joined by single
#'   spaces) and `freq` (number of occurrences), sorted by decreasing `freq`.
#'   It has zero rows when no element contains at least `n` tokens.
build_ngrams_simple <- function(corpus_vec, n) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 1, n == floor(n))
  tokens_list <- tokenize_words(corpus_vec, lowercase = FALSE)
  ngrams_list <- lapply(tokens_list, function(w) {
    n_grams <- length(w) - n + 1
    if (n_grams < 1) return(character(0))
    # Paste n shifted copies of the token vector element-wise: one vectorized
    # call instead of a separate paste() per starting position
    shifted <- lapply(seq_len(n), function(k) w[k:(k + n_grams - 1)])
    do.call(paste, shifted)
  })
  # unlist() yields NULL when there are no n-grams at all; coercing keeps the
  # `ngram` column present so the grouping below still works
  ngrams_vec <- as.character(unlist(ngrams_list, use.names = FALSE))
  dt <- data.table(ngram = ngrams_vec)[, .(freq = .N), by = ngram]
  setorder(dt, -freq)
  dt
}
# Work on 5,000 randomly chosen cleaned lines so n-gram counting stays fast
set.seed(42)
mini_corpus <- sample(corpus, size = 5000)
bigrams <- build_ngrams_simple(mini_corpus, n = 2)
trigrams <- build_ngrams_simple(mini_corpus, n = 3)
# Bar chart of the 15 most frequent bigrams
ggplot(
  data = head(bigrams, n = 15),
  mapping = aes(x = reorder(ngram, freq), y = freq)
) +
  geom_col(fill = "#756bb1") +
  labs(title = "Figure 4: Top 15 Bigrams", x = "Bigram", y = "Frequency") +
  coord_flip() +
  theme_minimal()
# Bar chart of the 15 most frequent trigrams
ggplot(
  data = head(trigrams, n = 15),
  mapping = aes(x = reorder(ngram, freq), y = freq)
) +
  geom_col(fill = "#e6550d") +
  labs(title = "Figure 5: Top 15 Trigrams", x = "Trigram", y = "Frequency") +
  coord_flip() +
  theme_minimal()
The predictive model will be built as follows:
- data.table objects for fast key-based lookup.

The target is a model under 100 MB that responds in under 100 ms per query — suitable for deployment on shinyapps.io.