The goal of this project is to build a next-word prediction algorithm using large English text datasets provided by SwiftKey. This milestone report covers:
# Read each of the three source files into a character vector holding one
# element per line of text.
read_corpus_file <- function(path) {
  # NOTE(review): warn = FALSE silences every readLines() warning, including
  # the one raised for embedded nuls - confirm no lines are being cut short.
  readLines(con = path, encoding = "UTF-8", warn = FALSE)
}
twitter <- read_corpus_file("en_US/en_US.twitter.txt")
blogs <- read_corpus_file("en_US/en_US.blogs.txt")
news <- read_corpus_file("en_US/en_US.news.txt")
cat("Data loaded successfully!\n")
## Data loaded successfully!
library(knitr)
# Count whitespace-delimited words across a character vector.
#
# Args:
#   lines: character vector, one element per line of text.
# Returns:
#   Total number of words over all elements; 0 for empty input.
count_words <- function(lines) {
  # Drop leading whitespace first: strsplit() emits an empty first token when
  # the separator matches at the start of a string, which would inflate the
  # count by one for every such line (and count blank lines as one word).
  trimmed <- sub("^\\s+", "", lines)
  # lengths() always yields an integer vector, so zero-length input sums to 0
  # instead of failing on the empty list sapply() would return.
  sum(lengths(strsplit(trimmed, "\\s+")))
}
# Assemble per-dataset size statistics (same order as the Dataset labels)
datasets <- list(twitter, blogs, news)
# Character count of every line, computed once per dataset
line_chars <- lapply(datasets, nchar)
# Apply a summary function to each element and flatten to an unnamed vector
per_dataset <- function(values, stat) {
  unlist(lapply(values, stat), use.names = FALSE)
}
summary_df <- data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Lines = per_dataset(datasets, length),
  Words = per_dataset(datasets, count_words),
  Max_Chars = per_dataset(line_chars, max),
  Mean_Chars = round(per_dataset(line_chars, mean), 1)
)
# Render the table with readable headers and thousands separators
column_labels <- c("Dataset", "Total Lines", "Total Words",
                   "Max Characters", "Avg Characters per Line")
kable(summary_df,
      col.names = column_labels,
      format.args = list(big.mark = ","))
| Dataset | Total Lines | Total Words | Max Characters | Avg Characters per Line |
|---|---|---|---|---|
| Twitter | 2,360,148 | 30,373,543 | 140 | 68.7 |
| Blogs | 899,288 | 37,334,131 | 40,833 | 230.0 |
| News | 1,010,206 | 34,371,031 | 11,384 | 201.2 |
Key observations:
Since the full datasets are very large, we work with a random 0.1% sample (each line is kept with probability 0.001) for exploratory analysis.
# Fix the RNG seed so the random sample is reproducible
set.seed(42)
# Keep each element of `lines` independently with probability `prob`.
#
# Args:
#   lines: vector to sample from.
#   prob:  per-element probability of being kept.
# Returns:
#   The kept elements, in their original order.
sample_data <- function(lines, prob = 0.001) {
  # One 0/1 draw per element; 1 means "keep"
  keep <- as.logical(rbinom(n = length(lines), size = 1, prob = prob))
  lines[keep]
}
# Draw one sample per source with the default keep probability.
# Each sample_data() call consumes random numbers via rbinom(), so the order
# of these three calls determines which lines are selected for a given seed.
twitter_sample <- sample_data(twitter)
blogs_sample <- sample_data(blogs)
news_sample <- sample_data(news)
# Combine all samples into one vector: Twitter first, then blogs, then news
all_text <- c(twitter_sample, blogs_sample, news_sample)
# Report how many lines were kept from each source
cat("Sample sizes:\n")
## Sample sizes:
cat("Twitter:", length(twitter_sample), "lines\n")
## Twitter: 2271 lines
cat("Blogs: ", length(blogs_sample), "lines\n")
## Blogs: 903 lines
cat("News: ", length(news_sample), "lines\n")
## News: 949 lines
library(tm)
library(ggplot2)
# Build a corpus from sample (one document per sampled line)
corpus <- VCorpus(VectorSource(all_text))
# Clean the corpus: lower-case, then strip punctuation, digits, English stop
# words and redundant whitespace, in that order.
# NOTE(review): punctuation is removed before stop words, so contractions such
# as "don't" become "dont" and may no longer match stopwords("en") - confirm
# this ordering is intended.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)
# Term-document matrix (rows are terms, columns are documents)
tdm <- TermDocumentMatrix(corpus)
# NOTE(review): as.matrix() materialises a dense terms x documents matrix;
# acceptable for this small sample, but verify memory use before scaling up.
mat <- as.matrix(tdm)
# Total occurrences of each term, most frequent first
freq <- sort(rowSums(mat), decreasing = TRUE)
# Top 20 words. head() returns fewer entries when the sample has fewer than
# 20 distinct terms, whereas freq[1:20] would pad the table with NA rows.
top_freq <- head(freq, 20)
top20 <- data.frame(word = names(top_freq), count = top_freq)
# Horizontal bar chart, bars ordered by frequency
ggplot(top20, aes(x = reorder(word, count), y = count)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words",
       x = "Word", y = "Frequency") +
  theme_minimal()
N-grams (sequences of N words) are the foundation of our prediction algorithm.
library(RWeka)
# Tokenizers that emit every run of exactly two / three consecutive words
bigram_tok <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram_tok <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# NOTE(review): `corpus` has already had stop words removed, so these n-grams
# can join words that were not adjacent in the original text - confirm this
# is what the prediction model should be built on.
# Bigrams: count each bigram over all documents, most frequent first
bigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = bigram_tok))
bigram_mat <- as.matrix(bigram_tdm)
bigram_freq <- sort(rowSums(bigram_mat), decreasing = TRUE)
# head() avoids NA-padded rows when fewer than 15 distinct bigrams exist
top_bigram_freq <- head(bigram_freq, 15)
top_bigrams <- data.frame(ngram = names(top_bigram_freq),
                          count = top_bigram_freq)
# Trigrams: same procedure with the three-word tokenizer
trigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = trigram_tok))
trigram_mat <- as.matrix(trigram_tdm)
trigram_freq <- sort(rowSums(trigram_mat), decreasing = TRUE)
top_trigram_freq <- head(trigram_freq, 15)
top_trigrams <- data.frame(ngram = names(top_trigram_freq),
                           count = top_trigram_freq)
# Horizontal bar charts, bars ordered by frequency
ggplot(top_bigrams, aes(x = reorder(ngram, count), y = count)) +
  geom_bar(stat = "identity", fill = "darkorange") +
  coord_flip() +
  labs(title = "Top 15 Bigrams", x = "Bigram", y = "Frequency") +
  theme_minimal()
ggplot(top_trigrams, aes(x = reorder(ngram, count), y = count)) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  coord_flip() +
  labs(title = "Top 15 Trigrams", x = "Trigram", y = "Frequency") +
  theme_minimal()
# Word coverage analysis: how many distinct words, taken from most to least
# frequent, are needed to account for a given share of all word occurrences.
# `freq` holds per-word counts from the cleaned corpus, sorted in decreasing
# order, so the running total below adds the most frequent words first.
total_words <- sum(freq)
# Cumulative share of all occurrences covered by the k most frequent words
cum_freq <- cumsum(freq) / total_words
# Position of the first word at which coverage reaches 50% and 90%
words_50pct <- which(cum_freq >= 0.50)[1]
words_90pct <- which(cum_freq >= 0.90)[1]
cat("Unique words needed to cover 50% of text:", words_50pct, "\n")
## Unique words needed to cover 50% of text: 973
cat("Unique words needed to cover 90% of text:", words_90pct, "\n")
## Unique words needed to cover 90% of text: 9435
Key findings:
The next-word prediction model will use an N-gram backoff approach:
| Step | Description |
|---|---|
| 1 | Build unigram, bigram, trigram frequency tables from cleaned corpus |
| 2 | Given user input, look up matching trigrams first |
| 3 | If no trigram match, back off to bigrams |
| 4 | If no bigram match, return most common unigrams |
| 5 | Apply Katz backoff or Stupid Backoff smoothing |
The Shiny app will:
Report prepared for the Coursera Data Science Capstone — Johns Hopkins University