This milestone report explores the SwiftKey text corpus provided for the Coursera Data Science Capstone. The goal of the final project is to build a predictive text model that suggests the next word given a short text input.
The analysis focuses on the US English (en_US) files from blogs, news, and Twitter. This report demonstrates that the data can be downloaded and loaded, summarizes the main characteristics of the data, explores frequent words and n-grams, and outlines a plan for the prediction algorithm and the final Shiny application.
The dataset is the Coursera SwiftKey corpus:
data_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"
data_dir <- "final/en_US"
files <- c(
  blogs = file.path(data_dir, "en_US.blogs.txt"),
  news = file.path(data_dir, "en_US.news.txt"),
  twitter = file.path(data_dir, "en_US.twitter.txt")
)
files
## blogs news
## "final/en_US/en_US.blogs.txt" "final/en_US/en_US.news.txt"
## twitter
## "final/en_US/en_US.twitter.txt"
if (!file.exists(zip_file)) {
  download.file(data_url, zip_file, mode = "wb")
}
if (!dir.exists(data_dir)) {
  unzip(zip_file)
}
Run the previous chunk once if the dataset is not already present in the project folder.
stopifnot(all(file.exists(files)))
read_text <- function(path) {
  readLines(path, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
}
blogs <- read_text(files["blogs"])
news <- read_text(files["news"])
twitter <- read_text(files["twitter"])
raw_data <- list(
  blogs = blogs,
  news = news,
  twitter = twitter
)
# Count the words on each non-empty line, splitting on runs of whitespace
count_words_per_line <- function(x) {
  x <- trimws(x)
  x <- x[nzchar(x)]
  lengths(strsplit(x, "\\s+"))
}
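For example (a quick check with a toy vector, not part of the analysis itself), blank lines are ignored and repeated spaces count as a single separator:

count_words_per_line(c("  The quick brown fox  ", "", "jumped"))
## [1] 4 1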
sample_for_stats <- function(x, n = 10000) {
  if (length(x) <= n) return(x)
  sample(x, n)
}
stats_samples <- lapply(raw_data, sample_for_stats)
summary_stats <- data.frame(
  source = names(files),
  file_size_mb = round(file.info(files)$size / 1024^2, 2),
  lines = sapply(raw_data, length),
  avg_chars_per_line = round(sapply(stats_samples, function(x) mean(nchar(x, type = "chars"))), 1),
  avg_words_per_line = round(sapply(stats_samples, function(x) mean(count_words_per_line(x))), 1),
  estimated_words = round(sapply(seq_along(raw_data), function(i) {
    length(raw_data[[i]]) * mean(count_words_per_line(stats_samples[[i]]))
  })),
  stringsAsFactors = FALSE
)
knitr::kable(summary_stats)
| source | file_size_mb | lines | avg_chars_per_line | avg_words_per_line | estimated_words |
|---|---|---|---|---|---|
| blogs | 200.42 | 899288 | 230.7 | 41.7 | 37475579 |
| news | 196.28 | 1010206 | 200.6 | 34.0 | 34314172 |
| twitter | 159.36 | 2360148 | 68.7 | 12.9 | 30393042 |
The three files differ in style and length. Blog and news entries tend to be longer and more structured, while Twitter entries are usually shorter, noisier, and more informal.
line_lengths <- data.frame(
  source = rep(names(stats_samples), lengths(stats_samples)),
  characters = unlist(lapply(stats_samples, nchar), use.names = FALSE)
)
hist(
  line_lengths$characters[line_lengths$source == "blogs"],
  breaks = 50,
  col = "steelblue",
  main = "Distribution of Blog Line Lengths",
  xlab = "Characters per Line",
  xlim = c(0, quantile(line_lengths$characters, 0.95))
)
hist(
  line_lengths$characters[line_lengths$source == "news"],
  breaks = 50,
  col = "darkseagreen",
  main = "Distribution of News Line Lengths",
  xlab = "Characters per Line",
  xlim = c(0, quantile(line_lengths$characters, 0.95))
)
hist(
  line_lengths$characters[line_lengths$source == "twitter"],
  breaks = 50,
  col = "salmon",
  main = "Distribution of Twitter Line Lengths",
  xlab = "Characters per Line",
  xlim = c(0, quantile(line_lengths$characters, 0.95))
)
The full corpus is large, so this milestone report uses a reproducible sample for exploratory analysis. The final model can be trained on a larger sample or the full corpus depending on memory and runtime constraints.
# Fix the seed so the sample (and the counts below) is reproducible between knits; any fixed value works
set.seed(1234)
sample_lines <- function(x, n = 10000) {
  if (length(x) <= n) return(x)
  sample(x, n)
}
sampled_data <- unlist(lapply(raw_data, sample_lines), use.names = FALSE)
length(sampled_data)
## [1] 30000
clean_text <- function(x) {
  x <- iconv(x, from = "UTF-8", to = "ASCII", sub = " ")  # drop non-ASCII characters
  x <- tolower(x)
  x <- gsub("https?://\\S+|www\\.\\S+", " ", x)           # remove URLs
  x <- gsub("[^a-z' ]", " ", x)                           # keep letters, apostrophes, and spaces
  x <- gsub("\\s+", " ", x)                               # collapse repeated whitespace
  trimws(x)
}
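As a small illustration of the cleaning rules (the sentence is made up for this example): URLs, digits, and punctuation are removed, case is normalized, and apostrophes are kept.

clean_text("Visit http://example.com NOW!!! It's 100% FREE.")
## [1] "visit now it's free"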
cleaned_sample <- clean_text(sampled_data)
cleaned_sample <- cleaned_sample[nzchar(cleaned_sample)]
tokens <- unlist(strsplit(cleaned_sample, "\\s+"))
tokens <- tokens[nzchar(tokens)]
word_freq <- sort(table(tokens), decreasing = TRUE)
top_words <- head(data.frame(
  word = names(word_freq),
  frequency = as.integer(word_freq),
  row.names = NULL
), 25)
knitr::kable(top_words)
| word | frequency |
|---|---|
| the | 44226 |
| to | 23853 |
| and | 22915 |
| a | 21683 |
| of | 19126 |
| in | 14986 |
| i | 14002 |
| that | 9794 |
| is | 9173 |
| for | 9019 |
| it | 8439 |
| on | 6904 |
| you | 6898 |
| with | 6511 |
| was | 5817 |
| at | 4783 |
| this | 4766 |
| my | 4697 |
| as | 4655 |
| be | 4639 |
| have | 4504 |
| he | 4345 |
| but | 4247 |
| are | 4170 |
| we | 3759 |
barplot(
  rev(top_words$frequency),
  names.arg = rev(top_words$word),
  horiz = TRUE,
  las = 1,
  col = "steelblue",
  main = "Top 25 Words in the Sample",
  xlab = "Frequency"
)
# Build overlapping n-grams from whitespace-delimited text
make_ngrams <- function(text, n) {
  words <- unlist(strsplit(text, "\\s+"))
  words <- words[nzchar(words)]
  if (length(words) < n) return(character(0))
  starts <- seq_len(length(words) - n + 1)
  vapply(starts, function(i) paste(words[i:(i + n - 1)], collapse = " "), character(1))
}
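For example, with n = 2 the helper slides a two-word window across the tokens (toy input for illustration):

make_ngrams("the quick brown fox", 2)
## [1] "the quick"   "quick brown" "brown fox"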
sample_for_ngrams <- paste(cleaned_sample, collapse = " ")
bigrams <- make_ngrams(sample_for_ngrams, 2)
trigrams <- make_ngrams(sample_for_ngrams, 3)
bigram_freq <- sort(table(bigrams), decreasing = TRUE)
trigram_freq <- sort(table(trigrams), decreasing = TRUE)
top_bigrams <- head(data.frame(
  bigram = names(bigram_freq),
  frequency = as.integer(bigram_freq),
  row.names = NULL
), 20)
top_trigrams <- head(data.frame(
  trigram = names(trigram_freq),
  frequency = as.integer(trigram_freq),
  row.names = NULL
), 20)
knitr::kable(top_bigrams)
| bigram | frequency |
|---|---|
| of the | 4171 |
| in the | 3784 |
| to the | 2065 |
| on the | 1760 |
| for the | 1668 |
| to be | 1393 |
| and the | 1218 |
| at the | 1218 |
| in a | 1116 |
| with the | 993 |
| is a | 914 |
| it was | 886 |
| from the | 862 |
| with a | 821 |
| for a | 817 |
| of a | 814 |
| and i | 792 |
| it is | 779 |
| i have | 730 |
| will be | 712 |
knitr::kable(top_trigrams)
| trigram | frequency |
|---|---|
| one of the | 323 |
| a lot of | 286 |
| as well as | 157 |
| out of the | 155 |
| the end of | 146 |
| to be a | 139 |
| i don t | 135 |
| going to be | 133 |
| the u s | 132 |
| it was a | 131 |
| part of the | 128 |
| some of the | 128 |
| thanks for the | 116 |
| there is a | 111 |
| be able to | 107 |
| i want to | 107 |
| i have a | 105 |
| a couple of | 103 |
| this is a | 101 |
| end of the | 93 |
The most common words are high-frequency English function words such as articles, prepositions, pronouns, and conjunctions. This is expected in natural language data and is useful for understanding the baseline structure of the corpus.
The n-gram tables show repeated short phrases that can be used to predict likely next words. For example, a trigram model can estimate the next word from the previous two words. Twitter text is expected to add informal language, abbreviations, and noise, while blogs and news provide longer and more grammatical sentences.
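A minimal sketch of that idea, using the bigram and trigram counts computed above (the helper and table names are illustrative, not part of the final model), splits each n-gram into a prefix and the word that follows it, ordered by frequency:

# Split n-gram counts into a prefix (all but the last word) and the following word
ngram_table <- function(counts) {
  counts <- sort(counts, decreasing = TRUE)
  data.frame(
    prefix = sub(" \\S+$", "", names(counts)),
    next_word = sub("^.* ", "", names(counts)),
    frequency = as.integer(counts),
    row.names = NULL,
    stringsAsFactors = FALSE
  )
}
bigram_table <- ngram_table(table(bigrams))
trigram_table <- ngram_table(table(trigrams))
# e.g. in this sample the prefix "one of" is most often followed by "the"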
The final prediction model will use an n-gram approach:

- Build frequency tables for unigrams, bigrams, and trigrams from a cleaned training sample.
- Look up the last two words of the user's input in the trigram table and suggest the most frequent next word.
- If there is no trigram match, back off to the bigram table using the last word, and finally to the most frequent individual words.
This backoff strategy balances accuracy and speed. The final Shiny application should load compact frequency tables rather than the full raw corpus, so the app remains responsive.
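One possible sketch of that lookup, built on the illustrative tables above (the function name and arguments are placeholders): try the trigram table with the last two words, then the bigram table with the last word, then fall back to the most frequent single words.

predict_next_word <- function(text, n_best = 3) {
  words <- strsplit(clean_text(text), "\\s+")[[1]]
  words <- words[nzchar(words)]
  # 1. trigram lookup: prefix = last two words of the input
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- trigram_table$next_word[trigram_table$prefix == prefix]
    if (length(hits) > 0) return(head(hits, n_best))
  }
  # 2. back off to the bigram lookup: prefix = last word
  if (length(words) >= 1) {
    hits <- bigram_table$next_word[bigram_table$prefix == tail(words, 1)]
    if (length(hits) > 0) return(head(hits, n_best))
  }
  # 3. back off to the most frequent single words
  head(names(word_freq), n_best)
}

With the sampled counts above, predict_next_word("one of") should return "the" as its first suggestion, matching the trigram table.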
The next stage is to build and evaluate the prediction model, tune the sampling and cleaning decisions, reduce the model size for deployment, and create a Shiny application that accepts text input and displays the predicted next word.
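To keep the deployed app small, one option (file names below are placeholders) is to drop n-grams seen only once and save the remaining tables as compact .rds files that the Shiny app reads at startup:

compact_bigrams <- bigram_table[bigram_table$frequency > 1, ]
compact_trigrams <- trigram_table[trigram_table$frequency > 1, ]
saveRDS(compact_bigrams, "bigram_table.rds")
saveRDS(compact_trigrams, "trigram_table.rds")
# in the Shiny app: bigram_table <- readRDS("bigram_table.rds"), and so on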