# Report-wide defaults applied to every code chunk.
knitr::opts_chunk$set(
  cache = TRUE,        # reuse stored results for chunks that have not changed
  comment = "##",      # prefix placed in front of each line of printed output
  echo = TRUE,         # show the chunk source in the report
  message = FALSE,     # do not print messages
  results = "markup",  # render text output as marked-up output blocks
  warning = FALSE      # do not print warnings
)
Before we start, we need to make sure we have all the right tools in our toolbox. If a tool is missing, we will go get it automatically!
# Packages this analysis depends on.
required_packages <- c(
  "stringi", "ggplot2", "quanteda", "quanteda.textstats",
  "quanteda.textplots", "gridExtra"
)

# Install whatever is missing, then attach every package.
# requireNamespace() is used for the availability check because it neither
# attaches the package nor warns when it is absent; library() then attaches it
# exactly once and stops with an error if the package still cannot be loaded.
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    message("Installing missing package: ", pkg)
    install.packages(pkg, dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}
First, we need to grab the giant box of words from the internet. It’s like downloading a whole library! We check if we already have it so we don’t have to wait again.
# Remote location of the data set and the name of the local copy.
file_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
dest_file <- "Coursera-SwiftKey.zip"

# Download only when no local copy exists.
# mode = "wb" writes the archive in binary mode so it is never altered by
# text-mode translation. If the download fails, the partial file is removed
# so the next run retries instead of treating the fragment as a complete
# archive; the original error is then re-raised.
if (!file.exists(dest_file)) {
  tryCatch(
    download.file(file_url, destfile = dest_file, mode = "wb"),
    error = function(e) {
      unlink(dest_file)
      stop(e)
    }
  )
}

# Unzip only when the "final" folder is not there yet (the paths used later
# in the report expect the archive to unpack into it).
if (!dir.exists("final")) {
  unzip(dest_file)
}
Wow! We have three big books: Blogs (diaries), News (papers), and Twitter (little bird messages). Let’s count how many lines and words are inside. It is going to be HUGE!
# Define paths
path_blogs <- "final/en_US/en_US.blogs.txt"
path_news <- "final/en_US/en_US.news.txt"
path_twitter <- "final/en_US/en_US.twitter.txt"

# Read all lines of a text file through a connection opened in binary mode.
# Passing a file name to readLines() opens it in text mode, where on some
# platforms an embedded control character can be taken as end-of-file and cut
# the read short; a binary connection reads the file to its real end.
# Returns a character vector with one element per line, marked as UTF-8.
read_lines_binary <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
}

# Read files (binary mode for all three, so special characters are handled
# the same way everywhere)
blogs <- read_lines_binary(path_blogs)
news <- read_lines_binary(path_news)
twitter <- read_lines_binary(path_twitter)

# Calculate stats using stringi (It's super fast!)
# One row per source: file size on disk in MB, number of lines, total number
# of words, and the character count of the longest line.
stats_df <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  Size_MB = file.size(c(path_blogs, path_news, path_twitter)) / 1024^2,
  Total_Lines = c(length(blogs), length(news), length(twitter)),
  Total_Words = c(
    sum(stri_count_words(blogs)),
    sum(stri_count_words(news)),
    sum(stri_count_words(twitter))
  ),
  Max_Chars = c(max(nchar(blogs)), max(nchar(news)), max(nchar(twitter)))
)

# Show the table
knitr::kable(stats_df, digits = 2, caption = "The Big Table of Word Stats")
| Source | Size_MB | Total_Lines | Total_Words | Max_Chars |
|---|---|---|---|---|
| Blogs | 200.42 | 899288 | 37546806 | 40833 |
| News | 196.28 | 1010206 | 34761151 | 11384 |
| Twitter | 159.36 | 2360148 | 30096690 | 140 |
Manager Note: The files are massive (over 500MB total). Twitter has the most volume in terms of entries (lines), but Blogs contain the longest individual entries.
If we try to eat the whole cake at once, our tummy (computer memory) will hurt! So, we are going to take a small bite—just 1% of the data—to see what it tastes like.
# Fix the random number generator so every run draws the same lines.
set.seed(1234)
sample_percent <- 0.01

# Draw a random subset of `lines` whose size is `fraction` of its length.
draw_fraction <- function(lines, fraction) {
  sample(lines, length(lines) * fraction)
}

# One draw per source, in the order blogs, news, twitter.
sample_blogs <- draw_fraction(blogs, sample_percent)
sample_news <- draw_fraction(news, sample_percent)
sample_twitter <- draw_fraction(twitter, sample_percent)

# Stack the three samples into a single character vector.
master_sample <- c(sample_blogs, sample_news, sample_twitter)

# The full data sets are no longer needed: drop them and run the
# garbage collector.
rm(list = c("blogs", "news", "twitter"))
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2482647 132.6 7871522 420.4 6713352 358.6
## Vcells 9329242 71.2 171446331 1308.1 213907921 1632.0
Now we use a super-tool called quanteda. We are going to wash the words! We will take out the punctuation, the numbers, and the boring symbols.
# Create a corpus from the combined sample
doc_corpus <- corpus(master_sample)

# Tokenize: removing numbers, punctuation, symbols and URLs.
# We keep stopwords for now as they are critical for next-word prediction!
tokens_all <- tokens(
  doc_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE
)

# Clean up memory: every later step works on tokens_all, so the corpus
# object can be dropped.
rm(doc_corpus)
Let’s see which words appear the most. Is it “Love”? Is it “Time”? Let’s look!
# Document-feature matrix built from the single-word tokens
dfm_1 <- dfm(tokens_all)

# The 20 most frequent features
top_words <- textstat_frequency(dfm_1, n = 20)

# Horizontal bar chart with the bars ordered by frequency
g1 <- ggplot(
  data = top_words,
  mapping = aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 20 Single Words (Unigrams)",
    x = "Word",
    y = "Frequency"
  ) +
  theme_minimal()
print(g1)
Observation: The most common words are “stopwords” (the, to, and, a). This is expected in English! For a predictive keyboard, these are actually very important.
Now, let’s see which words like to hold hands and walk together. These are called Bigrams (two words).
# Two-word sequences built from the cleaned tokens, and their
# document-feature matrix
tokens_2 <- tokens_ngrams(tokens_all, n = 2)
dfm_2 <- dfm(tokens_2)

# The 20 most frequent bigrams
top_bigrams <- textstat_frequency(dfm_2, n = 20)

# Horizontal bar chart with the bars ordered by frequency
g2 <- ggplot(
  data = top_bigrams,
  mapping = aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_col(fill = "darkorange") +
  coord_flip() +
  labs(
    title = "Top 20 Word Pairs (Bigrams)",
    x = "Pair",
    y = "Frequency"
  ) +
  theme_minimal()
print(g2)
Observation: “of the”, “in the”, and “to the” are the champions. This tells us that prepositions followed by articles are the glue of the language.
Finally, let’s look for groups of three words!
# Three-word sequences built from the cleaned tokens, and their
# document-feature matrix
tokens_3 <- tokens_ngrams(tokens_all, n = 3)
dfm_3 <- dfm(tokens_3)

# The 20 most frequent trigrams
top_trigrams <- textstat_frequency(dfm_3, n = 20)

# Horizontal bar chart with the bars ordered by frequency
g3 <- ggplot(
  data = top_trigrams,
  mapping = aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_col(fill = "forestgreen") +
  coord_flip() +
  labs(
    title = "Top 20 Word Trios (Trigrams)",
    x = "Trio",
    y = "Frequency"
  ) +
  theme_minimal()
print(g3)
We have successfully downloaded, cleaned, and explored the SwiftKey dataset.
Volume: The data is sufficiently large to build a robust model, requiring sampling for efficient processing during development.
Structure: The language structure is dominated by stopwords (“the”, “and”, “to”).
Next Steps:
* Build a predictive model using these N-Grams.
* Handle “out of vocabulary” words (words we haven’t seen before).
* Create the Shiny App!
That was fun! High five! ✋