In this report, I’m taking a look at the SwiftKey dataset for our data science capstone. The main goal here is to clean up a bunch of messy text from blogs, news, and tweets so we can see which phrases pop up the most. Before diving into the cleaning and modeling, let’s look at the basic summaries of the raw source files, including their sizes, line counts, and total word counts.
# ==========================================
# STEP 0: CALCULATE RAW FILE STATS EFFICIENTLY
# ==========================================
library(stringr)

# File paths (relative to the current working directory)
blogs_path <- "en_US/en_US.blogs.txt"
news_path <- "en_US/en_US.news.txt"
twitter_path <- "en_US/en_US.twitter.txt"

# Fail early with a clear message if a corpus file is missing; otherwise the
# size would silently be NA and readLines() would fail later with a less
# specific error.
corpus_paths <- c(blogs_path, news_path, twitter_path)
missing_paths <- corpus_paths[!file.exists(corpus_paths)]
if (length(missing_paths) > 0) {
  stop(
    "Corpus file(s) not found: ", paste(missing_paths, collapse = ", "),
    call. = FALSE
  )
}

# File sizes in MB (bytes / 1024^2)
blogs_size <- file.size(blogs_path) / (1024^2)
news_size <- file.size(news_path) / (1024^2)
twitter_size <- file.size(twitter_path) / (1024^2)

# Read every line of each file into memory.
# skipNul = TRUE drops embedded NUL bytes; warn = FALSE silences the
# warnings readLines() would otherwise emit for them.
blogs_raw <- readLines(blogs_path, warn = FALSE, skipNul = TRUE)
news_raw <- readLines(news_path, warn = FALSE, skipNul = TRUE)
twitter_raw <- readLines(twitter_path, warn = FALSE, skipNul = TRUE)

# Line counts
blogs_lines <- length(blogs_raw)
news_lines <- length(news_raw)
twitter_lines <- length(twitter_raw)

# Word counts: number of runs of word characters (regex \w+) per line,
# summed over the file. This is an approximation, not a whitespace split:
# an apostrophe is not a word character, so "don't" counts as two words.
blogs_words <- sum(str_count(blogs_raw, "\\w+"))
news_words <- sum(str_count(news_raw, "\\w+"))
twitter_words <- sum(str_count(twitter_raw, "\\w+"))

# Combine into a clean summary table (one row per source file)
raw_summary_table <- data.frame(
  File_Name = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
  File_Size_MB = round(c(blogs_size, news_size, twitter_size), 2),
  Line_Count = c(blogs_lines, news_lines, twitter_lines),
  Word_Count = c(blogs_words, news_words, twitter_words)
)

# Print the table cleanly in the HTML report
knitr::kable(raw_summary_table, caption = "Summary Statistics of Raw SwiftKey Datasets")
| File_Name | File_Size_MB | Line_Count | Word_Count |
|---|---|---|---|
| en_US.blogs.txt | 200.42 | 899288 | 38309620 |
| en_US.news.txt | 196.28 | 1010206 | 35622913 |
| en_US.twitter.txt | 159.36 | 2360148 | 31003544 |
# ==========================================
# STEP 1: LOAD LIBRARIES & READ RAW DATA
# ==========================================
library(tidyverse)
library(tidytext)
library(stringr)

# Return the lines of a corpus file.
#
# Step 0 already reads each file into a `*_raw` character vector. Reading the
# same files from disk again doubles the I/O and keeps two full copies in
# memory, so reuse the in-memory vector when it exists and only fall back to
# readLines() when it does not (e.g. when this step is run on its own).
#
# path        Path of the text file to read if no cached copy exists.
# cached_name Name of a character vector that may already hold the lines.
load_corpus <- function(path, cached_name) {
  if (exists(cached_name, mode = "character")) {
    return(get(cached_name, mode = "character"))
  }
  readLines(path, warn = FALSE, skipNul = TRUE)
}

blogs_data <- load_corpus("en_US/en_US.blogs.txt", "blogs_raw")
news_data <- load_corpus("en_US/en_US.news.txt", "news_raw")
twitter_data <- load_corpus("en_US/en_US.twitter.txt", "twitter_raw")
# ==========================================
# STEP 2: DATA SAMPLING (1% FOR PERFORMANCE)
# ==========================================
# Fixed seed so the sample (and everything derived from it) is reproducible.
set.seed(1234)

# Fraction of each corpus kept for the exploratory analysis.
sample_fraction <- 0.01

# Draw a random sample, without replacement, of `fraction` of `lines`.
# The sample size is floored explicitly instead of relying on sample()
# truncating a non-integer `size`.
sample_lines <- function(lines, fraction) {
  sample(lines, floor(length(lines) * fraction))
}

# Keep the blogs -> news -> twitter order: the draws consume the random
# number stream in sequence, so reordering would change the sample.
sample_blogs <- sample_lines(blogs_data, sample_fraction)
sample_news <- sample_lines(news_data, sample_fraction)
sample_twitter <- sample_lines(twitter_data, sample_fraction)
combined_sample <- c(sample_blogs, sample_news, sample_twitter)

# Clear heavy raw files from memory
rm(blogs_data, news_data, twitter_data, sample_blogs, sample_news, sample_twitter)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 6551480 349.9 12462888 665.6 8760728 467.9
## Vcells 92887990 708.7 161637656 1233.2 114230294 871.6
# ==========================================
# STEP 3: TEXT CLEANING
# ==========================================
# Normalise case so that counts are case-insensitive.
clean_sample <- str_to_lower(combined_sample)

# Remove URLs: "http" followed by any non-space characters, plus trailing
# whitespace.
clean_sample <- str_replace_all(clean_sample, "http\\S+\\s*", "")

# Keep only ASCII letters, digits, straight apostrophes and spaces.
# NOTE(review): disallowed characters are deleted, not replaced by a space,
# so tokens joined only by punctuation are fused (e.g. "a,b" becomes "ab").
clean_sample <- str_replace_all(clean_sample, "[^a-zA-Z0-9' ]", "")

# Trim leading/trailing whitespace and collapse internal runs of spaces.
clean_sample <- str_squish(clean_sample)

# One row per cleaned line. seq_along() is used instead of 1:length(x),
# which would produce c(1, 0) for an empty sample.
text_df <- tibble(line = seq_along(clean_sample), text = clean_sample)
# ==========================================
# STEP 4: N-GRAM EXTRACTION & COUNTING
# ==========================================
# Single-word frequencies: tokenise each line into words, then tally them,
# most frequent first.
unigram_freq <- local({
  word_tokens <- unnest_tokens(text_df, word, text)
  count(word_tokens, word, sort = TRUE)
})

# Two-word phrase frequencies: tokenise into bigrams, tally them, then
# discard rows containing NA.
bigram_freq <- local({
  pair_tokens <- unnest_tokens(text_df, bigram, text, token = "ngrams", n = 2)
  pair_counts <- count(pair_tokens, bigram, sort = TRUE)
  drop_na(pair_counts)
})
Once I got rid of the URLs, punctuation, and extra spaces, I split the text into two-word phrases (bigrams). The bar chart below shows the top 15 most common word pairings found in my 1% sample.
# ==========================================
# STEP 5: VISUALIZE THE RESULTS
# ==========================================
# Horizontal bar chart of the 15 most frequent bigrams. bigram_freq is
# already sorted by descending count, so the first 15 rows are the top 15;
# reorder() arranges the bars by frequency.
bigram_freq %>%
  head(15) %>%
  ggplot(aes(x = reorder(bigram, n), y = n)) +
  geom_col(fill = "#008080") +
  coord_flip() +
  ggtitle("Top 15 Most Common Bigrams") +
  xlab("Phrases") +
  ylab("Frequency") +
  theme_minimal()