Introduction

In this report, I’m taking a look at the SwiftKey dataset for our data science capstone. The main goal here is to clean up a bunch of messy text from blogs, news, and tweets so we can see which phrases pop up the most. Before diving into the cleaning and modeling, let’s look at the basic summaries of the raw source files, including their sizes, line counts, and total word counts.

# ==========================================
# STEP 0: CALCULATE RAW FILE STATS EFFICIENTLY
# ==========================================
library(stringr)

# File Paths
blogs_path   <- "en_US/en_US.blogs.txt"
news_path    <- "en_US/en_US.news.txt"
twitter_path <- "en_US/en_US.twitter.txt"

# Fail early with a clear message if a corpus file is missing; otherwise
# file.info() would silently yield an NA size and the first visible error
# would come from readLines() further down.
missing_paths <- Filter(
  function(path) !file.exists(path),
  c(blogs_path, news_path, twitter_path)
)
if (length(missing_paths) > 0) {
  stop(
    "Missing input file(s): ", paste(missing_paths, collapse = ", "),
    call. = FALSE
  )
}
rm(missing_paths)

# File Sizes (MB): bytes on disk divided by 1024^2
blogs_size   <- file.size(blogs_path) / (1024^2)
news_size    <- file.size(news_path) / (1024^2)
twitter_size <- file.size(twitter_path) / (1024^2)

# Read Lines (each file is read once and reused for every statistic below).
# skipNul = TRUE drops embedded NUL bytes; encoding = "UTF-8" marks the text
# as UTF-8 so non-ASCII characters are interpreted correctly even when the
# session's native encoding is not UTF-8.
blogs_raw <- readLines(
  blogs_path, warn = FALSE, skipNul = TRUE, encoding = "UTF-8"
)
news_raw <- readLines(
  news_path, warn = FALSE, skipNul = TRUE, encoding = "UTF-8"
)
twitter_raw <- readLines(
  twitter_path, warn = FALSE, skipNul = TRUE, encoding = "UTF-8"
)

# Line Counts
blogs_lines   <- length(blogs_raw)
news_lines    <- length(news_raw)
twitter_lines <- length(twitter_raw)

# Word Counts: the number of regex matches of "\\w+" (runs of word
# characters) per line, summed over the file. This is an approximation:
# an apostrophe or hyphen ends a run, so a contraction or a hyphenated
# word is counted as more than one word.
blogs_words   <- sum(str_count(blogs_raw, "\\w+"))
news_words    <- sum(str_count(news_raw, "\\w+"))
twitter_words <- sum(str_count(twitter_raw, "\\w+"))

# Combine into a clean summary table (one row per source file)
raw_summary_table <- data.frame(
  File_Name = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
  File_Size_MB = round(c(blogs_size, news_size, twitter_size), 2),
  Line_Count = c(blogs_lines, news_lines, twitter_lines),
  Word_Count = c(blogs_words, news_words, twitter_words)
)

# Print the table cleanly in the HTML report
knitr::kable(raw_summary_table, caption = "Summary Statistics of Raw SwiftKey Datasets")
Summary Statistics of Raw SwiftKey Datasets
File_Name File_Size_MB Line_Count Word_Count
en_US.blogs.txt 200.42 899288 38309620
en_US.news.txt 196.28 1010206 35622913
en_US.twitter.txt 159.36 2360148 31003544
# ==========================================
# STEP 1: LOAD LIBRARIES & READ RAW DATA
# ==========================================
library(tidyverse)
library(tidytext)
library(stringr)

# Step 0 already read these three files into blogs_raw / news_raw /
# twitter_raw. Reuse those vectors when they are present instead of reading
# the same files from disk a second time; plain assignment does not
# duplicate the data (R copies only on modification). If this step is run on
# its own, fall back to reading the files.
blogs_data <- if (exists("blogs_raw")) {
  blogs_raw
} else {
  readLines("en_US/en_US.blogs.txt", warn = FALSE, skipNul = TRUE)
}

news_data <- if (exists("news_raw")) {
  news_raw
} else {
  readLines("en_US/en_US.news.txt", warn = FALSE, skipNul = TRUE)
}

twitter_data <- if (exists("twitter_raw")) {
  twitter_raw
} else {
  readLines("en_US/en_US.twitter.txt", warn = FALSE, skipNul = TRUE)
}

# ==========================================
# STEP 2: DATA SAMPLING (1% FOR PERFORMANCE)
# ==========================================
# Fixed seed so the same lines are drawn on every run.
set.seed(1234)

# Fraction of lines kept from each source; change it here to resample at a
# different rate.
sample_fraction <- 0.01

# sample() expects a whole-number size, so round the target count down
# explicitly instead of passing a fractional value.
sample_blogs <- sample(
  blogs_data, floor(length(blogs_data) * sample_fraction)
)
sample_news <- sample(
  news_data, floor(length(news_data) * sample_fraction)
)
sample_twitter <- sample(
  twitter_data, floor(length(twitter_data) * sample_fraction)
)

# One character vector holding the sampled lines of all three sources.
combined_sample <- c(sample_blogs, sample_news, sample_twitter)

# Clear heavy raw files from memory
rm(blogs_data, news_data, twitter_data, sample_blogs, sample_news, sample_twitter)

# The full-size vectors read in Step 0 hold the same text; as long as they
# exist, gc() cannot release that memory. Their summary statistics are
# already stored in separate variables, so drop them too (only the ones
# that actually exist, so this is safe when Step 0 was not run).
step0_copies <- c("blogs_raw", "news_raw", "twitter_raw")
rm(list = c(Filter(exists, step0_copies), "step0_copies"))

gc()
##            used  (Mb) gc trigger   (Mb)  max used  (Mb)
## Ncells  6551480 349.9   12462888  665.6   8760728 467.9
## Vcells 92887990 708.7  161637656 1233.2 114230294 871.6
# ==========================================
# STEP 3: TEXT CLEANING
# ==========================================
# Cleaning pipeline, applied to every sampled line in this order:
#   1. lower-case the text;
#   2. remove URLs (anything starting with "http" up to the next whitespace);
#   3. turn typographic apostrophes into the ASCII apostrophe, so step 5
#      keeps contractions written with curly quotes intact instead of
#      deleting the apostrophe;
#   4. turn every whitespace run (tabs included) into a single space, so
#      step 5 does not delete a tab and glue two words together;
#   5. drop every character that is not a letter, digit, apostrophe or space;
#   6. trim the ends and collapse repeated spaces.
clean_sample <- combined_sample %>%
  str_to_lower() %>%
  str_replace_all("http\\S+\\s*", "") %>%
  str_replace_all("[\u2018\u2019]", "'") %>%
  str_replace_all("\\s+", " ") %>%
  str_replace_all("[^a-zA-Z0-9' ]", "") %>%
  str_squish()

# One row per cleaned line. seq_along() gives an empty index for an empty
# sample, where 1:length(x) would produce c(1, 0).
text_df <- tibble(line = seq_along(clean_sample), text = clean_sample)

# ==========================================
# STEP 4: N-GRAM EXTRACTION & COUNTING
# ==========================================
# Single words: one row per token, then a frequency table sorted from the
# most to the least frequent word.
unigram_freq <- text_df %>%
  unnest_tokens(output = word, input = text, token = "words") %>%
  count(word, sort = TRUE)

# Two-word phrases: tokenise into bigrams, discard the missing (NA) bigram
# entries, then build the frequency table sorted from most to least
# frequent.
bigram_freq <- text_df %>%
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)

Exploratory Findings

Once I got rid of the URLs, punctuation, and extra spaces, I split the text into two-word phrases (bigrams). The bar chart below shows the top 15 most common word pairings found in my 1% sample.

# ==========================================
# STEP 5: VISUALIZE THE RESULTS
# ==========================================
# Horizontal bar chart of the 15 most frequent bigrams. bigram_freq is
# already sorted by descending count, so the first 15 rows are the top 15;
# reorder() sorts the bars by frequency and coord_flip() lays them out
# horizontally so the phrases are easy to read.
bigram_freq %>%
  slice_head(n = 15) %>%
  ggplot(mapping = aes(x = reorder(bigram, n), y = n)) +
  geom_col(fill = "#008080") +
  coord_flip() +
  labs(
    title = "Top 15 Most Common Bigrams",
    x = "Phrases",
    y = "Frequency"
  ) +
  theme_minimal()