Data Science Capstone: Milestone Report

Introduction

In this report, I’m taking a look at the SwiftKey dataset for our data science capstone. The main goal here is to clean up a bunch of messy text from blogs, news, and tweets so we can see which phrases pop up the most. Before diving into the cleaning and modeling, let’s look at the basic summaries of the raw source files, including their sizes, line counts, and total word counts.

# ==========================================
# STEP 0: CALCULATE RAW FILE STATS EFFICIENTLY
# ==========================================
library(stringr)

# File Paths (relative to the working directory used when knitting)
blogs_path   <- "en_US/en_US.blogs.txt"
news_path    <- "en_US/en_US.news.txt"
twitter_path <- "en_US/en_US.twitter.txt"

# File Sizes (MB): size on disk in bytes, converted with 1 MB = 1024^2 bytes
blogs_size   <- file.info(blogs_path)$size / (1024^2)
news_size    <- file.info(news_path)$size / (1024^2)
twitter_size <- file.info(twitter_path)$size / (1024^2)

# Read Lines: each file becomes a character vector, one element per line.
# skipNul = TRUE drops embedded NUL bytes instead of truncating the line.
# encoding = "UTF-8" marks the strings as UTF-8 so non-ASCII characters are
# interpreted correctly even when the session's native encoding is not UTF-8.
blogs_raw <- readLines(
  blogs_path,
  warn = FALSE, skipNul = TRUE, encoding = "UTF-8"
)
news_raw <- readLines(
  news_path,
  warn = FALSE, skipNul = TRUE, encoding = "UTF-8"
)
twitter_raw <- readLines(
  twitter_path,
  warn = FALSE, skipNul = TRUE, encoding = "UTF-8"
)

# Line Counts
blogs_lines   <- length(blogs_raw)
news_lines    <- length(news_raw)
twitter_lines <- length(twitter_raw)

# Word Counts: number of `\w+` regex matches (runs of word characters) per
# line, summed over the file. This is an approximation of the true word
# count -- e.g. a contraction such as "don't" is counted as two matches.
blogs_words   <- sum(str_count(blogs_raw, "\\w+"))
news_words    <- sum(str_count(news_raw, "\\w+"))
twitter_words <- sum(str_count(twitter_raw, "\\w+"))

# The raw vectors are only needed for the counts above. Release them here so
# the full text of all three files does not stay in memory for the rest of
# the report.
rm(blogs_raw, news_raw, twitter_raw)

# Combine into a clean summary table (one row per source file)
raw_summary_table <- data.frame(
  File_Name = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
  File_Size_MB = round(c(blogs_size, news_size, twitter_size), 2),
  Line_Count = c(blogs_lines, news_lines, twitter_lines),
  Word_Count = c(blogs_words, news_words, twitter_words)
)

# Print the table cleanly in the HTML report
knitr::kable(raw_summary_table, caption = "Summary Statistics of Raw SwiftKey Datasets")

Summary Statistics of Raw SwiftKey Datasets
File_Name	File_Size_MB	Line_Count	Word_Count
en_US.blogs.txt	200.42	899288	38309620
en_US.news.txt	196.28	1010206	35622913
en_US.twitter.txt	159.36	2360148	31003544

# ==========================================
# STEP 1: LOAD LIBRARIES & READ RAW DATA
# ==========================================
library(tidyverse)
library(tidytext)
library(stringr)

# Load the three corpora in full: each becomes a character vector with one
# element per line of the file.
# skipNul = TRUE drops embedded NUL bytes; warn = FALSE silences readLines'
# warnings about them and about a missing final newline.
# encoding = "UTF-8" marks the strings as UTF-8 so that non-ASCII characters
# (accented letters, typographic quotes, emoji) are not misread when the
# session's native encoding is something else.
blogs_data <- readLines(
  "en_US/en_US.blogs.txt",
  warn = FALSE, skipNul = TRUE, encoding = "UTF-8"
)
news_data <- readLines(
  "en_US/en_US.news.txt",
  warn = FALSE, skipNul = TRUE, encoding = "UTF-8"
)
twitter_data <- readLines(
  "en_US/en_US.twitter.txt",
  warn = FALSE, skipNul = TRUE, encoding = "UTF-8"
)

# ==========================================
# STEP 2: DATA SAMPLING (1% FOR PERFORMANCE)
# ==========================================
# Fix the RNG seed so the same lines are drawn on every run.
set.seed(1234)

# Draw 1% of the lines from each corpus (sample() defaults to sampling
# without replacement) and concatenate the draws in the order
# blogs, news, twitter. The draw order matters for reproducibility because
# all three calls consume the same RNG stream.
combined_sample <- c(
  sample(blogs_data, size = length(blogs_data) * 0.01),
  sample(news_data, size = length(news_data) * 0.01),
  sample(twitter_data, size = length(twitter_data) * 0.01)
)

# Only the combined sample is needed from here on: drop the full corpora and
# trigger a garbage collection (its memory report is printed in the output).
rm(blogs_data, news_data, twitter_data)
gc()

##            used  (Mb) gc trigger   (Mb)  max used  (Mb)
## Ncells  6551480 349.9   12462888  665.6   8760728 467.9
## Vcells 92887990 708.7  161637656 1233.2 114230294 871.6

# ==========================================
# STEP 3: TEXT CLEANING
# ==========================================
# Normalise the sampled lines into lower-case text containing only ASCII
# letters, digits, apostrophes and single spaces.
clean_sample <- combined_sample %>%
  str_to_lower() %>%
  # Remove URLs together with any whitespace that directly follows them
  str_replace_all("http\\S+\\s*", "") %>%
  # Map typographic single quotes to the ASCII apostrophe so that
  # "don’t" and "don't" end up as the same token instead of "dont" / "don't"
  str_replace_all("[\u2018\u2019]", "'") %>%
  # Turn tabs and other whitespace into plain spaces first; the character
  # filter below would otherwise delete them and glue neighbouring words
  str_replace_all("\\s+", " ") %>%
  # Delete every character that is not a letter, digit, apostrophe or space
  str_replace_all("[^a-zA-Z0-9' ]", "") %>%
  # Trim the ends and collapse repeated spaces left behind by the removals
  str_squish()

# One row per cleaned line; `line` is the line's position in the sample.
# seq_along() (rather than 1:length()) stays correct for an empty sample.
text_df <- tibble(line = seq_along(clean_sample), text = clean_sample)

# ==========================================
# STEP 4: N-GRAM EXTRACTION & COUNTING
# ==========================================
# Unigrams: split every cleaned line into word tokens and tally them.
# Result has one row per distinct word with its count `n`, largest first.
unigram_freq <- text_df %>%
  unnest_tokens(output = word, input = text, token = "words") %>%
  count(word, sort = TRUE)

# Bigrams: split every cleaned line into overlapping two-word sequences and
# tally them the same way. Rows with an NA bigram (presumably produced for
# lines too short to form a pair) are discarded before counting, so they
# never appear in the frequency table.
bigram_freq <- text_df %>%
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)

Data Science Capstone: Milestone Report

Data Science Student

2026-06-01

Introduction

Exploratory Findings