---
title: "Milestone Report"
author: "Phuoc Nguyen"
date: "2025-10-31"
output:
  html_document:
    toc: true
    toc_depth: 3
    toc_float: true
    number_sections: false
    theme: readable
    highlight: tango
---
## Getting the data

We use the Coursera/SwiftKey data set (blogs, news, Twitter) for English (en_US). The following chunk downloads and unpacks the files into a data/ folder if they are not already present.
```{r get-data, message=FALSE}
library(fs)
library(utils)
library(glue)

set.seed(2025)
dir_create("data")

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_path <- path("data", "Coursera-SwiftKey.zip")

# Download only once
if (!file_exists(zip_path)) {
  download.file(url, destfile = zip_path, mode = "wb")
}

# Unzip only once
if (!dir_exists(path("data", "final"))) {
  unzip(zip_path, exdir = "data")
}

# Paths to the en_US files
base_dir <- path("data", "final", "en_US")
blogs_path <- path(base_dir, "en_US.blogs.txt")
news_path <- path(base_dir, "en_US.news.txt")
twitter_path <- path(base_dir, "en_US.twitter.txt")
```
## Basic file statistics

For each file we report its size on disk (MB), number of lines, and number of words.
```{r file-stats, cache=TRUE}
library(stringi)
library(readr)

# Helpers: count lines and words without building a full data frame first
count_lines <- function(file) length(readLines(file, warn = FALSE, skipNul = TRUE))
count_words <- function(file) {
  txt <- read_file(file)
  sum(stri_count_words(txt))
}

paths <- c(blogs_path, news_path, twitter_path)
sizes <- tibble::tibble(
  File = c("Blogs", "News", "Twitter"),
  Path = paths,
  Size_MB = round(file_info(paths)$size / (1024^2), 2),
  Lines = purrr::map_int(paths, count_lines),
  Words = purrr::map_int(paths, count_words)
)
knitr::kable(sizes[, c("File", "Size_MB", "Lines", "Words")],
             caption = "File sizes (MB), line counts, and word counts")
```
| File | Size_MB | Lines | Words |
|---|---|---|---|
| Blogs | 200 | 899288 | 37546806 |
| News | 196 | 1010206 | 34762658 |
| Twitter | 159 | 2360148 | 2127835 |
Note: counting words with stringi over the full files can take a while on a low-resource laptop, so the chunk above is cached by knitr (cache=TRUE) to avoid re-running it on every knit.
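If even the cached first run is too heavy, a chunked count avoids holding a whole file in memory at once. The sketch below is not part of the report's pipeline; it assumes readr's read_lines_chunked and AccumulateCallback API, and count_words_chunked is a hypothetical drop-in for count_words above.

```{r chunked-count, eval=FALSE}
# Sketch: sum word counts 100k lines at a time so no file is ever
# fully resident in memory (assumes readr's chunked-reader callbacks).
count_words_chunked <- function(file, chunk_size = 1e5) {
  readr::read_lines_chunked(
    file,
    readr::AccumulateCallback$new(
      function(lines, pos, acc) acc + sum(stringi::stri_count_words(lines)),
      acc = 0
    ),
    chunk_size = chunk_size
  )
}
```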
## Sampling

To keep the exploratory analysis responsive, we randomly sample a subset of lines from each file (adjust sample_frac as needed; the set.seed(2025) call above keeps the sample reproducible).
```{r sample-lines}
library(dplyr)
library(readr)

sample_frac <- 0.01  # 1% of each file; increase on a stronger machine

sample_file <- function(path, frac) {
  con <- file(path, open = "r")
  on.exit(close(con), add = TRUE)
  lines <- readLines(con, warn = FALSE, skipNul = TRUE)
  n <- length(lines)
  keep <- sample.int(n, size = max(1, floor(n * frac)))
  tibble(text = lines[keep])
}

blogs_df <- sample_file(blogs_path, sample_frac) %>% mutate(source = "blogs")
news_df <- sample_file(news_path, sample_frac) %>% mutate(source = "news")
twitter_df <- sample_file(twitter_path, sample_frac) %>% mutate(source = "twitter")
sampled <- bind_rows(blogs_df, news_df, twitter_df)

# Quick sanity check: sampled line counts per source
knitr::kable(sampled %>% count(source), caption = "Sampled line counts by source")
```
| source | n |
|---|---|
| blogs | 8992 |
| news | 10102 |
| twitter | 23601 |
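Note that sample_file still reads every line before sampling. A streaming variant keeps memory flat by flipping a biased coin per line instead; the hypothetical sample_file_stream below is a sketch (not used in this report) that yields the target fraction only in expectation.

```{r stream-sample, eval=FALSE}
# Sketch: keep each line with probability frac while streaming in blocks,
# so the full file never sits in memory at once.
sample_file_stream <- function(path, frac, block = 1e5) {
  con <- file(path, open = "r")
  on.exit(close(con), add = TRUE)
  kept <- character(0)
  repeat {
    lines <- readLines(con, n = block, warn = FALSE, skipNul = TRUE)
    if (length(lines) == 0) break
    kept <- c(kept, lines[rbinom(length(lines), 1, frac) == 1])
  }
  tibble(text = kept)
}
```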
## Tokenization and n-grams

We normalize the text by lower-casing it, then tokenize into unigrams (single words), bigrams (2-word phrases), and trigrams (3-word phrases). unnest_tokens strips punctuation during tokenization, and we additionally filter out purely numeric tokens.
```{r ngrams}
library(tidytext)
library(tidyr)
library(ggplot2)
library(stopwords)

# Unigrams: remove stopwords (to surface topical words) and numeric tokens
unigrams <- sampled %>%
  mutate(text = stringi::stri_trans_tolower(text)) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stopwords::stopwords("en"),
         !stri_detect_regex(word, "^[0-9]+$"))
top_uni <- unigrams %>% count(word, sort = TRUE) %>% slice_head(n = 20)

# Bigrams: drop pairs where either word is a stopword
bigrams <- sampled %>%
  mutate(text = stringi::stri_trans_tolower(text)) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("w1", "w2"), sep = " ", fill = "right", remove = FALSE) %>%
  filter(!w1 %in% stopwords::stopwords("en"), !w2 %in% stopwords::stopwords("en"))
top_bi <- bigrams %>% count(bigram, sort = TRUE) %>% slice_head(n = 20)

# Trigrams: kept unfiltered here
trigrams <- sampled %>%
  mutate(text = stringi::stri_trans_tolower(text)) %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3)
top_tri <- trigrams %>% count(trigram, sort = TRUE) %>% slice_head(n = 20)
```
```{r}
knitr::kable(top_uni, caption = "Top 20 unigrams (after stopword removal)")
```
| word | n |
|---|---|
| just | 3152 |
| said | 3035 |
| one | 2978 |
| like | 2699 |
| can | 2521 |
| get | 2252 |
| time | 2208 |
| new | 2000 |
| good | 1878 |
| now | 1837 |
| day | 1782 |
| know | 1628 |
| love | 1611 |
| people | 1580 |
| year | 1449 |
| see | 1438 |
| back | 1428 |
| first | 1400 |
| go | 1392 |
| also | 1327 |
```{r}
knitr::kable(top_bi, caption = "Top 20 bigrams (after stopword removal)")
```
| bigram | n |
|---|---|
| right now | 278 |
| new york | 205 |
| year old | 203 |
| last year | 176 |
| last night | 153 |
| high school | 145 |
| NA | 140 |
| first time | 132 |
| last week | 130 |
| years ago | 128 |
| feel like | 123 |
| st louis | 113 |
| looks like | 104 |
| next week | 102 |
| looking forward | 98 |
| can get | 97 |
| even though | 95 |
| just got | 95 |
| make sure | 93 |
| united states | 91 |
```{r}
knitr::kable(top_tri, caption = "Top 20 trigrams")
```
| trigram | n |
|---|---|
| NA | 1157 |
| one of the | 349 |
| a lot of | 297 |
| thanks for the | 265 |
| to be a | 194 |
| it was a | 182 |
| going to be | 179 |
| as well as | 142 |
| i want to | 140 |
| part of the | 136 |
| the end of | 130 |
| out of the | 125 |
| i have to | 122 |
| some of the | 121 |
| be able to | 119 |
| this is a | 115 |
| i have a | 113 |
| a couple of | 109 |
| the first time | 109 |
| the fact that | 105 |
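The NA rows in the bigram and trigram tables appear to be a tokenization artifact: sampled lines with fewer words than the n-gram order yield NA tokens. Before building the prediction model we would drop them, for example:

```{r eval=FALSE}
# Drop the NA tokens produced by lines shorter than the n-gram order
bigrams  <- bigrams  %>% filter(!is.na(bigram))
trigrams <- trigrams %>% filter(!is.na(trigram))
```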
## Exploratory plots

We add simple plots (a histogram and bar charts) that a non-technical reader can grasp quickly.
```{r words-per-line-hist}
# Histogram of words per line (sample)
sampled %>%
  mutate(tokens = stri_count_words(text)) %>%
  ggplot(aes(tokens)) +
  geom_histogram(binwidth = 2) +
  labs(title = "Distribution of words per line (sample)",
       x = "Words per line", y = "Count")
```

```{r top-ngram-bars}
# Bar charts for the top unigrams and bigrams
top_uni %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 20 unigrams (sample)", x = NULL, y = "Frequency")

top_bi %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(bigram, n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 20 bigrams (sample)", x = NULL, y = "Frequency")
```
A note on performance: base readLines is adequate at a 1% sample, but for the full corpus we would move the heavy I/O to faster, more memory-efficient back ends (e.g., data.table / arrow).
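For instance, data.table's fread can pull raw lines into a one-column table much faster than readLines on files this size. This is a sketch, assuming fread's sep = "\n" line-reading mode; it is not run in this report.

```{r fread-lines, eval=FALSE}
# Read each raw line as one row of a single-column table
library(data.table)
blogs_dt <- fread(blogs_path, sep = "\n", header = FALSE,
                  quote = "", col.names = "text")
```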
## Session info

```{r session-info}
sessionInfo()
```
```
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Saigon
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] stopwords_2.3 ggplot2_4.0.0 tidyr_1.3.1 tidytext_0.4.3 dplyr_1.1.4
## [6] readr_2.1.5 stringi_1.8.7 glue_1.8.0 fs_1.6.6
##
## loaded via a namespace (and not attached):
## [1] Matrix_1.7-3 gtable_0.3.6 jsonlite_2.0.0 janeaustenr_1.0.0
## [5] compiler_4.5.1 tidyselect_1.2.1 Rcpp_1.1.0 stringr_1.5.2
## [9] jquerylib_0.1.4 scales_1.4.0 fastmap_1.2.0 lattice_0.22-7
## [13] R6_2.6.1 labeling_0.4.3 SnowballC_0.7.1 generics_0.1.4
## [17] knitr_1.50 tibble_3.3.0 RColorBrewer_1.1-3 bslib_0.9.0
## [21] pillar_1.11.1 tzdb_0.5.0 tokenizers_0.3.0 rlang_1.1.6
## [25] cachem_1.1.0 xfun_0.53 S7_0.2.0 sass_0.4.10
## [29] cli_3.6.5 withr_3.0.2 magrittr_2.0.4 digest_0.6.37
## [33] grid_4.5.1 rstudioapi_0.17.1 hms_1.1.3 lifecycle_1.0.4
## [37] vctrs_0.6.5 evaluate_1.0.5 farver_2.1.2 codetools_0.2-20
## [41] purrr_1.1.0 rmarkdown_2.29 tools_4.5.1 pkgconfig_2.0.3
## [45] htmltools_0.5.8.1
```
## Next steps

Increase sample_frac to reproduce fuller results on the complete corpus, and consider parallelizing the per-file work with future.apply (not required for the milestone).
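If we do parallelize, a minimal sketch (assuming the count_words helper defined above and a multisession plan) could look like:

```{r parallel-counts, eval=FALSE}
# Count words in the three files in parallel worker sessions
library(future.apply)
plan(multisession)
word_counts <- future_sapply(c(blogs_path, news_path, twitter_path), count_words)
```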