---
title: "Milestone Report"
author: "Phuoc Nguyen"
date: "2025-10-31"
output:
  html_document:
    toc: true
    toc_depth: 3
    toc_float: true
    number_sections: false
    theme: readable
    highlight: tango
---

1. Motivation & Goals (for a non‑data‑science manager)

2. Data Access & Loading

We use the Coursera/SwiftKey dataset (blogs, news, Twitter) for English (en_US). The following chunk downloads and unpacks the files into a data/ folder if they are not already present.

library(fs)
library(utils)
library(glue)

set.seed(2025)

dir_create("data")
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_path <- path("data", "Coursera-SwiftKey.zip")

if (!file_exists(zip_path)) {
  download.file(url, destfile = zip_path, mode = "wb")
}

# Unzip only once
if (!dir_exists(path("data", "final"))) {
  unzip(zip_path, exdir = "data")
}

# Paths for en_US files
base_dir <- path("data", "final", "en_US")
blogs_path   <- path(base_dir, "en_US.blogs.txt")
news_path    <- path(base_dir, "en_US.news.txt")
twitter_path <- path(base_dir, "en_US.twitter.txt")
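
Before any counting, a quick guard (our addition, a minimal sketch using fs::file_exists) fails fast if the expected files are missing:

# Stop early if any of the three en_US files is missing
stopifnot(all(file_exists(c(blogs_path, news_path, twitter_path))))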

3. Basic Summaries (files, lines, words)

We report each file's size, line count, and word count.

library(stringi)
library(readr)

# Helpers to compute line and word counts without building an intermediate data frame
count_lines <- function(file) length(readLines(file, warn = FALSE, skipNul = TRUE))
count_words <- function(file) {
  txt <- read_file(file)
  sum(stri_count_words(txt))
}

sizes <- tibble::tibble(
  File = c("Blogs", "News", "Twitter"),
  Path = c(blogs_path, news_path, twitter_path),
  Size_MB = round(file_info(c(blogs_path, news_path, twitter_path))$size / (1024^2), 2),
  Lines = purrr::map_int(c(blogs_path, news_path, twitter_path), count_lines),
  Words = purrr::map_int(c(blogs_path, news_path, twitter_path), count_words)
)

knitr::kable(sizes[, c("File", "Size_MB", "Lines", "Words")], caption = "File sizes (MB), line counts, and word counts")
File sizes (MB), line counts, and word counts

| File    | Size_MB | Lines   | Words    |
|---------|--------:|--------:|---------:|
| Blogs   | 200     | 899288  | 37546806 |
| News    | 196     | 1010206 | 34762658 |
| Twitter | 159     | 2360148 | 2127835  |

Note: Counting words via stringi on the full files may take a while on a low-resource laptop. The chunk is cached by knitr (cache = TRUE) so it is not re-run on every knit.
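
If even a single full-file pass strains memory, word counting can be streamed in fixed-size batches instead. The helper below is a sketch, not the code behind the table above; count_words_chunked and the 100,000-line batch size are our own choices:

count_words_chunked <- function(path, chunk_size = 100000L) {
  con <- file(path, open = "r")
  on.exit(close(con), add = TRUE)
  total <- 0
  repeat {
    batch <- readLines(con, n = chunk_size, warn = FALSE, skipNul = TRUE)
    if (length(batch) == 0) break  # end of file reached
    total <- total + sum(stringi::stri_count_words(batch))
  }
  total
}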

4. Create a Small Sample for EDA

To keep the EDA responsive, we randomly sample a subset of lines from each file (change sample_frac as needed).

library(dplyr)
library(readr)

sample_frac <- 0.01  # 1% of each file; increase if your machine is strong

sample_file <- function(path, frac) {
  con <- file(path, open = "r")
  on.exit(close(con), add = TRUE)
  lines <- readLines(con, warn = FALSE, skipNul = TRUE)
  n <- length(lines)
  keep <- sample.int(n, size = max(1, floor(n * frac)))
  tibble(text = lines[keep])
}

blogs_df   <- sample_file(blogs_path, sample_frac) %>% mutate(source = "blogs")
news_df    <- sample_file(news_path, sample_frac) %>% mutate(source = "news")
twitter_df <- sample_file(twitter_path, sample_frac) %>% mutate(source = "twitter")

sampled <- bind_rows(blogs_df, news_df, twitter_df)

# Quick sanity checks
knitr::kable(sampled %>% count(source), caption = "Sampled line counts by source")
Sampled line counts by source

| source  | n     |
|---------|------:|
| blogs   | 8992  |
| news    | 10102 |
| twitter | 23601 |
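
To keep downstream tables stable across knits, the sample can be persisted once and reloaded on later runs. This is a sketch we add here; the data/en_US.sample.rds path is an arbitrary choice, not part of the original pipeline:

sample_rds <- path("data", "en_US.sample.rds")
if (file_exists(sample_rds)) {
  sampled <- readr::read_rds(sample_rds)
} else {
  readr::write_rds(sampled, sample_rds)
}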

5. Cleaning & Tokenization (Unigrams, Bigrams, Trigrams)

We normalize text by lower-casing and rely on unnest_tokens() to strip punctuation; for unigrams we additionally drop stopwords and purely numeric tokens. We then tokenize into unigrams (single words), bigrams (2-word phrases), and trigrams (3-word phrases).

library(tidytext)
library(tidyr)
library(ggplot2)
library(stopwords)

# Unigrams
unigrams <- sampled %>%
  mutate(text = stringi::stri_trans_tolower(text)) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stopwords::stopwords("en"), # remove stopwords to see topical words
         !stri_detect_regex(word, "^[0-9]+$"))

top_uni <- unigrams %>% count(word, sort = TRUE) %>% slice_head(n = 20)

# Bigrams
bigrams <- sampled %>%
  mutate(text = stringi::stri_trans_tolower(text)) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%  # lines with fewer than 2 words yield NA bigrams
  separate(bigram, into = c("w1", "w2"), sep = " ", fill = "right", remove = FALSE) %>%
  filter(!w1 %in% stopwords::stopwords("en"), !w2 %in% stopwords::stopwords("en"))

top_bi <- bigrams %>% count(bigram, sort = TRUE) %>% slice_head(n = 20)

# Trigrams
trigrams <- sampled %>%
  mutate(text = stringi::stri_trans_tolower(text)) %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram))  # lines with fewer than 3 words yield NA trigrams

top_tri <- trigrams %>% count(trigram, sort = TRUE) %>% slice_head(n = 20)

knitr::kable(top_uni, caption = "Top 20 unigrams (after stopword removal)")
Top 20 unigrams (after stopword removal)

| word   | n    |
|--------|-----:|
| just   | 3152 |
| said   | 3035 |
| one    | 2978 |
| like   | 2699 |
| can    | 2521 |
| get    | 2252 |
| time   | 2208 |
| new    | 2000 |
| good   | 1878 |
| now    | 1837 |
| day    | 1782 |
| know   | 1628 |
| love   | 1611 |
| people | 1580 |
| year   | 1449 |
| see    | 1438 |
| back   | 1428 |
| first  | 1400 |
| go     | 1392 |
| also   | 1327 |
knitr::kable(top_bi,  caption = "Top 20 bigrams (after stopword removal)")
Top 20 bigrams (after stopword removal)

| bigram          | n   |
|-----------------|----:|
| right now       | 278 |
| new york        | 205 |
| year old        | 203 |
| last year       | 176 |
| last night      | 153 |
| high school     | 145 |
| first time      | 132 |
| last week       | 130 |
| years ago       | 128 |
| feel like       | 123 |
| st louis        | 113 |
| looks like      | 104 |
| next week       | 102 |
| looking forward | 98  |
| can get         | 97  |
| even though     | 95  |
| just got        | 95  |
| make sure       | 93  |
| united states   | 91  |
knitr::kable(top_tri, caption = "Top 20 trigrams")
Top 20 trigrams

| trigram        | n   |
|----------------|----:|
| one of the     | 349 |
| a lot of       | 297 |
| thanks for the | 265 |
| to be a        | 194 |
| it was a       | 182 |
| going to be    | 179 |
| as well as     | 142 |
| i want to      | 140 |
| part of the    | 136 |
| the end of     | 130 |
| out of the     | 125 |
| i have to      | 122 |
| some of the    | 121 |
| be able to     | 119 |
| this is a      | 115 |
| i have a       | 113 |
| a couple of    | 109 |
| the first time | 109 |
| the fact that  | 105 |

6. Basic Plots

We add simple plots (histograms / bar charts) that a non‑technical reader can grasp quickly.

# Histogram of words per line (sample)
sampled %>%
  mutate(tokens = stri_count_words(text)) %>%
  ggplot(aes(tokens)) +
  geom_histogram(binwidth = 2) +
  labs(title = "Distribution of words per line (sample)", x = "Words per line", y = "Count")

# Bar charts for top unigrams and bigrams
top_uni %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 20 unigrams (sample)", x = NULL, y = "Frequency")

top_bi %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(bigram, n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 20 bigrams (sample)", x = NULL, y = "Frequency")

7. Interesting Early Findings (Examples)

8. Plan for the Prediction Algorithm

9. Plan for the Shiny App

10. Checklist (Rubric)

11. Reproducibility & Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Saigon
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] stopwords_2.3  ggplot2_4.0.0  tidyr_1.3.1    tidytext_0.4.3 dplyr_1.1.4   
## [6] readr_2.1.5    stringi_1.8.7  glue_1.8.0     fs_1.6.6      
## 
## loaded via a namespace (and not attached):
##  [1] Matrix_1.7-3       gtable_0.3.6       jsonlite_2.0.0     janeaustenr_1.0.0 
##  [5] compiler_4.5.1     tidyselect_1.2.1   Rcpp_1.1.0         stringr_1.5.2     
##  [9] jquerylib_0.1.4    scales_1.4.0       fastmap_1.2.0      lattice_0.22-7    
## [13] R6_2.6.1           labeling_0.4.3     SnowballC_0.7.1    generics_0.1.4    
## [17] knitr_1.50         tibble_3.3.0       RColorBrewer_1.1-3 bslib_0.9.0       
## [21] pillar_1.11.1      tzdb_0.5.0         tokenizers_0.3.0   rlang_1.1.6       
## [25] cachem_1.1.0       xfun_0.53          S7_0.2.0           sass_0.4.10       
## [29] cli_3.6.5          withr_3.0.2        magrittr_2.0.4     digest_0.6.37     
## [33] grid_4.5.1         rstudioapi_0.17.1  hms_1.1.3          lifecycle_1.0.4   
## [37] vctrs_0.6.5        evaluate_1.0.5     farver_2.1.2       codetools_0.2-20  
## [41] purrr_1.1.0        rmarkdown_2.29     tools_4.5.1        pkgconfig_2.0.3   
## [45] htmltools_0.5.8.1

12. Notes for Reviewers