---
title: "Milestone Report"
author: "Phuoc Nguyen"
date: "2025-10-31"
output:
  html_document:
    toc: true
    toc_depth: 3
    toc_float: true
    number_sections: false
    theme: readable
    highlight: tango
---
## Getting the data

We use the Coursera/SwiftKey data set (blogs, news, Twitter) for English (en_US). The following chunk downloads and unpacks the files into a data/ folder if they are not already present.
```{r get-data, message=FALSE}
library(fs)
library(utils)
library(glue)

set.seed(2025)
dir_create("data")

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_path <- path("data", "Coursera-SwiftKey.zip")

# Download only once
if (!file_exists(zip_path)) {
  download.file(url, destfile = zip_path, mode = "wb")
}

# Unzip only once
if (!dir_exists(path("data", "final"))) {
  unzip(zip_path, exdir = "data")
}

# Paths to the en_US files
base_dir <- path("data", "final", "en_US")
blogs_path <- path(base_dir, "en_US.blogs.txt")
news_path <- path(base_dir, "en_US.news.txt")
twitter_path <- path(base_dir, "en_US.twitter.txt")
```
## Basic file statistics

For each file we report its size on disk (MB), number of lines, and number of words.
```{r file-stats, cache=TRUE}
library(stringi)
library(readr)

# Helpers: count lines and words without building a full data frame first
count_lines <- function(file) length(readLines(file, warn = FALSE, skipNul = TRUE))
count_words <- function(file) {
  txt <- read_file(file)
  sum(stri_count_words(txt))
}

paths <- c(blogs_path, news_path, twitter_path)
sizes <- tibble::tibble(
  File = c("Blogs", "News", "Twitter"),
  Path = paths,
  Size_MB = round(file_info(paths)$size / (1024^2), 2),
  Lines = purrr::map_int(paths, count_lines),
  Words = purrr::map_int(paths, count_words)
)
knitr::kable(sizes[, c("File", "Size_MB", "Lines", "Words")],
             caption = "File sizes (MB), line counts, and word counts")
```
| File | Size_MB | Lines | Words |
|---|---|---|---|
| Blogs | 200 | 899288 | 37546806 |
| News | 196 | 1010206 | 34762658 |
| Twitter | 159 | 2360148 | 2127835 |
Note: counting words with stringi over the full files can take a while on a low-resource laptop, so the chunk above is cached by knitr (cache=TRUE) to avoid re-running it on every knit.
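If even the cached first run is too heavy, a chunked count avoids holding a whole file in memory at once. The sketch below is not part of the report's pipeline; it assumes readr's read_lines_chunked and AccumulateCallback API, and count_words_chunked is a hypothetical drop-in for count_words above.

```{r chunked-count, eval=FALSE}
# Sketch: sum word counts 100k lines at a time so no file is ever
# fully resident in memory (assumes readr's chunked-reader callbacks).
count_words_chunked <- function(file, chunk_size = 1e5) {
  readr::read_lines_chunked(
    file,
    readr::AccumulateCallback$new(
      function(lines, pos, acc) acc + sum(stringi::stri_count_words(lines)),
      acc = 0
    ),
    chunk_size = chunk_size
  )
}
```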
## Sampling

To keep the exploratory analysis responsive, we randomly sample a subset of lines from each file (adjust sample_frac as needed; the set.seed(2025) call above keeps the sample reproducible).
```{r sample-lines}
library(dplyr)
library(readr)

sample_frac <- 0.01  # 1% of each file; increase on a stronger machine

sample_file <- function(path, frac) {
  con <- file(path, open = "r")
  on.exit(close(con), add = TRUE)
  lines <- readLines(con, warn = FALSE, skipNul = TRUE)
  n <- length(lines)
  keep <- sample.int(n, size = max(1, floor(n * frac)))
  tibble(text = lines[keep])
}

blogs_df <- sample_file(blogs_path, sample_frac) %>% mutate(source = "blogs")
news_df <- sample_file(news_path, sample_frac) %>% mutate(source = "news")
twitter_df <- sample_file(twitter_path, sample_frac) %>% mutate(source = "twitter")
sampled <- bind_rows(blogs_df, news_df, twitter_df)

# Quick sanity check: sampled line counts per source
knitr::kable(sampled %>% count(source), caption = "Sampled line counts by source")
```
| source | n |
|---|---|
| blogs | 8992 |
| news | 10102 |
| twitter | 23601 |
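Note that sample_file still reads every line before sampling. A streaming variant keeps memory flat by flipping a biased coin per line instead; the hypothetical sample_file_stream below is a sketch (not used in this report) that yields the target fraction only in expectation.

```{r stream-sample, eval=FALSE}
# Sketch: keep each line with probability frac while streaming in blocks,
# so the full file never sits in memory at once.
sample_file_stream <- function(path, frac, block = 1e5) {
  con <- file(path, open = "r")
  on.exit(close(con), add = TRUE)
  kept <- character(0)
  repeat {
    lines <- readLines(con, n = block, warn = FALSE, skipNul = TRUE)
    if (length(lines) == 0) break
    kept <- c(kept, lines[rbinom(length(lines), 1, frac) == 1])
  }
  tibble(text = kept)
}
```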
## Tokenization and n-grams

We normalize the text by lower-casing it, then tokenize into unigrams (single words), bigrams (2-word phrases), and trigrams (3-word phrases). unnest_tokens strips punctuation during tokenization, and we additionally filter out purely numeric tokens.
```{r ngrams}
library(tidytext)
library(tidyr)
library(ggplot2)
library(stopwords)

# Unigrams: remove stopwords (to surface topical words) and numeric tokens
unigrams <- sampled %>%
  mutate(text = stringi::stri_trans_tolower(text)) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stopwords::stopwords("en"),
         !stri_detect_regex(word, "^[0-9]+$"))
top_uni <- unigrams %>% count(word, sort = TRUE) %>% slice_head(n = 20)

# Bigrams: drop pairs where either word is a stopword
bigrams <- sampled %>%
  mutate(text = stringi::stri_trans_tolower(text)) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("w1", "w2"), sep = " ", fill = "right", remove = FALSE) %>%
  filter(!w1 %in% stopwords::stopwords("en"), !w2 %in% stopwords::stopwords("en"))
top_bi <- bigrams %>% count(bigram, sort = TRUE) %>% slice_head(n = 20)

# Trigrams: kept unfiltered here
trigrams <- sampled %>%
  mutate(text = stringi::stri_trans_tolower(text)) %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3)
top_tri <- trigrams %>% count(trigram, sort = TRUE) %>% slice_head(n = 20)
```
```{r}
knitr::kable(top_uni, caption = "Top 20 unigrams (after stopword removal)")
```
| word | n |
|---|---|
| just | 3152 |
| said | 3035 |
| one | 2978 |
| like | 2699 |
| can | 2521 |
| get | 2252 |
| time | 2208 |
| new | 2000 |
| good | 1878 |
| now | 1837 |
| day | 1782 |
| know | 1628 |
| love | 1611 |
| people | 1580 |
| year | 1449 |
| see | 1438 |
| back | 1428 |
| first | 1400 |
| go | 1392 |
| also | 1327 |
```{r}
knitr::kable(top_bi, caption = "Top 20 bigrams (after stopword removal)")
```
| bigram | n |
|---|---|
| right now | 278 |
| new york | 205 |
| year old | 203 |
| last year | 176 |
| last night | 153 |
| high school | 145 |
| NA | 140 |
| first time | 132 |
| last week | 130 |
| years ago | 128 |
| feel like | 123 |
| st louis | 113 |
| looks like | 104 |
| next week | 102 |
| looking forward | 98 |
| can get | 97 |
| even though | 95 |
| just got | 95 |
| make sure | 93 |
| united states | 91 |
```{r}
knitr::kable(top_tri, caption = "Top 20 trigrams")
```
| trigram | n |
|---|---|
| NA | 1157 |
| one of the | 349 |
| a lot of | 297 |
| thanks for the | 265 |
| to be a | 194 |
| it was a | 182 |
| going to be | 179 |
| as well as | 142 |
| i want to | 140 |
| part of the | 136 |
| the end of | 130 |
| out of the | 125 |
| i have to | 122 |
| some of the | 121 |
| be able to | 119 |
| this is a | 115 |
| i have a | 113 |
| a couple of | 109 |
| the first time | 109 |
| the fact that | 105 |
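The NA rows in the bigram and trigram tables appear to be a tokenization artifact: sampled lines with fewer words than the n-gram order yield NA tokens. Before building the prediction model we would drop them, for example:

```{r eval=FALSE}
# Drop the NA tokens produced by lines shorter than the n-gram order
bigrams  <- bigrams  %>% filter(!is.na(bigram))
trigrams <- trigrams %>% filter(!is.na(trigram))
```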
## Exploratory plots

We add simple plots (a histogram and bar charts) that a non-technical reader can grasp quickly.
```{r words-per-line-hist}
# Histogram of words per line (sample)
sampled %>%
  mutate(tokens = stri_count_words(text)) %>%
  ggplot(aes(tokens)) +
  geom_histogram(binwidth = 2) +
  labs(title = "Distribution of words per line (sample)",
       x = "Words per line", y = "Count")
```

```{r top-ngram-bars}
# Bar charts for the top unigrams and bigrams
top_uni %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 20 unigrams (sample)", x = NULL, y = "Frequency")

top_bi %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(bigram, n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 20 bigrams (sample)", x = NULL, y = "Frequency")
```
A note on performance: base readLines is adequate at a 1% sample, but for the full corpus we would move the heavy I/O to faster, more memory-efficient back ends (e.g., data.table / arrow).
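For instance, data.table's fread can pull raw lines into a one-column table much faster than readLines on files this size. This is a sketch, assuming fread's sep = "\n" line-reading mode; it is not run in this report.

```{r fread-lines, eval=FALSE}
# Read each raw line as one row of a single-column table
library(data.table)
blogs_dt <- fread(blogs_path, sep = "\n", header = FALSE,
                  quote = "", col.names = "text")
```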
## Session info

```{r session-info}
sessionInfo()
```
```
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Saigon
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] stopwords_2.3 ggplot2_4.0.0 tidyr_1.3.1 tidytext_0.4.3 dplyr_1.1.4
## [6] readr_2.1.5 stringi_1.8.7 glue_1.8.0 fs_1.6.6
##
## loaded via a namespace (and not attached):
## [1] Matrix_1.7-3 gtable_0.3.6 jsonlite_2.0.0 janeaustenr_1.0.0
## [5] compiler_4.5.1 tidyselect_1.2.1 Rcpp_1.1.0 stringr_1.5.2
## [9] jquerylib_0.1.4 scales_1.4.0 fastmap_1.2.0 lattice_0.22-7
## [13] R6_2.6.1 labeling_0.4.3 SnowballC_0.7.1 generics_0.1.4
## [17] knitr_1.50 tibble_3.3.0 RColorBrewer_1.1-3 bslib_0.9.0
## [21] pillar_1.11.1 tzdb_0.5.0 tokenizers_0.3.0 rlang_1.1.6
## [25] cachem_1.1.0 xfun_0.53 S7_0.2.0 sass_0.4.10
## [29] cli_3.6.5 withr_3.0.2 magrittr_2.0.4 digest_0.6.37
## [33] grid_4.5.1 rstudioapi_0.17.1 hms_1.1.3 lifecycle_1.0.4
## [37] vctrs_0.6.5 evaluate_1.0.5 farver_2.1.2 codetools_0.2-20
## [41] purrr_1.1.0 rmarkdown_2.29 tools_4.5.1 pkgconfig_2.0.3
## [45] htmltools_0.5.8.1
```
## Next steps

Increase sample_frac to reproduce fuller results on the complete corpus, and consider parallelizing the per-file work with future.apply (not required for the milestone).
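If we do parallelize, a minimal sketch (assuming the count_words helper defined above and a multisession plan) could look like:

```{r parallel-counts, eval=FALSE}
# Count words in the three files in parallel worker sessions
library(future.apply)
plan(multisession)
word_counts <- future_sapply(c(blogs_path, news_path, twitter_path), count_words)
```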