I use the English dataset (blogs, news, Twitter) and prepare it with cleaning, tokenization, and profanity filtering.
use_pkgs <- c("readr", "data.table", "stringi", "dplyr", "tidyr", "tibble", "quanteda", "quanteda.textstats", "ggplot2", "scales", "kableExtra", "here")
inst <- use_pkgs[!(use_pkgs %in% installed.packages()[, "Package"])]
if (length(inst)) install.packages(inst, dependencies = TRUE)
invisible(lapply(use_pkgs, library, character.only = TRUE))
DATA_DIR <- here::here("Coursera-SwiftKey/final/en_US")
blogs_path <- file.path(DATA_DIR, "en_US.blogs.txt")
news_path <- file.path(DATA_DIR, "en_US.news.txt")
twitter_path <- file.path(DATA_DIR, "en_US.twitter.txt")
read_lines_fast <- function(path) {
data.table::fread(path, sep = "\n", header = FALSE, encoding = "UTF-8", quote = "", data.table = FALSE, showProgress = FALSE)[[1]]
}
blogs <- read_lines_fast(blogs_path)
news <- read_lines_fast(news_path)
twitter <- read_lines_fast(twitter_path)
size_mb <- function(path) round(file.info(path)$size / 1024^2, 2)
stats_tbl <- tibble::tibble(
file = c("blogs", "news", "twitter"),
size_MB = c(size_mb(blogs_path), size_mb(news_path), size_mb(twitter_path)),
lines = c(length(blogs), length(news), length(twitter)),
n_chars = c(sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter))),
n_words = c(sum(stringi::stri_count_words(blogs)),
sum(stringi::stri_count_words(news)),
sum(stringi::stri_count_words(twitter)))
)
knitr::kable(stats_tbl, caption = "Raw file summary") %>%
kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
| file | size_MB | lines | n_chars | n_words |
|---|---|---|---|---|
| blogs | 200.42 | 615491 | 141455384 | 25679351 |
| news | 196.28 | 987096 | 198569320 | 33966501 |
| twitter | 159.36 | 2360148 | 162096241 | 30096690 |
blogs1 <- iconv(blogs, from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")
news1 <- iconv(news, from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")
twitter1 <- iconv(twitter, from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")
sample_pct <- 0.01
sample_vec <- function(x, p) if (length(x) > 0) sample(x, max(1, floor(length(x) * p))) else character()
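# Fix the RNG seed so the 1% sample is reproducible (the seed value itself is arbitrary)
set.seed(1234)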
sample_data <- c(sample_vec(blogs1, sample_pct), sample_vec(news1, sample_pct), sample_vec(twitter1, sample_pct))
clean_text <- function(x) {
x |>
stringi::stri_replace_all_regex("https?://[^\\s]+", " ") |>
stringi::stri_replace_all_regex("[0-9]+", " ") |>
stringi::stri_trans_tolower() |>
stringi::stri_trim_both()
}
sample_clean <- clean_text(sample_data)
corp <- quanteda::corpus(sample_clean)
profane_words <- c("badword1", "badword2", "offensiveword") # extend as needed
tok <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) |> tokens_remove(profane_words)
tok_uni_nostop <- tokens_remove(tok, stopwords("en"))
tok_bi <- tokens_ngrams(tok, n = 2)
tok_tri <- tokens_ngrams(tok, n = 3)
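The profanity vector above is only a placeholder. In practice I would drop in a published profanity list; a minimal sketch, assuming a local one-term-per-line file named profanity_en.txt (a hypothetical path) that replaces the placeholder definition before the tokens_remove() call:

profane_words <- readr::read_lines(here::here("profanity_en.txt"))  # hypothetical file: one banned term per line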
I explore word frequencies, n-gram distributions, and coverage.
make_freq <- function(toks) {
dfm <- quanteda::dfm(toks)
frq <- quanteda.textstats::textstat_frequency(dfm)
tibble::as_tibble(frq) |> dplyr::select(feature, frequency)
}
freq_uni <- make_freq(tok_uni_nostop)
freq_bi <- make_freq(tok_bi)
freq_tri <- make_freq(tok_tri)
plot_top_n <- function(freq_tbl, n = 10, title = "Top features", fill_color = "#2C3E50") {
freq_tbl |>
slice_max(frequency, n = n) |>
mutate(feature = reorder(feature, frequency)) |>
ggplot(aes(x = feature, y = frequency)) +
geom_col(fill = fill_color) +
coord_flip() +
labs(title = title, x = NULL, y = "Frequency") +
scale_y_continuous(labels = scales::comma) +
theme_minimal(base_size = 12)
}
plot_top_n(freq_uni, 10, "Top 10 Unigrams (no stopwords)", "#18BC9C")
plot_top_n(freq_bi, 10, "Top 10 Bigrams", "#3498DB")
plot_top_n(freq_tri, 10, "Top 10 Trigrams", "#E74C3C")
cum_cov <- freq_uni |> mutate(cum_freq = cumsum(frequency)/sum(frequency))
cover50 <- min(which(cum_cov$cum_freq >= 0.5))
cover90 <- min(which(cum_cov$cum_freq >= 0.9))
list(words_needed_50pct = cover50, words_needed_90pct = cover90)
## $words_needed_50pct
## [1] 1011
##
## $words_needed_90pct
## [1] 15215
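To visualise this coverage result, the cumulative curve can be drawn directly from cum_cov; a minimal sketch (the axis labels, colours, and reference lines are my choices, not fixed):

cum_cov |>
  dplyr::mutate(rank = dplyr::row_number()) |>
  ggplot(aes(x = rank, y = cum_freq)) +
  geom_line(color = "#2C3E50") +
  geom_hline(yintercept = c(0.5, 0.9), linetype = "dashed") +
  labs(title = "Cumulative unigram coverage",
       x = "Unique words (ranked by frequency)",
       y = "Share of all word occurrences covered") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal(base_size = 12)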
I build a basic predictive model using n-grams and backoff.
lookup_bi <- freq_bi |>
tidyr::separate(feature, into = c("w1","w2"), sep = "_", remove = FALSE) |>
group_by(w1) |> arrange(desc(frequency), .by_group = TRUE)
lookup_tri <- freq_tri |>
tidyr::separate(feature, into = c("w1","w2","w3"), sep = "_", remove = FALSE) |>
group_by(w1, w2) |> arrange(desc(frequency), .by_group = TRUE)
predict_next <- function(phrase, top_k = 3) {
# Tokenize once; avoid piping to `[[` which breaks with base pipe
toks <- quanteda::tokens(phrase, remove_punct = TRUE)[[1]]
if (length(toks) == 0) {
return(freq_uni$feature[seq_len(min(top_k, nrow(freq_uni)))])
}
toks <- tolower(toks)
n <- length(toks)
# Try trigram continuation
if (n >= 2) {
w1 <- toks[n - 1]; w2 <- toks[n]
cand <- lookup_tri |>
dplyr::filter(w1 == .env$w1, w2 == .env$w2) |>
dplyr::slice_head(n = top_k)
if (nrow(cand) > 0) return(cand$w3)
}
# Back off to bigram
w1 <- toks[n]
cand <- lookup_bi |>
dplyr::filter(w1 == .env$w1) |>
dplyr::slice_head(n = top_k)
if (nrow(cand) > 0) return(cand$w2)
# Fallback to most frequent unigrams
freq_uni$feature[seq_len(min(top_k, nrow(freq_uni)))]
}
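A quick sanity check of the backoff chain; the example phrases are arbitrary and the returned words depend on the 1% sample drawn above:

predict_next("I would like to")
predict_next("thanks for the")
predict_next("")  # empty input falls back to the most frequent unigrams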
Next steps: refine the smoothing, compress the n-gram lookup tables (see the sketch below), and build the Shiny prediction app.
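As a preview of the table-compression step (a sketch only; keeping the top three continuations per history is an assumption, not a final design choice), the grouped lookup tables can be pruned before they are shipped to the Shiny app:

prune_lookup <- function(tbl, k = 3) {
  tbl |>
    dplyr::slice_head(n = k) |>  # tables are already sorted by frequency within each group
    dplyr::ungroup()
}
lookup_bi_small <- prune_lookup(lookup_bi)
lookup_tri_small <- prune_lookup(lookup_tri)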
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Africa/Nairobi
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] here_1.0.1 kableExtra_1.4.0
## [3] scales_1.4.0 ggplot2_3.5.2
## [5] quanteda.textstats_0.97.2 quanteda_4.3.1
## [7] tibble_3.3.0 tidyr_1.3.1
## [9] dplyr_1.1.4 stringi_1.8.7
## [11] data.table_1.17.8 readr_2.1.5
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.4 xml2_1.3.8 lattice_0.22-6
## [5] hms_1.1.3 digest_0.6.37 magrittr_2.0.3 evaluate_1.0.4
## [9] grid_4.4.1 RColorBrewer_1.1-3 fastmap_1.2.0 rprojroot_2.1.0
## [13] jsonlite_2.0.0 Matrix_1.7-0 stopwords_2.3 purrr_1.1.0
## [17] viridisLite_0.4.2 codetools_0.2-20 textshaping_1.0.1 jquerylib_0.1.4
## [21] cli_3.6.3 rlang_1.1.4 withr_3.0.2 cachem_1.1.0
## [25] yaml_2.3.10 tools_4.4.1 tzdb_0.5.0 fastmatch_1.1-6
## [29] vctrs_0.6.5 R6_2.6.1 lifecycle_1.0.4 stringr_1.5.1
## [33] pkgconfig_2.0.3 pillar_1.11.0 bslib_0.9.0 gtable_0.3.6
## [37] glue_1.8.0 Rcpp_1.1.0 systemfonts_1.2.3 xfun_0.52
## [41] tidyselect_1.2.1 rstudioapi_0.17.1 knitr_1.50 dichromat_2.0-0.1
## [45] farver_2.1.2 htmltools_0.5.8.1 labeling_0.4.3 rmarkdown_2.29
## [49] svglite_2.2.1 compiler_4.4.1 nsyllable_1.0.1