1 Task 1 – Getting and Cleaning the Data

I use the English-language dataset (blogs, news, Twitter) and prepare it through cleaning, sampling, tokenization, and profanity filtering.

1.1 Load & Inspect Raw Files

use_pkgs <- c("readr", "data.table", "stringi", "dplyr", "tidyr", "tibble", "quanteda", "quanteda.textstats", "ggplot2", "scales", "kableExtra", "here")
inst <- use_pkgs[!(use_pkgs %in% installed.packages()[, "Package"])]
if (length(inst)) install.packages(inst, dependencies = TRUE)
invisible(lapply(use_pkgs, library, character.only = TRUE))
DATA_DIR <- here::here("Coursera-SwiftKey/final/en_US")
blogs_path   <- file.path(DATA_DIR, "en_US.blogs.txt")
news_path    <- file.path(DATA_DIR, "en_US.news.txt")
twitter_path <- file.path(DATA_DIR, "en_US.twitter.txt")
# Read a file line by line; fread() with sep = "\n" treats each line as a single field and is much faster than readLines() for files this size.
read_lines_fast <- function(path) {
  data.table::fread(path, sep = "\n", header = FALSE, encoding = "UTF-8", quote = "", data.table = FALSE, showProgress = FALSE)[[1]]
}

blogs   <- read_lines_fast(blogs_path)
news    <- read_lines_fast(news_path)
twitter <- read_lines_fast(twitter_path)

1.2 File Statistics

size_mb <- function(path) round(file.info(path)$size / 1024^2, 2)

stats_tbl <- tibble::tibble(
  file = c("blogs", "news", "twitter"),
  size_MB = c(size_mb(blogs_path), size_mb(news_path), size_mb(twitter_path)),
  lines   = c(length(blogs), length(news), length(twitter)),
  n_chars = c(sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter))),
  n_words = c(sum(stringi::stri_count_words(blogs)),
              sum(stringi::stri_count_words(news)),
              sum(stringi::stri_count_words(twitter)))
)

knitr::kable(stats_tbl, caption = "Raw file summary") %>%
  kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
Raw file summary

file       size_MB      lines      n_chars     n_words
blogs       200.42     615491    141455384    25679351
news        196.28     987096    198569320    33966501
twitter     159.36    2360148    162096241    30096690

1.3 Sampling & Cleaning

# Transliterate to ASCII and drop anything that cannot be converted (emoji, curly quotes, etc.).
blogs1   <- iconv(blogs,   from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")
news1    <- iconv(news,    from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")
twitter1 <- iconv(twitter, from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")

sample_pct <- 0.01
sample_vec <- function(x, p) if (length(x) > 0) sample(x, max(1, floor(length(x) * p))) else character()

set.seed(1234) # fix the RNG so the 1% sample (and all downstream counts) is reproducible; any fixed seed works
sample_data <- c(sample_vec(blogs1, sample_pct), sample_vec(news1, sample_pct), sample_vec(twitter1, sample_pct))

clean_text <- function(x) {
  x |>
    stringi::stri_replace_all_regex("https?://[^\\s]+", " ") |>  # strip URLs
    stringi::stri_replace_all_regex("[0-9]+", " ") |>            # strip digits
    stringi::stri_trans_tolower() |>
    stringi::stri_replace_all_regex("\\s+", " ") |>              # collapse whitespace left by the substitutions
    stringi::stri_trim_both()
}

sample_clean <- clean_text(sample_data)
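
Re-running the sampling and cleaning on every knit is slow, so one option is to cache the cleaned sample to disk. A minimal sketch; the cache path is my own choice and not part of the pipeline above:

# Cache the cleaned sample so later chunks (and re-knits) can skip the raw files.
cache_path <- here::here("cache", "sample_clean.rds")
dir.create(dirname(cache_path), showWarnings = FALSE, recursive = TRUE)
saveRDS(sample_clean, cache_path)
# On a later run: sample_clean <- readRDS(cache_path)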

1.4 Tokenization & Profanity Filtering

corp <- quanteda::corpus(sample_clean)
profane_words <- c("badword1", "badword2", "offensiveword") # placeholder list; a fuller list is sketched after this chunk
tok <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) |> tokens_remove(profane_words)

tok_uni_nostop <- tokens_remove(tok, stopwords("en"))
tok_bi  <- tokens_ngrams(tok, n = 2)
tok_tri <- tokens_ngrams(tok, n = 3)
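
The profanity list above is only a placeholder. One option is to load a published word list instead; the sketch below assumes the LDNOOBW English list on GitHub is reachable, and would replace the profane_words definition before the tokens are filtered:

# Fetch a fuller profanity list; fall back to the placeholder if the download fails (e.g. offline knit).
profanity_url <- "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
profane_words <- tryCatch(readr::read_lines(profanity_url),
                          error = function(e) c("badword1", "badword2", "offensiveword"))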

2 Task 2 – Exploratory Data Analysis

I explore word frequencies, n-gram distributions, and coverage.

2.1 Frequency Tables

make_freq <- function(toks) {
  d <- quanteda::dfm(toks)  # document-feature matrix; avoid calling the object `dfm` so it does not shadow the function
  frq <- quanteda.textstats::textstat_frequency(d)
  tibble::as_tibble(frq) |> dplyr::select(feature, frequency)
}

freq_uni <- make_freq(tok_uni_nostop)
freq_bi  <- make_freq(tok_bi)
freq_tri <- make_freq(tok_tri)

2.2 Visualization of Top Features

plot_top_n <- function(freq_tbl, n = 10, title = "Top features", fill_color = "#2C3E50") {
  freq_tbl |>
    slice_max(frequency, n = n) |>
    mutate(feature = reorder(feature, frequency)) |>
    ggplot(aes(x = feature, y = frequency)) +
    geom_col(show.legend = FALSE, fill = fill_color) +
    coord_flip() +
    labs(title = title, x = NULL, y = "Frequency") +
    scale_y_continuous(labels = scales::comma) +
    theme_minimal(base_size = 12)
}

plot_top_n(freq_uni, 10, "Top 10 Unigrams (no stopwords)", "#18BC9C")

plot_top_n(freq_bi, 10, "Top 10 Bigrams", "#3498DB")

plot_top_n(freq_tri, 10, "Top 10 Trigrams", "#E74C3C")

2.3 Word Coverage

cum_cov <- freq_uni |>
  dplyr::arrange(dplyr::desc(frequency)) |>   # textstat_frequency already sorts, but make the ordering explicit before cumsum
  dplyr::mutate(cum_freq = cumsum(frequency) / sum(frequency))
cover50 <- min(which(cum_cov$cum_freq >= 0.5))
cover90 <- min(which(cum_cov$cum_freq >= 0.9))

list(words_needed_50pct = cover50, words_needed_90pct = cover90)
## $words_needed_50pct
## [1] 1011
## 
## $words_needed_90pct
## [1] 15215
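
The coverage table can also be plotted to show how quickly a small vocabulary covers the sample; a minimal sketch using the cum_cov tibble computed above:

cum_cov |>
  dplyr::mutate(rank = dplyr::row_number()) |>
  ggplot(aes(x = rank, y = cum_freq)) +
  geom_line(color = "#2C3E50") +
  geom_hline(yintercept = c(0.5, 0.9), linetype = "dashed") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Cumulative unigram coverage", x = "Unique words (ranked by frequency)", y = "Coverage") +
  theme_minimal(base_size = 12)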

3 Task 3 – Modeling

I build a basic predictive model from the n-gram frequency tables, backing off from trigrams to bigrams to unigrams when a longer context has not been observed.

3.1 Basic N-Gram Model & Backoff

lookup_bi <- freq_bi |>
  tidyr::separate(feature, into = c("w1","w2"), sep = "_", remove = FALSE) |>
  group_by(w1) |> arrange(desc(frequency), .by_group = TRUE)

lookup_tri <- freq_tri |>
  tidyr::separate(feature, into = c("w1","w2","w3"), sep = "_", remove = FALSE) |>
  group_by(w1, w2) |> arrange(desc(frequency), .by_group = TRUE)
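
To see what one slice of the trigram table looks like, I can filter on an arbitrary context (the words "one" and "of" are just an example and may not occur in every sample):

lookup_tri |>
  dplyr::filter(w1 == "one", w2 == "of") |>
  head(3)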

predict_next <- function(phrase, top_k = 3) {
  # Tokenize once; avoid piping to `[[` which breaks with base pipe
  toks <- quanteda::tokens(phrase, remove_punct = TRUE)[[1]]
  if (length(toks) == 0) {
    return(freq_uni$feature[seq_len(min(top_k, nrow(freq_uni)))])
  }
  toks <- tolower(toks)
  n <- length(toks)

  # Try trigram continuation
  if (n >= 2) {
    w1 <- toks[n - 1]; w2 <- toks[n]
    cand <- lookup_tri |>
      dplyr::filter(w1 == .env$w1, w2 == .env$w2) |>
      dplyr::slice_head(n = top_k)
    if (nrow(cand) > 0) return(cand$w3)
  }

  # Back off to bigram
  w1 <- toks[n]
  cand <- lookup_bi |>
    dplyr::filter(w1 == .env$w1) |>
    dplyr::slice_head(n = top_k)
  if (nrow(cand) > 0) return(cand$w2)

  # Fallback to most frequent unigrams
  freq_uni$feature[seq_len(min(top_k, nrow(freq_uni)))]
}
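
A few calls give a quick sanity check; the exact suggestions depend on the 1% sample drawn above:

predict_next("thanks for the")
predict_next("one of the", top_k = 5)
predict_next("")   # empty input falls back to the most frequent unigrams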

4 Conclusion & Next Steps

  • ✅ Task 1: Data loading, cleaning, tokenization, profanity filtering.
  • ✅ Task 2: EDA with frequency tables, plots, and coverage.
  • ✅ Task 3: Basic n-gram model with backoff.

Next steps: refine the model with proper smoothing, compress the lookup tables for speed and memory (a sketch follows), and build a Shiny app around predict_next().
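
One possible way to compress the lookup tables is to keep only the top few continuations per context and index them with data.table keys; a sketch (the name dt_tri is mine, not part of the pipeline above):

dt_tri <- data.table::as.data.table(dplyr::ungroup(lookup_tri))
dt_tri <- dt_tri[, head(.SD, 3L), by = .(w1, w2)]   # keep the top 3 continuations per context
dt_tri <- dt_tri[, .(w1, w2, w3, frequency)]        # drop the redundant feature column
data.table::setkey(dt_tri, w1, w2)                  # keyed lookups, e.g. dt_tri[.("one", "of")]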

5 Appendix

sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Africa/Nairobi
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] here_1.0.1                kableExtra_1.4.0         
##  [3] scales_1.4.0              ggplot2_3.5.2            
##  [5] quanteda.textstats_0.97.2 quanteda_4.3.1           
##  [7] tibble_3.3.0              tidyr_1.3.1              
##  [9] dplyr_1.1.4               stringi_1.8.7            
## [11] data.table_1.17.8         readr_2.1.5              
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     xml2_1.3.8         lattice_0.22-6    
##  [5] hms_1.1.3          digest_0.6.37      magrittr_2.0.3     evaluate_1.0.4    
##  [9] grid_4.4.1         RColorBrewer_1.1-3 fastmap_1.2.0      rprojroot_2.1.0   
## [13] jsonlite_2.0.0     Matrix_1.7-0       stopwords_2.3      purrr_1.1.0       
## [17] viridisLite_0.4.2  codetools_0.2-20   textshaping_1.0.1  jquerylib_0.1.4   
## [21] cli_3.6.3          rlang_1.1.4        withr_3.0.2        cachem_1.1.0      
## [25] yaml_2.3.10        tools_4.4.1        tzdb_0.5.0         fastmatch_1.1-6   
## [29] vctrs_0.6.5        R6_2.6.1           lifecycle_1.0.4    stringr_1.5.1     
## [33] pkgconfig_2.0.3    pillar_1.11.0      bslib_0.9.0        gtable_0.3.6      
## [37] glue_1.8.0         Rcpp_1.1.0         systemfonts_1.2.3  xfun_0.52         
## [41] tidyselect_1.2.1   rstudioapi_0.17.1  knitr_1.50         dichromat_2.0-0.1 
## [45] farver_2.1.2       htmltools_0.5.8.1  labeling_0.4.3     rmarkdown_2.29    
## [49] svglite_2.2.1      compiler_4.4.1     nsyllable_1.0.1