Introduction

This milestone report presents exploratory data analysis (EDA) of the Coursera SwiftKey dataset as groundwork for a predictive text model. The analysis is designed to be memory-efficient, reproducible, and safe to knit in limited-RAM environments.
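To make the knitting claim concrete, a minimal setup chunk along the following lines can be used; the specific options shown (quiet output and cached chunks) are illustrative assumptions, not requirements of the analysis.

# Illustrative knitr setup: suppress messages/warnings and cache heavy chunks
# so re-knitting does not redo the sampling and tokenization steps.
knitr::opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)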


1. Libraries

library(downloader)
library(stringi)
library(knitr)
library(ggplot2)

library(quanteda)

2. Downloading the Dataset

if (!file.exists("./projectData")) dir.create("./projectData")

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "./projectData/Coursera-SwiftKey.zip"

if (!file.exists(zip_file)) {
  download.file(url, destfile = zip_file, mode = "wb")
}

if (!file.exists("./projectData/final")) {
  unzip(zip_file, exdir = "./projectData")
}

3. Memory-Safe Sampling (Reservoir Sampling)

Reservoir sampling reads a file one line at a time and keeps at most n lines in memory, so a uniform random sample can be drawn without ever loading an entire corpus file.

sampleLines <- function(file, n = 3000, seed = 5000) {
  set.seed(seed)  # fixed seed so the sample is reproducible

  con <- file(file, open = "r")
  reservoir <- character(0)
  i <- 0

  # Algorithm R: read one line at a time so at most n lines are ever in memory
  repeat {
    line <- readLines(con, n = 1, warn = FALSE)
    if (length(line) == 0) break

    i <- i + 1

    if (i <= n) {
      # fill the reservoir with the first n lines
      reservoir[i] <- line
    } else {
      # thereafter, replace a random slot with probability n / i
      j <- sample.int(i, 1)
      if (j <= n) {
        reservoir[j] <- line
      }
    }
  }

  close(con)
  reservoir
}
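A quick sanity check (illustrative; it assumes the files have already been unzipped as in section 2) confirms that the function is reproducible for a fixed seed and never returns more than n lines:

# Same seed => same sample; the result never exceeds n lines.
s1 <- sampleLines("./projectData/final/en_US/en_US.twitter.txt", n = 100)
s2 <- sampleLines("./projectData/final/en_US/en_US.twitter.txt", n = 100)
identical(s1, s2)   # TRUE
length(s1) <= 100   # TRUE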

4. Sampling the English Corpora

base_path <- "./projectData/final/en_US"

blogs_samp   <- sampleLines(file.path(base_path, "en_US.blogs.txt"))
news_samp    <- sampleLines(file.path(base_path, "en_US.news.txt"))
twitter_samp <- sampleLines(file.path(base_path, "en_US.twitter.txt"))

all_samp <- c(blogs_samp, news_samp, twitter_samp)

dir.create("./projectData/sample", showWarnings = FALSE)
writeLines(all_samp, "./projectData/sample/all_samp.txt")
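As a rough check on the memory savings (exact sizes vary by download, so the numbers are illustrative), the 9,000-line sample can be compared with the full English corpora on disk:

# Compare on-disk size of the full en_US corpora with the sampled file.
full_mb   <- sum(file.size(list.files(base_path, full.names = TRUE))) / 1024^2
sample_mb <- file.size("./projectData/sample/all_samp.txt") / 1024^2
round(c(full_corpus_mb = full_mb, sample_mb = sample_mb), 1)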

5. Basic Data Summary

data_summary <- data.frame(
  source = c("blogs", "news", "twitter"),
  sampled.lines = c(length(blogs_samp), length(news_samp), length(twitter_samp)),
  total.words = c(
    sum(stri_count_words(blogs_samp)),
    sum(stri_count_words(news_samp)),
    sum(stri_count_words(twitter_samp))
  ),
  mean.words = c(
    mean(stri_count_words(blogs_samp)),
    mean(stri_count_words(news_samp)),
    mean(stri_count_words(twitter_samp))
  )
)

kable(data_summary, caption = "Summary of Sampled Data")
Summary of Sampled Data

source     sampled.lines   total.words   mean.words
--------   -------------   -----------   ----------
blogs               3000        127801     42.60033
news                3000        103107     34.36900
twitter             3000         39381     13.12700

6. Tokenization and Cleaning (quanteda)

tokens_all <- tokens(
  all_samp,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE
)

tokens_all <- tokens_tolower(tokens_all)
tokens_all <- tokens_remove(tokens_all, stopwords("en"))
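Removing stopwords is helpful for the frequency exploration below, but a prediction model generally needs those words (for example, to complete "going to the ..."). As a hedged sketch, a second token object that keeps stopwords can be retained alongside tokens_all; the name tokens_keepstop is illustrative:

# Illustrative: rebuild the cleaned tokens without stopword removal, for use
# later when training the prediction model.
tokens_keepstop <- tokens_tolower(
  tokens(all_samp,
         remove_punct = TRUE, remove_numbers = TRUE,
         remove_symbols = TRUE, remove_url = TRUE)
)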

7. Exploratory Analysis: N-grams

Unigrams

dfm1 <- dfm(tokens_all)
top_uni <- topfeatures(dfm1, 30)
top_uni
##   said    one   just   like    can   time    get    new people    now    day 
##    922    826    678    616    605    541    520    492    458    436    411 
##   also   good    two    see   know   make  first   much   last   back   year 
##    402    383    356    352    347    342    333    332    327    323    318 
##     us  think     go  going   even    way   well really 
##    317    307    305    305    297    287    286    283

Bigrams

dfm2 <- dfm(tokens_ngrams(tokens_all, n = 2))
top_bi <- topfeatures(dfm2, 30)
top_bi
##       right_now       last_year       years_ago     high_school       last_week 
##              48              47              41              41              40 
##        new_york      first_time      new_jersey         can_get         one_day 
##              39              32              32              31              26 
##        st_louis    good_morning      don’t_know       feel_like     many_people 
##              25              24              23              23              22 
##      last_night    social_media     even_though      looks_like        get_back 
##              22              22              21              21              20 
##       two_years       long_time     los_angeles   united_states       make_sure 
##              20              19              19              19              18 
##       last_time         can_see     three_years looking_forward        can_help 
##              18              18              18              17              17

Trigrams

dfm3 <- dfm(tokens_ngrams(tokens_all, n = 3))
top_tri <- topfeatures(dfm3, 30)
top_tri
##              new_york_times                 dah_dah_dah 
##                           6                           5 
##            couple_years_ago            uk_singles_chart 
##                           5                           5 
##      president_barack_obama     airline_ticket_discount 
##                           5                           5 
##               two_years_ago          gov_chris_christie 
##                           5                           5 
##       miley_beautiful_miley   beautiful_miley_beautiful 
##                           5                           5 
##             don’t_get_wrong     monitoring_social_media 
##                           4                           4 
##               long_time_ago              world_book_day 
##                           4                           4 
##             new_york_giants               cinco_de_mayo 
##                           4                           4 
##            saturday_may_5th              mr_omand_wants 
##                           3                           3 
##           omand_wants_green           wants_green_paper 
##                           3                           3 
##       green_paper_published  paper_published_monitoring 
##                           3                           3 
## published_monitoring_social          social_media_sites 
##                           3                           3 
##         media_sites_private      sites_private_industry 
##                           3                           3 
##       private_industry_link      percent_income_percent 
##                           3                           3 
##       income_percent_income                new_york_n.y 
##                           3                           3
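The top trigram counts are already in single digits, so most higher-order n-grams in a 9,000-line sample occur only a handful of times. A hedged sketch of trimming rare n-grams before storing frequency tables (the threshold of 2 is an illustrative assumption):

# Drop n-grams observed only once to shrink the objects kept for modelling.
dfm2_trim <- dfm_trim(dfm2, min_termfreq = 2)
dfm3_trim <- dfm_trim(dfm3, min_termfreq = 2)
c(bigrams_before = nfeat(dfm2), bigrams_after = nfeat(dfm2_trim))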

8. Visualization

plot_top <- function(x, title) {
  df <- data.frame(
    term = names(x),
    freq = as.numeric(x)
  )
  
  ggplot(df, aes(reorder(term, freq), freq)) +
    geom_col(fill = "steelblue") +
    coord_flip() +
    labs(title = title, x = NULL, y = "Frequency") +
    theme_minimal()
}
plot_top(top_uni, "Top 30 Unigrams")

plot_top(top_bi, "Top 30 Bigrams")

plot_top(top_tri, "Top 30 Trigrams")


9. Conclusion and Next Steps

The sampled corpora behave as expected: blog lines are the longest (about 43 words on average), news lines fall in between, and tweets are the shortest (about 13 words). Across all three sources, the n-gram frequencies are dominated by a small set of common words and phrases, which suggests that compact frequency tables built from a modest sample can cover much of the text. The next steps are to build unigram, bigram, and trigram frequency tables from a larger sample, combine them in a back-off model that predicts the next word from the preceding one or two words, and keep the same memory-conscious sampling and tokenization pipeline used in this report.
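To make that next step concrete, a minimal sketch (illustrative only, not the final model) shows how the bigram counts computed above can already suggest a next word; the function name predict_next and the cutoff of 5,000 bigrams are assumptions for the example:

# Build a small lookup table from the most frequent bigrams.
bi_counts <- topfeatures(dfm2, 5000)
parts     <- strsplit(names(bi_counts), "_", fixed = TRUE)
ok        <- lengths(parts) == 2            # keep only clean word pairs
bi_tab    <- data.frame(
  w1 = vapply(parts[ok], `[`, "", 1),
  w2 = vapply(parts[ok], `[`, "", 2),
  n  = as.numeric(bi_counts[ok]),
  stringsAsFactors = FALSE
)

# Suggest the most frequent continuation of a single word, if one was seen.
predict_next <- function(word) {
  hits <- bi_tab[bi_tab$w1 == word, ]
  if (nrow(hits) == 0) return(NA_character_)
  hits$w2[which.max(hits$n)]
}

predict_next("new")   # likely "york", given the bigram counts above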