Overview

This report builds an n-gram predictive text model (1- to 4-grams) and uses a backoff strategy to handle unseen n-grams. The n-gram tables are pruned to reduce memory usage and improve lookup speed for a potential Shiny deployment.
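
As a preview of the scoring used in the prediction section, each step of backoff from a higher-order table to a lower-order one multiplies a candidate's probability by a fixed discount (alpha = 0.4), in the spirit of stupid backoff. A minimal sketch of the resulting weights:

# Sketch only: weights applied to matches from each table in predict_next() below
alpha <- 0.4
c(quadgram = 1, trigram = alpha, bigram = alpha^2)  # 1.00, 0.40, 0.16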

Data Loading (Robust: finds files or downloads them)

library(data.table)  # n-gram frequency tables and fast grouped operations
library(stringi)     # regex-based text cleaning

find_swiftkey_file <- function(filename) {
  # Common locations where the SwiftKey files may already be unzipped
  candidates <- c(
    getwd(),
    file.path(getwd(), "final", "en_US"),
    file.path(path.expand("~"), "Downloads"),
    file.path(path.expand("~"), "Downloads", "capstone_data"),
    file.path(path.expand("~"), "Downloads", "capstone_data", "final", "en_US")
  )
  
  # Direct hits
  for (d in candidates) {
    p <- file.path(d, filename)
    if (file.exists(p)) return(normalizePath(p))
  }
  
  # Recursive search inside candidates
  for (d in candidates) {
    if (dir.exists(d)) {
      hits <- list.files(d, pattern = paste0("^", gsub("\\.", "\\\\.", filename), "$"),
                         recursive = TRUE, full.names = TRUE)
      if (length(hits) > 0) return(normalizePath(hits[1]))
    }
  }
  
  NA_character_
}

blogs_path   <- find_swiftkey_file("en_US.blogs.txt")
news_path    <- find_swiftkey_file("en_US.news.txt")
twitter_path <- find_swiftkey_file("en_US.twitter.txt")

# If the files are not found locally, download and unzip the Coursera SwiftKey dataset
if (any(is.na(c(blogs_path, news_path, twitter_path)))) {
  zip_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  zip_file <- file.path(tempdir(), "Coursera-SwiftKey.zip")
  unzip_dir <- file.path(tempdir(), "swiftkey")
  
  download.file(zip_url, destfile = zip_file, mode = "wb")
  unzip(zip_file, exdir = unzip_dir)
  
  blogs_path   <- list.files(unzip_dir, pattern = "^en_US\\.blogs\\.txt$", recursive = TRUE, full.names = TRUE)[1]
  news_path    <- list.files(unzip_dir, pattern = "^en_US\\.news\\.txt$", recursive = TRUE, full.names = TRUE)[1]
  twitter_path <- list.files(unzip_dir, pattern = "^en_US\\.twitter\\.txt$", recursive = TRUE, full.names = TRUE)[1]
}

stopifnot(file.exists(blogs_path), file.exists(news_path), file.exists(twitter_path))

# skipNul avoids truncated reads caused by embedded nul characters (the news file is known to contain some)
blogs   <- readLines(blogs_path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
news    <- readLines(news_path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
twitter <- readLines(twitter_path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)

text <- c(blogs, news, twitter)

# Sample lines for speed (adjust the sample size as needed)
set.seed(123)
text <- sample(text, size = min(50000, length(text)))

stopifnot(is.character(text))
length(text)
## [1] 50000

Text Cleaning

clean_text <- function(x) {
  x <- tolower(x)
  x <- stri_replace_all_regex(x, "http\\S+|www\\S+", " ")
  x <- stri_replace_all_regex(x, "[^a-z\\s']", " ")
  x <- stri_replace_all_regex(x, "\\s+", " ")
  x <- stri_trim_both(x)
  x[nchar(x) > 0]
}

text_clean <- clean_text(text)
length(text_clean)
## [1] 49976
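
For illustration, clean_text() applied to a couple of made-up lines (hypothetical inputs, not drawn from the corpus):

clean_text(c("Check out http://example.com!!", "I LOVE R :)"))
# expected: "check out" "i love r"  (URLs, punctuation and extra whitespace removed)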

Building n-grams (1–4)

# Tokenize each cleaned line and count every contiguous n-gram of length n
make_ngrams <- function(lines, n) {
  grams <- vector("list", length(lines))
  k <- 1L
  
  for (i in seq_along(lines)) {
    w <- unlist(strsplit(lines[i], " ", fixed = TRUE))
    if (length(w) < n) next
    
    grams[[k]] <- vapply(
      seq_len(length(w) - n + 1),
      function(j) paste(w[j:(j+n-1)], collapse = " "),
      character(1)
    )
    k <- k + 1L
  }
  
  grams <- unlist(grams, use.names = FALSE)
  dt <- data.table(ngram = grams)[, .N, by = ngram]
  setorder(dt, -N)
  dt
}

uni  <- make_ngrams(text_clean, 1)
bi   <- make_ngrams(text_clean, 2)
tri  <- make_ngrams(text_clean, 3)
quad <- make_ngrams(text_clean, 4)

list(unigrams=nrow(uni), bigrams=nrow(bi), trigrams=nrow(tri), quadgrams=nrow(quad))
## $unigrams
## [1] 55862
## 
## $bigrams
## [1] 499501
## 
## $trigrams
## [1] 894765
## 
## $quadgrams
## [1] 995480
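
As a rough check before pruning (sizes will vary with the sample and session), the memory footprint of the unpruned tables can be measured; this is what motivates the pruning step below:

# Approximate in-memory size of the raw n-gram tables
sapply(list(uni = uni, bi = bi, tri = tri, quad = quad),
       function(dt) format(object.size(dt), units = "MB"))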

Pruning for size + runtime

# Drop n-grams seen fewer than min_count times and, for n > 1, keep only the
# top_k most probable next words per prefix
prune_ngram_table <- function(dt, n, min_count = 2, top_k = 3) {
  parts <- tstrsplit(dt$ngram, " ", fixed = TRUE)
  
  if (n == 1) {
    out <- dt[N >= min_count]
    out[, prob := N / sum(N)]
    return(out[, .(next_word = ngram, prob)])
  }
  
  prefix <- do.call(paste, c(parts[1:(n-1)], list(sep = " ")))
  nextw  <- parts[[n]]
  
  out <- data.table(prefix = prefix, next_word = nextw, N = dt$N)
  out <- out[N >= min_count]
  out[, prob := N / sum(N), by = prefix]
  setorder(out, prefix, -prob)
  out[, head(.SD, top_k), by = prefix]
}

uni_p  <- prune_ngram_table(uni, 1, min_count = 5)
bi_p   <- prune_ngram_table(bi, 2, min_count = 3)
tri_p  <- prune_ngram_table(tri, 3, min_count = 2)
quad_p <- prune_ngram_table(quad, 4, min_count = 2)
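
For a Shiny deployment, the pruned tables would typically be serialized once and loaded at app startup rather than rebuilt from the corpus; a minimal sketch (file names are illustrative):

# Save the pruned lookup tables once; the Shiny app only needs these .rds files
saveRDS(uni_p,  "uni_p.rds")
saveRDS(bi_p,   "bi_p.rds")
saveRDS(tri_p,  "tri_p.rds")
saveRDS(quad_p, "quad_p.rds")
# In the app: uni_p <- readRDS("uni_p.rds"), and so on for the other tables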

Backoff prediction (handles unseen n-grams)

# Fixed backoff discount: each step down to a lower-order table multiplies the score by alpha
alpha <- 0.4

predict_next <- function(input, top_n = 3) {
  w <- unlist(strsplit(clean_text(input), " "))
  
  get <- function(dt, p, weight) {
    hit <- dt[prefix == p]
    if (nrow(hit) == 0) return(NULL)
    hit[, .(next_word, score = prob * weight)]
  }
  
  candidates <- rbindlist(list(
    if (length(w) >= 3) get(quad_p, paste(tail(w,3), collapse=" "), 1),
    if (length(w) >= 2) get(tri_p,  paste(tail(w,2), collapse=" "), alpha),
    if (length(w) >= 1) get(bi_p,   tail(w,1), alpha^2)
  ), fill = TRUE)
  
  if (!is.null(candidates) && nrow(candidates) > 0) {
    candidates <- candidates[, .(score = max(score)), by = next_word]
    setorder(candidates, -score)
    return(head(candidates, top_n))
  }
  
  head(uni_p[, .(next_word, score = prob)], top_n)
}

predict_next("i love")
##    next_word      score
##       <char>      <num>
## 1:       you 0.11358025
## 2:       the 0.04641975
## 3:        it 0.02962963
predict_next("this is")
##    next_word      score
##       <char>      <num>
## 1:         a 0.07927273
## 2:       the 0.06981818
## 3:       not 0.02981818
predict_next("in the middle of")
##    next_word     score
##       <char>     <num>
## 1:       the 0.6904762
## 2:         a 0.1666667
## 3:   nowhere 0.0952381
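
Since the evaluation below calls predict_next() many times, the lookup tables can optionally be keyed on prefix so that data.table can use binary search rather than a full scan for the prefix == p subsets; this is a sketch of an optional speed-up, not required for correctness:

# Optional: keyed tables speed up repeated prefix lookups in predict_next()
setkey(bi_p,   prefix)
setkey(tri_p,  prefix)
setkey(quad_p, prefix)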

Evaluation (simple top-3 accuracy)

set.seed(42)
test_lines <- sample(text_clean, size = floor(0.05 * length(text_clean)))

# For each test line, pick a random split point and try to predict the word that follows it
cases <- lapply(test_lines, function(x) {
  w <- unlist(strsplit(x, " "))
  if (length(w) < 4) return(NULL)
  i <- sample(2:(length(w)-1), 1)
  list(input = paste(w[1:i], collapse=" "), actual = w[i+1])
})
cases <- rbindlist(cases)

accuracy_top3 <- mean(sapply(seq_len(nrow(cases)), function(i) {
  cases$actual[i] %in% predict_next(cases$input[i], top_n = 3)$next_word
}))

accuracy_top3
## [1] 0.2537699

Model size

data.table(
  model = c("unigram","bigram","trigram","quadgram"),
  size_mb = c(
    object.size(uni_p),
    object.size(bi_p),
    object.size(tri_p),
    object.size(quad_p)
  ) / (1024^2)
)
##       model   size_mb
##      <char>     <num>
## 1:  unigram 0.9995651
## 2:   bigram 0.7919540
## 3:  trigram 3.0848312
## 4: quadgram 1.5997772
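
The combined footprint, which is what matters for hosting the Shiny app, can be summed across the pruned tables; a quick sketch:

# Total size of all pruned tables in MB
sum(sapply(list(uni_p, bi_p, tri_p, quad_p), object.size)) / 1024^2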

Conclusion

An n-gram model (1–4 grams) was built for next-word prediction. Unseen n-grams are handled with a stupid-backoff style strategy, so the model always returns a prediction, falling back to the most frequent unigrams when no prefix matches. Pruning cut the tables' memory footprint and sped up lookups, keeping the model small enough for deployment within a Shiny app; top-3 accuracy on the sampled test lines was roughly 25%.