This report builds an n-gram predictive text model (1- to 4-grams) and uses a stupid-backoff strategy, with a fixed discount per backoff step, to handle unseen n-grams. The tables are pruned to reduce memory usage and improve lookup speed for potential deployment as a Shiny app.
library(data.table)
library(stringi)

find_swiftkey_file <- function(filename) {
  # Common locations where the corpus files may already have been unzipped
  candidates <- c(
    getwd(),
    file.path(getwd(), "final", "en_US"),
    file.path(path.expand("~"), "Downloads"),
    file.path(path.expand("~"), "Downloads", "capstone_data"),
    file.path(path.expand("~"), "Downloads", "capstone_data", "final", "en_US")
  )
  # Direct hits
  for (d in candidates) {
    p <- file.path(d, filename)
    if (file.exists(p)) return(normalizePath(p))
  }
  # Recursive search inside the candidate directories
  for (d in candidates) {
    if (dir.exists(d)) {
      hits <- list.files(d, pattern = paste0("^", gsub("\\.", "\\\\.", filename), "$"),
                         recursive = TRUE, full.names = TRUE)
      if (length(hits) > 0) return(normalizePath(hits[1]))
    }
  }
  NA_character_
}
blogs_path <- find_swiftkey_file("en_US.blogs.txt")
news_path <- find_swiftkey_file("en_US.news.txt")
twitter_path <- find_swiftkey_file("en_US.twitter.txt")
# If not found, download and unzip the Coursera SwiftKey dataset
if (any(is.na(c(blogs_path, news_path, twitter_path)))) {
  zip_url   <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  zip_file  <- file.path(tempdir(), "Coursera-SwiftKey.zip")
  unzip_dir <- file.path(tempdir(), "swiftkey")
  download.file(zip_url, destfile = zip_file, mode = "wb")
  unzip(zip_file, exdir = unzip_dir)
  blogs_path   <- list.files(unzip_dir, pattern = "^en_US\\.blogs\\.txt$",   recursive = TRUE, full.names = TRUE)[1]
  news_path    <- list.files(unzip_dir, pattern = "^en_US\\.news\\.txt$",    recursive = TRUE, full.names = TRUE)[1]
  twitter_path <- list.files(unzip_dir, pattern = "^en_US\\.twitter\\.txt$", recursive = TRUE, full.names = TRUE)[1]
}
stopifnot(file.exists(blogs_path), file.exists(news_path), file.exists(twitter_path))
# skipNul = TRUE guards against embedded nul characters (the news file is known to contain some)
blogs   <- readLines(blogs_path,   encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
news    <- readLines(news_path,    encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
twitter <- readLines(twitter_path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
text <- c(blogs, news, twitter)
# Sample for speed (adjust the sample size up or down as needed)
set.seed(123)
text <- sample(text, size = min(50000, length(text)))
stopifnot(is.character(text))
length(text)
## [1] 50000
clean_text <- function(x) {
  x <- tolower(x)
  x <- stri_replace_all_regex(x, "http\\S+|www\\S+", " ")  # strip URLs
  x <- stri_replace_all_regex(x, "[^a-z\\s']", " ")        # keep letters, spaces, apostrophes
  x <- stri_replace_all_regex(x, "\\s+", " ")              # collapse repeated whitespace
  x <- stri_trim_both(x)
  x[nchar(x) > 0]                                          # drop lines left empty
}
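As a quick sanity check, the cleaner can be applied to a contrived line (illustrative input, not drawn from the corpus):
clean_text("Check THIS out!! http://example.com :) it's #great")
## [1] "check this out it's great"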
text_clean <- clean_text(text)
length(text_clean)
## [1] 49976
make_ngrams <- function(lines, n) {
  grams <- vector("list", length(lines))
  k <- 1L
  for (i in seq_along(lines)) {
    w <- unlist(strsplit(lines[i], " ", fixed = TRUE))
    if (length(w) < n) next  # line too short to yield an n-gram
    grams[[k]] <- vapply(
      seq_len(length(w) - n + 1),
      function(j) paste(w[j:(j + n - 1)], collapse = " "),
      character(1)
    )
    k <- k + 1L
  }
  grams <- unlist(grams, use.names = FALSE)
  # Count each distinct n-gram and sort by frequency, most common first
  dt <- data.table(ngram = grams)[, .N, by = ngram]
  setorder(dt, -N)
  dt
}
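A toy call (again with made-up input) confirms the counting and ordering: repeated bigrams aggregate and sort to the top.
toy <- make_ngrams("the cat sat on the mat and the cat ran", 2)
toy[1:2]  # "the cat" (N = 2) sorts first; every other bigram appears once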
uni <- make_ngrams(text_clean, 1)
bi <- make_ngrams(text_clean, 2)
tri <- make_ngrams(text_clean, 3)
quad <- make_ngrams(text_clean, 4)
list(unigrams = nrow(uni), bigrams = nrow(bi), trigrams = nrow(tri), quadgrams = nrow(quad))
## $unigrams
## [1] 55862
##
## $bigrams
## [1] 499501
##
## $trigrams
## [1] 894765
##
## $quadgrams
## [1] 995480
prune_ngram_table <- function(dt, n, min_count = 2, top_k = 3) {
  if (n == 1) {
    # Unigrams: drop rare words, then normalize counts to probabilities
    out <- dt[N >= min_count]
    out[, prob := N / sum(N)]
    return(out[, .(next_word = ngram, prob)])
  }
  # Split each n-gram into an (n-1)-word prefix and its final word
  parts  <- tstrsplit(dt$ngram, " ", fixed = TRUE)
  prefix <- do.call(paste, c(parts[1:(n - 1)], list(sep = " ")))
  nextw  <- parts[[n]]
  out <- data.table(prefix = prefix, next_word = nextw, N = dt$N)
  out <- out[N >= min_count]
  out[, prob := N / sum(N), by = prefix]  # conditional probability within each prefix
  setorder(out, prefix, -prob)
  out[, head(.SD, top_k), by = prefix]    # keep at most top_k candidates per prefix
}
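A small hypothetical table makes the pruning behavior concrete: counts below min_count are dropped first, and the surviving counts are then normalized within each prefix.
toy <- data.table(ngram = c("of the", "of a", "of course"), N = c(5L, 3L, 1L))
prune_ngram_table(toy, 2, min_count = 2)
Here "of course" is dropped (N below min_count), and the survivors renormalize under the prefix "of" to probabilities of 5/8 for "the" and 3/8 for "a".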
uni_p  <- prune_ngram_table(uni,  1, min_count = 5)
bi_p   <- prune_ngram_table(bi,   2, min_count = 3)
tri_p  <- prune_ngram_table(tri,  3, min_count = 2)
quad_p <- prune_ngram_table(quad, 4, min_count = 2)
# Stupid-backoff discount: each step down to a shorter n-gram multiplies the score by alpha
alpha <- 0.4
predict_next <- function(input, top_n = 3) {
  w <- unlist(strsplit(clean_text(input), " ", fixed = TRUE))
  # Look up a prefix in a pruned table and apply the backoff weight
  get <- function(dt, p, weight) {
    hit <- dt[prefix == p]
    if (nrow(hit) == 0) return(NULL)
    hit[, .(next_word, score = prob * weight)]
  }
  # Stupid backoff: longest context first, each shorter order discounted by alpha
  candidates <- rbindlist(list(
    if (length(w) >= 3) get(quad_p, paste(tail(w, 3), collapse = " "), 1),
    if (length(w) >= 2) get(tri_p,  paste(tail(w, 2), collapse = " "), alpha),
    if (length(w) >= 1) get(bi_p,   tail(w, 1),                        alpha^2)
  ), fill = TRUE)
  if (nrow(candidates) > 0) {
    # Keep each candidate word's best score across orders
    candidates <- candidates[, .(score = max(score)), by = next_word]
    setorder(candidates, -score)
    return(head(candidates, top_n))
  }
  # No context matched at any order: fall back to the most frequent unigrams
  head(uni_p[, .(next_word, score = prob)], top_n)
}
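When no backoff level matches, the function falls through to the unigram table; for example, with a made-up token that should not occur in the corpus:
predict_next("qwertyuiop")  # no bigram prefix matches, so the top-3 unigrams are returned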
predict_next("i love")
## next_word score
## <char> <num>
## 1: you 0.11358025
## 2: the 0.04641975
## 3: it 0.02962963
predict_next("this is")
## next_word score
## <char> <num>
## 1: a 0.07927273
## 2: the 0.06981818
## 3: not 0.02981818
predict_next("in the middle of")
## next_word score
## <char> <num>
## 1: the 0.6904762
## 2: a 0.1666667
## 3: nowhere 0.0952381
# Held-out check: hide one mid-sentence word in 5% of the cleaned lines and
# test whether it appears among the top-3 predictions. The test lines come
# from the same sample the model was built on, so this is an in-sample
# (optimistic) accuracy estimate.
set.seed(42)
test_lines <- sample(text_clean, size = floor(0.05 * length(text_clean)))
cases <- lapply(test_lines, function(x) {
  w <- unlist(strsplit(x, " ", fixed = TRUE))
  if (length(w) < 4) return(NULL)
  i <- sample(2:(length(w) - 1), 1)
  list(input = paste(w[1:i], collapse = " "), actual = w[i + 1])
})
cases <- rbindlist(cases)
accuracy_top3 <- mean(sapply(seq_len(nrow(cases)), function(i) {
  cases$actual[i] %in% predict_next(cases$input[i], top_n = 3)$next_word
}))
accuracy_top3
## [1] 0.2537699
data.table(
  model = c("unigram", "bigram", "trigram", "quadgram"),
  size_mb = c(
    object.size(uni_p),
    object.size(bi_p),
    object.size(tri_p),
    object.size(quad_p)
  ) / (1024^2)
)
## model size_mb
## <char> <num>
## 1: unigram 0.9995651
## 2: bigram 0.7919540
## 3: trigram 3.0848312
## 4: quadgram 1.5997772
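For Shiny deployment, the pruned tables can be serialized once and loaded at app startup. A minimal sketch (the file name and list structure are illustrative, not part of the pipeline above):
# Bundle the pruned tables into a single RDS file for the app to load
saveRDS(list(uni = uni_p, bi = bi_p, tri = tri_p, quad = quad_p), "ngram_model.rds")
# In the Shiny app: model <- readRDS("ngram_model.rds")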
An n-gram model (1- to 4-grams) was built for next-word prediction. Unseen contexts are handled by stupid backoff: the longest matching prefix is tried first, each step down to a shorter n-gram is discounted by alpha = 0.4, and the most frequent unigrams serve as a last resort. Pruning by minimum count and keeping at most three candidates per prefix cut the combined tables to roughly 6.5 MB while reaching about 25% in-sample top-3 accuracy, small and fast enough for deployment in a Shiny app.