create_model_pruned.R

Build n-grams from the SwiftKey corpus files, prune each table to its top-K entries, and save a small model for deployment.

options(timeout = 1200)
library(data.table)
library(stringi)

# ---------------- user settings ----------------

sample_rate <- 1.0    # 1 = use all data, < 1 to speed development, e.g. 0.2
top_uni  <- 50000     # keep top 50k unigrams
top_bi   <- 150000    # keep top 150k bigrams
top_tri  <- 150000    # keep top 150k trigrams
top_quad <- 100000    # keep top 100k quadgrams
# ------------------------------------------------
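These caps are applied by sorting each frequency table by descending count and keeping the first k rows. A minimal sketch of that step, used later in this listing; the prune_top name is an assumption, not taken from the original script.

library(data.table)

# keep only the k most frequent rows of an n-gram count table
# (assumes the count column is named N, data.table's default from .N)
prune_top <- function(dt, k){
  setorder(dt, -N)   # most frequent first
  head(dt, k)        # head() copes with k > nrow(dt)
}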

# prepare directories

if(!dir.exists("data")) dir.create("data")

zipfile <- "data/Coursera-SwiftKey.zip"
if(!file.exists(zipfile)){
  url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  message("Downloading dataset (may take several minutes)...")
  download.file(url, zipfile, mode = "wb")
}
if(!dir.exists("data/final")){
  unzip(zipfile, exdir = "data")
}

# files

blogs_file   <- "data/final/en_US/en_US.blogs.txt"
news_file    <- "data/final/en_US/en_US.news.txt"
twitter_file <- "data/final/en_US/en_US.twitter.txt"
stopifnot(file.exists(blogs_file), file.exists(news_file), file.exists(twitter_file))

# read with optional sampling

read_sample <- function(path){
  lines <- readLines(path, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
  if(sample_rate < 1) lines <- sample(lines, size = max(1, floor(length(lines) * sample_rate)))
  lines
}

blogs   <- read_sample(blogs_file)
news    <- read_sample(news_file)
twitter <- read_sample(twitter_file)
text_all <- c(blogs, news, twitter)  # combined corpus; assumed completion of the truncated "text_all_"
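The n-gram construction, pruning, and save steps are missing from this listing. Below is a minimal sketch of those steps under the settings above, using data.table and stringi and the prune_top helper from the earlier sketch; the clean_text, make_ngrams, and count_ngrams names and the list structure of the saved model are assumptions, not confirmed from the original script. With sample_rate = 1 the quadgram pass is memory-hungry, so a smaller rate is advisable during development.

# clean: lowercase, keep only letters/apostrophes, collapse whitespace
clean_text <- function(x){
  x <- stri_trans_tolower(x)
  x <- stri_replace_all_regex(x, "[^a-z' ]", " ")
  x <- stri_replace_all_regex(x, "\\s+", " ")
  stri_trim_both(x)
}

# build n-grams from one line's token vector by pasting n consecutive tokens
make_ngrams <- function(tok, n){
  if(length(tok) < n) return(character(0))
  idx <- seq_len(length(tok) - n + 1)
  do.call(paste, lapply(seq_len(n) - 1, function(k) tok[idx + k]))
}

# count n-grams across all lines into a data.table frequency table
count_ngrams <- function(token_list, n){
  grams <- unlist(lapply(token_list, make_ngrams, n = n), use.names = FALSE)
  data.table(ngram = grams)[, .N, by = ngram]
}

cleaned <- clean_text(text_all)
tokens  <- stri_split_regex(cleaned[nzchar(cleaned)], "\\s+")

# count each order, then keep only the top-K rows of each table
model <- list(
  uni  = prune_top(count_ngrams(tokens, 1), top_uni),
  bi   = prune_top(count_ngrams(tokens, 2), top_bi),
  tri  = prune_top(count_ngrams(tokens, 3), top_tri),
  quad = prune_top(count_ngrams(tokens, 4), top_quad)
)

saveRDS(model, "model_ngrams_small.Rds")
message("Saved model_ngrams_small.Rds")
message("model_ngrams_small.Rds size (bytes): ", file.size("model_ngrams_small.Rds"))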


# Example console output:
#   Saved model_ngrams_small.Rds
#   model_ngrams_small.Rds size (bytes): 65231400
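For completeness, a sketch of how the deployed app might load the saved file and look up a next word, trying the highest-order table first. The lookup and predict_next helpers are assumptions and rely on the model structure from the sketch above.

library(data.table)
library(stringi)

model <- readRDS("model_ngrams_small.Rds")

# return the last word of the most frequent n-gram starting with "prefix ", or NA
lookup <- function(dt, prefix){
  hits <- dt[startsWith(ngram, paste0(prefix, " "))]
  if(nrow(hits) == 0) return(NA_character_)
  setorder(hits, -N)  # already sorted after prune_top, but sort defensively
  stri_extract_last_regex(hits$ngram[1], "\\S+")
}

# try quadgrams, then trigrams, then bigrams; fall back to the top unigram
predict_next <- function(phrase){
  w <- stri_split_regex(stri_trans_tolower(stri_trim_both(phrase)), "\\s+")[[1]]
  for(n in rev(seq_len(min(3, length(w))))){
    dt  <- switch(n, model$bi, model$tri, model$quad)
    hit <- lookup(dt, paste(tail(w, n), collapse = " "))
    if(!is.na(hit)) return(hit)
  }
  model$uni[order(-N)]$ngram[1]
}

predict_next("thanks for the")  # returns the model's most likely next word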