This milestone demonstrates that the SwiftKey text data were downloaded, loaded, sampled, cleaned, and explored. Summary tables and plots illustrate the three sources (Blogs, News, Twitter). I also describe the plan for the prediction algorithm (n-grams with backoff) and the Shiny app interface that will present predictions to a user.
```r # install if necessary # install.packages(c(“tm”,“quanteda”,“dplyr”,“ggplot2”,“stringi”,“data.table”,“wordcloud”,“shiny”))
library(quanteda) library(dplyr) library(ggplot2) library(stringi) library(data.table) library(wordcloud)
blogs_file <- “en_US.blogs.txt” news_file <- “en_US.news.txt” twitter_file<- “en_US.twitter.txt”
blogs <- readLines(blogs_file, encoding=“UTF-8”, warn=FALSE) news <- readLines(news_file, encoding=“UTF-8”, warn=FALSE) twitter <- readLines(twitter_file, encoding=“UTF-8”, warn=FALSE)
data_summary <- data.frame( Source = c(“Blogs”, “News”, “Twitter”), Lines = c(length(blogs), length(news), length(twitter)), Words = c(sum(stri_count_words(blogs)), sum(stri_count_words(news)), sum(stri_count_words(twitter))) ) data_summary
set.seed(123) sample_size <- 10000 # adjust if you have more RAM sampled <- c(sample(blogs, sample_size), sample(news, sample_size), sample(twitter, sample_size))
clean_text <- function(x){ x <- iconv(x, “UTF-8”, “ASCII”, sub=““) # remove non-ASCII x <- tolower(x) x <- gsub(”http[[:alnum:]]*“,”“, x) # remove urls x <- gsub(”[^a-z\\s']“,” “, x) # keep letters, spaces, apostrophes x <- gsub(”\s+“,” “, x) trimws(x) } sampled_clean <- vapply(sampled, clean_text, FUN.VALUE = character(1)) # use quanteda for fast tokenization corp <- corpus(sampled_clean) toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) toks <- tokens_remove(toks, stopwords(”en”)) # remove stopwords for cleaner patterns (keep original in model later)
dfm_uni <- dfm(tokens_ngrams(toks, n = 1)) dfm_bi <- dfm(tokens_ngrams(toks, n = 2)) dfm_tri <- dfm(tokens_ngrams(toks, n = 3))
top_n <- 20 freq_uni <- textstat_frequency(dfm_uni) %>% head(top_n) freq_bi <- textstat_frequency(dfm_bi) %>% head(top_n) freq_tri <- textstat_frequency(dfm_tri) %>% head(top_n)
freq_uni freq_bi freq_tri # top unigrams ggplot(freq_uni, aes(x=reorder(feature, frequency), y=frequency)) + geom_bar(stat=“identity”) + coord_flip() + ggtitle(“Top unigrams (sample)”)
ggplot(head(freq_bi, 15), aes(x=reorder(feature, frequency), y=frequency)) + geom_bar(stat=“identity”) + coord_flip() + ggtitle(“Top bigrams (sample)”)