Intro

This is the final part of the Capstone project for Coursera’s Data Science and Data Science: Statistics and Machine Learning Specializations.

The goal of the project is to develop an application that predicts the next word after being given some text input. The preliminary Exploratory Data Analysis served as preparation for building a predictive model. The model presented in this report builds on those findings to construct a data product based on the predictive algorithm.

The training data set, containing a sample corpus of text in several languages, can be downloaded here. This project focuses on the English texts in the data set, which come from blogs, news and Twitter.


  • Code chunks can be displayed by clicking the Code button

Download and unzip

(the loader function is defined in Appendix: loader)

library(stringi); library(tidytext); library(quanteda)
library(quanteda.textstats); library(kableExtra); library(tidyverse)
filename <- "Coursera-SwiftKey.zip"
loader(url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", filename = filename)
dir <- "./data/1227_DS-CS-w2_WordsFIN"
dest <- paste0(dir, "/", filename)
if(!file.exists(paste0(dir, "/final"))) {unzip(dest, exdir = dir)}

Data Processing

Sampling

Within each file (en_US.blogs.txt, en_US.news.txt, en_US.twitter.txt), every line is an extract from a single post/ article/ tweet. Key summaries can be found in the Exploratory Data Analysis.

The original files are quite large: \(0.899\) mln blog records, \(1.01\) mln news items, \(2.36\) mln tweets. To speed things up, sample \(15\%\) of the lines from each source (processing the original files at that size pushed up against R’s memory limits and ran slowly).

path <- paste0(dir, "/cache/sample.RData")
if(file.exists(path)) load(path) else{
  path <- paste0(dir, "/final/en_US")
  svar <- function(name = "twitter") {
    file<- paste0(path,"/en_US.",name,".txt")
    svar<- read_lines(file, skip_empty_rows = TRUE)
    svar <- as_tibble_col(svar, column_name = "text")
    }
  blogs<-svar("blogs")
  twitter<-svar("twitter")
  news<-svar("news")

  set.seed(211214)
  rate <- 0.15
  blogs.sample <- blogs %>% slice_sample(., prop=rate)
  twitter.sample <- twitter %>% slice_sample(., prop=rate)
  news.sample <- news %>% slice_sample(., prop=rate)
  sample <- bind_rows(mutate(blogs.sample, source="blogs"),
                      mutate(twitter.sample, source= "twitter"),
                      mutate(news.sample, source="news"))
  sample$source <- as.factor(sample$source)
  cache(var = sample)
  rm(blogs, twitter, news, blogs.sample, twitter.sample, news.sample, rate, svar)
}

Cleanup

Texts are split into sentences and converted to lower case; then numbers, URLs and hashtags are removed, multiple white spaces are condensed, and leading/trailing white spaces are removed; apostrophe variants are also unified, all non-alphanumeric characters except apostrophes are removed, and surrounding apostrophes/quotation marks are stripped.

# load("./data/1227_DS-CS-w2_WordsFIN/cache/sample.RData")
path <- paste0(dir, "/cache/sample.tidy.RData")
if(file.exists(path)) load(path) else{
  sample.tidy <- as_tibble(clean(sample$text)) %>% rename(text = value)
  blank <- which(sample.tidy$text == "")
  sample.tidy <- sample.tidy[-blank,]
  cache(var = sample.tidy)
  rm(sample, blank)
}
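
For illustration, applying clean (defined in Appendix: clean) to a made-up line shows the sentence splitting, number and punctuation removal (apostrophes are kept) and lower-casing:

clean("It's 100% TRUE - I LOVE this!!! Don't you?")
#> [1] "it's true i love this" "don't you"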

Tokenization

Tokenize the data, that is, separate it into smaller units such as words or phrases - n-grams (contiguous sequences of n items). There were pros and cons to removing profanities, but in the end it was decided to remove them.

# load("./data/1227_DS-CS-w2_WordsFIN/cache/sample.tidy.RData")
path <- paste0(dir, "/cache/sample.corpus.RData")
if(file.exists(path)) load(path) else{
  sample.corpus<- corpus(sample.tidy$text)
  cache(var = sample.corpus)
}
# words.tidy<- sample.tidy %>% unnest_tokens(word,text) - just example: all words (w\o numbers & apostrophes), count: nrow(words.tidy)

path <- paste0(dir, "/cache/toks1.RData")
if(file.exists(path)) load(path) else{
  toks1<- tokens(sample.corpus) # to look at a whole sentence:
  # sample.corpus[["text640151"]]/ toks1[["text640151"]],
  # count: sum(ntoken(sample.corpus))/ sum(ntoken(toks1))
  cache(var = toks1) # original, w\o removing profanities
  rm(sample.tidy, sample.corpus)
}
toks1 <- toksrp(toks1) # remove profanities
# toks1 <- toksrs(toks1) # remove stop-words

Create n-grams (for n = 1:5) and prediction tables showing frequency and related statistics (share, coverage).

Then filter the n-grams to speed up operations and, finally, create the united nexxt-table (leaving only n-grams that appear more than once and keeping no more than four predictions per context).

path <- paste0(dir, "/cache/nexxt.RData")
if(file.exists(path)) load(path) else{
  nexxt0 <- list() # create united nexxt-table - original, full
  nexxt <- list() # reduced united nexxt-table (memory reasons)
  # load("./data/1227_DS-CS-w2_WordsFIN/cache/toks1.RData")
  n1gram<- dfm(toks1) # document feature matrix
  n1gram<-textstat_frequency(n1gram)
  n1gram <- tibble(n1gram) %>%
    transmute(ngram = feature, given = "", nexxt = feature,
              share = frequency/sum(frequency),
              frequency, coverage = cumsum(share), n = 1L)
  cache(var = n1gram) # store original ngram
  
  # load("./data/1227_DS-CS-w2_WordsFIN/cache/n1gram.RData")
  nexxt0[[1]] <- n1gram %>% filter(frequency >= 2) # filter ngram for the nexxt-table
  nexxt1 <- nexxt0[[1]]
  cache(var = nexxt1) # store the 1st list of the nexxt-table
  rm(n1gram, nexxt1)
  
  for (i in 2:5) {
    ngram <- tokens_ngrams(toks1, i)
    ngram<- dfm(ngram)
    ngram<-textstat_frequency(ngram)
    ngram <- tibble(ngram) %>%
      transmute(ngram = gsub("_", " ", feature),
                given = substr(ngram, 1, regexpr(" [^ ]*$", ngram)-1),
                nexxt = substr(ngram, regexpr(" [^ ]*$", ngram)+1, nchar(ngram)),
                share = frequency/sum(frequency),
                frequency, coverage = cumsum(share), n = as.integer(i))
    name <- paste0("n", i, "gram")
    cache(var = ngram, name = name)
    
    # path <- paste0("./data/1227_DS-CS-w2_WordsFIN/cache/n", i, "gram.RData")
    # load(path)
    ngram <- ngram %>% filter(frequency >= 2) # leave only `ngrams` appearing more than once
    # duo <- select(ngram, given) %>% duplicated() # remove duplicates (for the final model)
    # ngram <- ngram[!duo,] # (for the final model)
    ##### also possible leave no more than n predictions (here is n=5)
    # duo <- select(ngram, given) %>% duplicated()
    # ngram2 <- ngram[duo,]
    # ngram1 <- ngram[!duo,]
    # duo <- select(ngram2, given) %>% duplicated()
    # ngram3 <- ngram2[duo,]
    # ngram2 <- ngram2[!duo,]
    # duo <- select(ngram3, given) %>% duplicated()
    # ngram4 <- ngram3[duo,]
    # ngram3 <- ngram3[!duo,]
    # duo <- select(ngram4, given) %>% duplicated()
    # ngram5 <- ngram4[duo,]
    # ngram4 <- ngram4[!duo,]
    # duo <- select(ngram5, given) %>% duplicated()
    # ngram5 <- ngram5[!duo,]
    # ngram <- rbind(ngram1, ngram2, ngram3, ngram4, ngram5)
    #####
    nexxt0[[i]] <- ngram
    name <- paste0("nexxt", i)
    cache(var = ngram, name = name)
    }
  for (i in 1:length(nexxt0)) nexxt[[i]] <- nexxt0[[i]] %>%
    select(given, nexxt, share, frequency, n)
  cache(var = nexxt0)
  cache(var = nexxt)
  rm(toks1, i, name, ngram)
}

Model Testing

Smoothing

The model, like many others, depends on the training corpus (sample). So we always face problems such as items not seen in the training data (unknown words / unobserved n-grams), as well as balancing the weight between frequent and infrequent n-grams.

There are a number of ways to address these issues.

First, if the item is not seen, there may be several reasons:

  • (some) word(s) contain typos,
  • the last entered word is incomplete,
  • it’s really unseen in a training set.

In the case of typos, different string metrics can be used, e.g. q-gram, Jaccard or cosine distance.
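
As an illustration only (the stringdist package is not part of this pipeline), such metrics can be computed for a typo and its intended word:

library(stringdist)
# smaller values mean closer strings; q is the q-gram size
stringdist("hapiest", "happiest", method = "qgram",   q = 2)
stringdist("hapiest", "happiest", method = "jaccard", q = 2)
stringdist("hapiest", "happiest", method = "cosine",  q = 2)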

The incomplete word problem can be resolved by considering whether the last character is a space.
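
A minimal sketch of such a check (the input object and the use of the entire flag here are assumptions; the actual handling lives in server.R via chain and guess):

# treat the last word as complete only if the raw input ends in white space
entire <- grepl("[[:space:]]$", input)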

As for the problem with unseen items, it is also associated with infrequent ones. A good analysis of this issue is given in “Speech and Language Processing” by Jurafsky & Martin (chapters 3.3, 3.4).

For example, for unknown words (OOV - out of vocabulary), a pseudo-word called ‘UNK’ is created. Then, all words that occur fewer than n times in the training set can be replaced by ‘UNK’, or the top, say, 50 000 words by frequency are chosen, and the rest are replaced by ‘UNK’.
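
A minimal sketch of the first variant (not run in this pipeline), assuming the cached n1gram frequency table and the toks1 tokens object built above, with an arbitrary threshold of 2:

rare <- n1gram %>% filter(frequency < 2) %>% pull(nexxt) # words seen only once
toks.unk <- tokens_replace(toks1, pattern = rare,
                           replacement = rep("UNK", length(rare)),
                           valuetype = "fixed")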

In general, while dealing with zero-frequency (unobserved) words/n-grams, it is necessary to smooth the probability distributions by assigning non-zero probabilities to them. There are different smoothing approaches, e.g. add-one (Laplace) smoothing, Good-Turing discounting, Kneser-Ney smoothing and backoff/interpolation methods.

Stupid Backoff algorithm implementation

Here the Stupid Backoff model is implemented, since it is the simplest solution for web-scale n-grams: it computes an n-gram score (rather than a probability) on very large datasets very quickly. If the model meets an n-gram with a zero count, it simply backs off to a lower-order n-gram, weighted by a fixed (context-independent) weight:

\(S(w_i|w^{i-1}_{i-k+1}) = \left\{ \begin{array}{ll} \frac{f(w^i_{i-k+1})}{f(w^{i-1}_{i-k+1})} & \mbox{if \(f(w^i_{i-k+1})>0\)};\\\alpha \cdot S(w_i|w^{i-1}_{i-k+2}) & \mbox{otherwise}\end{array} \right.\)

where

  • \(w^L_1 = (w_1,...,w_L)\) is a string of \(L\) tokens
  • \(w^j_i\) is a substring of \(w^L_1\)
  • \(f(w^j_i)\) is the frequency of occurrence of that substring in the training data.

The recursion ends at unigrams: \(S(w_i) = \frac{f(w_i)}{N}\) with \(N\) being the size of the training corpus.

The authors of the model empirically set the backoff factor to \(\alpha = 0.4\).
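
For example, with hypothetical counts: if \(f(\text{of beer}) = 8\) and \(f(\text{of}) = 1000\), then \(S(\text{beer}\,|\,\text{of}) = 8/1000 = 0.008\); if instead \(f(\text{of beer}) = 0\), the score backs off to \(0.4 \cdot S(\text{beer}) = 0.4 \cdot f(\text{beer})/N\).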

This algorithm is implemented in the wnexxt function (Appendix: wnexxt).

To speed up the application, it might also make sense to precompute the Stupid Backoff scores.
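
A possible sketch of such precomputation (assuming the search always starts from the highest order, K = 5, which differs slightly from wnexxt when the input is shorter than four words): store the backoff-weighted score directly in the nexxt tables, so a query only needs to filter and sort.

alpha <- 0.4
K <- length(nexxt) # = 5
for (i in seq_along(nexxt)) nexxt[[i]] <- nexxt[[i]] %>%
  mutate(score = share * alpha^(K - n))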

Test Examples

Testing exercises come from Quiz 2: Natural language processing I (Week 3 of the Data Science Capstone course): predict the next word for each of the sentence fragments below (scroll down the table).

test <- rbind(
        c("The guy in front of me just bought a pound of bacon, a bouquet, and a case of", "beer", "cheese", "pretzels", "soda"), 
        c( "You're the reason why I smile everyday. Can you follow me please? It would mean the", "world", "most", "universe", "best"), 
        c( "Hey sunshine, can you follow me and make me the", "smelliest", "saddest", "bluest", "happiest"), 
        c( "Very early observations on the Bills game: Offense still struggling but the", "referees", "players", "defense", "crowd"), 
        c( "Go on a romantic date at the", "movies", "mall", "grocery", "beach"),
        c( "Well I'm pretty sure my granny has some old bagpipes in her garage I'll dust them off and be on my", "motorcycle", "way", "phone", "horse"), 
        c( "Ohhhhh #PointBreak is on tomorrow. Love that film and haven't seen it in quite some", "time", "weeks", "thing", "years"), 
        c( "After the ice bucket challenge Louis will push his long wet hair out of his eyes with his little", "eyes", "fingers", "toes", "ears"), 
        c( "Be grateful for the good times and keep the faith during the", "sad", "bad", "hard", "worse"), 
        c( "If this isn't the cutest thing you've ever seen, then you must be", "callous", "insane", "insensitive", "asleep"),
        c("When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd","die","give","sleep","eat"),
        c("Guy at my table's wife got up to go to the bathroom and I asked about dessert and he started telling me about his","horticultural","spiritual","financial","marital"),
        c("I'd give anything to see arctic monkeys this","month","morning","weekend","decade"),
        c("Talking to your mom has the same effect as a hug and helps reduce your","sleepiness","happiness","stress","hunger"),
        c("When you were in Holland you were like 1 inch away from me but you hadn't time to take a","look","minute","picture","walk"),
        c("I'd just like all of these questions answered, a presentation of evidence, and a jury to settle the","incident","case","account","matter"),
        c("I can't deal with unsymetrical things. I can't even hold an uneven number of bags of groceries in each","finger","hand","arm","toe"),
        c("Every inch of you is perfect from the bottom to the","center","middle","top","side"),
        c("I’m thankful my childhood was filled with imagination and bruises from playing","outside","daily","inside","weekly"),
        c("I like how the same people are in almost all of Adam Sandler's","pictures","novels","movies","stories")
)
colnames(test) <- c("text", "option1", "option2", "option3", "option4")
test <- as_tibble(test) %>% add_column(textn = 1:nrow(test), .before = "text")
kable(test, caption = "Table 1: Exercise") %>% kable_styling() %>%
        scroll_box(width = "100%", height = "500px")
Table 1: Exercise
textn text option1 option2 option3 option4
1 The guy in front of me just bought a pound of bacon, a bouquet, and a case of beer cheese pretzels soda
2 You’re the reason why I smile everyday. Can you follow me please? It would mean the world most universe best
3 Hey sunshine, can you follow me and make me the smelliest saddest bluest happiest
4 Very early observations on the Bills game: Offense still struggling but the referees players defense crowd
5 Go on a romantic date at the movies mall grocery beach
6 Well I’m pretty sure my granny has some old bagpipes in her garage I’ll dust them off and be on my motorcycle way phone horse
7 Ohhhhh #PointBreak is on tomorrow. Love that film and haven’t seen it in quite some time weeks thing years
8 After the ice bucket challenge Louis will push his long wet hair out of his eyes with his little eyes fingers toes ears
9 Be grateful for the good times and keep the faith during the sad bad hard worse
10 If this isn’t the cutest thing you’ve ever seen, then you must be callous insane insensitive asleep
11 When you breathe, I want to be the air for you. I’ll be there for you, I’d live and I’d die give sleep eat
12 Guy at my table’s wife got up to go to the bathroom and I asked about dessert and he started telling me about his horticultural spiritual financial marital
13 I’d give anything to see arctic monkeys this month morning weekend decade
14 Talking to your mom has the same effect as a hug and helps reduce your sleepiness happiness stress hunger
15 When you were in Holland you were like 1 inch away from me but you hadn’t time to take a look minute picture walk
16 I’d just like all of these questions answered, a presentation of evidence, and a jury to settle the incident case account matter
17 I can’t deal with unsymetrical things. I can’t even hold an uneven number of bags of groceries in each finger hand arm toe
18 Every inch of you is perfect from the bottom to the center middle top side
19 I’m thankful my childhood was filled with imagination and bruises from playing outside daily inside weekly
20 I like how the same people are in almost all of Adam Sandler’s pictures novels movies stories
  • Then apply the function wnexxt to see which options come first, and compare them to the correct answers (scroll down the table):
tops <- list()
astr<- list()
correct <- c("beer", "world", "happiest", "defense", "beach", "way", "time", "fingers", "bad", "insane", "die", "marital", "weekend", "stress", "picture", "matter", "hand", "top", "outside", "movies")
ans <- tibble(textn = 1:20, text = test$text, ans = as.character(NA),
              correct = correct, place = as.character(NA))
for (i in 1:nrow(test)) {
  text <- paste(clean(test$text[i]), collapse = ' ')
  text <- unlist(toksrp(tokens(text)))
  initn <- min(length(nexxt), length(text)+1) # initial ngram model
  lim <- 10000
  for (j in 1:length(nexxt)) {
    astr[[j]] <- nexxt[[j]] %>% filter(nexxt %in% test[i, 3:6])
    # reduce the `nexxt` united table to the options specified in the test
    }
  tops[[i]]<- wnexxt(nexxt = astr, initn, initn, text, lim = lim)
  tops[[i]] <- tops[[i]] %>% arrange(desc(gramn), desc(score)) %>%
    add_column(textn = as.integer(i), .before = "given")
  if(nrow(tops[[i]])>0) ans[i,3] <- tops[[i]]$nexxt[1]
  if (length(grep(ans[i,4], tops[[i]]$nexxt)) >0) ans[i,5] <-
    paste(grep(ans[i,4], tops[[i]]$nexxt), collapse = " ")
}

kable(ans, caption = "Table 2: First answers vs Correct answers",
      col.names = c("text number", "text","first option",
                    "correct answer","place(s) of correct answer")) %>%
        kable_styling() %>%
        scroll_box(width = "100%", height = "400px")
Table 2: First answers vs Correct answers
text number text first option correct answer place(s) of correct answer
1 The guy in front of me just bought a pound of bacon, a bouquet, and a case of beer beer 1 2 3 6
2 You’re the reason why I smile everyday. Can you follow me please? It would mean the world world 1 2 3 7 11
3 Hey sunshine, can you follow me and make me the happiest happiest 1 2 3 6
4 Very early observations on the Bills game: Offense still struggling but the defense defense 1 6 9
5 Go on a romantic date at the beach beach 1 5 9
6 Well I’m pretty sure my granny has some old bagpipes in her garage I’ll dust them off and be on my way way 1 3 6 10
7 Ohhhhh #PointBreak is on tomorrow. Love that film and haven’t seen it in quite some time time 1 2 3 7
8 After the ice bucket challenge Louis will push his long wet hair out of his eyes with his little eyes fingers 2 4
9 Be grateful for the good times and keep the faith during the bad bad 1 2 7
10 If this isn’t the cutest thing you’ve ever seen, then you must be asleep insane 2 5
11 When you breathe, I want to be the air for you. I’ll be there for you, I’d live and I’d give die 2 8
12 Guy at my table’s wife got up to go to the bathroom and I asked about dessert and he started telling me about his financial marital 5
13 I’d give anything to see arctic monkeys this morning weekend 2 6
14 Talking to your mom has the same effect as a hug and helps reduce your happiness stress 2 4
15 When you were in Holland you were like 1 inch away from me but you hadn’t time to take a look picture 2 6 10 15
16 I’d just like all of these questions answered, a presentation of evidence, and a jury to settle the matter matter 1 3 6 10
17 I can’t deal with unsymetrical things. I can’t even hold an uneven number of bags of groceries in each hand hand 1 3 5
18 Every inch of you is perfect from the bottom to the top top 1 5 9
19 I’m thankful my childhood was filled with imagination and bruises from playing outside outside 1 3
20 I like how the same people are in almost all of Adam Sandler’s stories movies 3
  • Now look at the order of the returned options for each sentence (scroll down the table):
astr<- tops[[1]]
for (i in 2:20) astr<- rbind(astr, tops[[i]])
astr<- astr %>% mutate(given = paste0('"', given, '"'))
kable(astr, caption = "Table 3: Ranking of the options",
      col.names = c("text number", "begin of ngram","option",
                    "score","words in ngram")) %>%
        kable_styling() %>%
        scroll_box(width = "100%", height = "300px")
Table 3: Ranking of the options
text number begin of ngram option score words in ngram
1 “a case of” beer 1.00e-07 4
1 “case of” beer 1.00e-07 3
1 “of” beer 8.00e-07 2
1 “of” cheese 2.00e-07 2
1 “of” soda 1.00e-07 2
1 "" beer 5.00e-06 1
1 "" cheese 2.60e-06 1
1 "" soda 4.00e-07 1
1 "" pretzels 1.00e-07 1
2 “it would mean the” world 2.80e-06 5
2 “would mean the” world 1.10e-06 4
2 “mean the” world 6.00e-07 3
2 “mean the” most 1.00e-07 3
2 “the” best 2.80e-05 2
2 “the” most 2.31e-05 2
2 “the” world 2.30e-05 2
2 “the” universe 1.30e-06 2
2 "" most 2.38e-05 1
2 "" best 1.99e-05 1
2 "" world 1.65e-05 1
2 "" universe 9.00e-07 1
3 “make me the” happiest 1.00e-07 4
3 “me the” happiest 1.00e-07 3
3 “the” happiest 3.00e-07 2
3 “the” saddest 1.00e-07 2
3 “the” bluest 0.00e+00 2
3 "" happiest 2.00e-07 1
3 "" saddest 1.00e-07 1
3 "" bluest 0.00e+00 1
4 “but the” defense 1.00e-07 3
4 “but the” crowd 1.00e-07 3
4 “but the” players 0.00e+00 3
4 “the” crowd 2.40e-06 2
4 “the” players 1.50e-06 2
4 “the” defense 1.20e-06 2
4 “the” referees 0.00e+00 2
4 "" players 4.40e-06 1
4 "" defense 2.70e-06 1
4 "" crowd 1.90e-06 1
4 "" referees 1.00e-07 1
5 “at the” beach 1.00e-06 3
5 “at the” mall 7.00e-07 3
5 “at the” grocery 4.00e-07 3
5 “at the” movies 3.00e-07 3
5 “the” beach 2.40e-06 2
5 “the” mall 1.00e-06 2
5 “the” movies 8.00e-07 2
5 “the” grocery 6.00e-07 2
5 "" beach 2.90e-06 1
5 "" movies 2.20e-06 1
5 "" mall 1.00e-06 1
5 "" grocery 8.00e-07 1
6 “be on my” way 1.00e-07 4
6 “be on my” phone 1.00e-07 4
6 “on my” way 4.80e-06 3
6 “on my” phone 1.70e-06 3
6 “on my” horse 1.00e-07 3
6 “my” way 3.20e-06 2
6 “my” phone 3.10e-06 2
6 “my” horse 1.00e-07 2
6 “my” motorcycle 0.00e+00 2
6 "" way 2.88e-05 1
6 "" phone 5.90e-06 1
6 "" horse 1.30e-06 1
6 "" motorcycle 3.00e-07 1
7 “in quite some” time 2.00e-07 4
7 “quite some” time 6.00e-07 3
7 “some” time 2.50e-06 2
7 “some” years 3.00e-07 2
7 “some” weeks 1.00e-07 2
7 “some” thing 1.00e-07 2
7 "" time 5.67e-05 1
7 "" years 2.36e-05 1
7 "" thing 1.43e-05 1
7 "" weeks 6.50e-06 1
8 “little” eyes 0.00e+00 2
8 “little” fingers 0.00e+00 2
8 "" eyes 4.20e-06 1
8 "" fingers 1.10e-06 1
8 "" ears 7.00e-07 1
8 "" toes 3.00e-07 1
9 “during the” bad 0.00e+00 3
9 “the” bad 1.70e-06 2
9 “the” hard 1.40e-06 2
9 “the” sad 4.00e-07 2
9 “the” worse 3.00e-07 2
9 "" hard 1.07e-05 1
9 "" bad 1.04e-05 1
9 "" sad 3.10e-06 1
9 "" worse 2.20e-06 1
10 “be” asleep 1.00e-07 2
10 “be” insane 1.00e-07 2
10 “be” insensitive 0.00e+00 2
10 "" asleep 1.00e-06 1
10 "" insane 6.00e-07 1
10 "" insensitive 1.00e-07 1
10 "" callous 0.00e+00 1
11 “i’d” give 2.00e-07 2
11 “i’d” die 0.00e+00 2
11 “i’d” eat 0.00e+00 2
11 “i’d” sleep 0.00e+00 2
11 "" give 1.22e-05 1
11 "" sleep 5.10e-06 1
11 "" eat 5.00e-06 1
11 "" die 2.60e-06 1
12 “his” financial 0.00e+00 2
12 “his” spiritual 0.00e+00 2
12 "" financial 2.80e-06 1
12 "" spiritual 8.00e-07 1
12 "" marital 1.00e-07 1
12 "" horticultural 0.00e+00 1
13 “this” morning 8.10e-06 2
13 “this” weekend 7.70e-06 2
13 “this” month 3.60e-06 2
13 “this” decade 1.00e-07 2
13 "" morning 1.14e-05 1
13 "" weekend 9.30e-06 1
13 "" month 7.90e-06 1
13 "" decade 1.30e-06 1
14 “your” happiness 1.00e-07 2
14 “your” stress 1.00e-07 2
14 “your” hunger 0.00e+00 2
14 "" stress 1.20e-06 1
14 "" happiness 1.00e-06 1
14 "" hunger 9.00e-07 1
14 "" sleepiness 0.00e+00 1
15 “to take a” look 1.40e-06 4
15 “to take a” picture 1.00e-06 4
15 “to take a” walk 2.00e-07 4
15 “to take a” minute 1.00e-07 4
15 “take a” look 2.80e-06 3
15 “take a” picture 9.00e-07 3
15 “take a” minute 4.00e-07 3
15 “take a” walk 2.00e-07 3
15 “a” minute 2.60e-06 2
15 “a” picture 2.50e-06 2
15 “a” look 2.10e-06 2
15 “a” walk 1.20e-06 2
15 "" look 1.65e-05 1
15 "" walk 4.60e-06 1
15 "" picture 3.80e-06 1
15 "" minute 2.90e-06 1
16 “to settle the” matter 1.00e-07 4
16 “to settle the” case 1.00e-07 4
16 “settle the” matter 0.00e+00 3
16 “settle the” case 0.00e+00 3
16 “the” case 6.30e-06 2
16 “the” matter 1.40e-06 2
16 “the” incident 1.30e-06 2
16 “the” account 3.00e-07 2
16 "" case 7.40e-06 1
16 "" matter 4.90e-06 1
16 "" account 2.60e-06 1
16 "" incident 1.10e-06 1
17 “in each” hand 0.00e+00 3
17 “in each” arm 0.00e+00 3
17 “each” hand 0.00e+00 2
17 “each” arm 0.00e+00 2
17 "" hand 6.00e-06 1
17 "" arm 1.20e-06 1
17 "" finger 8.00e-07 1
17 "" toe 3.00e-07 1
18 “to the” top 2.70e-06 3
18 “to the” side 8.00e-07 3
18 “to the” center 5.00e-07 3
18 “to the” middle 5.00e-07 3
18 “the” top 9.90e-06 2
18 “the” middle 6.20e-06 2
18 “the” center 2.90e-06 2
18 “the” side 2.20e-06 2
18 "" top 9.60e-06 1
18 "" center 7.30e-06 1
18 "" side 7.10e-06 1
18 "" middle 4.00e-06 1
19 “playing” outside 1.00e-07 2
19 “playing” inside 0.00e+00 2
19 "" outside 5.50e-06 1
19 "" inside 4.20e-06 1
19 "" daily 2.90e-06 1
19 "" weekly 8.00e-07 1
20 "" stories 3.00e-06 1
20 "" pictures 2.70e-06 1
20 "" movies 2.20e-06 1
20 "" novels 5.00e-07 1
rm(ans, test, tops, astr, correct, i, initn, lim, text)

server.R

Also, the log, chain and guess functions were written for the server.R script: log for debugging, while chain and guess allow server.R to work with incomplete words (Appendix: log, chain, guess).
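
For example (a made-up call, based on the definitions in the appendix), chain completes an unfinished word by overlapping the typed text with a matched context:

chain(text = "i want t", given = "want to")
#> [1] "i want to"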

Appendix

loader

loader <- function(url, dir = "./data/1227_DS-CS-w2_WordsFIN", filename) {
  if(!dir.exists(dir)) dir.create(dir, recursive = TRUE)
        dest <- paste0(dir, "/", filename)
        if(!file.exists(dest)) download.file(
                url = url, destfile = dest, method = "curl")
}

cache

cache <- function(var, name = as.character(var),
                  dir = "./data/1227_DS-CS-w2_WordsFIN/cache") {
        # `var` is replaced by its own name below; because the default for
        # `name` is evaluated lazily, it picks up that name as well
        var <- deparse(substitute(var))
        dest <- paste0(dir, "/", name, ".RData")
        if(!dir.exists(dir)) dir.create(dir, recursive = TRUE)
        if(!file.exists(dest)) save(list = c(var), file = dest)
}

clean

clean <- function(text) {
        text <- unlist(strsplit(text, "[\\.\\!\\?:;]+"))
        text <- gsub("[’‘`´\"]", "'", text) # unif apostrophes
        text <- gsub("[[:blank:]]#[^[:blank:]]*", " ", text, perl = T) #hashs
        text <- gsub("[[:blank:]]@[^[:blank:]]*", " ", text, perl = T) #at signs (@)
        text <- gsub("(https?)?://[^[:blank:]]*", " ", text, perl = T) #urls
        text <- gsub("[^[:alnum:]']", " ", text, perl = T) # non-alpha/numerical
        text <- gsub("[[:blank:]]+'([[:alnum:][:blank:]]+)'[[:blank:]]+", " \\1 ",
                          text, perl = T) # surr apostrophes
        text <- stri_trim_both(text) # surr blanks
        text <-gsub("[[:blank:][:digit:]+[:blank:]]", " ", text, perl = T) #numbers w\o text
        text<- gsub("[[:blank:]]{2,}", " ", text, perl = T) # condense blanks
        text <- trimws(text) # leading/ trailing blanks
        text <- tolower(text)
        text
}

rmoptions

Ways of filtering profanities and stop-words, and stemming

# load profanity file (a publicly kept profanity list from https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words)
        loader(url =
                 "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/5faf2ba42d7b1c0977169ec3611df25a3c08eb13/en",
               dir = "./data/1227_DS-CS-w2_WordsFIN/ignore",
               filename = "ignore")

# remove profanities
toksrp <- function(toks){
        ignore <- read_tsv("./data/1227_DS-CS-w2_WordsFIN/ignore/ignore",
                           col_names = FALSE)
        ignore <- ignore$X1
        toksrp <- tokens_remove(toks, ignore)
        toksrp
}

# remove stop-words
toksrs <- function(toks) {
  library(quanteda)
  toksrs <- tokens_remove(toks, stopwords("english"))
  toksrs
}

# stemming
stemm <- function(x) {
  library(quanteda)
  # dfm(x, stem = TRUE) is deprecated in quanteda 3.x; use dfm_wordstem()
  stemm <- dfm_wordstem(dfm(x))
  stemm
}

wnexxt

wnexxt <- function(nexxt, initn, nexxtn, text, entire = TRUE,
                  wnexxt = NULL, lim = 1000, alpha = 0.4) {
        if(nexxtn<1) return(wnexxt)
        ### recursion
        astr <- tail(text, n = nexxtn-1)
        astr <- paste(astr, collapse = " ")
        nexxt0 <- nexxt[[nexxtn]]
        if(entire) nexxt0 <- nexxt0 %>% filter(given == astr) else
                nexxt0 <- nexxt0[startsWith(nexxt0$given, astr), ]
        if(nrow(nexxt0)>0) nexxt0 <- nexxt0 %>%
                slice_head(n = min(nrow(nexxt0), lim)) %>%
                transmute(given, nexxt,
                          score = share*alpha^(initn-nexxtn), gramn = n)
        # Stupid backoff factor alpha
        wnexxt <- rbind(wnexxt, nexxt0)
        return(wnexxt(nexxt = nexxt, initn = initn, nexxtn = nexxtn-1,
                      text = text, entire = entire, wnexxt = wnexxt, lim = lim))
}

log (for server.R)

log <- function(..., obj1=NULL, obj2=NULL) {
        if(exists("verbose") && verbose){
                cat(file=stderr(),"-------------------------------")
                cat(file=stderr(),"\n> ", ..., "\n", sep=" ")
                if(!is.null(obj1)){
                        print(obj1)
                }
                if(!is.null(obj2)){
                        print(obj2)
                }
        }
}

chain (for server.R)

chain<- function(text, given){
        ttext <- paste(clean(text), collapse = ' ')
        tgiven <- paste(clean(given), collapse = ' ')
        initl <- min(nchar(ttext),nchar(tgiven))
        repeat{
                goleft <- str_sub(tgiven, 1, initl)
                goright <- str_sub(ttext, -initl)
                if(goleft == goright) loop <- FALSE else{
                        initl <- initl - 1 
                        loop <- initl > 0
                }
                if(!loop) break
        }
        paste0(text, str_sub(given, initl+1))
}

guess (for server.R)

guess <- function(text, options, entire){
        options$full = sapply(options$given, chain, text = text)
        if(entire){
                options$guess <- options$nexxt
                options$full <- paste0(options$full, options$nexxt)
        }else{
                options$guess <- paste(options$given, options$nexxt)
                options$full <- paste(options$full, options$nexxt)
        }
        options %>% select(guess, full)
}

Session info

sessionInfo()
R version 4.0.3 (2020-10-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur 10.16

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_0.5.1             stringr_1.4.0            
 [3] dplyr_1.0.7               purrr_0.3.4              
 [5] readr_2.0.1               tidyr_1.1.3              
 [7] tibble_3.1.3              ggplot2_3.3.5            
 [9] tidyverse_1.3.1           kableExtra_1.3.4         
[11] quanteda.textstats_0.94.1 quanteda_3.1.0           
[13] tidytext_0.3.1            stringi_1.7.3            

loaded via a namespace (and not attached):
 [1] httr_1.4.2         sass_0.4.0         bit64_4.0.5        vroom_1.5.4       
 [5] jsonlite_1.7.2     viridisLite_0.4.0  modelr_0.1.8       bslib_0.2.5.1     
 [9] RcppParallel_5.1.4 assertthat_0.2.1   highr_0.9          cellranger_1.1.0  
[13] yaml_2.2.1         pillar_1.6.2       backports_1.2.1    lattice_0.20-44   
[17] glue_1.4.2         digest_0.6.27      rvest_1.0.1        colorspace_2.0-2  
[21] htmltools_0.5.1.1  Matrix_1.3-4       pkgconfig_2.0.3    broom_0.7.9       
[25] haven_2.4.3        scales_1.1.1       webshot_0.5.2      svglite_2.0.0     
[29] nsyllable_1.0      tzdb_0.1.2         generics_0.1.0     ellipsis_0.3.2    
[33] withr_2.4.2        cli_3.0.1          magrittr_2.0.1     crayon_1.4.1      
[37] readxl_1.3.1       evaluate_0.14      stopwords_2.2      tokenizers_0.2.1  
[41] janeaustenr_0.1.5  fs_1.5.0           fansi_0.5.0        SnowballC_0.7.0   
[45] xml2_1.3.2         tools_4.0.3        hms_1.1.0          lifecycle_1.0.0   
[49] munsell_0.5.0      reprex_2.0.1       compiler_4.0.3     jquerylib_0.1.4   
[53] proxyC_0.2.0       systemfonts_1.0.2  rlang_0.4.11       grid_4.0.3        
[57] rstudioapi_0.13    rmarkdown_2.10     gtable_0.3.0       DBI_1.1.1         
[61] R6_2.5.1           lubridate_1.7.10   knitr_1.33         bit_4.0.4         
[65] utf8_1.2.2         fastmatch_1.1-3    parallel_4.0.3     Rcpp_1.0.7        
[69] vctrs_0.3.8        dbplyr_2.1.1       tidyselect_1.1.1   xfun_0.25