This is the final part of the Capstone project for Coursera's Data Science and Data Science: Statistics and Machine Learning Specializations.
The goal of the project is to develop an application that predicts the next word after being given some text input. The preliminary Exploratory Data Analysis served as preparation for building a predictive model. The model presented in this report builds on those findings to construct a data product based on the predictive algorithm.
The training data set, containing a sample corpus of texts in different languages, can be downloaded here. This project focuses on the English texts in the data set, which come from blogs, news, and Twitter.
(The loader function is defined in Appendix: loader.)
library(stringi); library(tidytext); library(quanteda)
library(quanteda.textstats); library(kableExtra); library(tidyverse)
filename <- "Coursera-SwiftKey.zip"
loader(url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", filename = filename)
dir <- "./data/1227_DS-CS-w2_WordsFIN"
dest <- paste0(dir, "/", filename)
if(!file.exists(paste0(dir, "/final"))) {unzip(dest, exdir = dir)}

Within each file (en_US.blogs.txt, en_US.news.txt, en_US.twitter.txt), every line is an extract from a single post, article, or tweet. Key summaries can be found in the Exploratory Data Analysis.
The original files are quite large: \(0.899\) mln blog records, \(1.01\) mln news items, and \(2.36\) mln tweets. To speed things up, we sample \(15\%\) of the lines from each source (processing the original files at that size pushed up against R's memory limits and ran slowly).
path <- paste0(dir, "/cache/sample.RData")
if(file.exists(path)) load(path) else{
path <- paste0(dir, "/final/en_US")
svar <- function(name = "twitter") {
file<- paste0(path,"/en_US.",name,".txt")
svar<- read_lines(file, skip_empty_rows = TRUE)
svar <- as_tibble_col(svar, column_name = "text")
}
blogs<-svar("blogs")
twitter<-svar("twitter")
news<-svar("news")
set.seed(211214)
rate <- 0.15
blogs.sample <- blogs %>% slice_sample(., prop=rate)
twitter.sample <- twitter %>% slice_sample(., prop=rate)
news.sample <- news %>% slice_sample(., prop=rate)
sample <- bind_rows(mutate(blogs.sample, source="blogs"),
mutate(twitter.sample, source= "twitter"),
mutate(news.sample, source="news"))
sample$source <- as.factor(sample$source)
cache(var = sample)
rm(blogs, twitter, news, blogs.sample, twitter.sample, news.sample, rate, svar)
}

Texts are split into sentences and converted to lower case; then numbers, URLs, and hashtags are removed, multiple white spaces are condensed, and leading/trailing white spaces are stripped. Apostrophe variants are also unified, all non-alphanumeric characters except apostrophes are removed, and surrounding apostrophes/quotation marks are stripped (the clean function is defined in Appendix: clean).
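For example, on a made-up sentence (the exact behaviour follows the regular expressions in Appendix: clean; note the sentence split on "?" and the removal of the comma, the standalone number, and the hashtag):
clean("It’s GREAT, isn’t it? We have 2 cats #pets")
# [1] "it's great isn't it" "we have cats"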
# load("./data/1227_DS-CS-w2_WordsFIN/cache/sample.RData")
path <- paste0(dir, "/cache/sample.tidy.RData")
if(file.exists(path)) load(path) else{
sample.tidy <- as_tibble(clean(sample$text)) %>% rename(text = value)
blank <- which(sample.tidy$text == "")
sample.tidy <- sample.tidy[-blank,]
cache(var = sample.tidy)
rm(sample, blank)
}

Next, tokenize the data, that is, separate it into smaller units such as words or phrases: n-grams (contiguous sequences of n items). There were pros and cons to removing profanities, but in the end it was decided to remove them.
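As a toy illustration of n-grams in quanteda (not part of the pipeline below; output abridged):
toks <- tokens("be grateful for the good times")
tokens_ngrams(toks, n = 3) # trigrams, joined by "_":
# [1] "be_grateful_for" "grateful_for_the" "for_the_good" "the_good_times"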
# load("./data/1227_DS-CS-w2_WordsFIN/cache/sample.tidy.RData")
path <- paste0(dir, "/cache/sample.corpus.RData")
if(file.exists(path)) load(path) else{
sample.corpus<- corpus(sample.tidy$text)
cache(var = sample.corpus)
}
# words.tidy <- sample.tidy %>% unnest_tokens(word, text) - just an example: all words (w/o numbers & apostrophes); count: nrow(words.tidy)
path <- paste0(dir, "/cache/toks1.RData")
if(file.exists(path)) load(path) else{
toks1<- tokens(sample.corpus) # to look at a whole sentence:
# sample.corpus[["text640151"]]/ toks1[["text640151"]],
# count: sum(ntoken(sample.corpus))/ sum(ntoken(toks1))
cache(var = toks1) # original, w/o removing profanities
rm(sample.tidy, sample.corpus)
}
toks1 <- toksrp(toks1) # remove profanities
# toks1 <- toksrs(toks1) # remove stop-words

Create n-grams (for n = 1:5) and prediction tables showing their frequencies. Then filter the n-grams to speed up operations and, finally, create the united nexxt table (keeping only n-grams that appear more than once, and at most a handful of predictions per context; a compact sketch of the per-context cap follows below).
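The commented-out block inside the loop below caps the predictions per context with repeated duplicated() calls; an equivalent, more compact dplyr sketch (my wording, not the report's code) would be:
# keep the five most frequent predictions for each context
ngram <- ngram %>% group_by(given) %>%
  slice_max(frequency, n = 5, with_ties = FALSE) %>% ungroup()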
path <- paste0(dir, "/cache/nexxt.RData")
if(file.exists(path)) load(path) else{
nexxt0 <- list() # create united nexxt-table - original, full
nexxt <- list() # reduced united nexxt-table (memory reasons)
# load("./data/1227_DS-CS-w2_WordsFIN/cache/toks1.RData")
n1gram<- dfm(toks1) # document feature matrix
n1gram<-textstat_frequency(n1gram)
n1gram <- tibble(n1gram) %>%
transmute(ngram = feature, given = "", nexxt = feature,
share = frequency/sum(frequency),
frequency, coverage = cumsum(share), n = 1L)
cache(var = n1gram) # store original ngram
# load("./data/1227_DS-CS-w2_WordsFIN/cache/n1gram.RData")
nexxt0[[1]] <- n1gram %>% filter(frequency >= 2) # filter ngram for the nexxt-table
nexxt1 <- nexxt0[[1]]
cache(var = nexxt1) # store the 1st list of the nexxt-table
rm(n1gram, nexxt1)
for (i in 2:5) {
ngram <- tokens_ngrams(toks1, i)
ngram<- dfm(ngram)
ngram<-textstat_frequency(ngram)
ngram <- tibble(ngram) %>%
transmute(ngram = gsub("_", " ", feature),
given = substr(ngram, 1, regexpr(" [^ ]*$", ngram)-1),
nexxt = substr(ngram, regexpr(" [^ ]*$", ngram)+1, nchar(ngram)),
share = frequency/sum(frequency),
frequency, coverage = cumsum(share), n = as.integer(i))
name <- paste0("n", i, "gram")
cache(var = ngram, name = name)
# path <- paste0("./data/1227_DS-CS-w2_WordsFIN/cache/n", i, "gram.RData")
# load(path)
ngram <- ngram %>% filter(frequency >= 2) # leave only `ngrams` appearing more than once
# duo <- select(ngram, given) %>% duplicated() # remove duplicates (for the final model)
# ngram <- ngram[!duo,] # (for the final model)
##### also possible leave no more than n predictions (here is n=5)
# duo <- select(ngram, given) %>% duplicated()
# ngram2 <- ngram[duo,]
# ngram1 <- ngram[!duo,]
# duo <- select(ngram2, given) %>% duplicated()
# ngram3 <- ngram2[duo,]
# ngram2 <- ngram2[!duo,]
# duo <- select(ngram3, given) %>% duplicated()
# ngram4 <- ngram3[duo,]
# ngram3 <- ngram3[!duo,]
# duo <- select(ngram4, given) %>% duplicated()
# ngram5 <- ngram4[duo,]
# ngram4 <- ngram4[!duo,]
# duo <- select(ngram5, given) %>% duplicated()
# ngram5 <- ngram5[!duo,]
# ngram <- rbind(ngram1, ngram2, ngram3, ngram4, ngram5)
#####
nexxt0[[i]] <- ngram
name <- paste0("nexxt", i)
cache(var = ngram, name = name)
}
for (i in 1:length(nexxt0)) nexxt[[i]] <- nexxt0[[i]] %>%
select(given, nexxt, share, frequency, n)
cache(var = nexxt0)
cache(var = nexxt)
rm(toks1, i, name, ngram)
}

The model, like many others, depends on the training corpus (the sample). So we always face problems such as items not seen in the training data (unknown words, unobserved n-grams), as well as balancing the weights of frequent and infrequent n-grams.
There are a number of ways to address this.
First, if an item is not seen, there may be several reasons:

- In the case of typos, different string metrics can be used, e.g. q-gram, Jaccard, or cosine distance (see the sketch after this list).
- The incomplete-word problem can be resolved by checking whether the last character of the input is a space.
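For illustration, the stringdist package (not used in this report) implements such metrics; the q-gram size q = 2 here is an arbitrary choice:
library(stringdist)
# q-gram, Jaccard, and cosine distances between a word and a typo of it
stringdist("happiest", "happeist", method = "qgram", q = 2)
stringdist("happiest", "happeist", method = "jaccard", q = 2)
stringdist("happiest", "happeist", method = "cosine", q = 2)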
The problem of unseen items is also connected with that of infrequent ones. A good analysis of this issue is given in "Speech and Language Processing" by Jurafsky & Martin (chapters 3.3, 3.4).
For example, for unknown words (OOV, out of vocabulary), a pseudo-word called 'UNK' is created. Then either all words that occur fewer than n times in the training set are replaced by 'UNK', or the top, say, 50 000 words by frequency are kept and the rest are replaced by 'UNK' (a sketch follows below).
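A minimal sketch of the vocabulary-capping variant (assuming a hypothetical unigram tibble `words` with columns `word` and `frequency`; this is not the report's code):
vocab <- words %>% slice_max(frequency, n = 50000, with_ties = FALSE) %>% pull(word)
words <- words %>% mutate(word = if_else(word %in% vocab, word, "UNK"))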
In general, when dealing with zero-frequency (unobserved) words/n-grams, it is necessary to smooth the probability distributions by assigning them non-zero probabilities. There are different smoothing approaches, e.g.:

- Laplace (add-one) and add-k smoothing;
- Good-Turing discounting;
- backoff and interpolation (e.g. Katz backoff, Kneser-Ney, Stupid backoff).
Here, the Stupid backoff model is implemented, since it is the simplest solution for web-scale n-grams: it computes an n-gram score (rather than a probability) on very large data sets very quickly. If the model meets an n-gram with a zero count, it simply backs off to a lower-order n-gram, weighted by a fixed (context-independent) weight:
\(S(w_i \mid w^{i-1}_{i-k+1}) = \left\{ \begin{array}{ll} \frac{f(w^i_{i-k+1})}{f(w^{i-1}_{i-k+1})} & \mbox{if } f(w^i_{i-k+1})>0;\\ \alpha \cdot S(w_i \mid w^{i-1}_{i-k+2}) & \mbox{otherwise,}\end{array} \right.\)
where \(f(\cdot)\) denotes the raw frequency count of an n-gram in the training corpus and \(\alpha\) is a fixed backoff weight.
The recursion ends at unigrams: \(S(w_i) = \frac{f(w_i)}{N}\) with \(N\) being the size of the training corpus.
The authors of the model empirically set the backoff factor to \(\alpha = 0.4\).
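For instance (made-up counts): if the trigram "case of beer" never occurs in the training sample, \(f(\text{case of beer}) = 0\), but \(f(\text{of beer}) = 8\) and \(f(\text{of}) = 1000\), the model backs off one level and scores \(S(\text{beer} \mid \text{case of}) = \alpha \cdot \frac{f(\text{of beer})}{f(\text{of})} = 0.4 \cdot \frac{8}{1000} = 0.0032\).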
This algorithm is implemented in the wnexxt function (Appendix: wnexxt).
Perhaps, to speed up the program, it would also make sense to precompute the Stupid backoff scores.
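A minimal sketch of that idea (assuming the highest order used is \(n = 5\); not implemented in this report):
# pre-weight the shares by the backoff factor, so that prediction
# reduces to table lookups instead of computing alpha^(initn - nexxtn)
for (i in 1:length(nexxt)) nexxt[[i]]$score <- nexxt[[i]]$share * 0.4^(5 - i)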
Testing exercises came from Quiz 2: Natural language processing I, Week 3, Data Science Capstone course: predict the next word for each of the sentence fragments below (scroll down the table).
test <- rbind(
c("The guy in front of me just bought a pound of bacon, a bouquet, and a case of", "beer", "cheese", "pretzels", "soda"),
c( "You're the reason why I smile everyday. Can you follow me please? It would mean the", "world", "most", "universe", "best"),
c( "Hey sunshine, can you follow me and make me the", "smelliest", "saddest", "bluest", "happiest"),
c( "Very early observations on the Bills game: Offense still struggling but the", "referees", "players", "defense", "crowd"),
c( "Go on a romantic date at the", "movies", "mall", "grocery", "beach"),
c( "Well I'm pretty sure my granny has some old bagpipes in her garage I'll dust them off and be on my", "motorcycle", "way", "phone", "horse"),
c( "Ohhhhh #PointBreak is on tomorrow. Love that film and haven't seen it in quite some", "time", "weeks", "thing", "years"),
c( "After the ice bucket challenge Louis will push his long wet hair out of his eyes with his little", "eyes", "fingers", "toes", "ears"),
c( "Be grateful for the good times and keep the faith during the", "sad", "bad", "hard", "worse"),
c( "If this isn't the cutest thing you've ever seen, then you must be", "callous", "insane", "insensitive", "asleep"),
c("When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd","die","give","sleep","eat"),
c("Guy at my table's wife got up to go to the bathroom and I asked about dessert and he started telling me about his","horticultural","spiritual","financial","marital"),
c("I'd give anything to see arctic monkeys this","month","morning","weekend","decade"),
c("Talking to your mom has the same effect as a hug and helps reduce your","sleepiness","happiness","stress","hunger"),
c("When you were in Holland you were like 1 inch away from me but you hadn't time to take a","look","minute","picture","walk"),
c("I'd just like all of these questions answered, a presentation of evidence, and a jury to settle the","incident","case","account","matter"),
c("I can't deal with unsymetrical things. I can't even hold an uneven number of bags of groceries in each","finger","hand","arm","toe"),
c("Every inch of you is perfect from the bottom to the","center","middle","top","side"),
c("I’m thankful my childhood was filled with imagination and bruises from playing","outside","daily","inside","weekly"),
c("I like how the same people are in almost all of Adam Sandler's","pictures","novels","movies","stories")
)
colnames(test) <- c("text", "option1", "option2", "option3", "option4")
test <- as_tibble(test) %>% add_column(textn = 1:nrow(test), .before = "text")
kable(test, caption = "Table 1: Exercise") %>% kable_styling() %>%
scroll_box(width = "100%", height = "500px")| textn | text | option1 | option2 | option3 | option4 |
|---|---|---|---|---|---|
| 1 | The guy in front of me just bought a pound of bacon, a bouquet, and a case of | beer | cheese | pretzels | soda |
| 2 | You’re the reason why I smile everyday. Can you follow me please? It would mean the | world | most | universe | best |
| 3 | Hey sunshine, can you follow me and make me the | smelliest | saddest | bluest | happiest |
| 4 | Very early observations on the Bills game: Offense still struggling but the | referees | players | defense | crowd |
| 5 | Go on a romantic date at the | movies | mall | grocery | beach |
| 6 | Well I’m pretty sure my granny has some old bagpipes in her garage I’ll dust them off and be on my | motorcycle | way | phone | horse |
| 7 | Ohhhhh #PointBreak is on tomorrow. Love that film and haven’t seen it in quite some | time | weeks | thing | years |
| 8 | After the ice bucket challenge Louis will push his long wet hair out of his eyes with his little | eyes | fingers | toes | ears |
| 9 | Be grateful for the good times and keep the faith during the | sad | bad | hard | worse |
| 10 | If this isn’t the cutest thing you’ve ever seen, then you must be | callous | insane | insensitive | asleep |
| 11 | When you breathe, I want to be the air for you. I’ll be there for you, I’d live and I’d | die | give | sleep | eat |
| 12 | Guy at my table’s wife got up to go to the bathroom and I asked about dessert and he started telling me about his | horticultural | spiritual | financial | marital |
| 13 | I’d give anything to see arctic monkeys this | month | morning | weekend | decade |
| 14 | Talking to your mom has the same effect as a hug and helps reduce your | sleepiness | happiness | stress | hunger |
| 15 | When you were in Holland you were like 1 inch away from me but you hadn’t time to take a | look | minute | picture | walk |
| 16 | I’d just like all of these questions answered, a presentation of evidence, and a jury to settle the | incident | case | account | matter |
| 17 | I can’t deal with unsymetrical things. I can’t even hold an uneven number of bags of groceries in each | finger | hand | arm | toe |
| 18 | Every inch of you is perfect from the bottom to the | center | middle | top | side |
| 19 | I’m thankful my childhood was filled with imagination and bruises from playing | outside | daily | inside | weekly |
| 20 | I like how the same people are in almost all of Adam Sandler’s | pictures | novels | movies | stories |
Run wnexxt to see which options come first, then compare them to the correct answers (scroll down the table):

tops <- list()
astr<- list()
correct <- c("beer", "world", "happiest", "defense", "beach", "way", "time", "fingers", "bad", "insane", "die", "marital", "weekend", "stress", "picture", "matter", "hand", "top", "outside", "movies")
ans <- tibble(textn = 1:20, text = test$text, ans = as.character(NA),
correct = correct, place = as.character(NA))
for (i in 1:nrow(test)) {
text <- paste(clean(test$text[i]), collapse = ' ')
text <- unlist(toksrp(tokens(text)))
initn <- min(length(nexxt), length(text)+1) # initial ngram model
lim <- 10000
for (j in 1:length(nexxt)) {
astr[[j]] <- nexxt[[j]] %>% filter(nexxt %in% test[i, 3:6])
# reduce the `nexxt` united table to the options specified in the test
}
tops[[i]]<- wnexxt(nexxt = astr, initn, initn, text, lim = lim)
tops[[i]] <- tops[[i]] %>% arrange(desc(gramn), desc(score)) %>%
add_column(textn = as.integer(i), .before = "given")
if(nrow(tops[[i]])>0) ans[i,3] <- tops[[i]]$nexxt[1]
if (length(grep(ans[i,4], tops[[i]]$nexxt)) >0) ans[i,5] <-
paste(grep(ans[i,4], tops[[i]]$nexxt), collapse = " ")
}
kable(ans, caption = "Table 2: First answers vs Correct answers",
col.names = c("text number", "text","first option",
"correct answer","place(s) of correct answer")) %>%
kable_styling() %>%
scroll_box(width = "100%", height = "400px")| text number | text | first option | correct answer | place(s) of correct answer |
|---|---|---|---|---|
| 1 | The guy in front of me just bought a pound of bacon, a bouquet, and a case of | beer | beer | 1 2 3 6 |
| 2 | You’re the reason why I smile everyday. Can you follow me please? It would mean the | world | world | 1 2 3 7 11 |
| 3 | Hey sunshine, can you follow me and make me the | happiest | happiest | 1 2 3 6 |
| 4 | Very early observations on the Bills game: Offense still struggling but the | defense | defense | 1 6 9 |
| 5 | Go on a romantic date at the | beach | beach | 1 5 9 |
| 6 | Well I’m pretty sure my granny has some old bagpipes in her garage I’ll dust them off and be on my | way | way | 1 3 6 10 |
| 7 | Ohhhhh #PointBreak is on tomorrow. Love that film and haven’t seen it in quite some | time | time | 1 2 3 7 |
| 8 | After the ice bucket challenge Louis will push his long wet hair out of his eyes with his little | eyes | fingers | 2 4 |
| 9 | Be grateful for the good times and keep the faith during the | bad | bad | 1 2 7 |
| 10 | If this isn’t the cutest thing you’ve ever seen, then you must be | asleep | insane | 2 5 |
| 11 | When you breathe, I want to be the air for you. I’ll be there for you, I’d live and I’d | give | die | 2 8 |
| 12 | Guy at my table’s wife got up to go to the bathroom and I asked about dessert and he started telling me about his | financial | marital | 5 |
| 13 | I’d give anything to see arctic monkeys this | morning | weekend | 2 6 |
| 14 | Talking to your mom has the same effect as a hug and helps reduce your | happiness | stress | 2 4 |
| 15 | When you were in Holland you were like 1 inch away from me but you hadn’t time to take a | look | picture | 2 6 10 15 |
| 16 | I’d just like all of these questions answered, a presentation of evidence, and a jury to settle the | matter | matter | 1 3 6 10 |
| 17 | I can’t deal with unsymetrical things. I can’t even hold an uneven number of bags of groceries in each | hand | hand | 1 3 5 |
| 18 | Every inch of you is perfect from the bottom to the | top | top | 1 5 9 |
| 19 | I’m thankful my childhood was filled with imagination and bruises from playing | outside | outside | 1 3 |
| 20 | I like how the same people are in almost all of Adam Sandler’s | stories | movies | 3 |
astr<- tops[[1]]
for (i in 2:20) astr<- rbind(astr, tops[[i]])
astr<- astr %>% mutate(given = paste0('"', given, '"'))
kable(astr, caption = "Table 3: Ranking of the options",
      col.names = c("text number", "beginning of ngram", "option (ranked)",
                    "score", "words in ngram")) %>%
  kable_styling() %>%
  scroll_box(width = "100%", height = "300px")

| text number | beginning of ngram | option (ranked) | score | words in ngram |
|---|---|---|---|---|
| 1 | “a case of” | beer | 1.00e-07 | 4 |
| 1 | “case of” | beer | 1.00e-07 | 3 |
| 1 | “of” | beer | 8.00e-07 | 2 |
| 1 | “of” | cheese | 2.00e-07 | 2 |
| 1 | “of” | soda | 1.00e-07 | 2 |
| 1 | "" | beer | 5.00e-06 | 1 |
| 1 | "" | cheese | 2.60e-06 | 1 |
| 1 | "" | soda | 4.00e-07 | 1 |
| 1 | "" | pretzels | 1.00e-07 | 1 |
| 2 | “it would mean the” | world | 2.80e-06 | 5 |
| 2 | “would mean the” | world | 1.10e-06 | 4 |
| 2 | “mean the” | world | 6.00e-07 | 3 |
| 2 | “mean the” | most | 1.00e-07 | 3 |
| 2 | “the” | best | 2.80e-05 | 2 |
| 2 | “the” | most | 2.31e-05 | 2 |
| 2 | “the” | world | 2.30e-05 | 2 |
| 2 | “the” | universe | 1.30e-06 | 2 |
| 2 | "" | most | 2.38e-05 | 1 |
| 2 | "" | best | 1.99e-05 | 1 |
| 2 | "" | world | 1.65e-05 | 1 |
| 2 | "" | universe | 9.00e-07 | 1 |
| 3 | “make me the” | happiest | 1.00e-07 | 4 |
| 3 | “me the” | happiest | 1.00e-07 | 3 |
| 3 | “the” | happiest | 3.00e-07 | 2 |
| 3 | “the” | saddest | 1.00e-07 | 2 |
| 3 | “the” | bluest | 0.00e+00 | 2 |
| 3 | "" | happiest | 2.00e-07 | 1 |
| 3 | "" | saddest | 1.00e-07 | 1 |
| 3 | "" | bluest | 0.00e+00 | 1 |
| 4 | “but the” | defense | 1.00e-07 | 3 |
| 4 | “but the” | crowd | 1.00e-07 | 3 |
| 4 | “but the” | players | 0.00e+00 | 3 |
| 4 | “the” | crowd | 2.40e-06 | 2 |
| 4 | “the” | players | 1.50e-06 | 2 |
| 4 | “the” | defense | 1.20e-06 | 2 |
| 4 | “the” | referees | 0.00e+00 | 2 |
| 4 | "" | players | 4.40e-06 | 1 |
| 4 | "" | defense | 2.70e-06 | 1 |
| 4 | "" | crowd | 1.90e-06 | 1 |
| 4 | "" | referees | 1.00e-07 | 1 |
| 5 | “at the” | beach | 1.00e-06 | 3 |
| 5 | “at the” | mall | 7.00e-07 | 3 |
| 5 | “at the” | grocery | 4.00e-07 | 3 |
| 5 | “at the” | movies | 3.00e-07 | 3 |
| 5 | “the” | beach | 2.40e-06 | 2 |
| 5 | “the” | mall | 1.00e-06 | 2 |
| 5 | “the” | movies | 8.00e-07 | 2 |
| 5 | “the” | grocery | 6.00e-07 | 2 |
| 5 | "" | beach | 2.90e-06 | 1 |
| 5 | "" | movies | 2.20e-06 | 1 |
| 5 | "" | mall | 1.00e-06 | 1 |
| 5 | "" | grocery | 8.00e-07 | 1 |
| 6 | “be on my” | way | 1.00e-07 | 4 |
| 6 | “be on my” | phone | 1.00e-07 | 4 |
| 6 | “on my” | way | 4.80e-06 | 3 |
| 6 | “on my” | phone | 1.70e-06 | 3 |
| 6 | “on my” | horse | 1.00e-07 | 3 |
| 6 | “my” | way | 3.20e-06 | 2 |
| 6 | “my” | phone | 3.10e-06 | 2 |
| 6 | “my” | horse | 1.00e-07 | 2 |
| 6 | “my” | motorcycle | 0.00e+00 | 2 |
| 6 | "" | way | 2.88e-05 | 1 |
| 6 | "" | phone | 5.90e-06 | 1 |
| 6 | "" | horse | 1.30e-06 | 1 |
| 6 | "" | motorcycle | 3.00e-07 | 1 |
| 7 | “in quite some” | time | 2.00e-07 | 4 |
| 7 | “quite some” | time | 6.00e-07 | 3 |
| 7 | “some” | time | 2.50e-06 | 2 |
| 7 | “some” | years | 3.00e-07 | 2 |
| 7 | “some” | weeks | 1.00e-07 | 2 |
| 7 | “some” | thing | 1.00e-07 | 2 |
| 7 | "" | time | 5.67e-05 | 1 |
| 7 | "" | years | 2.36e-05 | 1 |
| 7 | "" | thing | 1.43e-05 | 1 |
| 7 | "" | weeks | 6.50e-06 | 1 |
| 8 | “little” | eyes | 0.00e+00 | 2 |
| 8 | “little” | fingers | 0.00e+00 | 2 |
| 8 | "" | eyes | 4.20e-06 | 1 |
| 8 | "" | fingers | 1.10e-06 | 1 |
| 8 | "" | ears | 7.00e-07 | 1 |
| 8 | "" | toes | 3.00e-07 | 1 |
| 9 | “during the” | bad | 0.00e+00 | 3 |
| 9 | “the” | bad | 1.70e-06 | 2 |
| 9 | “the” | hard | 1.40e-06 | 2 |
| 9 | “the” | sad | 4.00e-07 | 2 |
| 9 | “the” | worse | 3.00e-07 | 2 |
| 9 | "" | hard | 1.07e-05 | 1 |
| 9 | "" | bad | 1.04e-05 | 1 |
| 9 | "" | sad | 3.10e-06 | 1 |
| 9 | "" | worse | 2.20e-06 | 1 |
| 10 | “be” | asleep | 1.00e-07 | 2 |
| 10 | “be” | insane | 1.00e-07 | 2 |
| 10 | “be” | insensitive | 0.00e+00 | 2 |
| 10 | "" | asleep | 1.00e-06 | 1 |
| 10 | "" | insane | 6.00e-07 | 1 |
| 10 | "" | insensitive | 1.00e-07 | 1 |
| 10 | "" | callous | 0.00e+00 | 1 |
| 11 | “i’d” | give | 2.00e-07 | 2 |
| 11 | “i’d” | die | 0.00e+00 | 2 |
| 11 | “i’d” | eat | 0.00e+00 | 2 |
| 11 | “i’d” | sleep | 0.00e+00 | 2 |
| 11 | "" | give | 1.22e-05 | 1 |
| 11 | "" | sleep | 5.10e-06 | 1 |
| 11 | "" | eat | 5.00e-06 | 1 |
| 11 | "" | die | 2.60e-06 | 1 |
| 12 | “his” | financial | 0.00e+00 | 2 |
| 12 | “his” | spiritual | 0.00e+00 | 2 |
| 12 | "" | financial | 2.80e-06 | 1 |
| 12 | "" | spiritual | 8.00e-07 | 1 |
| 12 | "" | marital | 1.00e-07 | 1 |
| 12 | "" | horticultural | 0.00e+00 | 1 |
| 13 | “this” | morning | 8.10e-06 | 2 |
| 13 | “this” | weekend | 7.70e-06 | 2 |
| 13 | “this” | month | 3.60e-06 | 2 |
| 13 | “this” | decade | 1.00e-07 | 2 |
| 13 | "" | morning | 1.14e-05 | 1 |
| 13 | "" | weekend | 9.30e-06 | 1 |
| 13 | "" | month | 7.90e-06 | 1 |
| 13 | "" | decade | 1.30e-06 | 1 |
| 14 | “your” | happiness | 1.00e-07 | 2 |
| 14 | “your” | stress | 1.00e-07 | 2 |
| 14 | “your” | hunger | 0.00e+00 | 2 |
| 14 | "" | stress | 1.20e-06 | 1 |
| 14 | "" | happiness | 1.00e-06 | 1 |
| 14 | "" | hunger | 9.00e-07 | 1 |
| 14 | "" | sleepiness | 0.00e+00 | 1 |
| 15 | “to take a” | look | 1.40e-06 | 4 |
| 15 | “to take a” | picture | 1.00e-06 | 4 |
| 15 | “to take a” | walk | 2.00e-07 | 4 |
| 15 | “to take a” | minute | 1.00e-07 | 4 |
| 15 | “take a” | look | 2.80e-06 | 3 |
| 15 | “take a” | picture | 9.00e-07 | 3 |
| 15 | “take a” | minute | 4.00e-07 | 3 |
| 15 | “take a” | walk | 2.00e-07 | 3 |
| 15 | “a” | minute | 2.60e-06 | 2 |
| 15 | “a” | picture | 2.50e-06 | 2 |
| 15 | “a” | look | 2.10e-06 | 2 |
| 15 | “a” | walk | 1.20e-06 | 2 |
| 15 | "" | look | 1.65e-05 | 1 |
| 15 | "" | walk | 4.60e-06 | 1 |
| 15 | "" | picture | 3.80e-06 | 1 |
| 15 | "" | minute | 2.90e-06 | 1 |
| 16 | “to settle the” | matter | 1.00e-07 | 4 |
| 16 | “to settle the” | case | 1.00e-07 | 4 |
| 16 | “settle the” | matter | 0.00e+00 | 3 |
| 16 | “settle the” | case | 0.00e+00 | 3 |
| 16 | “the” | case | 6.30e-06 | 2 |
| 16 | “the” | matter | 1.40e-06 | 2 |
| 16 | “the” | incident | 1.30e-06 | 2 |
| 16 | “the” | account | 3.00e-07 | 2 |
| 16 | "" | case | 7.40e-06 | 1 |
| 16 | "" | matter | 4.90e-06 | 1 |
| 16 | "" | account | 2.60e-06 | 1 |
| 16 | "" | incident | 1.10e-06 | 1 |
| 17 | “in each” | hand | 0.00e+00 | 3 |
| 17 | “in each” | arm | 0.00e+00 | 3 |
| 17 | “each” | hand | 0.00e+00 | 2 |
| 17 | “each” | arm | 0.00e+00 | 2 |
| 17 | "" | hand | 6.00e-06 | 1 |
| 17 | "" | arm | 1.20e-06 | 1 |
| 17 | "" | finger | 8.00e-07 | 1 |
| 17 | "" | toe | 3.00e-07 | 1 |
| 18 | “to the” | top | 2.70e-06 | 3 |
| 18 | “to the” | side | 8.00e-07 | 3 |
| 18 | “to the” | center | 5.00e-07 | 3 |
| 18 | “to the” | middle | 5.00e-07 | 3 |
| 18 | “the” | top | 9.90e-06 | 2 |
| 18 | “the” | middle | 6.20e-06 | 2 |
| 18 | “the” | center | 2.90e-06 | 2 |
| 18 | “the” | side | 2.20e-06 | 2 |
| 18 | "" | top | 9.60e-06 | 1 |
| 18 | "" | center | 7.30e-06 | 1 |
| 18 | "" | side | 7.10e-06 | 1 |
| 18 | "" | middle | 4.00e-06 | 1 |
| 19 | “playing” | outside | 1.00e-07 | 2 |
| 19 | “playing” | inside | 0.00e+00 | 2 |
| 19 | "" | outside | 5.50e-06 | 1 |
| 19 | "" | inside | 4.20e-06 | 1 |
| 19 | "" | daily | 2.90e-06 | 1 |
| 19 | "" | weekly | 8.00e-07 | 1 |
| 20 | "" | stories | 3.00e-06 | 1 |
| 20 | "" | pictures | 2.70e-06 | 1 |
| 20 | "" | movies | 2.20e-06 | 1 |
| 20 | "" | novels | 5.00e-07 | 1 |
rm(ans, test, tops, astr, correct, i, initn, lim, text)

server.R

Also, the log, chain, and guess functions were written for the server.R script: log for debugging, while chain and guess let server.R work with incomplete words (Appendix: log, chain, guess).
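For example, chain stitches an incomplete last word together with its completion (an illustrative call; chain is defined in the appendix below):
chain("I can'", "can't")
# [1] "I can't"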
loader

loader <- function(url, dir = "./data/1227_DS-CS-w2_WordsFIN", filename) {
if(!dir.exists(dir)) dir.create(dir, recursive = TRUE)
dest <- paste0(dir, "/", filename)
if(!file.exists(dest)) download.file(
url = url, destfile = dest, method = "curl")
}

cache

cache <- function (var, name = as.character(var),
dir = "./data/1227_DS-CS-w2_WordsFIN/cache") {
var = deparse(substitute(var))
dest <- paste0(dir, "/", name, ".RData")
if(!dir.exists(dir)) dir.create(dir, recursive = TRUE)
if(!file.exists(dest)) save(list = c(var), file = dest)
}

clean

clean <- function(text) {
text <- unlist(strsplit(text, "[\\.\\!\\?:;]+"))
text <- gsub("[’‘`´\"]", "'", text) # unif apostrophes
text <- gsub("[[:blank:]]#[^[:blank:]]*", " ", text, perl = T) #hashs
text <- gsub("[[:blank:]]@[^[:blank:]]*", " ", text, perl = T) #at signs (@)
text <- gsub("(https?)?://[^[:blank:]]*", " ", text, perl = T) #urls
text <- gsub("[^[:alnum:]']", " ", text, perl = T) # non-alpha/numerical
text <- gsub("[[:blank:]]+'([[:alnum:][:blank:]]+)'[[:blank:]]+", " \\1 ",
text, perl = T) # surr apostrophes
text <- stri_trim_both(text) # surr blanks
text <-gsub("[[:blank:][:digit:]+[:blank:]]", " ", text, perl = T) #numbers w\o text
text<- gsub("[[:blank:]]{2,}", " ", text, perl = T) # condense blanks
text <- trimws(text) # leading/ trailing blanks
text <- tolower(text)
text
}

rmoptions

Ways of filtering out profanities/stop-words, and stemming:
# load profanity file (a publicly kept profanity list from https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words)
loader(url =
"https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/5faf2ba42d7b1c0977169ec3611df25a3c08eb13/en",
dir = "./data/1227_DS-CS-w2_WordsFIN/ignore",
filename = "ignore")
# remove profanities
toksrp <- function(toks){
ignore <- read_tsv("./data/1227_DS-CS-w2_WordsFIN/ignore/ignore",
col_names = FALSE)
ignore <- ignore$X1
toksrp <- tokens_remove(toks, ignore)
toksrp
}
# remove stop-words
toksrs <- function(toks) {
library(quanteda)
toksrs <- tokens_remove(toks, stopwords("english"))
toksrs
}
# stemming
stemm <- function(dfm) {
library(quanteda)
stemm <- dfm(dfm, stem=TRUE)
stemm
}

wnexxt

wnexxt <- function(nexxt, initn, nexxtn, text, entire = TRUE,
                   wnexxt = NULL, lim = 1000, alpha = 0.4) {
  if(nexxtn < 1) return(wnexxt) # recursion has gone past unigrams: stop
  ### recursion
  astr <- tail(text, n = nexxtn-1) # context: the last (nexxtn - 1) input words
  astr <- paste(astr, collapse = " ")
  nexxt0 <- nexxt[[nexxtn]]
  # entire = FALSE allows a prefix match on the context (for incomplete words)
  if(entire) nexxt0 <- nexxt0 %>% filter(given == astr) else
    nexxt0 <- nexxt0[startsWith(nexxt0$given, astr), ]
  if(nrow(nexxt0) > 0) nexxt0 <- nexxt0 %>%
    slice_head(n = min(nrow(nexxt0), lim)) %>%
    transmute(given, nexxt,
              score = share*alpha^(initn-nexxtn), gramn = n)
  # Stupid backoff factor alpha down-weights lower-order matches
  wnexxt <- rbind(wnexxt, nexxt0)
  return(wnexxt(nexxt = nexxt, initn = initn, nexxtn = nexxtn-1,
                text = text, entire = entire, wnexxt = wnexxt, lim = lim))
}

log (for server.R)

log <- function(..., obj1=NULL, obj2=NULL) {
if(exists("verbose") && verbose){
cat(file=stderr(),"-------------------------------")
cat(file=stderr(),"\n> ", ..., "\n", sep=" ")
if(!is.null(obj1)){
print(obj1)
}
if(!is.null(obj2)){
print(obj2)
}
}
}

chain (for server.R)

# chain() merges the typed text with a stored context: it finds the longest
# suffix of `text` matching a prefix of `given` and appends the remainder
chain <- function(text, given){
  ttext <- paste(clean(text), collapse = ' ')
  tgiven <- paste(clean(given), collapse = ' ')
  initl <- min(nchar(ttext), nchar(tgiven))
  repeat{
    goleft <- str_sub(tgiven, 1, initl) # prefix of the cleaned `given`
    goright <- str_sub(ttext, -initl)   # suffix of the cleaned `text`
    if(goleft == goright) loop <- FALSE else{
      initl <- initl - 1
      loop <- initl > 0
    }
    if(!loop) break
  }
  paste0(text, str_sub(given, initl+1))
}

guess (for server.R)

# guess() builds the suggestion strings shown in the app: chain() merges the
# typed text with each candidate context, then the predicted word is appended
guess <- function(text, options, entire){
options$full = sapply(options$given, chain, text = text)
if(entire){
options$guess <- options$nexxt
options$full <- paste0(options$full, options$nexxt)
}else{
options$guess <- paste(options$given, options$nexxt)
options$full <- paste(options$full, options$nexxt)
}
options %>% select(guess, full)
}

sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur 10.16
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.5.1 stringr_1.4.0
[3] dplyr_1.0.7 purrr_0.3.4
[5] readr_2.0.1 tidyr_1.1.3
[7] tibble_3.1.3 ggplot2_3.3.5
[9] tidyverse_1.3.1 kableExtra_1.3.4
[11] quanteda.textstats_0.94.1 quanteda_3.1.0
[13] tidytext_0.3.1 stringi_1.7.3
loaded via a namespace (and not attached):
[1] httr_1.4.2 sass_0.4.0 bit64_4.0.5 vroom_1.5.4
[5] jsonlite_1.7.2 viridisLite_0.4.0 modelr_0.1.8 bslib_0.2.5.1
[9] RcppParallel_5.1.4 assertthat_0.2.1 highr_0.9 cellranger_1.1.0
[13] yaml_2.2.1 pillar_1.6.2 backports_1.2.1 lattice_0.20-44
[17] glue_1.4.2 digest_0.6.27 rvest_1.0.1 colorspace_2.0-2
[21] htmltools_0.5.1.1 Matrix_1.3-4 pkgconfig_2.0.3 broom_0.7.9
[25] haven_2.4.3 scales_1.1.1 webshot_0.5.2 svglite_2.0.0
[29] nsyllable_1.0 tzdb_0.1.2 generics_0.1.0 ellipsis_0.3.2
[33] withr_2.4.2 cli_3.0.1 magrittr_2.0.1 crayon_1.4.1
[37] readxl_1.3.1 evaluate_0.14 stopwords_2.2 tokenizers_0.2.1
[41] janeaustenr_0.1.5 fs_1.5.0 fansi_0.5.0 SnowballC_0.7.0
[45] xml2_1.3.2 tools_4.0.3 hms_1.1.0 lifecycle_1.0.0
[49] munsell_0.5.0 reprex_2.0.1 compiler_4.0.3 jquerylib_0.1.4
[53] proxyC_0.2.0 systemfonts_1.0.2 rlang_0.4.11 grid_4.0.3
[57] rstudioapi_0.13 rmarkdown_2.10 gtable_0.3.0 DBI_1.1.1
[61] R6_2.5.1 lubridate_1.7.10 knitr_1.33 bit_4.0.4
[65] utf8_1.2.2 fastmatch_1.1-3 parallel_4.0.3 Rcpp_1.0.7
[69] vctrs_0.3.8 dbplyr_2.1.1 tidyselect_1.1.1 xfun_0.25