The task is to explore the data and build a model that predicts the next word of a given string, based on the last 3, 2 or 1 words.
Main questions to consider:
To reduce the compilation time of this document, the analysis relies on the first 1000 entries of each source (blogs, twitter and news).
# paths to the three English data sets
con <- paste(".\\final\\en_US\\en_US.", c("blogs","twitter","news"), ".txt", sep = "")
# read the first 1000 lines of each file
text_data <- lapply(con, readr::read_lines, n_max = 1000)
First, the structure of the data is examined. Each object is a character vector, and each element of the vector can contain multiple words or sentences.
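For example, the structure can be inspected with str():

# a quick look at the structure of the loaded data (illustrative)
str(text_data, list.len = 3)
sapply(text_data, length) # number of lines read per source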
To keep the time needed for the analysis down, the analysis is always done on a subset of the data.
These questions are answered using the quanteda package in R. To do this, the data is transformed into a corpus, and from this a document-feature matrix is produced. Punctuation, symbols, URLs and the like are removed because they are not useful for this task.
# build a quanteda corpus for each source
corp_blog <- corpus(text_data[[1]])
corp_twit <- corpus(text_data[[2]])
corp_news <- corpus(text_data[[3]])
# transform to tokens and at the same time remove numbers, URLs, symbols, etc.
toks_blog <- tokens(corp_blog, remove_punct = TRUE,
                    remove_symbols = TRUE, remove_numbers = TRUE,
                    remove_separators = TRUE, remove_url = TRUE)
toks_blog <- tokens_tolower(toks_blog) # lower-casing
n_blog <- sum(ntoken(toks_blog)) # total word count
toks_blog <- tokens_remove(toks_blog, pattern = lexicon::profanity_alvarez) # remove profanities
toks_blog <- tokens_remove(toks_blog, pattern = lexicon::profanity_racist) # remove racist terms
n_blog_filt <- sum(ntoken(toks_blog)) # word count after filtering
# same for twitter
toks_twit <- tokens(corp_twit, remove_punct = TRUE,
                    remove_symbols = TRUE, remove_numbers = TRUE,
                    remove_separators = TRUE, remove_url = TRUE)
toks_twit <- tokens_tolower(toks_twit) # lower-casing
n_twit <- sum(ntoken(toks_twit)) # total word count
toks_twit <- tokens_remove(toks_twit, pattern = lexicon::profanity_alvarez) # remove profanities
toks_twit <- tokens_remove(toks_twit, pattern = lexicon::profanity_racist) # remove racist terms
n_twit_filt <- sum(ntoken(toks_twit)) # word count after filtering
# same for news
toks_news <- tokens(corp_news, remove_punct = TRUE,
                    remove_symbols = TRUE, remove_numbers = TRUE,
                    remove_separators = TRUE, remove_url = TRUE)
toks_news <- tokens_tolower(toks_news) # lower-casing
n_news <- sum(ntoken(toks_news)) # total word count
toks_news <- tokens_remove(toks_news, pattern = lexicon::profanity_alvarez) # remove profanities
toks_news <- tokens_remove(toks_news, pattern = lexicon::profanity_racist) # remove racist terms
n_news_filt <- sum(ntoken(toks_news)) # word count after filtering
# turn the tokens into a document-feature matrix (dfm)
# this is just an intermediate step
dfm_blog <- dfm(toks_blog)
dfm_twit <- dfm(toks_twit)
dfm_news <- dfm(toks_news)
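Since exactly the same cleaning steps are applied to all three sources, they could also be wrapped in a small helper function; a sketch (clean_tokens is just an illustrative name):

# sketch: the repeated cleaning steps wrapped into one helper
clean_tokens <- function(corp) {
  toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE,
                 remove_numbers = TRUE, remove_separators = TRUE,
                 remove_url = TRUE)
  toks <- tokens_tolower(toks)
  toks <- tokens_remove(toks, pattern = lexicon::profanity_alvarez)
  tokens_remove(toks, pattern = lexicon::profanity_racist)
}
# e.g. toks_blog <- clean_tokens(corp_blog)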
After removing punctuation and the like, blogs, twitter and news consist of `r c(n_blog,n_twit,n_news)` words respectively. After removing profanities they consist of `r c(n_blog_filt,n_twit_filt,n_news_filt)` words. So profanities make up `r (1-c(n_blog_filt,n_twit_filt,n_news_filt)/c(n_blog,n_twit,n_news))*100` percent of the text.
The word frequency is calculated using the topfeatures() function:
top20_blog <- topfeatures(dfm_blog,20)
top20_twit <- topfeatures(dfm_twit,20)
top20_news <- topfeatures(dfm_news,20)
top20_blog <- data.frame(word=names(top20_blog),n=top20_blog)
top20_twit <- data.frame(word=names(top20_twit),n=top20_twit)
top20_news <- data.frame(word=names(top20_news),n=top20_news)
The following plot shows the most common words for the blog data only (to keep this short). It can be seen that most of these words are stopwords and do not give much insight into the actual content of the text.
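The plotting code is omitted here; a minimal ggplot2 sketch of how such a bar chart can be produced from top20_blog (the same pattern applies to the other sources and to the n-gram plots further down):

library(ggplot2)
# bar chart of the 20 most frequent words in the blog sample (sketch)
ggplot(top20_blog, aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "word", y = "frequency", title = "Top 20 words (blogs)")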
The 2-grams are looked at using the tokens_ngrams() function:
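The 2-gram chunk is not shown here; a sketch for the blog data (twitter and news follow the same pattern), mirroring the 3-gram code below:

# 2-grams for the blog sample (sketch; the other sources are handled the same way)
blog_2gram <- tokens_ngrams(toks_blog, n = 2)
top_blog_2gram <- topfeatures(dfm(blog_2gram), 20)
top_blog_2gram <- data.frame(word = names(top_blog_2gram), n = top_blog_2gram)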
For the sake of brevity, only the 2-grams of the blogs are plotted here:
The same is done for the 3-grams:
blog_3gram <- tokens_ngrams(toks_blog, n = 3)
top_blog_3gram <- topfeatures(dfm(blog_3gram),20)
top_blog_3gram <- data.frame(word=names(top_blog_3gram),n=top_blog_3gram)
twit_3gram <- tokens_ngrams(toks_twit, n = 3)
top_twit_3gram <- topfeatures(dfm(twit_3gram),20)
top_twit_3gram <- data.frame(word=names(top_twit_3gram),n=top_twit_3gram)
news_3gram <- tokens_ngrams(toks_news, n = 3)
top_news_3gram <- topfeatures(dfm(news_3gram),20)
top_news_3gram <- data.frame(word=names(top_news_3gram),n=top_news_3gram)
To see how many words are needed to cover 50% or 90% of all the tokens, the vector of the most frequently used words (ordered by frequency from highest to lowest) is used. In a while loop a dictionary is defined that grows with each iteration, and the loop breaks when the desired coverage is reached. The number of words needed grows much faster than linearly with the coverage target: going from 50% to 90% coverage requires far more than twice as many words.
top_blog <- topfeatures(dfm_blog, 100000) # most frequent features, ordered by decreasing frequency
freq <- 0
i <- 1
# grow the dictionary one word at a time until 50% of all tokens are covered
while (freq <= 0.5) {
  dict <- dictionary(list(freq_sort = names(top_blog[1:i])))
  freq <- length(unlist(tokens_lookup(toks_blog, dictionary = dict))) / length(unlist(toks_blog))
  i <- i + 1
}
i
## [1] 104
freq <- 0
i <- 1
# same procedure for 90% coverage
while (freq <= 0.9) {
  dict <- dictionary(list(freq_sort = names(top_blog[1:i])))
  freq <- length(unlist(tokens_lookup(toks_blog, dictionary = dict))) / length(unlist(toks_blog))
  i <- i + 1
}
i
## [1] 3870
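Calling tokens_lookup() repeatedly inside the loop is slow. Assuming top_blog contains essentially all features ordered by decreasing frequency, the same coverage numbers can be obtained much faster from cumulative sums:

# faster alternative: cumulative coverage from the sorted frequency vector (sketch)
coverage <- cumsum(top_blog) / sum(top_blog)
which(coverage >= 0.5)[1]  # words needed for 50% coverage
which(coverage >= 0.9)[1]  # words needed for 90% coverage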
A first attempt at finding foreign-language words was to use an English dictionary and the tokens_lookup() function to check whether the tokens appear in it. The following code does that and afterwards displays the words that were not matched. It can be seen that most unmatched words are names, abbreviations, slang and compound words rather than foreign words. The coverage could be improved by using a better dictionary that also contains common names, colloquial language and compound words. Another option would be to substitute hyphens and apostrophes in the raw text to break compound words apart.
Another possibility would be to include dictionaries of other languages. This could be done by simply adding them to the list inside the dictionary() function (see the sketch after the output below).
dict <- dictionary(list(EN=lexicon::grady_augmented))
# How much coverage
length(unlist(tokens_lookup(toks_blog,dictionary=dict)))/length(unlist(toks_blog))
## [1] 0.9601246
# Words that were not matched
non_match <- unlist(toks_blog)[unlist(tokens_select(toks_blog,dict,pad=TRUE))==""]
names(non_match) <- NULL
non_match[1:20]
## [1] "skylander" "xbox" "kyan" "puter"
## [5] "one's" "nestabilities" "cardstock" "te's"
## [9] "laurelwood" "half-dozen" "cantab" "ordinariness"
## [13] "schiff" "imbalanced" "en-route" "cornwall"
## [17] "leeds" "millies" "assam" "np"
The overall strategy was to compute n-grams from as much data as possible in order to cover most n-grams that occur in practice. This was done for unigrams, bigrams, trigrams and 4-grams. The data was cleaned beforehand (removing punctuation and the like, as shown above), and words not occurring in an English dictionary were removed as well. To process a lot of data without exhausting memory, the data was sampled and analysed in a loop, and the results of each iteration were combined. This was repeated 40 times, each time with about 15% of the data. This is purely a workaround for limited computing capacity; it has no effect on the model itself, which relies on the finished n-gram tables that are loaded into R (see the sketch below).
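A sketch of that sampling loop (illustrative: in the actual run the files were read in chunks, and the cleaning included all of the steps shown in the exploratory part):

# sketch of the sampling strategy: repeatedly sample ~15% of the lines,
# compute 4-gram counts and aggregate them over the 40 iterations
all_lines <- unlist(lapply(con, readr::read_lines))
ngram_counts <- list()
for (k in 1:40) {
  samp  <- sample(all_lines, size = round(0.15 * length(all_lines)))
  toks  <- tokens(corpus(samp), remove_punct = TRUE, remove_symbols = TRUE,
                  remove_numbers = TRUE, remove_url = TRUE)
  toks  <- tokens_tolower(toks)
  grams <- tokens_ngrams(toks, n = 4)
  ngram_counts[[k]] <- colSums(dfm(grams))   # named counts for this sample
}
# combine the per-sample counts by summing counts of identical n-grams
combined <- unlist(ngram_counts)
top_4gram_counts <- tapply(combined, names(combined), sum)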
The first model looked for matches of the last three words of the input text in the 4-gram table. If a match was found, the completion with the highest frequency was returned. If not, the model backed off to the 3-gram table, using only the last two words of the input text for the search, and after that the 2-gram table, and so on (a sketch of this backoff lookup follows below).
This model performed well in terms of speed, but the accuracy was not satisfactory.
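A sketch of that simple backoff lookup, assuming n-gram tables with a context column x1, a completion column x2 and a count column n (the layout of the tables loaded further down):

# sketch of the first, frequency-based backoff model
predict_backoff <- function(intext) {
  words <- tail(strsplit(intext, " ")[[1]], 3)
  # try the 4-gram table with the last three words
  hits <- top_4gram[top_4gram$x1 == paste(words, collapse = " "), ]
  # back off to the 3-gram table (last two words), then the 2-gram table (last word)
  if (nrow(hits) == 0) hits <- top_3gram[top_3gram$x1 == paste(tail(words, 2), collapse = " "), ]
  if (nrow(hits) == 0) hits <- top_2gram[top_2gram$x1 == tail(words, 1), ]
  if (nrow(hits) == 0) return(NA)
  hits$x2[which.max(hits$n)]   # completion with the highest frequency
}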
The next strategy was interpolation, taken from this source (https://web.stanford.edu/~jurafsky/slp3/4.pdf, p. 15). It relies on adding the probabilities of the different n-gram orders, each weighted by a coefficient.
The n-gram probability was calculated by dividing the frequency of the n-gram (including the predicted word) by the frequency of the preceding (n-1)-gram. For the interpolation strategy, the overall probability was the weighted sum of the 4-gram, 3-gram, 2-gram and unigram probabilities. The weights given to each of those probabilities were tuned by trial and error. Again, if no match was found in the highest-order n-gram table, the method backed off to the next lower order. This model performed a lot better, but was still not satisfactory.
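In formula form (following the Jurafsky & Martin chapter linked above), the interpolated probability of a candidate word $w$ given the last three input words $w_1 w_2 w_3$ is

$$
\hat{P}(w \mid w_1 w_2 w_3) = \lambda_4\, P(w \mid w_1 w_2 w_3) + \lambda_3\, P(w \mid w_2 w_3) + \lambda_2\, P(w \mid w_3) + \lambda_1\, P(w), \qquad \sum_k \lambda_k = 1,
$$

where each conditional probability is a count ratio, e.g. $P(w \mid w_2 w_3) = \mathrm{count}(w_2\,w_3\,w)\,/\,\mathrm{count}(w_2\,w_3)$, and the $\lambda_k$ are the weights tuned by trial and error.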
The next method was Kneser-Ney smoothing, as explained here: https://medium.com/@dennyc/a-simple-numerical-example-for-kneser-ney-smoothing-nlp-4600addf38b8
This method takes into account the number of distinct contexts in which a candidate word appears, in addition to the raw frequency of the n-grams. This model performed a lot better in terms of accuracy; however, its speed is a bit of a bottleneck.
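For reference, the interpolated Kneser-Ney estimate described in that post has the following form for a context $c$ (e.g. the last three words) and a candidate word $w$, where $d$ is the discount:

$$
P_{\mathrm{KN}}(w \mid c) = \frac{\max\big(\mathrm{count}(c,w) - d,\ 0\big)}{\mathrm{count}(c)} + \lambda(c)\, P_{\mathrm{cont}}(w),
\qquad
\lambda(c) = \frac{d}{\mathrm{count}(c)}\,\big|\{w' : \mathrm{count}(c,w') > 0\}\big|,
$$

and $P_{\mathrm{cont}}(w)$ is the share of distinct n-gram types that end in $w$. In the code below, these three pieces correspond to num/den, lambda and p_cont.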
# load the pre-computed n-gram frequency tables (produced by the sampling loop described above)
top_1gram <- read.table("1gram_sum.txt",sep=",",dec=".",header = TRUE,fill=NA)
top_2gram <- read.table("2gram_sum.txt",sep=",",dec=".",header = TRUE,fill=NA)
top_3gram <- read.table("3gram_sum.txt",sep=",",dec=".",header = TRUE,fill=NA)
top_4gram <- read.table("4gram_sum.txt",sep=",",dec=".",header = TRUE,fill=NA)
is.nullint <- function(x) {sapply(x, length) == 0} # helper: flag zero-length elements

predict_word4 <- function(intext) {
  # last three words of the input, reversed (int[1] is the most recent word)
  int <- rev(tail(strsplit(intext, " ")[[1]], 3))
  # rows of the 4-gram table whose context matches the last three words
  opts_n_i <- which(top_4gram$x1 == paste(int[3], int[2], int[1], sep = " "))
  if (length(opts_n_i) >= 3) {
    d <- 0 # discount parameter (currently 0, i.e. no discounting)
    # first term: discounted counts over the total count of the context
    num <- pmax(top_4gram$n[opts_n_i] - d, 0)
    den <- sum(top_4gram$n[opts_n_i])
    # lambda: weight of the continuation probability
    lambda <- d / sum(top_4gram$n[opts_n_i]) * length(opts_n_i)
    # P_cont: share of 4-gram types ending in the candidate word
    p_cont <- sapply(opts_n_i, function(x) sum(top_4gram$x2 == top_4gram$x2[x])) / nrow(top_4gram)
    p <- num / den + lambda * p_cont
    return(top_4gram$x2[opts_n_i][order(p, decreasing = TRUE)][1:3])
  } else {
    # back off to the 3-gram table, using the last two words
    opts_n_i <- which(top_3gram$x1 == paste(int[2], int[1], sep = " "))
    if (length(opts_n_i) >= 3) {
      d <- 0
      num <- pmax(top_3gram$n[opts_n_i] - d, 0)
      den <- sum(top_3gram$n[opts_n_i])
      lambda <- d / sum(top_3gram$n[opts_n_i]) * length(opts_n_i)
      p_cont <- sapply(opts_n_i, function(x) sum(top_3gram$x2 == top_3gram$x2[x])) / nrow(top_3gram)
      p <- num / den + lambda * p_cont
      return(top_3gram$x2[opts_n_i][order(p, decreasing = TRUE)][1:3])
    } else {
      # back off to the 2-gram table, using only the last word
      opts_n_i <- which(top_2gram$x1 == int[1])
      if (length(opts_n_i) >= 3) {
        d <- 0
        num <- pmax(top_2gram$n[opts_n_i] - d, 0)
        den <- sum(top_2gram$n[opts_n_i])
        lambda <- d / sum(top_2gram$n[opts_n_i]) * length(opts_n_i)
        p_cont <- sapply(opts_n_i, function(x) sum(top_2gram$x2 == top_2gram$x2[x])) / nrow(top_2gram)
        p <- num / den + lambda * p_cont
        return(top_2gram$x2[opts_n_i][order(p, decreasing = TRUE)][1:3])
      } else {return(NA)}
    }
  }
}
predict_word4("This homework is")
## [1] "done" "to" "the"
The shiny application will be kept simple. It will have a text input and a text output; the output will show the three best candidates found by the model.
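A minimal sketch of such an app, assuming predict_word4() and the n-gram tables are available in the app environment (the actual app may differ):

library(shiny)

ui <- fluidPage(
  titlePanel("Next-word prediction"),
  textInput("intext", "Enter some text:"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    req(input$intext)
    preds <- predict_word4(input$intext)   # top three candidates (or NA)
    if (all(is.na(preds))) return("no prediction found")
    paste(na.omit(preds), collapse = " | ")
  })
}

shinyApp(ui = ui, server = server)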