Word Prediction Algorithm

Intro and packages

The data exploration was done in the Capstone week 2 file.
Here we explain all the steps and assumptions made for the word prediction model.
For cleaning the data we used the quanteda package, because it is the most optimized package for the task.

library(tm);library(R.utils)
library(readr);library(data.table);library(readtext)
library(quanteda);library(ggplot2);library(tidyr)
library(dplyr);library(purrr);library(spacyr)
library(doParallel);library(hunspell)
library(stringr);library(stringi);library(wordnet)
library(pluralize);library(textcat);library(tidyverse)
library(arsenal)

It is important to use parallel processing, so I set it up to make everything run quicker.

cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)
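
When all the heavy processing is done, the workers should be released again. A minimal sketch, meant to be run at the end of the script:

# release the parallel workers registered above
stopCluster(cl)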

Sample the data

First of all, I created a folder to hold a sample of the 3 files. I preferred to do it outside RStudio because that way I save more RAM.

if (!dir.exists("samples")) {
   dir.create("samples")
}

createSample <- function(path, fileName, p) {
   # Calculate number of lines efficiently
   readFilePath <- paste(path, fileName, sep="/")
   nLines <- countLines(readFilePath)
   # Prepare write connection to file
   sampleFilePath <- paste("samples", fileName, sep="/")
   conWrite <- file(sampleFilePath, "w")
   
   testfileName<- paste("test", fileName, sep="_")
   sampleFilePath <- paste("samples", testfileName, sep="/")
   testWrite <- file(sampleFilePath, "w")
   # Prepare read connection
   conRead <- file(readFilePath, "r")
   # With size = 1, the expected number of sampled lines is nLines*p, so the sample length is not fixed
   samples <- rbinom(nLines, size=1, prob=p)
   for (i in 1:nLines) {
      t <- readLines(conRead, 1, skipNul = TRUE)
      if (samples[i] == 1) {
         writeLines(t, conWrite)
      } else { 
         writeLines(t, testWrite)
         }
   }
   close(conRead)
   close(conWrite)
   close(testWrite)
}

NOTE: I deleted some special characters by hand (a square character in the NEWS file and double spacing in the Twitter file) that made the files read only partially.
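
A possible programmatic alternative to this manual cleanup is sketched below, assuming the raw files sit in the en_US folder; the clean_file helper is hypothetical and not part of the pipeline. Reading in binary mode avoids stopping at embedded control characters, and collapsing repeated whitespace handles the double spacing.

# hypothetical helper: strip control characters and repeated spaces from a raw file
clean_file <- function(path) {
   con <- file(path, open = "rb")                      # binary mode: do not stop at control chars
   raw_lines <- readLines(con, skipNul = TRUE, encoding = "UTF-8")
   close(con)
   clean <- stringr::str_replace_all(raw_lines, "[[:cntrl:]]", " ")
   clean <- stringr::str_squish(clean)                 # collapse double spacing
   writeLines(clean, path)
}
# clean_file("en_US/en_US.news.txt")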

Then two files are created for each input file: one sampled with the stated probability (the train set) and its complement (the test set).

pathToFiles <- "en_US"
sampleProbability <- .3
set.seed(123)

createSample(pathToFiles, "en_US.blogs.txt", sampleProbability)
createSample(pathToFiles, "en_US.news.txt", sampleProbability)
createSample(pathToFiles, "en_US.twitter.txt", sampleProbability)

Then it only remains to read the sampled data.

readdata <- function(filename){
   # read one sampled file from the samples folder
   pathfile <- paste(".", "samples", filename, sep = "/")
   readtext(pathfile)
}


corpus<-map_dfr(list.files("en_US1"),readdata)%>%corpus()

Data Preparation

Combine the documents from the corpus and reshape the whole text into sentences. Also, only keep the sentences made of ASCII characters; this reduces the probability of having other languages in the corpus.

corpus<-corpus(texts(corpus, groups = rep(1, ndoc(corpus))))%>%
   corpus_reshape(to="sentences")
corpus<-corpus[stri_enc_isascii(corpus)]

Here we remove all the symbols, punctuation, numbers and URLs, and tokenize the text with the quanteda package.

tkn <- tokens(corpus,
              remove_punct = TRUE,
              remove_numbers = TRUE,
              remove_symbols = TRUE,
              split_hyphens = TRUE,
              remove_url = TRUE,
              remove_twitter = TRUE,
              what = "word1")%>%
   tokens_tolower()%>% tokens_remove("^[0-9]",valuetype = "regex")
rm(corpus)

For later use, I took the first word of each sentence (these will be the first words suggested in the app).

# collect the first token of the first 10,000 sentences
first_words <- character()
for (i in 1:10000) {
   first_words <- rbind(first_words, tkn[[i]][1])
}
first_words<-table(first_words)
first_words<-tibble(name=names(first_words),freq=first_words)%>%arrange(-freq)
print(first_words[1:5,])
## # A tibble: 5 x 2
##   name  freq   
##   <chr> <table>
## 1 i     1229   
## 2 the    691   
## 3 it     316   
## 4 and    251   
## 5 we     240
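
These five words are later hardcoded in the prediction function as most_first_word; they could just as well be taken from this table directly, for example:

# the five most frequent sentence-initial words, used as suggestions for an empty input
most_first_word <- first_words$name[1:5]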

Here we identify the words that are not spelled correctly. Because correcting them would require a lot of RAM, I replace each of them with a stop marker and then cut the sentence where that word appears.

spell_tokens <- function(tkn, language = "en_US") {
   
   # extract types to only work on them
   types <- types1 <- types(tkn)
   
   # spelling
   correct <- hunspell_check(
      words = as.character(types), 
      dict = hunspell::dictionary(language)
   )
   
   types[!correct]<- "STOP"
   
   # replace original tokens
   token_new <- tokens_replace(tkn, types1, types,valuetype = "fixed",case_insensitive= FALSE)
   
   token_new <- quanteda::tokens_segment(token_new, pattern = "STOP", valuetype = "fixed",extract_pattern=TRUE)
   
   token_new
}

tkn<-spell_tokens(tkn)
dfm<- dfm(tkn)

Frequency and distribution of top words

Now we want to see the cumulative coverage of the words.

freq<- textstat_frequency(dfm)[,1:2]
table_freq<- data.table(table(freq$frequency))%>%print()
##            V1    N
##    1:       1 9143
##    2:       2 5177
##    3:       3 3583
##    4:       4 2715
##    5:       5 2077
##   ---             
## 2775:  484842    1
## 2776:  591841    1
## 2777:  591871    1
## 2778:  682812    1
## 2779: 1170762    1
words_coverage <- data.frame(
   coverage = round(cumsum(freq$frequency) / sum(freq$frequency) * 100, 2),
   words = 1:nrow(freq)
)
rm(freq,table_freq)

qplot(data=words_coverage[1:3000,],x=words,y=coverage)

I only plotted up to 3,000 words because that is the interesting part of the curve, where Zipf's law shows. "Zipf's law was originally formulated in terms of quantitative linguistics, stating that given some corpus of natural language utterances, the frequency of any word is inversely proportional to its rank in the frequency table. Thus the most frequent word will occur approximately twice as often as the second most frequent word, three times as often as the third most frequent word, etc."
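As a quick, informal check of Zipf's law on this corpus (a sketch, assuming the dfm object created above is still in memory):

# ratio of the top word's frequency to the words ranked second and third
top_freq <- textstat_frequency(dfm)$frequency[1:3]
top_freq[1] / top_freq[2:3]   # roughly 2 and 3 if Zipf's law holds
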
The minimum numbers of top words needed to achieve 50% and 90% coverage are:

#50% coverage
min(words_coverage[words_coverage$coverage > 50, ]$words)
## [1] 109
#90% coverage
min(words_coverage[words_coverage$coverage > 90, ]$words)%>%print()
## [1] 4256
#Total number of distinct words (needed for 100% coverage)
tail(words_coverage, n = 1)[, 2]
## [1] 60882

Data for the algorithm

Here we transform the data into n-grams and convert them into a more usable type of data.

unigs <- dfm
bigrs<- tokens_ngrams(tkn, 2,concatenator = " ")%>%dfm()
trigs<- tokens_ngrams(tkn, 3,concatenator = " ")%>%dfm()
rm(dfm)

sums_U <- colSums(unigs)
sums_B <- colSums(bigrs)
sums_T <- colSums(trigs)

unigs <- tibble(word_1 = names(sums_U), freq_u = sums_U)

bigrs <- tibble(
   word_1 = sapply(strsplit(names(sums_B), " ", fixed = TRUE), '[[', 1),
   word_2 = sapply(strsplit(names(sums_B), " ", fixed = TRUE), '[[', 2),
   freq_b = sums_B)

trigs <- tibble(
   word_1 = sapply(strsplit(names(sums_T), " ", fixed = TRUE), '[[', 1),
   word_2 = sapply(strsplit(names(sums_T), " ", fixed = TRUE), '[[', 2),
   word_3 = sapply(strsplit(names(sums_T), " ", fixed = TRUE), '[[', 3),
   freq_t = sums_T)

rm(sums_B,sums_T,sums_U)

Obtain the most frequent words used in the whole dataset. This will be useful if the word used to predict is not in the dataset. This could be changed if we collected user data, so that we could predict with each user's own most frequent words.

n_pred<-5
most_freq_word <-arrange(unigs,-freq_u)$word_1[1:n_pred]

Algorithm

I needed a function to detect misspelled words and suggest replacements.

wordspell <- function(words_pred){
   correct <- hunspell_check(
      words = words_pred, 
      dict = hunspell::dictionary("en_US")
   )
   
   pattern <- words_pred[!correct]
   replacement <- sapply(hunspell_suggest(pattern, dict = "en_US"), FUN = "[", 1:n_pred)
   return(replacement)
}
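
To illustrate how the function behaves (hypothetical inputs): a correctly spelled word yields an empty result, while a misspelled one yields hunspell suggestions.

wordspell("house")   # correct word: nothing to replace, so the result has length 0
wordspell("thsi")    # misspelled word: a vector of up to n_pred suggestions (NA-padded if fewer exist)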

Then comes the function that will be inserted in the app; the back-off formulas it implements are summarized after the code.

predict_this<- function(pred,n_pred =5,gamma2 =0.5,gamma3 = 0.5){
   most_first_word <- c("i",   "the", "it",  "and", "we")
   if(is.na(pred)| pred== "") { 
      return(most_first_word)}
   
   pred<- tokens(pred,
                 remove_punct = TRUE,
                 remove_numbers = TRUE,
                 remove_symbols = TRUE,
                 split_hyphens = TRUE) 
   pred<-rev(pred[[1]])
   correct<-wordspell(pred[1])
   
   
   if(length(correct)>1) {
      return(as.character(correct))
   } 
   
   correct<-wordspell(pred[2])
   
   
   if(length(correct)>1) {
      pred<- pred[1]
   } 
   
   pred<- tolower(pred)
   
   
   obs_unigr<- unigs[unigs$word_1 == pred[1],]
   obs_bigrams<- filter(bigrs, word_1 == pred[1])
   obs_unigr2<-nrow(unigs[unigs$word_1 == pred[2],])
   obsCount <- filter(bigrs, word_1 == pred[2],word_2 ==pred[1])
   obs_trig <- filter(trigs, word_1 == pred[2],word_2 ==pred[1])
   
   #function that performs the trigram back-off prediction
   
   trigram_predict <- function(pred){
      qbo_obs_trigrams <- mutate(obs_trig, prob=((freq_t - gamma3) / sum(obsCount$freq_b)))%>%
         select(-c(word_1,word_2,freq_t))
      
      unobs_trig_tails <- unigs[!(unigs$word_1 %in% qbo_obs_trigrams$word_3), ]$word_1
      
      alphaBi <- 1 - (sum(obs_bigrams$freq_b - gamma2) / obs_unigr$freq_u)
      
      obs_boBigrams<-filter(bigrs, is.element(word_2,unobs_trig_tails) & word_1== pred[1])
      unobs_boBigrams<-unobs_trig_tails[!is.element(unobs_trig_tails,obs_boBigrams$word_2)]
      
      qboObsBigs <- mutate(obs_boBigrams,prob=(freq_b - gamma2) / obs_unigr$freq_u)%>%
         select(-c(freq_b,word_1))
      
      qboUnobsBigs <-filter(unigs,is.element(word_1,unobs_boBigrams)) %>% 
         rename(word_2= word_1) %>% 
         mutate(prob=(alphaBi * freq_u) / sum(freq_u))%>%
         select(-freq_u)
      
      
      qbo_bigrams <- bind_rows(qboObsBigs, qboUnobsBigs)%>%arrange(desc(prob))
      
      #Checkpoint: if these two numbers are equal, the back-off mass was computed correctly
      #sum(qbo_bigrams[!is.element(qbo_bigrams$word_2,filter(bigrs,word_1==pred[1])$word_2),]$prob)
      #alphaBi
      
      alphaTrig <- 1 - sum((obs_trig$freq_t - gamma3) / obsCount$freq_b)
      
      
      
      qbo_unobs_trigrams <- mutate(qbo_bigrams,
                                   prob=alphaTrig * qbo_bigrams$prob / sum(qbo_bigrams$prob))%>% 
         rename(word_3=word_2)
      
      
      qbo_trigrams <- rbind(qbo_obs_trigrams, qbo_unobs_trigrams)%>%
         arrange(desc(prob))
      
      
      return(qbo_trigrams[1:n_pred,]$word_3)
   }
   #Checkpoint: if the sum of the probabilities equals 1, everything was computed correctly
   #sum(qbo_trigrams$prob)
   
   
   # What the code checks and returns for each case
   
   if(nrow(obs_unigr) == 0) { 
      return(most_first_word)
   } else if( nrow(obs_bigrams) == 0) {
      return( arrange(unigs,-freq_u)$word_1[1:n_pred] )
   } else if( obs_unigr2 == 0) {
      return(arrange(obs_bigrams,-freq_b)$word_2 [1:n_pred])
   } else if (nrow(obsCount)==0){
      return(trigram_predict(pred))
   } else if(nrow(obs_trig)==0) { 
      return(arrange(obs_bigrams,-freq_b)$word_2 [1:n_pred])
   } else { 
      return(trigram_predict(pred))
   }
   
   
}
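
For reference, the quantities computed in trigram_predict follow a Katz back-off scheme with fixed absolute discounts (gamma2 for bigrams, gamma3 for trigrams). A sketch of the formulas in that notation, where c(.) are the n-gram counts, w_1 w_2 is the observed context and w_3 a candidate next word:

$$
q_{bo}(w_3 \mid w_1 w_2) =
\begin{cases}
\dfrac{c(w_1 w_2 w_3) - \gamma_3}{c(w_1 w_2)} & \text{if } c(w_1 w_2 w_3) > 0 \\[1ex]
\alpha(w_1 w_2)\,\dfrac{q_{bo}(w_3 \mid w_2)}{\sum_{w \notin A} q_{bo}(w \mid w_2)} & \text{otherwise, with } A = \{w : c(w_1 w_2 w) > 0\}
\end{cases}
$$

$$
\alpha(w_1 w_2) = 1 - \sum_{w \in A} \frac{c(w_1 w_2 w) - \gamma_3}{c(w_1 w_2)}
$$

The bigram level q_bo(w_3 | w_2) is built the same way with discount gamma2 and the left-over mass alphaBi, backing off to the raw unigram frequencies.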

Examples

Some examples using the model:

predict_this("Shall we go to the")
## [1] "public" "next"   "point"  "new"    "top"
predict_this("about his")
## [1] "life" "work" "new"  "own"  "past"
predict_this("monkeys this")
## [1] "is"      "year"    "week"    "morning" "weekend"

Performance of the prediction

We use the part of the data saved for the test set. As this is too much data, we are only going to use 500 sentences, and we have to correct the spelling of the context words and the target word, because we know that the data set has many errors.

test_data<-readtext("./samples/test/test_en_US.twitter.txt")%>%corpus()

n_sample<-500

set.seed(2021)
test_data<-corpus(texts(test_data, groups = rep(1, ndoc(test_data))))%>%
   corpus_reshape(to="sentences")%>%corpus_sample(n_sample)

#tokenize it the same way as the input text

test_data2<-test_data[stri_enc_isascii(test_data)]%>%
   tokens( remove_punct = TRUE,
       remove_numbers = TRUE,
       remove_symbols = TRUE,
       split_hyphens = TRUE,
       remove_url = TRUE,
       remove_twitter = TRUE,
       what = "word1")%>%
   tokens_tolower()%>% tokens_remove("^[0-9]",valuetype = "regex") 
## Warning: 'remove_twitter' is defunct; see 'quanteda Tokenizers' in ?tokens
test_data3<-tibble(w_1=as.character(rep(NA,times=ndoc(test_data2))),
                   pred=as.character(rep(NA,times=ndoc(test_data2))))
n_pred<-1
for(i in 1:ndoc(test_data2)) {
   # reverse the tokens so doc[1] is the word to predict and doc[2:3] are its context
   doc <- rev(test_data2[[i]])
   n_pred <- 1
   
   word <- character()
   if (length(doc) == 1) {
      # one-word sentences have no context, so the input text is left as NA
      word <- NA
   } else {
      # spell-correct the two context words, keeping the original when it is already correct
      word1 <- wordspell(doc[3])
      if (length(word1) == 0) {word11 <- doc[3]} else {word11 <- word1}
      
      word2 <- wordspell(doc[2])
      if (length(word2) == 0) {word21 <- doc[2]} else {word21 <- word2}
      
      word <- paste(word11, word21, sep = " ")
   }
   
   # spell-correct the word that has to be predicted
   word3 <- wordspell(doc[1])
   if (length(word3) == 0) {word31 <- doc[1]} else {word31 <- word3}
   
   test_data3[i, "w_1"] <- word
   test_data3[i, "pred"] <- word31
}


# run the prediction function on the (spell-corrected) context of each test sentence
result_test<-map(test_data3$w_1,predict_this)

result_test<-tibble(word=result_test)%>%
   unnest_wider(word,names_sep = "_")

# compare each of the suggested words with the word that actually followed
result_test2<-apply(result_test,2,map2,test_data3$pred,identical)

result_test2<-tibble(word=result_test2)%>% 
   unnest_wider(word,names_sep = "_")%>%
   t()

rownames(result_test2)<-NULL

# number of test sentences where the actual word was among the suggestions
sum(result_test2)
## [1] 113

So there are

## [1] 22.6

% correct predictions.
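
This percentage can be reproduced directly from the counts above:

# 113 hits out of the 500 sampled test sentences
round(sum(result_test2) / n_sample * 100, 1)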