# Twitter, Blogs, and News are assumed to be character vectors holding the raw
# text lines of the three sources (read in earlier, e.g. with readLines())
library(quanteda)
master_vector <- c(Twitter, Blogs, News)
corp <- corpus(master_vector)
saveRDS(corp, "corp.RDS")
corp <- readRDS("corp.RDS")
# Load the profanity word list (one term per line, no header)
profanityFileName <- "profanity.txt"
profanity <- read.csv(profanityFileName, header = FALSE, stringsAsFactors = FALSE)
profanity[,1][1:5]
## [1] "2g1c" "2 girls 1 cup" "acrotomophilia"
## [4] "alabama hot pocket" "alaskan pipeline"
master_Tokens <- tokens(
  x = tolower(corp),
  remove_punct = TRUE,
  remove_twitter = TRUE,
  remove_numbers = TRUE,
  remove_hyphens = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE
)
bi_gram <- tokens_ngrams(master_Tokens, n = 2)
tri_gram <- tokens_ngrams(master_Tokens, n = 3)
saveRDS(master_Tokens, "master_Tokens.RDS")
saveRDS(bi_gram, "bi_gram.RDS")
saveRDS(tri_gram, "tri_gram.RDS")
uni_DFM <- dfm(master_Tokens)
bi_DFM <- dfm(bi_gram)
tri_DFM <- dfm(tri_gram)
saveRDS(uni_DFM, "uni_DFM.RDS")
saveRDS(bi_DFM, "bi_DFM.RDS")
saveRDS(tri_DFM, "tri_DFM.RDS")
# Keep only n-grams that occur at least 3 times
uni_DFM <- dfm_trim(uni_DFM, 3)
bi_DFM <- dfm_trim(bi_DFM, 3)
tri_DFM <- dfm_trim(tri_DFM, 3)
saveRDS(uni_DFM, "uni_DFMtrim.RDS")
saveRDS(bi_DFM, "bi_DFMtrim.RDS")
saveRDS(tri_DFM, "tri_DFMtrim.RDS")
uni_DFM <- readRDS("uni_DFMtrim.RDS")
bi_DFM <- readRDS("bi_DFMtrim.RDS")
tri_DFM <- readRDS("tri_DFMtrim.RDS")
# Create named vectors with counts of words
sums_U <- colSums(uni_DFM)
sums_B <- colSums(bi_DFM)
sums_T <- colSums(tri_DFM)
Note that any bigram or trigram containing one of the three placeholder tokens (eeoss, nnum, aabrr) is not a sequence of words that was actually used in the text: these tokens were inserted during cleaning to mark sentence endings, numbers, and abbreviations. We therefore remove such n-grams from the count vectors.
index1 <- grepl("eeoss|nnum|aabrr", x = names(sums_U))
sums_U <- sums_U[!index1]
index2 <- grepl("eeoss|nnum|aabrr", x = names(sums_B))
sums_B <- sums_B[!index2]
# take a look
sums_B[sample(1:length(sums_B), 10)]
index3 <- grepl("eeoss|nnum|aabrr", x = names(sums_T))
sums_T <- sums_T[!index3]
# take a look
sums_T[sample(1:length(sums_T), 10)]
# Required packages
suppressPackageStartupMessages(library(data.table))
# Create data tables with individual words as columns
uni_words <- data.table(word_1 = names(sums_U), count = sums_U)
bi_words <- data.table(
  word_1 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 1),
  word_2 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 2),
  count = sums_B)
tri_words <- data.table(
  word_1 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 1),
  word_2 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 2),
  word_3 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 3),
  count = sums_T)
saveRDS(uni_words, "uni_words.RDS")
saveRDS(bi_words, "bi_words.RDS")
saveRDS(tri_words, "tri_words.RDS")
uni_words <- readRDS("uni_words.RDS")
bi_words <- readRDS("bi_words.RDS")
tri_words <- readRDS("tri_words.RDS")
head(uni_words, 20)
## word_1 count
## 1 what 24414
## 2 if 21868
## 3 i 178186
## 4 want 8985
## 5 to 192639
## 6 be 40767
## 7 a 157723
## 8 spinster 4
## 9 cat 583
## 10 lady 817
## 11 one 21794
## 12 direction 512
## 13 is 107988
## 14 being 6489
## 15 sued 35
## 16 that 77098
## 17 so 33587
## 18 stupid 772
## 19 suing 18
## 20 them 12171
head(bi_words, 20)
## word_1 word_2 count
## 1 what if 247
## 2 if i 2892
## 3 i want 2492
## 4 want to 5203
## 5 to be 11615
## 6 be a 3524
## 7 cat lady 9
## 8 one direction 141
## 9 direction is 9
## 10 is being 317
## 11 some nights 10
## 12 nights i 16
## 13 i just 3477
## 14 just lay 9
## 15 lay in 31
## 16 in bed 251
## 17 bed and 123
## 18 that is 7684
## 19 is so 1657
## 20 so stupid 33
head(tri_words, 20)
## word_1 word_2 word_3 count
## 1 what if i 41
## 2 if i want 39
## 3 i want to 1375
## 4 want to be 537
## 5 some nights i 4
## 6 to be a 1314
## 7 one direction is 7
## 8 i just lay 5
## 9 lay in bed 14
## 10 in bed and 22
## 11 them is not 5
## 12 is not going 112
## 13 a bunch of 307
## 14 not going to 448
## 15 going to make 112
## 16 bunch of new 13
## 17 to make you 60
## 18 make you more 4
## 19 before i go 37
## 20 i go to 126
From the N-grams created we can get an idea of how to predict text: the unigrams give the most frequent words overall, while the bigrams and trigrams let us predict the next word from the preceding one or two words. To speed up lookups even more, let's index (key) the N-gram tables.
setkey(uni_words, word_1)
setkey(bi_words, word_1, word_2)
setkey(tri_words, word_1, word_2, word_3)
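As a quick illustration (using "want", an arbitrary word that appears in the sample output above), keyed lookups now use data.table's binary search instead of a full table scan:
# all bi-grams whose first word is "want" (binary search on the word_1 key)
bi_words[.("want")]
# all tri-grams starting with "want to" (binary search on word_1 and word_2)
tri_words[.("want", "to")]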
Let's add Kneser-Ney smoothing to the dataset. First we will find the bi-gram probabilities with smoothing.
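For reference, the quantity the code below computes for a bi-gram $(w_1, w_2)$ with discount $d = 0.75$ is the interpolated Kneser-Ney estimate

$$P_{KN}(w_2 \mid w_1) = \frac{c(w_1 w_2) - d}{c(w_1)} + \frac{d}{c(w_1)}\, N_{1+}(w_1\,\bullet)\, P_{cont}(w_2), \qquad P_{cont}(w_2) = \frac{N_{1+}(\bullet\, w_2)}{\text{total number of distinct bi-grams}}$$

where $N_{1+}(w_1\,\bullet)$ is the number of distinct words observed after $w_1$ and $N_{1+}(\bullet\, w_2)$ is the number of distinct words observed before $w_2$. (The textbook $\max(\cdot, 0)$ truncation of the first term is unnecessary here, since trimming already removed n-grams with counts below 3.)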
discount_value <- 0.75
######## Finding Bi-Gram Probability #################
# Number of distinct bi-grams (each row of bi_words is one distinct bi-gram)
numOfBiGrams <- nrow(bi_words)
# Continuation probability of word_2: number of distinct bi-grams ending in
# word_2, divided by the total number of distinct bi-grams
ckn <- bi_words[, .(Prob = ((.N) / numOfBiGrams)), by = word_2]
setkey(ckn, word_2)
# Assign each unigram its continuation probability (probability of appearing as the second word of a bi-gram); drop words never seen in that position
uni_words[, Prob := ckn[word_1, Prob]]
uni_words <- uni_words[!is.na(uni_words$Prob)]
# N1+(w1 .): number of distinct words that follow word_1 (i.e., distinct bi-grams starting with word_1)
n1wi <- bi_words[, .(N = .N), by = word_1]
setkey(n1wi, word_1)
# Cn1: total count of word_1 (the bi-gram's first word), taken from the unigram table
bi_words[, Cn1 := uni_words[word_1, count]]
# Kneser-Ney smoothed bi-gram probability
bi_words[, Prob := ((count - discount_value) / Cn1 + discount_value / Cn1 * n1wi[word_1, N] * uni_words[word_2, Prob])]
######## End of Finding Bi-Gram Probability #################
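As a quick sanity check (again with an arbitrary example word), the highest-probability continuations of a given first word can now be listed:
# hypothetical check: top 5 smoothed continuations of "want"
head(bi_words[.("want")][order(-Prob)], 5)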
Then let’s find tri-gram probabilities and add smoothing.
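Analogously, the code below computes, for a tri-gram $(w_1, w_2, w_3)$,

$$P(w_3 \mid w_1 w_2) = \frac{c(w_1 w_2 w_3) - d}{c(w_1 w_2)} + \frac{d}{c(w_1 w_2)}\, N_{1+}(w_1 w_2\,\bullet)\, P_{KN}(w_1 w_2)$$

where $N_{1+}(w_1 w_2\,\bullet)$ is the number of distinct words observed after the pair $(w_1, w_2)$, and the lower-order term reuses the smoothed bi-gram probability $P_{KN}(w_1 w_2)$ computed above (rather than the textbook conditional $P_{KN}(w_3 \mid w_2)$).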
######## Finding Tri-Gram Probability #################
# Cn2: count of the (word_1, word_2) pair, taken from the bi-gram table
tri_words[, Cn2 := bi_words[.(word_1, word_2), count]]
# N1+(w1 w2 .): number of distinct words that follow the pair (word_1, word_2)
n1w12 <- tri_words[, .N, by = .(word_1, word_2)]
setkey(n1w12, word_1, word_2)
# Kneser-Ney smoothed tri-gram probability
tri_words[, Prob := (count - discount_value) / Cn2 + discount_value / Cn2 * n1w12[.(word_1, word_2), N] *
bi_words[.(word_1, word_2), Prob]]
######## End of Finding Tri-Gram Probability #################
Finally, let's trim the unigram table so it can be used more efficiently. We keep only the 50 unigrams with the highest probability, since these are the most likely fallback candidates; this table is used as the last resort when backing off.
# Keep the 50 most probable unigrams
uni_words <- uni_words[order(-Prob)][1:50]
saveRDS(uni_words, "uni_wordsfinal.RDS")
saveRDS(bi_words, "bi_wordsfinal.RDS")
saveRDS(tri_words, "tri_wordsfinal.RDS")
With that, we have the word probabilities for the n-grams in the dataset. Now let's create a few functions to build the prediction app.
uni_words <- readRDS("uni_wordsfinal.RDS")
bi_words <- readRDS("bi_wordsfinal.RDS")
tri_words <- readRDS("tri_wordsfinal.RDS")
library(textclean)
library(tm)  # for removeWords() and stripWhitespace()
cleanInput <- function(input) {
# 1. Separate words connected with - or /
input <- gsub("-", " ", input)
input <- gsub("/", " ", input)
# 2. Establish end of sentence, abbr, number, email, html
input <- gsub("\\? |\\?$|\\! |\\!$", " EEOSS ", input)
input <- gsub("[A-Za-z]\\.[A-Za-z]\\.[A-Za-z]\\.[A-Za-z]\\. |[A-Za-z]\\.[A-Za-z]\\.[A-Za-z]\\. |[A-Za-z]\\.[A-Za-z]\\. ", " AABRR ", input)
input <- gsub("\\. |\\.$", " EEOSS ", input)
input <- gsub("[0-9]+"," NNUMM ",input)
input <- gsub("\\S+@\\S+","EEMAILL",input)
input <- gsub("[Hh}ttp([^ ]+)","HHTMLL",input)
input <- gsub("RT | via"," RTVIA ",input) # retweets
input <- gsub("@([^ ]+)","ATPPLE",input) # @people
input <- gsub("[@][a - zA - Z0 - 9_]{1,15}","UUSRNMSS",input) # usernames
# 3. to lower
input <- tolower(input)
# 4. Remove/replace &, @, 'm, 's, 'are, 'll, etc...
input <- gsub(" & ", " and ", input)
input <- gsub(" @ ", " at ", input)
input <- replace_contraction(input)
input <- gsub("'s", "", input)
input <- gsub("haven't", "have not", input)
input <- gsub("hadn't", "had not", input)
# 5. Remove emojis, emoticons, and other non-ASCII characters
input <- gsub("[^\x01-\x7F]", "", input)
# 6. Remove unit abbreviations (g, mg, kg, lbs, s, m, h), with or without a leading number
input <- gsub(" [1-9]+g ", " ", input) # grams
input <- gsub(" [1-9]+mg ", " ", input) # miligrams, etc
input <- gsub(" [1-9]+kg ", " ", input)
input <- gsub(" [1-9]+lbs ", " ", input)
input <- gsub(" [1-9]+s ", " ", input) # seconds, etc
input <- gsub(" [1-9]+m ", " ", input)
input <- gsub(" [1-9]+h ", " ", input)
input <- gsub(" +g ", " ", input) # grams
input <- gsub(" +mg ", " ", input) # miligrams, etc
input <- gsub(" +kg ", " ", input)
input <- gsub(" +lbs ", " ", input)
input <- gsub(" +s ", " ", input) # seconds, etc
input <- gsub(" +m ", " ", input)
input <- gsub(" +h ", " ", input)
input <- gsub(" +lbs ", " ", input)
input <- gsub(" +kg ", " ", input)
# 7. remove punctuation
#input <- gsub("[^[:alnum:][:space:]\']", "",input)
#input <- gsub(""", "", input)
#input <- gsub(""", "", input)
#input <- gsub("'", "", input)
#input <- gsub("'", "", input)
# 8. replace " u " with " you " and remove all single letters except "i" and "a"
input <- gsub(" u ", " you ", input)
input <- gsub(" [b-hj-z] ", " ", input)
# 9. remove profanity
input <- removeWords(input, profanity[,1])
# 10. remove extra spaces
# input <- gsub("^[ ]{1,10}","",input)
# input <- gsub("[ ]{2,10}"," ",input)
input <- stripWhitespace(input)
# remove space at end of phrase
input <- gsub(" $", "", input)
return(input)
}
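As a quick illustration (the input string below is just a made-up example), the cleaner can be called directly; it returns a lowercase string with profanity removed and placeholders such as eeoss and nnumm substituted for sentence endings and numbers:
# hypothetical example input; output not shown
cleanInput("I'll go to the U.S. at 9 pm! Check http://example.com")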
First, the function to predict the third word given the two preceding words.
# function to return the most probable next words given the two preceding words
triWords <- function(w1, w2, n = 5) {
  pwords <- tri_words[.(w1, w2)][order(-Prob)]
  # no matching tri-gram: back off to the bi-gram model
  if (any(is.na(pwords)))
    return(biWords(w2, n))
  if (nrow(pwords) >= n)
    return(pwords[1:n, word_3])
  # fewer than n matches: top up with bi-gram predictions
  count <- nrow(pwords)
  bwords <- biWords(w2, n)[1:(n - count)]
  return(c(pwords[, word_3], bwords))
}
If we don't find a tri-gram starting with the two given words, we back off to the bi-gram model and predict the next word from the single preceding word.
# function to return the most probable next words given one preceding word
biWords <- function(w1, n = 5) {
  pwords <- bi_words[w1][order(-Prob)]
  # no matching bi-gram: fall back to the unigram table
  if (any(is.na(pwords)))
    return(uniWords(n))
  if (nrow(pwords) >= n)
    return(pwords[1:n, word_2])
  # fewer than n matches: top up with unigram samples
  count <- nrow(pwords)
  unWords <- uniWords(n)[1:(n - count)]
  return(c(pwords[, word_2], unWords))
}
If we couldn't even find a matching bi-gram, we sample words at random from the 50 most probable unigrams. This is the last resort for n-grams that do not appear in the dataset.
# function to return random words from the top-50 unigram table
uniWords <- function(n = 5) {
  return(sample(uni_words[, word_1], size = n))
}
Finally, a function that ties these together:
# The prediction app
getWords <- function(str, n = 5) {
  require(textclean)
  require(quanteda)
  require(tm)
  # clean the input the same way the training text was cleaned
  str <- cleanInput(str)
  # tokenize and keep only the last two words
  tokens <- tokens(x = char_tolower(str))
  tokens <- rev(rev(tokens[[1]])[1:2])
  # predict the next word(s) from the last two words
  words <- triWords(tokens[1], tokens[2], n)
  chain_1 <- paste(tokens[1], tokens[2], words[1], sep = " ")
  print(words)
}
getWords("Shall we go to")
## Loading required package: quanteda
## Package version: 1.3.14
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'tm'
## The following objects are masked from 'package:quanteda':
##
## as.DocumentTermMatrix, stopwords
## [1] "the" "bed" "a" "school" "sleep"