# Twitter, Blogs, and News are assumed to be character vectors holding the raw
# text lines of the three sources (read in earlier, e.g. with readLines())
library(quanteda)
master_vector <- c(Twitter, Blogs, News)
corp <- corpus(master_vector)
saveRDS(corp, "corp.RDS")
corp <- readRDS("corp.RDS")
# Load the profanity word list (one term per line, no header)
profanityFileName <- "profanity.txt"
profanity <- read.csv(profanityFileName, header = FALSE, stringsAsFactors = FALSE)
profanity[,1][1:5]
## [1] "2g1c" "2 girls 1 cup" "acrotomophilia"
## [4] "alabama hot pocket" "alaskan pipeline"
master_Tokens <- tokens(
  x = tolower(corp),
  remove_punct = TRUE,
  remove_twitter = TRUE,
  remove_numbers = TRUE,
  remove_hyphens = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE
)
bi_gram <- tokens_ngrams(master_Tokens, n = 2)
tri_gram <- tokens_ngrams(master_Tokens, n = 3)
saveRDS(master_Tokens, "master_Tokens.RDS")
saveRDS(bi_gram, "bi_gram.RDS")
saveRDS(tri_gram, "tri_gram.RDS")
uni_DFM <- dfm(master_Tokens)
bi_DFM <- dfm(bi_gram)
tri_DFM <- dfm(tri_gram)
saveRDS(uni_DFM, "uni_DFM.RDS")
saveRDS(bi_DFM, "bi_DFM.RDS")
saveRDS(tri_DFM, "tri_DFM.RDS")
# Keep only n-grams that occur at least 3 times
uni_DFM <- dfm_trim(uni_DFM, 3)
bi_DFM <- dfm_trim(bi_DFM, 3)
tri_DFM <- dfm_trim(tri_DFM, 3)
saveRDS(uni_DFM, "uni_DFMtrim.RDS")
saveRDS(bi_DFM, "bi_DFMtrim.RDS")
saveRDS(tri_DFM, "tri_DFMtrim.RDS")
uni_DFM <- readRDS("uni_DFMtrim.RDS")
bi_DFM <- readRDS("bi_DFMtrim.RDS")
tri_DFM <- readRDS("tri_DFMtrim.RDS")
# Create named vectors with counts of words
sums_U <- colSums(uni_DFM)
sums_B <- colSums(bi_DFM)
sums_T <- colSums(tri_DFM)
Note that any bigram or trigram containing one of the three placeholder tokens (eeoss, nnum, aabrr) is not a sequence of words that was actually used in the text: these tokens were inserted during cleaning to mark sentence endings, numbers, and abbreviations. We therefore remove such n-grams from the count vectors.
index1 <- grepl("eeoss|nnum|aabrr", x = names(sums_U))
sums_U <- sums_U[!index1]
index2 <- grepl("eeoss|nnum|aabrr", x = names(sums_B))
sums_B <- sums_B[!index2]
# take a look
sums_B[sample(1:length(sums_B), 10)]
index3 <- grepl("eeoss|nnum|aabrr", x = names(sums_T))
sums_T <- sums_T[!index3]
# take a look
sums_T[sample(1:length(sums_T), 10)]
# Required packages
suppressPackageStartupMessages(library(data.table))
# Create data tables with individual words as columns
uni_words <- data.table(word_1 = names(sums_U), count = sums_U)
bi_words <- data.table(
  word_1 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 1),
  word_2 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 2),
  count = sums_B)
tri_words <- data.table(
  word_1 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 1),
  word_2 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 2),
  word_3 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 3),
  count = sums_T)
saveRDS(uni_words, "uni_words.RDS")
saveRDS(bi_words, "bi_words.RDS")
saveRDS(tri_words, "tri_words.RDS")
uni_words <- readRDS("uni_words.RDS")
bi_words <- readRDS("bi_words.RDS")
tri_words <- readRDS("tri_words.RDS")
head(uni_words, 20)
## word_1 count
## 1 what 24414
## 2 if 21868
## 3 i 178186
## 4 want 8985
## 5 to 192639
## 6 be 40767
## 7 a 157723
## 8 spinster 4
## 9 cat 583
## 10 lady 817
## 11 one 21794
## 12 direction 512
## 13 is 107988
## 14 being 6489
## 15 sued 35
## 16 that 77098
## 17 so 33587
## 18 stupid 772
## 19 suing 18
## 20 them 12171
head(bi_words, 20)
## word_1 word_2 count
## 1 what if 247
## 2 if i 2892
## 3 i want 2492
## 4 want to 5203
## 5 to be 11615
## 6 be a 3524
## 7 cat lady 9
## 8 one direction 141
## 9 direction is 9
## 10 is being 317
## 11 some nights 10
## 12 nights i 16
## 13 i just 3477
## 14 just lay 9
## 15 lay in 31
## 16 in bed 251
## 17 bed and 123
## 18 that is 7684
## 19 is so 1657
## 20 so stupid 33
head(tri_words, 20)
## word_1 word_2 word_3 count
## 1 what if i 41
## 2 if i want 39
## 3 i want to 1375
## 4 want to be 537
## 5 some nights i 4
## 6 to be a 1314
## 7 one direction is 7
## 8 i just lay 5
## 9 lay in bed 14
## 10 in bed and 22
## 11 them is not 5
## 12 is not going 112
## 13 a bunch of 307
## 14 not going to 448
## 15 going to make 112
## 16 bunch of new 13
## 17 to make you 60
## 18 make you more 4
## 19 before i go 37
## 20 i go to 126
From the N-grams created we can get an idea of how to predict text: the unigrams give the most frequent words overall, while the bigrams and trigrams let us predict the next word from the preceding one or two words. To speed up lookups even more, let's index (key) the N-gram tables.
setkey(uni_words, word_1)
setkey(bi_words, word_1, word_2)
setkey(tri_words, word_1, word_2, word_3)
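As a quick illustration (using "want", an arbitrary word that appears in the sample output above), keyed lookups now use data.table's binary search instead of a full table scan:
# all bi-grams whose first word is "want" (binary search on the word_1 key)
bi_words[.("want")]
# all tri-grams starting with "want to" (binary search on word_1 and word_2)
tri_words[.("want", "to")]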
Let's add Kneser-Ney smoothing to the dataset. First we will find the bi-gram probabilities with smoothing.
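For reference, the quantity the code below computes for a bi-gram $(w_1, w_2)$ with discount $d = 0.75$ is the interpolated Kneser-Ney estimate

$$P_{KN}(w_2 \mid w_1) = \frac{c(w_1 w_2) - d}{c(w_1)} + \frac{d}{c(w_1)}\, N_{1+}(w_1\,\bullet)\, P_{cont}(w_2), \qquad P_{cont}(w_2) = \frac{N_{1+}(\bullet\, w_2)}{\text{total number of distinct bi-grams}}$$

where $N_{1+}(w_1\,\bullet)$ is the number of distinct words observed after $w_1$ and $N_{1+}(\bullet\, w_2)$ is the number of distinct words observed before $w_2$. (The textbook $\max(\cdot, 0)$ truncation of the first term is unnecessary here, since trimming already removed n-grams with counts below 3.)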
discount_value <- 0.75
######## Finding Bi-Gram Probability #################
# Number of distinct bi-grams (each row of bi_words is one distinct bi-gram)
numOfBiGrams <- nrow(bi_words)
# Continuation probability of word_2: number of distinct bi-grams ending in
# word_2, divided by the total number of distinct bi-grams
ckn <- bi_words[, .(Prob = ((.N) / numOfBiGrams)), by = word_2]
setkey(ckn, word_2)
# Assign each unigram its continuation probability (probability of appearing as the second word of a bi-gram); drop words never seen in that position
uni_words[, Prob := ckn[word_1, Prob]]
uni_words <- uni_words[!is.na(uni_words$Prob)]
# N1+(w1 .): number of distinct words that follow word_1 (i.e., distinct bi-grams starting with word_1)
n1wi <- bi_words[, .(N = .N), by = word_1]
setkey(n1wi, word_1)
# Cn1: total count of word_1 (the bi-gram's first word), taken from the unigram table
bi_words[, Cn1 := uni_words[word_1, count]]
# Kneser-Ney smoothed bi-gram probability
bi_words[, Prob := ((count - discount_value) / Cn1 + discount_value / Cn1 * n1wi[word_1, N] * uni_words[word_2, Prob])]
######## End of Finding Bi-Gram Probability #################
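As a quick sanity check (again with an arbitrary example word), the highest-probability continuations of a given first word can now be listed:
# hypothetical check: top 5 smoothed continuations of "want"
head(bi_words[.("want")][order(-Prob)], 5)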
Then let’s find tri-gram probabilities and add smoothing.
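Analogously, the code below computes, for a tri-gram $(w_1, w_2, w_3)$,

$$P(w_3 \mid w_1 w_2) = \frac{c(w_1 w_2 w_3) - d}{c(w_1 w_2)} + \frac{d}{c(w_1 w_2)}\, N_{1+}(w_1 w_2\,\bullet)\, P_{KN}(w_1 w_2)$$

where $N_{1+}(w_1 w_2\,\bullet)$ is the number of distinct words observed after the pair $(w_1, w_2)$, and the lower-order term reuses the smoothed bi-gram probability $P_{KN}(w_1 w_2)$ computed above (rather than the textbook conditional $P_{KN}(w_3 \mid w_2)$).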
######## Finding Tri-Gram Probability #################
# Cn2: count of the (word_1, word_2) pair, taken from the bi-gram table
tri_words[, Cn2 := bi_words[.(word_1, word_2), count]]
# N1+(w1 w2 .): number of distinct words that follow the pair (word_1, word_2)
n1w12 <- tri_words[, .N, by = .(word_1, word_2)]
setkey(n1w12, word_1, word_2)
# Kneser-Ney smoothed tri-gram probability
tri_words[, Prob := (count - discount_value) / Cn2 + discount_value / Cn2 * n1w12[.(word_1, word_2), N] *
bi_words[.(word_1, word_2), Prob]]
######## End of Finding Tri-Gram Probability #################
Finally, let's trim the unigram table so it can be used more efficiently. We keep only the 50 unigrams with the highest probability, since these are the most likely fallback candidates; this table is used as the last resort when backing off.
# Keep the 50 most probable unigrams
uni_words <- uni_words[order(-Prob)][1:50]
saveRDS(uni_words, "uni_wordsfinal.RDS")
saveRDS(bi_words, "bi_wordsfinal.RDS")
saveRDS(tri_words, "tri_wordsfinal.RDS")
With that, we have the word probabilities for the n-grams in the dataset. Now let's create a few functions to build the prediction app.
uni_words <- readRDS("uni_wordsfinal.RDS")
bi_words <- readRDS("bi_wordsfinal.RDS")
tri_words <- readRDS("tri_wordsfinal.RDS")
library(textclean)
library(tm)  # for removeWords() and stripWhitespace()
cleanInput <- function(input) {
# 1. Separate words connected with - or /
input <- gsub("-", " ", input)
input <- gsub("/", " ", input)
# 2. Establish end of sentence, abbr, number, email, html
input <- gsub("\\? |\\?$|\\! |\\!$", " EEOSS ", input)
input <- gsub("[A-Za-z]\\.[A-Za-z]\\.[A-Za-z]\\.[A-Za-z]\\. |[A-Za-z]\\.[A-Za-z]\\.[A-Za-z]\\. |[A-Za-z]\\.[A-Za-z]\\. ", " AABRR ", input)
input <- gsub("\\. |\\.$", " EEOSS ", input)
input <- gsub("[0-9]+"," NNUMM ",input)
input <- gsub("\\S+@\\S+","EEMAILL",input)
input <- gsub("[Hh}ttp([^ ]+)","HHTMLL",input)
input <- gsub("RT | via"," RTVIA ",input) # retweets
input <- gsub("@([^ ]+)","ATPPLE",input) # @people
input <- gsub("[@][a - zA - Z0 - 9_]{1,15}","UUSRNMSS",input) # usernames
# 3. to lower
input <- tolower(input)
# 4. Remove/replace &, @, 'm, 's, 'are, 'll, etc...
input <- gsub(" & ", " and ", input)
input <- gsub(" @ ", " at ", input)
input <- replace_contraction(input)
input <- gsub("'s", "", input)
input <- gsub("haven't", "have not", input)
input <- gsub("hadn't", "had not", input)
# 5. Remove emojis, emoticons, and other non-ASCII characters
input <- gsub("[^\x01-\x7F]", "", input)
# 6. Remove unit abbreviations (g, mg, kg, lbs, s, m, h), with or without a leading number
input <- gsub(" [1-9]+g ", " ", input) # grams
input <- gsub(" [1-9]+mg ", " ", input) # miligrams, etc
input <- gsub(" [1-9]+kg ", " ", input)
input <- gsub(" [1-9]+lbs ", " ", input)
input <- gsub(" [1-9]+s ", " ", input) # seconds, etc
input <- gsub(" [1-9]+m ", " ", input)
input <- gsub(" [1-9]+h ", " ", input)
input <- gsub(" +g ", " ", input) # grams
input <- gsub(" +mg ", " ", input) # miligrams, etc
input <- gsub(" +kg ", " ", input)
input <- gsub(" +lbs ", " ", input)
input <- gsub(" +s ", " ", input) # seconds, etc
input <- gsub(" +m ", " ", input)
input <- gsub(" +h ", " ", input)
input <- gsub(" +lbs ", " ", input)
input <- gsub(" +kg ", " ", input)
# 7. remove punctuation
#input <- gsub("[^[:alnum:][:space:]\']", "",input)
#input <- gsub(""", "", input)
#input <- gsub(""", "", input)
#input <- gsub("'", "", input)
#input <- gsub("'", "", input)
# 8. replace " u " with " you " and remove all single letters except "i" and "a"
input <- gsub(" u ", " you ", input)
input <- gsub(" [b-hj-z] ", " ", input)
# 9. remove profanity
input <- removeWords(input, profanity[,1])
# 10. remove extra spaces
# input <- gsub("^[ ]{1,10}","",input)
# input <- gsub("[ ]{2,10}"," ",input)
input <- stripWhitespace(input)
# remove space at end of phrase
input <- gsub(" $", "", input)
return(input)
}
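As a quick illustration (the input string below is just a made-up example), the cleaner can be called directly; it returns a lowercase string with profanity removed and placeholders such as eeoss and nnumm substituted for sentence endings and numbers:
# hypothetical example input; output not shown
cleanInput("I'll go to the U.S. at 9 pm! Check http://example.com")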
First, the function to predict the third word given the two preceding words.
# function to return the most probable next words given the two preceding words
triWords <- function(w1, w2, n = 5) {
  pwords <- tri_words[.(w1, w2)][order(-Prob)]
  # no matching tri-gram: back off to the bi-gram model
  if (any(is.na(pwords)))
    return(biWords(w2, n))
  if (nrow(pwords) >= n)
    return(pwords[1:n, word_3])
  # fewer than n matches: top up with bi-gram predictions
  count <- nrow(pwords)
  bwords <- biWords(w2, n)[1:(n - count)]
  return(c(pwords[, word_3], bwords))
}
If we don't find a tri-gram starting with the two given words, we back off to the bi-gram model and predict the next word from the single preceding word.
# function to return the most probable next words given one preceding word
biWords <- function(w1, n = 5) {
  pwords <- bi_words[w1][order(-Prob)]
  # no matching bi-gram: fall back to the unigram table
  if (any(is.na(pwords)))
    return(uniWords(n))
  if (nrow(pwords) >= n)
    return(pwords[1:n, word_2])
  # fewer than n matches: top up with unigram samples
  count <- nrow(pwords)
  unWords <- uniWords(n)[1:(n - count)]
  return(c(pwords[, word_2], unWords))
}
If we couldn't even find a matching bi-gram, we sample words at random from the 50 most probable unigrams. This is the last resort for n-grams that do not appear in the dataset.
# function to return random words from the top-50 unigram table
uniWords <- function(n = 5) {
  return(sample(uni_words[, word_1], size = n))
}
Finally, a function that ties these together:
# The prediction app
getWords <- function(str, n = 5) {
  require(textclean)
  require(quanteda)
  require(tm)
  # clean the input the same way the training text was cleaned
  str <- cleanInput(str)
  # tokenize and keep only the last two words
  tokens <- tokens(x = char_tolower(str))
  tokens <- rev(rev(tokens[[1]])[1:2])
  # predict the next word(s) from the last two words
  words <- triWords(tokens[1], tokens[2], n)
  chain_1 <- paste(tokens[1], tokens[2], words[1], sep = " ")
  print(words)
}
getWords("Shall we go to")
## Loading required package: quanteda
## Package version: 1.3.14
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'tm'
## The following objects are masked from 'package:quanteda':
##
## as.DocumentTermMatrix, stopwords
## [1] "the" "bed" "a" "school" "sleep"