rm(list = ls())
library(quanteda)
## Package version: 2.1.2
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
unigrs <- fread("unigrs.csv")
bigrs <- fread("bigrs.csv")
trigrs <- fread("trigrs.csv")
## Returns a two column data.frame of observed trigrams that start with the
## bigram prefix (bigPre) in the first column named ngram and
## frequencies/counts in the second column named freq. If no observed trigrams
## that start with bigPre exist, an empty data.frame is returned.
##
## bigPre - single-element char array of the form w2_w1 which are the first
## two words of the trigram we are predicting the tail word of
## trigrams - 2 column data.frame or data.table. The first column: ngram,
## contains all the trigrams in the corpus. The second column:
## freq, contains the frequency/count of each trigram.
getObsTrigs <- function(bigPre, trigrams) {
trigs.winA <- data.frame(ngrams=vector(mode = 'character', length = 0),
freq=vector(mode = 'integer', length = 0))
regex <- sprintf("%s%s%s", "^", bigPre, "_")
trigram_indices <- grep(regex, trigrams$ngram)
if(length(trigram_indices) > 0) {
trigs.winA <- trigrams[trigram_indices, ]
}
return(trigs.winA)
}
## Returns a two column data.frame of observed trigrams that start with bigram
## prefix bigPre in the first column named ngram and the probabilities
## q_bo(w_i | w_i-2, w_i-1) in the second column named prob calculated from
## eqn 12. If no observed trigrams starting with bigPre exist, NULL is returned.
##
## obsTrigs - 2 column data.frame or data.table. The first column: ngram,
## contains all the observed trigrams that start with the bigram
## prefix bigPre which we are attempting to the predict the next
## word of in a give phrase. The second column: freq, contains the
## frequency/count of each trigram.
## bigrs - 2 column data.frame or data.table. The first column: ngram,
## contains all the bigrams in the corpus. The second column:
## freq, contains the frequency/count of each bigram.
## bigPre - single-element char array of the form w2_w1 which are first two
## words of the trigram we are predicting the tail word of
## triDisc - amount to discount observed trigrams
getObsTriProbs <- function(obsTrigs, bigrs, bigPre, triDisc=0.5) {
if(nrow(obsTrigs) < 1) return(NULL)
obsCount <- subset(bigrs, ngram == bigPre)$freq[1]
obsTrigProbs <- mutate(obsTrigs, freq=((freq - triDisc) / obsCount))
colnames(obsTrigProbs) <- c("ngram", "prob")
return(obsTrigProbs)
}
## Returns a character vector which are the tail words of unobserved trigrams
## that start with the first two words of obsTrigs (aka the bigram prefix).
## These are the words w in the set B(w_i-2, w_i-1) as defined in the section
## describing the details of equation 17.
##
## obsTrigs - character vector of observed trigrams delimited by _ of the form:
## w3_w2_w1 where w3_w2 is the bigram prefix
## unigs - 2 column data.frame of all the unigrams in the corpus:
## ngram = unigram
## freq = frequency/count of each unigram
getUnobsTrigTails <- function(obsTrigs, unigs) {
obs_trig_tails <- str_split_fixed(obsTrigs, "_", 3)[, 3]
unobs_trig_tails <- unigs[!(unigs$ngram %in% obs_trig_tails), ]$ngram
return(unobs_trig_tails)
}
## Returns the total probability mass discounted from all observed bigrams
## calculated from equation 14. This is the amount of probability mass which
## is redistributed to UNOBSERVED bigrams. If no bigrams starting with
## unigram$ngram[1] exist, 0 is returned.
##
## unigram - single row, 2 column frequency table. The first column: ngram,
## contains the w_i-1 unigram (2nd word of the bigram prefix). The
## second column: freq, contains the frequency/count of this unigram.
## bigrams - 2 column data.frame or data.table. The first column: ngram,
## contains all the bigrams in the corpus. The second column:
## freq, contains the frequency or count of each bigram.
## bigDisc - amount to discount observed bigrams
getAlphaBigram <- function(unigram, bigrams, bigDisc=0.5) {
# get all bigrams that start with unigram
regex <- sprintf("%s%s%s", "^", unigram$ngram[1], "_")
bigsThatStartWithUnig <- bigrams[grep(regex, bigrams$ngram),]
if(nrow(bigsThatStartWithUnig) < 1) return(0)
alphaBi <- 1 - (sum(bigsThatStartWithUnig$freq - bigDisc) / unigram$freq)
return(alphaBi)
}
## Returns a character vector of backed off bigrams of the form w2_w1. These
## are all the (w_i-1, w) bigrams where w_i-1 is the tail word of the bigram
## prefix bigPre and w are the tail words of unobserved bigrams that start with
## w_i-1.
##
## bigPre - single-element char array of the form w2_w1 which are first two
## words of the trigram we are predicting the tail word of
## unobsTrigTails - character vector that are tail words of unobserved trigrams
getBOBigrams <- function(bigPre, unobsTrigTails) {
w_i_minus1 <- str_split(bigPre, "_")[[1]][2]
boBigrams <- paste(w_i_minus1, unobsTrigTails, sep = "_")
return(boBigrams)
}
## Returns a two column data.frame of backed-off bigrams in the first column
## named ngram and their frequency/counts in the second column named freq.
##
## bigPre - single-element char array of the form w2_w1 which are first two
## words of the trigram we are predicting the tail word of
## unobsTrigTails - character vector that are tail words of unobserved trigrams
## bigrs - 2 column data.frame or data.table. The first column: ngram,
## contains all the bigrams in the corpus. The second column:
## freq, contains the frequency/count of each bigram.
getObsBOBigrams <- function(bigPre, unobsTrigTails, bigrs) {
boBigrams <- getBOBigrams(bigPre, unobsTrigTails)
obs_bo_bigrams <- bigrs[bigrs$ngram %in% boBigrams, ]
return(obs_bo_bigrams)
}
## Returns a character vector of backed-off bigrams which are unobserved.
##
## bigPre - single-element char array of the form w2_w1 which are first two
## words of the trigram we are predicting the tail word of
## unobsTrigTails - character vector that are tail words of unobserved trigrams
## obsBOBigram - data.frame which contains the observed bigrams in a column
## named ngram
getUnobsBOBigrams <- function(bigPre, unobsTrigTails, obsBOBigram) {
boBigrams <- getBOBigrams(bigPre, unobsTrigTails)
unobs_bigs <- boBigrams[!(boBigrams %in% obsBOBigram$ngram)]
return(unobs_bigs)
}
## Returns a dataframe of 2 columns: ngram and probs. Values in the ngram
## column are bigrams of the form: w2_w1 which are observed as the last
## two words in unobserved trigrams. The values in the prob column are
## q_bo(w1 | w2) calculated from from equation 10.
##
## obsBOBigrams - a dataframe with 2 columns: ngram and freq. The ngram column
## contains bigrams of the form w1_w2 which are observed bigrams
## that are the last 2 words of unobserved trigrams (i.e. "backed
## off" bigrams). The freq column contains integers that are
## the counts of these observed bigrams in the corpus.
## unigs - 2 column data.frame of all the unigrams in the corpus:
## ngram = unigram
## freq = frequency/count of each unigram
## bigDisc - amount to discount observed bigrams
getObsBigProbs <- function(obsBOBigrams, unigs, bigDisc=0.5) {
first_words <- str_split_fixed(obsBOBigrams$ngram, "_", 2)[, 1]
first_word_freqs <- unigs[unigs$ngram %in% first_words, ]
obsBigProbs <- (obsBOBigrams$freq - bigDisc) / first_word_freqs$freq
obsBigProbs <- data.frame(ngram=obsBOBigrams$ngram, prob=obsBigProbs)
return(obsBigProbs)
}
## Returns a dataframe of 2 columns: ngram and prob. Values in the ngram
## column are unobserved bigrams of the form: w2_w1. The values in the prob
## column are the backed off probability estimates q_bo(w1 | w2) calculated
## from from equation 16.
##
## unobsBOBigrams - character vector of unobserved backed off bigrams
## unigs - 2 column data.frame of all the unigrams in the corpus:
## ngram = unigram
## freq = frequency/count of each unigram
## alphaBig - total discounted probability mass at the bigram level
getQboUnobsBigrams <- function(unobsBOBigrams, unigs, alphaBig) {
# get the unobserved bigram tails
qboUnobsBigs <- str_split_fixed(unobsBOBigrams, "_", 2)[, 2]
w_in_Aw_iminus1 <- unigs[!(unigs$ngram %in% qboUnobsBigs), ]
# convert to data.frame with counts
qboUnobsBigs <- unigs[unigs$ngram %in% qboUnobsBigs, ]
denom <- sum(qboUnobsBigs$freq)
# converts counts to probabilities
qboUnobsBigs <- data.frame(ngram=unobsBOBigrams,
prob=(alphaBig * qboUnobsBigs$freq / denom))
return(qboUnobsBigs)
}
## Returns the total probability mass discounted from all observed trigrams.
## This is the amount of probability mass which is
## redistributed to UNOBSERVED trigrams. If no trigrams starting with
## bigram$ngram[1] exist, 1 is returned.
##
## obsTrigs - 2 column data.frame or data.table. The first column: ngram,
## contains all the observed trigrams that start with the bigram
## prefix we are attempting to the predict the next word of. The
## second column: freq, contains the frequency/count of each trigram.
## bigram - single row frequency table where the first col: ngram, is the bigram
## which are the first two words of unobserved trigrams we want to
## estimate probabilities of (same as bigPre in other functions listed
## prior) delimited with '_'. The second column: freq, is the
## frequency/count of the bigram listed in the ngram column.
## triDisc - amount to discount observed trigrams
getAlphaTrigram <- function(obsTrigs, bigram, triDisc=0.5) {
if(nrow(obsTrigs) < 1) return(1)
alphaTri <- 1 - sum((obsTrigs$freq - triDisc) / bigram$freq[1])
return(alphaTri)
}
## Returns a dataframe of 2 columns: ngram and prob. Values in the ngram
## column are unobserved trigrams of the form: w3_w2_w1. The values in the prob
## column are q_bo(w1 | w3, w2) calculated from equation 17.
##
## bigPre - single-element char array of the form w2_w1 which are first two
## words of the trigram we are predicting the tail word of
## qboObsBigrams - 2 column data.frame with the following columns -
## ngram: observed bigrams of the form w2_w1
## probs: the probability estimate for observed bigrams:
## qbo(w1 | w2) calc'd from equation 10.
## qboUnobsBigrams - 2 column data.frame with the following columns -
## ngram: unobserved bigrams of the form w2_w1
## probs: the probability estimate for unobserved bigrams
## qbo(w1 | w2) calc'd from equation 16.
## alphaTrig - total discounted probability mass at the trigram level
getUnobsTriProbs <- function(bigPre, qboObsBigrams,
qboUnobsBigrams, alphaTrig) {
qboBigrams <- rbind(qboObsBigrams, qboUnobsBigrams)
qboBigrams <- qboBigrams[order(-qboBigrams$prob), ]
sumQboBigs <- sum(qboBigrams$prob)
first_bigPre_word <- str_split(bigPre, "_")[[1]][1]
unobsTrigNgrams <- paste(first_bigPre_word, qboBigrams$ngram, sep="_")
unobsTrigProbs <- alphaTrig * qboBigrams$prob / sumQboBigs
unobsTrigDf <- data.frame(ngram=unobsTrigNgrams, prob=unobsTrigProbs)
return(unobsTrigDf)
}
getPredictionMsg <- function(qbo_trigs) {
# pull off tail word of highest prob trigram
prediction <- str_split(qbo_trigs$ngram[1], "_")[[1]][3]
result <- sprintf("%s%s%s%.4f", "highest prob prediction is >>> ", prediction," <<< which has probability = ", qbo_trigs$prob[1])
return(result)
}
predict_word <- function(bigPre,unigrs,bigrs,trigrs){
gamma2=0.7; gamma3=0.7 # initialize new discount rates
obs_trigs <- getObsTrigs(bigPre, trigrs)
unobs_trig_tails <- getUnobsTrigTails(obs_trigs$ngram, unigrs)
bo_bigrams <- getBOBigrams(bigPre, unobs_trig_tails)
# Separate bigrams into observed and unobserved using the appropriate equations
obs_bo_bigrams <- getObsBOBigrams(bigPre, unobs_trig_tails, bigrs)
unobs_bo_bigrams <- getUnobsBOBigrams(bigPre, unobs_trig_tails, obs_bo_bigrams)
# Calculate observed bigram probabilites
qbo_obs_bigrams <- getObsBigProbs(obs_bo_bigrams, unigrs, gamma2)
# Calculate alpha_big and unobserved bigram probabilities
unig <- str_split(bigPre, "_")[[1]][2]
unig <- unigrs[unigrs$ngram == unig,]
alpha_big <- getAlphaBigram(unig, bigrs, gamma2)
# Distribute discounted bigram probability mass to unobserved bigrams in proportion to unigram ML
qbo_unobs_bigrams <- getQboUnobsBigrams(unobs_bo_bigrams, unigrs, alpha_big)
# Calculate observed trigram probabilities...
qbo_obs_trigrams <- getObsTriProbs(obs_trigs, bigrs, bigPre, gamma3)
# Finally, calculate unobserved trigram probabilities...
bigram <- bigrs[bigrs$ngram %in% bigPre, ]
alpha_trig <- getAlphaTrigram(obs_trigs, bigram, gamma3)
qbo_unobs_trigrams <- getUnobsTriProbs(bigPre, qbo_obs_bigrams,
qbo_unobs_bigrams, alpha_trig)
qbo_trigrams <- rbind(qbo_obs_trigrams, qbo_unobs_trigrams)
qbo_trigrams <- qbo_trigrams[order(-qbo_trigrams$prob), ]
# getPredictionMsg(qbo_trigrams)
return(qbo_trigrams)
}
1. When you breathe, I want to be the air for you. I’ll be there for you, I’d live and I’d…
Options: give, sleep, eat, die
bigPre = "and_id"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
# word_list[grepl("case_of_beer", ngram, fixed=TRUE)]
word_list[((word_list$ngram=="and_id_give") | (word_list$ngram=="and_id_sleep") | (word_list$ngram=="and_id_eat")| (word_list$ngram=="and_id_die")),]
## ngram prob
## 1: and_id_die 5.816474e-03
## 2: and_id_give 3.287572e-03
## 3: and_id_sleep 9.168352e-05
## 4: and_id_eat 8.618158e-05
The correct answer is “die”.
Guy at my table’s wife got up to go to the bathroom and I asked about dessert and he started telling me about his…
Options: horticultural, spiritual (wrong), marital (right), financial
bigPre = "about_his"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="about_his_horticultural") | (word_list$ngram=="about_his_spiritual") | (word_list$ngram=="about_his_marital")| (word_list$ngram=="about_his_financial")),]
## ngram prob
## 1: about_his_financial 5.026958e-05
## 2: about_his_spiritual 2.841324e-05
## 3: about_his_marital 8.999002e-07
## 4: about_his_horticultural 1.451452e-07
The correct answer is “marital”. Unfortunately, the predictor chose “financial”.
I’d give anything to see arctic monkeys this…
Options: weekend, month, decade, morning
bigPre = "monkeys_this"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="monkeys_this_weekend") | (word_list$ngram=="monkeys_this_month") | (word_list$ngram=="monkeys_this_decade")| (word_list$ngram=="monkeys_this_morning")),]
## ngram prob
## 4 monkeys_this_morning 0.0216346950
## 6 monkeys_this_weekend 0.0201449302
## 10 monkeys_this_month 0.0092935572
## 601 monkeys_this_decade 0.0001710471
The correct answer is “weekend”. Unfortunately, the predictor chose “morning”.
Talking to your mom has the same effect as a hug and helps reduce your…
Options: hunger, stress, happiness, sleepiness
bigPre = "reduce_your"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="reduce_your_hunger") | (word_list$ngram=="reduce_your_stress") | (word_list$ngram=="reduce_your_happiness")| (word_list$ngram=="reduce_your_sleepiness")),]
## ngram prob
## 1: reduce_your_happiness 3.034938e-04
## 2: reduce_your_stress 1.173013e-04
## 3: reduce_your_hunger 5.585775e-06
## 4: reduce_your_sleepiness 1.248539e-07
The correct answer is “stress”.
When you were in Holland you were like 1 inch away from me but you hadn’t time to take a…
Options: look (wrong), picture (right), walk, minute
bigPre = "take_a"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="take_a_look") | (word_list$ngram=="take_a_picture") | (word_list$ngram=="take_a_walk")| (word_list$ngram=="take_a_minute")),]
## ngram prob
## 1: take_a_look 0.100365497
## 2: take_a_picture 0.040423977
## 3: take_a_minute 0.010453216
## 4: take_a_walk 0.009722222
The correct answer is “picture”. Unfortunately, the predictor chose “look”.
6. I’d just like all of these questions answered, a presentation of evidence, and a jury to settle the…
Options: incident, account, matter (right), case (wrong)
bigPre = "settle_the"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="settle_the_incident") | (word_list$ngram=="settle_the_account") | (word_list$ngram=="settle_the_matter")| (word_list$ngram=="settle_the_case")),]
## ngram prob
## 1: settle_the_case 0.1210526316
## 2: settle_the_matter 0.0684210526
## 3: settle_the_account 0.0157894737
## 4: settle_the_incident 0.0001910143
The correct answer is “matter”. Unfortunately, the predictor chose “case”.
7. I can’t deal with unsymmetrical things. I can’t even hold an uneven number of bags of groceries in each…
Options: finger, arm, toe, hand
bigPre = "in_each"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="in_each_finger") | (word_list$ngram=="in_each_arm") | (word_list$ngram=="in_each_toe")| (word_list$ngram=="in_each_hand")),]
## ngram prob
## 1: in_each_hand 6.735751e-03
## 2: in_each_arm 1.213297e-05
## 3: in_each_finger 6.763935e-06
## 4: in_each_toe 3.316171e-06
The correct answer is “hand”.
Every inch of you is perfect from the bottom to the…
Options: side, center, top, middle
bigPre = "to_the"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="to_the_side") | (word_list$ngram=="to_the_center") | (word_list$ngram=="to_the_top")| (word_list$ngram=="to_the_middle")),]
## ngram prob
## 1: to_the_top 0.006398844
## 2: to_the_side 0.001878175
## 3: to_the_middle 0.001179102
## 4: to_the_center 0.001085893
The correct answer is “top”.
I’m thankful my childhood was filled with imagination and bruises from playing…
Options: inside, daily, weekly, outside
bigPre = "from_playing"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="from_playing_inside") | (word_list$ngram=="from_playing_daily") | (word_list$ngram=="from_playing_weekly")| (word_list$ngram=="from_playing_outside")),]
## ngram prob
## 1: from_playing_outside 1.409220e-03
## 2: from_playing_inside 7.976720e-05
## 3: from_playing_daily 2.992629e-05
## 4: from_playing_weekly 9.740715e-06
The correct answer is “outside”.
I like how the same people are in almost all of Adam Sandler’s…
Options: stories, movies, pictures, novels
bigPre = "adam_sandlers"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="adam_sandlers_stories") | (word_list$ngram=="adam_sandlers_movies") | (word_list$ngram=="adam_sandlers_pictures")| (word_list$ngram=="adam_sandlers_novels")),]
## ngram prob
## 919 adam_sandlers_stories NaN
## 922 adam_sandlers_pictures NaN
## 1286 adam_sandlers_movies NaN
## 3876 adam_sandlers_novels NaN
The correct answer is “movies”.