Data Science in R Specialization

rm(list = ls())
library(quanteda)

## Package version: 2.1.2

## Parallel computing: 2 of 4 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following object is masked from 'package:utils':
## 
##     View

library(data.table)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)

Read in the unigram, bigram, and trigam entries and frequencies…

unigrs <- fread("unigrs.csv")
bigrs <- fread("bigrs.csv")
trigrs <- fread("trigrs.csv")

Process a number of functions for our analysis…

## Returns a two column data.frame of observed trigrams that start with the
## bigram prefix (bigPre) in the first column named ngram and
## frequencies/counts in the second column named freq. If no observed trigrams
## that start with bigPre exist, an empty data.frame is returned.
##
## bigPre -  single-element char array of the form w2_w1 which are the first 
##           two words of the trigram we are predicting the tail word of
## trigrams - 2 column data.frame or data.table. The first column: ngram,
##            contains all the trigrams in the corpus. The second column:
##            freq, contains the frequency/count of each trigram.
getObsTrigs <- function(bigPre, trigrams) {
    trigs.winA <- data.frame(ngrams=vector(mode = 'character', length = 0),
                             freq=vector(mode = 'integer', length = 0))
    regex <- sprintf("%s%s%s", "^", bigPre, "_")
    trigram_indices <- grep(regex, trigrams$ngram)
    if(length(trigram_indices) > 0) {
        trigs.winA <- trigrams[trigram_indices, ]
    }
    
    return(trigs.winA)
}

## Returns a two column data.frame of observed trigrams that start with bigram
## prefix bigPre in the first column named ngram and the probabilities
## q_bo(w_i | w_i-2, w_i-1) in the second column named prob calculated from
## eqn 12. If no observed trigrams starting with bigPre exist, NULL is returned.
##
## obsTrigs - 2 column data.frame or data.table. The first column: ngram,
##            contains all the observed trigrams that start with the bigram
##            prefix bigPre which we are attempting to the predict the next
##            word of in a give phrase. The second column: freq, contains the
##            frequency/count of each trigram.
## bigrs - 2 column data.frame or data.table. The first column: ngram,
##         contains all the bigrams in the corpus. The second column:
##         freq, contains the frequency/count of each bigram.
## bigPre -  single-element char array of the form w2_w1 which are first two
##           words of the trigram we are predicting the tail word of
## triDisc - amount to discount observed trigrams
getObsTriProbs <- function(obsTrigs, bigrs, bigPre, triDisc=0.5) {
    if(nrow(obsTrigs) < 1) return(NULL)
    obsCount <- subset(bigrs, ngram == bigPre)$freq[1]
    obsTrigProbs <- mutate(obsTrigs, freq=((freq - triDisc) / obsCount))
    colnames(obsTrigProbs) <- c("ngram", "prob")
    
    return(obsTrigProbs)
}

## Returns a character vector which are the tail words of unobserved trigrams
## that start with the first two words of obsTrigs (aka the bigram prefix).
## These are the words w in the set B(w_i-2, w_i-1) as defined in the section
## describing the details of equation 17.
##
## obsTrigs - character vector of observed trigrams delimited by _ of the form:
##            w3_w2_w1 where w3_w2 is the bigram prefix
## unigs - 2 column data.frame of all the unigrams in the corpus:
##         ngram = unigram
##         freq = frequency/count of each unigram

getUnobsTrigTails <- function(obsTrigs, unigs) {
    obs_trig_tails <- str_split_fixed(obsTrigs, "_", 3)[, 3]
    unobs_trig_tails <- unigs[!(unigs$ngram %in% obs_trig_tails), ]$ngram
    return(unobs_trig_tails)
}

## Returns the total probability mass discounted from all observed bigrams
## calculated from equation 14.  This is the amount of probability mass which
## is redistributed to UNOBSERVED bigrams. If no bigrams starting with
## unigram$ngram[1] exist, 0 is returned.
##
## unigram - single row, 2 column frequency table. The first column: ngram,
##           contains the w_i-1 unigram (2nd word of the bigram prefix). The
##           second column: freq, contains the frequency/count of this unigram.
## bigrams - 2 column data.frame or data.table. The first column: ngram,
##           contains all the bigrams in the corpus. The second column:
##           freq, contains the frequency or count of each bigram.
## bigDisc - amount to discount observed bigrams

getAlphaBigram <- function(unigram, bigrams, bigDisc=0.5) {
    # get all bigrams that start with unigram
    regex <- sprintf("%s%s%s", "^", unigram$ngram[1], "_")
    bigsThatStartWithUnig <- bigrams[grep(regex, bigrams$ngram),]
    if(nrow(bigsThatStartWithUnig) < 1) return(0)
    alphaBi <- 1 - (sum(bigsThatStartWithUnig$freq - bigDisc) / unigram$freq)
    
    return(alphaBi)
}

## Returns a character vector of backed off bigrams of the form w2_w1. These 
## are all the (w_i-1, w) bigrams where w_i-1 is the tail word of the bigram
## prefix bigPre and w are the tail words of unobserved bigrams that start with
## w_i-1.
##
## bigPre - single-element char array of the form w2_w1 which are first two
##          words of the trigram we are predicting the tail word of
## unobsTrigTails - character vector that are tail words of unobserved trigrams
getBOBigrams <- function(bigPre, unobsTrigTails) {
    w_i_minus1 <- str_split(bigPre, "_")[[1]][2]
    boBigrams <- paste(w_i_minus1, unobsTrigTails, sep = "_")
    return(boBigrams)
}

## Returns a two column data.frame of backed-off bigrams in the first column
## named ngram and their frequency/counts in the second column named freq.
## 
## bigPre -  single-element char array of the form w2_w1 which are first two
##           words of the trigram we are predicting the tail word of
## unobsTrigTails - character vector that are tail words of unobserved trigrams
## bigrs - 2 column data.frame or data.table. The first column: ngram,
##         contains all the bigrams in the corpus. The second column:
##         freq, contains the frequency/count of each bigram.
getObsBOBigrams <- function(bigPre, unobsTrigTails, bigrs) {
    boBigrams <- getBOBigrams(bigPre, unobsTrigTails)
    obs_bo_bigrams <- bigrs[bigrs$ngram %in% boBigrams, ]
    return(obs_bo_bigrams)
}

## Returns a character vector of backed-off bigrams which are unobserved.
##
## bigPre -  single-element char array of the form w2_w1 which are first two
##           words of the trigram we are predicting the tail word of
## unobsTrigTails - character vector that are tail words of unobserved trigrams
## obsBOBigram - data.frame which contains the observed bigrams in a column
##               named ngram
getUnobsBOBigrams <- function(bigPre, unobsTrigTails, obsBOBigram) {
    boBigrams <- getBOBigrams(bigPre, unobsTrigTails)
    unobs_bigs <- boBigrams[!(boBigrams %in% obsBOBigram$ngram)]
    return(unobs_bigs)
}

## Returns a dataframe of 2 columns: ngram and probs.  Values in the ngram
## column are bigrams of the form: w2_w1 which are observed as the last
## two words in unobserved trigrams.  The values in the prob column are
## q_bo(w1 | w2) calculated from from equation 10.
##
## obsBOBigrams - a dataframe with 2 columns: ngram and freq. The ngram column
##                contains bigrams of the form w1_w2 which are observed bigrams
##                that are the last 2 words of unobserved trigrams (i.e. "backed
##                off" bigrams). The freq column contains integers that are
##                the counts of these observed bigrams in the corpus.
## unigs - 2 column data.frame of all the unigrams in the corpus:
##         ngram = unigram
##         freq = frequency/count of each unigram
## bigDisc - amount to discount observed bigrams
getObsBigProbs <- function(obsBOBigrams, unigs, bigDisc=0.5) {
    first_words <- str_split_fixed(obsBOBigrams$ngram, "_", 2)[, 1]
    first_word_freqs <- unigs[unigs$ngram %in% first_words, ]
    obsBigProbs <- (obsBOBigrams$freq - bigDisc) / first_word_freqs$freq
    obsBigProbs <- data.frame(ngram=obsBOBigrams$ngram, prob=obsBigProbs)
    
    return(obsBigProbs)
}

## Returns a dataframe of 2 columns: ngram and prob.  Values in the ngram
## column are unobserved bigrams of the form: w2_w1.  The values in the prob
## column are the backed off probability estimates q_bo(w1 | w2) calculated
## from from equation 16.
##
## unobsBOBigrams - character vector of unobserved backed off bigrams
## unigs - 2 column data.frame of all the unigrams in the corpus:
##         ngram = unigram
##         freq = frequency/count of each unigram
## alphaBig - total discounted probability mass at the bigram level
getQboUnobsBigrams <- function(unobsBOBigrams, unigs, alphaBig) {
    # get the unobserved bigram tails
    qboUnobsBigs <- str_split_fixed(unobsBOBigrams, "_", 2)[, 2]
    w_in_Aw_iminus1 <- unigs[!(unigs$ngram %in% qboUnobsBigs), ]
    # convert to data.frame with counts
    qboUnobsBigs <- unigs[unigs$ngram %in% qboUnobsBigs, ]
    denom <- sum(qboUnobsBigs$freq)
    # converts counts to probabilities
    qboUnobsBigs <- data.frame(ngram=unobsBOBigrams,
                               prob=(alphaBig * qboUnobsBigs$freq / denom))
    
    return(qboUnobsBigs)
}

## Returns the total probability mass discounted from all observed trigrams.
## This is the amount of probability mass which is
## redistributed to UNOBSERVED trigrams. If no trigrams starting with
## bigram$ngram[1] exist, 1 is returned.
##
## obsTrigs - 2 column data.frame or data.table. The first column: ngram,
##            contains all the observed trigrams that start with the bigram
##            prefix we are attempting to the predict the next word of. The 
##            second column: freq, contains the frequency/count of each trigram.
## bigram - single row frequency table where the first col: ngram, is the bigram
##          which are the first two words of unobserved trigrams we want to
##          estimate probabilities of (same as bigPre in other functions listed
##          prior) delimited with '_'. The second column: freq, is the
##          frequency/count of the bigram listed in the ngram column.
## triDisc - amount to discount observed trigrams
getAlphaTrigram <- function(obsTrigs, bigram, triDisc=0.5) {
    if(nrow(obsTrigs) < 1) return(1)
    alphaTri <- 1 - sum((obsTrigs$freq - triDisc) / bigram$freq[1])
    
    return(alphaTri)
}

## Returns a dataframe of 2 columns: ngram and prob.  Values in the ngram
## column are unobserved trigrams of the form: w3_w2_w1.  The values in the prob
## column are q_bo(w1 | w3, w2) calculated from equation 17.
##
## bigPre -  single-element char array of the form w2_w1 which are first two
##           words of the trigram we are predicting the tail word of
## qboObsBigrams - 2 column data.frame with the following columns -
##                 ngram: observed bigrams of the form w2_w1
##                 probs: the probability estimate for observed bigrams:
##                        qbo(w1 | w2) calc'd from equation 10.
## qboUnobsBigrams - 2 column data.frame with the following columns -
##                   ngram: unobserved bigrams of the form w2_w1
##                   probs: the probability estimate for unobserved bigrams
##                          qbo(w1 | w2) calc'd from equation 16.
## alphaTrig - total discounted probability mass at the trigram level
getUnobsTriProbs <- function(bigPre, qboObsBigrams,
                             qboUnobsBigrams, alphaTrig) {
    qboBigrams <- rbind(qboObsBigrams, qboUnobsBigrams)
    qboBigrams <- qboBigrams[order(-qboBigrams$prob), ]
    sumQboBigs <- sum(qboBigrams$prob)
    first_bigPre_word <- str_split(bigPre, "_")[[1]][1]
    unobsTrigNgrams <- paste(first_bigPre_word, qboBigrams$ngram, sep="_")
    unobsTrigProbs <- alphaTrig * qboBigrams$prob / sumQboBigs
    unobsTrigDf <- data.frame(ngram=unobsTrigNgrams, prob=unobsTrigProbs)
    
    return(unobsTrigDf)
}

getPredictionMsg <- function(qbo_trigs) {
    # pull off tail word of highest prob trigram
    prediction <- str_split(qbo_trigs$ngram[1], "_")[[1]][3]
    result <- sprintf("%s%s%s%.4f", "highest prob prediction is >>> ", prediction," <<< which has probability = ", qbo_trigs$prob[1])
    return(result)
}

predict_word <- function(bigPre,unigrs,bigrs,trigrs){
  gamma2=0.7; gamma3=0.7  # initialize new discount rates
  obs_trigs <- getObsTrigs(bigPre, trigrs)
  unobs_trig_tails <- getUnobsTrigTails(obs_trigs$ngram, unigrs)
  bo_bigrams <- getBOBigrams(bigPre, unobs_trig_tails)

  # Separate bigrams into observed and unobserved using the appropriate equations
  obs_bo_bigrams <- getObsBOBigrams(bigPre, unobs_trig_tails, bigrs)
  unobs_bo_bigrams <- getUnobsBOBigrams(bigPre, unobs_trig_tails, obs_bo_bigrams)

  # Calculate observed bigram probabilites
  qbo_obs_bigrams <- getObsBigProbs(obs_bo_bigrams, unigrs, gamma2)

  # Calculate alpha_big and unobserved bigram probabilities
  unig <- str_split(bigPre, "_")[[1]][2]
  unig <- unigrs[unigrs$ngram == unig,]
  alpha_big <- getAlphaBigram(unig, bigrs, gamma2)

  # Distribute discounted bigram probability mass to unobserved bigrams in   proportion to unigram ML
  qbo_unobs_bigrams <- getQboUnobsBigrams(unobs_bo_bigrams, unigrs, alpha_big)

  # Calculate observed trigram probabilities...
  qbo_obs_trigrams <- getObsTriProbs(obs_trigs, bigrs, bigPre, gamma3)

  # Finally, calculate unobserved trigram probabilities...
  bigram <- bigrs[bigrs$ngram %in% bigPre, ]
  alpha_trig <- getAlphaTrigram(obs_trigs, bigram, gamma3)
  qbo_unobs_trigrams <- getUnobsTriProbs(bigPre, qbo_obs_bigrams,
                                       qbo_unobs_bigrams, alpha_trig)
  qbo_trigrams <- rbind(qbo_obs_trigrams, qbo_unobs_trigrams)
  qbo_trigrams <- qbo_trigrams[order(-qbo_trigrams$prob), ]
  # getPredictionMsg(qbo_trigrams)
  return(qbo_trigrams)
}

For each of the sentence fragments below, we will use our natural language processing algorithm to predict the next word in the sentence.

Question 1

1. When you breathe, I want to be the air for you. I’ll be there for you, I’d live and I’d…

Options: give, sleep, eat, die

bigPre = "and_id"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
# word_list[grepl("case_of_beer", ngram, fixed=TRUE)]
word_list[((word_list$ngram=="and_id_give") | (word_list$ngram=="and_id_sleep") | (word_list$ngram=="and_id_eat")| (word_list$ngram=="and_id_die")),]

##           ngram         prob
## 1:   and_id_die 5.816474e-03
## 2:  and_id_give 3.287572e-03
## 3: and_id_sleep 9.168352e-05
## 4:   and_id_eat 8.618158e-05

The correct answer is “die”.

Question 2

Guy at my table’s wife got up to go to the bathroom and I asked about dessert and he started telling me about his…

Options: horticultural, spiritual (wrong), marital (right), financial

bigPre = "about_his"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="about_his_horticultural") | (word_list$ngram=="about_his_spiritual") | (word_list$ngram=="about_his_marital")| (word_list$ngram=="about_his_financial")),]

##                      ngram         prob
## 1:     about_his_financial 5.026958e-05
## 2:     about_his_spiritual 2.841324e-05
## 3:       about_his_marital 8.999002e-07
## 4: about_his_horticultural 1.451452e-07

The correct answer is “marital”. Unfortunately, the predictor chose “financial”.

Question 3

I’d give anything to see arctic monkeys this…

Options: weekend, month, decade, morning

bigPre = "monkeys_this"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="monkeys_this_weekend") | (word_list$ngram=="monkeys_this_month") | (word_list$ngram=="monkeys_this_decade")| (word_list$ngram=="monkeys_this_morning")),]

##                    ngram         prob
## 4   monkeys_this_morning 0.0216346950
## 6   monkeys_this_weekend 0.0201449302
## 10    monkeys_this_month 0.0092935572
## 601  monkeys_this_decade 0.0001710471

The correct answer is “weekend”. Unfortunately, the predictor chose “morning”.

Question 4

Talking to your mom has the same effect as a hug and helps reduce your…

Options: hunger, stress, happiness, sleepiness

bigPre = "reduce_your"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="reduce_your_hunger") | (word_list$ngram=="reduce_your_stress") | (word_list$ngram=="reduce_your_happiness")| (word_list$ngram=="reduce_your_sleepiness")),]

##                     ngram         prob
## 1:  reduce_your_happiness 3.034938e-04
## 2:     reduce_your_stress 1.173013e-04
## 3:     reduce_your_hunger 5.585775e-06
## 4: reduce_your_sleepiness 1.248539e-07

The correct answer is “stress”.

Question 5

When you were in Holland you were like 1 inch away from me but you hadn’t time to take a…

Options: look (wrong), picture (right), walk, minute

bigPre = "take_a"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="take_a_look") | (word_list$ngram=="take_a_picture") | (word_list$ngram=="take_a_walk")| (word_list$ngram=="take_a_minute")),]

##             ngram        prob
## 1:    take_a_look 0.100365497
## 2: take_a_picture 0.040423977
## 3:  take_a_minute 0.010453216
## 4:    take_a_walk 0.009722222

The correct answer is “picture”. Unfortunately, the predictor chose “look”.

Question 6

6. I’d just like all of these questions answered, a presentation of evidence, and a jury to settle the…

Options: incident, account, matter (right), case (wrong)

bigPre = "settle_the"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="settle_the_incident") | (word_list$ngram=="settle_the_account") | (word_list$ngram=="settle_the_matter")| (word_list$ngram=="settle_the_case")),]

##                  ngram         prob
## 1:     settle_the_case 0.1210526316
## 2:   settle_the_matter 0.0684210526
## 3:  settle_the_account 0.0157894737
## 4: settle_the_incident 0.0001910143

The correct answer is “matter”. Unfortunately, the predictor chose “case”.

Question 7

7. I can’t deal with unsymmetrical things. I can’t even hold an uneven number of bags of groceries in each…

Options: finger, arm, toe, hand

bigPre = "in_each"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="in_each_finger") | (word_list$ngram=="in_each_arm") | (word_list$ngram=="in_each_toe")| (word_list$ngram=="in_each_hand")),]

##             ngram         prob
## 1:   in_each_hand 6.735751e-03
## 2:    in_each_arm 1.213297e-05
## 3: in_each_finger 6.763935e-06
## 4:    in_each_toe 3.316171e-06

The correct answer is “hand”.

Question 8

Every inch of you is perfect from the bottom to the…

Options: side, center, top, middle

bigPre = "to_the"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="to_the_side") | (word_list$ngram=="to_the_center") | (word_list$ngram=="to_the_top")| (word_list$ngram=="to_the_middle")),]

##            ngram        prob
## 1:    to_the_top 0.006398844
## 2:   to_the_side 0.001878175
## 3: to_the_middle 0.001179102
## 4: to_the_center 0.001085893

The correct answer is “top”.

Question 9

I’m thankful my childhood was filled with imagination and bruises from playing…

Options: inside, daily, weekly, outside

bigPre = "from_playing"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="from_playing_inside") | (word_list$ngram=="from_playing_daily") | (word_list$ngram=="from_playing_weekly")| (word_list$ngram=="from_playing_outside")),]

##                   ngram         prob
## 1: from_playing_outside 1.409220e-03
## 2:  from_playing_inside 7.976720e-05
## 3:   from_playing_daily 2.992629e-05
## 4:  from_playing_weekly 9.740715e-06

The correct answer is “outside”.

Question 10

I like how the same people are in almost all of Adam Sandler’s…

Options: stories, movies, pictures, novels

bigPre = "adam_sandlers"
word_list <- predict_word(bigPre,unigrs,bigrs,trigrs)
word_list[((word_list$ngram=="adam_sandlers_stories") | (word_list$ngram=="adam_sandlers_movies") | (word_list$ngram=="adam_sandlers_pictures")| (word_list$ngram=="adam_sandlers_novels")),]

##                       ngram prob
## 919   adam_sandlers_stories  NaN
## 922  adam_sandlers_pictures  NaN
## 1286   adam_sandlers_movies  NaN
## 3876   adam_sandlers_novels  NaN

The correct answer is “movies”.

Data Science in R Specialization - Capstone Project

Natural Language Processing II - Week 4 Quiz (10% N-grams)

Ken Wood

10/18/2020