Introduction

This report follows the example on pages 64 to 68 of Artificial Intelligence II, Prof. Olga Veksler - Lecture 9 - NLP: Language Models, available here.

library(gtools)

Vocabulary is {a,b,c}

vocabulary <- c("a","b","c")
vocabulary
## [1] "a" "b" "c"

Possible bigrams: {aa,ab,ac,ba,bb,bc,ca,cb,cc}

perm <- permutations(n=3,r=2,v=vocabulary,repeats.allowed=T)
possbigrams <- paste(perm[,1],perm[,2])
possbigrams
## [1] "a a" "a b" "a c" "b a" "b b" "b c" "c a" "c b" "c c"

Corpus

corpus <- c("b","a","b","a","a","c","b","c","a","c","a","c")

Observed bigrams are {ba, ab, ba, aa, ac, cb, bc, ca, ac, ca, ac}

# Remove the first element and add a period at the end
corpus2 <- c(corpus[-1],".")
# Note the length stays the same
length(corpus2)
## [1] 12
# Pair each word with its successor to form the bigrams
bigrams <- paste(corpus, corpus2)
# Drop the final pair, which contains the artificial period
n <- grep("[[:punct:]]", bigrams)
bigrams <- bigrams[-n]
bigrams
##  [1] "b a" "a b" "b a" "a a" "a c" "c b" "b c" "c a" "a c" "c a" "a c"

Unobserved bigrams: bb,cc

possbigrams[which(!(possbigrams %in% bigrams))]
## [1] "b b" "c c"

Observed bigram frequencies:

ab: 1, aa: 1, cb: 1, bc: 1, ba: 2, ca: 2, ac: 3

# Sorted table of bigram type frequencies
freq <- sort(table(bigrams),decreasing = FALSE)
freq 
## bigrams
## a a a b b c c b b a c a a c 
##   1   1   1   1   2   2   3

N0=2, N1=4, N2=2, N3=1, N = 11

# Frequency of frequencies: Nr[r+1] = number of bigram types observed r times
Nr <- tabulate(freq+1L)
# N0 = number of possible bigrams that were never observed
Nr[1] <- nrow(permutations(n=3,r=2,v=vocabulary,repeats.allowed=T)) - length(unique(bigrams))
N  <- sum(freq)
paste(c("N0 =","N1 =","N2 =","N3 =","N ="),c(Nr,N))
## [1] "N0 = 2" "N1 = 4" "N2 = 2" "N3 = 1" "N = 11"
N
## [1] 11
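
As a quick sanity check (my own addition, not in the lecture), the frequency-of-frequency counts must account for every observed bigram token:

# Every observed bigram token is counted exactly once: 0*N0 + 1*N1 + 2*N2 + 3*N3
sum((0:3) * Nr)  # should equal N = 11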

We will use the GT probabilities up to and including r = 2, and the MLE estimate for the largest rate r = 3.
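
Recall the Good-Turing estimate for an n-gram observed r times: PGT = (r+1) * N(r+1) / (N * N(r)), where N(r) is the number of bigram types observed exactly r times.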

Probability estimations:

Use Good-Turing: P(bb)=P(cc)= (0+1)*(N1/(N*N0)) = 4/(11*2) = 2/11 = 0.181818

Use Good-Turing: P(ab)=P(aa)=P(cb)=P(bc)= (1+1)*(N2/(N*N1)) = 4/44 = 1/11 = 0.090909

Use Good-Turing: P(ba)=P(ca)= (2+1)*(N3/(N*N2)) = 3/22 = 0.1363636

Use MLE: P(ac) = 3/11 = 0.2727273

prob <- NULL
r <- 2
# Good-Turing estimate for rates r = 0, 1, 2: (r+1) * N(r+1) / (N * N(r))
for (i in 0:r){
  prob[i+1] <- (i+1)*(Nr[i+2]/(N*Nr[i+1]))
}
# MLE estimate for the highest rate r = 3: r/N
r <- r + 1
prob[r+1] <- r/N
prob_ <- prob

Normalization Case I:

Finally, we renormalize.

Before renormalization:

P’(bb)=P’(cc)= 2/11 = 0.181818

P’(ab)=P’(aa)=P’(cb)=P’(bc)= 1/11 = 0.090909

P’(ba)=P’(ca)= 3/22= 0.1363636

P’(ac) = 3/11 = 0.2727273

I write P’(.) (prob_ in the code) to indicate that the values above are not true probabilities, since they don’t add up to 1.

prob_
## [1] 0.18181818 0.09090909 0.13636364 0.27272727

Renormalize only the weight of the seen bigrams ab, aa, cb, bc, ba, ca, ac; their total weight should be 1 - [P’(bb)+P’(cc)] = 7/11 = 0.6363636.

# Probability mass left for the seen bigrams: 1 - [P'(bb) + P'(cc)]
bigrams_weight <- 1 - prob[1]*Nr[1]
bigrams_weight
## [1] 0.6363636

P’(ab)+P’(aa)+P’(cb)+P’(bc)+P’(ba)+P’(ca)+P’(ac) = 10/11 = 0.9090909

# Current total weight of the seen bigrams
newprob <- prob_[2:4]*Nr[2:4]
totalnewprob <- sum(newprob)
totalnewprob
## [1] 0.9090909

Multiply through by constant (11/10)*(7/11)=7/10 = 0.7

constant <- bigrams_weight / totalnewprob
constant
## [1] 0.7

New probabilities are:

P(bb)=P(cc)= 2/11 = 0.181818

prob_[1] # did not want to change these
## [1] 0.1818182

P(ab)=P(aa)=P(cb)=P(bc)= (1/11)*(7/10)=7/110 = 0.06363636

P(ba)=P(ca)=(3/22)*(7/10)=21/220= 0.09545455

P(ac)=(3/11)*(7/10)=21/110 = 0.1909091

normprob1 <- prob[2:4]*constant
normprob1 <- c(prob[1], normprob1)
normprob1
## [1] 0.18181818 0.06363636 0.09545455 0.19090909

Now the probabilities add up to 1:

checksum <- sum(Nr*normprob1)
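
Checking by hand: 2*(2/11) + 4*(7/110) + 2*(21/220) + 1*(21/110) = (40 + 28 + 21 + 21)/110 = 1.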

Normalization Case II:

We can also renormalize the weights in a simpler manner.

Before renormalization:

P’(bb)=P’(cc)= 2/11 = P’0 = 0.18181818

P’(ab)=P’(aa)=P’(cb)=P’(bc)= 1/11= P’1 = 0.090909

P’(ba)=P’(ca)=3/22= P’2 = 0.1363636

P’(ac) = 3/11= P’3 = 0.2727273

Simply renormalize all “probabilities” P’ so they add up to 1: (1) find their sum; (2) divide each one by the sum.

For efficiency, you want to add them up based on the rates, since n-grams with the same rate have the same probability.

Let Sr be the set of all n-grams that were observed r times; Nr is the size of Sr.

S0 = {bb,cc}, S1 = {ab,aa,cb,bc}, S2 = {ba,ca}, S3 = {ac}

sum = P’0*N0 + P’1*N1 + P’2*N2 + P’3*N3 = (2/11)*2 + (1/11)*4 + (3/22)*2 + (3/11)*1 = 14/11 = 1.2727
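
We can confirm this sum directly from the unnormalized probabilities computed earlier (a small check of my own):

# Weighted sum of the unnormalized probabilities, P'r weighted by Nr
sum(prob_ * Nr)  # 14/11 = 1.272727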

New probabilities are:

P(bb)=P(cc)= (2/11)/(14/11)=2/14 = P0 = 0.1428571

P(ab)=P(aa)=P(cb)=P(bc)= (1/11)/(14/11)=1/14= P1 =0.07142857

P(ba)=P(ca)=(3/22)/(14/11)= 3/28= P2 = 0.1071429

P(ac) = (3/11)/(14/11)=3/14= P3 =0.2142857

normprob2 <- prob/sum(Nr*prob)
normprob2
## [1] 0.14285714 0.07142857 0.10714286 0.21428571

Now the probabilities add up to 1:

checksum <- sum(Nr*normprob2)
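
Checking by hand: 2*(2/14) + 4*(1/14) + 2*(3/28) + 1*(3/14) = (8 + 8 + 6 + 6)/28 = 1.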

Normalized probabilities Case I:

normprob1
## [1] 0.18181818 0.06363636 0.09545455 0.19090909

Normalized probabilities Case II:

normprob2
## [1] 0.14285714 0.07142857 0.10714286 0.21428571

The final probabilities (using the first case of normalization) are:

P(bb)=P(cc)= 2/11

P(ab)=P(aa)=P(cb)=P(bc)= 7/110

P(ba)=P(ca)=21/220

P(ac)=21/110

bigram <- data.frame(word= c("bb","cc","ab","aa","cb","bc","ba","ca","ac"),
       prob = c(rep(0.18181818,2),rep(0.06363636,4),rep(0.09545455,2),
       rep(0.1909091,1)))
bigram 
##   word       prob
## 1   bb 0.18181818
## 2   cc 0.18181818
## 3   ab 0.06363636
## 4   aa 0.06363636
## 5   cb 0.06363636
## 6   bc 0.06363636
## 7   ba 0.09545455
## 8   ca 0.09545455
## 9   ac 0.19090910

Let us calculate P(abcab) using our model

We will need probabilities for the unigrams a, b, c, which we can compute using the MLE estimator:

P(a) = 5/12, P(b) = 3/12, P(c) = 4/12

since a occurs 5 times, b occurs 3 times, and c occurs 4 times in our corpus of 12 unigrams.

table(corpus)/length(corpus)
## corpus
##         a         b         c 
## 0.4166667 0.2500000 0.3333333
unigram <- data.frame(word= c("a","b","c"), prob= c(0.4166667,0.2500000,0.3333333))

P(abcab) ≈ P(a) * P(b|a) * P(c|b) * P(a|c) * P(b|a) = 0.0007084433

# Unigram probabilities
Pa <- unigram$prob[unigram$word== "a"]
Pb <- unigram$prob[unigram$word== "b"]
Pc <- unigram$prob[unigram$word== "c"]
# Joint bigram probabilities
Pab <- bigram$prob[bigram$word== "ab"]
Pca <- bigram$prob[bigram$word== "ca"]
Pbc <- bigram$prob[bigram$word== "bc"]

# Chain rule with conditional probabilities, e.g. P(b|a) = P(ab)/P(a)
Pabcab <- Pa*(Pab/Pa)*(Pbc/Pb)*(Pca/Pc)*(Pab/Pa)
Pabcab
## [1] 0.0007084433
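
The same chain-rule calculation can be wrapped in a small helper that scores any sequence over the vocabulary using the tables above (the function score_sequence is my own addition, not part of the lecture):

# Score a token sequence with the unigram/bigram tables built above,
# using P(w1..wn) ~ P(w1) * prod P(wi | w(i-1)),
# where P(wi | w(i-1)) = P(w(i-1) wi) / P(w(i-1)).
score_sequence <- function(tokens, unigram, bigram) {
  p <- unigram$prob[unigram$word == tokens[1]]
  for (i in 2:length(tokens)) {
    p_joint <- bigram$prob[bigram$word == paste0(tokens[i-1], tokens[i])]
    p_prev  <- unigram$prob[unigram$word == tokens[i-1]]
    p <- p * p_joint / p_prev
  }
  p
}
score_sequence(c("a","b","c","a","b"), unigram, bigram)  # same value as above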

Smoothing: Fixing Good Turing

This section covers page 61 of the lecture notes.

The problem is that Nr is unreliable for high values of r

Solution 1:

Use PGT for low values of r, say for r < 10. For n-grams with higher rates, use PMLE, which is reliable for higher values of r, that is PMLE(w1...wn) = C(w1...wn)/N.
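
A minimal sketch of this idea in R (the function combined_prob and the cutoff value are my own, assuming Nr and N as defined for the toy example above):

# Use the Good-Turing estimate for rare n-grams (r below the cutoff)
# and fall back to the MLE estimate r/N for frequent ones.
# (Assumes Nr is long enough to index N(r+1) for every r below the cutoff.)
combined_prob <- function(r, Nr, N, cutoff = 10) {
  if (r < cutoff) {
    (r + 1) * Nr[r + 2] / (N * Nr[r + 1])  # PGT
  } else {
    r / N                                  # PMLE
  }
}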

Solution 2:

Smooth out the Nr’s by fitting a power law function F(r) = a*r^b (with b < -1) and use it when Nr becomes unreliable. Search for the best a and b < -1 that fit the observed Nr’s.

I use my own bigram frequency file.

load("bigramFreq.Rdata")
head(bigramFreq,10)
##              word freq
## of the     of the 5876
## in the     in the 5542
## to the     to the 2733
## on the     on the 2683
## for the   for the 2631
## to be       to be 1954
## at the     at the 1896
## and the   and the 1704
## in a         in a 1612
## with the with the 1408
# Frequency of frequencies Nr for the real corpus (zero counts dropped)
S <- subset(tabulate(bigramFreq$freq), tabulate(bigramFreq$freq) > 0)
dat <- data.frame(x = 1:length(S), S = S)
# Fit the power law F(r) = i * r^(-z) by nonlinear least squares
nls1 <- nls(S ~ i*x^-z, start = list(i = 300000, z = 3), data = dat)
nls1
## Nonlinear regression model
##   model: S ~ i * x^-z
##    data: dat
##        i        z 
## 4.52e+05 2.99e+00 
##  residual sum-of-squares: 54299017
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 2.3e-06
# Compare the fitted power law with the observed frequencies for r = 1..20
pred <- as.integer(predict(nls1))
plot(dat[1:20,])
lines(pred[1:20])
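
As a sketch of how the fit could be used (my own code, with an arbitrary cutoff), the fitted curve can stand in for Nr wherever the raw count becomes unreliable:

# Use the raw Nr where the count is still reliable, otherwise the fitted
# power law i * r^(-z) from nls1.
co <- coef(nls1)
raw_Nr <- tabulate(bigramFreq$freq)
smooth_Nr <- function(r, cutoff = 20) {
  if (r <= cutoff && raw_Nr[r] > 0) raw_Nr[r] else co[["i"]] * r^(-co[["z"]])
}
smooth_Nr(2)    # raw count, still reliable
smooth_Nr(500)  # smoothed value from the power law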