This report follows the example from pages 64 to 68 of Artificial Intelligence II, Prof. Olga Veksler - Lecture 9 - NLP: Language Models, available here.
library(gtools)
vocabulary <- c("a","b","c")
vocabulary
## [1] "a" "b" "c"
# All 3^2 = 9 possible bigram types over the vocabulary (repetition allowed)
perm <- permutations(n=3,r=2,v=vocabulary,repeats.allowed=T)
possbigrams <- paste(perm[,1],perm[,2])
possbigrams
## [1] "a a" "a b" "a c" "b a" "b b" "b c" "c a" "c b" "c c"
corpus <- c("b","a","b","a","a","c","b","c","a","c","a","c")
# Remove the first element and add a period at the end
corpus2 <- c(corpus[-1],".")
# Note the length stays the same
length(corpus2)
## [1] 12
# Pair each token with its successor to form the bigram tokens
bigrams <- paste(corpus, corpus2)
# Drop the final pair, which contains the sentinel period
n <- grep("[[:punct:]]", bigrams)
bigrams <- bigrams[-n]
bigrams
## [1] "b a" "a b" "b a" "a a" "a c" "c b" "b c" "c a" "a c" "c a" "a c"
# Possible bigram types that never occur in the corpus (the unseen bigrams)
possbigrams[which(!(possbigrams %in% bigrams))]
## [1] "b b" "c c"
# Sorted table of bigram type frequencies
freq <- sort(table(bigrams),decreasing = FALSE)
freq
## bigrams
## a a a b b c c b b a c a a c
##   1   1   1   1   2   2   3
# Nr[r+1] = number of bigram types seen exactly r times
Nr <- tabulate(freq+1L)
# N0 = number of possible bigram types never observed in the corpus
Nr[1] <- length(possbigrams) - length(unique(bigrams))
N <- sum(freq)
paste(c("N0 =","N1 =","N2 =","N3 =","N ="),c(Nr,N))
## [1] "N0 = 2" "N1 = 4" "N2 = 2" "N3 = 1" "N = 11"
N
## [1] 11
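A quick sanity check: the Nr counts cover all nine possible bigram types, and the count-weighted sum recovers the 11 bigram tokens in the corpus.

sum(Nr)
## [1] 9
sum((0:3) * Nr)
## [1] 11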
prob <- NULL
# Good-Turing estimate for r = 0, 1, 2: p_r = (r+1) * N_{r+1} / (N * N_r)
r <- 2
for (i in 0:r){
  prob[i+1] <- (i+1)*(Nr[i+2]/(N*Nr[i+1]))
}
# Keep the maximum-likelihood estimate r/N for the largest observed count, r = 3
r <- r + 1
prob[r+1] <- r/N
prob_ <- prob
prob_
## [1] 0.18181818 0.09090909 0.13636364 0.27272727
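The loop above can be cross-checked in one vectorized step, p_r = (r+1) * N_{r+1} / (N * N_r) for r = 0, 1, 2, with the maximum-likelihood estimate for the largest count appended; it reproduces the same values:

r_vals <- 0:2
c((r_vals+1) * Nr[r_vals+2] / (N * Nr[r_vals+1]), 3/N)
## [1] 0.18181818 0.09090909 0.13636364 0.27272727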
# Probability mass left over for the seen bigrams
# (1 minus the mass assigned to the N0 unseen types)
bigrams_weight <- 1 - prob[1]*Nr[1]
bigrams_weight
## [1] 0.6363636
newprob <- prob_[2:4]*Nr[2:4]
totalnewprob <- sum(newprob)
totalnewprob
## [1] 0.9090909
constant <- bigrams_weight / totalnewprob
constant
## [1] 0.7
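As a check, multiplying the total seen-bigram probability by this constant recovers the weight reserved for the seen bigrams:

constant * totalnewprob
## [1] 0.6363636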
prob_[1] # keep the unseen-bigram probability unchanged
## [1] 0.1818182
normprob1 <- prob[2:4]*constant
normprob1 <- c(prob[1], normprob1)
normprob1
## [1] 0.18181818 0.06363636 0.09545455 0.19090909
checksum <- sum(Nr*normprob1)
# Alternative normalization: rescale all four estimates, including the unseen one
normprob2 <- prob/sum(Nr*prob)
normprob2
## [1] 0.14285714 0.07142857 0.10714286 0.21428571
checksum <- sum(Nr*normprob2)
# Compare the two normalizations side by side
normprob1
## [1] 0.18181818 0.06363636 0.09545455 0.19090909
normprob2
## [1] 0.14285714 0.07142857 0.10714286 0.21428571
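The checksums above are assigned but never printed; evaluating them confirms that both versions distribute the full probability mass over the nine bigram types:

sum(Nr * normprob1)
## [1] 1
sum(Nr * normprob2)
## [1] 1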
bigram <- data.frame(word= c("bb","cc","ab","aa","cb","bc","ba","ca","ac"),
prob = c(rep(0.18181818,2),rep(0.06363636,4),rep(0.09545455,2),
rep(0.1909091,1)))
bigram
## word prob
## 1 bb 0.18181818
## 2 cc 0.18181818
## 3 ab 0.06363636
## 4 aa 0.06363636
## 5 cb 0.06363636
## 6 bc 0.06363636
## 7 ba 0.09545455
## 8 ca 0.09545455
## 9 ac 0.19090910
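The same table can be built programmatically from the objects defined earlier instead of typing the probabilities in by hand (a small sketch; the rows come out in a different order):

# Look up each possible bigram's observed count and map it to the smoothed probability
counts <- sapply(possbigrams, function(bg) sum(bigrams == bg))
data.frame(word = gsub(" ", "", possbigrams), prob = unname(normprob1[counts + 1]))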
# Unigram (maximum-likelihood) probabilities from the corpus
table(corpus)/length(corpus)
## corpus
## a b c
## 0.4166667 0.2500000 0.3333333
unigram <- data.frame(word= c("a","b","c"), prob= c(0.4166667,0.2500000,0.3333333))
Pa <- unigram$prob[unigram$word== "a"]
Pb <- unigram$prob[unigram$word== "b"]
Pc <- unigram$prob[unigram$word== "c"]
Pab <- bigram$prob[bigram$word== "ab"]
Pca <- bigram$prob[bigram$word== "ca"]
Pbc <- bigram$prob[bigram$word== "bc"]
# P("a b c a b") = P(a) P(b|a) P(c|b) P(a|c) P(b|a), with P(y|x) = P(xy)/P(x)
Pabcab <- Pa*(Pab/Pa)*(Pbc/Pb)*(Pca/Pc)*(Pab/Pa)
Pabcab
## [1] 0.0007084433
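The same chain-rule computation can be wrapped in a small helper (a sketch; the bigram entries are joint probabilities, so each conditional is the joint probability divided by the unigram probability of the preceding word):

seq_prob <- function(words, unigram, bigram) {
  p <- unigram$prob[unigram$word == words[1]]
  for (i in 2:length(words)) {
    joint <- bigram$prob[bigram$word == paste0(words[i-1], words[i])]
    p <- p * joint / unigram$prob[unigram$word == words[i-1]]
  }
  p
}
seq_prob(c("a","b","c","a","b"), unigram, bigram)  # same value as Pabcab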
This section covers page 61 of the lecture notes.
I use my own bigram frequency file.
load("bigramFreq.Rdata")
head(bigramFreq,10)
## word freq
## of the of the 5876
## in the in the 5542
## to the to the 2733
## on the on the 2683
## for the for the 2631
## to be to be 1954
## at the at the 1896
## and the and the 1704
## in a in a 1612
## with the with the 1408
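The frequency file itself was built elsewhere; purely as an illustration, a table of the same shape can be produced from any tokenized text (a minimal sketch using a made-up one-line corpus, not the data behind bigramFreq.Rdata):

text <- "of the people by the people for the people"
tokens <- unlist(strsplit(text, "\\s+"))
bg <- paste(head(tokens, -1), tail(tokens, -1))
bgfreq <- sort(table(bg), decreasing = TRUE)
data.frame(word = names(bgfreq), freq = as.integer(bgfreq))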
# Frequency of frequencies: S gives the number of bigram types per frequency value (zero entries dropped)
S <- subset(tabulate(bigramFreq$freq), tabulate(bigramFreq$freq) > 0)
dat <- data.frame(x = 1:length(S), S = S)
# Fit a power-law (Zipf-like) curve S ~ i * x^-z by nonlinear least squares
nls1 <- nls(S~i*x^-z,start=list(i=300000,z=3),data=dat)
nls1
## Nonlinear regression model
## model: S ~ i * x^-z
## data: dat
## i z
## 4.52e+05 2.99e+00
## residual sum-of-squares: 54299017
##
## Number of iterations to convergence: 4
## Achieved convergence tolerance: 2.3e-06
# Observed frequency of frequencies for the first 20 values of x,
# with the fitted power-law curve overlaid
pred <- as.integer(predict(nls1))
plot(dat[1:20,])
lines(pred[1:20])
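The fitted curve can also be evaluated at points beyond the 20 values plotted above, for example:

# Fitted value of the power-law curve at x = 50
predict(nls1, newdata = data.frame(x = 50))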