In this report, we will do two tasks:

* understand the distribution of words and the relationships between words in the corpora
* understand the frequencies of words and word pairs
usblogs_file = 'final/en_US/en_US.blogs.txt'
usnews_file = 'final/en_US/en_US.news.txt'
ustwitter_file = 'final/en_US/en_US.twitter.txt'
usblogs = readLines(usblogs_file)
usnews = readLines(usnews_file)
## Warning in readLines(usnews_file): incomplete final line found on 'final/en_US/
## en_US.news.txt'
ustwitter = readLines(ustwitter_file)
## Warning in readLines(ustwitter_file): line 167155 appears to contain an embedded
## nul
## Warning in readLines(ustwitter_file): line 268547 appears to contain an embedded
## nul
## Warning in readLines(ustwitter_file): line 1274086 appears to contain an
## embedded nul
## Warning in readLines(ustwitter_file): line 1759032 appears to contain an
## embedded nul
files = c(usblogs_file,usnews_file,ustwitter_file)
mbsizes = sapply(files, function(x) {file.size(x)/1024^2})
library(stringr)
stats = sapply(list(usblogs,usnews,ustwitter),function(x){ c(length(x) , sum(str_count(x,'\\S+')) )}) # line count and word count for each data set
invisible(gc())
stats = rbind(mbsizes,stats)
stats = as.data.frame(stats)
names(stats) = c('usblogs','usnews','ustwitter')
row.names(stats) = c('filesize(MB)','lines','word_count')
library(knitr)
kable(stats,digits = 0)
|              |  usblogs |  usnews | ustwitter |
|--------------|---------:|--------:|----------:|
| filesize(MB) |      200 |     196 |       159 |
| lines        |   899288 |   77259 |   2360148 |
| word_count   | 37334441 | 2643972 |  30373792 |
Build and Clean Corpus
In this section, we’ll build and clean the corpus.
First, since the raw texts are too large to process in full, we’ll sample just 0.1% of each text (i.e. blogs, news, and twitter) by keeping every 1000th line.
usblogs = usblogs[seq(1,length(usblogs),1000)]
usnews = usnews[seq(1,length(usnews),1000)]
ustwitter = ustwitter[seq(1,length(ustwitter),1000)]
invisible(gc())
raw_text = c(usblogs,usnews,ustwitter)
invisible(gc())
Second, we’ll build the corpus.
library(tm)
## Warning: package 'tm' was built under R version 4.0.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.0.3
# make a volatile corpus
raw_source = VectorSource(raw_text)
invisible(gc())
raw_corpus <- VCorpus(raw_source)
invisible(gc())
We’ll then clean the corpus:

* convert words to lower case
* collapse extra white space
* remove punctuation
* remove numbers
* remove the standalone ‘th’ left over from ordinals like 4th
* remove URLs
* remove non-ASCII characters
* collapse letters repeated three or more times in a word
# clean the corpus
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(tolower)) # convert to lower case
  corpus <- tm_map(corpus, stripWhitespace) # collapse extra white space
  corpus <- tm_map(corpus, removePunctuation) # remove punctuation
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("[[:digit:]]","",x))) # remove numbers
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("\\bth\\b","",x))) # remove the standalone 'th' left by ordinals like 4th once digits are gone
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("http[[:alnum:]]*","",x))) # remove urls
  corpus <- tm_map(corpus,content_transformer(function(x) iconv(x, "latin1", "ASCII", sub=""))) # remove non-ASCII characters
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x))) # collapse letters repeated 3+ times to 2
  gc()
  return(corpus)
}
corpus <- clean_corpus(raw_corpus)
save(corpus,file='corpus.RData')
Build N-Gram model
In this section, we’ll build N-Gram models, namely uni-gram, bi-gram, and tri-gram.
The top 10 word frequencies and the word-coverage curves are also plotted.
load('corpus.RData')
# unigram
library(tm)
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.0.3
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
tdm1<-TermDocumentMatrix(corpus,control = list(tokenize = unigram))
invisible(gc())
wordMatrix1 = as.data.frame((as.matrix( tdm1 )) )
invisible(gc())
v1 <- sort(rowSums(wordMatrix1),decreasing=TRUE)
d1 <- data.frame(word = names(v1),freq=v1)
rm(tdm1,wordMatrix1)
for(i in 1:10) gc() # call gc() a few times to release memory after removing the large matrices
# word probability
d1$prob = d1$freq/sum(d1$freq)
barplot(d1[1:10,c('prob')],names.arg = d1[1:10,'word'],main = 'probability of uni-gram words')
# word coverage
d1$cum_prob = cumsum(d1$prob)
plot(d1$cum_prob,type='l',ylab = 'cumulative probability',main = 'coverage of uni-gram words')
save(d1,file='d1.RData')
# bigram
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm2<-TermDocumentMatrix(corpus,control = list(tokenize = bigram))
invisible(gc())
wordMatrix2 = as.matrix( tdm2 )
wordMatrix2 = as.data.frame(wordMatrix2 )
invisible(gc())
v2 <- sort(rowSums(wordMatrix2),decreasing=TRUE)
d2 <- data.frame(word = names(v2),freq=v2)
rm(tdm2,wordMatrix2)
for(i in 1:10) gc()
# word probability
d2$prob = d2$freq/sum(d2$freq)
barplot(d2[1:10,c('prob')],names.arg = d2[1:10,'word'],main = 'probability of bi-gram words')
# word coverage
d2$cum_prob = cumsum(d2$prob)
plot(d2$cum_prob,type='l',ylab = 'cumulative probability',main = 'coverage of bi-gram words')
save(d2,file='d2.RData')
# trigram
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm3<-TermDocumentMatrix(corpus,control = list(tokenize = trigram))
invisible(gc())
wordMatrix3 = as.matrix( tdm3 )
wordMatrix3 = as.data.frame(wordMatrix3 )
invisible(gc())
v3 <- sort(rowSums(wordMatrix3),decreasing=TRUE)
d3 <- data.frame(word = names(v3),freq=v3)
rm(tdm3,wordMatrix3)
for(i in 1:10) gc()
# word probability
d3$prob = d3$freq/sum(d3$freq)
barplot(d3[1:10,c('prob')],names.arg = d3[1:10,'word'],main = 'probability of tri-gram words')
# word coverage
d3$cum_prob = cumsum(d3$prob)
plot(d3$cum_prob,type='l',ylab = 'cumulative probability',main = 'coverage of tri-gram words')
save(d3,file='d3.RData')
Modelling
In this section, we’ll build the prediction model by turning the N-gram counts into smoothed probability tables.
# Create data frames with word (pair) count
uni_words <- data.frame(word_1 = d1$word, count = d1$freq)
bi_words <- data.frame(
word_1 = sapply(strsplit(as.character(d2$word), " ", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(as.character(d2$word), " ", fixed = TRUE), '[[', 2),
count = d2$freq)
tri_words <- data.frame(
word_1 = sapply(strsplit(as.character(d3$word), " ", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(as.character(d3$word), " ", fixed = TRUE), '[[', 2),
word_3 = sapply(strsplit(as.character(d3$word), " ", fixed = TRUE), '[[', 3),
count = d3$freq)
We will apply Kneser-Ney smoothing to the models. You may refer to the algorithm on Wikipedia. For the discounting factor, I used 0.75, following page 66 of this lecture.
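For reference, the smoothed bi-gram probability computed in the code below can be written as

$$P(w_2 \mid w_1) = \frac{c(w_1 w_2) - d}{c(w_1)} + \frac{d}{c(w_1)}\, N_{1+}(w_1\,\bullet)\, P_{uni}(w_2)$$

where $d = 0.75$ is the discount, $c(\cdot)$ is a raw count, $N_{1+}(w_1\,\bullet)$ is the number of distinct bi-gram types starting with $w_1$ (word_1_bi_count below), and $P_{uni}(w_2)$ is the plain uni-gram probability of $w_2$ (word_2_uni_prob), which this implementation uses in place of the textbook continuation probability. The tri-gram formula is analogous, with bi-gram counts and probabilities taking the place of the uni-gram ones.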
# N-Gram model probability with Kneser-Ney smoothing
# http://computational-linguistics-class.org/slides/04-n-gram-language-models.pdf
# page 66
discount_value <- 0.75
# uni-gram
uni_words$prob = uni_words$count / sum(uni_words$count)
uni_words$word_1 = as.character(uni_words$word_1)
# bi-gram
# number of distinct bi-gram types starting with each word_1
bi_w1_count = aggregate(bi_words[,'word_1'],by=list(bi_words[,'word_1']),length)
names(bi_w1_count)=c('word_1','count')
bi_words = merge(bi_words,uni_words[,c('word_1','count')],by.x='word_1',by.y='word_1',all.x=T)
names(bi_words)=c("word_1","word_2",'count','word_1_uni_count')
bi_words = merge(bi_words,bi_w1_count,by='word_1',all.x=T)
names(bi_words)=c('word_1','word_2','count','word_1_uni_count','word_1_bi_count')
bi_words = merge(bi_words,uni_words[,c('word_1','prob')],by.x='word_2',by.y='word_1',all.x=T)
names(bi_words) = c('word_1','word_2','count','word_1_uni_count','word_1_bi_count','word_2_uni_prob')
bi_words$prob = (bi_words$count - discount_value)/bi_words$word_1_uni_count + discount_value/bi_words$word_1_uni_count*bi_words$word_1_bi_count*bi_words$word_2_uni_prob
bi_words=bi_words[,c('word_1','word_2','count','word_1_uni_count','word_1_bi_count','word_2_uni_prob','prob')]
bi_words = bi_words[order(bi_words$word_1,bi_words$word_2),]
bi_words$word_1 = as.character(bi_words$word_1)
bi_words$word_2 = as.character(bi_words$word_2)
# tri-gram
# number of distinct tri-gram types starting with each (word_1, word_2) pair
tri_w12_count = aggregate(tri_words[,c('word_1')],by=list(tri_words$word_1,tri_words$word_2),length)
names(tri_w12_count)=c('word_1','word_2','count')
tri_words = merge(tri_words,bi_words[,c('word_1','word_2','count')],by=c('word_1','word_2'),all.x=T,all.y=F)
names(tri_words) = c('word_1','word_2','word_3','count','word_12_bi_count')
# word_12_bi_count is the observed count of the bi-gram (word_1, word_2)
tri_words = merge(tri_words,tri_w12_count,by=c('word_1','word_2'),all.x=T)
names(tri_words) = c('word_1','word_2','word_3','count','word_12_bi_count','word_12_tri_count')
tri_words = merge(tri_words,bi_words[,c('word_1','word_2','prob')],by.x=c('word_2','word_3'),by.y=c('word_1','word_2'),all.x=T)
names(tri_words) = c('word_1','word_2','word_3','count','word_12_bi_count','word_12_tri_count','word_23_bi_prob')
tri_words$prob = (tri_words$count - discount_value)/tri_words$word_12_bi_count + discount_value/tri_words$word_12_bi_count*tri_words$word_12_tri_count*tri_words$word_23_bi_prob
tri_words$word_1 = as.character(tri_words$word_1)
tri_words$word_2 = as.character(tri_words$word_2)
tri_words$word_3 = as.character(tri_words$word_3)
save(uni_words,bi_words,tri_words,file='n_gram_prob.RData')
Prediction
For prediction, we also use a back-off strategy: when no candidate is available in the tri-gram model, we fall back to the bi-gram model and, if necessary, the uni-gram model. A short usage example follows the function definitions.
# prediction
predict_uni <- function() {
print('uni')
max_prob = max(uni_words$prob)
candidates=uni_words[uni_words$prob==max_prob,]
return(sample(candidates$word_1,1)) # uni_words only has a word_1 column
}
predict_bi <- function(w1) {
print('bi')
candidates = bi_words[(bi_words$word_1)==w1,c('word_2','prob')]
candidates = candidates[order(-candidates$prob),]
candidates = candidates[!is.na(candidates$prob),]
if (nrow(candidates) >=1){
max_prob = max(candidates$prob)
candidates=candidates[candidates$prob==max_prob,]
return(sample(candidates$word_2,1))
} else
{return(predict_uni())}
}
predict_tri <- function(w1, w2) {
print('tri')
candidates = tri_words[(tri_words$word_1)==w1 & tri_words$word_2 == w2, c('word_3','prob')]
candidates = candidates[order(-candidates$prob),]
candidates = candidates[!is.na(candidates$prob),]
if (nrow(candidates) >=1){
max_prob = max(candidates$prob)
candidates=candidates[candidates$prob==max_prob,]
return(sample(candidates$word_3,1))
} else
{return(predict_bi(w2))}
}
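As a quick, hypothetical illustration (the input words are arbitrary and the exact output depends on the sampled corpus), the back-off chain can be exercised like this:

# usually answered from the tri-gram table when the context appears in the sample
predict_tri('thank', 'you')
# unseen context: backs off to the bi-gram table and, if needed, the uni-gram table
predict_tri('someunseenword', 'anotherunseenword')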
Miscellaneous
Other miscellaneous questions from this task:
Can you think of a way to increase the coverage, i.e. identifying words that may not be in the corpora, or using a smaller number of words in the dictionary to cover the same number of phrases?

One idea: if two words A and B are highly correlated, i.e. we have seen the pairs (x, A) and (x, B) for some arbitrary word x, but in our sample data we only see (y, A), then we can also assign the same frequency to (y, B). A minimal sketch of this idea is given below.
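The sketch assumes a hypothetical data frame word_synonyms with columns A and B listing correlated word pairs; it is not part of the pipeline above.

# clone observed bi-gram rows (y, A) into synthetic rows (y, B) for correlated pairs (A, B)
augment_bigrams <- function(bi_words, word_synonyms) {
  extra <- merge(bi_words, word_synonyms, by.x = 'word_2', by.y = 'A')  # rows whose word_2 has a correlated partner
  extra$word_2 <- extra$B                                               # swap in the correlated word B
  extra$B <- NULL
  extra <- extra[, names(bi_words)]                                     # restore the original column order
  seen  <- paste(bi_words$word_1, bi_words$word_2)
  extra <- extra[!paste(extra$word_1, extra$word_2) %in% seen, ]        # keep only pairs not already observed
  rbind(bi_words, extra)
}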