In this report, we will do two tasks:

* understand the distribution of words and the relationships between words in the corpora
* understand the frequencies of words and word pairs
usblogs_file = 'final/en_US/en_US.blogs.txt'
usnews_file = 'final/en_US/en_US.news.txt'
ustwitter_file = 'final/en_US/en_US.twitter.txt'
usblogs = readLines(usblogs_file)
usnews = readLines(usnews_file)
## Warning in readLines(usnews_file): incomplete final line found on 'final/en_US/
## en_US.news.txt'
ustwitter = readLines(ustwitter_file)
## Warning in readLines(ustwitter_file): line 167155 appears to contain an embedded
## nul
## Warning in readLines(ustwitter_file): line 268547 appears to contain an embedded
## nul
## Warning in readLines(ustwitter_file): line 1274086 appears to contain an
## embedded nul
## Warning in readLines(ustwitter_file): line 1759032 appears to contain an
## embedded nul
files = c(usblogs_file,usnews_file,ustwitter_file)
mbsizes = sapply(files, function(x) {file.size(x)/1024^2})
library(stringr)
stats = sapply(list(usblogs,usnews,ustwitter),function(x){ c(length(x) , sum(str_count(x,'\\S+')) )}) # line count and word count for each data set
invisible(gc())
stats = rbind(mbsizes,stats)
stats = as.data.frame(stats)
names(stats) = c('usblogs','usnews','ustwitter')
row.names(stats) = c('filesize(MB)','lines','word_count')
library(knitr)
kable(stats,digits = 0)
|              |  usblogs |  usnews | ustwitter |
|--------------|---------:|--------:|----------:|
| filesize(MB) |      200 |     196 |       159 |
| lines        |   899288 |   77259 |   2360148 |
| word_count   | 37334441 | 2643972 |  30373792 |
Build and Clean Corpus
In this section, we’ll build and clean the corpus.
First, since the raw texts are too large to process in full, we’ll sample just 0.1% of each text (i.e. blogs, news, and twitter) by keeping every 1000th line.
usblogs = usblogs[seq(1,length(usblogs),1000)]
usnews = usnews[seq(1,length(usnews),1000)]
ustwitter = ustwitter[seq(1,length(ustwitter),1000)]
invisible(gc())
raw_text = c(usblogs,usnews,ustwitter)
invisible(gc())
Second, we’ll build the corpus.
library(tm)
## Warning: package 'tm' was built under R version 4.0.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.0.3
# make a volatile corpus
raw_source = VectorSource(raw_text)
invisible(gc())
raw_corpus <- VCorpus(raw_source)
invisible(gc())
We’ll then clean the corpus:

* convert words to lower case
* collapse extra white space
* remove punctuation
* remove numbers
* remove the standalone ‘th’ left over from ordinals like 4th
* remove URLs
* remove non-ASCII characters
* collapse letters repeated three or more times in a word
# clean the corpus
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(tolower)) # convert to lower case
  corpus <- tm_map(corpus, stripWhitespace) # collapse extra white space
  corpus <- tm_map(corpus, removePunctuation) # remove punctuation
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("[[:digit:]]","",x))) # remove numbers
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("\\bth\\b","",x))) # remove the standalone 'th' left by ordinals like 4th once digits are gone
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("http[[:alnum:]]*","",x))) # remove urls
  corpus <- tm_map(corpus,content_transformer(function(x) iconv(x, "latin1", "ASCII", sub=""))) # remove non-ASCII characters
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x))) # collapse letters repeated 3+ times to 2
  gc()
  return(corpus)
}
corpus <- clean_corpus(raw_corpus)
save(corpus,file='corpus.RData')
Build N-Gram model
In this section, we’ll build N-Gram models, namely uni-gram, bi-gram, and tri-gram.
The top 10 word frequencies and the word-coverage curves are also plotted.
load('corpus.RData')
# unigram
library(tm)
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.0.3
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
tdm1<-TermDocumentMatrix(corpus,control = list(tokenize = unigram))
invisible(gc())
wordMatrix1 = as.data.frame((as.matrix( tdm1 )) )
invisible(gc())
v1 <- sort(rowSums(wordMatrix1),decreasing=TRUE)
d1 <- data.frame(word = names(v1),freq=v1)
rm(tdm1,wordMatrix1)
for(i in 1:10) gc() # call gc() a few times to release memory after removing the large matrices
# word probability
d1$prob = d1$freq/sum(d1$freq)
barplot(d1[1:10,c('prob')],names.arg = d1[1:10,'word'],main = 'probability of uni-gram words')
# word coverage
d1$cum_prob = cumsum(d1$prob)
plot(d1$cum_prob,type='l',ylab = 'cumulative probability',main = 'coverage of uni-gram words')
save(d1,file='d1.RData')
# bigram
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm2<-TermDocumentMatrix(corpus,control = list(tokenize = bigram))
invisible(gc())
wordMatrix2 = as.matrix( tdm2 )
wordMatrix2 = as.data.frame(wordMatrix2 )
invisible(gc())
v2 <- sort(rowSums(wordMatrix2),decreasing=TRUE)
d2 <- data.frame(word = names(v2),freq=v2)
rm(tdm2,wordMatrix2)
for(i in 1:10) gc()
# word probability
d2$prob = d2$freq/sum(d2$freq)
barplot(d2[1:10,c('prob')],names.arg = d2[1:10,'word'],main = 'probability of bi-gram words')
# word coverage
d2$cum_prob = cumsum(d2$prob)
plot(d2$cum_prob,type='l',ylab = 'cumulative probability',main = 'coverage of bi-gram words')
save(d2,file='d2.RData')
# trigram
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm3<-TermDocumentMatrix(corpus,control = list(tokenize = trigram))
invisible(gc())
wordMatrix3 = as.matrix( tdm3 )
wordMatrix3 = as.data.frame(wordMatrix3 )
invisible(gc())
v3 <- sort(rowSums(wordMatrix3),decreasing=TRUE)
d3 <- data.frame(word = names(v3),freq=v3)
rm(tdm3,wordMatrix3)
for(i in 1:10) gc()
# word probability
d3$prob = d3$freq/sum(d3$freq)
barplot(d3[1:10,c('prob')],names.arg = d3[1:10,'word'],main = 'probability of tri-gram words')
# word coverage
d3$cum_prob = cumsum(d3$prob)
plot(d3$cum_prob,type='l',ylab = 'cumulative probability',main = 'coverage of tri-gram words')
save(d3,file='d3.RData')
Modelling
In this section, we’ll build the prediction model by turning the N-gram counts into smoothed probability tables.
# Create data frames with word (pair) count
uni_words <- data.frame(word_1 = d1$word, count = d1$freq)
bi_words <- data.frame(
word_1 = sapply(strsplit(as.character(d2$word), " ", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(as.character(d2$word), " ", fixed = TRUE), '[[', 2),
count = d2$freq)
tri_words <- data.frame(
word_1 = sapply(strsplit(as.character(d3$word), " ", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(as.character(d3$word), " ", fixed = TRUE), '[[', 2),
word_3 = sapply(strsplit(as.character(d3$word), " ", fixed = TRUE), '[[', 3),
count = d3$freq)
We will apply Kneser-Ney smoothing to the models. You may refer to the algorithm on Wikipedia. For the discounting factor, I used 0.75, following page 66 of this lecture.
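For reference, the smoothed bi-gram probability computed in the code below can be written as

$$P(w_2 \mid w_1) = \frac{c(w_1 w_2) - d}{c(w_1)} + \frac{d}{c(w_1)}\, N_{1+}(w_1\,\bullet)\, P_{uni}(w_2)$$

where $d = 0.75$ is the discount, $c(\cdot)$ is a raw count, $N_{1+}(w_1\,\bullet)$ is the number of distinct bi-gram types starting with $w_1$ (word_1_bi_count below), and $P_{uni}(w_2)$ is the plain uni-gram probability of $w_2$ (word_2_uni_prob), which this implementation uses in place of the textbook continuation probability. The tri-gram formula is analogous, with bi-gram counts and probabilities taking the place of the uni-gram ones.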
# N-Gram model probability with Kneser-Ney smoothing
# http://computational-linguistics-class.org/slides/04-n-gram-language-models.pdf
# page 66
discount_value <- 0.75
# uni-gram
uni_words$prob = uni_words$count / sum(uni_words$count)
uni_words$word_1 = as.character(uni_words$word_1)
# bi-gram
# number of distinct bi-gram types starting with each word_1
bi_w1_count = aggregate(bi_words[,'word_1'],by=list(bi_words[,'word_1']),length)
names(bi_w1_count)=c('word_1','count')
bi_words = merge(bi_words,uni_words[,c('word_1','count')],by.x='word_1',by.y='word_1',all.x=T)
names(bi_words)=c("word_1","word_2",'count','word_1_uni_count')
bi_words = merge(bi_words,bi_w1_count,by='word_1',all.x=T)
names(bi_words)=c('word_1','word_2','count','word_1_uni_count','word_1_bi_count')
bi_words = merge(bi_words,uni_words[,c('word_1','prob')],by.x='word_2',by.y='word_1',all.x=T)
names(bi_words) = c('word_1','word_2','count','word_1_uni_count','word_1_bi_count','word_2_uni_prob')
bi_words$prob = (bi_words$count - discount_value)/bi_words$word_1_uni_count + discount_value/bi_words$word_1_uni_count*bi_words$word_1_bi_count*bi_words$word_2_uni_prob
bi_words=bi_words[,c('word_1','word_2','count','word_1_uni_count','word_1_bi_count','word_2_uni_prob','prob')]
bi_words = bi_words[order(bi_words$word_1,bi_words$word_2),]
bi_words$word_1 = as.character(bi_words$word_1)
bi_words$word_2 = as.character(bi_words$word_2)
# tri-gram
# number of distinct tri-gram types starting with each (word_1, word_2) pair
tri_w12_count = aggregate(tri_words[,c('word_1')],by=list(tri_words$word_1,tri_words$word_2),length)
names(tri_w12_count)=c('word_1','word_2','count')
tri_words = merge(tri_words,bi_words[,c('word_1','word_2','count')],by=c('word_1','word_2'),all.x=T,all.y=F)
names(tri_words) = c('word_1','word_2','word_3','count','word_12_bi_count')
# word_12_bi_count is the observed count of the bi-gram (word_1, word_2)
tri_words = merge(tri_words,tri_w12_count,by=c('word_1','word_2'),all.x=T)
names(tri_words) = c('word_1','word_2','word_3','count','word_12_bi_count','word_12_tri_count')
tri_words = merge(tri_words,bi_words[,c('word_1','word_2','prob')],by.x=c('word_2','word_3'),by.y=c('word_1','word_2'),all.x=T)
names(tri_words) = c('word_1','word_2','word_3','count','word_12_bi_count','word_12_tri_count','word_23_bi_prob')
tri_words$prob = (tri_words$count - discount_value)/tri_words$word_12_bi_count + discount_value/tri_words$word_12_bi_count*tri_words$word_12_tri_count*tri_words$word_23_bi_prob
tri_words$word_1 = as.character(tri_words$word_1)
tri_words$word_2 = as.character(tri_words$word_2)
tri_words$word_3 = as.character(tri_words$word_3)
save(uni_words,bi_words,tri_words,file='n_gram_prob.RData')
Prediction
For prediction, we also use a back-off strategy: when no candidate is available in the tri-gram model, we fall back to the bi-gram model and, if necessary, the uni-gram model. A short usage example follows the function definitions.
# prediction
predict_uni <- function() {
print('uni')
max_prob = max(uni_words$prob)
candidates=uni_words[uni_words$prob==max_prob,]
return(sample(candidates$word_1,1)) # uni_words only has a word_1 column
}
predict_bi <- function(w1) {
print('bi')
candidates = bi_words[(bi_words$word_1)==w1,c('word_2','prob')]
candidates = candidates[order(-candidates$prob),]
candidates = candidates[!is.na(candidates$prob),]
if (nrow(candidates) >=1){
max_prob = max(candidates$prob)
candidates=candidates[candidates$prob==max_prob,]
return(sample(candidates$word_2,1))
} else
{return(predict_uni())}
}
predict_tri <- function(w1, w2) {
print('tri')
candidates = tri_words[(tri_words$word_1)==w1 & tri_words$word_2 == w2, c('word_3','prob')]
candidates = candidates[order(-candidates$prob),]
candidates = candidates[!is.na(candidates$prob),]
if (nrow(candidates) >=1){
max_prob = max(candidates$prob)
candidates=candidates[candidates$prob==max_prob,]
return(sample(candidates$word_3,1))
} else
{return(predict_bi(w2))}
}
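As a quick, hypothetical illustration (the input words are arbitrary and the exact output depends on the sampled corpus), the back-off chain can be exercised like this:

# usually answered from the tri-gram table when the context appears in the sample
predict_tri('thank', 'you')
# unseen context: backs off to the bi-gram table and, if needed, the uni-gram table
predict_tri('someunseenword', 'anotherunseenword')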
Miscellaneous
Other miscellaneous questions from this task:
Can you think of a way to increase the coverage, i.e. identifying words that may not be in the corpora, or using a smaller number of words in the dictionary to cover the same number of phrases?

One idea: if two words A and B are highly correlated, i.e. we have seen the pairs (x, A) and (x, B) for some arbitrary word x, but in our sample data we only see (y, A), then we can also assign the same frequency to (y, B). A minimal sketch of this idea is given below.
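The sketch assumes a hypothetical data frame word_synonyms with columns A and B listing correlated word pairs; it is not part of the pipeline above.

# clone observed bi-gram rows (y, A) into synthetic rows (y, B) for correlated pairs (A, B)
augment_bigrams <- function(bi_words, word_synonyms) {
  extra <- merge(bi_words, word_synonyms, by.x = 'word_2', by.y = 'A')  # rows whose word_2 has a correlated partner
  extra$word_2 <- extra$B                                               # swap in the correlated word B
  extra$B <- NULL
  extra <- extra[, names(bi_words)]                                     # restore the original column order
  seen  <- paste(bi_words$word_1, bi_words$word_2)
  extra <- extra[!paste(extra$word_1, extra$word_2) %in% seen, ]        # keep only pairs not already observed
  rbind(bi_words, extra)
}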