In this report, we are going to do two tasks:

  1. Exploratory Data Analysis
  1. Modelling

Exploratory Data Analysis

File Summaries

We’ll read in the raw text files and summarize here:

usblogs_file = 'final/en_US/en_US.blogs.txt'
usnews_file = 'final/en_US/en_US.news.txt'
ustwitter_file = 'final/en_US/en_US.twitter.txt'
usblogs = readLines(usblogs_file)
usnews = readLines(usnews_file)
ustwitter = readLines(ustwitter_file)
files = c(usblogs_file,usnews_file,ustwitter_file)
mbsizes = sapply(files, function(x) {file.size(x)/1024^2})
library(stringr)
stats = sapply(list(usblogs,usnews,ustwitter),function(x){ c(length(x) , sum(str_count(x,'\\S+')) )})
invisible(gc())
stats = rbind(mbsizes,stats)
stats = as.data.frame(stats)
names(stats) = c('usblogs','usnews','ustwitter')
row.names(stats) = c('filesize(MB)','lines','word_count')
library(knitr)
kable(stats,digits = 0)
usblogs usnews ustwitter
filesize(MB) 200 196 159
lines 899288 1010242 2360148
word_count 37334131 34372530 30373543

Build and Clean Corpus

In this section, we’ll build and clean the corpus.

First, since the raw texts are too large, we’ll sample just 0.1% of each txt (i.e. blogs, news, twitter)

usblogs = usblogs[seq(1,length(usblogs),1000)]
usnews = usnews[seq(1,length(usnews),1000)]
ustwitter = ustwitter[seq(1,length(ustwitter),1000)]
invisible(gc())
raw_text = c(usblogs,usnews,ustwitter)
invisible(gc())

Second, we’ll build the corpus,

library(tm)
# make a volatile corpus
raw_source = VectorSource(raw_text)
invisible(gc())
raw_corpus <- VCorpus(raw_source)
invisible(gc())

We’ll then clean the corpus: * convert words to lower case * remove white spaces * remove punctuations * remove numbers * remove ‘th’ * remove url * remove non_ASCII characters * remove repeated alphabets in a word

# clean the corpus
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(tolower)) # convert to lower case
  corpus <- tm_map(corpus, stripWhitespace) # remove white space
  corpus <- tm_map(corpus, removePunctuation) # remove punctuation
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("[[:digit:]]","",x)))# remove numbers
  corpus <- tm_map(corpus,content_transformer(function(x) gsub(" th", "",x))) # remove th (like 4th)
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("http[[:alnum:]]*","",x))) # remove url
  corpus <- tm_map(corpus,content_transformer(function(x) iconv(x, "latin1", "ASCII", sub=""))) # remove non-ASCII characters
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x))) # remove repeated alphabets in a word
  gc()
  return(corpus)
}

corpus <- clean_corpus(raw_corpus)

save(corpus,file='corpus.RData')

Build N-Gram model

In this section, we’ll build N-Gram models, namely uni-gram, bi-gram, and tri-gram.

The top 10 word frequncies and word coverage line are also plotted out.

load('corpus.RData')
# unigram
library(tm)
library(RWeka)
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
tdm1<-TermDocumentMatrix(corpus,control = list(tokenize = unigram))
invisible(gc())
wordMatrix1 = as.data.frame((as.matrix(  tdm1 )) ) 
invisible(gc())
v1 <- sort(rowSums(wordMatrix1),decreasing=TRUE)
d1 <- data.frame(word = names(v1),freq=v1)
rm(tdm1,wordMatrix1)
for(i in 1:10) gc()
# word probablity
d1$prob = d1$freq/sum(d1$freq)
barplot(d1[1:10,c('prob')],names.arg = d1[1:10,'word'],main = 'probabiilty of uni-gram words')

# word coverage
d1$cum_prob = cumsum(d1$prob)
plot(d1$cum_prob,type='l',ylab = 'cumulative probablity',main = 'coverage of uni-gram words')

save(d1,file='d1.RData')
# bigram
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm2<-TermDocumentMatrix(corpus,control = list(tokenize = bigram))
invisible(gc())
wordMatrix2 = as.matrix(  tdm2 )
wordMatrix2 = as.data.frame(wordMatrix2 ) 
invisible(gc())
v2 <- sort(rowSums(wordMatrix2),decreasing=TRUE)
d2 <- data.frame(word = names(v2),freq=v2)
rm(tdm2,wordMatrix2)
for(i in 1:10) gc()
# word probablity
d2$prob = d2$freq/sum(d2$freq)
barplot(d2[1:10,c('prob')],names.arg = d2[1:10,'word'],main = 'probabiilty of bi-gram words')

# word coverage
d2$cum_prob = cumsum(d2$prob)
plot(d2$cum_prob,type='l',ylab = 'cumulative probablity',main = 'coverage of bi-gram words')

save(d2,file='d2.RData')
# trigram
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm3<-TermDocumentMatrix(corpus,control = list(tokenize = trigram))
invisible(gc())
wordMatrix3 = as.matrix(  tdm3 )
invisible(gc())

Modelling

In this section, we’ll build the model.

# Create data frames with word (pair) count
uni_words <- data.frame(word_1 = d1$word, count = d1$freq)

bi_words <- data.frame(
  word_1 = sapply(strsplit(as.character(d2$word), " ", fixed = TRUE), '[[', 1),
  word_2 = sapply(strsplit(as.character(d2$word), " ", fixed = TRUE), '[[', 2),
  count = d2$freq)

tri_words <- data.frame(
  word_1 = sapply(strsplit(as.character(d3$word), " ", fixed = TRUE), '[[', 1),
  word_2 = sapply(strsplit(as.character(d3$word), " ", fixed = TRUE), '[[', 2),
  word_3 = sapply(strsplit(as.character(d3$word), " ", fixed = TRUE), '[[', 3),
  count = d3$freq)

We will apply Kneser-Kney smoothing on the models. You may refer to the algorithm on wikipedia. For the discounting factor, I used 0.75 according the page 66 of this lecture

# N-Gram model probablity with Kneser Kney Smoothing
# http://computational-linguistics-class.org/slides/04-n-gram-language-models.pdf
# page 66

discount_value <- 0.75

# uni-gram
uni_words$prob = uni_words$count / sum(uni_words$count)
uni_words$word_1 = as.character(uni_words$word_1)

# bi-gram
bi_w1_count = aggregate(bi_words[,'word_1'],by=list(bi_words[,'word_1']),length)
names(bi_w1_count)=c('word_1','count')

bi_words = merge(bi_words,uni_words[,c('word_1','count')],by.x='word_1',by.y='word_1',all.x=T)
names(bi_words)=c("word_1","word_2",'count','word_1_uni_count')

bi_words = merge(bi_words,bi_w1_count,by='word_1',all.x=T)
names(bi_words)=c('word_1','word_2','count','word_1_uni_count','word_1_bi_count')

bi_words = merge(bi_words,uni_words[,c('word_1','prob')],by.x='word_2',by.y='word_1',all.x=T)
names(bi_words) = c('word_1','word_2','count','word_1_uni_count','word_1_bi_count','word_2_uni_prob')

bi_words$prob = (bi_words$count - discount_value)/bi_words$word_1_uni_count + discount_value/bi_words$word_1_uni_count*bi_words$word_1_bi_count*bi_words$word_2_uni_prob
bi_words=bi_words[,c('word_1','word_2','count','word_1_uni_count','word_1_bi_count','word_2_uni_prob','prob')]
bi_words = bi_words[order(bi_words$word_1,bi_words$word_2),]

bi_words$word_1 = as.character(bi_words$word_1)
bi_words$word_2 = as.character(bi_words$word_2)
# tri-gram

tri_w12_count = aggregate(tri_words[,c('word_1')],by=list(tri_words$word_1,tri_words$word_2),length)
names(tri_w12_count)=c('word_1','word_2','count')

tri_words = merge(tri_words,bi_words[,c('word_1','word_2','count')],by=c('word_1','word_2'),all.x=T,all.y=F)
names(tri_words) = c('word_1','word_2','word_3','count','word_12_bi_count')
# Cn2 = word_12_bi_count

tri_words = merge(tri_words,tri_w12_count,by=c('word_1','word_2'),all.x=T)
names(tri_words) = c('word_1','word_2','word_3','count','word_12_bi_count','word_12_tri_count')

tri_words = merge(tri_words,bi_words[,c('word_1','word_2','prob')],by.x=c('word_2','word_3'),by.y=c('word_1','word_2'),all.x=T)
names(tri_words) = c('word_1','word_2','word_3','count','word_12_bi_count','word_12_tri_count','word_23_bi_prob')

tri_words$prob =  (tri_words$count - discount_value)/tri_words$word_12_bi_count + discount_value/tri_words$word_12_bi_count*tri_words$word_12_tri_count*tri_words$word_23_bi_prob

tri_words$word_1 = as.character(tri_words$word_1)
tri_words$word_2 = as.character(tri_words$word_2)
tri_words$word_3 = as.character(tri_words$word_3)

save(uni_words,bi_words,tri_words,file='n_gram_prob.RData')

Prediction

In this prediction section, we also used back-off model, ie. when no candidate is available in tri-gram, we’ll fall back to bi-gram and even uni-gram.

# prediction
predict_uni <- function() {
  print('uni')
  max_prob = max(uni_words$prob)
  candidates=uni_words[uni_words$prob==max_prob,]
  return(sample(candidates$word_2,1))
}

predict_bi <- function(w1) {
  print('bi')
  candidates = bi_words[(bi_words$word_1)==w1,c('word_2','prob')]
  candidates = candidates[order(-candidates$prob),]
  candidates = candidates[!is.na(candidates$prob),]
  if (nrow(candidates) >=1){
    max_prob = max(candidates$prob)
    candidates=candidates[candidates$prob==max_prob,]
    return(sample(candidates$word_2,1))
  } else 
    {return(predict_uni())}
}

predict_tri <- function(w1, w2) {
  print('tri')
  candidates = tri_words[(tri_words$word_1)==w1 & tri_words$word_2 == w2, c('word_3','prob')]
  candidates = candidates[order(-candidates$prob),]
  candidates = candidates[!is.na(candidates$prob),]
  if (nrow(candidates) >=1){
    max_prob = max(candidates$prob)
    candidates=candidates[candidates$prob==max_prob,]
    return(sample(candidates$word_3,1))
  } else 
    {return(predict_bi(w2))}

}

miscellaneous

Other miscellaneous questions from this task:

  1. Can you think of a way to increase the coverage – identifying words that may not be in the corpora or using a smaller number of words in the dictionary to cover the same number of phrases?
LS0tCnRpdGxlOiAiQ291cnNlcmEgRGF0YSBTY2llbmNlIENhcHN0b25lIFdlZWsgMiBNaWxlc3RvbmUgUmVwb3J0IgphdXRob3I6CiAgLSBuYW1lOiBRaW9uZyBXdQpvdXRwdXQ6IAogIGh0bWxfbm90ZWJvb2sKLS0tCgpJbiB0aGlzIHJlcG9ydCwgd2UgYXJlIGdvaW5nIHRvIGRvIHR3byB0YXNrczogCgoxLiAqKkV4cGxvcmF0b3J5IERhdGEgQW5hbHlzaXMqKgorIHVuZGVyc3RhbmQgdGhlIGRpc3RyaWJ1dGlvbiBvZiB3b3JkcyBhbmQgcmVsYXRpb25zaGlwIGJldHdlZW4gdGhlIHdvcmRzIGluIHRoZSBjb3Jwb3JhLiAKKyB1bmRlcnN0YW5kIGZyZXF1ZW5jaWVzIG9mIHdvcmRzIGFuZCB3b3JkIHBhaXJzIAoyLiAqKk1vZGVsbGluZyoqCisgYnVpbGQgYmFzaWMgbi1ncmFtIG1vZGVsCisgYnVpbGQgYSBtb2RlbCB0byBoYW5kbGUgdW5zZWVuIG4tZ3JhbXMKCgojIyMgRXhwbG9yYXRvcnkgRGF0YSBBbmFseXNpcwoKKipGaWxlIFN1bW1hcmllcyoqCgpXZSdsbCByZWFkIGluIHRoZSByYXcgdGV4dCBmaWxlcyBhbmQgc3VtbWFyaXplIGhlcmU6CgpgYGB7ciBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQp1c2Jsb2dzX2ZpbGUgPSAnZmluYWwvZW5fVVMvZW5fVVMuYmxvZ3MudHh0Jwp1c25ld3NfZmlsZSA9ICdmaW5hbC9lbl9VUy9lbl9VUy5uZXdzLnR4dCcKdXN0d2l0dGVyX2ZpbGUgPSAnZmluYWwvZW5fVVMvZW5fVVMudHdpdHRlci50eHQnCgp1c2Jsb2dzID0gcmVhZExpbmVzKHVzYmxvZ3NfZmlsZSkKdXNuZXdzID0gcmVhZExpbmVzKHVzbmV3c19maWxlKQp1c3R3aXR0ZXIgPSByZWFkTGluZXModXN0d2l0dGVyX2ZpbGUpCgpmaWxlcyA9IGModXNibG9nc19maWxlLHVzbmV3c19maWxlLHVzdHdpdHRlcl9maWxlKQptYnNpemVzID0gc2FwcGx5KGZpbGVzLCBmdW5jdGlvbih4KSB7ZmlsZS5zaXplKHgpLzEwMjReMn0pCmxpYnJhcnkoc3RyaW5ncikKc3RhdHMgPSBzYXBwbHkobGlzdCh1c2Jsb2dzLHVzbmV3cyx1c3R3aXR0ZXIpLGZ1bmN0aW9uKHgpeyBjKGxlbmd0aCh4KSAsIHN1bShzdHJfY291bnQoeCwnXFxTKycpKSApfSkKaW52aXNpYmxlKGdjKCkpCnN0YXRzID0gcmJpbmQobWJzaXplcyxzdGF0cykKc3RhdHMgPSBhcy5kYXRhLmZyYW1lKHN0YXRzKQpuYW1lcyhzdGF0cykgPSBjKCd1c2Jsb2dzJywndXNuZXdzJywndXN0d2l0dGVyJykKcm93Lm5hbWVzKHN0YXRzKSA9IGMoJ2ZpbGVzaXplKE1CKScsJ2xpbmVzJywnd29yZF9jb3VudCcpCmxpYnJhcnkoa25pdHIpCmthYmxlKHN0YXRzLGRpZ2l0cyA9IDApCmBgYAoKKipCdWlsZCBhbmQgQ2xlYW4gQ29ycHVzKioKCkluIHRoaXMgc2VjdGlvbiwgd2UnbGwgYnVpbGQgYW5kIGNsZWFuIHRoZSBjb3JwdXMuCgpGaXJzdCwgc2luY2UgdGhlIHJhdyB0ZXh0cyBhcmUgdG9vIGxhcmdlLCB3ZSdsbCBzYW1wbGUganVzdCAwLjElIG9mIGVhY2ggdHh0IChpLmUuIGJsb2dzLCBuZXdzLCB0d2l0dGVyKQoKYGBge3IgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KdXNibG9ncyA9IHVzYmxvZ3Nbc2VxKDEsbGVuZ3RoKHVzYmxvZ3MpLDEwMDApXQp1c25ld3MgPSB1c25ld3Nbc2VxKDEsbGVuZ3RoKHVzbmV3cyksMTAwMCldCnVzdHdpdHRlciA9IHVzdHdpdHRlcltzZXEoMSxsZW5ndGgodXN0d2l0dGVyKSwxMDAwKV0KaW52aXNpYmxlKGdjKCkpCnJhd190ZXh0ID0gYyh1c2Jsb2dzLHVzbmV3cyx1c3R3aXR0ZXIpCmludmlzaWJsZShnYygpKQpgYGAKClNlY29uZCwgd2UnbGwgYnVpbGQgdGhlIGNvcnB1cywKYGBge3J9CmxpYnJhcnkodG0pCiMgbWFrZSBhIHZvbGF0aWxlIGNvcnB1cwpyYXdfc291cmNlID0gVmVjdG9yU291cmNlKHJhd190ZXh0KQppbnZpc2libGUoZ2MoKSkKcmF3X2NvcnB1cyA8LSBWQ29ycHVzKHJhd19zb3VyY2UpCmludmlzaWJsZShnYygpKQpgYGAKCldlJ2xsIHRoZW4gY2xlYW4gdGhlIGNvcnB1czoKKiBjb252ZXJ0IHdvcmRzIHRvIGxvd2VyIGNhc2UKKiByZW1vdmUgd2hpdGUgc3BhY2VzCiogcmVtb3ZlIHB1bmN0dWF0aW9ucwoqIHJlbW92ZSBudW1iZXJzCiogcmVtb3ZlICd0aCcKKiByZW1vdmUgdXJsCiogcmVtb3ZlIG5vbl9BU0NJSSBjaGFyYWN0ZXJzCiogcmVtb3ZlIHJlcGVhdGVkIGFscGhhYmV0cyBpbiBhIHdvcmQKCmBgYHtyfQojIGNsZWFuIHRoZSBjb3JwdXMKY2xlYW5fY29ycHVzIDwtIGZ1bmN0aW9uKGNvcnB1cyl7CiAgY29ycHVzIDwtIHRtX21hcChjb3JwdXMsIGNvbnRlbnRfdHJhbnNmb3JtZXIodG9sb3dlcikpICMgY29udmVydCB0byBsb3dlciBjYXNlCiAgY29ycHVzIDwtIHRtX21hcChjb3JwdXMsIHN0cmlwV2hpdGVzcGFjZSkgIyByZW1vdmUgd2hpdGUgc3BhY2UKICBjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgcmVtb3ZlUHVuY3R1YXRpb24pICMgcmVtb3ZlIHB1bmN0dWF0aW9uCiAgY29ycHVzIDwtIHRtX21hcChjb3JwdXMsY29udGVudF90cmFuc2Zvcm1lcihmdW5jdGlvbih4KSBnc3ViKCJbWzpkaWdpdDpdXSIsIiIseCkpKSMgcmVtb3ZlIG51bWJlcnMKICBjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cyxjb250ZW50X3RyYW5zZm9ybWVyKGZ1bmN0aW9uKHgpIGdzdWIoIiB0aCIsICIiLHgpKSkgIyByZW1vdmUgdGggKGxpa2UgNHRoKQogIGNvcnB1cyA8LSB0bV9tYXAoY29ycHVzLGNvbnRlbnRfdHJhbnNmb3JtZXIoZnVuY3Rpb24oeCkgZ3N1YigiaHR0cFtbOmFsbnVtOl1dKiIsIiIseCkpKSAjIHJlbW92ZSB1cmwKICBjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cyxjb250ZW50X3RyYW5zZm9ybWVyKGZ1bmN0aW9uKHgpIGljb252KHgsICJsYXRpbjEiLCAiQVNDSUkiLCBzdWI9IiIpKSkgIyByZW1vdmUgbm9uLUFTQ0lJIGNoYXJhY3RlcnMKICBjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cyxjb250ZW50X3RyYW5zZm9ybWVyKGZ1bmN0aW9uKHgpIGdzdWIoIihbWzphbHBoYTpdXSlcXDF7Mix9IiwgIlxcMVxcMSIsIHgpKSkgIyByZW1vdmUgcmVwZWF0ZWQgYWxwaGFiZXRzIGluIGEgd29yZAogIGdjKCkKICByZXR1cm4oY29ycHVzKQp9Cgpjb3JwdXMgPC0gY2xlYW5fY29ycHVzKHJhd19jb3JwdXMpCgpzYXZlKGNvcnB1cyxmaWxlPSdjb3JwdXMuUkRhdGEnKQpgYGAKCioqQnVpbGQgTi1HcmFtIG1vZGVsKioKCkluIHRoaXMgc2VjdGlvbiwgd2UnbGwgYnVpbGQgTi1HcmFtIG1vZGVscywgbmFtZWx5IHVuaS1ncmFtLCBiaS1ncmFtLCBhbmQgdHJpLWdyYW0uCgpUaGUgdG9wIDEwIHdvcmQgZnJlcXVuY2llcyBhbmQgd29yZCBjb3ZlcmFnZSBsaW5lIGFyZSBhbHNvIHBsb3R0ZWQgb3V0LgoKYGBge3J9CmxvYWQoJ2NvcnB1cy5SRGF0YScpCiMgdW5pZ3JhbQpsaWJyYXJ5KHRtKQpsaWJyYXJ5KFJXZWthKQp1bmlncmFtIDwtIGZ1bmN0aW9uKHgpIE5HcmFtVG9rZW5pemVyKHgsIFdla2FfY29udHJvbChtaW4gPSAxLCBtYXggPSAxKSkKdGRtMTwtVGVybURvY3VtZW50TWF0cml4KGNvcnB1cyxjb250cm9sID0gbGlzdCh0b2tlbml6ZSA9IHVuaWdyYW0pKQppbnZpc2libGUoZ2MoKSkKd29yZE1hdHJpeDEgPSBhcy5kYXRhLmZyYW1lKChhcy5tYXRyaXgoICB0ZG0xICkpICkgCmludmlzaWJsZShnYygpKQoKdjEgPC0gc29ydChyb3dTdW1zKHdvcmRNYXRyaXgxKSxkZWNyZWFzaW5nPVRSVUUpCmQxIDwtIGRhdGEuZnJhbWUod29yZCA9IG5hbWVzKHYxKSxmcmVxPXYxKQpybSh0ZG0xLHdvcmRNYXRyaXgxKQpmb3IoaSBpbiAxOjEwKSBnYygpCiMgd29yZCBwcm9iYWJsaXR5CmQxJHByb2IgPSBkMSRmcmVxL3N1bShkMSRmcmVxKQpiYXJwbG90KGQxWzE6MTAsYygncHJvYicpXSxuYW1lcy5hcmcgPSBkMVsxOjEwLCd3b3JkJ10sbWFpbiA9ICdwcm9iYWJpaWx0eSBvZiB1bmktZ3JhbSB3b3JkcycpCiMgd29yZCBjb3ZlcmFnZQpkMSRjdW1fcHJvYiA9IGN1bXN1bShkMSRwcm9iKQpwbG90KGQxJGN1bV9wcm9iLHR5cGU9J2wnLHlsYWIgPSAnY3VtdWxhdGl2ZSBwcm9iYWJsaXR5JyxtYWluID0gJ2NvdmVyYWdlIG9mIHVuaS1ncmFtIHdvcmRzJykKCgpzYXZlKGQxLGZpbGU9J2QxLlJEYXRhJykKYGBgCgoKYGBge3J9CiMgYmlncmFtCmJpZ3JhbSA8LSBmdW5jdGlvbih4KSBOR3JhbVRva2VuaXplcih4LCBXZWthX2NvbnRyb2wobWluID0gMiwgbWF4ID0gMikpCnRkbTI8LVRlcm1Eb2N1bWVudE1hdHJpeChjb3JwdXMsY29udHJvbCA9IGxpc3QodG9rZW5pemUgPSBiaWdyYW0pKQppbnZpc2libGUoZ2MoKSkKd29yZE1hdHJpeDIgPSBhcy5tYXRyaXgoICB0ZG0yICkKd29yZE1hdHJpeDIgPSBhcy5kYXRhLmZyYW1lKHdvcmRNYXRyaXgyICkgCmludmlzaWJsZShnYygpKQoKdjIgPC0gc29ydChyb3dTdW1zKHdvcmRNYXRyaXgyKSxkZWNyZWFzaW5nPVRSVUUpCmQyIDwtIGRhdGEuZnJhbWUod29yZCA9IG5hbWVzKHYyKSxmcmVxPXYyKQpybSh0ZG0yLHdvcmRNYXRyaXgyKQpmb3IoaSBpbiAxOjEwKSBnYygpCgojIHdvcmQgcHJvYmFibGl0eQpkMiRwcm9iID0gZDIkZnJlcS9zdW0oZDIkZnJlcSkKYmFycGxvdChkMlsxOjEwLGMoJ3Byb2InKV0sbmFtZXMuYXJnID0gZDJbMToxMCwnd29yZCddLG1haW4gPSAncHJvYmFiaWlsdHkgb2YgYmktZ3JhbSB3b3JkcycpCiMgd29yZCBjb3ZlcmFnZQpkMiRjdW1fcHJvYiA9IGN1bXN1bShkMiRwcm9iKQpwbG90KGQyJGN1bV9wcm9iLHR5cGU9J2wnLHlsYWIgPSAnY3VtdWxhdGl2ZSBwcm9iYWJsaXR5JyxtYWluID0gJ2NvdmVyYWdlIG9mIGJpLWdyYW0gd29yZHMnKQoKc2F2ZShkMixmaWxlPSdkMi5SRGF0YScpCgpgYGAKCgpgYGB7cn0KIyB0cmlncmFtCnRyaWdyYW0gPC0gZnVuY3Rpb24oeCkgTkdyYW1Ub2tlbml6ZXIoeCwgV2VrYV9jb250cm9sKG1pbiA9IDMsIG1heCA9IDMpKQp0ZG0zPC1UZXJtRG9jdW1lbnRNYXRyaXgoY29ycHVzLGNvbnRyb2wgPSBsaXN0KHRva2VuaXplID0gdHJpZ3JhbSkpCmludmlzaWJsZShnYygpKQp3b3JkTWF0cml4MyA9IGFzLm1hdHJpeCggIHRkbTMgKQppbnZpc2libGUoZ2MoKSkKd29yZE1hdHJpeDMgPSBhcy5kYXRhLmZyYW1lKCB3b3JkTWF0cml4MyApIAppbnZpc2libGUoZ2MoKSkKCnYzIDwtIHNvcnQocm93U3Vtcyh3b3JkTWF0cml4MyksZGVjcmVhc2luZz1UUlVFKQpkMyA8LSBkYXRhLmZyYW1lKHdvcmQgPSBuYW1lcyh2MyksZnJlcT12MykKcm0odGRtMyx3b3JkTWF0cml4MykKZm9yKGkgaW4gMToxMCkgZ2MoKQoKIyB3b3JkIHByb2JhYmxpdHkKZDMkcHJvYiA9IGQzJGZyZXEvc3VtKGQzJGZyZXEpCmJhcnBsb3QoZDNbMToxMCxjKCdwcm9iJyldLG5hbWVzLmFyZyA9IGQzWzE6MTAsJ3dvcmQnXSxtYWluID0gJ3Byb2JhYmlpbHR5IG9mIHRyaS1ncmFtIHdvcmRzJykKIyB3b3JkIGNvdmVyYWdlCmQzJGN1bV9wcm9iID0gY3Vtc3VtKGQzJHByb2IpCnBsb3QoZDMkY3VtX3Byb2IsdHlwZT0nbCcseWxhYiA9ICdjdW11bGF0aXZlIHByb2JhYmxpdHknLG1haW4gPSAnY292ZXJhZ2Ugb2YgdHJpLWdyYW0gd29yZHMnKQoKc2F2ZShkMyxmaWxlPSdkMy5SRGF0YScpCgoKYGBgCgoKKipNb2RlbGxpbmcqKgoKSW4gdGhpcyBzZWN0aW9uLCB3ZSdsbCBidWlsZCB0aGUgbW9kZWwuIAoKYGBge3J9CiMgQ3JlYXRlIGRhdGEgZnJhbWVzIHdpdGggd29yZCAocGFpcikgY291bnQKdW5pX3dvcmRzIDwtIGRhdGEuZnJhbWUod29yZF8xID0gZDEkd29yZCwgY291bnQgPSBkMSRmcmVxKQoKYmlfd29yZHMgPC0gZGF0YS5mcmFtZSgKICB3b3JkXzEgPSBzYXBwbHkoc3Ryc3BsaXQoYXMuY2hhcmFjdGVyKGQyJHdvcmQpLCAiICIsIGZpeGVkID0gVFJVRSksICdbWycsIDEpLAogIHdvcmRfMiA9IHNhcHBseShzdHJzcGxpdChhcy5jaGFyYWN0ZXIoZDIkd29yZCksICIgIiwgZml4ZWQgPSBUUlVFKSwgJ1tbJywgMiksCiAgY291bnQgPSBkMiRmcmVxKQoKdHJpX3dvcmRzIDwtIGRhdGEuZnJhbWUoCiAgd29yZF8xID0gc2FwcGx5KHN0cnNwbGl0KGFzLmNoYXJhY3RlcihkMyR3b3JkKSwgIiAiLCBmaXhlZCA9IFRSVUUpLCAnW1snLCAxKSwKICB3b3JkXzIgPSBzYXBwbHkoc3Ryc3BsaXQoYXMuY2hhcmFjdGVyKGQzJHdvcmQpLCAiICIsIGZpeGVkID0gVFJVRSksICdbWycsIDIpLAogIHdvcmRfMyA9IHNhcHBseShzdHJzcGxpdChhcy5jaGFyYWN0ZXIoZDMkd29yZCksICIgIiwgZml4ZWQgPSBUUlVFKSwgJ1tbJywgMyksCiAgY291bnQgPSBkMyRmcmVxKQoKYGBgCgpXZSB3aWxsIGFwcGx5IEtuZXNlci1LbmV5IHNtb290aGluZyBvbiB0aGUgbW9kZWxzLiBZb3UgbWF5IHJlZmVyIHRvIHRoZSBhbGdvcml0aG0gb24gW3dpa2lwZWRpYV0oaHR0cHM6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvS25lc2VyJUUyJTgwJTkzTmV5X3Ntb290aGluZykuIEZvciB0aGUgZGlzY291bnRpbmcgZmFjdG9yLCBJIHVzZWQgMC43NSBhY2NvcmRpbmcgdGhlIHBhZ2UgNjYgb2YgdGhpcyBbbGVjdHVyZV0oaHR0cDovL2NvbXB1dGF0aW9uYWwtbGluZ3Vpc3RpY3MtY2xhc3Mub3JnL3NsaWRlcy8wNC1uLWdyYW0tbGFuZ3VhZ2UtbW9kZWxzLnBkZikKCmBgYHtyfQojIE4tR3JhbSBtb2RlbCBwcm9iYWJsaXR5IHdpdGggS25lc2VyIEtuZXkgU21vb3RoaW5nCiMgaHR0cDovL2NvbXB1dGF0aW9uYWwtbGluZ3Vpc3RpY3MtY2xhc3Mub3JnL3NsaWRlcy8wNC1uLWdyYW0tbGFuZ3VhZ2UtbW9kZWxzLnBkZgojIHBhZ2UgNjYKCmRpc2NvdW50X3ZhbHVlIDwtIDAuNzUKCiMgdW5pLWdyYW0KdW5pX3dvcmRzJHByb2IgPSB1bmlfd29yZHMkY291bnQgLyBzdW0odW5pX3dvcmRzJGNvdW50KQp1bmlfd29yZHMkd29yZF8xID0gYXMuY2hhcmFjdGVyKHVuaV93b3JkcyR3b3JkXzEpCgojIGJpLWdyYW0KYmlfdzFfY291bnQgPSBhZ2dyZWdhdGUoYmlfd29yZHNbLCd3b3JkXzEnXSxieT1saXN0KGJpX3dvcmRzWywnd29yZF8xJ10pLGxlbmd0aCkKbmFtZXMoYmlfdzFfY291bnQpPWMoJ3dvcmRfMScsJ2NvdW50JykKCmJpX3dvcmRzID0gbWVyZ2UoYmlfd29yZHMsdW5pX3dvcmRzWyxjKCd3b3JkXzEnLCdjb3VudCcpXSxieS54PSd3b3JkXzEnLGJ5Lnk9J3dvcmRfMScsYWxsLng9VCkKbmFtZXMoYmlfd29yZHMpPWMoIndvcmRfMSIsIndvcmRfMiIsJ2NvdW50Jywnd29yZF8xX3VuaV9jb3VudCcpCgpiaV93b3JkcyA9IG1lcmdlKGJpX3dvcmRzLGJpX3cxX2NvdW50LGJ5PSd3b3JkXzEnLGFsbC54PVQpCm5hbWVzKGJpX3dvcmRzKT1jKCd3b3JkXzEnLCd3b3JkXzInLCdjb3VudCcsJ3dvcmRfMV91bmlfY291bnQnLCd3b3JkXzFfYmlfY291bnQnKQoKYmlfd29yZHMgPSBtZXJnZShiaV93b3Jkcyx1bmlfd29yZHNbLGMoJ3dvcmRfMScsJ3Byb2InKV0sYnkueD0nd29yZF8yJyxieS55PSd3b3JkXzEnLGFsbC54PVQpCm5hbWVzKGJpX3dvcmRzKSA9IGMoJ3dvcmRfMScsJ3dvcmRfMicsJ2NvdW50Jywnd29yZF8xX3VuaV9jb3VudCcsJ3dvcmRfMV9iaV9jb3VudCcsJ3dvcmRfMl91bmlfcHJvYicpCgpiaV93b3JkcyRwcm9iID0gKGJpX3dvcmRzJGNvdW50IC0gZGlzY291bnRfdmFsdWUpL2JpX3dvcmRzJHdvcmRfMV91bmlfY291bnQgKyBkaXNjb3VudF92YWx1ZS9iaV93b3JkcyR3b3JkXzFfdW5pX2NvdW50KmJpX3dvcmRzJHdvcmRfMV9iaV9jb3VudCpiaV93b3JkcyR3b3JkXzJfdW5pX3Byb2IKYmlfd29yZHM9Ymlfd29yZHNbLGMoJ3dvcmRfMScsJ3dvcmRfMicsJ2NvdW50Jywnd29yZF8xX3VuaV9jb3VudCcsJ3dvcmRfMV9iaV9jb3VudCcsJ3dvcmRfMl91bmlfcHJvYicsJ3Byb2InKV0KYmlfd29yZHMgPSBiaV93b3Jkc1tvcmRlcihiaV93b3JkcyR3b3JkXzEsYmlfd29yZHMkd29yZF8yKSxdCgpiaV93b3JkcyR3b3JkXzEgPSBhcy5jaGFyYWN0ZXIoYmlfd29yZHMkd29yZF8xKQpiaV93b3JkcyR3b3JkXzIgPSBhcy5jaGFyYWN0ZXIoYmlfd29yZHMkd29yZF8yKQojIHRyaS1ncmFtCgp0cmlfdzEyX2NvdW50ID0gYWdncmVnYXRlKHRyaV93b3Jkc1ssYygnd29yZF8xJyldLGJ5PWxpc3QodHJpX3dvcmRzJHdvcmRfMSx0cmlfd29yZHMkd29yZF8yKSxsZW5ndGgpCm5hbWVzKHRyaV93MTJfY291bnQpPWMoJ3dvcmRfMScsJ3dvcmRfMicsJ2NvdW50JykKCnRyaV93b3JkcyA9IG1lcmdlKHRyaV93b3JkcyxiaV93b3Jkc1ssYygnd29yZF8xJywnd29yZF8yJywnY291bnQnKV0sYnk9Yygnd29yZF8xJywnd29yZF8yJyksYWxsLng9VCxhbGwueT1GKQpuYW1lcyh0cmlfd29yZHMpID0gYygnd29yZF8xJywnd29yZF8yJywnd29yZF8zJywnY291bnQnLCd3b3JkXzEyX2JpX2NvdW50JykKIyBDbjIgPSB3b3JkXzEyX2JpX2NvdW50Cgp0cmlfd29yZHMgPSBtZXJnZSh0cmlfd29yZHMsdHJpX3cxMl9jb3VudCxieT1jKCd3b3JkXzEnLCd3b3JkXzInKSxhbGwueD1UKQpuYW1lcyh0cmlfd29yZHMpID0gYygnd29yZF8xJywnd29yZF8yJywnd29yZF8zJywnY291bnQnLCd3b3JkXzEyX2JpX2NvdW50Jywnd29yZF8xMl90cmlfY291bnQnKQoKdHJpX3dvcmRzID0gbWVyZ2UodHJpX3dvcmRzLGJpX3dvcmRzWyxjKCd3b3JkXzEnLCd3b3JkXzInLCdwcm9iJyldLGJ5Lng9Yygnd29yZF8yJywnd29yZF8zJyksYnkueT1jKCd3b3JkXzEnLCd3b3JkXzInKSxhbGwueD1UKQpuYW1lcyh0cmlfd29yZHMpID0gYygnd29yZF8xJywnd29yZF8yJywnd29yZF8zJywnY291bnQnLCd3b3JkXzEyX2JpX2NvdW50Jywnd29yZF8xMl90cmlfY291bnQnLCd3b3JkXzIzX2JpX3Byb2InKQoKdHJpX3dvcmRzJHByb2IgPSAgKHRyaV93b3JkcyRjb3VudCAtIGRpc2NvdW50X3ZhbHVlKS90cmlfd29yZHMkd29yZF8xMl9iaV9jb3VudCArIGRpc2NvdW50X3ZhbHVlL3RyaV93b3JkcyR3b3JkXzEyX2JpX2NvdW50KnRyaV93b3JkcyR3b3JkXzEyX3RyaV9jb3VudCp0cmlfd29yZHMkd29yZF8yM19iaV9wcm9iCgp0cmlfd29yZHMkd29yZF8xID0gYXMuY2hhcmFjdGVyKHRyaV93b3JkcyR3b3JkXzEpCnRyaV93b3JkcyR3b3JkXzIgPSBhcy5jaGFyYWN0ZXIodHJpX3dvcmRzJHdvcmRfMikKdHJpX3dvcmRzJHdvcmRfMyA9IGFzLmNoYXJhY3Rlcih0cmlfd29yZHMkd29yZF8zKQoKc2F2ZSh1bmlfd29yZHMsYmlfd29yZHMsdHJpX3dvcmRzLGZpbGU9J25fZ3JhbV9wcm9iLlJEYXRhJykKYGBgCgoKCioqUHJlZGljdGlvbioqCgpJbiB0aGlzIHByZWRpY3Rpb24gc2VjdGlvbiwgd2UgYWxzbyB1c2VkIGJhY2stb2ZmIG1vZGVsLCBpZS4gd2hlbiBubyBjYW5kaWRhdGUgaXMgYXZhaWxhYmxlIGluIHRyaS1ncmFtLCB3ZSdsbCBmYWxsIGJhY2sgdG8gYmktZ3JhbSBhbmQgZXZlbiB1bmktZ3JhbS4KCgpgYGB7cn0KIyBwcmVkaWN0aW9uCnByZWRpY3RfdW5pIDwtIGZ1bmN0aW9uKCkgewogIHByaW50KCd1bmknKQogIG1heF9wcm9iID0gbWF4KHVuaV93b3JkcyRwcm9iKQogIGNhbmRpZGF0ZXM9dW5pX3dvcmRzW3VuaV93b3JkcyRwcm9iPT1tYXhfcHJvYixdCiAgcmV0dXJuKHNhbXBsZShjYW5kaWRhdGVzJHdvcmRfMiwxKSkKfQoKcHJlZGljdF9iaSA8LSBmdW5jdGlvbih3MSkgewogIHByaW50KCdiaScpCiAgY2FuZGlkYXRlcyA9IGJpX3dvcmRzWyhiaV93b3JkcyR3b3JkXzEpPT13MSxjKCd3b3JkXzInLCdwcm9iJyldCiAgY2FuZGlkYXRlcyA9IGNhbmRpZGF0ZXNbb3JkZXIoLWNhbmRpZGF0ZXMkcHJvYiksXQogIGNhbmRpZGF0ZXMgPSBjYW5kaWRhdGVzWyFpcy5uYShjYW5kaWRhdGVzJHByb2IpLF0KICBpZiAobnJvdyhjYW5kaWRhdGVzKSA+PTEpewogICAgbWF4X3Byb2IgPSBtYXgoY2FuZGlkYXRlcyRwcm9iKQogICAgY2FuZGlkYXRlcz1jYW5kaWRhdGVzW2NhbmRpZGF0ZXMkcHJvYj09bWF4X3Byb2IsXQogICAgcmV0dXJuKHNhbXBsZShjYW5kaWRhdGVzJHdvcmRfMiwxKSkKICB9IGVsc2UgCiAgICB7cmV0dXJuKHByZWRpY3RfdW5pKCkpfQp9CgpwcmVkaWN0X3RyaSA8LSBmdW5jdGlvbih3MSwgdzIpIHsKICBwcmludCgndHJpJykKICBjYW5kaWRhdGVzID0gdHJpX3dvcmRzWyh0cmlfd29yZHMkd29yZF8xKT09dzEgJiB0cmlfd29yZHMkd29yZF8yID09IHcyLCBjKCd3b3JkXzMnLCdwcm9iJyldCiAgY2FuZGlkYXRlcyA9IGNhbmRpZGF0ZXNbb3JkZXIoLWNhbmRpZGF0ZXMkcHJvYiksXQogIGNhbmRpZGF0ZXMgPSBjYW5kaWRhdGVzWyFpcy5uYShjYW5kaWRhdGVzJHByb2IpLF0KICBpZiAobnJvdyhjYW5kaWRhdGVzKSA+PTEpewogICAgbWF4X3Byb2IgPSBtYXgoY2FuZGlkYXRlcyRwcm9iKQogICAgY2FuZGlkYXRlcz1jYW5kaWRhdGVzW2NhbmRpZGF0ZXMkcHJvYj09bWF4X3Byb2IsXQogICAgcmV0dXJuKHNhbXBsZShjYW5kaWRhdGVzJHdvcmRfMywxKSkKICB9IGVsc2UgCiAgICB7cmV0dXJuKHByZWRpY3RfYmkodzIpKX0KCn0KCmBgYAoKCgoqKm1pc2NlbGxhbmVvdXMqKgoKT3RoZXIgbWlzY2VsbGFuZW91cyBxdWVzdGlvbnMgZnJvbSB0aGlzIHRhc2s6CgoxLiBDYW4geW91IHRoaW5rIG9mIGEgd2F5IHRvIGluY3JlYXNlIHRoZSBjb3ZlcmFnZSAtLSBpZGVudGlmeWluZyB3b3JkcyB0aGF0IG1heSBub3QgYmUgaW4gdGhlIGNvcnBvcmEgb3IgdXNpbmcgYSBzbWFsbGVyIG51bWJlciBvZiB3b3JkcyBpbiB0aGUgZGljdGlvbmFyeSB0byBjb3ZlciB0aGUgc2FtZSBudW1iZXIgb2YgcGhyYXNlcz8gCisgSWYgdHdvIHdvcmRzIGFyZSBoaWdobHkgY29ycmVsYXRlZC4gZm9yIGV4YW1wbGUgd29yZHMgQSBhbmQgQiwgaWUuIHdlIGhhdmUgKHgsIEEpIGFuZCAoeCwgQikgKHggaXMgYW4gYXJiaXRyYXkgd29yZCksIGJ1dCBub3cgaW4gb3VyIHNhbXBsZSBkYXRhLCB3ZSBvbmx5IHNlZSAoeSxBKSwgaW4gdGhpcyBjYXNlIGFsc28gYXNzaWduIHNhbWUgZnJlcXVlbmN5IHRvICh5LEIpLgoKCgoKCg==