Click the “Code” button on the right to show the required libraries.
library(ggplot2)
library(RWeka)
library(tm)
library(quanteda)
library(data.table)
We load the English versions of the Twitter, news, and blog corpora.
setwd("./final/en_US/")
twitter <- readLines("en_US.twitter.txt", skipNul = TRUE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", skipNul = TRUE, encoding = "UTF-8")
blog <- readLines("en_US.blogs.txt", skipNul = TRUE, encoding = "UTF-8")
Let’s check how much memory the corpora occupy in the R environment.
blog_size <- format(object.size(blog), units = "auto")
news_size <- format(object.size(news), units = "auto")
twitter_size <- format(object.size(twitter), units = "auto")
sprintf("Memory occupied by blog corpus: %s", blog_size)
## [1] "Memory occupied by blog corpus: 248.5 Mb"
sprintf("Memory occupied by news corpus: %s", news_size)
## [1] "Memory occupied by news corpus: 249.6 Mb"
sprintf("Memory occupied by twitter corpus: %s", twitter_size)
## [1] "Memory occupied by twitter corpus: 301.4 Mb"
Our goal is a web application that is accessible from both PCs and mobile devices, and learning from the whole dataset would make the application extremely slow. Hence, we need to sample the data when building the application.
Let’s check the number of documents in each corpus.
twitter_ndocs <- length(twitter)
news_ndocs <- length(news)
blog_ndocs <- length(blog)
sprintf("Number of documents in blog corpus: %s", blog_ndocs)
## [1] "Number of documents in blog corpus: 899288"
sprintf("Number of documents in news corpus: %s", news_ndocs)
## [1] "Number of documents in news corpus: 1010242"
sprintf("Number of documents in twitter corpus: %s", twitter_ndocs)
## [1] "Number of documents in twitter corpus: 2360148"
Let’s also look at the distribution of document lengths (in characters) in each corpus.
summary(nchar(blog))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 47 156 230 329 40833
qplot(nchar(blog), geom = "histogram", main = "Length of Characters in Blog Corpus",
xlab = "Length of characters", ylab = "Document count", fill = I("blue"), col = I("black"))
summary(nchar(twitter))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 37.00 64.00 68.68 100.00 140.00
qplot(nchar(twitter), geom = "histogram", main = "Length of Characters in Twitter Corpus",
xlab = "Length of characters", ylab = "Document count", fill = I("green"), col = I("black"))
summary(nchar(news))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 110.0 185.0 201.2 268.0 11384.0
qplot(nchar(news), geom = "histogram", main = "Length of Characters in News Corpus",
xlab = "Length of characters", ylab = "Document count", fill = I("yellow"), col = I("black"))
The distribution of character lengths is extremely right-skewed for the news and blog corpora. Some documents in all three corpora are as short as 1–2 characters.
Sampling from the whole dataset gives us a lot of uncertainty because two undesirable things can happen:
Samples with too few characters do little to make our model smarter
Sampling from an extremely right-skewed distribution, as seen in the news and blog corpora, makes it hard to control how much memory we use
For those reasons, we will subset each corpus to the documents whose character length falls between the first and third quartiles.
Click the “Code” button on the right to see the code that subsets the corpora.
FirstToThirdQuantile <- function(corpus) {
  # Keep only documents whose character length lies strictly between
  # the first and third quartiles (the interquartile range)
  q1 <- quantile(nchar(corpus), .25)
  q3 <- quantile(nchar(corpus), .75)
  corpus <- corpus[nchar(corpus) < q3]
  corpus <- corpus[nchar(corpus) > q1]
  return(corpus)
}
blog <- FirstToThirdQuantile(blog)
twitter <- FirstToThirdQuantile(twitter)
news <- FirstToThirdQuantile(news)
qplot(nchar(blog), geom = "histogram", main = "Length of Characters in Blog Corpus",
xlab = "Length of characters", ylab = "Document count", fill = I("blue"), col = I("black"))
qplot(nchar(twitter), geom = "histogram", main = "Length of Characters in Twitter Corpus",
xlab = "Length of characters", ylab = "Document count", fill = I("green"), col = I("black"))
qplot(nchar(news), geom = "histogram", main = "Length of Characters in News Corpus",
xlab = "Length of characters", ylab = "Document count", fill = I("yellow"), col = I("black"))
This is much better!
Next, we draw a sample of 10,000 documents from each corpus. Let’s see how much memory the samples occupy.
GenerateSample <- function(corpus, sample_size = 10000, seed = 123) {
  # Draw a reproducible random sample of documents from the corpus
  set.seed(seed)
  sample_index <- sample(seq_along(corpus), sample_size)
  sample_corpus <- corpus[sample_index]
  return(sample_corpus)
}
blog_sample <- GenerateSample(blog)
twitter_sample <- GenerateSample(twitter)
news_sample <- GenerateSample(news)
blog_sample_size <- format(object.size(blog_sample), units = "auto")
news_sample_size <- format(object.size(news_sample), units = "auto")
twitter_sample_size <- format(object.size(twitter_sample), units = "auto")
sprintf("Memory occupied by blog sample: %s", blog_sample_size)
## [1] "Memory occupied by blog sample: 2.2 Mb"
sprintf("Memory occupied by news sample: %s", news_sample_size)
## [1] "Memory occupied by news sample: 2.3 Mb"
sprintf("Memory occupied by twitter sample: %s", twitter_sample_size)
## [1] "Memory occupied by twitter sample: 1.4 Mb"
Let’s take a look at the content of each sample.
Blog sample
head(blog_sample, 3)
## [1] "It has helped my fiction writing immensely. When I read fiction I wrote five or so years ago, I can see the writing. When I read things I write now - well, let's just say that, to use another word thrown around a lot on The Voice (Hi Joel!), I believe the voice wins."
## [2] "I also bought some lovely little embroidery hoops for some pictures I want to make for an arts trail event I am doing. I really do need to get off the computer and make something. Ah now I understand the button tidying. Its a distraction from actually sitting down and making something. It all makes sense now!"
## [3] "approvals from the ministry or the Federal executive council which"
Twitter sample
head(twitter_sample, 3)
## [1] "Rewinding that at bat and saw that Fielder just smirks before hitting it out"
## [2] "“: \"Tell me Caesar, do I smell like roses to you?\" -Peeta ” LOVE THE MOVIE, JUST LIKE THE BOOK! <3"
## [3] "they say nothing in life is impossible-- well you try to nail jelly to a tree"
News sample
head(news_sample, 3)
## [1] "The first events of the day began with about 150 workers such as baggage handlers and cabin cleaners walking off their jobs at LAX around 4 a.m."
## [2] "A 26-year-old man was arrested in the Mission District Tuesday evening for allegedly using a potato peeler to rob a man in a restaurant bathroom, San Francisco police said."
## [3] "The blame can largely rest on Anthony, again outclassed by James. Anthony blamed the Heat attentive D, not himself."
Finally, we concatenate all of the samples into one corpus and store it in the variable corpus_ori.
corpus_ori <- c(blog_sample, twitter_sample, news_sample)
We will apply the following transformations to our new corpus:
Remove profanity
Convert to lowercase
Remove numbers and special characters
profanity <- readLines("./profanity.txt", encoding = "UTF-8")
corpus <- Corpus(VectorSource(corpus_ori))
changetospace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))  # helper transformer (not used below)
corpus <- tm_map(corpus, content_transformer(tolower))  # convert to lowercase
removeSpecialChars <- content_transformer(function(x) gsub("[^a-zA-Z']", " ", x))  # keep letters and apostrophes only
corpus <- tm_map(corpus, removeSpecialChars)            # remove numbers and special characters
corpus <- tm_map(corpus, stripWhitespace)               # collapse repeated whitespace
corpus <- tm_map(corpus, removeWords, profanity)        # remove profanity
corpus_vector <- content(corpus)
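As a quick sanity check (an optional step, not shown in the output above), we can peek at a few cleaned documents to confirm that case, numbers, and special characters were handled as intended:
head(corpus_vector, 3)  # inspect the first few cleaned documents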
From the cleaned corpus, we then generate unigram, bigram, and trigram tokenizations. Let’s look at the top 20 unigrams, bigrams, and trigrams below, ranked by how frequently they appear in our cleaned corpus:
getNgramFreqs <- function(ng, dat, ignores = NULL,
                          sort.by.ngram = TRUE, sort.by.freq = FALSE) {
  # Build a document-feature matrix of n-grams; the ngrams/ignoredFeatures
  # arguments below come from the older quanteda dfm() interface
  if (is.null(ignores)) {
    dat.dfm <- dfm(dat, ngrams = ng, what = "fasterword", verbose = FALSE)
  } else {
    dat.dfm <- dfm(dat, ngrams = ng, ignoredFeatures = ignores,
                   what = "fasterword", verbose = FALSE)
  }
  rm(dat)
  # quanteda docfreq gets the document frequency of terms in the dfm
  ngram.freq <- docfreq(dat.dfm)
  if (sort.by.freq)  { ngram.freq <- sort(ngram.freq, decreasing = TRUE) }
  if (sort.by.ngram) { ngram.freq <- ngram.freq[sort(names(ngram.freq))] }
  rm(dat.dfm)
  return(ngram.freq)
}
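Note that the ngrams, ignoredFeatures, and what arguments used in getNgramFreqs belong to older quanteda releases. If you run this report against quanteda 2.x or later, where dfm() no longer builds n-grams directly, an equivalent frequency computation would look roughly like the sketch below (getNgramFreqs2 is an illustrative name; treat it as an untested adaptation, not the code used to produce the results above):
getNgramFreqs2 <- function(ng, dat, ignores = NULL) {
  # Tokenize first, then build n-grams from the tokens (quanteda >= 2.0 style)
  toks <- tokens(dat, what = "fasterword")
  if (!is.null(ignores)) {
    toks <- tokens_remove(toks, ignores)
  }
  toks <- tokens_ngrams(toks, n = ng)  # joins tokens with "_" by default
  docfreq(dfm(toks))                   # document frequency of each n-gram
}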
getNgramTables <- function(ng, linesCorpus, prefixFilter = NULL) {
  ngrams <- getNgramFreqs(ng, linesCorpus)
  ngrams_dt <- data.table(ngram = names(ngrams), freq = ngrams)
  # Drop any sentence-start placeholder tokens
  if (length(grep('^SOS', ngrams_dt$ngram)) > 0) {
    ngrams_dt <- ngrams_dt[-grep('^SOS', ngrams_dt$ngram), ]
  }
  # Optionally keep only n-grams starting with a given prefix
  if (!is.null(prefixFilter)) {
    regex <- sprintf('%s%s', '^', prefixFilter)
    ngrams_dt <- ngrams_dt[grep(regex, ngrams_dt$ngram), ]
  }
  return(ngrams_dt)
}
unigram <- getNgramTables(1, corpus_vector)
bigram <- getNgramTables(2, corpus_vector)
trigram <- getNgramTables(3, corpus_vector)
unigram <- unigram[order(-rank(freq))]
bigram <- bigram[order(-rank(freq))]
trigram <- trigram[order(-rank(freq))]
unigram <- unigram[complete.cases(unigram),]
unigram$ngram <- factor(unigram$ngram, levels = unigram$ngram[order(unigram$freq, decreasing=TRUE)])
bigram$ngram <- factor(bigram$ngram, levels = bigram$ngram[order(bigram$freq, decreasing=TRUE)])
trigram$ngram <- factor(trigram$ngram, levels = trigram$ngram[order(trigram$freq, decreasing=TRUE)])
ggplot(unigram[1:20,], aes(x= ngram, y = freq)) +
geom_bar(stat="identity") +
labs(title= "Top 20 Unigram", y= "Count", x="") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(bigram[1:20,], aes(x= ngram, y = freq)) +
geom_bar(stat="identity") +
labs(title= "Top 20 Bigram", y= "Count", x="") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(trigram[1:20,], aes(x= ngram, y = freq)) +
geom_bar(stat="identity") +
labs(title= "Top 20 Trigram", y= "Count", x="") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
We now have unigram, bigram, and trigram frequency tables. Using this data, we can build an application that predicts the next word from a user’s input sentence or phrase.
A backoff model that generates four candidate next words from the learned n-gram frequencies, giving priority to the higher-order n-grams, can be built. Additional smoothing or discounting methods, such as Katz’s backoff model, can also be implemented to increase accuracy. However, we must take the trade-off in computation speed into account. A minimal sketch of such a predictor is shown below.
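To make this concrete, here is a minimal sketch of a next-word predictor built on the tables above. It uses a simple “stupid backoff” strategy rather than Katz’s backoff, and it assumes quanteda’s default “_” separator inside the n-gram strings; PredictNextWord and n_pred are illustrative names, not part of the analysis above.
# A minimal "stupid backoff" sketch: it relies on the unigram/bigram/trigram
# data.tables built earlier, which are already sorted by decreasing frequency.
PredictNextWord <- function(phrase, n_pred = 4) {
  # Clean the input roughly the same way the corpus was cleaned
  words <- tolower(gsub("[^a-zA-Z']", " ", phrase))
  words <- unlist(strsplit(words, "\\s+"))
  words <- words[words != ""]
  candidates <- character(0)
  # Highest order first: match the last two words against the trigram table
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = "_")
    hits <- trigram[grep(sprintf("^%s_", prefix), ngram)]
    candidates <- c(candidates, sub(".*_", "", hits$ngram))
  }
  # Back off to the bigram table: match the last word only
  if (length(words) >= 1 && length(candidates) < n_pred) {
    hits <- bigram[grep(sprintf("^%s_", tail(words, 1)), ngram)]
    candidates <- c(candidates, sub(".*_", "", hits$ngram))
  }
  # Finally, back off to the most frequent unigrams
  candidates <- c(candidates, as.character(head(unigram$ngram, n_pred)))
  head(unique(candidates), n_pred)
}
PredictNextWord("thanks for the")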