To build a model that predicts the next word, one must understand the distribution of, and relationships between, the words, tokens and phrases in a text. In this exercise we are given three types of text, drawn from blogs, news and Twitter, to study and analyse. We carry out a thorough exploratory analysis of the data to understand the basic distribution of words and the relationships between words in the corpora. This report contains tables of the line and word counts of both the original and the sampled datasets, as well as plots of the most frequent words (top-10 unigrams) and word pairs (top-10 bigrams and trigrams) in the data.
library(tm)
library(RWeka)
library(openNLP)
library(qdap)
library(stringi)
library(stringr)
library(pryr)
blogs = readLines("~/Capstone/final/en_US/en_US.blogs.txt",encoding="UTF-8",skipNul = TRUE)
news = readLines("~/Capstone/final/en_US/en_US.news.txt",encoding="UTF-8", skipNul = TRUE)
## Warning in readLines("~/Capstone/final/en_US/en_US.news.txt", encoding
## = "UTF-8", : incomplete final line found on '~/Capstone/final/en_US/
## en_US.news.txt'
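The warning is harmless: it only means the last line of the news file is not terminated by a newline. Reading the file through a binary connection is a common way to make sure the entire file is read on every platform; a sketch:
con <- file("~/Capstone/final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)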
twitter = readLines("~/Capstone/final/en_US/en_US.twitter.txt",encoding="UTF-8", skipNul = TRUE)
length(blogs) # 899288
## [1] 899288
length(news) # 77259
## [1] 77259
length(twitter) # 2360148
## [1] 2360148
set.seed(123)
# Draw a uniform random sample: ~0.3% of the blog and twitter lines, ~3% of the news lines
blogs <- sample(blogs, floor(length(blogs) * 0.003))
news <- sample(news, floor(length(news) * 0.030))
twitter <- sample(twitter, floor(length(twitter) * 0.003))
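An equivalent way to draw such a sample is to flip a biased coin for every line; a small illustration (keep and blogs_alt are names introduced here only, not used elsewhere in the report):
# Keep each blog line independently with probability 0.3%;
# the sample size then varies slightly around length(blogs) * 0.003
keep <- rbinom(length(blogs), size = 1, prob = 0.003) == 1
blogs_alt <- blogs[keep]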
length(blogs) #2697
## [1] 2697
length(news) # 2317
## [1] 2317
length(twitter) # 7080
## [1] 7080
The line counts of the original and the sampled texts are given in the following table:
##   doc.text original sampled
## 1    blogs   899288    2697
## 2     news    77259    2317
## 3  twitter  2360148    7080
The word counts of the original and the sampled texts are given in the following table:
##   doc.text original sampled
## 1    blogs 37570839  109560
## 2     news  2651432   76958
## 3  twitter 30451170   91923
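The word counts can be obtained with stringi's stri_count_words; a minimal sketch (the report's own counting code is not shown; it would be run once on the full vectors before sampling and once after):
# Sum the per-line word counts for each source
data.frame(doc.text = c("blogs", "news", "twitter"),
           words = c(sum(stri_count_words(blogs)),
                     sum(stri_count_words(news)),
                     sum(stri_count_words(twitter))))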
In the following section we build a corpus from the sampled data for cleaning and analysis. We also show five examples of the cleaned texts with stop words removed.
bnttext <- c(blogs, news, twitter)
# Split the combined text into sentences with qdap
bnttext_sent <- sent_detect(bnttext, language = "en", model = NULL)
# Force the text to ASCII; stri_enc_toascii substitutes non-ASCII bytes
# with the control character \032, which we then strip out
ascbnt <- stri_enc_toascii(bnttext_sent)
ascbnt <- stri_replace_all_regex(ascbnt, '\032', '')
rm(blogs, news, twitter)  # free the raw samples
corpus <- VCorpus(VectorSource(ascbnt))
corpus <- tm_map(corpus, content_transformer(tolower), lazy = TRUE)
# Helper that bundles the encoding fixes and lower-casing into one step
CleanCorpora <- function(corpus) {
  # Replace non-UTF-8 characters, strip any remaining non-ASCII bytes, set lower case
  corpus <- tm_map(corpus, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
  corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, 'latin1', 'ASCII', sub = '')))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus
}
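This helper is defined but never invoked in the report; if used, it would replace the separate encoding and tolower steps with a single call:
corpus <- CleanCorpora(corpus)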
corpus <- tm_map(corpus, removeNumbers)
# Generic pattern remover: replace every match with a space
remover <- content_transformer(function(x, pattern) gsub(pattern, ' ', x))
corpus <- tm_map(corpus, remover, '[@][a-zA-Z0-9_]{1,15}')  # remove twitter usernames
corpus <- tm_map(corpus, remover, 'Ã|½Ã|¸¥')  # strip mojibake fragments from mis-encoded emoji
corpus <- tm_map(corpus, remover, 'í ½í¸¥')   # another common mis-encoded emoji sequence
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stemDocument, language = "english")  # stem the words
fulldata <- data.frame(text = unlist(sapply(corpus, `[`, "content")), stringsAsFactors = FALSE)
fulldata[1:5, 1]
## [1] " friend fill empti worri fear"
## [2] " trek jungl wild anim torrid desert travers danger eventu get hors gallop mountain love villag fill music song love stay carri blind snow fight wolv bite bitter wind"
## [3] " speci man grown one stage poorer longer possess strength interpret creat fiction produc nihilist"
## [4] " nihilist man judg world world exist"
## [5] "accord view exist mean patho vain nihilist patho time patho inconsist part nihilist"
# Tokenize the cleaned text column (not the whole data frame) into 1-, 2- and 3-grams with RWeka
one_grams <- NGramTokenizer(fulldata$text, Weka_control(min = 1, max = 1))
bi_grams <- NGramTokenizer(fulldata$text, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
tri_grams <- NGramTokenizer(fulldata$text, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
one_gramsDF <- data.frame(table(one_grams))
bi_gramsDF <- data.frame(table(bi_grams))
tri_gramsDF <- data.frame(table(tri_grams))
unigrams_sorted <- one_gramsDF[order(one_gramsDF$Freq,decreasing = TRUE),]
bigrams_sorted <- bi_gramsDF[order(bi_gramsDF$Freq,decreasing = TRUE),]
trigrams_sorted <- tri_gramsDF[order(tri_gramsDF$Freq,decreasing = TRUE),]
top10unigram <- unigrams_sorted[1:10,]
top10bigram <- bigrams_sorted[1:10,]
top10trigram <- trigrams_sorted[1:10,]
The following three plots show the top-10 unigrams, bigrams and trigrams:
par(oma = c(0, 0, 5, 0), mfrow = c(2, 2), mar = c(5, 2, 3, 2))
barplot(top10unigram$Freq, names.arg = top10unigram$one_grams, las = 3)  # las = 3 rotates the labels to vertical
title("Top10 Unigram")
barplot(top10bigram$Freq, names.arg = top10bigram$bi_grams, las = 3)
title("Top10 Bigram")
barplot(top10trigram$Freq, names.arg = top10trigram$tri_grams, las = 3)
title("Top10 Trigram")
object_size(corpus)  # size of the corpus in MB
## 28.7 MB
The size of the corpus without stop words is 28.7 MB.
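For memory budgeting it can also be useful to check the n-gram frequency tables themselves; a quick check with pryr (not part of the original output):
object_size(one_gramsDF, bi_gramsDF, tri_gramsDF)  # combined footprint of the three tables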
## [1] "our friend fill with empti worri and fear"
## [2] "the trek through jungl with wild anim the torrid desert that have to be travers with all their danger eventu they get hors gallop up mountain and down into love villag fill with music and song
he would have love to stay there but they had to carri on through blind snow fight wolv and the bite bitter wind"
## [3] "this same speci of man grown one stage poorer no longer possess the strength to interpret to creat fiction produc nihilist"
## [4] "a nihilist is a man who judg of the world as it is that it ought not to be and of the world as it ought to be that it doe not exist"
## [5] "accord to this view our exist has no mean the patho of in vain is the nihilist pathosat the same time as patho an inconsist on the part of the nihilist"