library(tm) #For Text Mining
library(stringi) #For string operations
library(wordcloud) #For generating a pretty word cloud
library(rJava) #Required for RWeka
library(RWeka) #For N-gram tokenization via NGramTokenizer
library(ggplot2) #For Plots
The data can be downloaded from the URL.
#Extracting data from en_US.blogs.txt file
con <- file("en_US.blogs.txt", open = "rb")
data_blogs <- readLines(con, encoding="UTF-8", skipNul = TRUE); close(con)
data_blogs <- iconv(data_blogs,'UTF-8', 'ASCII', "byte")
#Extracting data from en_US.news.txt file
con <- file("en_US.news.txt", open = "rb")
data_news <- readLines(con, encoding="UTF-8", skipNul = TRUE); close(con)
data_news <- iconv(data_news,'UTF-8', 'ASCII', "byte")
#Extracting data from en_US.twitter.txt file
con <- file("en_US.twitter.txt", open = "rb")
data_twitter <- readLines(con, encoding="UTF-8", skipNul = TRUE); close(con)
data_twitter <- iconv(data_twitter,'UTF-8', 'ASCII', "byte")
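The three read blocks above repeat the same pattern; a small helper could remove the duplication. This is just a sketch — the name read_text is my own, not part of the original report:
#Hypothetical helper to avoid repeating the read pattern above
read_text <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con))
  iconv(readLines(con, encoding = "UTF-8", skipNul = TRUE), 'UTF-8', 'ASCII', "byte")
}
#e.g. data_blogs <- read_text("en_US.blogs.txt")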
Issue faced:
While extracting the data, I initially used a plain readLines() call on each file, which did not read en_US.news.txt properly because of embedded NUL values: only about 77,259 of its 1,010,242 lines were fetched. Explicitly opening each file as a binary connection (open = "rb"), reading with skipNul = TRUE, and then closing the connection fetched the data properly.
size_blogs <- file.size("en_US.blogs.txt")
size_news <- file.size("en_US.news.txt")
size_twitter <- file.size("en_US.twitter.txt")
## Size (in bytes) of text file en_US.blogs.txt : 210160014
## Size (in bytes) of text file en_US.news.txt : 205811889
## Size (in bytes) of text file en_US.twitter.txt : 167105338
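For readability, the byte counts can be converted to megabytes (a quick sketch; dividing by 1024^2 gives roughly 200.4, 196.3 and 159.4 MB respectively):
round(c(size_blogs, size_news, size_twitter) / 1024^2, 1) #bytes to MB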
nlines_blogs <- length(data_blogs)
nlines_news <- length(data_news)
nlines_twitter <- length(data_twitter)
## Number of lines in text file en_US.blogs.txt : 899288
## Number of lines in text file en_US.news.txt : 1010242
## Number of lines in text file en_US.twitter.txt : 2360148
## Number of words in text file en_US.blogs.txt : 40220892
## Number of words in text file en_US.news.txt : 35731048
## Number of words in text file en_US.twitter.txt : 30528002
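The code that produced the word counts above is not shown; one way to reproduce them is with stri_count_words() from the already-loaded stringi package (a sketch — exact totals may differ slightly depending on the word definition used):
#Word counts per file (sketch using stringi)
nwords_blogs <- sum(stri_count_words(data_blogs))
nwords_news <- sum(stri_count_words(data_news))
nwords_twitter <- sum(stri_count_words(data_twitter))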
I first tried processing the full datasets with the tm and quanteda packages, but processing got stuck while creating either the Corpus or the Document Term Matrix. A one percent sample of each file is therefore used instead.
#Creating one percent (1p) sample dataset
data_blogs_1p <- data_blogs[sample(1:nlines_blogs,0.01*nlines_blogs)]
data_news_1p <- data_news[sample(1:nlines_news,0.01*nlines_news)]
data_twitter_1p <- data_twitter[sample(1:nlines_twitter,0.01*nlines_twitter)]
sample_1p <- c(data_blogs_1p,data_news_1p,data_twitter_1p)
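Note that sample() is unseeded above, so the 1% sample differs between runs; calling set.seed() before the sampling (e.g. set.seed(1111), the same seed used for the word clouds below) would make the report reproducible.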
Using the tm package to create the Corpus and to process it further.
Function to create a clean Corpus:
CleanCorpus <- function(x) {
  y <- Corpus(VectorSource(x))      #build a Corpus from the character vector
  y <- tm_map(y, removePunctuation) #drop punctuation
  y <- tm_map(y, removeNumbers)     #drop digits
  # y <- tm_map(y, removeWords, stopwords("english"))
  y <- tm_map(y, stripWhitespace)   #collapse repeated whitespace
  y <- tm_map(y, PlainTextDocument)
  return(y)
}
Create the clean Corpus:
corpus_sample_1p <- CleanCorpus(sample_1p)
#Document Term Matrix
dtm_sample <- DocumentTermMatrix(corpus_sample_1p)
#Term Document Matrix: transpose of the Document Term Matrix
tdm_sample <- TermDocumentMatrix(corpus_sample_1p)
Remove sparse terms, keeping only those that appear in at least 1% of the sample documents; the resulting matrix is at most 99% empty space.
dtms_sample <- removeSparseTerms(dtm_sample, 0.99)
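The effect of the trimming can be checked by comparing dimensions before and after (a sketch; the term counts depend on the sample drawn):
dim(dtm_sample)  #documents x terms before removing sparse terms
dim(dtms_sample) #documents x terms after removing sparse terms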
freq_sample <- sort(colSums(as.matrix(dtms_sample)), decreasing = T)
wf_sample <- data.frame(word=names(freq_sample), freq=freq_sample)
head(wf_sample)
## word freq
## the the 47617
## and and 24055
## for for 11102
## that that 10347
## you you 9589
## with with 7090
p <- ggplot(subset(wf_sample, freq>4000), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
set.seed(1111)
wordcloud(words = wf_sample$word, freq = wf_sample$freq, min.freq = 100,
max.words=500, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
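Bigram analysis: tokenizing the Corpus into two-word sequences with RWeka's NGramTokenizer.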
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm_sample_2gram <- TermDocumentMatrix(corpus_sample_1p, control = list(tokenize = BigramTokenizer))
tdms_sample_2gram <- removeSparseTerms(tdm_sample_2gram, 0.99)
freq_sample_2gram <- sort(rowSums(as.matrix(tdms_sample_2gram)), decreasing = T)
wf_sample_2gram <- data.frame(word=names(freq_sample_2gram), freq=freq_sample_2gram)
head(wf_sample_2gram)
## word freq
## of the of the 4336
## in the in the 4157
## to the to the 2092
## for the for the 1970
## on the on the 1967
## to be to be 1584
p2 <- ggplot(subset(wf_sample_2gram, freq>1000), aes(word, freq))
p2 <- p2 + geom_bar(stat="identity")
p2 <- p2 + theme(axis.text.x=element_text(angle=45, hjust=1))
p2
set.seed(1111)
wordcloud(words = wf_sample_2gram$word, freq = wf_sample_2gram$freq, min.freq = 100,
max.words=400, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
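Trigram analysis: the same approach, now with three-word sequences.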
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm_sample_3gram <- TermDocumentMatrix(corpus_sample_1p, control = list(tokenize = TrigramTokenizer))
tdms_sample_3gram <- removeSparseTerms(tdm_sample_3gram, 0.999)
freq_sample_3gram <- sort(rowSums(as.matrix(tdms_sample_3gram)), decreasing = T)
wf_sample_3gram <- data.frame(word=names(freq_sample_3gram), freq=freq_sample_3gram)
head(wf_sample_3gram)
## word freq
## one of the one of the 347
## a lot of a lot of 287
## thanks for the thanks for the 218
## to be a to be a 189
## out of the out of the 164
## going to be going to be 157
p3 <- ggplot(subset(wf_sample_3gram, freq>116), aes(word, freq))
p3 <- p3 + geom_bar(stat="identity")
p3 <- p3 + theme(axis.text.x=element_text(angle=45, hjust=1))
p3
set.seed(1111)
wordcloud(words = wf_sample_3gram$word, freq = wf_sample_3gram$freq, min.freq = 10,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
I have also included the code in this Milestone report, and I would ask peers to give feedback on the code as well, including whether it can be made more efficient.