library(tm) #For Text Mining
library(stringi) #For string operations
library(wordcloud) #For generating a pretty word cloud
library(rJava) #Required for RWeka
library(RWeka) #For N-gram tokenization via NGramTokenizer
library(ggplot2) #For Plots
The data can be downloaded from the URL.
#Extracting data from en_US.blogs.txt file
con <- file("en_US.blogs.txt", open = "rb")
data_blogs <- readLines(con, encoding="UTF-8", skipNul = TRUE); close(con)
data_blogs <- iconv(data_blogs,'UTF-8', 'ASCII', "byte")
#Extracting data from en_US.news.txt file
con <- file("en_US.news.txt", open = "rb")
data_news <- readLines(con, encoding="UTF-8", skipNul = TRUE); close(con)
data_news <- iconv(data_news,'UTF-8', 'ASCII', "byte")
#Extracting data from en_US.twitter.txt file
con <- file("en_US.twitter.txt", open = "rb")
data_twitter <- readLines(con, encoding="UTF-8", skipNul = TRUE); close(con)
data_twitter <- iconv(data_twitter,'UTF-8', 'ASCII', "byte")
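The three read blocks above repeat the same pattern; a small helper could remove the duplication. This is just a sketch — the name read_text is my own, not part of the original report:
#Hypothetical helper to avoid repeating the read pattern above
read_text <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con))
  iconv(readLines(con, encoding = "UTF-8", skipNul = TRUE), 'UTF-8', 'ASCII', "byte")
}
#e.g. data_blogs <- read_text("en_US.blogs.txt")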
Issue faced:
While extracting the data, I initially used a plain readLines() call on each file, which did not read en_US.news.txt properly because of embedded NUL values: only about 77,259 of its 1,010,242 lines were fetched. Explicitly opening each file as a binary connection (open = "rb"), reading with skipNul = TRUE, and then closing the connection fetched the data properly.
size_blogs <- file.size("en_US.blogs.txt")
size_news <- file.size("en_US.news.txt")
size_twitter <- file.size("en_US.twitter.txt")
## Size (in bytes) of text file en_US.blogs.txt : 210160014
## Size (in bytes) of text file en_US.news.txt : 205811889
## Size (in bytes) of text file en_US.twitter.txt : 167105338
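For readability, the byte counts can be converted to megabytes (a quick sketch; dividing by 1024^2 gives roughly 200.4, 196.3 and 159.4 MB respectively):
round(c(size_blogs, size_news, size_twitter) / 1024^2, 1) #bytes to MB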
nlines_blogs <- length(data_blogs)
nlines_news <- length(data_news)
nlines_twitter <- length(data_twitter)
## Number of lines in text file en_US.blogs.txt : 899288
## Number of lines in text file en_US.news.txt : 1010242
## Number of lines in text file en_US.twitter.txt : 2360148
## Number of words in text file en_US.blogs.txt : 40220892
## Number of words in text file en_US.news.txt : 35731048
## Number of words in text file en_US.twitter.txt : 30528002
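The code that produced the word counts above is not shown; one way to reproduce them is with stri_count_words() from the already-loaded stringi package (a sketch — exact totals may differ slightly depending on the word definition used):
#Word counts per file (sketch using stringi)
nwords_blogs <- sum(stri_count_words(data_blogs))
nwords_news <- sum(stri_count_words(data_news))
nwords_twitter <- sum(stri_count_words(data_twitter))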
I first tried processing the full datasets with the tm and quanteda packages, but processing got stuck while creating either the Corpus or the Document Term Matrix. A one percent sample of each file is therefore used instead.
#Creating one percent (1p) sample dataset
data_blogs_1p <- data_blogs[sample(1:nlines_blogs,0.01*nlines_blogs)]
data_news_1p <- data_news[sample(1:nlines_news,0.01*nlines_news)]
data_twitter_1p <- data_twitter[sample(1:nlines_twitter,0.01*nlines_twitter)]
sample_1p <- c(data_blogs_1p,data_news_1p,data_twitter_1p)
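Note that sample() is unseeded above, so the 1% sample differs between runs; calling set.seed() before the sampling (e.g. set.seed(1111), the same seed used for the word clouds below) would make the report reproducible.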
Using the tm package to create the Corpus and to process it further.
Function to create a clean Corpus:
CleanCorpus <- function(x) {
  y <- Corpus(VectorSource(x))      #build a Corpus from the character vector
  y <- tm_map(y, removePunctuation) #drop punctuation
  y <- tm_map(y, removeNumbers)     #drop digits
  # y <- tm_map(y, removeWords, stopwords("english"))
  y <- tm_map(y, stripWhitespace)   #collapse repeated whitespace
  y <- tm_map(y, PlainTextDocument)
  return(y)
}
Create the clean Corpus:
corpus_sample_1p <- CleanCorpus(sample_1p)
#Document Term Matrix
dtm_sample <- DocumentTermMatrix(corpus_sample_1p)
#Term Document Matrix: transpose of the Document Term Matrix
tdm_sample <- TermDocumentMatrix(corpus_sample_1p)
Remove sparse terms, keeping only those that appear in at least 1% of the sample documents; the resulting matrix is at most 99% empty space.
dtms_sample <- removeSparseTerms(dtm_sample, 0.99)
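The effect of the trimming can be checked by comparing dimensions before and after (a sketch; the term counts depend on the sample drawn):
dim(dtm_sample)  #documents x terms before removing sparse terms
dim(dtms_sample) #documents x terms after removing sparse terms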
freq_sample <- sort(colSums(as.matrix(dtms_sample)), decreasing = T)
wf_sample <- data.frame(word=names(freq_sample), freq=freq_sample)
head(wf_sample)
## word freq
## the the 47617
## and and 24055
## for for 11102
## that that 10347
## you you 9589
## with with 7090
p <- ggplot(subset(wf_sample, freq>4000), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
set.seed(1111)
wordcloud(words = wf_sample$word, freq = wf_sample$freq, min.freq = 100,
max.words=500, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
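Bigram analysis: tokenizing the Corpus into two-word sequences with RWeka's NGramTokenizer.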
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm_sample_2gram <- TermDocumentMatrix(corpus_sample_1p, control = list(tokenize = BigramTokenizer))
tdms_sample_2gram <- removeSparseTerms(tdm_sample_2gram, 0.99)
freq_sample_2gram <- sort(rowSums(as.matrix(tdms_sample_2gram)), decreasing = T)
wf_sample_2gram <- data.frame(word=names(freq_sample_2gram), freq=freq_sample_2gram)
head(wf_sample_2gram)
## word freq
## of the of the 4336
## in the in the 4157
## to the to the 2092
## for the for the 1970
## on the on the 1967
## to be to be 1584
p2 <- ggplot(subset(wf_sample_2gram, freq>1000), aes(word, freq))
p2 <- p2 + geom_bar(stat="identity")
p2 <- p2 + theme(axis.text.x=element_text(angle=45, hjust=1))
p2
set.seed(1111)
wordcloud(words = wf_sample_2gram$word, freq = wf_sample_2gram$freq, min.freq = 100,
max.words=400, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
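Trigram analysis: the same approach, now with three-word sequences.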
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm_sample_3gram <- TermDocumentMatrix(corpus_sample_1p, control = list(tokenize = TrigramTokenizer))
tdms_sample_3gram <- removeSparseTerms(tdm_sample_3gram, 0.999)
freq_sample_3gram <- sort(rowSums(as.matrix(tdms_sample_3gram)), decreasing = T)
wf_sample_3gram <- data.frame(word=names(freq_sample_3gram), freq=freq_sample_3gram)
head(wf_sample_3gram)
## word freq
## one of the one of the 347
## a lot of a lot of 287
## thanks for the thanks for the 218
## to be a to be a 189
## out of the out of the 164
## going to be going to be 157
p3 <- ggplot(subset(wf_sample_3gram, freq>116), aes(word, freq))
p3 <- p3 + geom_bar(stat="identity")
p3 <- p3 + theme(axis.text.x=element_text(angle=45, hjust=1))
p3
set.seed(1111)
wordcloud(words = wf_sample_3gram$word, freq = wf_sample_3gram$freq, min.freq = 10,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
I have also included the code in this Milestone report, and I would ask peers to give feedback on the code as well, including whether it can be made more efficient.