Text Mining Infrastructure in R.

Libraries needed for Text Mining

Making a Connection for reading the Document.

# Open one read-only connection per raw source file.
# `news`, `blogs` and `twitter` are not defined in the visible part of this
# file; presumably they hold the paths of the raw files -- TODO confirm.
con_news <- file(description = news, open = "r")
con_blogs <- file(description = blogs, open = "r")
con_twitter <- file(description = twitter, open = "r")

Reading the data into R.

# Read every remaining line from a connection.
#
# Args:
#   con: a connection object (or anything readLines() accepts as `con`).
#
# Returns:
#   Character vector with one element per line read.
ReadText <- function(con) {
  # n = -1 asks readLines() for all lines up to the end of the input.
  readLines(con = con, n = -1L)
}

# Read every line of each source file into memory, then release the
# connection. The connections are fully consumed by ReadText() and are not
# used again in the code visible here, so leaving them open only leaks
# file handles for the rest of the session.
newsData <- ReadText(con_news)
close(con_news)

blogsData <- ReadText(con_blogs)
close(con_blogs)

twitterData <- ReadText(con_twitter)
close(con_twitter)

Cleaning the Data

Converting to lower case, removing non-letter characters and splitting the text into sentences.

# Clean raw text lines and split them into lower-case sentences.
#
# Args:
#   data: character vector of raw text lines.
#
# Returns:
#   Character vector with one element per sentence. Sentences contain only
#   lower-case letters, apostrophes and single spaces; empty sentences are
#   dropped.
Clean <- function(data) {
  # Keep letters, apostrophes and the sentence terminators (; . ! ?) and turn
  # everything else into a space. Earlier versions parked the terminators and
  # apostrophes behind the placeholders "rep1"/"rep2", but the non-letter
  # filter stripped the digits from those placeholders, so they were never
  # restored and "rep" leaked into the corpus as a token.
  data <- gsub(pattern = "[^a-zA-Z';.!?]", replacement = " ", x = data)

  data <- tolower(data) # Converting to lower case

  # Collapse the runs of whitespace left behind by the removed characters.
  data <- gsub(pattern = "\\s+", replacement = " ", x = data)

  # Split on runs of terminators so that "..." or "?!" count as one boundary.
  sentence <- unlist(strsplit(x = data, split = "[;.!?]+"))

  # Remove the padding around each sentence and discard empty pieces.
  sentence <- trimws(sentence)
  sentence[nzchar(sentence)]
}


# Run the cleaning step over each raw source; each result is the sentence
# vector returned by Clean().
news_corpora <- Clean(data = newsData)
blogs_corpora <- Clean(data = blogsData)
twitter_corpora <- Clean(data = twitterData)

Saving the cleaned corpora of the different files to disk, one file per source.

# Write each cleaned corpus as plain text, one sentence per line.
# writeLines() is used instead of saveRDS(): saveRDS() stores a serialized R
# object, but these ".txt" files are loaded back as text documents through
# DirSource(), so they have to contain plain text.
writeLines(
  text = blogs_corpora,
  con = "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean\\blogs.txt"
)
writeLines(
  text = twitter_corpora,
  con = "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean\\twitter.txt"
)
writeLines(
  text = news_corpora,
  con = "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean\\news.txt"
)

Creating a new Corpus from the cleaned documents.

# Build the corpus with VCorpus() and DirSource() from the tm package:
# every file in the Clean directory becomes one document.
docs <- VCorpus(
  x = DirSource(
    directory = "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean"
  )
)

Function for Counting the number of lines in each document.

Printing the Details of Each file.

# Report the line count of each file. `newsLines`, `blogLines` and
# `twitterLines` are computed outside the part of the file visible here.
paste("Number of Lines in News file are", newsLines, sep = " ")
## [1] "Number of Lines in News file are 77259"
paste("Number of Lines in blog file are", blogLines, sep = " ")
## [1] "Number of Lines in blog file are 899288"
paste("Number of Lines in twitter file are", twitterLines, sep = " ")
## [1] "Number of Lines in twitter file are 77259"
# NOTE(review): the recorded twitter count is identical to the news count;
# confirm that `twitterLines` is computed from the twitter file.

Making a Document-Term Matrix to describe the corpus in more detail.

Using the DocumentTermMatrix() function of tm package.

# DocumentTermMatrix() takes the corpus as its input: one row per document,
# one column per term.
dtm <- DocumentTermMatrix(x = docs)
print(dtm)
## <<DocumentTermMatrix (documents: 3, terms: 34309)>>
## Non-/sparse entries: 44567/58360
## Sparsity           : 57%
## Maximal term length: 25
## Weighting          : term frequency (tf)

Finding the most common words.

Can be found by summing the columns in a dtm.

Using colSums() from base R on the dense matrix form of the dtm.

# Total count of every term across all documents (column sums of the dense
# matrix), ordered from most to least frequent.
frequency <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)

# Show the 50 most frequent terms.
head(frequency, n = 50L)
##   the   and  that   rep   for   was  with   you  this   but  have   are 
## 19949 11355  4776  3749  3645  3003  2899  2706  2390  2184  2145  1909 
##   not  they  from   all   his   one   had  will   she   her   out about 
##  1770  1559  1505  1407  1355  1228  1163  1114  1108  1085  1083  1075 
##  what   has  when  like  just   can   who their there  your would  more 
##  1036  1025   997   966   952   945   945   931   931   915   899   885 
##  some  time   our  been  were itrep which   get   how  into  irep  them 
##   861   844   840   827   803   791   788   723   663   647   619   617 
##  then   new 
##   608   582

Making a Dataframe for using ggplot.

# One row per term: the term itself and its total count.
freq_df <- data.frame(
  words = names(frequency),
  count = frequency
)

Using ggplot for showing Most frequent 20 words.

# Bar chart of the 20 most frequent words.
# `fill` is set outside aes(): inside aes() the string "blue" is treated as a
# data value and mapped through the default colour scale, so the bars were
# not actually blue. reorder() keeps the bars in descending-count order
# instead of the default alphabetical order.
g <- ggplot(
  data = freq_df[1:20, ],
  mapping = aes(x = reorder(words, -count), y = count)
)
g <- g +
  geom_bar(stat = "identity", fill = "blue") +
  labs(x = "words") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))
g

Making a WordCloud For the same.

# Word cloud of at most 200 terms, coloured with the 6-colour "Dark2" palette.
wordcloud(
  words = names(frequency),
  freq = frequency,
  max.words = 200,
  colors = brewer.pal(n = 6, name = "Dark2")
)

Predicting Model

1.N-gram model

2.Probability tree.

BI-gram

# Tokenizer that emits only two-word n-grams (min = max = 2).
biTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}

# Term-document matrix of the bigrams found in the corpus.
dtmBIGRAM <- TermDocumentMatrix(
  x = docs,
  control = list(tokenize = biTokenizer)
)
print(dtmBIGRAM)
## <<TermDocumentMatrix (terms: 196058, documents: 3)>>
## Non-/sparse entries: 217563/370611
## Sparsity           : 63%
## Maximal term length: 38
## Weighting          : term frequency (tf)

Plotting bi-grams

#freq_bi<-findFreqTerms(dtmBIGRAM,lowfreq = 00)
#freq_bi_1<-rowSums(as.matrix(dtmBIGRAM[freq_bi,]))
#freq_bigram_df<-data.frame(bigram=names(freq_bi_1),freq=freq_bi_1)

As the code above gives an error, we will use the slam package to sum the rows of the term-document matrix (one row per bigram).

# Total count of every bigram: sum each row of the sparse term-document
# matrix, then order from most to least frequent.
freq_bi <- sort(
  rowapply_simple_triplet_matrix(dtmBIGRAM, FUN = sum),
  decreasing = TRUE
)

# One row per bigram with its total count; show the first 50 rows.
freq_df_bi <- data.frame(
  bigram = names(freq_bi),
  freq = freq_bi
)
freq_df_bi[1:50, ]
##              bigram freq
## of the       of the 2029
## in the       in the 1642
## rep rep     rep rep 1041
## to the       to the  972
## on the       on the  840
## to be         to be  686
## and the     and the  640
## for the     for the  607
## i was         i was  562
## in a           in a  508
## and i         and i  489
## is a           is a  484
## at the       at the  460
## it is         it is  452
## it was       it was  437
## with the   with the  429
## itrep s     itrep s  409
## from the   from the  403
## i have       i have  402
## i am           i am  399
## that i       that i  367
## of a           of a  366
## it s           it s  358
## as a           as a  344
## with a       with a  331
## one of       one of  325
## for a         for a  313
## this is     this is  313
## irep m       irep m  302
## if you       if you  298
## that the   that the  297
## don t         don t  287
## to get       to get  281
## is the       is the  273
## i had         i had  272
## the first the first  259
## but i         but i  256
## want to     want to  255
## and a         and a  250
## out of       out of  248
## by the       by the  246
## as the       as the  243
## i m             i m  242
## donrep t   donrep t  240
## was a         was a  240
## going to   going to  238
## have been have been  232
## have a       have a  229
## all the     all the  226
## will be     will be  226

Plotting the bi-gram words.

# Bar chart of the 50 most frequent bigrams.
# `fill` is set outside aes(): inside aes() the string "blue" is treated as a
# data value and mapped through the default colour scale, so the bars were
# not actually blue. reorder() keeps the bars in descending-frequency order
# instead of the default alphabetical order.
g <- ggplot(
  data = freq_df_bi[1:50, ],
  mapping = aes(x = reorder(bigram, -freq), y = freq)
)
g <- g +
  geom_bar(stat = "identity", fill = "blue") +
  labs(x = "bigram") +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none")
g

##Making a wordcloud for BIgram.

# Word cloud of at most 200 bigrams, coloured with the 6-colour "Dark2" palette.
wordcloud(
  words = names(freq_bi),
  freq = freq_bi,
  max.words = 200,
  colors = brewer.pal(n = 6, name = "Dark2")
)

#Tri-Gram

# Tokenizer that emits only three-word n-grams (min = max = 3).
triGramToken <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}

# Document-term matrix of the trigrams found in the corpus.
# The control option is spelled `tokenize`, as documented by tm and as used
# for the bigram matrix; the former spelling `tokenizer` is not a documented
# option name and could at best work through partial matching of list names.
dtmTRI <- DocumentTermMatrix(
  x = docs,
  control = list(tokenize = triGramToken)
)
print(dtmTRI)
## <<DocumentTermMatrix (documents: 3, terms: 313075)>>
## Non-/sparse entries: 333074/606151
## Sparsity           : 65%
## Maximal term length: 45
## Weighting          : term frequency (tf)
# Total count of every trigram: sum each column of the sparse document-term
# matrix, then order from most to least frequent.
freq_tri <- sort(
  colapply_simple_triplet_matrix(dtmTRI, FUN = sum),
  decreasing = TRUE
)

# One row per trigram with its total count.
freq_tri_df <- data.frame(
  triGram = names(freq_tri),
  freq = freq_tri
)

Plotting bar graph.

# Bar chart of the 50 most frequent trigrams.
# `fill` is set outside aes(): inside aes() the string "blue" is treated as a
# data value and mapped through the default colour scale, so the bars were
# not actually blue. reorder() keeps the bars in descending-frequency order
# instead of the default alphabetical order.
g <- ggplot(
  data = freq_tri_df[1:50, ],
  mapping = aes(x = reorder(triGram, -freq), y = freq)
)
g <- g +
  geom_bar(stat = "identity", fill = "blue") +
  labs(x = "triGram") +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none")
g

##Making a wordcloud

# Word cloud of at most 200 trigrams, coloured with the 6-colour "Dark2" palette.
wordcloud(
  words = names(freq_tri),
  freq = freq_tri,
  max.words = 200,
  colors = brewer.pal(n = 6, name = "Dark2")
)

#Quad-Gram

# Tokenizer that emits only four-word n-grams (min = max = 4).
QuadGramToken <- function(x) {
  NGramTokenizer(x, Weka_control(min = 4, max = 4))
}

# Document-term matrix of the four-grams found in the corpus.
# The control option is spelled `tokenize`, as documented by tm and as used
# for the bigram matrix; the former spelling `tokenizer` is not a documented
# option name and could at best work through partial matching of list names.
dtmQuad <- DocumentTermMatrix(
  x = docs,
  control = list(tokenize = QuadGramToken)
)
print(dtmQuad)
## <<DocumentTermMatrix (documents: 3, terms: 343191)>>
## Non-/sparse entries: 361478/668095
## Sparsity           : 65%
## Maximal term length: 54
## Weighting          : term frequency (tf)
# Total count of every four-gram: sum each column of the sparse document-term
# matrix, then order from most to least frequent.
quad_freq <- sort(
  colapply_simple_triplet_matrix(dtmQuad, FUN = sum),
  decreasing = TRUE
)

# One row per four-gram with its total count.
quad_freq_df <- data.frame(
  QuadGram = names(quad_freq),
  freq = quad_freq
)

Making a bar graph

# Bar chart of the 50 most frequent four-grams.
# `fill` is set outside aes(): inside aes() the string "blue" is treated as a
# data value and mapped through the default colour scale, so the bars were
# not actually blue. reorder() keeps the bars in descending-frequency order
# instead of the default alphabetical order.
g <- ggplot(
  data = quad_freq_df[1:50, ],
  mapping = aes(x = reorder(QuadGram, -freq), y = freq)
)
g <- g +
  geom_bar(stat = "identity", fill = "blue") +
  labs(x = "QuadGram") +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none")
g

Making a wordcloud

# Word cloud of at most 200 four-grams, coloured with the 6-colour "Dark2" palette.
wordcloud(
  words = names(quad_freq),
  freq = quad_freq,
  max.words = 200,
  colors = brewer.pal(n = 6, name = "Dark2")
)