Introduction

The first step in building a predictive model for text is understanding the distribution of, and the relationships between, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships observed in the data and to prepare for building the first linguistic models.

Tasks to accomplish

Blogs Analysis

Load needed packages

library(NLP)
library(tm)
library(stringr)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(SnowballC)

Verify data set

con<-file("en_US.blogs.txt","r")
lineBlogs<-readLines(con)
close(con)
summary(lineBlogs)
##    Length     Class      Mode 
##    899288 character character

Read in the data

library(tm)
setwd("C:/Users/kumi/Desktop/final/en_US")
filePath<-"C:/Users/kumi/Desktop/final/en_US/en_US.blogs.txt"
text<-readLines(filePath)
#load the data as corpus
docs<-Corpus(VectorSource(text))
docs1<-sample(docs,5000,replace=FALSE)#take a sample size of 5000
docs1
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5000

Preprocessing and cleaning

#Text transformation: replace /, @ and | with a space so adjacent words are not glued together
toSpace<-content_transformer(function(x,pattern)gsub(pattern," ",x))
docs1<-tm_map(docs1,toSpace,"/")
docs1<-tm_map(docs1,toSpace,"@")
docs1<-tm_map(docs1,toSpace,"\\|")
#Cleaning the text
docs1<-tm_map(docs1,content_transformer(tolower))
docs1<-tm_map(docs1,removeNumbers)
docs1<-tm_map(docs1,removeWords,stopwords("english"))
docs1<-tm_map(docs1,removeWords,c("blabla1","blabla2"))#remove your own stopwords
docs1<-tm_map(docs1,removePunctuation)
docs1<-tm_map(docs1,stripWhitespace)
docs1<-tm_map(docs1,stemDocument)

Term Document Matrix

library(NLP)
library(tm)
tdm<-TermDocumentMatrix(docs1)
tdm
## <<TermDocumentMatrix (terms: 15467, documents: 5000)>>
## Non-/sparse entries: 92026/77242974
## Sparsity           : 100%
## Maximal term length: 63
## Weighting          : term frequency (tf)
m<-as.matrix(tdm)
m1<-sort(rowSums(m),decreasing=TRUE)
d<-data.frame(word=names(m1),freq=m1)
head(d,10)
##      word freq
## one   one  742
## time time  620
## will will  620
## like like  613
## just just  546
## get   get  509
## can   can  492
## make make  464
## day   day  423
## year year  395

Generate word cloud

library(wordcloud)
set.seed(1234)
wordcloud(words=d$word,d$freq,min.freq=5,max.words=200,random.order=FALSE,rot.per=0.1,
          colors=brewer.pal(8,"Dark2"))

Frequent Terms and their Associations

findFreqTerms(tdm,lowfreq=390)#find words that occur at least 390 times
##  [1] "like" "year" "time" "can"  "get"  "one"  "day"  "just" "make" "will"
#Association
findAssocs(tdm,terms=c("day","one","year","will","like"),corlimit=0.2)#words with at least 0.2 correlation 
## $day
## crossfit 
##     0.24 
## 
## $one
## numeric(0)
## 
## $year
##      celesti      champer    merriment        shunt      sideway 
##         0.24         0.24         0.24         0.24         0.24 
##         wink          old          eve     another’   gentleman’ 
##         0.24         0.22         0.22         0.20         0.20 
## 
## $will
##           arent        classes…   furniture…ooh             on… 
##            0.21            0.21            0.21            0.21 
##       painting…           soon…            yet… 
##            0.21            0.21            0.21 
## 
## $like
##              adolf       antiislamist             hitler 
##               0.23               0.23               0.23 
##          hitler”             outing          “fascist” 
##               0.23               0.23               0.23 
##        “hitling”         “racist”               feel 
##               0.23               0.23               0.22 
##             facial 
##               0.21

Plot word frequencies

barplot(d[1:10,]$freq,las=2,names.arg=d[1:10,]$word,col="yellow",main="Most Frequent Words",
        ylab="Word Frequencies")

Tokenizing by N-gram

library(tm)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(RWeka)
library(SnowballC)

#load the cleaned data as a VCorpus
docss<-VCorpus(VectorSource(docs1))#docs1 is the cleaned blogs data
docs2<-sample(docss,2000,replace=FALSE)#take a sample of 2000 documents

#Create a Bigram
BigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=2,max=2))
bigram<-TermDocumentMatrix(docs2,control=list(tokenize=BigramTokenizer))
#extract frequencies of the bigrams (use a distinct name so the trigram step does not overwrite it)
freqB<-sort(rowSums(as.matrix(bigram)),decreasing=TRUE)
freq.dfB<-data.frame(word=names(freqB),freq=freqB)
head(freq.dfB,10)
##                    word freq
## feel like     feel like   20
## first time   first time   17
## year old       year old   16
## last year     last year   15
## look like     look like   14
## even though even though   11
## just like     just like   11
## new york       new york   11
## next year     next year   10
## year ago       year ago   10
#Create a Trigram
TrigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=3,max=3))
trigram<-TermDocumentMatrix(docs2,control=list(tokenize=TrigramTokenizer))
#extract frequencies of the trigrams
freqTri<-sort(rowSums(as.matrix(trigram)),decreasing=TRUE)
freq.dfTri<-data.frame(word=names(freqTri),freq=freqTri)
head(freq.dfTri,8)
##                                          word freq
## citi kansa news               citi kansa news    8
## kansa news station         kansa news station    8
## news station televis     news station televis    8
## next regular session     next regular session    5
## dakota snowmobil trail dakota snowmobil trail    4
## free fair trial               free fair trial    4
## north dakota snowmobil north dakota snowmobil    4
## cart ran back                   cart ran back    3

It is important to note that the sample size was further reduced to 2,000 documents because of several memory issues encountered while conducting this analysis.
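One way to ease that memory pressure, as a rough sketch, is to sample the raw lines before building the corpus rather than loading all 899,288 blog lines into tm documents first; the sample size below is illustrative, not the one used in this report (tm is already loaded above).

#Hedged sketch: sample raw lines first, then build a small corpus
set.seed(1234)
linesAll<-readLines("en_US.blogs.txt",skipNul=TRUE)
linesSample<-sample(linesAll,2000)#illustrative sample size
docsSmall<-VCorpus(VectorSource(linesSample))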

Plots of bigrams and trigrams

#Most frequent bigrams (labels taken from the bigram frequency table itself)
barplot(freq.dfB[1:10,]$freq,las=2,names.arg=freq.dfB$word[1:10],col="yellow",cex.names=0.8,
        ylab="Frequency",main="Most Frequent Bigrams")

#Most frequent trigrams
barplot(freq.dfTri[1:8,]$freq,las=2,names.arg=freq.dfTri$word[1:8],col="yellow",cex.names=0.6,
        ylab="Frequency",main="Most Frequent Trigrams")

News Analysis

Verify data set

con1<-file("en_US.news.txt","r")
lineNews<-readLines(con1)
close(con1)
summary(lineNews)
##    Length     Class      Mode 
##     77259 character character

Read in the data

filePath1<-"C:/Users/kumi/Desktop/final/en_US/en_US.news.txt"
text1<-readLines(filePath1)
#load the data as corpus
docsN<-Corpus(VectorSource(text1))
docs1N<-sample(docsN,5000,replace=FALSE)#take a sample size of 5000
docs1N
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5000

Preprocessing and cleaning

#Text transformation: replace /, @ and | with a space so adjacent words are not glued together
toSpace<-content_transformer(function(x,pattern)gsub(pattern," ",x))
docs1N<-tm_map(docs1N,toSpace,"/")
docs1N<-tm_map(docs1N,toSpace,"@")
docs1N<-tm_map(docs1N,toSpace,"\\|")
#Cleaning the text
docs1N<-tm_map(docs1N,content_transformer(tolower))
docs1N<-tm_map(docs1N,removeNumbers)
docs1N<-tm_map(docs1N,removeWords,stopwords("english"))
docs1N<-tm_map(docs1N,removeWords,c("blabla1","blabla2"))#remove your own stopwords
docs1N<-tm_map(docs1N,removePunctuation)
docs1N<-tm_map(docs1N,stripWhitespace)
docs1N<-tm_map(docs1N,stemDocument)

Term Document Matrix

library(NLP)
library(tm)
tdmN<-TermDocumentMatrix(docs1N)
tdmN
## <<TermDocumentMatrix (terms: 15385, documents: 5000)>>
## Non-/sparse entries: 93520/76831480
## Sparsity           : 100%
## Maximal term length: 179
## Weighting          : term frequency (tf)
mN<-as.matrix(tdmN)
m1N<-sort(rowSums(mN),decreasing=TRUE)
dN<-data.frame(word=names(m1N),freq=m1N)
head(dN,10)
##      word freq
## one   one  769
## will will  616
## like like  612
## time time  590
## get   get  555
## can   can  538
## just just  525
## make make  416
## day   day  402
## know know  388

Generate word cloud

library(wordcloud)
set.seed(1234)
wordcloud(words=dN$word,dN$freq,min.freq=5,max.words=200,random.order=FALSE,rot.per=0.1,
          colors=brewer.pal(8,"Dark2"))

Frequent Terms and their Associations

findFreqTerms(tdmN,lowfreq=380)#find words that occur at least 380 times
##  [1] "get"  "can"  "know" "time" "just" "one"  "day"  "like" "make" "will"
#Association
findAssocs(tdmN,terms="day",corlimit=0.2)#words with at least 0.2 correlation with the word day
## $day
##          ahasuerus’ ebiblefellowshipcom              esther 
##                0.25                0.25                0.25 
##            mordecai             pronoun             “these” 
##                0.25                0.25                0.25 
##               ‘for’              slower             walkjog 
##                0.25                0.20                0.20 
##              bounti                 edt             equinox 
##                0.20                0.20                0.20 
##            midpoint             solstic                sun’ 
##                0.20                0.20                0.20 
##              vernal 
##                0.20

Plot word frequencies

barplot(dN[1:10,]$freq,las=2,names.arg=dN[1:10,]$word,col="red",main="Most Frequent Words in News",
        ylab="Word Frequencies")

Tokenizing by N-gram

#load the cleaned data as a VCorpus
docssN<-VCorpus(VectorSource(docs1N))#docs1N is the cleaned news data
docs2N<-sample(docssN,2000,replace=FALSE)#take a sample of 2000 documents

#Create a Bigram
BigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=2,max=2))
bigramN<-TermDocumentMatrix(docs2N,control=list(tokenize=BigramTokenizer))
#extract frequencies of the bigram
freqNB<-sort(rowSums(as.matrix(bigramN)),decreasing=TRUE)
freq.dfNB<-data.frame(word=names(freqNB),freqNB=freqNB)
head(freq.dfNB,10)
##                    word freqNB
## feel like     feel like     15
## right now     right now     15
## year old       year old     14
## last week     last week     13
## look like     look like     13
## new york       new york     13
## can see         can see     12
## year ago       year ago     12
## even though even though     11
## first time   first time     11
#Create a Trigram
TrigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=3,max=3))
trigramN<-TermDocumentMatrix(docs2N,control=list(tokenize=TrigramTokenizer))
#extract frequencies of the trigram
freqN<-sort(rowSums(as.matrix(trigramN)),decreasing=TRUE)
freq.dfN<-data.frame(word=names(freqN),freqN=freqN)
head(freq.dfN,10)
##                                        word freqN
## amazon servic llc         amazon servic llc     4
## sc sc dec                         sc sc dec     4
## cemeteri summari card cemeteri summari card     3
## don’t get wrong                 don’t get wrong     3
## farm caus climat           farm caus climat     3
## law without law             law without law     3
## long hammer ipa             long hammer ipa     3
## look forward see           look forward see     3
## nec multisync lcd         nec multisync lcd     3
## new york citi                 new york citi     3

The sample size was further reduced to 2,000 documents because of several memory issues encountered while conducting this analysis.
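Another option, sketched below on the assumption that the slam package (which tm itself depends on) is available, is to compute term totals directly on the sparse term-document matrix instead of converting it to a dense matrix with as.matrix(), which is where much of the memory goes.

library(slam)
#Hedged sketch: row_sums works on the sparse TDM, avoiding a dense 15385 x 5000 matrix
m1Nsparse<-sort(row_sums(tdmN),decreasing=TRUE)
dNsparse<-data.frame(word=names(m1Nsparse),freq=m1Nsparse)
head(dNsparse,10)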

Plot of bigrams

#Most frequent bigrams in News dataset (labels taken from the frequency table)
barplot(freq.dfNB[1:10,]$freqNB,las=2,names.arg=freq.dfNB$word[1:10],col="red",cex.names=0.8,
        ylab="Frequency",main="Most Frequent Bigrams in News Dataset")

Twitter Analysis

Verify data set

con2<-file("en_US.twitter.txt","r")
lineTwitter<-readLines(con2)
close(con2)
summary(lineTwitter)
##    Length     Class      Mode 
##   2360148 character character

Read in the data

filePath2<-"C:/Users/kumi/Desktop/final/en_US/en_US.twitter.txt"
text2<-readLines(filePath2)
#load the data as corpus
docsT<-Corpus(VectorSource(text2))
docs1T<-sample(docsT,5000,replace=FALSE)#take a sample size of 5000
docs1T
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5000

Preprocessing and cleaning

#Text transformation: replace /, @ and | with a space so adjacent words are not glued together
toSpace<-content_transformer(function(x,pattern)gsub(pattern," ",x))
docs1T<-tm_map(docs1T,toSpace,"/")
docs1T<-tm_map(docs1T,toSpace,"@")
docs1T<-tm_map(docs1T,toSpace,"\\|")
#Cleaning the text
docs1T<-tm_map(docs1T,content_transformer(tolower))
docs1T<-tm_map(docs1T,removeNumbers)
docs1T<-tm_map(docs1T,removeWords,stopwords("english"))
docs1T<-tm_map(docs1T,removeWords,c("blabla1","blabla2"))#remove your own stopwords
docs1T<-tm_map(docs1T,removePunctuation)
docs1T<-tm_map(docs1T,stripWhitespace)
docs1T<-tm_map(docs1T,stemDocument)

Term Document Matrix

library(NLP)
library(tm)
tdmT<-TermDocumentMatrix(docs1T)
tdmT
## <<TermDocumentMatrix (terms: 7584, documents: 5000)>>
## Non-/sparse entries: 32665/37887335
## Sparsity           : 100%
## Maximal term length: 79
## Weighting          : term frequency (tf)
mT<-as.matrix(tdmT)
m1T<-sort(rowSums(mT),decreasing=TRUE)
dT<-data.frame(word=names(m1T),freq=m1T)
head(dT,10)
##        word freq
## get     get  312
## just   just  298
## like   like  280
## thank thank  274
## love   love  253
## day     day  224
## good   good  209
## will   will  195
## now     now  192
## can     can  189

Generate word cloud

library(wordcloud)
set.seed(1234)
wordcloud(words=dT$word,dT$freq,min.freq=5,max.words=200,random.order=FALSE,rot.per=0.1,
          colors=brewer.pal(8,"Dark2"))

Frequent Terms and their Associations

findFreqTerms(tdmT,lowfreq=180)#find words that occur at least 180 times
##  [1] "get"   "thank" "just"  "day"   "good"  "now"   "like"  "love" 
##  [9] "one"   "can"   "will"
#Association
findAssocs(tdmT,terms=c("day","good","will","now","can"),corlimit=0.2)#words with at least 0.2 correlation
## $day
## numeric(0)
## 
## $good
## luck 
## 0.21 
## 
## $will
## numeric(0)
## 
## $now
## right 
##  0.24 
## 
## $can
## numeric(0)

Plot word frequencies

barplot(dT[1:10,]$freq,las=2,names.arg=dT[1:10,]$word,col="lightgreen",main="Most Frequent Words",
        ylab="Word Frequencies")

Tokenizing by N-gram

#load the cleaned data as a VCorpus
docssT<-VCorpus(VectorSource(docs1T))#docs1T is the cleaned twitter data
docs2T<-sample(docssT,2000,replace=FALSE)#take a sample of 2000 documents

#Create a Bigram
BigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=2,max=2))
bigramT<-TermDocumentMatrix(docs2T,control=list(tokenize=BigramTokenizer))
#extract frequencies of the bigram
freqTB<-sort(rowSums(as.matrix(bigramT)),decreasing=TRUE)
freq.dfTB<-data.frame(word=names(freqTB),freqTB=freqTB)
head(freq.dfTB,10)
##                      word freqTB
## last night     last night     14
## right now       right now     12
## thank follow thank follow     11
## look forward look forward     10
## look like       look like      9
## mother day     mother day      7
## good luck       good luck      6
## thank much     thank much      6
## thank share   thank share      6
## anyon els       anyon els      5
#Create a Trigram
TrigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=3,max=3))
trigramT<-TermDocumentMatrix(docs2T,control=list(tokenize=TrigramTokenizer))
#extract frequencies of the trigram
freqTT<-sort(rowSums(as.matrix(trigramT)),decreasing=TRUE)
freq.dfTT<-data.frame(word=names(freqTT),freqTT=freqTT)
head(freq.dfTT,10)
##                                            word freqTT
## happi mother day               happi mother day      4
## happi new year                   happi new year      4
## love love love                   love love love      4
## round round round             round round round      3
## awkward moment realiz     awkward moment realiz      2
## bang bang bang                   bang bang bang      2
## card win card                     card win card      2
## colonoscopi faint heart colonoscopi faint heart      2
## communiti web produc       communiti web produc      2
## follow tweet swag             follow tweet swag      2

The sample size was reduced to 2,000 documents because of several memory issues encountered while conducting this analysis.
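If the RWeka tokenizers are what run out of memory, a commonly used workaround (assuming the default Java heap is the bottleneck on your machine) is to enlarge the JVM heap before RWeka and rJava are loaded in a fresh R session:

#Hedged sketch: raise the Java heap used by RWeka; must run before library(RWeka)
options(java.parameters="-Xmx2g")
library(RWeka)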

Plot of bigrams

#Most frequent bigrams in Twitter dataset (labels taken from the frequency table)
barplot(freq.dfTB[1:10,]$freqTB,las=2,names.arg=freq.dfTB$word[1:10],col="lightgreen",cex.names=0.8,
        ylab="Frequency",main="Most Frequent Bigrams in Twitter Dataset")

How many words do you need in a frequency-sorted dictionary to cover 50% of all word instances in the language? 90%?

docs2<-sample(docss,2000,replace=FALSE)#take a sample of 2000 blog documents
Blogs<-docs2
docs2N<-sample(docssN,2000,replace=FALSE)#take a sample of 2000 news documents
News<-docs2N
docs2T<-sample(docssT,2000,replace=FALSE)#take a sample of 2000 twitter documents
Twitter<-docs2T
combined<-c(Blogs,News,Twitter)#combine the three samples (named to avoid masking base::sample)

#dataframes of 1-grams, 2-grams and 3-grams
words<-WordTokenizer(combined)
grams<-NGramTokenizer(combined)
#find the index of the first bigram (i) and the first unigram (j) in the n-gram list
for(i in 1:length(grams))
{if (length(WordTokenizer(grams[i]))==2)break}
for(j in 1:length(grams))
{if (length(WordTokenizer(grams[j]))==1)break}
onegrams<-data.frame(table(words))#unigram frequency table
onegrams<-onegrams[order(onegrams$Freq,decreasing=TRUE),]

bigrams<-data.frame(table(grams[i:(j-1)]))#bigrams occupy positions i to j-1
bigrams<-bigrams[order(bigrams$Freq,decreasing=TRUE),]

trigrams<-data.frame(table(grams[1:(i-1)]))#trigrams occupy positions 1 to i-1
trigrams<-trigrams[order(trigrams$Freq,decreasing=TRUE),]

remove(i,j,grams)

sumCover<-0
for(i in 1:length(onegrams$Freq)){#count how many top words cover 50% of all word instances
  sumCover<-sumCover+onegrams$Freq[i]
  if(sumCover>=0.5*sum(onegrams$Freq)){break}
}
print(i)
## [1] 11
sumCover<-0
for(i in 1:length(onegrams$Freq)){#count how many top words cover 90% of all word instances
  sumCover<-sumCover+onegrams$Freq[i]
  if(sumCover>=0.9*sum(onegrams$Freq)){break}
}
print(i)
## [1] 1138

From the analysis above, we need about 11 words to cover 50% of all word instances in the sampled corpora and about 1,138 words to cover 90%.
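The same coverage numbers can be computed without explicit loops; the sketch below uses cumsum() on the already sorted onegrams frequencies.

#Hedged sketch: vectorised coverage calculation
coverage<-cumsum(onegrams$Freq)/sum(onegrams$Freq)
which(coverage>=0.5)[1]#words needed for 50% coverage
which(coverage>=0.9)[1]#words needed for 90% coverage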

Foreign Language Evaluation

To evaluate words from foreign languages, we can employ the tm_map function to remove words based on a language dictionary. The total number of words remaining after removing the foreign and profane words will form the basis for any meaningful analysis.
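As a sketch of that idea, the snippet below keeps only tokens that appear in an English word list; english_words.txt is a hypothetical dictionary file, not one provided with this project.

#Hedged sketch: keep only tokens found in an English word list
#"english_words.txt" (one lowercase word per line) is a hypothetical dictionary file
englishWords<-readLines("english_words.txt")
keepEnglish<-content_transformer(function(x){
  tokens<-unlist(strsplit(x,"\\s+"))
  paste(tokens[tolower(tokens) %in% englishWords],collapse=" ")
})
docs1<-tm_map(docs1,keepEnglish)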

Increasing Coverage

One way to increase coverage is to reduce the number of low-frequency unique words, either by stemming or by substituting synonyms drawn from a thesaurus. In addition, coverage can be increased through context clustering, that is, by introducing context to the corpora; certain word groups can often be clustered together.
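For example, stemming collapses several low-frequency surface forms onto a single dictionary entry, which is exactly what raises the coverage of each entry; a minimal sketch with SnowballC (already loaded above):

library(SnowballC)
#Hedged sketch: the Porter stemmer should map all four forms to the single stem "cover"
wordStem(c("cover","covers","covered","covering"),language="english")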

Findings

With extremely large datasets, it is far more practical to build a model from smaller samples than from the full data. This text-mining exercise taught me an important lesson about managing memory and processing power in R: machines with large amounts of memory are needed for some of these tasks, since speed is critical.