The first step in building a predictive model for text is understanding the distribution of, and the relationships between, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships you observe in the data and to prepare to build your first linguistic models.
Tasks to accomplish
Exploratory analysis - perform a thorough exploratory analysis of the data, understanding the distribution of words and relationship between the words in the corpora.
Understand frequencies of words and word pairs - build figures and tables to understand variation in the frequencies of words and word pairs in the data.
library(NLP)
library(tm)
library(stringr)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(SnowballC)
con<-file("en_US.blogs.txt","r")
lineBlogs<-readLines(con)
close(con)
summary(lineBlogs)
## Length Class Mode
## 899288 character character
library(tm)
setwd("C:/Users/kumi/Desktop/final/en_US")
filePath<-"C:/Users/kumi/Desktop/final/en_US/en_US.blogs.txt"
text<-readLines(filePath)
#load the data as corpus
docs<-Corpus(VectorSource(text))
docs1<-sample(docs,5000,replace=FALSE)#take a sample size of 5000
docs1
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5000
#Text transformation: replace /, @ and | with a space
toSpace<-content_transformer(function(x,pattern)gsub(pattern," ",x))
docs1<-tm_map(docs1,toSpace,"/")
docs1<-tm_map(docs1,toSpace,"@")
docs1<-tm_map(docs1,toSpace,"\\|")
#Cleaning the text
docs1<-tm_map(docs1,content_transformer(tolower))
docs1<-tm_map(docs1,removeNumbers)
docs1<-tm_map(docs1,removeWords,stopwords("english"))
docs1<-tm_map(docs1,removeWords,c("blabla1","blabla2"))#remove your own stopwords
docs1<-tm_map(docs1,removePunctuation)
docs1<-tm_map(docs1,stripWhitespace)
docs1<-tm_map(docs1,stemDocument)
library(NLP)
library(tm)
tdm<-TermDocumentMatrix(docs1)
tdm
## <<TermDocumentMatrix (terms: 15467, documents: 5000)>>
## Non-/sparse entries: 92026/77242974
## Sparsity : 100%
## Maximal term length: 63
## Weighting : term frequency (tf)
m<-as.matrix(tdm)
m1<-sort(rowSums(m),decreasing=TRUE)
d<-data.frame(word=names(m1),freq=m1)
head(d,10)
## word freq
## one one 742
## time time 620
## will will 620
## like like 613
## just just 546
## get get 509
## can can 492
## make make 464
## day day 423
## year year 395
library(wordcloud)
set.seed(1234)
wordcloud(words=d$word,d$freq,min.freq=5,max.words=200,random.order=FALSE,rot.per=0.1,
colors=brewer.pal(8,"Dark2"))
findFreqTerms(tdm,lowfreq=390)#find words that occur at least 390 times
## [1] "like" "year" "time" "can" "get" "one" "day" "just" "make" "will"
#Association
findAssocs(tdm,terms=c("day","one","year","will","like"),corlimit=0.2)#words with at least 0.2 correlation
## $day
## crossfit
## 0.24
##
## $one
## numeric(0)
##
## $year
## celesti champer merriment shunt sideway
## 0.24 0.24 0.24 0.24 0.24
## wink old eve another’ gentleman’
## 0.24 0.22 0.22 0.20 0.20
##
## $will
## arent classes… furniture…ooh on…
## 0.21 0.21 0.21 0.21
## painting… soon… yet…
## 0.21 0.21 0.21
##
## $like
## adolf antiislamist hitler
## 0.23 0.23 0.23
## hitler” outing “fascist”
## 0.23 0.23 0.23
## “hitling” “racist” feel
## 0.23 0.23 0.22
## facial
## 0.21
barplot(d[1:10,]$freq,las=2,names.arg=d[1:10,]$word,col="yellow",main="Most Frequent Words",
ylab="Word Frequencies")
library(tm)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(RWeka)
library(SnowballC)
#convert the cleaned data to a VCorpus (the RWeka tokenizers below need a VCorpus rather than a SimpleCorpus)
docss<-VCorpus(VectorSource(docs1))#docs1 is the cleaned Blogs data
docs2<-sample(docss,2000,replace=FALSE)#take a sample size of 2000
#Create a Bigram
BigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=2,max=2))
bigram<-TermDocumentMatrix(docs2,control=list(tokenize=BigramTokenizer))
#extract frequencies of the bigram
freq<-sort(rowSums(as.matrix(bigram)),decreasing=TRUE)
freq.df<-data.frame(word=names(freq),freq=freq)
head(freq.df,10)
## word freq
## feel like feel like 20
## first time first time 17
## year old year old 16
## last year last year 15
## look like look like 14
## even though even though 11
## just like just like 11
## new york new york 11
## next year next year 10
## year ago year ago 10
#Create a Trigram
TrigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=3,max=3))
trigram<-TermDocumentMatrix(docs2,control=list(tokenize=TrigramTokenizer))
#extract frequencies of the trigram (kept in a separate data frame so the bigram table is not overwritten)
freqTri<-sort(rowSums(as.matrix(trigram)),decreasing=TRUE)
freq.dfTri<-data.frame(word=names(freqTri),freq=freqTri)
head(freq.dfTri,8)
## word freq
## citi kansa news citi kansa news 8
## kansa news station kansa news station 8
## news station televis news station televis 8
## next regular session next regular session 5
## dakota snowmobil trail dakota snowmobil trail 4
## free fair trial free fair trial 4
## north dakota snowmobil north dakota snowmobil 4
## cart ran back cart ran back 3
It is important to note that the sample size was further reduced to 2,000 documents for the n-gram analysis because of memory issues encountered while building the term-document matrices; one way to limit memory use from the start is sketched below.
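As an illustration only (not part of the analysis above), lines can also be sampled before the corpus is built so that the term-document matrices stay small; the 5% sampling rate below is an arbitrary placeholder.
#sketch: keep roughly 5% of the blog lines before building the corpus
set.seed(1234)
keep<-rbinom(length(text),size=1,prob=0.05)==1#text holds the lines of en_US.blogs.txt read above
textSmall<-text[keep]
docsSmall<-VCorpus(VectorSource(textSmall))#much smaller corpus for the n-gram work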
#Most frequent bigrams
names=c("feel like","first time","year old","last year","look like","even though","just like","new york",
"next year","year ago")
barplot(freq.df[1:10,]$freq,las=2,names.arg=names,col="yellow",cex.names=0.8,
ylab="Frequency",main="Most Frequent Bigrams")
#Most frequent trigrams
names1=c("citi kansa news","kansa news station","news station televis","next regular session",
"dakota snowmobil trail","free fair trial","north dakota snowmobil","cart ran back")
barplot(freq.dfTri[1:8,]$freq,las=2,names.arg=names1,col="yellow",cex.names=0.6,
ylab="Frequency",main="Most Frequent Trigrams")
con1<-file("en_US.news.txt","r")
lineNews<-readLines(con1)
close(con1)
summary(lineNews)
## Length Class Mode
## 77259 character character
filePath1<-"C:/Users/kumi/Desktop/final/en_US/en_US.news.txt"
text1<-readLines(filePath1)
#load the data as corpus
docsN<-Corpus(VectorSource(text1))
docs1N<-sample(docsN,5000,replace=FALSE)#take a sample size of 5000
docs1N
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5000
#Text transformation: replace /, @ and | with a space
toSpace<-content_transformer(function(x,pattern)gsub(pattern," ",x))
docs1N<-tm_map(docs1N,toSpace,"/")
docs1N<-tm_map(docs1N,toSpace,"@")
docs1N<-tm_map(docs1N,toSpace,"\\|")
#Cleaning the text
docs1N<-tm_map(docs1N,content_transformer(tolower))
docs1N<-tm_map(docs1N,removeNumbers)
docs1N<-tm_map(docs1N,removeWords,stopwords("english"))
docs1N<-tm_map(docs1N,removeWords,c("blabla1","blabla2"))#remove your own stopwords
docs1N<-tm_map(docs1N,removePunctuation)
docs1N<-tm_map(docs1N,stripWhitespace)
docs1N<-tm_map(docs1N,stemDocument)
library(NLP)
library(tm)
tdmN<-TermDocumentMatrix(docs1N)
tdmN
## <<TermDocumentMatrix (terms: 15385, documents: 5000)>>
## Non-/sparse entries: 93520/76831480
## Sparsity : 100%
## Maximal term length: 179
## Weighting : term frequency (tf)
mN<-as.matrix(tdmN)
m1N<-sort(rowSums(mN),decreasing=TRUE)
dN<-data.frame(word=names(m1N),freq=m1N)
head(dN,10)
## word freq
## one one 769
## will will 616
## like like 612
## time time 590
## get get 555
## can can 538
## just just 525
## make make 416
## day day 402
## know know 388
library(wordcloud)
set.seed(1234)
wordcloud(words=dN$word,dN$freq,min.freq=5,max.words=200,random.order=FALSE,rot.per=0.1,
colors=brewer.pal(8,"Dark2"))
findFreqTerms(tdmN,lowfreq=380)#find words that occur at least 380 times
## [1] "get" "can" "know" "time" "just" "one" "day" "like" "make" "will"
#Association
findAssocs(tdmN,terms="day",corlimit=0.2)#words with at least 0.2 correlation with the word day
## $day
## ahasuerus’ ebiblefellowshipcom esther
## 0.25 0.25 0.25
## mordecai pronoun “these”
## 0.25 0.25 0.25
## ‘for’ slower walkjog
## 0.25 0.20 0.20
## bounti edt equinox
## 0.20 0.20 0.20
## midpoint solstic sun’
## 0.20 0.20 0.20
## vernal
## 0.20
barplot(dN[1:10,]$freq,las=2,names.arg=dN[1:10,]$word,col="red",main="Most Frequent Words in News",
ylab="Word Frequencies")
#load the cleaned data as a VCorpus for n-gram tokenization
docssN<-VCorpus(VectorSource(docs1N))#docs1N is the cleaned News data
docs2N<-sample(docssN,2000,replace=FALSE)#take a sample size of 2000
#Create a Bigram
BigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=2,max=2))
bigramN<-TermDocumentMatrix(docs2N,control=list(tokenize=BigramTokenizer))
#extract frequencies of the bigram
freqNB<-sort(rowSums(as.matrix(bigramN)),decreasing=TRUE)
freq.dfNB<-data.frame(word=names(freqNB),freqNB=freqNB)
head(freq.dfNB,10)
## word freqNB
## feel like feel like 15
## right now right now 15
## year old year old 14
## last week last week 13
## look like look like 13
## new york new york 13
## can see can see 12
## year ago year ago 12
## even though even though 11
## first time first time 11
#Create a Trigram
TrigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=3,max=3))
trigramN<-TermDocumentMatrix(docs2N,control=list(tokenize=TrigramTokenizer))
#extract frequencies of the trigram
freqN<-sort(rowSums(as.matrix(trigramN)),decreasing=TRUE)
freq.dfN<-data.frame(word=names(freqN),freqN=freqN)
head(freq.dfN,10)
## word freqN
## amazon servic llc amazon servic llc 4
## sc sc dec sc sc dec 4
## cemeteri summari card cemeteri summari card 3
## don’t get wrong don’t get wrong 3
## farm caus climat farm caus climat 3
## law without law law without law 3
## long hammer ipa long hammer ipa 3
## look forward see look forward see 3
## nec multisync lcd nec multisync lcd 3
## new york citi new york citi 3
The sample size was again reduced to 2,000 documents because of the memory issues encountered when conducting this analysis.
#Most frequent bigrams in News dataset
namesNB=c("feel like","right now","year old","last week","look like","new york","can see","year ago",
"even though","first time")
barplot(freq.dfNB[1:10,]$freqNB,las=2,names.arg=namesNB,col="red",cex.names=0.8,
ylab="Frequency",main="Most Frequent Bigrams in News Dataset")
con2<-file("en_US.twitter.txt","r")
lineTwitter<-readLines(con2)
close(con2)
summary(lineTwitter)
## Length Class Mode
## 2360148 character character
filePath2<-"C:/Users/kumi/Desktop/final/en_US/en_US.twitter.txt"
text2<-readLines(filePath2)
#load the data as corpus
docsT<-Corpus(VectorSource(text2))
docs1T<-sample(docsT,5000,replace=FALSE)#take a sample size of 5000
docs1T
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5000
#Text transformation: replace /, @ and | with a space
toSpace<-content_transformer(function(x,pattern)gsub(pattern," ",x))
docs1T<-tm_map(docs1T,toSpace,"/")
docs1T<-tm_map(docs1T,toSpace,"@")
docs1T<-tm_map(docs1T,toSpace,"\\|")
#Cleaning the text
docs1T<-tm_map(docs1T,content_transformer(tolower))
docs1T<-tm_map(docs1T,removeNumbers)
docs1T<-tm_map(docs1T,removeWords,stopwords("english"))
docs1T<-tm_map(docs1T,removeWords,c("blabla1","blabla2"))#remove your own stopwords
docs1T<-tm_map(docs1T,removePunctuation)
docs1T<-tm_map(docs1T,stripWhitespace)
docs1T<-tm_map(docs1T,stemDocument)
library(NLP)
library(tm)
tdmT<-TermDocumentMatrix(docs1T)
tdmT
## <<TermDocumentMatrix (terms: 7584, documents: 5000)>>
## Non-/sparse entries: 32665/37887335
## Sparsity : 100%
## Maximal term length: 79
## Weighting : term frequency (tf)
mT<-as.matrix(tdmT)
m1T<-sort(rowSums(mT),decreasing=TRUE)
dT<-data.frame(word=names(m1T),freq=m1T)
head(dT,10)
## word freq
## get get 312
## just just 298
## like like 280
## thank thank 274
## love love 253
## day day 224
## good good 209
## will will 195
## now now 192
## can can 189
library(wordcloud)
set.seed(1234)
wordcloud(words=dT$word,dT$freq,min.freq=5,max.words=200,random.order=FALSE,rot.per=0.1,
colors=brewer.pal(8,"Dark2"))
findFreqTerms(tdmT,lowfreq=180)#find words that occur at least 180 times
## [1] "get" "thank" "just" "day" "good" "now" "like" "love"
## [9] "one" "can" "will"
#Association
findAssocs(tdmT,terms=c("day","good","will","now","can"),corlimit=0.2)#words with at least 0.2 correlation
## $day
## numeric(0)
##
## $good
## luck
## 0.21
##
## $will
## numeric(0)
##
## $now
## right
## 0.24
##
## $can
## numeric(0)
barplot(dT[1:10,]$freq,las=2,names.arg=dT[1:10,]$word,col="lightgreen",main="Most Frequent Words",
ylab="Word Frequencies")
#load the cleaned data as a VCorpus for n-gram tokenization
docssT<-VCorpus(VectorSource(docs1T))#docs1T is the cleaned Twitter data
docs2T<-sample(docssT,2000,replace=FALSE)#take a sample size of 2000
#Create a Bigram
BigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=2,max=2))
bigramT<-TermDocumentMatrix(docs2T,control=list(tokenize=BigramTokenizer))
#extract frequencies of the bigram
freqTB<-sort(rowSums(as.matrix(bigramT)),decreasing=TRUE)
freq.dfTB<-data.frame(word=names(freqTB),freqTB=freqTB)
head(freq.dfTB,10)
## word freqTB
## last night last night 14
## right now right now 12
## thank follow thank follow 11
## look forward look forward 10
## look like look like 9
## mother day mother day 7
## good luck good luck 6
## thank much thank much 6
## thank share thank share 6
## anyon els anyon els 5
#Create a Trigram
TrigramTokenizer<-function(x)NGramTokenizer(x,Weka_control(min=3,max=3))
trigramT<-TermDocumentMatrix(docs2T,control=list(tokenize=TrigramTokenizer))
#extract frequencies of the trigram
freqTT<-sort(rowSums(as.matrix(trigramT)),decreasing=TRUE)
freq.dfTT<-data.frame(word=names(freqTT),freqTT=freqTT)
head(freq.dfTT,10)
## word freqTT
## happi mother day happi mother day 4
## happi new year happi new year 4
## love love love love love love 4
## round round round round round round 3
## awkward moment realiz awkward moment realiz 2
## bang bang bang bang bang bang 2
## card win card card win card 2
## colonoscopi faint heart colonoscopi faint heart 2
## communiti web produc communiti web produc 2
## follow tweet swag follow tweet swag 2
The sample size was likewise reduced to 2,000 documents because of the memory issues encountered when conducting this analysis.
#Most frequent bigrams in Twitter dataset
namesTB=c("last night","right now","thank follow","look forward","look like","mother day","good luck",
"thank much","thank share","anyon els")
barplot(freq.dfTB[1:10,]$freqTB,las=2,names.arg=namesTB,col="lightgreen",cex.names=0.8,
ylab="Frequency",main="Most Frequent Bigrams in Twitter Dataset")
docs2<-sample(docss,2000,replace=FALSE)#take a sample size of 2000
Blogs<-docs2
docs2N<-sample(docssN,2000,replace=FALSE)#take a sample size of 2000
News<-docs2N
docs2T<-sample(docssT,2000,replace=FALSE)#take a sample size of 2000
Twitter<-docs2T
sample<-c(Blogs,News,Twitter)
#dataframes of 1-gram,2-grams,3-grams
words<-WordTokenizer(sample)
grams<-NGramTokenizer(sample)#default control returns 1- to 3-grams in a single vector
#locate the index of the first bigram (i) and the first unigram (j) in grams
for(i in 1:length(grams))
{if (length(WordTokenizer(grams[i]))==2)break}
for(j in 1:length(grams))
{if (length(WordTokenizer(grams[j]))==1)break}
onegrams<-data.frame(table(words))
onegrams<-onegrams[order(onegrams$Freq,decreasing=TRUE),]
bigrams<-data.frame(table(grams[i:(j-1)]))#entries i to j-1 are the bigrams
bigrams<-bigrams[order(bigrams$Freq,decreasing=TRUE),]
trigrams<-data.frame(table(grams[1:(i-1)]))#entries before the first bigram are the trigrams
trigrams<-trigrams[order(trigrams$Freq,decreasing=TRUE),]
remove(i,j,grams)
#count how many of the most frequent words are needed to cover 50% of all word instances
sumCover<-0
for(i in 1:length(onegrams$Freq)){
sumCover<-sumCover+onegrams$Freq[i]
if(sumCover>=0.5*sum(onegrams$Freq)){break}
}
print(i)
## [1] 11
#count how many of the most frequent words are needed to cover 90% of all word instances
sumCover<-0
for(i in 1:length(onegrams$Freq)){
sumCover<-sumCover+onegrams$Freq[i]
if(sumCover>=0.9*sum(onegrams$Freq)){break}
}
print(i)
## [1] 1138
From the analysis above, 11 words are needed to cover 50% of all word instances and 1138 words to cover 90% of all word instances in the sampled, cleaned corpus; an equivalent vectorized calculation is sketched below.
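As a cross-check (a sketch, not output from the original run), the cumulative frequencies of the sorted unigram table give the coverage counts without a loop:
#vectorized coverage check on the frequency-sorted unigram table
coverage<-cumsum(onegrams$Freq)/sum(onegrams$Freq)
which(coverage>=0.5)[1]#words needed for 50% coverage
which(coverage>=0.9)[1]#words needed for 90% coverage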
To handle words from foreign languages, we can use tm_map with removeWords and a reference word list (for example, an English dictionary or a list of known foreign words); a sketch follows. The words remaining after removing foreign and profane terms form the basis for any meaningful analysis.
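A minimal sketch of this filtering step, assuming hypothetical word-list files profanity.txt and foreign_words.txt (neither is used elsewhere in this report):
#sketch: remove profane and foreign terms with tm_map/removeWords
profanity<-readLines("profanity.txt")#hypothetical word list
foreign<-readLines("foreign_words.txt")#hypothetical word list
docsFiltered<-tm_map(docs1,removeWords,profanity)
docsFiltered<-tm_map(docsFiltered,removeWords,foreign)
Because removeWords builds a single regular expression from the supplied words, very long word lists may need to be applied in batches.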
One way to increase coverage is to reduce the number of low-frequency unique words, either by stemming or by substituting synonyms from a thesaurus resource. Coverage can also be increased by introducing context into the corpora and clustering related word groups together; a simple variant of this idea is sketched below.
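For illustration (a sketch under arbitrary assumptions, not part of the analysis above), rare words in the combined sample can be collapsed into a single placeholder token; the threshold of 5 occurrences is a placeholder choice.
#sketch: collapse words seen fewer than 5 times into an "<unk>" token
rare<-as.character(onegrams$words[onegrams$Freq<5])
wordsCollapsed<-ifelse(words%in%rare,"<unk>",words)
length(unique(wordsCollapsed))#vocabulary size after collapsing rare words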
With extremely large datasets, it is more practical to build a model from smaller samples than from the full data. This exercise in text mining taught me an important lesson about managing memory and processing power in R: machines with large amounts of memory are needed for some of these exercises, and speed is critical.