First, the required libraries are loaded.
suppressPackageStartupMessages(library(R.utils))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(stringi))
suppressPackageStartupMessages(library(tm))
The files are read using the readLines function, and the number of lines and words in each file is calculated.
twitter<-readLines("en_US.twitter.txt",warn=FALSE,encoding="UTF-8")
blogs<-readLines("en_US.blogs.txt",warn=FALSE,encoding="UTF-8")
news<-readLines("en_US.news.txt",warn=FALSE,encoding="UTF-8")
twitterwords <-stri_stats_latex(twitter)[4]
blogswords <-stri_stats_latex(blogs)[4]
newswords <-stri_stats_latex(news)[4]
data.frame("File Name" = c("twitter", "blogs", "news"),
"num.lines" = c(length(twitter),length(blogs), length(news)),
"num.words" = c(sum(blogswords), sum(newswords), sum(twitterwords)))
## File.Name num.lines num.words
## 1 twitter 2360148 37570839
## 2 blogs 899288 34494539
## 3 news 1010242 30451128
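As a quick cross-check (a minimal sketch, assuming the three character vectors above are still in memory), stri_count_words from stringi gives an independent word count, and file.size reports each file's size on disk:
##Independent word counts; totals may differ slightly from stri_stats_latex
sum(stri_count_words(twitter))
sum(stri_count_words(blogs))
sum(stri_count_words(news))
##File sizes in megabytes
file.size("en_US.twitter.txt") / 1024^2
file.size("en_US.blogs.txt") / 1024^2
file.size("en_US.news.txt") / 1024^2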
The files are converted to ASCII encoding and a 1% random sample of each is taken; the sampled corpus is then cleaned by removing URLs, Twitter handles, stop words, punctuation, numbers and extra whitespace, and the result is stored in a data frame.
set.seed(10000)
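##Convert to ASCII; iconv with sub="" drops characters that cannot be converted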
blogs_c<-iconv(blogs,"latin1","ASCII",sub="")
news_c<-iconv(news,"latin1","ASCII",sub="")
twitter_c<-iconv(twitter,"latin1","ASCII",sub="")
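##Take a 1% random sample of each source to keep processing tractable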
sampledata<-c(sample(twitter_c,length(twitter_c)*0.01),
sample(blogs_c,length(blogs_c)*0.01),
sample(news_c,length(news_c)*0.01))
corpus <- VCorpus(VectorSource(sampledata))
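##toSpace replaces regex matches with a space; used below to strip URLs and Twitter handles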
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpusresult<-data.frame(text=unlist(sapply(corpus,'[',"content")),stringsAsFactors = FALSE)
head(corpusresult)
## text
## 1 tiger woods poker night phil hellmuth doyle brunson
## 2 suddenly feel start checking retirement homes
## 3 acutely aware fact major blowers building sound good microphones
## 4 seem right avatar
## 5 new empathic approach used one high school great results
## 6 many diagnosed altzheimers old head injury ever wonder pres reagan subdural hematoma tbi
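The same cleaned text can also be inspected directly from the corpus; a minimal sketch using the content() accessor (from the NLP package, loaded with tm):
##First cleaned document, equivalent to the first row of corpusresult
content(corpus[[1]])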
Then the data is tokenized into unigrams, bigrams and trigrams, and a frequency histogram of each is created for analysis.
suppressPackageStartupMessages(library(RWeka))
suppressPackageStartupMessages(library(ggplot2))
The first plot shows the unigram counts, which are simply the counts of individual words.
##Create Unigrams
unigram<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
unigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=unigram))
unigramcorpus<-findFreqTerms(unigramtab,lowfreq=750)
unigramcorpusnum<-rowSums(as.matrix(unigramtab[unigramcorpus,]))
unigramcorpustab<-data.frame(Word=names(unigramcorpusnum),frequency=unigramcorpusnum)
unigramcorpussort<-unigramcorpustab[order(-unigramcorpustab$frequency),]
##Plot Unigrams
ggplot(unigramcorpussort[1:15,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("blue"))+
labs(title="Unigrams",x="Most Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
The unigrams can also be plotted as a word cloud of word frequencies.
suppressPackageStartupMessages(library(wordcloud))
suppressWarnings(
wordcloud(words = unigramcorpustab$Word,
freq = unigramcorpustab$frequency,
min.freq = 1,
max.words = 100,
random.order = FALSE,
rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))
)
Next, the bigrams are shown.
##Create Bigrams
bigram<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
bigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=bigram))
bigramcorpus<-findFreqTerms(bigramtab,lowfreq=40)
bigramcorpusnum<-rowSums(as.matrix(bigramtab[bigramcorpus,]))
bigramcorpustab<-data.frame(Word=names(bigramcorpusnum),frequency=bigramcorpusnum)
bigramcorpussort<-bigramcorpustab[order(-bigramcorpustab$frequency),]
##Plot Bigrams
ggplot(bigramcorpussort[1:30,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("blue"))+
labs(title="Bigrams",x="Most Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
Finally, the trigrams are shown.
##Create Trigrams
trigram<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
trigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=trigram))
trigramcorpus<-findFreqTerms(trigramtab,lowfreq=4)
trigramcorpusnum<-rowSums(as.matrix(trigramtab[trigramcorpus,]))
trigramcorpustab<-data.frame(Word=names(trigramcorpusnum),frequency=trigramcorpusnum)
trigramcorpussort<-trigramcorpustab[order(-trigramcorpustab$frequency),]
##Plot Trigrams
ggplot(trigramcorpussort[1:20,],aes(x=reorder(Word,-frequency),y=frequency))+
geom_bar(stat="identity",fill = I("blue"))+
labs(title="Trigrams",x="Most Words",y="Frequency")+
theme(axis.text.x=element_text(angle=60))
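The three n-gram blocks above repeat the same steps; as a sketch, a small helper could factor the pattern out (the name plot_ngrams and its arguments are illustrative, not part of the original analysis).
##Hypothetical helper factoring out the repeated n-gram steps above
plot_ngrams <- function(corpus, n, lowfreq, top, title) {
  tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = tokenizer))
  terms <- findFreqTerms(tdm, lowfreq = lowfreq)
  freqs <- rowSums(as.matrix(tdm[terms, ]))
  tab <- data.frame(Word = names(freqs), frequency = freqs)
  tab <- tab[order(-tab$frequency), ]
  ggplot(tab[1:top, ], aes(x = reorder(Word, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = I("blue")) +
    labs(title = title, x = "N-gram", y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60))
}
##e.g. plot_ngrams(corpus, 2, 40, 30, "Bigrams") reproduces the bigram plot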