Loading some required libraries

suppressPackageStartupMessages(library(R.utils))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(stringi))
suppressPackageStartupMessages(library(tm))

Read the files

The files are read using the readLines function, and the number of lines and words in each file is calculated.

twitter<-readLines("en_US.twitter.txt",warn=FALSE,encoding="UTF-8")
blogs<-readLines("en_US.blogs.txt",warn=FALSE,encoding="UTF-8")
news<-readLines("en_US.news.txt",warn=FALSE,encoding="UTF-8")

# element 4 of stri_stats_latex() is the word count ("Words")
twitterwords <- stri_stats_latex(twitter)[4]
blogswords <- stri_stats_latex(blogs)[4]
newswords <- stri_stats_latex(news)[4]

data.frame("File Name" = c("twitter", "blogs", "news"),
           "num.lines" = c(length(twitter),length(blogs), length(news)),
           "num.words" = c(sum(blogswords), sum(newswords), sum(twitterwords)))
##   File.Name num.lines num.words
## 1   twitter   2360148  37570839
## 2     blogs    899288  34494539
## 3      news   1010242  30451128
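
For additional context, the size of each file on disk can be checked as well (a minimal sketch using base R; it assumes the three files sit in the working directory, as above):

# on-disk size of each corpus file, in megabytes
round(file.size("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt")/1024^2, 1)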

Exploratory data analysis

The text is converted to ASCII (characters without an ASCII representation are dropped), a 1% sample is drawn from each file, and the sample is cleaned with the tm package: URLs, Twitter handles, stop words, punctuation, numbers, and extra whitespace are removed before the result is collected into a data frame.

set.seed(10000)
blogs_c<-iconv(blogs,"latin1","ASCII",sub="")
news_c<-iconv(news,"latin1","ASCII",sub="")
twitter_c<-iconv(twitter,"latin1","ASCII",sub="")

sampledata<-c(sample(twitter_c,length(twitter_c)*0.01),
              sample(blogs_c,length(blogs_c)*0.01),
              sample(news_c,length(news_c)*0.01))
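
To make the effect of the conversion concrete, iconv drops every character that has no ASCII representation (a toy example; the input string is made up):

# the accented character is removed, leaving only ASCII
iconv("café", "latin1", "ASCII", sub="")
## [1] "caf"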

corpus <- VCorpus(VectorSource(sampledata))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, content_transformer(tolower))  # content_transformer keeps the corpus structure intact
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpusresult<-data.frame(text=unlist(sapply(corpus,'[',"content")),stringsAsFactors = FALSE)
head(corpusresult)
##                                                                                       text
## 1                                      tiger woods poker night phil hellmuth doyle brunson
## 2                                            suddenly feel start checking retirement homes
## 3                         acutely aware fact major blowers building sound good microphones
## 4                                                                        seem right avatar
## 5                                 new empathic approach used one high school great results
## 6 many diagnosed altzheimers old head injury ever wonder pres reagan subdural hematoma tbi
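
The same cleaning steps can be wrapped in a helper function so they can be reapplied to new text; a minimal sketch that reuses the toSpace transformer defined above (the name clean_corpus is made up):

clean_corpus <- function(x) {
  corpus <- VCorpus(VectorSource(x))
  corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")  # strip URLs
  corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                      # strip Twitter handles
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords("en"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  tm_map(corpus, stripWhitespace)
}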

Then the data is tokenized into unigrams, bigrams, and trigrams, and histograms of the most frequent n-grams are plotted for analysis.

suppressPackageStartupMessages(library(RWeka))
suppressPackageStartupMessages(library(ggplot2))
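
As a quick illustration of what the tokenizer produces, NGramTokenizer splits a string into overlapping n-grams (a toy example; the sentence is made up):

# bigrams of a short sentence
NGramTokenizer("the quick brown fox", Weka_control(min=2, max=2))
## [1] "the quick"   "quick brown" "brown fox"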

The first plot shows the unigram counts, i.e. the frequency of individual words.

##Create Unigrams
unigram<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
unigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=unigram))
unigramcorpus<-findFreqTerms(unigramtab,lowfreq=750)
unigramcorpusnum<-rowSums(as.matrix(unigramtab[unigramcorpus,]))
unigramcorpustab<-data.frame(Word=names(unigramcorpusnum),frequency=unigramcorpusnum)
unigramcorpussort<-unigramcorpustab[order(-unigramcorpustab$frequency),]

##plot unigrams
ggplot(unigramcorpussort[1:15,],aes(x=reorder(Word,-frequency),y=frequency))+
  geom_bar(stat="identity",fill = I("blue"))+
  labs(title="Unigrams",x="Most Words",y="Frequency")+
  theme(axis.text.x=element_text(angle=60))

The unigrams can also be visualized as a word cloud.

suppressPackageStartupMessages(library(wordcloud))
suppressWarnings(
    wordcloud(words = unigramcorpustab$Word,
              freq = unigramcorpustab$frequency,
              min.freq = 1,
              max.words = 100,
              random.order = FALSE,
              rot.per = 0.35, 
              colors=brewer.pal(8, "Dark2"))
)
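
The word placement in the cloud is randomized, so the figure changes between runs; setting a seed beforehand makes the layout reproducible (a minimal sketch):

set.seed(1234)  # any fixed seed gives a repeatable word cloud layout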

The bigrams are computed and plotted in the same way.

##Create Bigrams
bigram<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
bigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=bigram))
bigramcorpus<-findFreqTerms(bigramtab,lowfreq=40)
bigramcorpusnum<-rowSums(as.matrix(bigramtab[bigramcorpus,]))
bigramcorpustab<-data.frame(Word=names(bigramcorpusnum),frequency=bigramcorpusnum)
bigramcorpussort<-bigramcorpustab[order(-bigramcorpustab$frequency),]

##Plot Bigrams
ggplot(bigramcorpussort[1:30,],aes(x=reorder(Word,-frequency),y=frequency))+
  geom_bar(stat="identity",fill = I("blue"))+
  labs(title="Bigrams",x="Most Words",y="Frequency")+
  theme(axis.text.x=element_text(angle=60))

Finally, the trigrams are computed and plotted.

##Create Trigrams
trigram<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
trigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=trigram))
trigramcorpus<-findFreqTerms(trigramtab,lowfreq=4)
trigramcorpusnum<-rowSums(as.matrix(trigramtab[trigramcorpus,]))
trigramcorpustab<-data.frame(Word=names(trigramcorpusnum),frequency=trigramcorpusnum)
trigramcorpussort<-trigramcorpustab[order(-trigramcorpustab$frequency),]

##Plot Trigrams
ggplot(trigramcorpussort[1:20,],aes(x=reorder(Word,-frequency),y=frequency))+
  geom_bar(stat="identity",fill = I("blue"))+
  labs(title="Trigrams",x="Words",y="Frequency")+
  theme(axis.text.x=element_text(angle=60))
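
The three frequency tables can be persisted for later sessions; a minimal sketch using base R (the file names are made up):

# save the sorted n-gram frequency tables for reuse
saveRDS(unigramcorpussort, "unigrams.rds")
saveRDS(bigramcorpussort, "bigrams.rds")
saveRDS(trigramcorpussort, "trigrams.rds")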