Loading the data

Load the data set and compute some basic metrics:

twitter <- "final/en_US/en_US.twitter.txt"
twitter_size <- file.size(twitter)
twitter_lines <- readLines(twitter, skipNul = TRUE)
twitter_word_counts <- lengths(strsplit(twitter_lines, " "))  # approximate word count: split each line on single spaces

blogs <- "final/en_US/en_US.blogs.txt"
blogs_size <- file.size(blogs)
blogs_lines <- readLines(blogs, skipNul = TRUE)
blogs_word_counts <- lengths(strsplit(blogs_lines, " "))

news <- "final/en_US/en_US.news.txt"
news_size <- file.size(news)
news_lines <- readLines(news, skipNul = TRUE)
news_word_counts <- lengths(strsplit(news_lines, " "))
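
Splitting each line on single spaces only approximates the word count, since runs of whitespace get counted as extra words. As an illustrative check (not used in the metrics below), the counts can also be computed by splitting on any run of whitespace:

twitter_word_counts_ws <- lengths(strsplit(trimws(twitter_lines), "\\s+"))
sum(twitter_word_counts) - sum(twitter_word_counts_ws)  # difference between the two approximations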

Basic metrics

Basic metrics of the data:

one_MB <- 1024*1024
m <- matrix(c(twitter_size/one_MB, blogs_size/one_MB, news_size/one_MB,
              length(twitter_lines),length(blogs_lines),length(news_lines),
              sum(twitter_word_counts), sum(blogs_word_counts), sum(news_word_counts)), 
            nrow=3,ncol=3,
            dimnames = list(c("twitter","blogs", "news"),c("Size (MB)", "Lines","Words")))
m
##         Size (MB)   Lines    Words
## twitter  159.3641 2360148 30373583
## blogs    200.4242  899288 37334131
## news     196.2775 1010242 34372530
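
For a nicer table in the knitted report, the same matrix can also be rendered with knitr::kable (a minimal sketch, assuming knitr is installed):

knitr::kable(m, digits = 1)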

Text mining for twitter, blogs and news

Next we do some text mining using the tm package.
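
The code below assumes the required packages are already attached; if they were not loaded earlier, they need to be loaded first:

library(tm)            # Corpus, tm_map, DocumentTermMatrix, removeSparseTerms
library(wordcloud)     # wordcloud()
library(RColorBrewer)  # brewer.pal()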

en_US.twitter.txt

First we create the corpus:

t_corpus <- Corpus(VectorSource(twitter_lines))
t_corpus <- tm_map(t_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(t_corpus, removeNumbers): transformation drops
## documents
t_corpus <- tm_map(t_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(t_corpus, removePunctuation): transformation
## drops documents
t_corpus <- tm_map(t_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(t_corpus, stripWhitespace): transformation drops
## documents
t_corpus <- tm_map(t_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(t_corpus, content_transformer(tolower)):
## transformation drops documents

Now we build a document-term matrix, drop sparse terms (those appearing in fewer than 1% of the tweets) and keep the 20 most frequent remaining terms:

t_dtm <- DocumentTermMatrix(t_corpus)
t_dtm <- removeSparseTerms(t_dtm, 0.99)
t_freq <- data.frame(sort(colSums(as.matrix(t_dtm)), decreasing=TRUE))
t_freq <- head(t_freq, 20)
head(t_freq)
##      sort.colSums.as.matrix.t_dtm....decreasing...TRUE.
## the                                              933564
## you                                              542830
## and                                              433606
## for                                              384459
## that                                             232783
## with                                             172957
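
Note that as.matrix() densifies the document-term matrix, which is memory-hungry with 2.36 million tweets. A sketch of the same column sums computed directly on the sparse matrix, assuming the slam package (a dependency of tm) is available:

library(slam)
head(sort(col_sums(t_dtm), decreasing=TRUE))  # same totals, without converting to a dense matrix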

As a barplot:

barplot(t_freq[,1], names.arg = rownames(t_freq), las = 2)

As a word cloud:

wordcloud(rownames(t_freq), t_freq[,1], max.words=50, colors=brewer.pal(3, "Dark2"))

en_US.blogs.txt

First we create the corpus:

b_corpus <- Corpus(VectorSource(blogs_lines))
b_corpus <- tm_map(b_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(b_corpus, removeNumbers): transformation drops
## documents
b_corpus <- tm_map(b_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(b_corpus, removePunctuation): transformation
## drops documents
b_corpus <- tm_map(b_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(b_corpus, stripWhitespace): transformation drops
## documents
b_corpus <- tm_map(b_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(b_corpus, content_transformer(tolower)):
## transformation drops documents

Now we select the most frequent terms:

b_dtm <- DocumentTermMatrix(b_corpus)
b_dtm <- removeSparseTerms(b_dtm, 0.99)
b_freq <- data.frame(sort(colSums(as.matrix(b_dtm)), decreasing=TRUE))
b_freq <- head(b_freq, 20)
head(b_freq)
##      sort.colSums.as.matrix.b_dtm....decreasing...TRUE.
## the                                             1848731
## and                                             1084938
## that                                             458176
## for                                              362171
## you                                              293159
## with                                             285899

As a barplot:

barplot(b_freq[,1], names.arg = rownames(b_freq), las = 2)

As a word cloud:

wordcloud(rownames(b_freq), b_freq[,1], max.words=50, colors=brewer.pal(3, "Dark2"))

en_US.news.txt

First we create the corpus:

n_corpus <- Corpus(VectorSource(news_lines))
n_corpus <- tm_map(n_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(n_corpus, removeNumbers): transformation drops
## documents
n_corpus <- tm_map(n_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(n_corpus, removePunctuation): transformation
## drops documents
n_corpus <- tm_map(n_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(n_corpus, stripWhitespace): transformation drops
## documents
n_corpus <- tm_map(n_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(n_corpus, content_transformer(tolower)):
## transformation drops documents

Now we select the most frequent terms:

n_dtm <- DocumentTermMatrix(n_corpus)
n_dtm <- removeSparseTerms(n_dtm, 0.99)
n_freq <- data.frame(sort(colSums(as.matrix(n_dtm)), decreasing=TRUE))
n_freq <- head(n_freq, 20)
head(n_freq)
##      sort.colSums.as.matrix.n_dtm....decreasing...TRUE.
## the                                             1967293
## and                                              883630
## for                                              352683
## that                                             345662
## with                                             254508
## said                                             250326

As a barplot:

barplot(n_freq[,1], names.arg = rownames(n_freq), las = 2)

As a word cloud:

wordcloud(rownames(n_freq), n_freq[,1], max.words=50, colors=brewer.pal(3, "Dark2"))

Conclusions

The three corpora show slightly different word frequencies, which can be explained by their different contexts. For example, the word ‘said’ appears among the most frequent terms in the news corpus because news articles often report what someone said.
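
A quick way to check this, assuming the t_freq, b_freq and n_freq tables computed above are still in the workspace, is to compare the top-20 term sets directly:

top_terms <- list(twitter = rownames(t_freq), blogs = rownames(b_freq), news = rownames(n_freq))
Reduce(intersect, top_terms)                                        # terms frequent in all three corpora
setdiff(top_terms$news, union(top_terms$twitter, top_terms$blogs)) # terms frequent only in the news corpus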