Load the three data files and compute the per-line word counts:
twitter <- "final/en_US/en_US.twitter.txt"
twitter_size <- file.size(twitter)
twitter_lines <- readLines(twitter, skipNul = TRUE)
twitter_word_counts <- lengths(strsplit(twitter_lines, " "))
blogs <- "final/en_US/en_US.blogs.txt"
blogs_size <- file.size(blogs)
blogs_lines <- readLines(blogs, skipNul = TRUE)
blogs_word_counts <- lengths(strsplit(blogs_lines, " "))
news <- "final/en_US/en_US.news.txt"
news_size <- file.size(news)
news_lines <- readLines(news, skipNul = TRUE)
news_word_counts <- lengths(strsplit(news_lines, " "))
Basic metrics of the data:
one_MB <- 1024*1024
m <- matrix(c(twitter_size/one_MB, blogs_size/one_MB, news_size/one_MB,
              length(twitter_lines), length(blogs_lines), length(news_lines),
              sum(twitter_word_counts), sum(blogs_word_counts), sum(news_word_counts)),
            nrow = 3, ncol = 3,
            dimnames = list(c("twitter", "blogs", "news"), c("Size (MB)", "Lines", "Words")))
m
## Size (MB) Lines Words
## twitter 159.3641 2360148 30373583
## blogs 200.4242 899288 37334131
## news 196.2775 1010242 34372530
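The size, line and word metrics are computed the same way for each file, so they could also be produced by a small helper function. The sketch below uses a hypothetical helper name, file_metrics, and assumes only the three file paths defined above:
file_metrics <- function(path) {
  lines <- readLines(path, skipNul = TRUE)            # read the raw lines
  c(size_mb = file.size(path) / (1024 * 1024),        # file size in MB
    lines   = length(lines),                          # number of lines
    words   = sum(lengths(strsplit(lines, " "))))     # rough word count per file
}
# e.g. t(sapply(c(twitter, blogs, news), file_metrics))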
Next we do some text mining using the tm package (with the wordcloud and RColorBrewer packages for the plots below).
library(tm)
library(wordcloud)
library(RColorBrewer)
First we create the Twitter corpus:
t_corpus <- Corpus(VectorSource(twitter_lines))
t_corpus <- tm_map(t_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(t_corpus, removeNumbers): transformation drops
## documents
t_corpus <- tm_map(t_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(t_corpus, removePunctuation): transformation
## drops documents
t_corpus <- tm_map(t_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(t_corpus, stripWhitespace): transformation drops
## documents
t_corpus <- tm_map(t_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(t_corpus, content_transformer(tolower)):
## transformation drops documents
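The same four cleaning transformations are applied again to the blogs and news corpora further below, so they could be wrapped in a helper. This is only a sketch with a hypothetical name, clean_corpus, assuming the tm package is loaded:
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removeNumbers)                 # drop digits
  corpus <- tm_map(corpus, removePunctuation)             # drop punctuation
  corpus <- tm_map(corpus, stripWhitespace)               # collapse repeated spaces
  corpus <- tm_map(corpus, content_transformer(tolower))  # lower-case everything
  corpus
}
# e.g. t_corpus <- clean_corpus(Corpus(VectorSource(twitter_lines)))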
Now we select the most frequent terms:
t_dtm <- DocumentTermMatrix(t_corpus)
t_dtm <- removeSparseTerms(t_dtm, 0.99)
t_freq <- data.frame(sort(colSums(as.matrix(t_dtm)), decreasing = TRUE))
t_freq <- head(t_freq, 20)
head(t_freq)
## sort.colSums.as.matrix.t_dtm....decreasing...TRUE.
## the 933564
## you 542830
## and 433606
## for 384459
## that 232783
## with 172957
As a barplot:
barplot(t_freq[,1], names.arg = rownames(t_freq), las = 2)
As a word cloud:
wordcloud(rownames(t_freq), t_freq[,1], max.words=50, colors=brewer.pal(3, "Dark2"))
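The frequency extraction is also repeated for each corpus below, so it too could be factored into a helper. The sketch uses a hypothetical name, top_terms, and assumes a cleaned tm corpus as input:
top_terms <- function(corpus, n = 20, sparse = 0.99) {
  dtm  <- removeSparseTerms(DocumentTermMatrix(corpus), sparse)  # keep only common terms
  freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)       # total count per term
  head(data.frame(term = names(freq), count = freq), n)
}
# e.g. top_terms(t_corpus)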
We now repeat the same steps for the blogs data. First we create the corpus:
b_corpus <- Corpus(VectorSource(blogs_lines))
b_corpus <- tm_map(b_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(b_corpus, removeNumbers): transformation drops
## documents
b_corpus <- tm_map(b_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(b_corpus, removePunctuation): transformation
## drops documents
b_corpus <- tm_map(b_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(b_corpus, stripWhitespace): transformation drops
## documents
b_corpus <- tm_map(b_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(b_corpus, content_transformer(tolower)):
## transformation drops documents
Now we select the most frequent terms:
b_dtm <- DocumentTermMatrix(b_corpus)
b_dtm <- removeSparseTerms(b_dtm, 0.99)
b_freq <- data.frame(sort(colSums(as.matrix(b_dtm)), decreasing=TRUE))
b_freq <- head(b_freq, 20)
head(b_freq)
## sort.colSums.as.matrix.b_dtm....decreasing...TRUE.
## the 1848731
## and 1084938
## that 458176
## for 362171
## you 293159
## with 285899
As a barplot:
barplot(b_freq[,1], names.arg = rownames(b_freq), las = 2)
As a word cloud:
wordcloud(rownames(b_freq), b_freq[,1], max.words=50, colors=brewer.pal(3, "Dark2"))
Finally we repeat the analysis for the news data. First we create the corpus:
n_corpus <- Corpus(VectorSource(news_lines))
n_corpus <- tm_map(n_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(n_corpus, removeNumbers): transformation drops
## documents
n_corpus <- tm_map(n_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(n_corpus, removePunctuation): transformation
## drops documents
n_corpus <- tm_map(n_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(n_corpus, stripWhitespace): transformation drops
## documents
n_corpus <- tm_map(n_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(n_corpus, content_transformer(tolower)):
## transformation drops documents
Now we select the most frequent terms:
n_dtm <- DocumentTermMatrix(n_corpus)
n_dtm <- removeSparseTerms(n_dtm, 0.99)
n_freq <- data.frame(sort(colSums(as.matrix(n_dtm)), decreasing=TRUE))
n_freq <- head(n_freq, 20)
head(n_freq)
## sort.colSums.as.matrix.n_dtm....decreasing...TRUE.
## the 1967293
## and 883630
## for 352683
## that 345662
## with 254508
## said 250326
As a barplot:
barplot(n_freq[,1], names.arg = rownames(n_freq), las = 2)
As a word cloud:
wordcloud(rownames(n_freq), n_freq[,1], max.words=50, colors=brewer.pal(3, "Dark2"))
The three files show slightly different word frequencies, which can be explained by their different contexts. For example, the word ‘said’ appears among the top terms for the news data because news articles frequently report quoted speech.
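To make this comparison concrete, the top terms of the three sources can be lined up side by side. This is a minimal sketch reusing the t_freq, b_freq and n_freq data frames computed above (each holds the 20 most frequent terms of its corpus):
top_terms_compare <- data.frame(
  twitter = rownames(t_freq),   # top terms from the Twitter corpus
  blogs   = rownames(b_freq),   # top terms from the blogs corpus
  news    = rownames(n_freq)    # top terms from the news corpus
)
head(top_terms_compare, 10)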