Synopsis

This report presents the exploratory analysis of the text data and outlines the goals for the application and algorithm developed in this project. The data consist of tweets, blog posts and news articles from the HC Corpora collection at www.corpora.heliohost.org.

Initialization

library(ggplot2)       # plotting
library(R.utils)       # file and line-count utilities
library(quanteda)      # quantitative text analysis
library(RColorBrewer)  # colour palettes
library(plyr)          # data manipulation
library(tm)            # text mining: corpus handling and cleaning
library(RWeka)         # Weka n-gram tokenizers

Set options and locate the data files

options(scipen=999)         # avoid scientific notation in printed output

twitter <- '/Users/uiuc/Downloads/final/en_US/en_US.twitter.txt'
news <- '/Users/uiuc/Downloads/final/en_US/en_US.news.txt'
blogs <- '/Users/uiuc/Downloads/final/en_US/en_US.blogs.txt'
dir <- '/Users/uiuc/Downloads/final'
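If the files are not already on disk, the archive can be fetched and unzipped first. A minimal sketch, assuming the commonly used Coursera mirror of the corpus (the URL and the if-guard are assumptions, not part of the original analysis):

zipUrl  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"  # assumed mirror URL
zipFile <- file.path(dirname(dir), "Coursera-SwiftKey.zip")
if (!file.exists(twitter)) {
  download.file(zipUrl, zipFile, mode = "wb")   # fetch the archive
  unzip(zipFile, exdir = dirname(dir))          # extracts to .../Downloads/final/en_US/
}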

Basic Summary

A basic summary of the complete file content follows:

File                 Size on Disk [MB]      Lines   Word Count [Tokens]
en_US.twitter.txt           167.105338    2360148              30374206
en_US.news.txt              205.811889    1010242              34372720
en_US.blogs.txt             210.160014     899288              37334690
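These figures can be reproduced approximately with base R. The helper summarizeFile below is hypothetical and introduced only for illustration; its whitespace-based word count may differ slightly from the tokenizer behind the table above.

summarizeFile <- function(path) {
  lines <- readLines(path, skipNul = TRUE)                    # read the full file
  data.frame(File   = basename(path),
             SizeMB = file.info(path)$size / 1024^2,          # size on disk in MB
             Lines  = length(lines),
             Words  = sum(lengths(strsplit(lines, "\\s+"))))  # crude whitespace token count
}

do.call(rbind, lapply(c(twitter, news, blogs), summarizeFile))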

Data Tokenization

A 2% subset of the first 20,000 lines of each file is sampled. URLs and Twitter handles are stripped; numbers, punctuation, English stop words and extra whitespace are removed; and all text is converted to lower case. Bigrams and trigrams are then created from the sampled data using the RWeka tokenizers.

blogsData <- readLines(blogs, 20000)      # read the first 20,000 lines of each file
newsData <- readLines(news, 20000)
twitterData <- readLines(twitter, 20000)

sampledData <- c(sample(blogsData, length(blogsData) * 0.02),    # 2% sample of each source
                 sample(newsData, length(newsData) * 0.02),
                 sample(twitterData, length(twitterData) * 0.02))

docs <- VCorpus(VectorSource(sampledData))
docs <- tm_map(docs, content_transformer(function(x, pattern) gsub(pattern, " ", x)), "(f|ht)tp(s?)://(.*)[.][a-z]+")  # strip URLs
docs <- tm_map(docs, content_transformer(function(x, pattern) gsub(pattern, " ", x)), "@[^\\s]+")                      # strip Twitter handles
docs <- tm_map(docs, content_transformer(tolower))        # lower-case without breaking the corpus structure
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("en"))        # drop English stop words
docs <- tm_map(docs, stripWhitespace)                     # collapse repeated whitespace
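A quick, purely illustrative sanity check of the cleaning step:

as.character(docs[[1]])   # inspect the first cleaned document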

# n-gram tokenizer wrappers around RWeka's NGramTokenizer
unigramT <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigramT <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramT <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
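As a quick illustration (the sentence is made up), the bigram tokenizer returns overlapping word pairs:

bigramT("the quick brown fox")
# expected: "the quick"  "quick brown"  "brown fox"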

# Build a sorted frequency table (term and count) from a term-document matrix
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}


# Frequency tables; removeSparseTerms() drops terms absent from more than 99.99% of documents.
# Unigrams use tm's default single-word tokenizer; bigrams and trigrams use the RWeka tokenizers.
unigramF <- getFreq(removeSparseTerms(TermDocumentMatrix(docs), 0.9999))
bigramF <- getFreq(removeSparseTerms(TermDocumentMatrix(docs, control = list(tokenize = bigramT)), 0.9999))
trigramF <- getFreq(removeSparseTerms(TermDocumentMatrix(docs, control = list(tokenize = trigramT)), 0.9999))
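Before plotting, the most frequent terms in each table can be inspected directly:

head(unigramF, 10)   # top unigrams
head(bigramF, 10)    # top bigrams
head(trigramF, 10)   # top trigrams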

Token Graph Analysis

Let us create a generic plotting function for the 30 most frequent terms:

# Bar chart of the 30 most frequent terms in a frequency table
makePlot <- function(data, label) {
  ggplot(data[1:30, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +  # tilt the term labels
    geom_bar(stat = "identity", fill = I("grey50"))
}

For unigrams,

makePlot(unigramF, "30 Most Common Unigrams")

For bigrams,

makePlot(bigramF, "30 Most Common Bigrams")

For trigrams,

makePlot(trigramF, "30 Most Common Trigrams")