This report presents the exploratory analysis and the goals for the text-prediction app and algorithm developed in this project. The data consist of tweets, blog posts, and news articles from the HC Corpora collection at www.corpora.heliohost.org.
library(ggplot2)       # plotting
library(R.utils)       # file utilities such as countLines
library(quanteda)      # text analysis utilities
library(RColorBrewer)  # colour palettes
library(plyr)          # data manipulation
library(tm)            # text mining framework (corpus and transformations)
library(RWeka)         # Weka n-gram tokenizers
Set options and define the paths to the data files:
options(scipen = 999)  # avoid scientific notation in printed numbers
twitter <- '/Users/uiuc/Downloads/final/en_US/en_US.twitter.txt'
news <- '/Users/uiuc/Downloads/final/en_US/en_US.news.txt'
blogs <- '/Users/uiuc/Downloads/final/en_US/en_US.blogs.txt'
dir <- '/Users/uiuc/Downloads/final'
A basic summary of the complete file content follows:
| File | Size on Disk [MB] | Lines | Word Count [Tokens] |
|---|---|---|---|
| en_US.twitter.txt | 167.1 | 2,360,148 | 30,374,206 |
| en_US.news.txt | 205.8 | 1,010,242 | 34,372,720 |
| en_US.blogs.txt | 210.2 | 899,288 | 37,334,690 |
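As a sketch, these figures can be reproduced in R. The helper summarizeFile below is hypothetical (not part of the original analysis); it reads each file fully into memory for a rough whitespace-based token count, so it is slow for files of this size:
# hypothetical helper: size on disk, line count, and a rough token count
summarizeFile <- function(path) {
  sizeMB <- file.info(path)$size / 1024^2                           # size on disk in MB
  lines <- R.utils::countLines(path)                                # number of lines
  words <- sum(sapply(strsplit(readLines(path), "\\s+"), length))   # rough whitespace tokens
  c(sizeMB = sizeMB, lines = lines, words = words)
}
summarizeFile(twitter)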
Numbers, punctuation, English stop words, and extra whitespace are removed, and all text is converted to lower case. Note that only the first 20,000 lines of each file are read, and a 2% subset of those lines is sampled. Unigrams, 2-grams, and 3-grams are then created from the sampled data using the RWeka tokenizers.
set.seed(1234)  # make the sampling reproducible
blogsData <- readLines(blogs, 20000)      # read only the first 20,000 lines of each file
newsData <- readLines(news, 20000)
twitterData <- readLines(twitter, 20000)
sampledData <- c(sample(blogsData, length(blogsData) * 0.02),
                 sample(newsData, length(newsData) * 0.02),
                 sample(twitterData, length(twitterData) * 0.02))
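As a quick sanity check (an illustrative snippet, not part of the pipeline above), the sample should contain roughly 400 lines from each source:
length(sampledData)                              # expected: about 1,200 lines in total
format(object.size(sampledData), units = "MB")   # memory footprint of the sample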
docs <- VCorpus(VectorSource(sampledData))
# drop URLs and Twitter handles before the standard clean-up
docs <- tm_map(docs, content_transformer(function(x, pattern) gsub(pattern, " ", x)), "(f|ht)tp(s?)://(.*)[.][a-z]+")
docs <- tm_map(docs, content_transformer(function(x, pattern) gsub(pattern, " ", x)), "@[^\\s]+")
docs <- tm_map(docs, content_transformer(tolower))  # content_transformer keeps the corpus structure intact
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("en"))
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, PlainTextDocument)
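To verify that the transformations behaved as intended, the first cleaned document can be inspected (an illustrative check):
as.character(docs[[1]])  # expect lower-case text with no punctuation, numbers, or English stop words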
# RWeka tokenizers for 1-, 2-, and 3-grams
unigramT <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigramT <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramT <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# collapse a term-document matrix into a frequency table sorted by count
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
# removeSparseTerms(x, 0.9999) drops terms that are extremely rare across documents
unigramF <- getFreq(removeSparseTerms(TermDocumentMatrix(docs, control = list(tokenize = unigramT)), 0.9999))
bigramF <- getFreq(removeSparseTerms(TermDocumentMatrix(docs, control = list(tokenize = bigramT)), 0.9999))
trigramF <- getFreq(removeSparseTerms(TermDocumentMatrix(docs, control = list(tokenize = trigramT)), 0.9999))
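Before plotting, the most frequent terms can be inspected directly (an illustrative check):
head(unigramF, 5)  # five most frequent unigrams with their counts
head(trigramF, 5)  # five most frequent trigrams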
Let us create a generic plot function for the frequency bar charts:
makePlot <- function(data, label) {
ggplot(data[1:30,], aes(reorder(word, -freq), freq)) +
labs(x = label, y = "Frequency") +
theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
geom_bar(stat = "identity", fill = I("grey50"))
}
For unigrams,
makePlot(unigramF, "30 Most Common Unigrams")
For bigrams,
makePlot(bigramF, "30 Most Common Bigrams")
For trigrams,
makePlot(trigramF, "30 Most Common Trigrams")