Basic report

# The tm (Text Mining) package is required for this assignment.
library(tm)
## Loading required package: NLP
library(NLP)
library(textmineR)
## Loading required package: Matrix
## 
## Attaching package: 'textmineR'
## The following object is masked from 'package:Matrix':
## 
##     update
## The following object is masked from 'package:stats':
## 
##     update
library(RWeka)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer

After downloading and extracting our dataset (be patient, there are a lot of records), we can start exploring it. Rather than using every record, we will take random samples from the files; a small sample suffices for now. This report includes some graphics.
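
For completeness, here is a minimal sketch of the one-time download and extraction step. The URL is the Coursera SwiftKey capstone dataset; this is an assumption, since the original report does not show this step:

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!dir.exists("final")) {
  # Download once and unzip; this creates final/en_US/ among other folders
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")
}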

#setwd("/home/leo")
# Point R at the folder holding the extracted English files
datafolder <- paste0(getwd(), "/RstudioProjects/final/en_US/")
setwd(datafolder)

# Load the three English datasets
twitter <- readLines("en_US.twitter.txt", warn = FALSE)
blogs <- readLines("en_US.blogs.txt", warn = FALSE)
news_file <- readLines("en_US.news.txt", warn = FALSE)

To understand the depth of the dataset, let's run some basic statistics on the 3 files (blogs, news & twitter): each file's size on disk, its number of lines, the length of its longest line, and its total word count.
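
The chunk that computed these statistics (and defined the list l used below) is not shown in the original; here is a minimal sketch of what it might look like. The helper name file_stats and the word-count method are my assumptions, not the author's code:

# Per-file statistics: path, size in MB, line count, longest line, word count.
# readLines is left at its default warn = TRUE, which explains the
# embedded-nul warnings below.
files <- paste0(datafolder, c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"))
file_stats <- function(con) {
  lines <- readLines(con)
  list(file = con,
       size.MB = round(file.info(con)$size / 1024^2, 2),
       num.of.lines = length(lines),
       longest.line = max(nchar(lines)),
       num.of.words = sum(lengths(strsplit(lines, "\\s+"))))
}
l <- lapply(files, file_stats)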

## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
# Assemble the per-file statistics into a data frame
df <- data.frame(matrix(unlist(l), nrow = length(l), byrow = TRUE))
colnames(df) <- c("file", "size(MB)", "num.of.lines", "longest.line", "num.of.words")
df
##                                                      file size(MB) num.of.lines longest.line num.of.words
## 1   /home/leo/RstudioProjects/final/en_US/en_US.blogs.txt   200.42       899288       483415     37334131
## 2    /home/leo/RstudioProjects/final/en_US/en_US.news.txt   196.28      1010242       123628     34372530
## 3 /home/leo/RstudioProjects/final/en_US/en_US.twitter.txt   159.36      2360148           26     30373543

After several tests, a random sample of 1% of the data proved sufficient for this exercise; a larger sample produces an unmanageably large corpus.

Sampling gives us a faster first look at the data, so we can see what is there and decide on next steps before analyzing the entire files.

set.seed(1234)  # make the sampling reproducible

# Take a 1% random sample of each source
t_sample <- sample(twitter, length(twitter) * 0.01)
n_sample <- sample(news_file, length(news_file) * 0.01)
b_sample <- sample(blogs, length(blogs) * 0.01)

# Combine the samples and drop non-ASCII characters (emoji, curly quotes, etc.)
combined_sample <- c(t_sample, n_sample, b_sample)
combined_sample <- iconv(combined_sample, "UTF-8", "ASCII", sub = "")
length(combined_sample)
## [1] 42695

To build the n-grams, I first cleaned the data a little: stripping whitespace, converting to lower case, removing punctuation and numbers, converting to plain-text documents, and removing English stop words.

corpus <- VCorpus(VectorSource(combined_sample))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, content_transformer(tolower))  # wrap base tolower so the corpus structure survives
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Tokenizers and term-document matrices for 1- through 4-grams,
# keeping only terms above a minimum frequency
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
unigram_freqTerm <- findFreqTerms(unigram_tdm, lowfreq = 40)

bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
bigram_freqTerm <- findFreqTerms(bigram_tdm, lowfreq = 40)

trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
trigram_freqTerm <- findFreqTerms(trigram_tdm, lowfreq = 10)

quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
quadgram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = quadgram))
quadgram_freqTerm <- findFreqTerms(quadgram_tdm, lowfreq = 10)
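
As an aside, the four blocks above repeat the same pattern, so a small helper would express it once. This is just a sketch; the name make_tdm is my own and not part of the original report:

# Hypothetical refactor of the repeated tokenizer/TDM pattern
make_tdm <- function(corpus, n) {
  tok <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  TermDocumentMatrix(corpus, control = list(tokenize = tok))
}
# e.g. bigram_tdm <- make_tdm(corpus, 2)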

Unigram frequency plot and word cloud

# Frequencies of terms above the threshold, sorted in decreasing order
# (the bigram and trigram blocks below follow the same pattern)
unigram_freq <- rowSums(as.matrix(unigram_tdm[unigram_freqTerm,]))
unigram_ord <- order(unigram_freq, decreasing = TRUE)
unigram_freq <- data.frame(word = names(unigram_freq[unigram_ord]), frequency = unigram_freq[unigram_ord])

ggplot(unigram_freq[1:25,], aes(factor(word, levels = unique(word)), frequency)) +
  geom_bar(stat = 'identity')+
  theme(axis.text.x=element_text(angle=90))+
  xlab('Unigram')+
  ylab('Frequency')

wordcloud(unigram_freq$word, unigram_freq$frequency, max.words=40, colors=brewer.pal(8, "Set1"))

Now the same with our constructed bigrams

bigram_freq <- rowSums(as.matrix(bigram_tdm[bigram_freqTerm,]))
bigram_ord <- order(bigram_freq, decreasing = TRUE)
bigram_freq <- data.frame(word=names(bigram_freq[bigram_ord]), frequency=bigram_freq[bigram_ord])

ggplot(bigram_freq[1:20,], aes(factor(word, levels = unique(word)), frequency)) +
  geom_bar(stat = 'identity')+
  theme(axis.text.x=element_text(angle=90))+
  xlab('Bigram')+
  ylab('Frequency')

wordcloud(bigram_freq$word, bigram_freq$frequency, max.words=30, colors=brewer.pal(8, "Set1"))
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## cant wait could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## im going could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## last year could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## good morning could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## high school could not be fit on page. It will not be plotted.

And with the trigrams

trigram_freq <- rowSums(as.matrix(trigram_tdm[trigram_freqTerm,]))
trigram_ord <- order(trigram_freq, decreasing = TRUE)
trigram_freq <- data.frame(word=names(trigram_freq[trigram_ord]), frequency=trigram_freq[trigram_ord])

ggplot(trigram_freq[1:15,], aes(factor(word, levels = unique(word)), frequency)) +
  geom_bar(stat = 'identity')+
  theme(axis.text.x=element_text(angle=90))+
  xlab('Trigram')+
  ylab('Frequency')

wordcloud(trigram_freq$word, trigram_freq$frequency, max.words=15, colors=brewer.pal(8, "Set1"))
## Warning in wordcloud(trigram_freq$word, trigram_freq$frequency, max.words =
## 15, : happy mothers day could not be fit on page. It will not be plotted.

Nothing of importance was found with the quadgrams. I used only 1% of the data; perhaps I need more data (and a better laptop).
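
For completeness, the quadgram frequencies could be inspected the same way as the other n-grams. This is a sketch mirroring the blocks above; with only a 1% sample the lowfreq = 10 list is very short, so an empty result needs guarding:

# Mirror the earlier frequency blocks for the quadgrams
if (length(quadgram_freqTerm) > 0) {
  quadgram_freq <- rowSums(as.matrix(quadgram_tdm[quadgram_freqTerm, ]))
  quadgram_ord <- order(quadgram_freq, decreasing = TRUE)
  head(data.frame(word = names(quadgram_freq[quadgram_ord]),
                  frequency = quadgram_freq[quadgram_ord]))
}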