# Summary
The objective of this project is to explore the English blogs, news and Twitter data sets as a first step towards building a word-prediction algorithm.
The following steps are performed: loading the data, drawing a 1% random sample, cleaning the sample with the tm package, building unigram to quadgram term-document matrices, and visualizing the most frequent n-grams with word clouds and bar plots.
To start, the required libraries are loaded, together with the data sets:
library(tm)
## Loading required package: NLP
library(ngram)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(tau)
library(RWeka)
blog <- readLines("en_US.blogs.txt", warn=FALSE, encoding="UTF-8")
twit <- readLines("en_US.twitter.txt", warn=FALSE, encoding="UTF-8")
news <- readLines("en_US.news.txt", warn=FALSE, encoding="UTF-8")
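Before sampling, a quick look at the size of each file helps set expectations. This is an illustrative sketch (not part of the original analysis) using base R only; line lengths are measured in bytes to sidestep invalid multibyte characters.
# Basic size statistics for the three files loaded above (a sketch).
data.frame(source = c("blogs", "news", "twitter"),
           lines = c(length(blog), length(news), length(twit)),
           longest_line = c(max(nchar(blog, type = "bytes")),
                            max(nchar(news, type = "bytes")),
                            max(nchar(twit, type = "bytes"))))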
First, a random sample of 1% of each file is taken.
This allows a faster look at the data, to explore it and decide on next steps before analyzing the entire files.
set.seed(2250)
twit_sample <- sample(twit, length(twit)*.01)
news_sample <- sample(news, length(news)*.01)
blog_sample <- sample(blog, length(blog)*.01)
combined_sample <- c(twit_sample, blog_sample, news_sample)
combined_sample <- iconv(combined_sample, "UTF-8","ASCII", sub="")
length(combined_sample)
## [1] 37245
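As a rough size check (an illustrative sketch, not part of the original analysis), the sample can also be measured in words rather than lines:
# Approximate word count of the combined sample, counting
# whitespace-separated tokens (so only a rough figure).
sum(lengths(strsplit(combined_sample, "\\s+")))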
Next, several transformations from the Text Mining (tm) package are used to clean up the data: strip extra whitespace, convert to lower case, remove punctuation, remove numbers, and remove English stop words.
corpus <- VCorpus(VectorSource(combined_sample))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, content_transformer(tolower)) # content_transformer keeps each document a PlainTextDocument
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
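As a quick sanity check (not in the original write-up), a couple of cleaned documents can be inspected to confirm the transformations behaved as expected:
# Peek at the first two cleaned documents; output varies with the
# random sample.
lapply(corpus[1:2], as.character)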
First the n-gram tokenizers are defined, the term-document matrices are built, and the frequent terms are extracted:
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
unigram_freqTerm <- findFreqTerms(unigram_tdm,lowfreq = 40)
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
bigram_freqTerm <- findFreqTerms(bigram_tdm,lowfreq=40)
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
trigram_freqTerm <- findFreqTerms(trigram_tdm,lowfreq=10)
quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
quadgram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = quadgram))
quadgram_freqTerm <- findFreqTerms(quadgram_tdm,lowfreq=10)
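To see how aggressive the frequency cut-offs are, the full vocabulary of each matrix can be compared with the number of frequent terms kept. A small sketch, assuming the four matrices above are in memory:
# Vocabulary size vs. terms surviving each lowfreq cut-off.
data.frame(ngram = c("unigram", "bigram", "trigram", "quadgram"),
           vocabulary = c(nrow(unigram_tdm), nrow(bigram_tdm),
                          nrow(trigram_tdm), nrow(quadgram_tdm)),
           kept = c(length(unigram_freqTerm), length(bigram_freqTerm),
                    length(trigram_freqTerm), length(quadgram_freqTerm)))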
Then the unigram word cloud and bar plot are created:
unigram_freq <- rowSums(as.matrix(unigram_tdm[unigram_freqTerm,]))
unigram_ord <- order(unigram_freq, decreasing = TRUE)
unigram_freq <- data.frame(word=names(unigram_freq[unigram_ord]), frequency=unigram_freq[unigram_ord])
wordcloud(unigram_freq$word, unigram_freq$frequency, max.words=40, colors=brewer.pal(8, "Set1"))
ggplot(unigram_freq[1:25,], aes(factor(word, levels = unique(word)), frequency)) +
geom_bar(stat = 'identity')+
theme(axis.text.x=element_text(angle=90))+
xlab('Unigram')+
ylab('Frequency')
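The frequency-table steps above are repeated verbatim for the bigrams, trigrams and quadgrams below. A small helper function (hypothetical, shown only as a possible refactoring; not used in the remaining code) could factor out the pattern:
# Hypothetical helper: subset the TDM to its frequent terms, sum the
# counts, sort, and return a word/frequency data frame.
freq_table <- function(tdm, freq_terms) {
  freq <- sort(rowSums(as.matrix(tdm[freq_terms, ])), decreasing = TRUE)
  data.frame(word = names(freq), frequency = freq)
}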
Then the bigram word cloud and bar plot are created:
bigram_freq <- rowSums(as.matrix(bigram_tdm[bigram_freqTerm,]))
bigram_ord <- order(bigram_freq, decreasing = TRUE)
bigram_freq <- data.frame(word=names(bigram_freq[bigram_ord]), frequency=bigram_freq[bigram_ord])
wordcloud(bigram_freq$word, bigram_freq$frequency, max.words=30, colors=brewer.pal(8, "Set1"))
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## high school could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## make sure could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## looking forward could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## new york could not be fit on page. It will not be plotted.
ggplot(bigram_freq[1:20,], aes(factor(word, levels = unique(word)), frequency)) +
geom_bar(stat = 'identity')+
theme(axis.text.x=element_text(angle=90))+
xlab('Bigram')+
ylab('Frequency')
Then the trigram word cloud and bar plot are created:
trigram_freq <- rowSums(as.matrix(trigram_tdm[trigram_freqTerm,]))
trigram_ord <- order(trigram_freq, decreasing = TRUE)
trigram_freq <- data.frame(word=names(trigram_freq[trigram_ord]), frequency=trigram_freq[trigram_ord])
wordcloud(trigram_freq$word, trigram_freq$frequency, max.words=15, colors=brewer.pal(8, "Set1"))
## Warning in wordcloud(trigram_freq$word, trigram_freq$frequency, max.words =
## 15, : happy mothers day could not be fit on page. It will not be plotted.
ggplot(head(trigram_freq, 15), aes(factor(word, levels = unique(word)), frequency)) +
geom_bar(stat = 'identity')+
theme(axis.text.x=element_text(angle=90))+
xlab('Trigram')+
ylab('Frequency')
Then the quadgram word cloud and bar plot are created:
quadgram_freq <- rowSums(as.matrix(quadgram_tdm[quadgram_freqTerm,]))
quadgram_ord <- order(quadgram_freq, decreasing = TRUE)
quadgram_freq <- data.frame(word=names(quadgram_freq[quadgram_ord]), frequency=quadgram_freq[quadgram_ord])
wordcloud(quadgram_freq$word, quadgram_freq$frequency, max.words=15, colors=brewer.pal(8, "Set1"))
## Warning in wordcloud(quadgram_freq$word, quadgram_freq$frequency, max.words
## = 15, : follow follow follow follow could not be fit on page. It will not be
## plotted.
ggplot(head(quadgram_freq, 15), aes(factor(word, levels = unique(word)), frequency)) +
geom_bar(stat = 'identity')+
theme(axis.text.x=element_text(angle=90))+
xlab('Quadgram')+
ylab('Frequency')
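Since these frequency tables will feed the prediction model later, it is worth persisting them. A minimal sketch; the file names are illustrative:
# Save the four n-gram frequency tables for reuse (illustrative names).
saveRDS(unigram_freq, "unigram_freq.rds")
saveRDS(bigram_freq, "bigram_freq.rds")
saveRDS(trigram_freq, "trigram_freq.rds")
saveRDS(quadgram_freq, "quadgram_freq.rds")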
## Results so far
It can be seen that the longer the n-gram, the less frequent it becomes. Further analysis is required for the desired machine-learning (word-prediction) algorithm.
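As an illustration of where this is heading (a sketch, not the final algorithm), the quadgram table already supports a crude next-word lookup. predict_next is a hypothetical helper; the input phrase must be cleaned the same way as the corpus, and a real model would back off to shorter n-grams when no quadgram matches.
# Hypothetical lookup: find quadgrams starting with the last three
# (cleaned) words typed and return the completion of the most frequent
# match. Assumes the phrase contains no regex metacharacters.
predict_next <- function(phrase, freq_tab = quadgram_freq) {
  hits <- freq_tab[grepl(paste0("^", phrase, " "), freq_tab$word), ]
  if (nrow(hits) == 0) return(NA_character_)
  tail(strsplit(as.character(hits$word[1]), " ")[[1]], 1)
}
predict_next("thanks for the")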