# Packages required for this assignment: tm (text mining), textmineR, RWeka, ggplot2 and wordcloud.
library(tm)
## Loading required package: NLP
library(NLP)
library(textmineR)
## Loading required package: Matrix
##
## Attaching package: 'textmineR'
## The following object is masked from 'package:Matrix':
##
## update
## The following object is masked from 'package:stats':
##
## update
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
After downloading and extracting the dataset (be patient, there are a lot of records), we can start exploring it. Rather than using every record, we will take random samples from the files; a small sample will suffice for now. This report includes some graphics.
#setwd("/home/leo")
datafolder <- paste(getwd(), "/RstudioProjects/final/en_US/", sep = "")
setwd(datafolder)
# Loading the datasets
twitter <- readLines("en_US.twitter.txt", warn = FALSE)
blogs <- readLines("en_US.blogs.txt", warn = FALSE)
news_file <- readLines("en_US.news.txt", warn = FALSE)
To understand the depth of the dataset, let’s run some basic statistics on the text in the 3 files (blogs, news and twitter): each file’s size, its total number of lines and words, and its longest line.
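The list `l` that holds these per-file statistics is built in a chunk that is not shown. Below is a minimal sketch of how it could be computed; `file_stats` is a hypothetical helper, the word count is approximate, and the exact definitions behind the reported numbers may differ. The embedded-nul warnings that follow are what readLines() emits when warn is left at its default.
# Hypothetical helper (not in the original report): gather size, line count,
# longest line and an approximate word count for one file.
file_stats <- function(path) {
  lines <- readLines(path)  # warn = TRUE by default, hence the warnings below
  list(file = path,
       size.MB = round(file.info(path)$size / 1024^2, 2),
       num.of.lines = length(lines),
       longest.line = max(nchar(lines)),
       num.of.words = sum(lengths(strsplit(lines, "\\s+"))))
}
files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
l <- lapply(paste0(datafolder, files), file_stats)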
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
# Assemble the per-file statistics into a data frame
df <- data.frame(matrix(unlist(l), nrow = length(l), byrow = TRUE))
colnames(df) <- c("file", "size(MB)", "num.of.lines", "longest.line", "num.of.words")
df
##                                                       file size(MB) num.of.lines longest.line num.of.words
## 1   /home/leo/RstudioProjects/final/en_US/en_US.blogs.txt   200.42       899288       483415     37334131
## 2    /home/leo/RstudioProjects/final/en_US/en_US.news.txt   196.28      1010242       123628     34372530
## 3 /home/leo/RstudioProjects/final/en_US/en_US.twitter.txt   159.36      2360148           26     30373543
After several tests, a random sample of 1% of the data was determined to be enough for this exercise; a larger sample produces an unmanageably large corpus. Sampling gives us a faster first look at the data and lets us decide on next steps before analyzing the entire files.
set.seed(1234)
# Sample 1% of each file
t_sample <- sample(twitter, length(twitter) * 0.01)
n_sample <- sample(news_file, length(news_file) * 0.01)
b_sample <- sample(blogs, length(blogs) * 0.01)
combined_sample <- c(t_sample, n_sample, b_sample)
# Strip non-ASCII characters (emoji, curly quotes, etc.)
combined_sample <- iconv(combined_sample, "UTF-8", "ASCII", sub = "")
length(combined_sample)
## [1] 42695
In order to create the n-grams, I chose to clean the data a little: strip extra whitespace, convert to lower case, remove punctuation, remove numbers, convert back to plain-text documents, and remove English stop words.
corpus <- VCorpus(VectorSource(combined_sample))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, tolower)           # returns plain character vectors; content_transformer(tolower) is the more idiomatic call
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, PlainTextDocument) # wrap the text back into PlainTextDocument objects
corpus <- tm_map(corpus, removeWords, stopwords("english"))
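As a quick sanity check (not part of the original output), a few cleaned documents can be inspected before tokenizing:
# Peek at the first few cleaned documents to verify the transformations
inspect(corpus[1:3])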
# Tokenizers and term-document matrices for 1- to 4-grams
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
unigram_freqTerm <- findFreqTerms(unigram_tdm, lowfreq = 40)
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
bigram_freqTerm <- findFreqTerms(bigram_tdm, lowfreq = 40)
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
trigram_freqTerm <- findFreqTerms(trigram_tdm, lowfreq = 10)
quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
quadgram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = quadgram))
quadgram_freqTerm <- findFreqTerms(quadgram_tdm, lowfreq = 10)
unigram_freq <- rowSums(as.matrix(unigram_tdm[unigram_freqTerm,]))
unigram_ord <- order(unigram_freq, decreasing = TRUE)
unigram_freq <- data.frame(word=names(unigram_freq[unigram_ord]), frequency=unigram_freq[unigram_ord])
ggplot(unigram_freq[1:25,], aes(factor(word, levels = unique(word)), frequency)) +
geom_bar(stat = 'identity')+
theme(axis.text.x=element_text(angle=90))+
xlab('Unigram')+
ylab('Frequency')
wordcloud(unigram_freq$word, unigram_freq$frequency, max.words=40, colors=brewer.pal(8, "Set1"))
Now the same analysis with our constructed bigram:
bigram_freq <- rowSums(as.matrix(bigram_tdm[bigram_freqTerm,]))
bigram_ord <- order(bigram_freq, decreasing = TRUE)
bigram_freq <- data.frame(word=names(bigram_freq[bigram_ord]), frequency=bigram_freq[bigram_ord])
ggplot(bigram_freq[1:20,], aes(factor(word, levels = unique(word)), frequency)) +
geom_bar(stat = 'identity')+
theme(axis.text.x=element_text(angle=90))+
xlab('Bigram')+
ylab('Frequency')
wordcloud(bigram_freq$word, bigram_freq$frequency, max.words=30, colors=brewer.pal(8, "Set1"))
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## cant wait could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## im going could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## last year could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## good morning could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram_freq$word, bigram_freq$frequency, max.words = 30, :
## high school could not be fit on page. It will not be plotted.
And with the trigram:
trigram_freq <- rowSums(as.matrix(trigram_tdm[trigram_freqTerm,]))
trigram_ord <- order(trigram_freq, decreasing = TRUE)
trigram_freq <- data.frame(word=names(trigram_freq[trigram_ord]), frequency=trigram_freq[trigram_ord])
ggplot(trigram_freq[1:15,], aes(factor(word, levels = unique(word)), frequency)) +
geom_bar(stat = 'identity')+
theme(axis.text.x=element_text(angle=90))+
xlab('Trigram')+
ylab('Frequency')
wordcloud(trigram_freq$word, trigram_freq$frequency, max.words=15, colors=brewer.pal(8, "Set1"))
## Warning in wordcloud(trigram_freq$word, trigram_freq$frequency, max.words =
## 15, : happy mothers day could not be fit on page. It will not be plotted.
Nothing of importance was found with the quadgram. I used only 1% of the data, so perhaps I need more data (and a better laptop).
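For reference, the quadgram table built earlier can be examined with the same pattern as the other n-grams; a minimal sketch mirroring the blocks above, assuming findFreqTerms returned at least one term:
# Same pattern as the uni-, bi- and trigram blocks above
quadgram_freq <- rowSums(as.matrix(quadgram_tdm[quadgram_freqTerm,]))
quadgram_ord <- order(quadgram_freq, decreasing = TRUE)
quadgram_freq <- data.frame(word = names(quadgram_freq[quadgram_ord]),
                            frequency = quadgram_freq[quadgram_ord])
head(quadgram_freq, 10)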