knitr::opts_chunk$set(echo = TRUE)
library('tm')
library('wordcloud')
library('stringi')
library('stringr')
library('RWeka')
library('ggplot2')
The aim of this report is to perform an exploratory analysis of the English-language corpora from the Coursera Data Science Specialization Capstone Project.
We begin by loading and pre-processing the data. Afterwards, some data visualization techniques are used to get a sense of the information collected.
setwd("./final/en_US")
enBlog <- readLines("en_US.blogs.txt", skipNul = TRUE, encoding= "UTF-8")
enNews <- readLines("en_US.news.txt", skipNul = TRUE, encoding= "UTF-8")
## Warning in readLines("en_US.news.txt", skipNul = TRUE, encoding = "UTF-8"):
## incomplete final line found on 'en_US.news.txt'
enTwitter <- readLines("en_US.twitter.txt", skipNul = TRUE, encoding= "UTF-8")
To get a first glimpse of the data, a summary of each source is displayed:
summary(enBlog)
## Length Class Mode
## 899288 character character
summary(enNews)
## Length Class Mode
## 77259 character character
summary(enTwitter)
## Length Class Mode
## 2360148 character character
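Since summary() on a character vector only reports its length and storage mode, a slightly richer overview can be sketched with the stringi package loaded above; the counts below are illustrative and depend on the raw files.
# Rough per-source overview: number of lines and total word count
# (sketch using stringi::stri_count_words; not part of the original script)
data.frame(
  source = c("blogs", "news", "twitter"),
  lines = c(length(enBlog), length(enNews), length(enTwitter)),
  words = c(sum(stri_count_words(enBlog)),
            sum(stri_count_words(enNews)),
            sum(stri_count_words(enTwitter)))
)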
Since the corpora are very large, each source is sampled down to 1,000 lines; the samples are then combined into a volatile corpus so that the “tm” package can be used.
set.seed(1)
sampleSize <- 1000
enBlogS <- enBlog[sample(1:length(enBlog),sampleSize)]
enNewsS <- enNews[sample(1:length(enNews),sampleSize)]
enTwitterS <- enTwitter[sample(1:length(enTwitter),sampleSize)]
data <- rbind(enBlogS,enNewsS,enTwitterS)
corpus <- VCorpus(VectorSource(data))
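As a quick sanity check (not part of the original script), printing the corpus confirms it contains the expected number of documents:
# Three samples of 1,000 lines each, so the corpus should report 3000 documents
print(corpus)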
The next step is to clean the data: converting all text to lowercase and removing stopwords, punctuation, numbers, and extra whitespace.
# Wrap base-R functions in content_transformer() so the documents keep their
# PlainTextDocument class throughout the cleaning pipeline
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = "UTF-8", sub = "byte")))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
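To verify the transformations behaved as expected, the first cleaned document can be inspected (a minimal check; the exact output depends on the random sample):
# Peek at the start of the first cleaned document
writeLines(substr(content(corpus[[1]]), 1, 200))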
Once the data is pre-processed, unigrams, bigrams, and trigrams are generated for further analysis.
uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
uni <- TermDocumentMatrix(corpus, control = list(tokenize = uniGramTokenizer))
bi <- TermDocumentMatrix(corpus, control = list(tokenize = biGramTokenizer))
three <- TermDocumentMatrix(corpus, control = list(tokenize = triGramTokenizer))
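Before plotting, tm's findFreqTerms() gives a quick non-graphical look at which terms clear a frequency threshold (the thresholds below are arbitrary choices for this sketch):
# Terms occurring at least 50 times (unigrams) and at least 10 times (bigrams)
head(findFreqTerms(uni, lowfreq = 50), 20)
head(findFreqTerms(bi, lowfreq = 10), 20)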
We start the exploration with wordclouds to visualize the most frequent n-grams.
uniMatrix <- as.matrix(uni)
uniMatrix <- sort(rowSums(uniMatrix),decreasing=TRUE)
uniMatrix <- data.frame(word = names(uniMatrix),freq=uniMatrix)
wordcloud(words = uniMatrix$word, freq = uniMatrix$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
title("1-gram")  # wordcloud() has no 'main' argument; add the title separately
biMatrix <- as.matrix(bi)
biMatrix <- sort(rowSums(biMatrix),decreasing=TRUE)
biMatrix <- data.frame(word = names(biMatrix),freq=biMatrix)
wordcloud(words = biMatrix$word, freq = biMatrix$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
title("2-gram")  # wordcloud() has no 'main' argument; add the title separately
threeMatrix <- as.matrix(three)
threeMatrix <- sort(rowSums(threeMatrix),decreasing=TRUE)
threeMatrix <- data.frame(word = names(threeMatrix),freq=threeMatrix)
wordcloud(words = threeMatrix$word, freq = threeMatrix$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
title("3-gram")  # wordcloud() has no 'main' argument; add the title separately
The most frequent n-grams are also shown as bar charts. First, we need to sort the n-grams by frequency:
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
# Get frequencies of most common n-grams in data sample
uniFreq <- getFreq(uni)
biFreq <- getFreq(bi)
threeFreq <- getFreq(three)
makePlot <- function(data, label) {
  ggplot(data[1:30, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I("grey50"))
}
# par(mfrow) has no effect on ggplot2 output; use gridExtra to place the
# three bar charts side by side instead
library(gridExtra)
grid.arrange(makePlot(uniFreq, "30 Most Common 1-grams"),
             makePlot(biFreq, "30 Most Common 2-grams"),
             makePlot(threeFreq, "30 Most Common 3-grams"),
             ncol = 3)
As a next step, a model will be constructed to predict the most likely word to follow a given phrase.
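As a rough illustration of how the frequency tables above could feed such a model, the sketch below looks up the most frequent trigram matching the last two words of a phrase and falls back to bigrams. The helper predictNext is hypothetical and not part of the analysis above; a real model would need proper smoothing and back-off.
# Hypothetical helper (sketch only): naive frequency-based next-word lookup
# using the sorted n-gram tables built above
predictNext <- function(phrase, tri, bi) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
  # try trigrams whose first two words match the end of the phrase
  hits <- tri[startsWith(as.character(tri$word), paste0(paste(words, collapse = " "), " ")), ]
  if (nrow(hits) == 0) {
    # back off to bigrams starting with the last word
    hits <- bi[startsWith(as.character(bi$word), paste0(tail(words, 1), " ")), ]
  }
  if (nrow(hits) == 0) return(NA_character_)
  # the tables are already sorted by frequency, so take the first match
  tail(unlist(strsplit(as.character(hits$word[1]), " ")), 1)
}
predictNext("happy new", threeFreq, biFreq)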