knitr::opts_chunk$set(echo = TRUE)
library('tm')
library('wordcloud')
library('stringi')
library('stringr')
library('RWeka')
library('ggplot2')

Introduction

The aim of this report is to perform an exploratory analysis of the English-language corpora from the Coursera Data Science Specialization Capstone Project.

We begin by loading and pre-processing the data. Afterwards, some data visualization techniques are used to get a sense of the information collected.

1. Data Loading

setwd("./final/en_US")

enBlog <- readLines("en_US.blogs.txt", skipNul = TRUE, encoding= "UTF-8")
enNews <- readLines("en_US.news.txt", skipNul = TRUE, encoding= "UTF-8")
## Warning in readLines("en_US.news.txt", skipNul = TRUE, encoding = "UTF-8"):
## incomplete final line found on 'en_US.news.txt'
enTwitter <- readLines("en_US.twitter.txt", skipNul = TRUE, encoding= "UTF-8")

To get a glimpse of the data, their summaries are displayed:

summary(enBlog)
##    Length     Class      Mode 
##    899288 character character
summary(enNews)
##    Length     Class      Mode 
##     77259 character character
summary(enTwitter)
##    Length     Class      Mode 
##   2360148 character character
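Since summary() only reports the length and type of each vector, a slightly more informative overview (an optional, illustrative step, assuming the full data sets are still in memory) can be built with the stringi package loaded above, showing line and word counts per source:

# Basic statistics per source: number of lines and total word count
# (stri_count_words comes from the stringi package loaded above)
data.frame(
  source = c("blogs", "news", "twitter"),
  lines  = c(length(enBlog), length(enNews), length(enTwitter)),
  words  = c(sum(stri_count_words(enBlog)),
             sum(stri_count_words(enNews)),
             sum(stri_count_words(enTwitter)))
)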

2. Pre-processing

Since the corpora are too large to process in full, each source is sampled down to 1,000 lines; the samples are then combined into a volatile corpus, so we can use the “tm” package.

set.seed(1)
sampleSize <- 1000

enBlogS <- enBlog[sample(1:length(enBlog),sampleSize)]
enNewsS <- enNews[sample(1:length(enNews),sampleSize)]
enTwitterS <- enTwitter[sample(1:length(enTwitter),sampleSize)]

data <- c(enBlogS, enNewsS, enTwitterS)
corpus <- VCorpus(VectorSource(data))

The next step is to clean the data, removing unwanted characters and converting all letters to lowercase.

# Functions that are not native tm transformations must be wrapped in
# content_transformer() so the corpus keeps its document structure
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = "UTF-8", sub = "byte")))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

Once the data is pre-processed, the n-grams (1-grams, 2-grams, and 3-grams) are generated for further analysis.

3. Tokenization with n-grams

uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

uni <- TermDocumentMatrix(corpus, control = list(tokenize = uniGramTokenizer))
bi <- TermDocumentMatrix(corpus, control = list(tokenize = biGramTokenizer))
three <- TermDocumentMatrix(corpus, control = list(tokenize = triGramTokenizer))
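As a quick sanity check of the tokenization (an optional, illustrative step), tm's findFreqTerms() can list the terms that occur at least a given number of times in the sample; the threshold of 10 used here is arbitrary:

# Optional check: bigrams appearing at least 10 times in the sampled corpus
findFreqTerms(bi, lowfreq = 10)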

4. Exploratory Analysis

We start the exploration with word clouds to visualize the most frequent n-grams.

uniMatrix <- as.matrix(uni)
uniMatrix <- sort(rowSums(uniMatrix),decreasing=TRUE)
uniMatrix <- data.frame(word = names(uniMatrix),freq=uniMatrix)
wordcloud(words = uniMatrix$word, freq = uniMatrix$freq, min.freq = 1,
          max.words=150, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"),main="1-gram")

biMatrix <- as.matrix(bi)
biMatrix <- sort(rowSums(biMatrix),decreasing=TRUE)
biMatrix <- data.frame(word = names(biMatrix),freq=biMatrix)
wordcloud(words = biMatrix$word, freq = biMatrix$freq, min.freq = 1,
          max.words=150, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"),main="2-gram")

threeMatrix <- as.matrix(three)
threeMatrix <- sort(rowSums(threeMatrix),decreasing=TRUE)
threeMatrix <- data.frame(word = names(threeMatrix),freq=threeMatrix)
wordcloud(words = threeMatrix$word, freq = threeMatrix$freq, min.freq = 1,
          max.words=150, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"),main="3-gram")

The most frequent n-grams are also shown as bar plots, to get another view of the data. First, we sort the n-grams by frequency:

getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}

# Get frequencies of most common n-grams in data sample
uniFreq <- getFreq(uni)
biFreq <- getFreq(bi)
threeFreq <- getFreq(three)
makePlot <- function(data, label) {
  ggplot(data[1:30,], aes(reorder(word, -freq), freq)) +
         labs(x = label, y = "Frequency") +
         theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
         geom_bar(stat = "identity", fill = I("grey50"))
}

makePlot(uniFreq, "30 Most Common 1-grams")

makePlot(biFreq, "30 Most Common 2-grams")

makePlot(threeFreq, "30 Most Common 3-grams")

Next Step: Model design

As a next step, a prediction model will be constructed that is able to predict the most likely word to follow a given phrase.
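A minimal sketch of what such a model could look like, using only the uniFreq, biFreq and threeFreq tables built above, is shown below. The function name predictNextWord and the simple frequency-based backoff are illustrative choices, not the final model, which will require proper smoothing and handling of unseen n-grams.

# Illustrative only: predict the next word by looking up the last two words
# of the phrase in the trigram table, backing off to the bigram table and
# finally to the most frequent unigram when no match is found
predictNextWord <- function(phrase, uniFreq, biFreq, threeFreq) {
  tokens  <- unlist(strsplit(tolower(phrase), "\\s+"))
  lastTwo <- paste(tail(tokens, 2), collapse = " ")
  hit <- threeFreq[startsWith(as.character(threeFreq$word), paste0(lastTwo, " ")), ]
  if (nrow(hit) > 0) return(sub(".* ", "", as.character(hit$word[1])))
  lastOne <- tail(tokens, 1)
  hit <- biFreq[startsWith(as.character(biFreq$word), paste0(lastOne, " ")), ]
  if (nrow(hit) > 0) return(sub(".* ", "", as.character(hit$word[1])))
  as.character(uniFreq$word[1])
}

# Example call (output depends on the sampled data)
predictNextWord("new york", uniFreq, biFreq, threeFreq)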