Capstone Project EDA Milestone Report

Executive Summary

The main goal of this milestone report is to explore the text in the Twitter, blogs, and news corpora. The report walks through the steps of the exploratory analysis: reading the data, sampling it, pre-processing it, constructing word-frequency lookup tables, and finally visualizing the word frequencies.

Data Loading and Basic Summaries

Loading Libraries and Preparing the Cores

First, we load the required libraries:
#Load required packages
library(stringi)
library(NLP)
library(tm)
library(SnowballC)
library(RWeka)
library(ggplot2)
library(grid)
library(wordcloud)
library(RColorBrewer)
library(parallel)
library(xtable)

#Prepare a parallel cluster and load tm and RWeka on the worker processes
cl <- makeCluster(detectCores())
invisible(clusterEvalQ(cl, library(tm)))
invisible(clusterEvalQ(cl, library(RWeka)))
#Restrict tm_map to a single core; the RWeka n-gram tokenizers are unreliable under multicore tm_map
options(mc.cores=1)
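
The cluster created above is not explicitly shut down anywhere in this report; a minimal cleanup step (assuming cl is still the cluster handle) would be:
#Release the worker processes once they are no longer needed
stopCluster(cl)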

Reading the Files

We read the data in and generate a basic summary of the three text files.
setwd('C:\\Dropbox\\Projects\\Coursera\\capstone')

#READING FILES
twitter <- readLines("final\\en_US\\en_US.twitter.txt", encoding="UTF-8")
blogs <- readLines("final\\en_US\\en_US.blogs.txt", encoding="UTF-8")
news <- readLines("final\\en_US\\en_US.news.txt", encoding="UTF-8")

Basic Summary

          Lines      Words       Characters    Words per Line
twitter   2,360,148  30,373,543  162,096,031   12
blogs     899,288    37,334,131  206,824,505   41
news      77,259     2,643,969   15,639,408    34
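
A summary of this kind can be computed directly from the three character vectors with stringi. The sketch below is illustrative only (the summarizeCorpus helper is an assumption, not part of the original script), and its word counts may differ slightly from the table depending on the tokenization rules.
#Illustrative helper: line, word, and character counts for a vector of text lines
summarizeCorpus <- function(x) {
  words <- sum(stri_count_words(x))
  chars <- sum(stri_length(x))
  c(Lines = length(x),
    Words = words,
    Characters = chars,
    "Words per Line" = round(words / length(x)))
}

rbind(twitter = summarizeCorpus(twitter),
      blogs   = summarizeCorpus(blogs),
      news    = summarizeCorpus(news))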

Data Pre-Processing

Data Sampling

We use a binomial draw (rbinom) to decide which lines to keep, and write the sampled lines to a separate directory (the sampleFile function is listed in Appendix C).
setwd('C:\\Dropbox\\Projects\\Coursera\\capstone')
sampleDir <- file.path("final\\en_US\\en_US_Samples")
if (!file.exists(sampleDir)){
  rawDataDir   <- file.path("final\\en_US")
  rawDataFiles <- list.files(path=rawDataDir, pattern="*.txt", full.names=T, recursive=FALSE)
  rawDataFilenames  <- unlist(lapply(rawDataFiles, basename))
  
  sampleFiles <- file.path(sampleDir, paste0("sample_", rawDataFilenames))
  
  dir.create(sampleDir)
  #sampleFile() is defined in Appendix C
  sapply(1:3, function(x) sampleFile(rawDataFiles[x], sampleFiles[x]))
}

Building the Corpus Sample and Data Cleaning

Here, we construct the corpus from the samples and then perform the following data cleaning steps:

  1. Convert all characters to lower case
  2. Remove punctuation
  3. Remove Arabic numerals
  4. Strip extra white space
  5. Remove profanity, using the word list from http://www.cs.cmu.edu/~biglou/resources/bad-words.txt
  6. Stemming was considered but not applied in this exercise, so that all word forms are retained
setwd('C:\\Dropbox\\Projects\\Coursera\\capstone')
corpus <- Corpus(DirSource("final\\en_US\\en_US_Samples"), readerControl = list(language="en_US", encoding ="UTF-8"))
corpus <- tm_map(corpus, content_transformer(tolower)) 
corpus <- tm_map(corpus, removePunctuation) 
corpus <- tm_map(corpus, removeNumbers) 
corpus <- tm_map(corpus, stripWhitespace)

badWords <- scan("bad-words.txt", "")
corpus <- tm_map(corpus, removeWords, badWords)

Tokenizing the Corpus and Constructing the N-Grams

We construct a Term Document Matrix for each n-gram tokenizer, considering 1-gram, 2-gram, 3-gram, and 4-gram tokenizers. We then remove sparse terms from each matrix (terms whose sparsity exceeds 0.1, i.e. terms missing from more than 10% of the sample documents). After that, we sum the frequencies across documents to produce a term-frequency list for each n-gram. Finally, data frames of the 40 most frequent terms per n-gram are built for plotting.

#The n-gram tokenizers (defined in Appendix C) are passed to the term-document matrix constructor

TdmUni <- TermDocumentMatrix(corpus, control = list(tokenize = UnigramTokenizer))
TdmBi  <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))
TdmTri <- TermDocumentMatrix(corpus, control = list(tokenize = TrigramTokenizer))
TdmQuad <- TermDocumentMatrix(corpus, control = list(tokenize = QuadgramTokenizer))

TdmUni <- removeSparseTerms(TdmUni, 0.1)
TdmBi <-  removeSparseTerms(TdmBi, 0.1)
TdmTri <- removeSparseTerms(TdmTri, 0.1)
TdmQuad <- removeSparseTerms(TdmQuad, 0.1)

# Term frequency
freq.uni <- rowSums(as.matrix(TdmUni))
freq.bi  <- rowSums(as.matrix(TdmBi))
freq.tri <- rowSums(as.matrix(TdmTri))
freq.quad <- rowSums(as.matrix(TdmQuad))

#Sort terms by decreasing frequency
freq.uni <- sort(freq.uni, decreasing = TRUE)
freq.bi  <- sort(freq.bi, decreasing = TRUE)
freq.tri <- sort(freq.tri, decreasing = TRUE)
freq.quad <- sort(freq.quad, decreasing = TRUE)


# Create the top 40 data frames from the matrices
df.freq.uni <- data.frame("Term"=names(head(freq.uni,40)), "Frequency"=head(freq.uni,40))
df.freq.bi  <- data.frame("Term"=names(head(freq.bi,40)), "Frequency"=head(freq.bi,40))
df.freq.tri <- data.frame("Term"=names(head(freq.tri,40)), "Frequency"=head(freq.tri,40))
df.freq.quad <- data.frame("Term"=names(head(freq.quad,40)), "Frequency"=head(freq.quad,40))

# Reorder levels for better plotting
df.freq.uni$Term1 <- reorder(df.freq.uni$Term, df.freq.uni$Frequency)
df.freq.bi$Term1  <- reorder(df.freq.bi$Term, df.freq.bi$Frequency)
df.freq.tri$Term1 <- reorder(df.freq.tri$Term, df.freq.tri$Frequency)
df.freq.quad$Term1 <- reorder(df.freq.quad$Term, df.freq.quad$Frequency)

Exploratory Data Analysis Plots

We can visualize the frequencies of the top terms using bar plots and word clouds.

Bar Plots for the N-Gram Tokens

#Bar Plots
p1 <-
ggplot(df.freq.uni, aes(x = Term1, y = Frequency)) +
  geom_bar(stat = "identity", color="gray55", fill="orange") +
  geom_text(data=df.freq.uni,aes(x=Term1,y=-2000,label=Frequency),vjust=0, size=3) +
  xlab("Terms") + ylab("Count") + ggtitle("Top 40 UniGram Tokenized Word Frequency") +
  theme(plot.title = element_text(lineheight=.8, face="bold")) +
  coord_flip()

p2 <-
ggplot(df.freq.bi, aes(x = Term1, y = Frequency)) +
  geom_bar(stat = "identity", color="gray55", fill="steelblue2") +
  geom_text(data=df.freq.bi,aes(x=Term1,y=-250,label=Frequency),vjust=0, size=3) +
  xlab("Terms") + ylab("Count") + ggtitle("Top 40 BiGram Tokenized Word Frequency") +
  theme(plot.title = element_text(lineheight=.8, face="bold")) +
  coord_flip()

p3 <-
ggplot(df.freq.tri, aes(x = Term1, y = Frequency)) +
  geom_bar(stat = "identity", color="gray55", fill="greenyellow") +
  geom_text(data=df.freq.tri,aes(x=Term1,y=-25,label=Frequency),vjust=0, size=3) +  
  xlab("Terms") + ylab("Count") + ggtitle("Top 40 TriGram Tokenized Word Frequency") +
  theme(plot.title = element_text(lineheight=.8, face="bold")) +
  coord_flip()

p4 <-
ggplot(df.freq.quad, aes(x = Term1, y = Frequency)) +
  geom_bar(stat = "identity", color="gray55", fill="brown1") +
  geom_text(data=df.freq.quad,aes(x=Term1,y=-3,label=Frequency),vjust=0, size=3) +  
  xlab("Terms") + ylab("Count") + ggtitle("Top 40 QuadGram Tokenized Word Frequency") +
  theme(plot.title = element_text(lineheight=.8, face="bold")) +
  coord_flip()

#multiplot() is defined in Appendix C
multiplot(p1, p2, p3, p4, cols=1)

Word Clouds for the N-Gram Tokens

#Word Clouds
par(mfrow=c(1,4))
wordcloud(words = df.freq.uni$Term1,
          freq = df.freq.uni$Frequency,
          random.order=FALSE,
          rot.per=0.35,
          use.r.layout=FALSE,
          colors=brewer.pal(8, "Dark2"))

text(x=0.5, y=1.1, "UniGram Word Cloud")

wordcloud(words = df.freq.bi$Term1,
          freq = df.freq.bi$Frequency,
          random.order=FALSE,
          rot.per=0.35,
          use.r.layout=FALSE,
          colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=1.1, "BiGram Word Cloud")

wordcloud(words = df.freq.tri$Term1,
          freq = df.freq.tri$Frequency,
          random.order=FALSE,
          rot.per=0.35,
          use.r.layout=FALSE,
          colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=1.1, "TriGram Word Cloud")

wordcloud(words = df.freq.quad$Term1,
          freq = df.freq.quad$Frequency,
          random.order=FALSE,
          rot.per=0.35,
          use.r.layout=FALSE,
          colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=1.1, "QuadGram Word Cloud")
Word clouds for the different n-grams, from left to right: 1-gram, 2-gram, 3-gram, and 4-gram.

Conclusions and Future Work

As detailed in this report, the following steps were performed in our exploratory data analysis of the corpora:

  1. Read the corpora
  2. Sample the data
  3. Build the corpus from the samples
  4. Clean the data and remove profanity words
  5. Build the N-Gram tokens
  6. Construct the word-frequency data frames
  7. Plot the word-frequency bar plots and word clouds

In the next couple of weeks, we will build an algorithm to predict the next word based on the n-gram models. The algorithm involves building a lookup table for each n-gram and using it to predict the next word in a sentence. It will search the 4-gram model first, then the 3-gram, and then the 2-gram model; if there is no match, it falls back to a best guess. The algorithm assumes that the corpora cover the whole English language (a very optimistic assumption), and it has to be fast at searching and predicting. A Shiny app will be built to demonstrate the algorithm by predicting the next word after an input phrase.
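
As an illustration of the intended backoff search, a minimal sketch is shown below. The lookup tables are assumed to be named character vectors mapping an (n-1)-word prefix to its most frequent continuation; the function and object names (predictNextWord, lookup.quad, lookup.tri, lookup.bi) are placeholders, not the final implementation.
#Illustrative backoff sketch: try the 4-gram table, then 3-gram, then 2-gram, then a default guess
predictNextWord <- function(phrase, lookup.quad, lookup.tri, lookup.bi, default = "the") {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)

  if (n >= 3) {                                  #last three words -> 4-gram table
    key <- paste(tail(words, 3), collapse = " ")
    if (!is.na(lookup.quad[key])) return(unname(lookup.quad[key]))
  }
  if (n >= 2) {                                  #last two words -> 3-gram table
    key <- paste(tail(words, 2), collapse = " ")
    if (!is.na(lookup.tri[key])) return(unname(lookup.tri[key]))
  }
  if (n >= 1) {                                  #last word -> 2-gram table
    key <- tail(words, 1)
    if (!is.na(lookup.bi[key])) return(unname(lookup.bi[key]))
  }

  default                                        #no match in any table: fall back to a best guess
}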

Appendix A: Document and Session Information

This document was generated using rmarkdown, knitr, and knitrBootstrap.

sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] parallel  grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
## [1] xtable_1.7-4       wordcloud_2.5      RColorBrewer_1.0-5
## [4] ggplot2_1.0.0      RWeka_0.4-23       SnowballC_0.5.1   
## [7] tm_0.6             NLP_0.1-5          stringi_0.2-5     
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.2-4     digest_0.6.4         evaluate_0.5.5      
##  [4] formatR_1.0          gtable_0.1.2         htmltools_0.2.6     
##  [7] knitr_1.7.12         knitrBootstrap_1.0.0 labeling_0.3        
## [10] markdown_0.7.4       MASS_7.3-33          mime_0.1.2          
## [13] munsell_0.4.2        plyr_1.8.1           proto_0.3-10        
## [16] Rcpp_0.11.3          reshape2_1.4         rJava_0.9-6         
## [19] rmarkdown_0.3.12     RWekajars_3.7.11-1   scales_0.2.4        
## [22] slam_0.1-32          stringr_0.6.2        tools_3.1.1         
## [25] yaml_2.1.13

Appendix B: Useful Resources

  1. Introduction to basic Text Mining in R
  2. Text Mining the Complete Works of William Shakespeare
  3. Text Mining
  4. Data Mining with R Text Mining
  5. RDataMining-slides-text-mining

Appendix C: Functions

sampleFile <- function(fileIn, fileOut){
  set.seed(54321)
  
  conIn  <- file(fileIn, "r", blocking=FALSE)
  conOut <- file(fileOut, "w")
  
  #Read one line at a time: always write the current line, then with probability 0.5
  #skip the next 99 lines, so on average roughly 2% of the input lines are kept
  while(length(line <- readLines(conIn, n=1)) > 0){
    sample.prob <- rbinom(1, 1, 0.5)
    writeLines(line, conOut)
      
    if(sample.prob==1){line <- readLines(conIn, n=99)}
  }  
  
  close(conIn)
  close(conOut)  
}

# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  require(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BigramTokenizer  <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))