Capstone Project EDA Milestone Report

Executive Summary

The main goal of this milestone report is to explore the text in the Twitter, blogs, and news corpora. The report walks through the steps of the exploratory analysis: reading the data, sampling it, pre-processing it, constructing word-frequency lookup tables, and finally visualizing the word frequencies.

Data Loading and Basic Summaries

Loading Libraries and Preparing the Cores

First, we load the required libraries:
#Load required packages
library(stringi)
library(NLP)
library(tm)
library(SnowballC)
library(RWeka)
library(ggplot2)
library(grid)
library(wordcloud)
library(RColorBrewer)
library(parallel)
library(xtable)

#Prepare a parallel cluster and load tm and RWeka on the worker processes
cl <- makeCluster(detectCores())
invisible(clusterEvalQ(cl, library(tm)))
invisible(clusterEvalQ(cl, library(RWeka)))
#Restrict tm_map to a single core; the RWeka n-gram tokenizers are unreliable under multicore tm_map
options(mc.cores=1)
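
The cluster created above is not explicitly shut down anywhere in this report; a minimal cleanup step (assuming cl is still the cluster handle) would be:
#Release the worker processes once they are no longer needed
stopCluster(cl)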

Reading the Files

We read the data in and generate a basic summary of the three text files.
setwd('C:\\Dropbox\\Projects\\Coursera\\capstone')

#READING FILES
twitter <- readLines("final\\en_US\\en_US.twitter.txt", encoding="UTF-8")
blogs <- readLines("final\\en_US\\en_US.blogs.txt", encoding="UTF-8")
news <- readLines("final\\en_US\\en_US.news.txt", encoding="UTF-8")

Basic Summary

          Lines      Words       Characters    Words per Line
twitter   2,360,148  30,373,543  162,096,031   12
blogs     899,288    37,334,131  206,824,505   41
news      77,259     2,643,969   15,639,408    34
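
A summary of this kind can be computed directly from the three character vectors with stringi. The sketch below is illustrative only (the summarizeCorpus helper is an assumption, not part of the original script), and its word counts may differ slightly from the table depending on the tokenization rules.
#Illustrative helper: line, word, and character counts for a vector of text lines
summarizeCorpus <- function(x) {
  words <- sum(stri_count_words(x))
  chars <- sum(stri_length(x))
  c(Lines = length(x),
    Words = words,
    Characters = chars,
    "Words per Line" = round(words / length(x)))
}

rbind(twitter = summarizeCorpus(twitter),
      blogs   = summarizeCorpus(blogs),
      news    = summarizeCorpus(news))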

Data Pre-Processing

Data Sampling

We use a binomial draw (rbinom) to decide which lines to keep, and write the sampled lines to a separate directory (the sampleFile function is listed in Appendix C).
setwd('C:\\Dropbox\\Projects\\Coursera\\capstone')
sampleDir <- file.path("final\\en_US\\en_US_Samples")
if (!file.exists(sampleDir)){
  rawDataDir   <- file.path("final\\en_US")
  rawDataFiles <- list.files(path=rawDataDir, pattern="*.txt", full.names=T, recursive=FALSE)
  rawDataFilenames  <- unlist(lapply(rawDataFiles, basename))
  
  sampleFiles <- file.path(sampleDir, paste0("sample_", rawDataFilenames))
  
  dir.create(sampleDir)
  #sampleFile() is defined in Appendix C
  sapply(1:3, function(x) sampleFile(rawDataFiles[x], sampleFiles[x]))
}

Building the Corpus Sample and Data Cleaning

Here, we construct the corpus from the samples and then perform the following data cleaning steps:

  1. Convert all characters to lower case
  2. Remove punctuation
  3. Remove Arabic numerals
  4. Strip extra white space
  5. Remove profanity, using the word list from http://www.cs.cmu.edu/~biglou/resources/bad-words.txt
  6. Stemming was considered but not applied in this exercise, so that all word forms are retained
setwd('C:\\Dropbox\\Projects\\Coursera\\capstone')
corpus <- Corpus(DirSource("final\\en_US\\en_US_Samples"), readerControl = list(language="en_US", encoding ="UTF-8"))
corpus <- tm_map(corpus, content_transformer(tolower)) 
corpus <- tm_map(corpus, removePunctuation) 
corpus <- tm_map(corpus, removeNumbers) 
corpus <- tm_map(corpus, stripWhitespace)

badWords <- scan("bad-words.txt", "")
corpus <- tm_map(corpus, removeWords, badWords)

Tokenizing the Corpus and Constructing the N-Grams

We construct a Term Document Matrix for each n-gram tokenizer, considering 1-gram, 2-gram, 3-gram, and 4-gram tokenizers. We then remove sparse terms from each matrix (terms whose sparsity exceeds 0.1, i.e. terms missing from more than 10% of the sample documents). After that, we sum the frequencies across documents to produce a term-frequency list for each n-gram. Finally, data frames of the 40 most frequent terms per n-gram are built for plotting.

#The n-gram tokenizers (defined in Appendix C) are passed to the term-document matrix constructor

TdmUni <- TermDocumentMatrix(corpus, control = list(tokenize = UnigramTokenizer))
TdmBi  <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))
TdmTri <- TermDocumentMatrix(corpus, control = list(tokenize = TrigramTokenizer))
TdmQuad <- TermDocumentMatrix(corpus, control = list(tokenize = QuadgramTokenizer))

TdmUni <- removeSparseTerms(TdmUni, 0.1)
TdmBi <-  removeSparseTerms(TdmBi, 0.1)
TdmTri <- removeSparseTerms(TdmTri, 0.1)
TdmQuad <- removeSparseTerms(TdmQuad, 0.1)

# Term frequency
freq.uni <- rowSums(as.matrix(TdmUni))
freq.bi  <- rowSums(as.matrix(TdmBi))
freq.tri <- rowSums(as.matrix(TdmTri))
freq.quad <- rowSums(as.matrix(TdmQuad))

#Sort terms by decreasing frequency
freq.uni <- sort(freq.uni, decreasing = TRUE)
freq.bi  <- sort(freq.bi, decreasing = TRUE)
freq.tri <- sort(freq.tri, decreasing = TRUE)
freq.quad <- sort(freq.quad, decreasing = TRUE)


# Create the top 40 data frames from the matrices
df.freq.uni <- data.frame("Term"=names(head(freq.uni,40)), "Frequency"=head(freq.uni,40))
df.freq.bi  <- data.frame("Term"=names(head(freq.bi,40)), "Frequency"=head(freq.bi,40))
df.freq.tri <- data.frame("Term"=names(head(freq.tri,40)), "Frequency"=head(freq.tri,40))
df.freq.quad <- data.frame("Term"=names(head(freq.quad,40)), "Frequency"=head(freq.quad,40))

# Reorder levels for better plotting
df.freq.uni$Term1 <- reorder(df.freq.uni$Term, df.freq.uni$Frequency)
df.freq.bi$Term1  <- reorder(df.freq.bi$Term, df.freq.bi$Frequency)
df.freq.tri$Term1 <- reorder(df.freq.tri$Term, df.freq.tri$Frequency)
df.freq.quad$Term1 <- reorder(df.freq.quad$Term, df.freq.quad$Frequency)

Exploratory Data Analysis Plots

We can visualize the frequencies of the top terms using bar plots and word clouds.

Bar Plots for the N-Gram Tokens

#Bar Plots
p1 <-
ggplot(df.freq.uni, aes(x = Term1, y = Frequency)) +
  geom_bar(stat = "identity", color="gray55", fill="orange") +
  geom_text(data=df.freq.uni,aes(x=Term1,y=-2000,label=Frequency),vjust=0, size=3) +
  xlab("Terms") + ylab("Count") + ggtitle("Top 40 UniGram Tokenized Word Frequency") +
  theme(plot.title = element_text(lineheight=.8, face="bold")) +
  coord_flip()

p2 <-
ggplot(df.freq.bi, aes(x = Term1, y = Frequency)) +
  geom_bar(stat = "identity", color="gray55", fill="steelblue2") +
  geom_text(data=df.freq.bi,aes(x=Term1,y=-250,label=Frequency),vjust=0, size=3) +
  xlab("Terms") + ylab("Count") + ggtitle("Top 40 BiGram Tokenized Word Frequency") +
  theme(plot.title = element_text(lineheight=.8, face="bold")) +
  coord_flip()

p3 <-
ggplot(df.freq.tri, aes(x = Term1, y = Frequency)) +
  geom_bar(stat = "identity", color="gray55", fill="greenyellow") +
  geom_text(data=df.freq.tri,aes(x=Term1,y=-25,label=Frequency),vjust=0, size=3) +  
  xlab("Terms") + ylab("Count") + ggtitle("Top 40 TriGram Tokenized Word Frequency") +
  theme(plot.title = element_text(lineheight=.8, face="bold")) +
  coord_flip()

p4 <-
ggplot(df.freq.quad, aes(x = Term1, y = Frequency)) +
  geom_bar(stat = "identity", color="gray55", fill="brown1") +
  geom_text(data=df.freq.quad,aes(x=Term1,y=-3,label=Frequency),vjust=0, size=3) +  
  xlab("Terms") + ylab("Count") + ggtitle("Top 40 QuadGram Tokenized Word Frequency") +
  theme(plot.title = element_text(lineheight=.8, face="bold")) +
  coord_flip()

#multiplot() is defined in Appendix C
multiplot(p1, p2, p3, p4, cols=1)

Word Clouds for the N-Gram Tokens

#Word Clouds
par(mfrow=c(1,4))
wordcloud(words = df.freq.uni$Term1,
          freq = df.freq.uni$Frequency,
          random.order=FALSE,
          rot.per=0.35,
          use.r.layout=FALSE,
          colors=brewer.pal(8, "Dark2"))

text(x=0.5, y=1.1, "UniGram Word Cloud")

wordcloud(words = df.freq.bi$Term1,
          freq = df.freq.bi$Frequency,
          random.order=FALSE,
          rot.per=0.35,
          use.r.layout=FALSE,
          colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=1.1, "BiGram Word Cloud")

wordcloud(words = df.freq.tri$Term1,
          freq = df.freq.tri$Frequency,
          random.order=FALSE,
          rot.per=0.35,
          use.r.layout=FALSE,
          colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=1.1, "TriGram Word Cloud")

wordcloud(words = df.freq.quad$Term1,
          freq = df.freq.quad$Frequency,
          random.order=FALSE,
          rot.per=0.35,
          use.r.layout=FALSE,
          colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=1.1, "QuadGram Word Cloud")
Word clouds for the different n-grams, from left to right: 1-gram, 2-gram, 3-gram, and 4-gram.

Conclusions and Future Work

As detailed in this report, the following steps were performed in our exploratory data analysis of the corpora:

  1. Read the corpora
  2. Sample the data
  3. Build the corpus from the samples
  4. Clean the data and remove profanity words
  5. Build the N-Gram tokens
  6. Construct the word-frequency data frames
  7. Plot the word-frequency bar plots and word clouds

In the next couple of weeks, we will build an algorithm to predict the next word based on the n-gram models. The algorithm involves building a lookup table for each n-gram and using it to predict the next word in a sentence. It will search the 4-gram model first, then the 3-gram, and then the 2-gram model; if there is no match, it falls back to a best guess. The algorithm assumes that the corpora cover the whole English language (a very optimistic assumption), and it has to be fast at searching and predicting. A Shiny app will be built to demonstrate the algorithm by predicting the next word after an input phrase.
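
As an illustration of the intended backoff search, a minimal sketch is shown below. The lookup tables are assumed to be named character vectors mapping an (n-1)-word prefix to its most frequent continuation; the function and object names (predictNextWord, lookup.quad, lookup.tri, lookup.bi) are placeholders, not the final implementation.
#Illustrative backoff sketch: try the 4-gram table, then 3-gram, then 2-gram, then a default guess
predictNextWord <- function(phrase, lookup.quad, lookup.tri, lookup.bi, default = "the") {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)

  if (n >= 3) {                                  #last three words -> 4-gram table
    key <- paste(tail(words, 3), collapse = " ")
    if (!is.na(lookup.quad[key])) return(unname(lookup.quad[key]))
  }
  if (n >= 2) {                                  #last two words -> 3-gram table
    key <- paste(tail(words, 2), collapse = " ")
    if (!is.na(lookup.tri[key])) return(unname(lookup.tri[key]))
  }
  if (n >= 1) {                                  #last word -> 2-gram table
    key <- tail(words, 1)
    if (!is.na(lookup.bi[key])) return(unname(lookup.bi[key]))
  }

  default                                        #no match in any table: fall back to a best guess
}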

Appendix A: Document and Session Information

This document was generated using rmarkdown, knitr, and knitrBootstrap.

sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] parallel  grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
## [1] xtable_1.7-4       wordcloud_2.5      RColorBrewer_1.0-5
## [4] ggplot2_1.0.0      RWeka_0.4-23       SnowballC_0.5.1   
## [7] tm_0.6             NLP_0.1-5          stringi_0.2-5     
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.2-4     digest_0.6.4         evaluate_0.5.5      
##  [4] formatR_1.0          gtable_0.1.2         htmltools_0.2.6     
##  [7] knitr_1.7.12         knitrBootstrap_1.0.0 labeling_0.3        
## [10] markdown_0.7.4       MASS_7.3-33          mime_0.1.2          
## [13] munsell_0.4.2        plyr_1.8.1           proto_0.3-10        
## [16] Rcpp_0.11.3          reshape2_1.4         rJava_0.9-6         
## [19] rmarkdown_0.3.12     RWekajars_3.7.11-1   scales_0.2.4        
## [22] slam_0.1-32          stringr_0.6.2        tools_3.1.1         
## [25] yaml_2.1.13

Appendix B: Useful Resources

  1. Introduction to basic Text Mining in R
  2. Text Mining the Complete Works of William Shakespeare
  3. Text Mining
  4. Data Mining with R Text Mining
  5. RDataMining-slides-text-mining

Appendix C: Functions

sampleFile <- function(fileIn, fileOut){
  set.seed(54321)
  
  conIn  <- file(fileIn, "r", blocking=FALSE)
  conOut <- file(fileOut, "w")
  
  #Read one line at a time: always write the current line, then with probability 0.5
  #skip the next 99 lines, so on average roughly 2% of the input lines are kept
  while(length(line <- readLines(conIn, n=1)) > 0){
    sample.prob <- rbinom(1, 1, 0.5)
    writeLines(line, conOut)
      
    if(sample.prob==1){line <- readLines(conIn, n=99)}
  }  
  
  close(conIn)
  close(conOut)  
}

# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  require(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BigramTokenizer  <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))