Introduction

This is my milestone report for Week 2 of the Data Science Capstone project. The objective is to explain the exploratory data analysis that will lead to the eventual prediction app and algorithm. The user will provide a word or a phrase, and the application will try to predict the next word. The model will be trained using a corpus (a collection of English text) compiled from 3 sources: news, blogs, and tweets. In the following report, I load and clean the data and use NLP (Natural Language Processing) packages in R (tm and RWeka) to tokenize n-grams as a first step toward building a predictive model.

The raw corpus data is downloaded and stored locally at ./data/en_US.blogs.txt, ./data/en_US.news.txt, and ./data/en_US.twitter.txt.

Basic Information about Corpus Dataset

To get a sense of what the data looks like, I determined the number of lines, characters, and words for each of the 3 datasets (Blogs, News and Twitter). I also calculated some basic statistics on the number of words per line (WPL): minimum, mean, and maximum. The code is attached as Appendix A.

FileName Lines Chars Words WPL Min WPL Mean WPL Max
en_US.blogs 899288 206824382 37570839 0 41.75 6726
en_US.news 1010242 203223154 34494539 1 34.41 1796
en_US.twitter 2360148 162096031 30451128 1 12.75 47

Sample the Dataset

I first drew a random sample of 1% of the lines from each of the original datasets. Then I removed all non-ASCII characters from the samples. Finally, I combined the sampled blog, news and Twitter datasets into sampleData. The code is attached as Appendix B.

Build Corpus and Clean the Data

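Next, the sampled data is used to create a corpus, and the following clean-up steps are performed: converting the text to lowercase, removing punctuation, removing numbers, stripping extra whitespace, removing banned words, removing English stop words, stemming, and converting the documents to plain text format.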

The steps above are performed by the build_corpus() function. The code for this section is attached as Appendix C.

Tokenize and Calculate Frequencies of N-Grams

The RWeka package is used to build tokenizer functions that create unigrams, bigrams and trigrams. A Term-Document Matrix (TDM) is then created for the corpus, and the most frequent terms are extracted from it. This is done by the getTermTable() function. The detailed code for this section is attached as Appendix D.

Plot Sampled Corpus Data with Word Cloud

The wordcloud package is used to show what the corpus looks like in terms of word frequency. The word clouds for unigrams, bigrams and trigrams are shown from left to right, respectively.

Make Plots

Lastly, I wrote a function to plot n-gram frequencies and used it to plot the 20 most frequent unigrams, bigrams, and trigrams.

Further Development Plan

After the exploratory analysis, I am ready to start building the predictive model(s) and, eventually, the data product. Here are my next steps:

Appendix A - Basic Information of the Data

# Preload necessary R libraries
library(knitr); library(dplyr); library(doParallel); library(tm); library(SnowballC)
library(stringi); library(tm); library(ggplot2); library(wordcloud)

path1 <- "./data/en_US.blogs.txt"
path2 <- "./data/en_US.news.txt"
path3 <- "./data/en_US.twitter.txt"

# Read every line of a text file through a binary-mode ("rb") connection.
#
# Args:
#   path: path of the file to read.
# Returns:
#   Character vector with one element per line, marked as UTF-8.
#
# The connection is closed through on.exit(), so it is released even when
# readLines() stops with an error.
read_lines_binary <- function(path) {
  conn <- file(path, open = "rb")
  on.exit(close(conn), add = TRUE)
  readLines(conn, encoding = "UTF-8")
}

# Load the three raw corpora
blogs <- read_lines_binary(path1)
news <- read_lines_binary(path2)
twitter <- read_lines_binary(path3)

# Remove the temporary helper so only the paths and the data remain
rm(read_lines_binary)

# Build a per-corpus summary table: line, character and word counts plus the
# minimum, mean and maximum number of words per line (WPL)
corpora <- list(blogs, news, twitter)

# One column per corpus; rows hold the min / mean / max words per line
WPL <- sapply(corpora, function(txt) {
  wpl_summary <- summary(stri_count_words(txt))
  wpl_summary[c("Min.", "Mean", "Max.")]
})
rownames(WPL) <- c("WPL_Min", "WPL_Mean", "WPL_Max")

# Line/character counts and word counts, again one column per corpus
general_stats <- sapply(corpora, stri_stats_general)
latex_stats <- sapply(corpora, stri_stats_latex)
count_rows <- rbind(
  general_stats[c("Lines", "Chars"), ],
  Words = latex_stats["Words", ],
  WPL
)

# Transpose so that each corpus becomes one row of the table
stats <- data.frame(
  FileName = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  t(count_rows)
)

# Drop the intermediates so no extra reference to the raw text is kept
rm(corpora, general_stats, latex_stats, count_rows)
head(stats)

Appendix B - Sample and Clean the Data

# Fix the RNG seed so the samples are reproducible
set.seed(1001)

# Draw 1% of the lines of each source without replacement.
# The draws stay in the order blogs, news, twitter so the random stream is
# consumed in the same sequence.
sampleBlogs <- blogs[sample.int(length(blogs), size = 0.01 * length(blogs))]
sampleNews <- news[sample.int(length(news), size = 0.01 * length(news))]
sampleTwitter <- twitter[
  sample.int(length(twitter), size = 0.01 * length(twitter))
]

# Drop every character that cannot be converted from UTF-8 to ASCII
sampleBlogs <- iconv(sampleBlogs, from = "UTF-8", to = "ASCII", sub = "")
sampleNews <- iconv(sampleNews, from = "UTF-8", to = "ASCII", sub = "")
sampleTwitter <- iconv(sampleTwitter, from = "UTF-8", to = "ASCII", sub = "")

# Stack the three cleaned samples into a single character vector
sampleData <- c(sampleBlogs, sampleNews, sampleTwitter)

# Free the full-size inputs and the path variables
rm(list = c("blogs", "news", "twitter", "path1", "path2", "path3"))

Appendix C - Build Corpus

# Build a tm corpus from a character vector and clean it.
#
# Args:
#   x: character vector with one document per element (defaults to the
#      global sampleData).
#   banned_words_file: path of the CSV file listing banned words. The first
#      3 lines are skipped and the words are taken from the second column.
# Returns:
#   The cleaned VCorpus (visibly, so calling the function at the console
#   prints the corpus summary).
#
# Cleaning steps, in order: lowercase, remove punctuation, remove numbers,
# collapse whitespace, remove banned words, remove English stop words, stem,
# convert to plain text documents.
build_corpus <- function(x = sampleData,
                         banned_words_file = "Terms-to-Block.csv") {
  sample_c <- VCorpus(VectorSource(x)) # Create corpus dataset
  # tolower() is a plain character function, not a tm transformation;
  # content_transformer() makes tm_map() apply it to the text of each
  # document and keep every element a text document.
  sample_c <- tm_map(sample_c, content_transformer(tolower)) # All lowercase
  sample_c <- tm_map(sample_c, removePunctuation) # Eliminate punctuation
  sample_c <- tm_map(sample_c, removeNumbers) # Eliminate numbers
  sample_c <- tm_map(sample_c, stripWhitespace) # Strip whitespace

  # Read and process the file of banned words (commas are stripped from the
  # entries before they are used as removal patterns)
  bw <- read.csv(file = banned_words_file, stringsAsFactors = FALSE, skip = 3)
  bannedWords <- gsub(",", "", tolower(bw[, 2]))
  sample_c <- tm_map(sample_c, removeWords, bannedWords) # Eliminate banned words
  # NOTE(review): stop words are removed after punctuation has been stripped,
  # so stop-word entries that contain an apostrophe presumably can no longer
  # match. The order is kept as-is to avoid changing the n-gram results;
  # confirm whether this is intended.
  sample_c <- tm_map(sample_c, removeWords, stopwords("english")) # Eliminate English stop words
  sample_c <- tm_map(sample_c, stemDocument) # Stem the document
  sample_c <- tm_map(sample_c, PlainTextDocument) # Create plain text format
  sample_c
}
corpusData <- build_corpus(sampleData)

Appendix D - Tokenize and Calculate Frequencies of N-Grams

library(RWeka)

# Build a frequency table of the n-grams that occur in a corpus.
#
# Args:
#   corpusData: tm corpus to tokenize.
#   ngrams: n-gram size; used as both the minimum and the maximum length
#           handed to the Weka tokenizer.
#   lowfreq: lower frequency bound passed to findFreqTerms().
# Returns:
#   A data frame with columns `word` (character) and `frequency`, sorted by
#   descending frequency. It is returned visibly.
getTermTable <- function(corpusData, ngrams = 1, lowfreq = 50) {
  # Create a term-document matrix tokenized on n-grams of the requested size
  tokenizer <- function(x) {
    NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams))
  }
  tdm <- TermDocumentMatrix(corpusData, control = list(tokenize = tokenizer))

  # Keep the terms that findFreqTerms() selects for the lowfreq bound and
  # total their counts across all documents
  top_terms <- findFreqTerms(tdm, lowfreq)
  top_terms_freq <- rowSums(as.matrix(tdm[top_terms, ]))

  # as.character() keeps the `word` column present even when names() is NULL
  # (no term selected); stringsAsFactors = FALSE makes the column type the
  # same on every R version.
  top_terms_freq <- data.frame(
    word = as.character(names(top_terms_freq)),
    frequency = top_terms_freq,
    stringsAsFactors = FALSE
  )
  arrange(top_terms_freq, desc(frequency))
}
    
# Term-frequency tables for unigrams, bigrams and trigrams, in that order.
# vector("list", 3L) preallocates three empty slots; list(3) would instead
# create a one-element list holding the number 3.
tt.Data <- vector("list", 3L)
for (i in seq_along(tt.Data)) {
  tt.Data[[i]] <- getTermTable(corpusData, ngrams = i, lowfreq = 10)
}

Appendix E - Code for Plot of Sampled Corpus with Word Cloud

library(wordcloud)
library(RColorBrewer)

# Set random seed for reproducibility
set.seed(1001)
# Set plotting to 1 row x 3 columns, remembering the previous settings so
# they can be restored once the clouds are drawn
old_par <- par(mfrow = c(1, 3))
# The palette is the same for every cloud, so build it once
cloud_colors <- brewer.pal(8, "Dark2")
# One cloud per n-gram table: tt.Data[[1]], tt.Data[[2]], tt.Data[[3]]
for (i in 1:3) {
  wordcloud(tt.Data[[i]]$word, tt.Data[[i]]$frequency, scale = c(3, 1),
            max.words = 100, random.order = FALSE, rot.per = 0,
            fixed.asp = TRUE, use.r.layout = FALSE, colors = cloud_colors)
}
# Restore the previous layout so later base-graphics plots are not forced
# into the 1 x 3 grid
par(old_par)
rm(old_par, cloud_colors)

Appendix F - Make Plots

library(ggplot2); library(gridExtra)

# Bar chart of the first N rows of one n-gram frequency table.
#
# Args:
#   term_table: data frame with columns `word` and `frequency`.
#   N: number of leading rows to plot.
#   label: text used for both the plot title and the x-axis label.
#   fill: fill colour of the bars.
# Returns:
#   A ggplot object with bars ordered by decreasing frequency.
plot_ngram_bars <- function(term_table, N, label, fill) {
  ggplot(data = head(term_table, N),
         aes(x = reorder(word, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = fill) +
    ggtitle(label) +
    xlab(label) + ylab("Frequency") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

# Plot unigram, bigram and trigram frequencies side by side.
#
# Args:
#   x: list whose first three elements are the unigram, bigram and trigram
#      frequency tables (defaults to the global tt.Data).
#   N: number of leading rows of each table to plot. head() takes the rows
#      in table order, so the tables are presumably expected to be sorted by
#      descending frequency already.
# Returns:
#   The value of gridExtra::grid.arrange(), which lays the three charts out
#   in 1 row x 3 columns.
plot.Grams <- function(x = tt.Data, N = 10) {
  g1 <- plot_ngram_bars(x[[1]], N, label = "Unigrams", fill = "green")
  g2 <- plot_ngram_bars(x[[2]], N, label = "Bigrams", fill = "blue")
  g3 <- plot_ngram_bars(x[[3]], N, label = "Trigrams", fill = "darkgreen")
  # Put three plots into 1 row 3 columns
  gridExtra::grid.arrange(g1, g2, g3, ncol = 3)
}
plot.Grams(x = tt.Data, N = 20)