This is my milestone report for Week 2 of the Data Science Capstone project. The objective is to explain the exploratory data analysis that will lead to the eventual prediction app and algorithm. The user will provide a word or a phrase, and the application will try to predict the next word. The model will be trained using a corpus (a collection of English text) compiled from 3 sources: news, blogs, and tweets. In the following report, I load and clean the data and use NLP (Natural Language Processing) packages in R (tm and RWeka) to tokenize n-grams as a first step toward building a predictive model.
The raw corpus data is downloaded and stored locally at:
Blog: ./data/en_US.blogs.txt
News: ./data/en_US.news.txt
Twitter: ./data/en_US.twitter.txt
To get a sense of what the data looks like, I determined the number of lines, characters, and words for each of the 3 datasets (Blog, News and Twitter). I also calculated some basic statistics on the number of words per line (WPL): min, mean, and max. The code is attached in the Appendix.
| FileName | Lines | Chars | Words | WPL Min | WPL Mean | WPL Max |
|---|---|---|---|---|---|---|
| en_US.blogs | 899288 | 206824382 | 37570839 | 0 | 41.75 | 6726 |
| en_US.news | 1010242 | 203223154 | 34494539 | 1 | 34.41 | 1796 |
| en_US.twitter | 2360148 | 162096031 | 30451128 | 1 | 12.75 | 47 |
I first drew a random sample of 1% of the lines from each of the original datasets. Then I removed all non-English (non-ASCII) characters. Finally, I combined the sampled blog, news and twitter datasets into sampleData.
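Next, the sampled data is used to create a corpus, and the following clean-up steps are performed:
- convert all text to lowercase
- remove punctuation
- remove numbers
- strip extra whitespace
- remove banned words (read from Terms-to-Block.csv)
- remove English stop words
- stem the words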
The steps above are performed by the build_corpus() function. The code from this section is attached in the Appendix.
The RWeka package is used to build a tokenizer function that creates unigrams, bigrams and trigrams. A Term Document Matrix (TDM) is then created for the corpus, and the most frequent terms are extracted from it. This is done by the getTermTable() function. The detailed code from this section is attached in the Appendix.
The wordcloud package is used to show what the corpus looks like in terms of word frequency. The word clouds for unigrams, bigrams and trigrams are shown from left to right, respectively.
Lastly, I wrote a function to plot n-gram frequencies and used it to plot the 20 most frequent unigrams, bigrams, and trigrams.
After the exploratory analysis, I am ready to start building the predictive model(s) and eventually the data product. Here are my next steps:
# Preload necessary R libraries
library(knitr); library(dplyr); library(doParallel); library(tm); library(SnowballC)
library(stringi); library(tm); library(ggplot2); library(wordcloud)
# Locations of the three raw corpus files
path1 <- "./data/en_US.blogs.txt"
path2 <- "./data/en_US.news.txt"
path3 <- "./data/en_US.twitter.txt"

# Read every line of a text file through a binary-mode connection.
#
# Args:
#   path: path to the text file to read.
# Returns:
#   A character vector with one element per line, marked as UTF-8.
#
# The connection is closed by on.exit(), so it is released even when
# readLines() fails part-way through a file.
read_corpus_file <- function(path) {
  conn <- file(path, open = "rb")
  on.exit(close(conn), add = TRUE)
  readLines(conn, encoding = "UTF-8")
}

# Load the blogs, news and twitter data
blogs <- read_corpus_file(path1)
news <- read_corpus_file(path2)
twitter <- read_corpus_file(path3)
# Summarise each raw source: line, character and word totals, plus the
# minimum / mean / maximum number of words per line (WPL).
text_sources <- list(blogs, news, twitter)

# Words-per-line summary, one column per source
WPL <- sapply(text_sources, function(txt) {
  words_per_line <- stri_count_words(txt)
  summary(words_per_line)[c("Min.", "Mean", "Max.")]
})
rownames(WPL) <- c("WPL_Min", "WPL_Mean", "WPL_Max")

# Whole-file counts reported by stringi, one column per source
general_counts <- sapply(text_sources, stri_stats_general)
latex_counts <- sapply(text_sources, stri_stats_latex)

# Stack the measures as rows, then transpose so each source becomes one row
measures <- rbind(
  general_counts[c("Lines", "Chars"), ],
  Words = latex_counts["Words", ],
  WPL
)
stats <- data.frame(
  FileName = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  t(measures)
)

# Drop the intermediates; text_sources would otherwise keep the raw text alive
rm(text_sources, general_counts, latex_counts, measures)

head(stats)
# Fix the seed so the 1% samples are reproducible
set.seed(1001)

# For each source (in the order blogs, news, twitter): draw 1% of the lines
# without replacement, then drop every character that cannot be represented
# in ASCII.
sampled <- lapply(list(blogs, news, twitter), function(lines) {
  keep <- sample(seq_along(lines), 0.01 * length(lines), replace = FALSE)
  iconv(lines[keep], "UTF-8", "ASCII", sub = "")
})
sampleBlogs <- sampled[[1]]
sampleNews <- sampled[[2]]
sampleTwitter <- sampled[[3]]

# Pool the three samples into a single character vector
sampleData <- c(sampleBlogs, sampleNews, sampleTwitter)

# Remove temporary variables and the full-size raw data
rm(sampled)
rm(blogs, news, twitter, path1, path2, path3)
# Build a cleaned tm corpus from a character vector of text.
#
# Args:
#   x: character vector with one document per element (defaults to the
#      global `sampleData`).
#   banned_words_file: path to the CSV of banned terms. The first 3 lines are
#      skipped and the terms are taken from the second column.
# Returns:
#   A VCorpus that has been lowercased, stripped of punctuation, numbers,
#   banned words and English stop words, whitespace-normalised and stemmed.
build_corpus <- function(x = sampleData,
                         banned_words_file = "Terms-to-Block.csv") {
  # Load the banned terms first so a missing file fails before the slow steps
  if (!file.exists(banned_words_file)) {
    stop("Banned-words file not found: ", banned_words_file, call. = FALSE)
  }
  bw <- read.csv(file = banned_words_file, stringsAsFactors = FALSE, skip = 3)
  bannedWords <- gsub(",", "", tolower(bw[, 2]))
  # Blank or missing entries carry no information for removeWords()
  bannedWords <- unique(bannedWords[!is.na(bannedWords) & nzchar(bannedWords)])

  corpus <- VCorpus(VectorSource(x))
  # tolower() is a plain character function; content_transformer() keeps each
  # document a text document instead of degrading it to a bare string, so no
  # PlainTextDocument repair step is needed afterwards
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, bannedWords)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  # Collapse whitespace after the removals, which leave gaps behind
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, stemDocument)
  corpus
}
corpusData <- build_corpus(sampleData)
library(RWeka)
# Tabulate the most frequent n-grams of a corpus.
#
# Args:
#   corpusData: a tm corpus.
#   ngrams: number of words per term (1 = unigram, 2 = bigram, ...).
#   lowfreq: minimum total number of occurrences for a term to be kept.
# Returns:
#   A data frame with columns `word` (character) and `frequency`, sorted by
#   decreasing frequency. It has zero rows when no term reaches `lowfreq`.
getTermTable <- function(corpusData, ngrams = 1, lowfreq = 50) {
  stopifnot(
    is.numeric(ngrams), length(ngrams) == 1, ngrams >= 1,
    is.numeric(lowfreq), length(lowfreq) == 1
  )

  # Term-document matrix tokenized on n-grams of exactly `ngrams` words
  tokenizer <- function(x) {
    NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams))
  }
  tdm <- TermDocumentMatrix(corpusData, control = list(tokenize = tokenizer))

  # Terms that occur at least `lowfreq` times across the corpus
  top_terms <- findFreqTerms(tdm, lowfreq)
  if (length(top_terms) == 0) {
    # Keep the column layout stable so callers can still use $word/$frequency
    return(data.frame(
      word = character(0),
      frequency = numeric(0),
      stringsAsFactors = FALSE
    ))
  }

  # NOTE(review): as.matrix() densifies the selected rows (terms x documents);
  # this can need a lot of memory on larger samples -- confirm it fits.
  top_terms_freq <- rowSums(as.matrix(tdm[top_terms, ]))
  top_terms_freq <- data.frame(
    word = names(top_terms_freq),
    frequency = top_terms_freq,
    stringsAsFactors = FALSE
  )
  arrange(top_terms_freq, desc(frequency))
}
# Frequency tables for unigrams, bigrams and trigrams, kept in a list indexed
# by n-gram size. lapply() returns a list of exactly three elements, whereas
# list(3) is a one-element list holding the number 3 that then has to grow.
tt.Data <- lapply(1:3, function(n) {
  getTermTable(corpusData, ngrams = n, lowfreq = 10)
})
library(wordcloud)
library(RColorBrewer)
# Set random seed for reproducibility
set.seed(1001)
# Lay the plots out in 1 row x 3 columns, remembering the previous setting
old_par <- par(mfrow = c(1, 3))
# The palette is the same for every cloud, so build it once
cloud_colors <- brewer.pal(8, "Dark2")
# One word cloud per n-gram size: unigrams, bigrams, trigrams
for (i in seq_len(3)) {
  wordcloud(
    tt.Data[[i]]$word, tt.Data[[i]]$frequency,
    scale = c(3, 1), max.words = 100, random.order = FALSE, rot.per = 0,
    fixed.asp = TRUE, use.r.layout = FALSE, colors = cloud_colors
  )
}
# Restore the plotting layout so later plots are not squeezed into 3 columns
par(old_par)
rm(old_par, cloud_colors)
# Draw side-by-side bar charts of the most frequent unigrams, bigrams and
# trigrams.
#
# Args:
#   x: list whose elements 1, 2 and 3 are data frames with columns `word` and
#      `frequency` (defaults to the global `tt.Data`).
#   N: number of leading rows of each table to plot.
plot.Grams <- function(x = tt.Data, N = 10) {
  # Bar chart of the first N rows of one table, bars ordered by frequency
  bar_panel <- function(term_table, label, bar_fill) {
    leading_rows <- head(term_table, N)
    ggplot(
      data = leading_rows,
      aes(x = reorder(word, -frequency), y = frequency)
    ) +
      geom_bar(stat = "identity", fill = bar_fill) +
      ggtitle(paste(label)) +
      xlab(label) + ylab("Frequency") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
  }

  unigram_panel <- bar_panel(x[[1]], "Unigrams", "green")
  bigram_panel <- bar_panel(x[[2]], "Bigrams", "blue")
  trigram_panel <- bar_panel(x[[3]], "Trigrams", "darkgreen")

  # Put the three plots into 1 row x 3 columns
  gridExtra::grid.arrange(unigram_panel, bigram_panel, trigram_panel, ncol = 3)
}
library(ggplot2); library(gridExtra)
plot.Grams(x = tt.Data, N = 20)