The main goal of this milestone report is to explore the text in the Twitter, blogs, and news corpora. The report walks through the steps of the exploratory analysis: reading the data, sampling it, pre-processing it, constructing word-frequency lookup tables, and finally visualizing the word frequencies.
#Load required packages
library(stringi)
library(NLP)
library(tm)
library(SnowballC)
library(RWeka)
library(ggplot2)
library(grid)
library(wordcloud)
library(RColorBrewer)
library(parallel)
library(xtable)
#Prepare the parallel cluster using all available cores
cl <- makeCluster(detectCores())
invisible(clusterEvalQ(cl, library(tm)))
invisible(clusterEvalQ(cl, library(RWeka)))
options(mc.cores=1)
setwd('C:\\Dropbox\\Projects\\Coursera\\capstone')
#READING FILES
twitter <- readLines("final\\en_US\\en_US.twitter.txt", encoding="UTF-8")
blogs <- readLines("final\\en_US\\en_US.blogs.txt", encoding="UTF-8")
news <- readLines("final\\en_US\\en_US.news.txt", encoding="UTF-8")

| File    | Lines     | Words      | Characters  | Words per Line |
|---------|-----------|------------|-------------|----------------|
| twitter | 2,360,148 | 30,373,543 | 162,096,031 | 12             |
| blogs   | 899,288   | 37,334,131 | 206,824,505 | 41             |
| news    | 77,259    | 2,643,969  | 15,639,408  | 34             |
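The summary statistics above were computed on the full, unsampled files. As a minimal sketch (the exact code is not shown in this report), such a table could be produced with stringi and xtable roughly as follows; corpusStats is a hypothetical helper introduced here for illustration only.
# Sketch only: per-file summary statistics, assuming the twitter, blogs,
# and news character vectors read above
corpusStats <- function(x) {
  words <- sum(stri_count_words(x))    # total words across all lines
  c(Lines = length(x),
    Words = words,
    Characters = sum(stri_length(x)),  # total characters across all lines
    WordsPerLine = round(words / length(x)))
}
stats <- t(sapply(list(twitter = twitter, blogs = blogs, news = news), corpusStats))
print(xtable(stats, digits = 0), type = "html")  # render the table in the report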
setwd('C:\\Dropbox\\Projects\\Coursera\\capstone')
sampleDir <- file.path("final\\en_US\\en_US_Samples")
if (!file.exists(sampleDir)){
rawDataDir <- file.path("final\\en_US")
rawDataFiles <- list.files(path=rawDataDir, pattern="\\.txt$", full.names=T, recursive=FALSE)
rawDataFilenames <- unlist(lapply(rawDataFiles, basename))
sampleFiles <- file.path(sampleDir, paste0("sample_", rawDataFilenames))
dir.create(file.path(".", sampleDir))
sapply(1:3, function(x) sampleFile(rawDataFiles[x], sampleFiles[x]))
}

Here, we construct the corpus from the samples and then perform the following data cleaning steps: converting the text to lower case, removing punctuation, removing numbers, stripping extra whitespace, and removing profanity.
setwd('C:\\Dropbox\\Projects\\Coursera\\capstone')
corpus <- Corpus(DirSource("final\\en_US\\en_US_Samples"), readerControl = list(language="en_US", encoding ="UTF-8"))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
badWords <- scan("bad-words.txt", "")
corpus <- tm_map(corpus, removeWords, badWords)

We construct a Term Document Matrix for each n-gram tokenizer, considering only 1-gram, 2-gram, 3-gram, and 4-gram tokenizers. We then remove sparse terms from each matrix, sum the frequencies across the combined documents to produce a word-frequency list per n-gram, and finally build data frames of the top 40 terms for plotting.
#Construct a term-document matrix for each n-gram tokenizer (tokenizers are defined in the appendix)
TdmUni <- TermDocumentMatrix(corpus, control = list(tokenize = UnigramTokenizer))
TdmBi <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))
TdmTri <- TermDocumentMatrix(corpus, control = list(tokenize = TrigramTokenizer))
TdmQuad <- TermDocumentMatrix(corpus, control = list(tokenize = QuadgramTokenizer))
TdmUni <- removeSparseTerms(TdmUni, 0.1)
TdmBi <- removeSparseTerms(TdmBi, 0.1)
TdmTri <- removeSparseTerms(TdmTri, 0.1)
TdmQuad <- removeSparseTerms(TdmQuad, 0.1)
# Term frequency
freq.uni <- rowSums(as.matrix(TdmUni))
freq.bi <- rowSums(as.matrix(TdmBi))
freq.tri <- rowSums(as.matrix(TdmTri))
freq.quad <- rowSums(as.matrix(TdmQuad))
##sort
freq.uni <- sort(freq.uni, decreasing = TRUE)
freq.bi <- sort(freq.bi, decreasing = TRUE)
freq.tri <- sort(freq.tri, decreasing = TRUE)
freq.quad <- sort(freq.quad, decreasing = TRUE)
# Create the top 40 data frames from the matrices
df.freq.uni <- data.frame("Term"=names(head(freq.uni,40)), "Frequency"=head(freq.uni,40))
df.freq.bi <- data.frame("Term"=names(head(freq.bi,40)), "Frequency"=head(freq.bi,40))
df.freq.tri <- data.frame("Term"=names(head(freq.tri,40)), "Frequency"=head(freq.tri,40))
df.freq.quad <- data.frame("Term"=names(head(freq.quad,40)), "Frequency"=head(freq.quad,40))
# Reorder levels for better plotting
df.freq.uni$Term1 <- reorder(df.freq.uni$Term, df.freq.uni$Frequency)
df.freq.bi$Term1 <- reorder(df.freq.bi$Term, df.freq.bi$Frequency)
df.freq.tri$Term1 <- reorder(df.freq.tri$Term, df.freq.tri$Frequency)
df.freq.quad$Term1 <- reorder(df.freq.quad$Term, df.freq.quad$Frequency)

We can visualize the frequencies for each term using bar plots and word clouds.
#Bar Plots
p1 <-
ggplot(df.freq.uni, aes(x = Term1, y = Frequency)) +
geom_bar(stat = "identity", color="gray55", fill="orange") +
geom_text(data=df.freq.uni,aes(x=Term1,y=-2000,label=Frequency),vjust=0, size=3) +
xlab("Terms") + ylab("Count") + ggtitle("Top 40 UniGram Tokenized Word Frequency") +
theme(plot.title = element_text(lineheight=.8, face="bold")) +
coord_flip()
p2 <-
ggplot(df.freq.bi, aes(x = Term1, y = Frequency)) +
geom_bar(stat = "identity", color="gray55", fill="steelblue2") +
geom_text(data=df.freq.bi,aes(x=Term1,y=-250,label=Frequency),vjust=0, size=3) +
xlab("Terms") + ylab("Count") + ggtitle("Top 40 BiGram Tokenized Word Frequency") +
theme(plot.title = element_text(lineheight=.8, face="bold")) +
coord_flip()
p3 <-
ggplot(df.freq.tri, aes(x = Term1, y = Frequency)) +
geom_bar(stat = "identity", color="gray55", fill="greenyellow") +
geom_text(data=df.freq.tri,aes(x=Term1,y=-25,label=Frequency),vjust=0, size=3) +
xlab("Terms") + ylab("Count") + ggtitle("Top 40 TriGram Tokenized Word Frequency") +
theme(plot.title = element_text(lineheight=.8, face="bold")) +
coord_flip()
p4 <-
ggplot(df.freq.quad, aes(x = Term1, y = Frequency)) +
geom_bar(stat = "identity", color="gray55", fill="brown1") +
geom_text(data=df.freq.quad,aes(x=Term1,y=-3,label=Frequency),vjust=0, size=3) +
xlab("Terms") + ylab("Count") + ggtitle("Top 40 QuadGram Tokenized Word Frequency") +
theme(plot.title = element_text(lineheight=.8, face="bold")) +
coord_flip()
multiplot(p1, p2, p3, p4, cols=1)

#Word Clouds
par(mfrow=c(1,4))
wordcloud(words = df.freq.uni$Term1,
freq = df.freq.uni$Frequency,
random.order=FALSE,
rot.per=0.35,
use.r.layout=FALSE,
colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=1.1, "UniGram Word Cloud")
wordcloud(words = df.freq.bi$Term1,
freq = df.freq.bi$Frequency,
random.order=FALSE,
rot.per=0.35,
use.r.layout=FALSE,
colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=1.1, "BiGram Word Cloud")
wordcloud(words = df.freq.tri$Term1,
freq = df.freq.tri$Frequency,
random.order=FALSE,
rot.per=0.35,
use.r.layout=FALSE,
colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=1.1, "TriGram Word Cloud")
wordcloud(words = df.freq.quad$Term1,
freq = df.freq.quad$Frequency,
random.order=FALSE,
rot.per=0.35,
use.r.layout=FALSE,
colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=1.1, "QuadGram Word Cloud")As we found in the report, the following steps were performed in our Explorartory Data Analysis exercise on the corpa:
In the next couple of weeks, we will build an algorithm to predict the next word based on the n-gram models. The algorithm involves building a lookup table for each n-gram and using it to predict the next word in a sentence. It will search the 4-gram model first, then the 3-gram, and then the 2-gram; if there is no match, we fall back to a best guess. The algorithm assumes that the corpora cover the whole English language (very optimistic) and must be fast at searching and predicting a word. A Shiny app will be built to demonstrate predicting the next word from an input phrase.
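As an illustration of the intended back-off strategy (not the final implementation), the sketch below assumes the sorted frequency vectors built above (freq.quad, freq.tri, freq.bi, freq.uni), whose names are space-separated n-grams; predictNextWord is a hypothetical helper introduced here for illustration only.
# Sketch of the planned back-off lookup, assuming the frequency vectors above.
# Input is expected to be already cleaned (lower case, no punctuation).
predictNextWord <- function(phrase) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  models <- list(freq.quad, freq.tri, freq.bi)
  for (n in 4:2) {
    if (length(words) < n - 1) next
    prefix <- paste(tail(words, n - 1), collapse = " ")
    model <- models[[5 - n]]
    # n-grams whose first n-1 words match the prefix; names are already
    # sorted by decreasing frequency, so the first hit is the best guess
    hits <- grep(paste0("^", prefix, " "), names(model), value = TRUE)
    if (length(hits) > 0) {
      return(tail(unlist(strsplit(hits[1], " ")), 1))
    }
  }
  names(freq.uni)[1]  # no match in any model: fall back to the top unigram
}
predictNextWord("thanks for the")  # example call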
This document was generated using rmarkdown, knitr, and knitrBootstrap.
sessionInfo()

## R version 3.1.1 (2014-07-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] parallel grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] xtable_1.7-4 wordcloud_2.5 RColorBrewer_1.0-5
## [4] ggplot2_1.0.0 RWeka_0.4-23 SnowballC_0.5.1
## [7] tm_0.6 NLP_0.1-5 stringi_0.2-5
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.2-4 digest_0.6.4 evaluate_0.5.5
## [4] formatR_1.0 gtable_0.1.2 htmltools_0.2.6
## [7] knitr_1.7.12 knitrBootstrap_1.0.0 labeling_0.3
## [10] markdown_0.7.4 MASS_7.3-33 mime_0.1.2
## [13] munsell_0.4.2 plyr_1.8.1 proto_0.3-10
## [16] Rcpp_0.11.3 reshape2_1.4 rJava_0.9-6
## [19] rmarkdown_0.3.12 RWekajars_3.7.11-1 scales_0.2.4
## [22] slam_0.1-32 stringr_0.6.2 tools_3.1.1
## [25] yaml_2.1.13
# Sample a raw text file: write the current line, then with probability 0.5
# skip the next 99 lines, keeping roughly 2% of the input file
sampleFile <- function(fileIn, fileOut){
  set.seed(54321)
  conIn <- file(fileIn, "r", blocking=FALSE)
  conOut <- file(fileOut, "w")
  while(length(line <- readLines(conIn, n=1)) > 0){
    sample.prob <- rbinom(1, 1, 0.5)
    writeLines(line, conOut)
    # Discard the next 99 lines when the coin flip comes up 1
    if(sample.prob==1){line <- readLines(conIn, n=99)}
  }
  close(conIn)
  close(conOut)
}
# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
require(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
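For a quick sanity check, the tokenizers can be called directly on a short string (illustrative example only):
# Illustrative example: the bigram tokenizer on a short test string
BigramTokenizer("this is a small test")
# should return the bigrams: "this is" "is a" "a small" "small test"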