Natural Language Processing (NLP) - Milestone Report Week 2

This is the week 2 report for the capstone project. In it, we are exploring the three datasets (a collection of English text) consisting of: news, blogs and tweets. Below, we thoroughly clean the text, built a corpus out of it and process it via NLM (natural language processing). Helping packages include tm, ngram, wordcloud and SnowballC for some fancy graphics.

The collection of texts can be downloaded here.

# setup and initializing
remove(list = ls(all.names = TRUE))
cat("\014")


# Surface
# setwd("C:/Users/mmora/OneDrive/061 Coursera/spec_DataScience/datascienceCoursera_10Capstone")
# Home
setwd("C:/Users/marco/OneDrive/061 Coursera/spec_DataScience/datascienceCoursera_10Capstone")

require(stringi)
require(knitr) # for some nice tables
require(tm) # tm = text mining
require(SnowballC)
require(RColorBrewer) # nice colours for word cloud
require(wordcloud)
require(ggplot2)
require(gridExtra)
require(dplyr)
require(ngram)

Load & Read the data

Simple and straightforward: reading the data

# we have 3 english files in the following path:
# list.files("./010_data/Coursera-SwiftKey/final/en_US")

# defining the path of the twitter file - manually (I am lazy)
blogs.path <- "./010_data/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
news.path <- "./010_data/Coursera-SwiftKey/final/en_US/en_US.news.txt"
twitter.path <- "./010_data/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"

# let's reate a connection to - let's say - the twitter file
# and read first 5 lines
con <- file(twitter.path, "r")
# readLines(con = twitter.path, 5) # just an example

# twitter.nrow <- nrow(readLines(con = twitter.path)) # read all lines and give me the nrow
twitter <- readLines(con = twitter.path, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(con = news.path, encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines(con = blogs.path, encoding = "UTF-8", skipNul = TRUE)

# ALWAYS: close the connection
close(con)

# clean up
rm(con)

Exploring - Summarizing

To get the first impression on what we are dealing with, let’s summarize the text. We merely go through each text and summarize it.
Side note: WPL = Words per Line

WPL <- sapply(list(blogs,news,twitter), function(x) 
        summary(stri_count_words(x))[c('Min.','Mean','Max.')])
rownames(WPL) <- c('WPL_Min','WPL_Mean','WPL_Max')

stats <- data.frame(
  Dataset <- c("blogs", "news", "twitter"),      
  t(rbind(
  sapply(list(blogs, news, twitter), stri_stats_general)[c('Lines', 'Chars'),],
  Words <- sapply(list(blogs, news, twitter),stri_stats_latex)['Words', ],
  WPL)
))


kable(head(stats))

Dataset….c..blogs….news….twitter..	Lines	Chars	V3	WPL_Min	WPL_Mean	WPL_Max
blogs	899288	206824382	37570839	0	41.75	6726
news	77259	15639408	2651432	1	34.62	1123
twitter	2360148	162096241	30451170	1	12.75	47

I don’t have twitter, but I read that (until 2016), there was a word limit on the twitter feets - which can be seen in the max WPL.

Corpus

Here, we load the data slightly differently - via ASCII. That’s how we remove funny signs.
Additionally, we sample 0.5% of each data set and create the corpus out of it.

# remove strange & funny characters
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")


# set seed and sample out 0.5% of each data set (or else it's too big for my computer)
set.seed(123456)
sample_data_ori <- c(sample(blogs, length(blogs) * 0.005),
                 sample(news, length(news) * 0.005),
                 sample(twitter, length(twitter) * 0.005))

sample_data <- Corpus(VectorSource(sample_data_ori))
class(sample_data) # interesting class
#> [1] "SimpleCorpus" "Corpus"

# Removing more strange signs
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
sample_data <- tm_map(sample_data, toSpace, "/")
sample_data <- tm_map(sample_data, toSpace, "@")
sample_data <- tm_map(sample_data, toSpace, "\\|")

Text Stemming

Another important preprocessing step is to make a text stemming which reduces words to their root form. In other words, this process removes suffixes from words to make it simple and to get the common origin. For example, a stemming process reduces the words “moving”, “moved” and “movement” to the root word, “move”.
For this, we require the package SnowballC.
Side note: stopwords are common english words like the, a, an, etc. I created two data sets, one with and one without these stopwords.

# Convert the text to lower case
sample_data <- tm_map(sample_data, content_transformer(tolower))
# Remove numbers
sample_data <- tm_map(sample_data, removeNumbers)

# Remove your own stop word
# specify your stopwords as a character vector
sample_data <- tm_map(sample_data, removeWords, c("blabla")) 
# Remove punctuations
sample_data <- tm_map(sample_data, removePunctuation)
# Eliminate extra white spaces
sample_data <- tm_map(sample_data, stripWhitespace)
# Text stemming
sample_data <- tm_map(sample_data, stemDocument)


# I'd like to compare: with and without stopwords
# Remove english common stopwords
sample_data_NOSTOPWORDS <- tm_map(sample_data, removeWords, stopwords("english"))


# clean.up
rm(blogs, blogs.path, news, news.path, twitter, twitter.path)

Document matrix is a table containing the frequency of the words. Column names are words and row names are documents. The function TermDocumentMatrix() from text mining package can be used as follow:
Side note: we have here two tables, showing the
- top 10 with stopwords
- top 10 without stopwords

dtm <- TermDocumentMatrix(sample_data)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d1 <- data.frame(word = names(v),freq=v)

# especially m can be a huge matrix (>3Gb)
# it's good to clean it up
# or else I'd need to buy more RAM
rm(m, v, dtm) 


# top 10 - with stopwords
kable(head(d1, 10))

	word	freq
the	the	14617
and	and	7788
you	you	4175
for	for	3927
that	that	3850
with	with	2401
this	this	2142
have	have	2129
was	was	2107
are	are	1839

dtm <- TermDocumentMatrix(sample_data_NOSTOPWORDS)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d2 <- data.frame(word = names(v),freq=v)

rm(m, v, dtm)


# top 10 - without stopwords
kable(head(d2, 10))

	word	freq
like	like	1247
get	get	1242
just	just	1240
one	one	1190
will	will	1130
can	can	975
time	time	932
day	day	917
love	love	917
know	know	803

ggplot - Top20 Words

Let’s have a look at the difference between the frequency of the dataset which includes the stopwords and the other data ste which doesn’t. According to Wikipedia, “the” is the most used word in the English language. Here, we see the same thing.

# require(dplyr)
d1sort <- arrange(d1, desc(freq))
d2sort <- arrange(d2, desc(freq))

# transform to factor, or else [R] orders them alphabetically
d1sort$word <- factor(x = d1sort$word, levels = d1sort$word)
d2sort$word <- factor(x = d2sort$word, levels = d2sort$word)

gg1 <- ggplot(d1sort[1:20, ], aes(x = word[1:20], y = freq[1:20], fill = freq[1:20])) +
        guides(fill = FALSE) +
        geom_bar(stat = "identity") + 
        coord_flip() + 
        ggtitle("Top20 words - with stopwords") +
        xlab("words") +
        ylab("Frequency")

gg2 <- ggplot(d2sort[1:20, ], aes(x = word[1:20], y = freq[1:20], fill = freq[1:20])) +
        guides(fill = FALSE) +
        geom_bar(stat = "identity") + 
        coord_flip() + 
        ggtitle("Top20 words - without stopwords") +
        xlab("words") +
        ylab("Frequency")


# require(gridExtra)
grid.arrange(gg1, gg2, ncol=2, nrow =1)

Word Cloud

That’s just a really nice graphical addition - love it.

set.seed(1234)
par(mfrow=c(1,2))
wordcloud(words = d1$word, freq = d1$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))
wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

Analysis of Ngrams

An n-gram is a sequence of n “words” taken, in order, from a body of text.
To do so, we first put all text rows together (the function ngram didn’t like rows with just one or two words in it). After that, we really just go through the data via the ngram function, order the result, and filter out the top 20. We do this for 1-grams, 2-grams and 3-grams.

require(ngram)
require(data.table)

# extracting the text from the corpus document as a single string
# why?
# the ngram function is bitching, if a row has only 1 or 2 rows, while looking for Bi-Grams or Tri-Grams
# hence: having one line, I overcome this problem
str <- concatenate(lapply(sample_data, "[", 1))

corpora.ngram.1 <- ngram(str, n=1, sep = " ")
phrase.1 <- data.table(get.phrasetable(corpora.ngram.1))[order(-prop)][1:20][,DataType := "UniGram"]

corpora.ngram.2 <- ngram(str, n=2, sep = " ")
phrase.2 <- data.table(get.phrasetable(corpora.ngram.2))[order(-prop)][1:20][,DataType := "BiGram"]

corpora.ngram.3 <- ngram(str, n=3, sep = " ")
phrase.3 <- data.table(get.phrasetable(corpora.ngram.3))[order(-prop)][1:20][,DataType := "TriGram"]

corpora.ngram.4 <- ngram(str, n=4, sep = " ")
phrase.4 <- data.table(get.phrasetable(corpora.ngram.4))[order(-prop)][1:20][,DataType := "TriGram"]


# again: we need a factor to be able to sort it according to frequency
phrase.1$ngrams <- factor(x = phrase.1$ngrams, levels = phrase.1$ngrams)
phrase.2$ngrams <- factor(x = phrase.2$ngrams, levels = phrase.2$ngrams)
phrase.3$ngrams <- factor(x = phrase.3$ngrams, levels = phrase.3$ngrams)

gg10 <- ggplot(data = phrase.1, aes(x = ngrams, y = freq, fill = freq)) + 
        geom_bar(stat="identity") + 
        guides(fill = FALSE) +
        coord_flip() + 
        ggtitle("UniGram") +
        xlab("unigram words") +
        ylab("Frequency")

gg11 <- ggplot(data = phrase.2, aes(x = ngrams, y = freq, fill = freq)) + 
        geom_bar(stat="identity") + 
        guides(fill = FALSE) +
        coord_flip() + 
        ggtitle("BiGram") +
        xlab("bigram words") +
        ylab("Frequency")

gg12 <- ggplot(data = phrase.3, aes(x = ngrams, y = freq, fill = freq)) + 
        geom_bar(stat="identity") + 
        guides(fill = FALSE) +
        coord_flip() + 
        ggtitle("TriGram") +
        xlab("trigram words") +
        ylab("Frequency")

# require(gridExtra)
grid.arrange(gg10, gg11, gg12, ncol=3, nrow =1)

Next Step

After this interesting exploratory analysis, we are a step closer to build a
- prediction model and
- a corresponding shiny application for it

# for next step / week
# save the data


save(corpora.ngram.1, file = "./010_data/corporangram1.RData")
save(corpora.ngram.2, file = "./010_data/corporangram2.RData")
save(corpora.ngram.3, file = "./010_data/corporangram3.RData")
save(corpora.ngram.4, file = "./010_data/corporangram4.RData")