This is the week 2 report for the capstone project. In it, we are exploring the three datasets (a collection of English text) consisting of: news, blogs and tweets. Below, we thoroughly clean the text, built a corpus out of it and process it via NLM (natural language processing). Helping packages include tm, ngram, wordcloud and SnowballC for some fancy graphics.

The collection of texts can be downloaded here.

# setup and initializing
remove(list = ls(all.names = TRUE))
cat("\014")


# Surface
# setwd("C:/Users/mmora/OneDrive/061 Coursera/spec_DataScience/datascienceCoursera_10Capstone")
# Home
setwd("C:/Users/marco/OneDrive/061 Coursera/spec_DataScience/datascienceCoursera_10Capstone")

require(stringi)
require(knitr) # for some nice tables
require(tm) # tm = text mining
require(SnowballC)
require(RColorBrewer) # nice colours for word cloud
require(wordcloud)
require(ggplot2)
require(gridExtra)
require(dplyr)
require(ngram)

Load & Read the data

Simple and straightforward: reading the data

# we have 3 english files in the following path:
# list.files("./010_data/Coursera-SwiftKey/final/en_US")

# defining the path of the twitter file - manually (I am lazy)
blogs.path <- "./010_data/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
news.path <- "./010_data/Coursera-SwiftKey/final/en_US/en_US.news.txt"
twitter.path <- "./010_data/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"

# let's reate a connection to - let's say - the twitter file
# and read first 5 lines
con <- file(twitter.path, "r")
# readLines(con = twitter.path, 5) # just an example

# twitter.nrow <- nrow(readLines(con = twitter.path)) # read all lines and give me the nrow
twitter <- readLines(con = twitter.path, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(con = news.path, encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines(con = blogs.path, encoding = "UTF-8", skipNul = TRUE)

# ALWAYS: close the connection
close(con)

# clean up
rm(con)

Exploring - Summarizing

To get the first impression on what we are dealing with, let’s summarize the text. We merely go through each text and summarize it.
Side note: WPL = Words per Line

WPL <- sapply(list(blogs,news,twitter), function(x) 
        summary(stri_count_words(x))[c('Min.','Mean','Max.')])
rownames(WPL) <- c('WPL_Min','WPL_Mean','WPL_Max')

stats <- data.frame(
  Dataset <- c("blogs", "news", "twitter"),      
  t(rbind(
  sapply(list(blogs, news, twitter), stri_stats_general)[c('Lines', 'Chars'),],
  Words <- sapply(list(blogs, news, twitter),stri_stats_latex)['Words', ],
  WPL)
))


kable(head(stats))
Dataset….c..blogs….news….twitter.. Lines Chars V3 WPL_Min WPL_Mean WPL_Max
blogs 899288 206824382 37570839 0 41.75 6726
news 77259 15639408 2651432 1 34.62 1123
twitter 2360148 162096241 30451170 1 12.75 47

I don’t have twitter, but I read that (until 2016), there was a word limit on the twitter feets - which can be seen in the max WPL.

Corpus

Here, we load the data slightly differently - via ASCII. That’s how we remove funny signs.
Additionally, we sample 0.5% of each data set and create the corpus out of it.

# remove strange & funny characters
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")


# set seed and sample out 0.5% of each data set (or else it's too big for my computer)
set.seed(123456)
sample_data_ori <- c(sample(blogs, length(blogs) * 0.005),
                 sample(news, length(news) * 0.005),
                 sample(twitter, length(twitter) * 0.005))

sample_data <- Corpus(VectorSource(sample_data_ori))
class(sample_data) # interesting class
#> [1] "SimpleCorpus" "Corpus"
# Removing more strange signs
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
sample_data <- tm_map(sample_data, toSpace, "/")
sample_data <- tm_map(sample_data, toSpace, "@")
sample_data <- tm_map(sample_data, toSpace, "\\|")

Text Stemming

Another important preprocessing step is to make a text stemming which reduces words to their root form. In other words, this process removes suffixes from words to make it simple and to get the common origin. For example, a stemming process reduces the words “moving”, “moved” and “movement” to the root word, “move”.
For this, we require the package SnowballC.
Side note: stopwords are common english words like the, a, an, etc. I created two data sets, one with and one without these stopwords.

# Convert the text to lower case
sample_data <- tm_map(sample_data, content_transformer(tolower))
# Remove numbers
sample_data <- tm_map(sample_data, removeNumbers)

# Remove your own stop word
# specify your stopwords as a character vector
sample_data <- tm_map(sample_data, removeWords, c("blabla")) 
# Remove punctuations
sample_data <- tm_map(sample_data, removePunctuation)
# Eliminate extra white spaces
sample_data <- tm_map(sample_data, stripWhitespace)
# Text stemming
sample_data <- tm_map(sample_data, stemDocument)


# I'd like to compare: with and without stopwords
# Remove english common stopwords
sample_data_NOSTOPWORDS <- tm_map(sample_data, removeWords, stopwords("english"))


# clean.up
rm(blogs, blogs.path, news, news.path, twitter, twitter.path)

Document matrix is a table containing the frequency of the words. Column names are words and row names are documents. The function TermDocumentMatrix() from text mining package can be used as follow:
Side note: we have here two tables, showing the
- top 10 with stopwords
- top 10 without stopwords

dtm <- TermDocumentMatrix(sample_data)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d1 <- data.frame(word = names(v),freq=v)

# especially m can be a huge matrix (>3Gb)
# it's good to clean it up
# or else I'd need to buy more RAM
rm(m, v, dtm) 


# top 10 - with stopwords
kable(head(d1, 10))
word freq
the the 14617
and and 7788
you you 4175
for for 3927
that that 3850
with with 2401
this this 2142
have have 2129
was was 2107
are are 1839
dtm <- TermDocumentMatrix(sample_data_NOSTOPWORDS)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d2 <- data.frame(word = names(v),freq=v)

rm(m, v, dtm)


# top 10 - without stopwords
kable(head(d2, 10))
word freq
like like 1247
get get 1242
just just 1240
one one 1190
will will 1130
can can 975
time time 932
day day 917
love love 917
know know 803

ggplot - Top20 Words

Let’s have a look at the difference between the frequency of the dataset which includes the stopwords and the other data ste which doesn’t. According to Wikipedia, “the” is the most used word in the English language. Here, we see the same thing.

# require(dplyr)
d1sort <- arrange(d1, desc(freq))
d2sort <- arrange(d2, desc(freq))

# transform to factor, or else [R] orders them alphabetically
d1sort$word <- factor(x = d1sort$word, levels = d1sort$word)
d2sort$word <- factor(x = d2sort$word, levels = d2sort$word)

gg1 <- ggplot(d1sort[1:20, ], aes(x = word[1:20], y = freq[1:20], fill = freq[1:20])) +
        guides(fill = FALSE) +
        geom_bar(stat = "identity") + 
        coord_flip() + 
        ggtitle("Top20 words - with stopwords") +
        xlab("words") +
        ylab("Frequency")

gg2 <- ggplot(d2sort[1:20, ], aes(x = word[1:20], y = freq[1:20], fill = freq[1:20])) +
        guides(fill = FALSE) +
        geom_bar(stat = "identity") + 
        coord_flip() + 
        ggtitle("Top20 words - without stopwords") +
        xlab("words") +
        ylab("Frequency")


# require(gridExtra)
grid.arrange(gg1, gg2, ncol=2, nrow =1)

Word Cloud

That’s just a really nice graphical addition - love it.

set.seed(1234)
par(mfrow=c(1,2))
wordcloud(words = d1$word, freq = d1$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))
wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

Analysis of Ngrams

An n-gram is a sequence of n “words” taken, in order, from a body of text.
To do so, we first put all text rows together (the function ngram didn’t like rows with just one or two words in it). After that, we really just go through the data via the ngram function, order the result, and filter out the top 20. We do this for 1-grams, 2-grams and 3-grams.

require(ngram)
require(data.table)

# extracting the text from the corpus document as a single string
# why?
# the ngram function is bitching, if a row has only 1 or 2 rows, while looking for Bi-Grams or Tri-Grams
# hence: having one line, I overcome this problem
str <- concatenate(lapply(sample_data, "[", 1))

corpora.ngram.1 <- ngram(str, n=1, sep = " ")
phrase.1 <- data.table(get.phrasetable(corpora.ngram.1))[order(-prop)][1:20][,DataType := "UniGram"]

corpora.ngram.2 <- ngram(str, n=2, sep = " ")
phrase.2 <- data.table(get.phrasetable(corpora.ngram.2))[order(-prop)][1:20][,DataType := "BiGram"]

corpora.ngram.3 <- ngram(str, n=3, sep = " ")
phrase.3 <- data.table(get.phrasetable(corpora.ngram.3))[order(-prop)][1:20][,DataType := "TriGram"]

corpora.ngram.4 <- ngram(str, n=4, sep = " ")
phrase.4 <- data.table(get.phrasetable(corpora.ngram.4))[order(-prop)][1:20][,DataType := "TriGram"]


# again: we need a factor to be able to sort it according to frequency
phrase.1$ngrams <- factor(x = phrase.1$ngrams, levels = phrase.1$ngrams)
phrase.2$ngrams <- factor(x = phrase.2$ngrams, levels = phrase.2$ngrams)
phrase.3$ngrams <- factor(x = phrase.3$ngrams, levels = phrase.3$ngrams)

gg10 <- ggplot(data = phrase.1, aes(x = ngrams, y = freq, fill = freq)) + 
        geom_bar(stat="identity") + 
        guides(fill = FALSE) +
        coord_flip() + 
        ggtitle("UniGram") +
        xlab("unigram words") +
        ylab("Frequency")

gg11 <- ggplot(data = phrase.2, aes(x = ngrams, y = freq, fill = freq)) + 
        geom_bar(stat="identity") + 
        guides(fill = FALSE) +
        coord_flip() + 
        ggtitle("BiGram") +
        xlab("bigram words") +
        ylab("Frequency")

gg12 <- ggplot(data = phrase.3, aes(x = ngrams, y = freq, fill = freq)) + 
        geom_bar(stat="identity") + 
        guides(fill = FALSE) +
        coord_flip() + 
        ggtitle("TriGram") +
        xlab("trigram words") +
        ylab("Frequency")

# require(gridExtra)
grid.arrange(gg10, gg11, gg12, ncol=3, nrow =1)

Next Step

After this interesting exploratory analysis, we are a step closer to build a
- prediction model and
- a corresponding shiny application for it

# for next step / week
# save the data


save(corpora.ngram.1, file = "./010_data/corporangram1.RData")
save(corpora.ngram.2, file = "./010_data/corporangram2.RData")
save(corpora.ngram.3, file = "./010_data/corporangram3.RData")
save(corpora.ngram.4, file = "./010_data/corporangram4.RData")