Executive Summary

The goal of this project is to demonstrate an understanding of three English text data sets (news articles, blog posts, and tweets) and to explore them. N-gram statistics are included. The original data was downloaded using the link provided in the lecture.

Supporting Libraries

The following libraries are used in this report and analysis:

library(NLP)
library(tm) # Text Mining - Requires NLP and slam
library(RColorBrewer)
library(wordcloud)

Load the data

Read in the raw data sets - each is a text file in English.

basePath <- 'C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US'
flist <- list.files(path=basePath, recursive=TRUE, pattern=".*en_.*\\.txt$")
fileNames <- paste(basePath, flist, sep="/")
samplePerc <- 0.001  # Sample 0.1% of each data set
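
The sample() calls below draw lines at random, so the exact sample differs between runs. Fixing the seed first would make the report reproducible; the value below is an arbitrary choice.

set.seed(1234)  # arbitrary seed, only so the random sampling is reproducible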

blogs <- readLines(fileNames[1])
blogss <- sample(blogs, round(samplePerc*length(blogs)), replace = FALSE)
rm(blogs)

news <- readLines(fileNames[2])
## Warning in readLines(fileNames[2]): incomplete final line found on 'C:/
## Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/
## final/en_US/en_US.news.txt'
newss <- sample(news, round(samplePerc*length(news)), replace = FALSE)
rm(news)

twitter <- readLines(fileNames[3])
## Warning in readLines(fileNames[3]): line 167155 appears to contain an
## embedded nul
## Warning in readLines(fileNames[3]): line 268547 appears to contain an
## embedded nul
## Warning in readLines(fileNames[3]): line 1274086 appears to contain an
## embedded nul
## Warning in readLines(fileNames[3]): line 1759032 appears to contain an
## embedded nul
twitters <- sample(twitter, round(samplePerc*length(twitter)), replace = FALSE)
rm(twitter)
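
The warnings above are harmless here: the news file is missing a final newline, and the Twitter file contains embedded nul characters. If the nul warnings are a concern, readLines() accepts a skipNul argument (shown as an optional alternative, not what was run above):

twitter <- readLines(fileNames[3], skipNul = TRUE)  # drops embedded nuls instead of warning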

Create three corpora using the tm library - one corpus per data set.

blogssCorpus <- VCorpus(VectorSource(blogss))
newssCorpus <- VCorpus(VectorSource(newss))
twittersCorpus <- VCorpus(VectorSource(twitters))
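
As a quick sanity check on the corpora, inspect() shows metadata and content for a subset; the index 1 is an arbitrary pick.

inspect(blogssCorpus[1])          # metadata plus content of the first document
as.character(blogssCorpus[[1]])   # just the raw text of the first sampled blog line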

A high-level summary of the three raw data files.

l <- lapply(paste(basePath, flist, sep="/"), function(f) {
    fsize <- file.info(f)$size/1024/1024
    con <- file(f, open="r")
    lines <- readLines(con)
    nchars <- nchar(lines)
    maxchars <- which.max(nchars)  # index of the longest line, not its length
    nwords <- sum(sapply(strsplit(lines, "\\s+"), length))
    close(con)
    return(c(f, format(round(fsize, 2), nsmall=2), length(lines), maxchars, nwords))
})
## Warning in readLines(con): incomplete final line found on 'C:/Users/AID-
## FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/
## en_US.news.txt'
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
df <- data.frame(matrix(unlist(l), nrow=length(l), byrow=T))
colnames(df) <- c("file", "size(MB)", "num.of.lines", "longest.line.index", "num.of.words")
print(df)
##                                                                                                        file
## 1   C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/en_US.blogs.txt
## 2    C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/en_US.news.txt
## 3 C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/en_US.twitter.txt
##   size(MB) num.of.lines longest.line.index num.of.words
## 1   200.42       899288             483415     37334441
## 2   196.28        77259              14556      2643972
## 3   159.36      2360148            1484357     30373792

A function to clean up the corpora.

This removes punctuation, numbers, and extra whitespace, removes English stop words, and converts the text to lower case.

cleanCorpus <- function(x){
    x <- tm_map(x, removePunctuation)
    x <- tm_map(x, removeNumbers)
    x <- tm_map(x, stripWhitespace)
    # tolower must be wrapped in content_transformer(); applying it directly breaks the corpus.
    # Error explained here: http://stackoverflow.com/questions/24191728/documenttermmatrix-error-on-corpus-argument
    x <- tm_map(x, content_transformer(tolower))
    x <- tm_map(x, removeWords, stopwords("english"))  # Stop words for tm: https://github.com/arc12/Text-Mining-Weak-Signals/wiki/Standard-set-of-english-stopwords
    x <- tm_map(x, PlainTextDocument)
    #x <- tm_map(x, stemDocument) # Stemming is skipped because it takes too long.
    return(x)
}

Pre-Process the Corpora

blogssCorpus <- cleanCorpus(blogssCorpus)
newssCorpus <- cleanCorpus(newssCorpus)
twittersCorpus <- cleanCorpus(twittersCorpus)

Create 1-gram, 2-gram, and 3-gram tokenizers using ngrams() from the NLP package.

UnigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
BigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
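
A minimal sanity check of a tokenizer on a made-up sentence (the sentence is arbitrary); words() is the NLP generic that tm implements for PlainTextDocument objects.

doc <- PlainTextDocument("the quick brown fox jumps")
BigramTokenizer(doc)
# should yield: "the quick" "quick brown" "brown fox" "fox jumps"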

Run the corpora through the tokenizers.

The blogs corpus.

btdm1 <- TermDocumentMatrix(blogssCorpus, control = list(tokenize = UnigramTokenizer))
btdm2 <- TermDocumentMatrix(blogssCorpus, control = list(tokenize = BigramTokenizer))
btdm3 <- TermDocumentMatrix(blogssCorpus, control = list(tokenize = TrigramTokenizer))

The news corpus.

ntdm1 <- TermDocumentMatrix(newssCorpus, control = list(tokenize = UnigramTokenizer))
ntdm2 <- TermDocumentMatrix(newssCorpus, control = list(tokenize = BigramTokenizer))
ntdm3 <- TermDocumentMatrix(newssCorpus, control = list(tokenize = TrigramTokenizer))

The Twitter corpus.

ttdm1 <- TermDocumentMatrix(twittersCorpus, control = list(tokenize = UnigramTokenizer))
ttdm2 <- TermDocumentMatrix(twittersCorpus, control = list(tokenize = BigramTokenizer))
ttdm3 <- TermDocumentMatrix(twittersCorpus, control = list(tokenize = TrigramTokenizer))
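
Before plotting, findFreqTerms() from tm offers a quick look at which terms clear a frequency threshold; the thresholds below are arbitrary picks for a 0.1% sample.

findFreqTerms(btdm1, lowfreq = 5)   # blog unigrams appearing at least 5 times
findFreqTerms(btdm2, lowfreq = 3)   # blog bigrams appearing at least 3 times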

A function to compute term frequencies from a term-document matrix and create a bar plot of the top 10.

showCorpusInfo <- function(tdm)  # takes a TermDocumentMatrix, not a raw corpus
{
    m <- as.matrix(tdm)
    v <- sort(rowSums(m), decreasing=TRUE)
    d <- data.frame(word = names(v), freq = v)
    barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
        col = "lightblue", main = "Most frequent terms",
        ylab = "Term frequencies")
    return(d)
}
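
One caution: as.matrix() densifies the whole term-document matrix, which is fine at a 0.1% sample but can exhaust memory at larger sample sizes. A sketch of a leaner first step using slam (a package tm already depends on); the rest of the function would stay the same.

v <- sort(slam::row_sums(tdm), decreasing=TRUE)  # same frequencies, no dense matrix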

A function to plot a word cloud.

createAcloud <- function(d, minf = 40)
{
    wordcloud(words = d$word, freq = d$freq, min.freq = minf,
              max.words = 200, random.order = FALSE, rot.per = 0.35,
              colors = brewer.pal(8, "Dark2"))
}
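
With only 0.1% of the data sampled, the default min.freq of 40 can filter out almost every term. If a cloud comes out nearly empty, a lower threshold is worth trying; the value 5 below is an arbitrary choice.

createAcloud(d1, minf = 5)  # d1 is the frequency data frame computed in the next section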

An analysis of the blogs corpus

Plot the top 10 1-grams

d1<-showCorpusInfo(btdm1)

A word cloud of the 1-grams.

createAcloud(d1)

Plot the top 10 2-grams

d2<-showCorpusInfo(btdm2)

Plot the top 10 3-grams

d3<-showCorpusInfo(btdm3)

A word cloud of the 3-grams.

createAcloud(d3)
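
The same functions apply unchanged to the news and Twitter term-document matrices built above. A sketch for the news corpus follows; the variable name nd1 is hypothetical.

nd1 <- showCorpusInfo(ntdm1)   # top news 1-grams
createAcloud(nd1)              # and the corresponding word cloud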