Executive Summary

The goal of this project is to demonstrate an understanding of three English text data sets (news articles, blog posts, and tweets) and to explore them. N-gram statistics are included. The original data was downloaded using the link provided in the lecture.

Supporting Libraries

The following libraries are used in this report and analysis:

library(NLP)
library(tm) # Text Mining - Requires NLP and slam
library(RColorBrewer)
library(wordcloud)

Load the data

Read in the raw data sets - each is a text file in English.

basePath <- 'C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US'
flist <- list.files(path=basePath, recursive=TRUE, pattern=".*en_.*\\.txt$")
fileNames <- paste(basePath, flist, sep="/")
samplePerc <- 0.001  # Sample 0.1% of each data set
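
The sample() calls below draw lines at random, so the exact sample differs between runs. Fixing the seed first would make the report reproducible; the value below is an arbitrary choice.

set.seed(1234)  # arbitrary seed, only so the random sampling is reproducible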

blogs <- readLines(fileNames[1])
blogss <- sample(blogs, round(samplePerc*length(blogs)), replace = FALSE)
rm(blogs)

news <- readLines(fileNames[2])
## Warning in readLines(fileNames[2]): incomplete final line found on 'C:/
## Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/
## final/en_US/en_US.news.txt'
newss <- sample(news, round(samplePerc*length(news)), replace = FALSE)
rm(news)

twitter <- readLines(fileNames[3])
## Warning in readLines(fileNames[3]): line 167155 appears to contain an
## embedded nul
## Warning in readLines(fileNames[3]): line 268547 appears to contain an
## embedded nul
## Warning in readLines(fileNames[3]): line 1274086 appears to contain an
## embedded nul
## Warning in readLines(fileNames[3]): line 1759032 appears to contain an
## embedded nul
twitters <- sample(twitter, round(samplePerc*length(twitter)), replace = FALSE)
rm(twitter)
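
The warnings above are harmless here: the news file is missing a final newline, and the Twitter file contains embedded nul characters. If the nul warnings are a concern, readLines() accepts a skipNul argument (shown as an optional alternative, not what was run above):

twitter <- readLines(fileNames[3], skipNul = TRUE)  # drops embedded nuls instead of warning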

Create three corpora using the tm library - one corpus per data set.

blogssCorpus <- VCorpus(VectorSource(blogss))
newssCorpus <- VCorpus(VectorSource(newss))
twittersCorpus <- VCorpus(VectorSource(twitters))
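
As a quick sanity check on the corpora, inspect() shows metadata and content for a subset; the index 1 is an arbitrary pick.

inspect(blogssCorpus[1])          # metadata plus content of the first document
as.character(blogssCorpus[[1]])   # just the raw text of the first sampled blog line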

A high-level summary of the three raw data files.

l <- lapply(paste(basePath, flist, sep="/"), function(f) {
    fsize <- file.info(f)$size/1024/1024
    con <- file(f, open="r")
    lines <- readLines(con)
    nchars <- nchar(lines)
    maxchars <- which.max(nchars)  # index of the longest line, not its length
    nwords <- sum(sapply(strsplit(lines, "\\s+"), length))
    close(con)
    return(c(f, format(round(fsize, 2), nsmall=2), length(lines), maxchars, nwords))
})
## Warning in readLines(con): incomplete final line found on 'C:/Users/AID-
## FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/
## en_US.news.txt'
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
df <- data.frame(matrix(unlist(l), nrow=length(l), byrow=T))
colnames(df) <- c("file", "size(MB)", "num.of.lines", "longest.line.index", "num.of.words")
print(df)
##                                                                                                        file
## 1   C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/en_US.blogs.txt
## 2    C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/en_US.news.txt
## 3 C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/en_US.twitter.txt
##   size(MB) num.of.lines longest.line.index num.of.words
## 1   200.42       899288             483415     37334441
## 2   196.28        77259              14556      2643972
## 3   159.36      2360148            1484357     30373792

A function to clean up the corpora.

This removes punctuation, numbers, and extra whitespace, removes English stop words, and converts the text to lower case.

cleanCorpus <- function(x){
    x <- tm_map(x, removePunctuation)
    x <- tm_map(x, removeNumbers)
    x <- tm_map(x, stripWhitespace)
    # tolower must be wrapped in content_transformer(); applying it directly breaks the corpus.
    # Error explained here: http://stackoverflow.com/questions/24191728/documenttermmatrix-error-on-corpus-argument
    x <- tm_map(x, content_transformer(tolower))
    x <- tm_map(x, removeWords, stopwords("english"))  # Stop words for tm: https://github.com/arc12/Text-Mining-Weak-Signals/wiki/Standard-set-of-english-stopwords
    x <- tm_map(x, PlainTextDocument)
    #x <- tm_map(x, stemDocument) # Stemming is skipped because it takes too long.
    return(x)
}

Pre-Process the Corpora

blogssCorpus <- cleanCorpus(blogssCorpus)
newssCorpus <- cleanCorpus(newssCorpus)
twittersCorpus <- cleanCorpus(twittersCorpus)

Create 1-gram, 2-gram, and 3-gram tokenizers using ngrams() from the NLP package.

UnigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
BigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
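
A minimal sanity check of a tokenizer on a made-up sentence (the sentence is arbitrary); words() is the NLP generic that tm implements for PlainTextDocument objects.

doc <- PlainTextDocument("the quick brown fox jumps")
BigramTokenizer(doc)
# should yield: "the quick" "quick brown" "brown fox" "fox jumps"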

Run the corpora through the tokenizers.

The blogs corpus.

btdm1 <- TermDocumentMatrix(blogssCorpus, control = list(tokenize = UnigramTokenizer))
btdm2 <- TermDocumentMatrix(blogssCorpus, control = list(tokenize = BigramTokenizer))
btdm3 <- TermDocumentMatrix(blogssCorpus, control = list(tokenize = TrigramTokenizer))

The news corpus.

ntdm1 <- TermDocumentMatrix(newssCorpus, control = list(tokenize = UnigramTokenizer))
ntdm2 <- TermDocumentMatrix(newssCorpus, control = list(tokenize = BigramTokenizer))
ntdm3 <- TermDocumentMatrix(newssCorpus, control = list(tokenize = TrigramTokenizer))

The Twitter corpus.

ttdm1 <- TermDocumentMatrix(twittersCorpus, control = list(tokenize = UnigramTokenizer))
ttdm2 <- TermDocumentMatrix(twittersCorpus, control = list(tokenize = BigramTokenizer))
ttdm3 <- TermDocumentMatrix(twittersCorpus, control = list(tokenize = TrigramTokenizer))
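
Before plotting, findFreqTerms() from tm offers a quick look at which terms clear a frequency threshold; the thresholds below are arbitrary picks for a 0.1% sample.

findFreqTerms(btdm1, lowfreq = 5)   # blog unigrams appearing at least 5 times
findFreqTerms(btdm2, lowfreq = 3)   # blog bigrams appearing at least 3 times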

A function to compute term frequencies from a term-document matrix and create a bar plot of the top 10.

showCorpusInfo <- function(tdm)  # takes a TermDocumentMatrix, not a raw corpus
{
    m <- as.matrix(tdm)
    v <- sort(rowSums(m), decreasing=TRUE)
    d <- data.frame(word = names(v), freq = v)
    barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
        col = "lightblue", main = "Most frequent terms",
        ylab = "Term frequencies")
    return(d)
}
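
One caution: as.matrix() densifies the whole term-document matrix, which is fine at a 0.1% sample but can exhaust memory at larger sample sizes. A sketch of a leaner first step using slam (a package tm already depends on); the rest of the function would stay the same.

v <- sort(slam::row_sums(tdm), decreasing=TRUE)  # same frequencies, no dense matrix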

A function to plot a word cloud.

createAcloud <- function(d, minf = 40)
{
    wordcloud(words = d$word, freq = d$freq, min.freq = minf,
              max.words = 200, random.order = FALSE, rot.per = 0.35,
              colors = brewer.pal(8, "Dark2"))
}
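
With only 0.1% of the data sampled, the default min.freq of 40 can filter out almost every term. If a cloud comes out nearly empty, a lower threshold is worth trying; the value 5 below is an arbitrary choice.

createAcloud(d1, minf = 5)  # d1 is the frequency data frame computed in the next section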

An analysis of the blogs corpus

Plot the top 10 1-grams

d1<-showCorpusInfo(btdm1)

A word cloud of the 1-grams.

createAcloud(d1)

Plot the top 10 2-grams

d2<-showCorpusInfo(btdm2)

Plot the top 10 3-grams

d3<-showCorpusInfo(btdm3)

A word cloud of the 3-grams.

createAcloud(d3)
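
The same functions apply unchanged to the news and Twitter term-document matrices built above. A sketch for the news corpus follows; the variable name nd1 is hypothetical.

nd1 <- showCorpusInfo(ntdm1)   # top news 1-grams
createAcloud(nd1)              # and the corresponding word cloud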